def _check_term_consistency(self): """Check if the loaded terms and inclusions/exclusions are consistent size.""" if self.inclusions and self.n_terms != len(self.inclusions): raise InconsistentDataError( 'Mismatch in number of inclusions and terms!') if self.exclusions and self.n_terms != len(self.exclusions): raise InconsistentDataError( 'Mismatch in number of exclusions and terms!')
def _check_term_consistency(self): """Check if loaded term definitions are consistent.""" if self.n_terms != len(self.inclusions): raise InconsistentDataError( 'There is a mismatch in number of inclusions and terms.') if self.n_terms != len(self.exclusions): raise InconsistentDataError( 'There is a mismatch in number of exclusions and terms.') if self.n_terms != len(self._labels): raise InconsistentDataError( 'There is a mismatch in number of labels and terms.')
def load_pickle_obj(f_name, db=None):
    """Load a custom object, from a pickle file, for SCANR project.

    Parameters
    ----------
    f_name : str
        File name of the object to be loaded, without the '.p' extension.
    db : SCDB object, optional
        Database object for the SCANR project.

    Returns
    -------
    object
        The object restored from the pickle file.

    Raises
    ------
    InconsistentDataError
        If no file named `f_name + '.p'` exists in the counts or words directory.

    Notes
    -----
    Unpickling can execute arbitrary code: only load files that come from a
    trusted source.
    """

    # Check for database object (per the original comment, one is initialized if not provided)
    db = check_db(db)

    # Get all available files, for Count and Words pickled objects
    counts_objs = os.listdir(db.counts_path)
    words_objs = os.listdir(db.words_path)

    file_name = f_name + '.p'

    # Saved Count files take priority over saved Words files
    if file_name in counts_objs:
        load_path = os.path.join(db.counts_path, file_name)
    elif file_name in words_objs:
        load_path = os.path.join(db.words_path, file_name)
    else:
        raise InconsistentDataError('Can not find requested file name.')

    # Use a context manager so the file handle is closed even if unpickling fails
    with open(load_path, 'rb') as load_file:
        return pickle.load(load_file)
def _check_labels(self): """Check loaded terms and labels, and set None labels if needed.""" if self.has_terms and (not self._labels or self._labels == [None] * len(self._labels)): self._set_none_labels() if not len(self.labels) == len(set(self.labels)): raise InconsistentDataError( 'Not all labels are unique. Labels must be unique.')
def check_args(self, args_to_use):
    """Checks whether the requested arguments are defined, so that they can be used.

    Parameters
    ----------
    args_to_use : list of str
        Requested arguments to check that they are defined.

    Raises
    ------
    InconsistentDataError
        If any requested argument is missing from `self.args`. The underlying
        KeyError is chained as the cause, so the missing key stays visible.
    """

    # Look each argument up, rather than testing membership, so that the
    # mapping's own lookup semantics decide what counts as 'defined'
    try:
        for arg in args_to_use:
            _ = self.args[arg]
    except KeyError as err:
        raise InconsistentDataError(
            'Not all requested settings provided - can not proceed.') from err
def _check_results(self): """Check for consistency in extracted results. Notes ----- If everything worked, each data field (ids, titles, words, etc) should have the same length, equal to the number of articles. Some entries may be blank (missing data), but if the lengths are not the same then the data does not line up and something went wrong. """ # Check that all data fields have length n_articles if not (self.n_articles == len(self.ids) == len(self.titles) == len(self.words) == len(self.journals) == len(self.authors) == len(self.keywords) == len(self.years) == len(self.dois)): raise InconsistentDataError('Words data is inconsistent.')
def set_exclusions_file(self):
    """Replace the current exclusion words with those from the module data file.

    Any previously loaded exclusions are unloaded first. Each loaded entry has
    its first three characters removed (the number index, per the original
    comment) and the remainder is split on commas into a list of words.

    Raises
    ------
    InconsistentDataError
        If the number of loaded entries differs from `n_terms`. The previous
        exclusions have already been unloaded at that point.
    """

    # Start from a clean slate
    self.unload_exclusions()

    # Read the raw exclusion entries from the module data file
    raw_entries = _terms_load_file('exclusions')

    # There must be exactly one entry per term
    if len(raw_entries) != self.n_terms:
        raise InconsistentDataError('Mismatch in number of exclusions and terms!')

    # Strip the leading index from each entry, and store its words as a list
    for entry in raw_entries:
        self.exclusions.append(entry[3:].split(','))
def set_exclusions(self, exclusions):
    """Sets the given list of strings as exclusion words.

    Parameters
    ----------
    exclusions : list of str OR list of list of str
        List of exclusion words to be used. There must be one entry per term.

    Raises
    ------
    InconsistentDataError
        If the number of exclusions differs from `n_terms`. In that case no
        new exclusions are stored (previous ones have already been unloaded).

    Notes
    -----
    Each entry is passed through `_check_type` before being stored.
    NOTE(review): presumably this normalizes a single string into a list -
    confirm against `_check_type`.
    """

    # Unload previous terms if some are already loaded
    self.unload_exclusions()

    # Validate before storing anything, so that a mismatch can not leave the
    # object holding exclusions that do not line up with its terms
    if len(exclusions) != self.n_terms:
        raise InconsistentDataError('Mismatch in number of exclusions and terms!')

    # Set given list as exclusion words
    for exclude in exclusions:
        self.exclusions.append(_check_type(exclude))
def check_results(self):
    """Check for consistency in extracted results, and log the outcome.

    If everything worked, each data field (ids, titles, words, journals,
    authors, kws, years, months, dois) should have the same length, equal to
    the number of articles. Some entries may be blank (missing data), but if
    the lengths are not the same then the data does not line up and cannot
    be trusted.

    The outcome is recorded through `update_history`, as either
    'Passed Check' or 'Failed Check'.

    Raises
    ------
    InconsistentDataError
        If any data field has a length different from `n_articles`.
    """

    expected = self.n_articles
    field_names = ('ids', 'titles', 'words', 'journals', 'authors',
                   'kws', 'years', 'months', 'dois')

    # Every data field must hold exactly one entry per article
    consistent = all(len(getattr(self, name)) == expected for name in field_names)

    if not consistent:
        # Record the failure before raising
        self.update_history('Failed Check')
        raise InconsistentDataError('term Words data is inconsistent.')

    self.update_history('Passed Check')
def save_pickle_obj(obj, f_name, db=None):
    """Save a custom object from LISC as a pickle file.

    Parameters
    ----------
    obj : Count or Words
        LISC custom object to save out.
    f_name : str
        Base name for the saved file. '_counts.p' or '_words.p' is appended,
        depending on the type of `obj`.
    db : SCDB() object, optional
        Database object for the LISC project.

    Raises
    ------
    InconsistentDataError
        If `obj` is neither a Count nor a Words object.
    """

    # Check for database object (per the original comment, one is initialized if not provided)
    db = check_db(db)

    # Pick the file suffix and target directory from the object type
    if isinstance(obj, Count):
        save_name = f_name + '_counts.p'
        save_path = db.counts_path
    elif isinstance(obj, Words):
        save_name = f_name + '_words.p'
        save_path = db.words_path
    else:
        # If neither, raise error as object type is unclear
        raise InconsistentDataError('Object type unclear - can not save.')

    # Use a context manager so the data is flushed and the handle is closed
    # as soon as the dump finishes, even if pickling raises
    save_file = os.path.join(save_path, save_name)
    with open(save_file, 'wb') as out_file:
        pickle.dump(obj, out_file)