def darwinize_list(termlist, dwccloudfile, namespace=None):
    ''' Translate the terms in a list to standard Darwin Core terms.
    parameters:
        termlist - list of values to translate (required)
        dwccloudfile - the vocabulary file for the Darwin Cloud (required)
        namespace - if the string contains 'y', prepend the Darwin Cloud namespace
            to each standardized term (optional; default None)
    returns:
        a list with all translatable terms translated, or None if no termlist or no
        Darwin Cloud vocabulary could be loaded
    '''
    functionname = 'darwinize_list()'

    if termlist is None or len(termlist) == 0:
        s = 'No termlist given in %s.' % functionname
        logging.debug(s)
        return None

    # No need to check if dwccloudfile is given and exists,
    # darwin_cloud_vocab_dict_from_file() does that.
    darwinclouddict = darwin_cloud_vocab_dict_from_file(dwccloudfile)

    if darwinclouddict is None:
        s = 'No Darwin Cloud terms in %s.' % functionname
        logging.debug(s)
        return None

    # Verbatim values in the Darwin Cloud vocabulary are upper-cased and stripped,
    # so look terms up in that form.
    thelist = [ustripstr(term) for term in termlist]

    addnamespace = namespace is not None and 'y' in namespace

    darwinizedlist = []
    j = 1  # counter for naming otherwise-empty columns
    for i, term in enumerate(thelist):
        entry = darwinclouddict.get(term)
        if entry is not None and entry['standard'] is not None and \
            len(entry['standard'].strip()) > 0:
            # A standard term was found for this verbatim value
            if addnamespace:
                newterm = entry['namespace'] + ':' + entry['standard']
            else:
                newterm = entry['standard']
        else:
            # No standard term found; keep the original value, stripped
            newterm = termlist[i].strip()
        if len(newterm) == 0:
            # The term was blank; give it a distinguishable placeholder name
            newterm = 'UNNAMED_COLUMN_%s' % j
            j += 1
        darwinizedlist.append(newterm)

    return darwinizedlist
def terms_not_in_darwin_cloud(checklist, dwccloudfile, encoding=None, vetted=True,
    casesensitive=False):
    ''' Get the list of distinct values in a checklist that are not in the Darwin
        Cloud vocabulary. Verbatim values in the Darwin Cloud vocabulary should be
        lower-case and stripped already, so that is what must be matched here. The
        Darwin Cloud vocabulary should have the case-sensitive standard value.
    parameters:
        checklist - list of values to check against the target list (required)
        dwccloudfile - the vocabulary file for the Darwin Cloud (required)
        encoding - a string designating the input file encoding (optional;
            default None) (e.g., 'utf-8', 'mac_roman', 'latin_1', 'cp1252')
        vetted - set to False if unvetted values should also be returned
            (default True)
        casesensitive - True if the test for inclusion is case sensitive
            (optional; default False)
    returns:
        a sorted list of distinct new values not in the Darwin Cloud vocabulary
    '''
    functionname = 'terms_not_in_darwin_cloud()'

    if checklist is None or len(checklist) == 0:
        s = 'No checklist given in %s.' % functionname
        logging.debug(s)
        return None

    dialect = csv_file_dialect(dwccloudfile)

    # Try to determine the encoding of the inputfile.
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(dwccloudfile)
        # csv_file_encoding() always returns an encoding if there is an input file.

    # No need to check if dwccloudfile is given and exists, vocab_dict_from_file()
    # and vetted_vocab_dict_from_file() do that.
    if vetted == True:
        darwinclouddict = vetted_vocab_dict_from_file(dwccloudfile, 'fieldname',
            dialect=dialect, encoding=encoding)
    else:
        darwinclouddict = vocab_dict_from_file(dwccloudfile, 'fieldname',
            dialect=dialect, encoding=encoding)

    # Iterating the dict yields its keys directly; no need for iteritems() when the
    # values are unused.
    dwcloudlist = list(darwinclouddict)

    if casesensitive == True:
        return not_in_list(dwcloudlist, checklist)

    # Compare case-insensitively: upper-case and strip both sides of the match.
    lowerdwclist = [ustripstr(term) for term in dwcloudlist]

    notfound = not_in_list(lowerdwclist, checklist, function=ustripstr)

    return notfound
def terms_not_in_dwc(checklist, casesensitive=False):
    ''' From a list of terms, get those that are not Darwin Core terms.
    parameters:
        checklist - list of values to check against Darwin Core (required)
        casesensitive - True if the test for inclusion is case sensitive
            (optional; default False)
    returns:
        a sorted list of non-Darwin Core terms from the checklist
    '''
    # No need to check if checklist is given, not_in_list() does that

    if casesensitive == True:
        return not_in_list(simpledwctermlist, checklist)

    # Compare case-insensitively: upper-case and strip the Darwin Core terms so
    # they match what ustripstr() produces for the checklist values.
    lowerdwc = [ustripstr(term) for term in simpledwctermlist]

    notfound = not_in_list(lowerdwc, checklist, function=ustripstr)

    return notfound
def term_standardizer_report(inputfile, reportfile, vocabfile, key, separator=None,
    encoding=None, format=None):
    ''' Write a file with substitutions from a vocabfile for fields in a key and
        appended terms showing the original values.
    parameters:
        inputfile - full path to the input file (required)
        reportfile - full path to the output file (required)
        vocabfile - path to the vocabulary file (required)
        key - field or separator-separated fields to set (required)
        separator - string to use as the key and value separator
            (optional; default '|')
        encoding - string signifying the encoding of the input file. If known,
            it speeds up processing a great deal. (optional; default None)
            (e.g., 'utf-8')
        format - string signifying the csv.dialect of the report file
            ('csv' or 'txt') (optional; default: txt)
    returns:
        success - True if the report was written, else False
    '''
    functionname = 'term_standardizer_report()'

    if reportfile is None or len(reportfile) == 0:
        s = 'No reportfile name given in %s.' % functionname
        logging.debug(s)
        return False

    if inputfile is None or len(inputfile) == 0:
        s = 'No inputfile file given in %s.' % functionname
        logging.debug(s)
        return False

    if os.path.isfile(inputfile) == False:
        s = 'Inputfile file %s not found in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    # Determine the dialect of the input file
    inputdialect = csv_file_dialect(inputfile)

    # Determine the encoding of the input file, if not provided
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    # Read the header from the input file
    inputheader = read_header(inputfile, dialect=inputdialect, encoding=encoding)

    if inputheader is None:
        s = 'Unable to read header from input file %s in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    if key is None or len(key.strip()) == 0:
        s = 'No key given in %s.' % functionname
        logging.debug(s)
        return False

    # Make sure there is a separator for the next step
    if separator is None or len(separator) == 0:
        separator = '|'

    # Make a list of the fields in the key by splitting it on the separator
    fieldlist = key.split(separator)

    # Assume none of the fields is in the file
    headerhaskey = False

    # Search the cleaned up header for any field from the key
    cleanedinputheader = strip_list(inputheader)

    for field in fieldlist:
        if field in cleanedinputheader:
            headerhaskey = True
            break

    if headerhaskey == False:
        s = 'No field from %s found ' % fieldlist
        s += 'in input file %s in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    if vocabfile is None or len(vocabfile) == 0:
        # Interpolate before logging; the original applied % to the return value
        # of logging.debug(), which raised a TypeError on this path.
        logging.debug('No vocabulary file given in %s.' % functionname)
        return False

    if os.path.isfile(vocabfile) == False:
        s = 'Vocabulary file %s not found in %s.' % (vocabfile, functionname)
        logging.debug(s)
        return False

    # Get the vocabulary dictionary, but convert all entries using ustripstr. Assume
    # vocabulary file is encoded as utf-8.
    vocabdict = vocab_dict_from_file(vocabfile, key, encoding='utf-8', \
        separator=separator, function=ustripstr)

    if len(vocabdict) == 0:
        s = 'Vocabulary file %s ' % vocabfile
        s += 'had zero recommendations in %s.' % functionname
        logging.debug(s)
        return False

    # Choose the dialect for the report file (the original computed this twice,
    # once into an unused variable).
    if format is None or format.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    # Create an output header that is the same as the input header with fields
    # appended to hold the original values of the key fields
    outputheader = cleanedinputheader
    for field in fieldlist:
        if field in outputheader:
            outputheader = outputheader + [field + '_orig']
        else:
            outputheader = outputheader + [field]

    # Create the outputfile and write the new header to it
    write_header(reportfile, outputheader, outputdialect)

    # Check to see if the outputfile was created
    if os.path.isfile(reportfile) == False:
        s = 'reportfile: %s not created in %s.' % (reportfile, functionname)
        logging.debug(s)
        return False

    # Open the outputfile to append rows having the added fields
    with open(reportfile, 'a') as outfile:
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8',
            fieldnames=outputheader)

        # Iterate through all rows in the input file
        for row in read_csv_row(inputfile, dialect=inputdialect, encoding=encoding,
            header=True, fieldnames=cleanedinputheader):
            # Set the _orig values for every field in the field list that exists in
            # the row
            for field in fieldlist:
                if field in row:
                    row[field + '_orig'] = row[field]

            # Construct a composite field value for the row to match a key in the
            # vocabulary file
            rowkey = extract_values_from_row(row, fieldlist, separator)

            # Get dictionary for recommended value for the ustripstr(rowkey)
            newvaluedict = recommended_value(vocabdict, ustripstr(rowkey))

            # Only make changes if there is a standardized value found
            if newvaluedict is not None:
                # ustripstr(rowkey) was found in the vocabulary
                # Get the standard value
                standard = newvaluedict['standard']

                # Treat standard value that is None or only whitespace as ''
                if standard is None or len(standard.strip()) == 0:
                    standard = ''

                # Make a list of values given in standard
                newvalues = standard.split(separator)

                # Only make changes if the number of recommendation fields is the
                # same as the number of fields in the key
                if len(newvalues) == len(fieldlist):
                    i = 0
                    # Update or add new value to field in the fieldlist
                    for field in fieldlist:
                        row[field] = newvalues[i]
                        i += 1

            writer.writerow(row)

    s = 'Report written to %s in %s.' % (reportfile, functionname)
    logging.debug(s)
    return True
def missing_vocab_list_from_file(checklist, vocabfile, key, separator=None, dialect=None, encoding=None): ''' Given a checklist of values, get values not found in the given vocabulary file. Values can match exactly, or they can match after making them upper case and stripping whitespace. parameters: checklist - list of values to get from the vocabfile (required) vocabfile - full path to the vocabulary lookup file (required) key - the field or separator-separated fieldnames that hold the distinct values in the vocabulary file (required) separator - string to use as the value separator in the string (optional; default None) dialect - csv.dialect object with the attributes of the vocabulary lookup file (default None) encoding - a string designating the input file encoding (optional; default None) (e.g., 'utf-8', 'mac_roman', 'latin_1', 'cp1252') returns: missingvocabdict - values in the checklist not found in the vocabulary file ''' functionname = 'missing_vocab_list_from_file()' if checklist is None or len(checklist) == 0: s = 'No list of values given in %s.' % functionname logging.debug(s) return None vocabdict = vocab_dict_from_file(vocabfile, key, separator, dialect, encoding) if vocabdict is None or len(vocabdict) == 0: s = 'No vocabdict constructed in %s.' % functionname logging.debug(s) return None missingvocabset = set() # Look through every value in the checklist for value in checklist: if separator is None: terms = [value] else: try: terms = value.split(separator) except Exception, e: s = 'Exception splitting value: %s Exception: %s ' % (value, e) s += 'in %s' % functionname logging.debug(s) terms = [value] # cop out newvalue = '' n = 0 for term in terms: if n == 0: newvalue = ustripstr(term) n = 1 else: newvalue = newvalue + separator + ustripstr(term) # If value or newvalue is in the vocabulary, nevermind if value in vocabdict or newvalue in vocabdict: pass # Otherwise, add the upper case, stripped value to the list else: missingvocabset.add(newvalue)
def matching_vocab_dict_from_file(checklist, vocabfile, key, separator=None, dialect=None, encoding=None): ''' Given a checklist of values, get matching values from a vocabulary file. Values can match exactly, or they can match after making them upper case and stripping whitespace. parameters: checklist - list of values to get from the vocabfile (required) vocabfile - full path to the vocabulary lookup file (required) key - the field or separator-separated fieldnames that hold the distinct values in the vocabulary file (required) separator - string to use as the value separator in the string (optional; default None) dialect - csv.dialect object with the attributes of the vocabulary lookup file (default None) encoding - a string designating the input file encoding (optional; default None) (e.g., 'utf-8', 'mac_roman', 'latin_1', 'cp1252') returns: matchingvocabdict - dictionary of complete vocabulary records matching the values in the checklist ''' functionname = 'matching_vocab_dict_from_file()' if checklist is None or len(checklist) == 0: s = 'No list of values given in %s.' % functionname logging.debug(s) return None #print 'checklist: %s' % checklist vocabdict = vocab_dict_from_file(vocabfile, key, separator, dialect, encoding) if vocabdict is None or len(vocabdict) == 0: s = 'No vocabdict constructed in %s' % functionname logging.debug(s) return None #print 'vocabdict: %s' % vocabdict matchingvocabdict = {} # Look through every value in the checklist for value in checklist: if separator is None: terms = [value] else: try: terms = value.split(separator) except Exception, e: s = 'Exception splitting value: %s Exception: %s ' % (value, e) s += 'in %s' % functionname logging.debug(s) terms = [value] # cop out newvalue = '' n = 0 for term in terms: if n == 0: newvalue = ustripstr(term) n = 1 else: newvalue = newvalue + separator + ustripstr(term) # If the simplified version of the value is in the dictionary, get the # vocabulary entry for it. 
if value in vocabdict or newvalue in vocabdict: matchingvocabdict[value] = vocabdict[newvalue]