def parse_props(inputfile, outputfile):
    ''' Parse the dynamic properties of every row of a csv file into separate columns and
        write the rows, with those columns appended, to a new file.
    parameters:
        inputfile - the full path to the input file (required)
        outputfile - the full path to the output file (required)
    returns:
        success - True if the task is completed, otherwise False
    '''
    functionname = 'parse_props()'

    # Check for required values
    if inputfile is None or len(inputfile) == 0:
        s = 'No input file given in %s.' % functionname
        logging.debug(s)
        return False

    if outputfile is None or len(outputfile) == 0:
        s = 'No output file given in %s.' % functionname
        logging.debug(s)
        return False

    # Determine the csv dialect of the inputfile
    dialect = csv_file_dialect(inputfile)

    # First pass: collect the names of all properties that parse_row() finds in any row.
    # NOTE(review): parse_row() is defined elsewhere; it is assumed to return a dictionary
    # of property name -> value for the row.
    propfields = set()
    with open(inputfile, 'rb') as csvfile:
        csvreader = csv.DictReader(csvfile, dialect=dialect)
        # DictReader.fieldnames is None for an empty file. Work on a copy of the header
        # so that the reader's own list is not modified.
        fieldnames = list(csvreader.fieldnames or [])
        for row in csvreader:
            propfields.update(parse_row(row).keys())

    # Append the new fields to the header. Sorting makes the column order reproducible
    # from run to run, and names already in the header are skipped so that no column
    # appears twice.
    fieldnames += sorted(propfields.difference(fieldnames))

    # Second pass: write every row with the parsed values in the added columns
    with open(outputfile, 'w') as csvout:
        csvwriter = csv.DictWriter(csvout, fieldnames, dialect=dialect)
        csvwriter.writeheader()
        with open(inputfile, 'rb') as csvfile:
            csvreader = csv.DictReader(csvfile, dialect=dialect)
            for row in csvreader:
                row.update(parse_row(row))
                csvwriter.writerow(row)

    s = 'Parsed csv written to %s in %s.' % (outputfile, functionname)
    logging.debug(s)

    # Success
    return True
def darwinize_list(termlist, dwccloudfile, namespace=None):
    ''' Translate the terms in a list to standard Darwin Core terms.
    parameters:
        termlist - list of values to translate (required)
        dwccloudfile - the vocabulary file for the Darwin Cloud (required)
        namespace - if given and containing 'y', prepend the namespace from the
            vocabulary and a ':' to every term that was translated
            (optional; default None) (e.g., 'y', 'n')
    returns:
        a list with all translatable terms translated, or None if there is no termlist
        or no Darwin Cloud vocabulary. Terms that can not be translated are returned
        stripped of surrounding whitespace, and terms that are empty after that are
        named UNNAMED_COLUMN_1, UNNAMED_COLUMN_2, etc.
    '''
    functionname = 'darwinize_list()'

    if termlist is None or len(termlist) == 0:
        s = 'No termlist given in %s.' % functionname
        logging.debug(s)
        return None

    # No need to check if dwccloudfile is given and exists,
    # darwin_cloud_vocab_dict_from_file() does that.
    darwinclouddict = darwin_cloud_vocab_dict_from_file(dwccloudfile)

    if darwinclouddict is None:
        s = 'No Darwin Cloud terms in %s.' % functionname
        logging.debug(s)
        return None

    addnamespace = namespace is not None and 'y' in namespace

    darwinizedlist = []
    # Counter for naming columns that have no name at all
    unnamedcount = 1
    for term in termlist:
        # The vocabulary is searched using the ustripstr() version of the term
        lookupterm = ustripstr(term)
        standard = None
        entry = None
        if lookupterm in darwinclouddict:
            entry = darwinclouddict[lookupterm]
            standard = entry['standard']

        if standard is not None and len(standard.strip()) > 0:
            newterm = standard
            if addnamespace == True:
                ns = entry.get('namespace')
                # Only prefix when the vocabulary actually has a namespace for the term,
                # otherwise the result would be ':term' or a TypeError on None.
                if ns is not None and len(ns.strip()) > 0:
                    newterm = ns + ':' + standard
        else:
            # No translation available, keep the original term
            newterm = term.strip()

        if len(newterm) == 0:
            newterm = 'UNNAMED_COLUMN_%s' % unnamedcount
            unnamedcount += 1
        darwinizedlist.append(newterm)

    return darwinizedlist
def terms_not_in_darwin_cloud(checklist, dwccloudfile, encoding=None, vetted=True,
    casesensitive=False):
    ''' Get the list of distinct values in a checklist that are not in the Darwin Cloud
        vocabulary. Verbatim values in the Darwin Cloud vocabulary should be lower-case
        and stripped already, so that is what must be matched here. The Darwin Cloud
        vocabulary should have the case-sensitive standard value.
    parameters:
        checklist - list of values to check against the target list (required)
        dwccloudfile - the vocabulary file for the Darwin Cloud (required)
        encoding - a string designating the input file encoding (optional; default None)
            (e.g., 'utf-8', 'mac_roman', 'latin_1', 'cp1252')
        vetted - set to False if unvetted values should also be returned (default True)
        casesensitive - set to True to compare the checklist to the vocabulary keys
            exactly as given; otherwise both sides are compared after ustripstr()
            (default False)
    returns:
        the values not in the Darwin Cloud vocabulary, as returned by not_in_list(), or
        None if there is no checklist or the vocabulary could not be loaded
    '''
    functionname = 'terms_not_in_darwin_cloud()'

    if checklist is None or len(checklist) == 0:
        s = 'No checklist given in %s.' % functionname
        logging.debug(s)
        return None

    dialect = csv_file_dialect(dwccloudfile)

    # Try to determine the encoding of the inputfile.
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(dwccloudfile)
        # csv_file_encoding() always returns an encoding if there is an input file.

    # No need to check if dwccloudfile is given and exists, vocab_dict_from_file() and
    # vetted_vocab_dict_from_file() do that.
    if vetted == True:
        darwinclouddict = vetted_vocab_dict_from_file(dwccloudfile, 'fieldname',
            dialect=dialect, encoding=encoding)
    else:
        darwinclouddict = vocab_dict_from_file(dwccloudfile, 'fieldname',
            dialect=dialect, encoding=encoding)

    # Without a vocabulary there is nothing to compare against
    if darwinclouddict is None:
        s = 'No Darwin Cloud terms in %s.' % functionname
        logging.debug(s)
        return None

    # Only the keys of the vocabulary are needed. list() works on Python 2 and 3.
    dwcloudlist = list(darwinclouddict)

    if casesensitive == True:
        return not_in_list(dwcloudlist, checklist)

    lowerdwclist = [ustripstr(term) for term in dwcloudlist]

    return not_in_list(lowerdwclist, checklist, function=ustripstr)
def darwin_cloud_vocab_dict_from_file(vocabfile):
    ''' Get a Darwin Cloud vocabulary as a dictionary from a file.
    parameters:
        vocabfile - path to the vocabulary file (required)
    returns:
        vocabdict - dictionary of complete vocabulary records, keyed on the value of the
            'fieldname' column, with each value being a dictionary of the remaining
            columns of that row. None if the file is not given, does not exist, or has
            no 'fieldname' column.
    '''
    functionname = 'darwin_cloud_vocab_dict_from_file()'

    if vocabfile is None or len(vocabfile) == 0:
        s = 'No vocabulary file given in %s.' % functionname
        logging.debug(s)
        return None

    if not os.path.isfile(vocabfile):
        s = 'Vocabulary file %s not found in %s.' % (vocabfile, functionname)
        logging.debug(s)
        return None

    dialect = csv_file_dialect(vocabfile)

    # The vocabulary file is read as utf-8
    header = read_header(vocabfile, dialect=dialect, encoding='utf-8')

    # Every row is keyed on its 'fieldname' value, so the column has to exist
    if header is None or 'fieldname' not in header:
        s = 'No fieldname column in vocabulary file %s in %s.' % (vocabfile, functionname)
        logging.debug(s)
        return None

    # Create a dictionary to hold the vocabulary
    vocabdict = {}

    # Iterate through all rows in the input file
    for row in read_csv_row(vocabfile, dialect=dialect, encoding='utf-8',
        header=True, fieldnames=header):
        # Make a complete copy of the row, without the key
        rowdict = copy.deepcopy(row)
        key = rowdict.pop('fieldname')
        vocabdict[key] = rowdict

    return vocabdict
def text_file_field_stripper(options):
    ''' Filter a text file into a new file based on matching a list of fields to keep.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - the directory in which the output will be written
            (optional; default './')
        inputfile - full path to the input file (required)
        outputfile - name of the output file, without path (required)
        separator - string that separates the values in termlist (e.g., '|')
            (optional; default None)
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
        termlist - fields to extract from the input file, either as a string (split on
            separator, if given) or as a list (required)
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output tsv file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    def option(name, default=None):
        # Look up an option, falling back to the default only if the option is absent
        # or options is not a dictionary. Other errors are not hidden.
        try:
            return options[name]
        except (KeyError, TypeError):
            return default

    ### Establish variables ###
    outputfile = None
    workspace = option('workspace', './')
    if workspace is None or len(workspace) == 0:
        # An empty workspace would otherwise put the output in the root directory
        workspace = './'

    ### Required inputs ###
    inputfile = option('inputfile')

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if not os.path.isfile(inputfile):
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    termlist = option('termlist')

    if termlist is None or len(termlist) == 0:
        message = 'No termlist given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    # Validate the output file before doing any reading of the input file
    outputfile = option('outputfile')

    if outputfile is None or len(outputfile) == 0:
        message = 'No output file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    separator = option('separator')
    encoding = option('encoding')
    outputformat = option('format', 'txt')

    # Make the list of terms to keep
    if isinstance(termlist, (list, tuple)):
        theterms = list(termlist)
    elif separator is None or len(separator.strip()) == 0:
        theterms = [termlist]
    else:
        theterms = termlist.split(separator)

    # Determine the file dialect
    inputdialect = csv_file_dialect(inputfile)

    # Determine the file encoding
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)
        # csv_file_encoding() always returns an encoding if there is an input file.
        # No need to check.

    header = read_header(inputfile, dialect=inputdialect, encoding=encoding)

    if header is None:
        message = 'Unable to read header from input file %s. %s' % (inputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Make a clean version of the input header
    cleaninputheader = clean_header(header)

    # Prepare the outputfile
    if outputformat is None or outputformat.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    # Make a clean version of the output header
    cleanoutputheader = clean_header(theterms)

    # Create the outputfile and write the new header to it
    write_header(outputfile, cleanoutputheader, outputdialect)

    # Check to see that the file was created
    if not os.path.isfile(outputfile):
        message = 'Outputfile %s was not created. %s' % (outputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Open the outputfile to start writing matching rows
    with open(outputfile, 'a') as outfile:
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8',
            fieldnames=cleanoutputheader)
        # Iterate through all rows in the input file
        for row in read_csv_row(inputfile, dialect=inputdialect, encoding=encoding,
            header=True, fieldnames=cleaninputheader):
            newrow = extract_fields_from_row(row, cleanoutputheader)
            writer.writerow(newrow)

    success = True
    artifacts['stripped_file'] = outputfile

    # Prepare the response dictionary
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def term_standardizer_report(inputfile, reportfile, vocabfile, key, separator=None,
    encoding=None, format=None):
    ''' Write a file with substitutions from a vocabfile for fields in a key and appended
        terms showing the original values.
    parameters:
        inputfile - full path to the input file (required)
        reportfile - full path to the output file (required)
        vocabfile - path to the vocabulary file (required)
        key - field or separator-separated fields to set (required)
        separator - string to use as the key and value separator
            (optional; default '|')
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - string signifying the csv.dialect of the report file ('csv' or 'txt')
            (optional; default: txt)
    returns:
        success - True if the report was written, else False
    '''
    functionname = 'term_standardizer_report()'

    # Validate all of the arguments before reading any file
    if reportfile is None or len(reportfile) == 0:
        s = 'No reportfile name given in %s.' % functionname
        logging.debug(s)
        return False

    if inputfile is None or len(inputfile) == 0:
        s = 'No inputfile file given in %s.' % functionname
        logging.debug(s)
        return False

    if not os.path.isfile(inputfile):
        s = 'Inputfile file %s not found in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    if key is None or len(key.strip()) == 0:
        s = 'No key given in %s.' % functionname
        logging.debug(s)
        return False

    if vocabfile is None or len(vocabfile) == 0:
        # The message has to be formatted before it is logged. logging.debug() returns
        # None, so applying % to its result raises a TypeError.
        s = 'No vocabulary file given in %s.' % functionname
        logging.debug(s)
        return False

    if not os.path.isfile(vocabfile):
        s = 'Vocabulary file %s not found in %s.' % (vocabfile, functionname)
        logging.debug(s)
        return False

    # Determine the dialect of the input file
    inputdialect = csv_file_dialect(inputfile)

    # Determine the encoding of the input file
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    # Read the header from the input file
    inputheader = read_header(inputfile, dialect=inputdialect, encoding=encoding)

    if inputheader is None:
        s = 'Unable to read header from input file %s in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    # Make sure there is a separator for the next step
    if separator is None or len(separator) == 0:
        separator = '|'

    # Make a list of the fields in the key by splitting it on the separator
    fieldlist = key.split(separator)

    # Search the cleaned up header for any field from the key
    cleanedinputheader = strip_list(inputheader)
    headerhaskey = any(field in cleanedinputheader for field in fieldlist)

    if not headerhaskey:
        s = 'No field from %s found ' % fieldlist
        s += 'in input file %s in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    # Get the vocabulary dictionary, but convert all entries using ustripstr. Assume
    # vocabulary file is encoded as utf-8.
    vocabdict = vocab_dict_from_file(vocabfile, key, encoding='utf-8',
        separator=separator, function=ustripstr)

    if vocabdict is None or len(vocabdict) == 0:
        s = 'Vocabulary file %s ' % vocabfile
        s += 'had zero recommendations in %s.' % functionname
        logging.debug(s)
        return False

    if format is None or format.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    # Create an output header that is the same as the input header with fields
    # appended to hold the original values of the key fields. Key fields that are not
    # in the input header are appended under their own names.
    outputheader = list(cleanedinputheader)
    for field in fieldlist:
        if field in outputheader:
            outputheader.append(field + '_orig')
        else:
            outputheader.append(field)

    # Create the outputfile and write the new header to it
    write_header(reportfile, outputheader, outputdialect)

    # Check to see if the outputfile was created
    if not os.path.isfile(reportfile):
        s = 'reportfile: %s not created in %s.' % (reportfile, functionname)
        logging.debug(s)
        return False

    # Open the outputfile to append rows having the added fields
    with open(reportfile, 'a') as outfile:
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8',
            fieldnames=outputheader)
        # Iterate through all rows in the input file
        for row in read_csv_row(inputfile, dialect=inputdialect, encoding=encoding,
            header=True, fieldnames=cleanedinputheader):
            # Set the _orig values for every field in the field list that exists in
            # the row
            for field in fieldlist:
                if field in row:
                    row[field + '_orig'] = row[field]

            # Construct a composite field value for the row to match a key in the
            # vocabulary file
            rowkey = extract_values_from_row(row, fieldlist, separator)

            # Get dictionary for recommended value for the ustripstr(rowkey)
            newvaluedict = recommended_value(vocabdict, ustripstr(rowkey))

            # Only make changes if there is a standardized value found
            if newvaluedict is not None:
                # ustripstr(rowkey) was found in the vocabulary
                standard = newvaluedict['standard']

                # Treat standard value that is None or only whitespace as ''
                if standard is None or len(standard.strip()) == 0:
                    standard = ''

                # Make a list of values given in standard
                newvalues = standard.split(separator)

                # Only make changes if the number of recommendation fields is the
                # same as the number of fields in the key
                if len(newvalues) == len(fieldlist):
                    for field, newvalue in zip(fieldlist, newvalues):
                        row[field] = newvalue

            writer.writerow(row)

    s = 'Report written to %s in %s.' % (reportfile, functionname)
    logging.debug(s)
    return True
def uuid_term_appender(inputfile, outputfile, key, guidtype=None, encoding=None,
    format=None):
    ''' Copy the input file to the output file, appending one new field in which every
        row gets an identifier produced by get_guid().
    parameters:
        inputfile - full path to the input file (required)
        outputfile - full path to the output file (required)
        key - name of the field to add. It must not already be in the header of the
            input file (required)
        guidtype - passed unchanged to get_guid() for every row (optional; default None)
        encoding - string signifying the encoding of the input file. Determined with
            csv_file_encoding() if not given (optional; default None) (e.g., 'utf-8')
        format - string signifying the csv.dialect of the output file ('csv' or 'txt')
            (optional; default: txt)
    returns:
        success - True if the output file was written, else False
    '''
    functionname = 'uuid_term_appender()'

    if outputfile is None or len(outputfile) == 0:
        logging.debug('No outputfile name given in %s.' % functionname)
        return False

    if inputfile is None or len(inputfile) == 0:
        logging.debug('No inputfile file given in %s.' % functionname)
        return False

    if not os.path.isfile(inputfile):
        logging.debug('Inputfile file %s not found in %s.' % (inputfile, functionname))
        return False

    # Find out how to read the input file: first its dialect, then its encoding
    sourcedialect = csv_file_dialect(inputfile)

    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    sourceheader = read_header(inputfile, dialect=sourcedialect, encoding=encoding)

    if sourceheader is None:
        logging.debug('Unable to read header from input file %s in %s.'
            % (inputfile, functionname))
        return False

    if key is None or len(key.strip()) == 0:
        logging.debug('No key given in %s.' % functionname)
        return False

    # The new field must not collide with a field that is already there
    if key in sourceheader:
        msg = ('field %s ' % key) + ('already exists in file %s ' % inputfile)
        msg += 'in %s.' % functionname
        logging.debug(msg)
        return False

    # Tab-separated output unless another format was asked for
    wantstsv = format is None or format.lower() == 'txt'
    targetdialect = tsv_dialect() if wantstsv else csv_dialect()

    # The output header is the input header followed by the new field
    targetheader = sourceheader + [key]

    # Start the output file with its header, then make sure it is there
    write_header(outputfile, targetheader, targetdialect)

    if not os.path.isfile(outputfile):
        logging.debug('outputfile: %s was not created in %s.'
            % (outputfile, functionname))
        return False

    # Append every input row, extended with a freshly generated identifier
    with open(outputfile, 'a') as target:
        rowwriter = csv.DictWriter(target, dialect=targetdialect, encoding='utf-8',
            fieldnames=targetheader)
        sourcerows = read_csv_row(inputfile, dialect=sourcedialect, encoding=encoding,
            header=True, fieldnames=sourceheader)
        for record in sourcerows:
            record[key] = get_guid(guidtype)
            rowwriter.writerow(record)

    logging.debug('Output file written to %s in %s.' % (outputfile, functionname))
    return True
def term_setter_report(inputfile, reportfile, key, constantvalues=None, separator=None,
    encoding=None, format=None):
    ''' Write a file substituting constants for fields that already exist in an input
        file and with added fields with constants for fields that do not already exist
        in an inputfile. Field name matching is exact.
    parameters:
        inputfile - full path to the input file (required)
        reportfile - full path to the output file (required)
        key - field or separator-separated fields to set (required)
        constantvalues - value or separator-separated values to set the field(s) to.
            There must be as many values as there are fields in key (required)
        separator - string to use as the key and value separator
            (optional; default '|')
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - string signifying the csv.dialect of the report file ('csv' or 'txt')
            (optional; default: txt)
    returns:
        success - True if the report was written, else False
    '''
    functionname = 'term_setter_report()'

    # Validate all of the arguments before reading any file
    if reportfile is None or len(reportfile) == 0:
        s = 'No reportfile name given in %s.' % functionname
        logging.debug(s)
        return False

    if inputfile is None or len(inputfile) == 0:
        s = 'No inputfile file given in %s.' % functionname
        logging.debug(s)
        return False

    if not os.path.isfile(inputfile):
        s = 'Inputfile file %s not found in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    if key is None or len(key.strip()) == 0:
        s = 'No key given in %s.' % functionname
        logging.debug(s)
        return False

    if constantvalues is None or len(constantvalues) == 0:
        s = 'No constantvalues given in %s.' % functionname
        logging.debug(s)
        return False

    # Make sure there is a separator for the next step
    if separator is None or len(separator) == 0:
        separator = '|'

    # Get the fields to set by splitting the key with the separator
    fields = key.split(separator)

    # Get the values to set by splitting the constantvalues with the separator
    addedvalues = constantvalues.split(separator)

    # Abort if there is a mismatch in the lengths of the field and constants lists
    if len(fields) != len(addedvalues):
        s = 'length of field list: %s ' % key
        s += 'does not match length of constants list: %s ' % constantvalues
        s += 'in %s.' % functionname
        logging.debug(s)
        return False

    # Determine the dialect of the input file
    inputdialect = csv_file_dialect(inputfile)

    # Determine the encoding of the input file
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    # Read the header from the input file
    inputheader = read_header(inputfile, dialect=inputdialect, encoding=encoding)

    if inputheader is None:
        s = 'Unable to read header from input file %s in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    if format is None or format.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    # Make an outputheader that is a copy of the inputheader, then add to it the fields
    # that are not in the inputheader
    outputheader = list(inputheader)
    for field in fields:
        if field not in outputheader:
            outputheader.append(field)

    # Create the outputfile and write the new header to it
    write_header(reportfile, outputheader, outputdialect)

    # Check to see if the outputfile was created
    if not os.path.isfile(reportfile):
        s = 'reportfile: %s was not created in %s.' % (reportfile, functionname)
        logging.debug(s)
        return False

    # Open the outputfile to append rows with fields set to constant values
    with open(reportfile, 'a') as outfile:
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8',
            fieldnames=outputheader)
        # Iterate through all rows in the input file
        for row in read_csv_row(inputfile, dialect=inputdialect, encoding=encoding,
            header=True, fieldnames=inputheader):
            # Set the value of every field in the key to its constant
            for field, value in zip(fields, addedvalues):
                row[field] = value
            # Write the updated row to the outputfile
            writer.writerow(row)

    s = 'Report written to %s in %s.' % (reportfile, functionname)
    logging.debug(s)
    return True
def darwinize_header(options):
    ''' Translate field names from input file to Darwin Core field names in outputfile
        using a Darwin Cloud vocabulary lookup.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional; default './')
        inputfile - full path to the input file (required)
        dwccloudfile - full path to the vocabulary file containing the Darwin Cloud
            terms (required)
        outputfile - name of the output file, without path (required)
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - output file format (e.g., 'csv' or 'txt'). If not given, the dialect of
            the input file is used for the output file (optional; default None)
        namespace - prepend namespace to fields that were darwinized
            (optional; default 'n') (e.g., 'y', 'n')
    returns a dictionary with information about the results
        workspace - path to the directory where the outputfile was written
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    def option(name, default=None):
        # Look up an option, falling back to the default only if the option is absent
        # or options is not a dictionary. Other errors are not hidden.
        try:
            return options[name]
        except (KeyError, TypeError):
            return default

    ### Establish variables ###
    outputfile = None
    workspace = option('workspace', './')
    if workspace is None or len(workspace) == 0:
        # An empty workspace would otherwise put the output in the root directory
        workspace = './'

    ### Required inputs ###
    inputfile = option('inputfile')

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if not os.path.isfile(inputfile):
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    dwccloudfile = option('dwccloudfile')

    if dwccloudfile is None or len(dwccloudfile) == 0:
        message = 'No Darwin Cloud vocabulary file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if not os.path.isfile(dwccloudfile):
        message = 'Darwin Cloud vocabulary file not found. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    outputfile = option('outputfile')

    if outputfile is None or len(outputfile) == 0:
        message = 'No output file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    encoding = option('encoding')
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    namespace = option('namespace', 'n')

    inputdialect = csv_file_dialect(inputfile)

    # Without a format the output file gets the dialect of the input file
    outputformat = option('format')
    if outputformat is None or len(outputformat) == 0:
        outputdialect = inputdialect
    elif outputformat.lower() == 'csv':
        outputdialect = csv_dialect()
    else:
        outputdialect = tsv_dialect()

    header = read_header(inputfile, dialect=inputdialect, encoding=encoding)
    # darwinize_list() returns None if there is no header to translate
    dwcheader = darwinize_list(header, dwccloudfile, namespace)

    if dwcheader is None:
        message = 'Unable to create darwinized header. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Write the new header to the outputfile
    if write_header(outputfile, dwcheader, dialect=outputdialect) == False:
        message = 'Unable to write header to output file. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Read the rows of the input file, append them to the output file after the
    # header with columns in the same order. The writer uses the original header for
    # its field names, so the values line up with the darwinized header already written.
    # NOTE(review): other callers of read_csv_row() in this file pass header=True and
    # fieldnames; confirm that the defaults used here do not yield the header row as data.
    with open(outputfile, 'a') as outfile:
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8',
            fieldnames=header)
        for row in read_csv_row(inputfile, inputdialect, encoding):
            writer.writerow(row)

    success = True
    artifacts['darwinized_header_file'] = outputfile
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def text_file_aggregator(options):
    ''' Join the contents of files in a given path. Headers and encodings are not assumed
        to be the same. Write a file containing the joined files with one header line.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional; default './')
        inputpath - full path to the input file set, used as a glob pattern to find the
            files to aggregate (e.g., './workspace/thefiles*.txt') (required)
        outputfile - name of the output file, without path
            (optional; default 'aggregate_' followed by a UUID, a '.' and the format)
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output file
        aggregaterowcount - the number of rows in the aggregated file, not counting
            header
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'aggregaterowcount', 'success', 'message',
        'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    ### Custom outputs ###
    aggregaterowcount = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    def option(name, default=None):
        # Look up an option, falling back to the default only if the option is absent
        # or options is not a dictionary. Other errors are not hidden.
        try:
            return options[name]
        except (KeyError, TypeError):
            return default

    ### Establish variables ###
    outputfile = None
    workspace = option('workspace', './')
    if workspace is None or len(workspace) == 0:
        # An empty workspace would otherwise put the output in the root directory
        workspace = './'

    ### Required inputs ###
    inputpath = option('inputpath')

    if inputpath is None or len(inputpath) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, aggregaterowcount, success, message,
            artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    outputformat = option('format')
    if outputformat is None or len(outputformat) == 0:
        outputformat = 'txt'

    if outputformat.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    outputfile = option('outputfile')
    if outputfile is None or len(outputfile) == 0:
        # The format is the file extension, so it needs the '.' in front of it
        outputfile = 'aggregate_' + str(uuid.uuid1()) + '.' + outputformat

    # Construct the output file path in the workspace
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Create the composite header. Let composite_header determine the dialects and
    # encodings of the files to aggregate.
    aggregateheader = composite_header(inputpath)

    # Without a header there are no field names to give to the writer
    if aggregateheader is None:
        message = 'Unable to create composite header for %s. %s' % (inputpath, __version__)
        returnvals = [workspace, outputfile, aggregaterowcount, success, message,
            artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    aggregaterowcount = 0

    # Open a file to write the aggregated results in chosen format and utf-8.
    with open(outputfile, 'w') as outfile:
        # Fields in a row that are not in the composite header are ignored
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8',
            fieldnames=aggregateheader, extrasaction='ignore')
        writer.writeheader()
        for filename in glob.glob(inputpath):
            # Every input file is read with its own dialect and encoding
            inputdialect = csv_file_dialect(filename)
            inputencoding = csv_file_encoding(filename)
            with open(filename, 'rU') as inputfile:
                reader = csv.DictReader(utf8_data_encoder(inputfile, inputencoding),
                    dialect=inputdialect, encoding=inputencoding)
                for line in reader:
                    try:
                        writer.writerow(line)
                        aggregaterowcount += 1
                    except Exception:
                        # Stop at the first row that can not be written and report it
                        message = 'failed to write line:\n%s\n' % line
                        message += 'to file %s. %s' % (filename, __version__)
                        returnvals = [workspace, outputfile, aggregaterowcount, success,
                            message, artifacts]
                        logging.debug('message:\n%s' % message)
                        return response(returnvars, returnvals)

    success = True
    artifacts['aggregated_file'] = outputfile

    returnvals = [workspace, outputfile, aggregaterowcount, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def distinct_vocabs_to_file(vocabfile, valuelist, key, separator=None, dialect=None):
    ''' Add distinct new verbatim values from a valuelist to a vocabulary file. Always
        write new values as utf-8. The vocabulary file is created with a header if it
        does not exist yet.
    parameters:
        vocabfile - full path to the vocabulary file (required)
        valuelist - list of values to check for adding to the vocabulary file (required)
        key - the field or separator-separated fieldnames that hold the distinct values
            in the vocabulary file (required)
        separator - string to use as the value separator in the string
            (optional; default None)
        dialect - a csv.dialect object with the attributes of the vocabulary file
            (default None, in which case csv_file_dialect() is asked for it)
    returns:
        newvaluelist - the list of values added to the vocabulary file, as produced by
            not_in_list(), or None if no vocabfile was given, there was nothing new to
            add, or the vocabulary file could not be created
    '''
    functionname = 'distinct_vocabs_to_file()'

    if vocabfile is None or len(vocabfile.strip()) == 0:
        s = 'No vocab file given in %s.' % functionname
        logging.debug(s)
        return None

    # Determine the dialect of the input file
    if dialect is None:
        dialect = csv_file_dialect(vocabfile)
        # csv_file_dialect() always returns a dialect if there is an input file.
        # No need to check.

    # No need to check if valuelist is given, not_in_list() does that
    # Get the distinct verbatim values from the vocab file
    vocablist = extract_values_from_file(vocabfile, [key], separator=separator,
        dialect=dialect, encoding='utf-8')

    # Get the values not already in the vocab file
    newvaluelist = not_in_list(vocablist, valuelist)

    if newvaluelist is None or len(newvaluelist) == 0:
        s = 'No new values found for %s in %s' % (vocabfile, functionname)
        logging.debug(s)
        return None

    # Second attempt at detection, kept from the original flow
    if dialect is None:
        dialect = csv_file_dialect(vocabfile)

    fieldnames = vocabheader(key, separator)

    if not os.path.isfile(vocabfile):
        # New vocabulary file: start it with a header line.
        headerdialect = vocab_dialect()
        write_header(vocabfile, fieldnames, headerdialect)
        if dialect is None:
            # Nothing could be detected for a file that did not exist, so append the
            # rows in the same dialect the header was just written with instead of
            # handing the writer an undetermined dialect.
            dialect = headerdialect

    # Still no file means the header could not be written
    if not os.path.isfile(vocabfile):
        s = 'Vocab file %s not found in %s.' % (vocabfile, functionname)
        logging.debug(s)
        return None

    # NOTE(review): the encoding keyword implies csv is not the standard library module
    # here (presumably unicodecsv) - confirm against the file's imports.
    with open(vocabfile, 'a') as csvfile:
        writer = csv.DictWriter(csvfile, dialect=dialect, encoding='utf-8',
            fieldnames=fieldnames)
        for term in newvaluelist:
            # Start every row from a fresh copy of the template row so that rows do
            # not share state
            row = copy.deepcopy(vocabrowdict)
            row[key] = term
            writer.writerow(row)

    s = 'Vocabulary file written to %s in %s.' % (vocabfile, functionname)
    logging.debug(s)
    return newvaluelist
def term_token_count_from_file(inputfile, termname, dialect=None, encoding=None):
    ''' Make a dictionary of tokens for a given term in a file along with the number of
        times each occurs. Tokens are the maximal runs of word characters (regular
        expression \\w) in the values of the term.
    parameters:
        inputfile - full path to the input file (required)
        termname - term for which to count rows (required)
        dialect - csv.dialect object with the attributes of the input file; detected
            with csv_file_dialect() if not given (default None)
        encoding - a string designating the input file encoding; detected with
            csv_file_encoding() if not given (optional; default None)
            (e.g., 'utf-8', 'mac_roman', 'latin_1', 'cp1252')
    returns:
        tokens - a dictionary with the keys
            tokenlist - dictionary keyed by token, each value a dictionary with
                rowcount (number of rows containing the token) and
                totalcount (number of occurrences of the token)
            rowcount - number of rows read
            tokencount - total number of tokens found
            input - the inputfile
            term - the termname
        0 is returned instead if the inputfile or termname is missing or the inputfile
        does not exist; None if the termname is not in the header of the inputfile
    '''
    functionname = 'term_token_count_from_file()'

    if inputfile is None or len(inputfile) == 0:
        s = 'No input file given in %s.' % functionname
        logging.debug(s)
        return 0

    if not os.path.isfile(inputfile):
        s = 'File %s not found in %s.' % (inputfile, functionname)
        logging.debug(s)
        return 0

    if termname is None or len(termname) == 0:
        s = 'No term name given in %s.' % functionname
        logging.debug(s)
        return 0

    # Determine the dialect of the input file
    if dialect is None:
        dialect = csv_file_dialect(inputfile)
        # csv_file_dialect() always returns a dialect if there is an input file.
        # No need to check.

    # Determine the encoding of the input file
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)
        # csv_file_encoding() always returns an encoding if there is an input file.
        # No need to check.

    inputheader = read_header(inputfile, dialect=dialect, encoding=encoding)

    # A header that could not be read cannot contain the term either
    if inputheader is None or termname not in inputheader:
        s = 'Term %s not found in file %s ' % (termname, inputfile)
        s += 'in %s.' % functionname
        logging.debug(s)
        return None

    rowcount = 0
    tokencount = 0
    tokenlist = {}

    for row in read_csv_row(inputfile, dialect, encoding):
        rowcount += 1

        # Look the value up afresh for every row. A row without the term counts as
        # empty rather than reusing whatever was left over from an earlier row.
        try:
            value = row[termname]
        except (KeyError, IndexError, TypeError):
            value = None

        if value is None or len(value.strip()) == 0:
            # Rows without content contribute to rowcount only
            continue

        # Replace everything that is not a word character with a space, then split on
        # whitespace to get the tokens
        wordlist = re.sub(r"[^\w]", " ", value).split()
        tokencount += len(wordlist)

        # Occurrences of each token within this row
        rowtotals = {}
        for token in wordlist:
            rowtotals[token] = rowtotals.get(token, 0) + 1

        # Fold the counts for this row into the totals for the file. items() is used
        # rather than iteritems() so that this works on Python 2 and 3 alike.
        for token, total in rowtotals.items():
            entry = tokenlist.setdefault(token, {'rowcount': 0, 'totalcount': 0})
            entry['rowcount'] += 1
            entry['totalcount'] += total

    tokens = {'tokenlist': tokenlist}
    tokens['rowcount'] = rowcount
    tokens['tokencount'] = tokencount
    tokens['input'] = inputfile
    tokens['term'] = termname
    return tokens