def writevocabheader(fullpath, fieldnames, dialect=None):
    ''' Write a vocabulary header to a file in utf-8 using the chosen dialect.
    parameters:
        fullpath - the full path to the file to write into (required)
        fieldnames - list of field names in the header (required)
        dialect - csv.dialect object with the attributes of the vocabulary lookup file
            (default None)
    returns:
        success - True if the header was written to the file, otherwise False
    '''
    functionname = 'writevocabheader()'

    # Guard clauses: the target path and the field list are both required.
    if not fullpath:
        logging.debug('No vocabulary file given in %s.' % functionname)
        return False

    if not fieldnames:
        logging.debug('No list of field names given in %s.' % functionname)
        return False

    # Fall back to the standard tab-separated dialect when none is supplied.
    if dialect is None:
        dialect = tsv_dialect()

    if not write_header(fullpath, fieldnames, dialect):
        logging.debug('No header written to file %s in %s.' % (fullpath, functionname))
        return False

    logging.debug('Header written to %s in %s.' % (fullpath, functionname))
    return True
def vocab_dialect():
    ''' Get a dialect object with properties for vocabulary management files.
    parameters:
        None
    returns:
        dialect - a csv.dialect object with TSV attributes
    '''
    # Vocabulary management files share the standard tab-separated dialect.
    return tsv_dialect()
def term_list_report(reportfile, termlist, key, separator=None, format=None):
    ''' Write a report with a list of terms.
    parameters:
        reportfile - full path to the output report file (optional)
        termlist - list of terms to report (required)
        key - the field or separator-separated fieldnames that hold the distinct
            values in the vocabulary file (required)
        separator - string to use as the value separator in the string
            (optional; default None)
        format - string signifying the csv.dialect of the report file
            ('csv' or 'txt')
    returns:
        success - True if the report was written, else False
    '''
    functionname = 'term_list_report()'

    if not termlist:
        logging.debug('No term list given in %s.' % functionname)
        return False

    if not reportfile:
        logging.debug('No recommendation file name given in %s.' % functionname)
        return False

    # The header for the report comes from the key and separator.
    fieldnames = vocabheader(key, separator)

    # Choose the report dialect; 'csv' is the default.
    if format is None or format.lower() == 'csv':
        dialect = csv_dialect()
    else:
        dialect = tsv_dialect()

    # Create the outputfile and write the new header to it
    write_header(reportfile, fieldnames, dialect)

    if not os.path.isfile(reportfile):
        logging.debug('reportfile: %s not created in %s.' % (reportfile, functionname))
        return False

    # The key may name several component fields joined by the separator; this
    # split does not change per row, so compute it once.
    keyfields = [key] if separator is None else key.split(separator)

    with open(reportfile, 'a') as csvfile:
        writer = csv.DictWriter(csvfile, dialect=dialect, encoding='utf-8',
                                fieldnames=fieldnames)
        for value in termlist:
            row = {key: value, 'standard': '', 'vetted': '0'}
            # For a composite key, also populate each component field.
            if len(keyfields) > 1:
                for keyfield in keyfields:
                    row[keyfield] = value
            writer.writerow(row)

    logging.debug('Report written to %s in %s.' % (reportfile, functionname))
    return True
def term_completeness_report(reportfile, fieldcountdict, format=None):
    ''' Write a report with a list of fields and the number of times they are
        populated.
    parameters:
        reportfile - full path to the output report file (optional)
        fieldcountdict - dictionary of field names and the number of rows in which
            they are populated in the inputfile (required)
        format - string signifying the csv.dialect of the report file
            ('csv' or 'txt')
    returns:
        success - True if the report was written, else False
    '''
    functionname = 'term_completeness_report()'

    if fieldcountdict is None or len(fieldcountdict) == 0:
        s = 'No field count dictionary given in %s.' % functionname
        logging.debug(s)
        return False

    if reportfile is None or len(reportfile) == 0:
        s = 'No recommendation file name given in %s.' % functionname
        logging.debug(s)
        return False

    if format is None or format.lower() == 'csv':
        dialect = csv_dialect()
    else:
        dialect = tsv_dialect()

    # Make an alphabetically sorted list of field names. Iterating the dictionary
    # yields its keys directly; the original walked iteritems() and discarded the
    # values.
    fieldlist = sorted(fieldcountdict)

    outputheader = ['field', 'count']

    # Create the outputfile and write the new header to it
    write_header(reportfile, outputheader, dialect)

    if os.path.isfile(reportfile) == False:
        s = 'reportfile: %s not created in %s.' % (reportfile, functionname)
        logging.debug(s)
        return False

    with open(reportfile, 'a') as csvfile:
        writer = csv.DictWriter(csvfile, dialect=dialect, encoding='utf-8',
                                fieldnames=outputheader)
        for field in fieldlist:
            writer.writerow({'field': field, 'count': fieldcountdict[field]})

    s = 'Report written to %s in %s.' % (reportfile, functionname)
    logging.debug(s)
    return True
def term_value_count_report(reportfile, termcountlist, termname='value', format=None):
    ''' Write a report of the counts of values for the term.
    parameters:
        reportfile - full path to the output report file (required)
        termcountlist - list of terms with counts, each item indexable as
            (term, count) (required)
        termname - name of the term for which counts were made
            (optional; default 'value')
        format - string signifying the csv.dialect of the report file
            ('csv' or 'txt')
    returns:
        success - True if report was written or if there is nothing to write,
            else False
    '''
    functionname = 'term_value_count_report()'

    if reportfile is None or len(reportfile) == 0:
        s = 'No report file given in %s.' % functionname
        logging.debug(s)
        return False

    # An empty count list is not an error; there is simply nothing to write.
    if termcountlist is None or len(termcountlist) == 0:
        s = 'No term count list given in %s.' % functionname
        logging.debug(s)
        return True

    # Case-insensitive format check, consistent with the other report writers.
    if format is None or format.lower() == 'csv':
        dialect = csv_dialect()
    else:
        dialect = tsv_dialect()

    countreporttermlist = [termname, 'count']

    # Create the outputfile and write the new header to it
    write_header(reportfile, countreporttermlist, dialect)

    if os.path.isfile(reportfile) == False:
        s = 'reportfile: %s not created in %s' % (reportfile, functionname)
        logging.debug(s)
        return False

    with open(reportfile, 'a') as csvfile:
        # Consistency fix: write in utf-8 like every other report writer in this
        # module; the original omitted the encoding here.
        writer = csv.DictWriter(csvfile, dialect=dialect, encoding='utf-8',
                                fieldnames=countreporttermlist)
        for item in termcountlist:
            writer.writerow({termname: item[0], 'count': item[1]})

    return True
def token_report(reportfile, tokens, dialect=None): ''' Write a term token report to a file. parameters: reportfile - full path to the report file (optional) tokens - dictionary of token occurrences (required) (see output from term_token_count_from_file() dialect - csv.dialect object with the attributes of the report file (default None) returns: True if the report was written, otherwise False ''' functionname = 'token_report()' if tokens is None or len(tokens) == 0: s = 'No token dictionary given in %s.' % functionname logging.debug(s) return False if dialect is None: dialect = tsv_dialect() if reportfile is not None: # Create the outputfile and write the new header to it write_header(reportfile, tokenreportfieldlist, dialect) if os.path.isfile(reportfile) == False: s = 'File %s not found in %s.' % (reportfile, functionname) logging.debug(s) return False with open(reportfile, 'a') as csvfile: writer = csv.DictWriter(csvfile, dialect=dialect, \ encoding='utf-8', fieldnames=tokenreportfieldlist) for key, value in tokens['tokenlist'].iteritems(): writer.writerow({'token':key, 'rowcount':value['rowcount'], \ 'totalcount':value['totalcount'] }) else: # Write the report to stdout for key, value in tokens['tokenlist'].iteritems(): print 'token: %s rowcount: %s totalcount: %s' % (key, value['rowcount'], \ value['totalcount']) return True
def write_core_csv_file(dwcareader, outputfile):
    ''' Create a csv file from the Darwin Core Reader.
    parameters:
        dwcareader - Darwin Core Reader class instance
        outputfile - the path to the csv file
    returns:
        rowcount - the number of rows in the core file, or None if the header
            could not be written
    '''
    # Get the fully qualified field names from the Darwin Core Reader
    termnames = list(dwcareader.descriptor.core.terms)

    # Make a list of field names without full qualification, ordered as they are in
    # Darwin Core
    shorttermnames = dwc_ordered_header(short_term_names(termnames))
    dialect = tsv_dialect()

    if not write_header(outputfile, shorttermnames, dialect):
        return None

    rowcount = 0
    with open(outputfile, 'a') as outfile:
        writer = csv.DictWriter(outfile, dialect=dialect,
                                fieldnames=shorttermnames, encoding='utf-8')
        for row in dwcareader:
            # Bug fix: build a fresh dict for every row. The original reused one
            # dict across the whole loop, so a row lacking a field silently
            # inherited the previous row's value for it.
            rowout = {}
            for term in row.data:
                rowout[shortname(term)] = row.data[term].encode('utf-8')
            writer.writerow(rowout)
            rowcount += 1

    return rowcount
def text_file_field_stripper(options):
    ''' Filter a text file into a new file based on matching a list of fields to keep.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - the directory in which the output will be written (optional)
        inputfile - full path to the input file (required)
        outputfile - name of the output file, without path (required)
        separator - string that separates the values in termlist (e.g., '|')
            (optional; default None)
        encoding - string signifying the encoding of the input file. If known, it
            speeds up processing a great deal. (optional; default None)
            (e.g., 'utf-8')
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
        termlist - list of fields to extract from the input file (required)
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output tsv file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    setup_actor_logging(options)

    logging.debug( 'Started %s' % __version__ )
    logging.debug( 'options: %s' % options )

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputfile = None
    outputfile = None
    format = 'txt'
    termlist = None
    separator = None
    encoding = None

    ### Required inputs ###
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile = options['inputfile']
    except:
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if os.path.isfile(inputfile) == False:
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    try:
        termlist = options['termlist']
    except:
        pass

    if termlist is None or len(termlist) == 0:
        message = 'No termlist given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    try:
        separator = options['separator']
    except:
        pass

    try:
        encoding = options['encoding']
    except:
        pass

    # Make the list of terms to keep. Fix: the original computed this split
    # twice, once here and once again just before writing the output header.
    if separator is None or len(separator.strip()) == 0:
        theterms = [termlist]
    else:
        theterms = termlist.split(separator)

    # Determine the file dialect
    inputdialect = csv_file_dialect(inputfile)

    # Determine the file encoding
    if encoding is None or len(encoding.strip()) == 0:
        # csv_file_encoding() always returns an encoding if there is an input file.
        encoding = csv_file_encoding(inputfile)

    # Read the header of the inputfile
    header = read_header(inputfile, dialect=inputdialect, encoding=encoding)

    # Make a clean version of the input header
    cleaninputheader = clean_header(header)

    try:
        format = options['format']
    except:
        pass

    try:
        outputfile = options['outputfile']
    except:
        pass

    if outputfile is None or len(outputfile) == 0:
        message = 'No output file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Prepare the outputfile dialect
    if format is None or format.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    # Make a clean version of the output header
    cleanoutputheader = clean_header(theterms)

    # Create the outputfile and write the new header to it
    write_header(outputfile, cleanoutputheader, outputdialect)

    # Check to see that the file was created
    if os.path.isfile(outputfile) == False:
        message = 'Outputfile %s was not created. %s' % (outputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        return response(returnvars, returnvals)

    # Open the outputfile to start writing matching rows
    with open(outputfile, 'a') as outfile:
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8',
                                fieldnames=cleanoutputheader)
        # Iterate through all rows in the input file, keeping only the requested
        # fields
        for row in read_csv_row(inputfile, dialect=inputdialect, encoding=encoding,
                                header=True, fieldnames=cleaninputheader):
            newrow = extract_fields_from_row(row, cleanoutputheader)
            writer.writerow(newrow)

    success = True
    artifacts['stripped_file'] = outputfile

    # Prepare the response dictionary
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def term_standardizer_report(inputfile, reportfile, vocabfile, key, separator=None,
                             encoding=None, format=None):
    ''' Write a file with substitutions from a vocabfile for fields in a key and
        appended terms showing the original values.
    parameters:
        inputfile - full path to the input file (required)
        reportfile - full path to the output file (required)
        vocabfile - path to the vocabulary file (required)
        key - field or separator-separated fields to set (required)
        separator - string to use as the key and value separator
            (optional; default '|')
        encoding - string signifying the encoding of the input file. If known, it
            speeds up processing a great deal. (optional; default None)
            (e.g., 'utf-8')
        format - string signifying the csv.dialect of the report file
            ('csv' or 'txt') (optional; default: txt)
    returns:
        success - True if the report was written, else False
    '''
    functionname = 'term_standardizer_report()'

    if reportfile is None or len(reportfile) == 0:
        s = 'No reportfile name given in %s.' % functionname
        logging.debug(s)
        return False

    if inputfile is None or len(inputfile) == 0:
        s = 'No inputfile file given in %s.' % functionname
        logging.debug(s)
        return False

    if os.path.isfile(inputfile) == False:
        s = 'Inputfile file %s not found in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    # Determine the dialect of the input file
    inputdialect = csv_file_dialect(inputfile)

    # Determine the encoding of the input file if it was not given
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    # Read the header from the input file
    inputheader = read_header(inputfile, dialect=inputdialect, encoding=encoding)

    if inputheader is None:
        s = 'Unable to read header from input file %s in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    if key is None or len(key.strip()) == 0:
        s = 'No key given in %s.' % functionname
        logging.debug(s)
        return False

    # Make sure there is a separator for the next step
    if separator is None or len(separator) == 0:
        separator = '|'

    # Make a list of the fields in the key by splitting it on the separator
    fieldlist = key.split(separator)

    # Search the cleaned up header for any field from the key
    cleanedinputheader = strip_list(inputheader)
    headerhaskey = any(field in cleanedinputheader for field in fieldlist)

    if headerhaskey == False:
        s = 'No field from %s found ' % fieldlist
        s += 'in input file %s in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    if vocabfile is None or len(vocabfile) == 0:
        # Bug fix: the original wrote logging.debug('...') % functionname, which
        # applied % to logging.debug()'s return value (None) and raised a
        # TypeError on this path.
        logging.debug('No vocabulary file given in %s.' % functionname)
        return False

    if os.path.isfile(vocabfile) == False:
        s = 'Vocabulary file %s not found in %s.' % (vocabfile, functionname)
        logging.debug(s)
        return False

    # Get the vocabulary dictionary, but convert all entries using ustripstr. Assume
    # vocabulary file is encoded as utf-8.
    vocabdict = vocab_dict_from_file(vocabfile, key, encoding='utf-8', \
        separator=separator, function=ustripstr)

    if len(vocabdict) == 0:
        s = 'Vocabulary file %s ' % vocabfile
        s += 'had zero recommendations in %s.' % functionname
        logging.debug(s)
        return False

    # Choose the output dialect once; the original computed the identical value
    # twice, leaving an unused 'dialect' variable.
    if format is None or format.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    # Create an output header that is the same as the input header with fields
    # appended to hold the original values of the key fields
    outputheader = cleanedinputheader
    for field in fieldlist:
        if field in outputheader:
            outputheader = outputheader + [field + '_orig']
        else:
            outputheader = outputheader + [field]

    # Create the outputfile and write the new header to it
    write_header(reportfile, outputheader, outputdialect)

    # Check to see if the outputfile was created
    if os.path.isfile(reportfile) == False:
        s = 'reportfile: %s not created in %s.' % (reportfile, functionname)
        logging.debug(s)
        return False

    # Open the outputfile to append rows having the added fields
    with open(reportfile, 'a') as outfile:
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8',
                                fieldnames=outputheader)
        # Iterate through all rows in the input file
        for row in read_csv_row(inputfile, dialect=inputdialect, encoding=encoding,
                                header=True, fieldnames=cleanedinputheader):
            # Preserve the original value of every key field present in the row
            for field in fieldlist:
                if field in row:
                    row[field + '_orig'] = row[field]

            # Construct a composite field value for the row to match a key in the
            # vocabulary file
            rowkey = extract_values_from_row(row, fieldlist, separator)

            # Get the dictionary of the recommended value for ustripstr(rowkey)
            newvaluedict = recommended_value(vocabdict, ustripstr(rowkey))

            # Only make changes if there is a standardized value found
            if newvaluedict is not None:
                # ustripstr(rowkey) was found in the vocabulary
                standard = newvaluedict['standard']

                # Treat standard value that is None or only whitespace as ''
                if standard is None or len(standard.strip()) == 0:
                    standard = ''

                # Make a list of values given in standard
                newvalues = standard.split(separator)

                # Only make changes if the number of recommendation fields is the
                # same as the number of fields in the key
                if len(newvalues) == len(fieldlist):
                    for field, newvalue in zip(fieldlist, newvalues):
                        row[field] = newvalue

            writer.writerow(row)

    s = 'Report written to %s in %s.' % (reportfile, functionname)
    logging.debug(s)
    return True
def uuid_term_appender(inputfile, outputfile, key, guidtype=None, encoding=None,
                       format=None):
    ''' Write a file adding a field populated by global unique identifiers (GUIDs)
        to the fields in the input file.
    parameters:
        inputfile - full path to the input file (required)
        outputfile - full path to the output file (required)
        key - field or separator-separated fields to set (required)
        guidtype - type of GUID to use to populate the key (optional; default 'uuid')
        encoding - string signifying the encoding of the input file. If known, it
            speeds up processing a great deal. (optional; default None)
            (e.g., 'utf-8')
        format - string signifying the csv.dialect of the report file
            ('csv' or 'txt') (optional; default: txt)
    returns:
        success - True if the report was written, else False
    '''
    functionname = 'uuid_term_appender()'

    if outputfile is None or len(outputfile) == 0:
        logging.debug('No outputfile name given in %s.' % functionname)
        return False

    if inputfile is None or len(inputfile) == 0:
        logging.debug('No inputfile file given in %s.' % functionname)
        return False

    if not os.path.isfile(inputfile):
        logging.debug('Inputfile file %s not found in %s.' % (inputfile, functionname))
        return False

    # Determine the dialect of the input file
    inputdialect = csv_file_dialect(inputfile)

    # Determine the encoding of the input file when it was not supplied
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    # Read the header from the input file
    inputheader = read_header(inputfile, dialect=inputdialect, encoding=encoding)

    if inputheader is None:
        logging.debug('Unable to read header from input file %s in %s.' % \
            (inputfile, functionname))
        return False

    if key is None or len(key.strip()) == 0:
        logging.debug('No key given in %s.' % functionname)
        return False

    # Abort if the key exists in the inputheader
    if key in inputheader:
        logging.debug('field %s already exists in file %s in %s.' % \
            (key, inputfile, functionname))
        return False

    # Choose the output dialect from the requested format
    if format is None or format.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    # Make an outputheader that is a copy of the inputheader plus the new field to
    # hold the GUID.
    outputheader = inputheader + [key]

    # Create the outputfile and write the new header to it
    write_header(outputfile, outputheader, outputdialect)

    # Check to see if the outputfile was created
    if not os.path.isfile(outputfile):
        logging.debug('outputfile: %s was not created in %s.' % \
            (outputfile, functionname))
        return False

    # Open the outputfile to append rows with the appended GUID field
    with open(outputfile, 'a') as outfile:
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8',
                                fieldnames=outputheader)
        # Stamp every input row with a fresh GUID and write it out
        for row in read_csv_row(inputfile, dialect=inputdialect, encoding=encoding,
                                header=True, fieldnames=inputheader):
            row[key] = get_guid(guidtype)
            writer.writerow(row)

    logging.debug('Output file written to %s in %s.' % (outputfile, functionname))
    return True
def term_setter_report(inputfile, reportfile, key, constantvalues=None,
                       separator=None, encoding=None, format=None):
    ''' Write a file substituting constants for fields that already exist in an
        input file and with added fields with constants for fields that do not
        already exist in an inputfile. Field name matching is exact.
    parameters:
        inputfile - full path to the input file (required)
        reportfile - full path to the output file (required)
        key - field or separator-separated fields to set (required)
        constantvalues - value or separator-separated values to set the field(s) to
            (required)
        separator - string to use as the key and value separator
            (optional; default '|')
        encoding - string signifying the encoding of the input file. If known, it
            speeds up processing a great deal. (optional; default None)
            (e.g., 'utf-8')
        format - string signifying the csv.dialect of the report file
            ('csv' or 'txt') (optional; default: txt)
    returns:
        success - True if the report was written, else False
    '''
    functionname = 'term_setter_report()'

    if reportfile is None or len(reportfile) == 0:
        s = 'No reportfile name given in %s.' % functionname
        logging.debug(s)
        return False

    if inputfile is None or len(inputfile) == 0:
        s = 'No inputfile file given in %s.' % functionname
        logging.debug(s)
        return False

    if os.path.isfile(inputfile) == False:
        s = 'Inputfile file %s not found in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    # Determine the dialect of the input file
    inputdialect = csv_file_dialect(inputfile)

    # Determine the encoding of the input file if it was not given
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    # Read the header from the input file
    inputheader = read_header(inputfile, dialect=inputdialect, encoding=encoding)

    if inputheader is None:
        s = 'Unable to read header from input file %s in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    if key is None or len(key.strip()) == 0:
        s = 'No key given in %s.' % functionname
        logging.debug(s)
        return False

    if constantvalues is None or len(constantvalues) == 0:
        s = 'No constantvalues given in %s.' % functionname
        logging.debug(s)
        return False

    # Make sure there is a separator for the next step
    if separator is None or len(separator) == 0:
        separator = '|'

    # Get the fields to set by splitting the key with the separator
    fields = key.split(separator)

    # Get the values to set by splitting the constantvalues with the separator
    addedvalues = constantvalues.split(separator)

    # Abort if there is a mismatch in the lengths of the field and constants lists
    if len(fields) != len(addedvalues):
        s = 'length of field list: %s ' % key
        s += 'does not match length of constants list: %s ' % constantvalues
        s += 'in %s.' % functionname
        logging.debug(s)
        return False

    if format is None or format.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    # Make an outputheader that is a copy of the inputheader plus any fields from
    # the key that it does not already contain
    outputheader = inputheader
    for field in fields:
        if field not in outputheader:
            outputheader = outputheader + [field]

    # Create the outputfile and write the new header to it
    write_header(reportfile, outputheader, outputdialect)

    # Check to see if the outputfile was created
    if os.path.isfile(reportfile) == False:
        # Bug fix: the original interpolated the undefined name 'outputfile' here,
        # which raised a NameError on this failure path instead of logging.
        s = 'reportfile: %s was not created in %s.' % (reportfile, functionname)
        logging.debug(s)
        return False

    # Open the outputfile to append rows with fields set to constant values
    with open(reportfile, 'a') as outfile:
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8',
                                fieldnames=outputheader)
        # Iterate through all rows in the input file
        for row in read_csv_row(inputfile, dialect=inputdialect, encoding=encoding,
                                header=True, fieldnames=inputheader):
            # Set each field in the key list to its corresponding constant
            for field, constant in zip(fields, addedvalues):
                row[field] = constant

            # Write the updated row to the outputfile
            writer.writerow(row)

    s = 'Report written to %s in %s.' % (reportfile, functionname)
    logging.debug(s)
    return True
def darwinize_header(options):
    ''' Translate field names from input file to Darwin Core field names in
        outputfile using a Darwin Cloud vocabulary lookup.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional)
        inputfile - full path to the input file (required)
        dwccloudfile - full path to the vocabulary file containing the Darwin
            Cloud terms (required)
        outputfile - name of the output file, without path (required)
        encoding - string signifying the encoding of the input file. If known, it
            speeds up processing a great deal. (optional; default None)
            (e.g., 'utf-8')
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
        namespace - prepend namespace to fields that were darwinized
            (optional; default 'no') (e.g., 'y', 'n')
    returns a dictionary with information about the results
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
    '''
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputfile = None
    dwccloudfile = None
    outputfile = None
    encoding = None
    namespace = 'n'
    format = None

    ### Required inputs ###
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile = options['inputfile']
    except:
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if os.path.isfile(inputfile) == False:
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    try:
        dwccloudfile = options['dwccloudfile']
    except:
        pass

    if dwccloudfile is None or len(dwccloudfile) == 0:
        message = 'No Darwin Cloud vocabulary file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if os.path.isfile(dwccloudfile) == False:
        message = 'Darwin Cloud vocabulary file not found. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    try:
        outputfile = options['outputfile']
    except:
        pass

    if outputfile is None or len(outputfile) == 0:
        message = 'No output file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    try:
        encoding = options['encoding']
    except:
        pass

    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    try:
        namespace = options['namespace']
    except:
        pass

    inputdialect = csv_file_dialect(inputfile)

    try:
        format = options['format']
    except:
        pass

    # With no explicit format, preserve the dialect of the input file.
    if format is None or len(format) == 0:
        outputdialect = inputdialect
    elif format.lower() == 'csv':
        outputdialect = csv_dialect()
    else:
        outputdialect = tsv_dialect()

    header = read_header(inputfile, dialect=inputdialect, encoding=encoding)
    dwcheader = darwinize_list(header, dwccloudfile, namespace)

    if dwcheader is None:
        message = 'Unable to create darwinized header. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Write the new header to the outputfile
    if write_header(outputfile, dwcheader, dialect=outputdialect) == False:
        message = 'Unable to write header to output file. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Read the rows of the input file, append them to the output file after the
    # header with columns in the same order. The writer is keyed by the original
    # header names; this assumes darwinize_list() preserves column order so the
    # values line up under the darwinized header - TODO confirm.
    with open(outputfile, 'a') as outfile:
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8',
                                fieldnames=header)
        for row in read_csv_row(inputfile, inputdialect, encoding):
            writer.writerow(row)

    success = True
    artifacts['darwinized_header_file'] = outputfile
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def text_file_aggregator(options):
    ''' Join the contents of files in a given path. Headers and encodings are not
        assumed to be the same. Write a file containing the joined files with one
        header line.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional)
        inputpath - full path to the input file set. The file extension of the
            outputfile will be the substring following the last '.' in the
            inputpath. Example: ./workspace/thefiles.txt will produce an output
            file ending in '.txt' (required)
        outputfile - name of the output file, without path (optional)
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output file
        aggregaterowcount - the number of rows in the aggregated file, not counting
            header
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    setup_actor_logging(options)

    logging.debug( 'Started %s' % __version__ )
    logging.debug( 'options: %s' % options )

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'aggregaterowcount', 'success',
        'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    ### Custom outputs ###
    aggregaterowcount = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputpath = None
    outputfile = None
    format = None

    ### Required inputs ###
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputpath = options['inputpath']
    except:
        pass

    if inputpath is None or len(inputpath)==0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, aggregaterowcount, success, message,
            artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    try:
        format = options['format']
    except:
        pass

    if format is None or len(format)==0:
        format = 'txt'

    # Dialect for the aggregated output file.
    if format.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    try:
        outputfile = options['outputfile']
    except:
        pass

    if outputfile is None or len(outputfile)==0:
        # Include the '.' separator so the extension is a real file suffix.
        # (Previously produced names such as 'aggregate_<uuid>txt'.)
        outputfile = 'aggregate_'+str(uuid.uuid1())+'.'+format

    # Construct the output file path in the workspace
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Create the composite header. Let composite_header determine the dialects and
    # encodings of the files to aggregate.
    aggregateheader = composite_header(inputpath)
    aggregaterowcount = 0

    # Open a file to write the aggregated results in chosen format and utf-8.
    with open(outputfile, 'w') as outfile:
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8',
            fieldnames=aggregateheader, extrasaction='ignore')
        writer.writeheader()

        files = glob.glob(inputpath)
        for onefile in files:
            # Each input file may have its own dialect and encoding; detect them
            # per file instead of assuming they match the output settings.
            indialect = csv_file_dialect(onefile)
            inencoding = csv_file_encoding(onefile)
            with open(onefile, 'rU') as inputfile:
                reader = csv.DictReader(utf8_data_encoder(inputfile, inencoding),
                    dialect=indialect, encoding=inencoding)
                for line in reader:
                    try:
                        writer.writerow(line)
                        aggregaterowcount += 1
                    except:
                        message = 'failed to write line:\n%s\n' % line
                        message += 'to file %s. %s' % (onefile, __version__)
                        returnvals = [workspace, outputfile, aggregaterowcount,
                            success, message, artifacts]
                        logging.debug('message:\n%s' % message)
                        return response(returnvars, returnvals)

    success = True
    artifacts['aggregated_file'] = outputfile
    if aggregateheader is not None:
        aggregateheader = list(aggregateheader)
    returnvals = [workspace, outputfile, aggregaterowcount, success, message,
        artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def composite_header_constructor(options):
    ''' Create a file with a header that contains the distinct union of column
        names from two input files.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the output file (optional;
            default './')
        inputfile1 - full path to one of the input files (optional)
        inputfile2 - full path to the second input file (optional)
        outputfile - name of the output file, without path (required)
        format - output file format (e.g., 'csv' or 'txt') (optional;
            default 'txt')
    returns a dictionary with information about the results
        compositeheader - header combining two inputs
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    setup_actor_logging(options)

    logging.debug( 'Started %s' % __version__ )
    logging.debug( 'options: %s' % options )

    # Make a list for the response
    returnvars = ['workspace', 'compositeheader', 'outputfile', 'success', 'message', \
        'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputfile1 = None
    inputfile2 = None
    outputfile = None
    format = 'txt'
    compositeheader = None

    ### Required inputs ###
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile1 = options['inputfile1']
    except:
        pass

    try:
        inputfile2 = options['inputfile2']
    except:
        pass

    try:
        outputfile = options['outputfile']
    except:
        pass

    if outputfile is None or len(outputfile)==0:
        message = 'No output file given. %s' % __version__
        returnvals = [workspace, compositeheader, outputfile, success, message,
            artifacts]
        # Log the failure reason, consistent with the other actors in this file.
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    try:
        format = options['format']
    except:
        pass

    # Read the headers of the two files and let read_header figure out the
    # dialects and encodings.
    header1 = read_header(inputfile1)
    header2 = read_header(inputfile2)

    compositeheader = merge_headers(header1, header2)

    if format is None or format.lower()=='txt':
        dialect = tsv_dialect()
    else:
        dialect = csv_dialect()

    # Write the resulting header into outputfile
    success = write_header(outputfile, compositeheader, dialect)
    if success == False:
        message = 'Header was not written. %s' % __version__
        returnvals = [workspace, compositeheader, outputfile, success, message,
            artifacts]
        # Log the failure reason, consistent with the other actors in this file.
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if compositeheader is not None:
        compositeheader = list(compositeheader)

    artifacts['composite_header_file'] = outputfile
    returnvals = [workspace, compositeheader, outputfile, success, message,
        artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)