def darwin_cloud_vocab_dict_from_file(vocabfile):
    ''' Get a Darwin Cloud vocabulary as a dictionary from a file.
    parameters:
        vocabfile - path to the vocabulary file (required)
    returns:
        vocabdict - dictionary of complete vocabulary records
    '''
    functionname = 'darwin_cloud_vocab_dict_from_file()'

    if vocabfile is None or len(vocabfile) == 0:
        s = 'No vocabulary file given in %s.' % functionname
        logging.debug(s)
        return None

    if os.path.isfile(vocabfile) == False:
        s = 'Vocabulary file %s not found in %s.' % (vocabfile, functionname)
        logging.debug(s)
        return None

    dialect = csv_file_dialect(vocabfile)

    # Create a dictionary to hold the vocabulary
    vocabdict = {}

    header = read_header(vocabfile, dialect=dialect, encoding='utf-8')

    # Iterate through all rows in the input file using the detected dialect
    for row in read_csv_row(vocabfile, dialect=dialect, encoding='utf-8', \
        header=True, fieldnames=header):
        # Make a complete copy of the row
        rowdict = copy.deepcopy(row)
        key = row['fieldname']

        # Remove the key from the row copy
        rowdict.pop('fieldname')
        vocabdict[key] = rowdict

    return vocabdict
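# Usage sketch (illustrative; not part of the original module). It shows how one
# might load a Darwin Cloud vocabulary and look up the complete record for a
# single field name. The vocabulary path is hypothetical; any CSV/TSV file with
# a 'fieldname' column should work.
def example_darwin_cloud_vocab_lookup():
    vocabdict = darwin_cloud_vocab_dict_from_file('./vocabularies/darwin_cloud.txt')
    if vocabdict is None:
        print('Vocabulary could not be loaded.')
        return
    # Each value is the complete vocabulary record minus the 'fieldname' key
    print('Record for decimallatitude: %s' % vocabdict.get('decimallatitude'))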
def darwin_cloud_collector(options):
    ''' Get field names from inputfile and put any that are not Simple Darwin Core into
        outputfile.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional)
        inputfile - full path to the input file (required)
        outputfile - name of the output file, without path (required)
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        addedvalues - new values added to the output file
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    #print '%s options: %s' % (__version__, options)
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'addedvalues', 'outputfile', 'success', 'message', \
        'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    ### Custom outputs ###
    addedvalues = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputfile = None
    outputfile = None
    encoding = None

    ### Required inputs ###
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile = options['inputfile']
    except:
        pass

    try:
        outputfile = options['outputfile']
    except:
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, addedvalues, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if os.path.isfile(inputfile) == False:
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, addedvalues, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if outputfile is None or len(outputfile) == 0:
        message = 'No output file given. %s' % __version__
        returnvals = [workspace, addedvalues, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    try:
        encoding = options['encoding']
    except:
        pass

    # Read the header and let read_header figure out the dialect and encoding.
    header = read_header(inputfile, encoding=encoding)
    nondwc = terms_not_in_dwc(header, casesensitive=False)
    dialect = vocab_dialect()
    addedvalues = distinct_vocabs_to_file(outputfile, nondwc, 'fieldname', \
        dialect=dialect)
    success = True

    if addedvalues is not None:
        artifacts['darwin_cloud_collector_file'] = outputfile

    returnvals = [workspace, addedvalues, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
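# Usage sketch (illustrative; not part of the original module). It shows how one
# might invoke darwin_cloud_collector() with an options dictionary and read the
# response, which is keyed by the names in returnvars. The paths are hypothetical.
def example_darwin_cloud_collector():
    options = {
        'workspace': './workspace',
        'inputfile': './data/occurrences.csv',
        'outputfile': 'non_dwc_fieldnames.csv',
        'loglevel': 'DEBUG'}
    results = darwin_cloud_collector(options)
    if results['success']:
        print('Non-Darwin Core field names collected: %s' % results['addedvalues'])
    else:
        print('Collection failed: %s' % results['message'])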
def text_file_field_stripper(options):
    ''' Filter a text file into a new file based on matching a list of fields to keep.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - the directory in which the output will be written (optional)
        inputfile - full path to the input file (required)
        outputfile - name of the output file, without path (required)
        separator - string that separates the values in termlist (e.g., '|')
            (optional; default None)
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
        termlist - list of fields to extract from the input file (required)
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output tsv file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    #print '%s options: %s' % (__version__, options)
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputfile = None
    outputfile = None
    format = 'txt'
    termlist = None
    separator = None
    encoding = None

    ### Required inputs ###
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile = options['inputfile']
    except:
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if os.path.isfile(inputfile) == False:
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    try:
        termlist = options['termlist']
    except:
        pass

    if termlist is None or len(termlist) == 0:
        message = 'No termlist given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    try:
        separator = options['separator']
    except:
        pass

    try:
        encoding = options['encoding']
    except:
        pass

    # Make a list of terms from termlist, which may hold a single term or multiple
    # terms joined by the separator
    if separator is None or len(separator.strip()) == 0:
        theterms = [termlist]
    else:
        theterms = termlist.split(separator)

    # Determine the dialect of the input file
    inputdialect = csv_file_dialect(inputfile)

    # Determine the encoding of the input file
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)
        # csv_file_encoding() always returns an encoding if there is an input file.
        # No need to check.

    # Read the header of the input file
    header = read_header(inputfile, dialect=inputdialect, encoding=encoding)

    # Make a clean version of the input header
    cleaninputheader = clean_header(header)

    try:
        format = options['format']
    except:
        pass

    try:
        outputfile = options['outputfile']
    except:
        pass

    if outputfile is None or len(outputfile) == 0:
        message = 'No output file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Prepare the outputfile dialect based on the requested format
    if format is None or format.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    # Make a clean version of the output header
    cleanoutputheader = clean_header(theterms)

    # Create the outputfile and write the new header to it
    write_header(outputfile, cleanoutputheader, outputdialect)

    # Check to see that the file was created
    if os.path.isfile(outputfile) == False:
        message = 'Outputfile %s was not created. %s' % (outputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        return response(returnvars, returnvals)

    # Open the outputfile to start writing matching rows
    with open(outputfile, 'a') as outfile:
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8', \
            fieldnames=cleanoutputheader)

        # Iterate through all rows in the input file
        for row in read_csv_row(inputfile, dialect=inputdialect, encoding=encoding, \
            header=True, fieldnames=cleaninputheader):
            newrow = extract_fields_from_row(row, cleanoutputheader)
            writer.writerow(newrow)

    success = True
    artifacts['stripped_file'] = outputfile

    # Prepare the response dictionary
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
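# Usage sketch (illustrative; not part of the original module). It shows how one
# might strip an input file down to three location fields. The paths and field
# names are hypothetical; termlist entries are joined with the separator.
def example_text_file_field_stripper():
    options = {
        'workspace': './workspace',
        'inputfile': './data/occurrences.csv',
        'outputfile': 'locations_only.txt',
        'termlist': 'country|stateProvince|county',
        'separator': '|',
        'format': 'txt'}
    results = text_file_field_stripper(options)
    if results['success']:
        print('Stripped file written to %s' % results['outputfile'])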
def term_standardizer_report(inputfile, reportfile, vocabfile, key, separator=None, \
    encoding=None, format=None):
    ''' Write a file with the values of the fields in a key replaced by standard values
        from a vocabfile, with appended fields showing the original values.
    parameters:
        inputfile - full path to the input file (required)
        reportfile - full path to the output file (required)
        vocabfile - path to the vocabulary file (required)
        key - field or separator-separated fields to set (required)
        separator - string to use as the key and value separator (optional; default '|')
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - string signifying the csv.dialect of the report file ('csv' or 'txt')
            (optional; default 'txt')
    returns:
        success - True if the report was written, else False
    '''
    functionname = 'term_standardizer_report()'

    if reportfile is None or len(reportfile) == 0:
        s = 'No reportfile name given in %s.' % functionname
        logging.debug(s)
        return False

    if inputfile is None or len(inputfile) == 0:
        s = 'No inputfile given in %s.' % functionname
        logging.debug(s)
        return False

    if os.path.isfile(inputfile) == False:
        s = 'Inputfile %s not found in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    # Determine the dialect of the input file
    inputdialect = csv_file_dialect(inputfile)

    # Determine the encoding of the input file
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    # Read the header from the input file
    inputheader = read_header(inputfile, dialect=inputdialect, encoding=encoding)

    if inputheader is None:
        s = 'Unable to read header from input file %s in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    if key is None or len(key.strip()) == 0:
        s = 'No key given in %s.' % functionname
        logging.debug(s)
        return False

    # Make sure there is a separator for the next step
    if separator is None or len(separator) == 0:
        separator = '|'

    # Make a list of the fields in the key by splitting it on the separator
    fieldlist = key.split(separator)

    # Assume none of the fields is in the file
    headerhaskey = False

    # Search the cleaned up header for any field from the key
    cleanedinputheader = strip_list(inputheader)

    for field in fieldlist:
        if field in cleanedinputheader:
            headerhaskey = True
            break

    if headerhaskey == False:
        s = 'No field from %s found ' % fieldlist
        s += 'in input file %s in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    if vocabfile is None or len(vocabfile) == 0:
        s = 'No vocabulary file given in %s.' % functionname
        logging.debug(s)
        return False

    if os.path.isfile(vocabfile) == False:
        s = 'Vocabulary file %s not found in %s.' % (vocabfile, functionname)
        logging.debug(s)
        return False

    # Get the vocabulary dictionary, but convert all entries using ustripstr. Assume
    # the vocabulary file is encoded as utf-8.
    vocabdict = vocab_dict_from_file(vocabfile, key, encoding='utf-8', \
        separator=separator, function=ustripstr)

    if len(vocabdict) == 0:
        s = 'Vocabulary file %s ' % vocabfile
        s += 'had zero recommendations in %s.' % functionname
        logging.debug(s)
        return False

    # Determine the dialect of the report file based on the requested format
    if format is None or format.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    # Create an output header that is the same as the input header with fields
    # appended to hold the original values of the key fields
    outputheader = cleanedinputheader
    for field in fieldlist:
        if field in outputheader:
            outputheader = outputheader + [field + '_orig']
        else:
            outputheader = outputheader + [field]

    # Create the reportfile and write the new header to it
    write_header(reportfile, outputheader, outputdialect)

    # Check to see if the reportfile was created
    if os.path.isfile(reportfile) == False:
        s = 'reportfile %s not created in %s.' % (reportfile, functionname)
        logging.debug(s)
        return False

    # Open the reportfile to append rows having the added fields
    with open(reportfile, 'a') as outfile:
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8', \
            fieldnames=outputheader)

        # Iterate through all rows in the input file
        for row in read_csv_row(inputfile, dialect=inputdialect, encoding=encoding, \
            header=True, fieldnames=cleanedinputheader):
            # Set the _orig values for every field in the field list that exists in
            # the row
            for field in fieldlist:
                if field in row:
                    row[field + '_orig'] = row[field]

            # Construct a composite field value for the row to match a key in the
            # vocabulary file
            rowkey = extract_values_from_row(row, fieldlist, separator)

            # Get the dictionary of recommended values for ustripstr(rowkey)
            newvaluedict = recommended_value(vocabdict, ustripstr(rowkey))

            # Only make changes if there is a standardized value found
            if newvaluedict is not None:
                # ustripstr(rowkey) was found in the vocabulary
                # Get the standard value
                standard = newvaluedict['standard']

                # Treat a standard value that is None or only whitespace as ''
                if standard is None or len(standard.strip()) == 0:
                    standard = ''

                # Make a list of the values given in standard
                newvalues = standard.split(separator)

                # Only make changes if the number of recommendation fields is the
                # same as the number of fields in the key
                if len(newvalues) == len(fieldlist):
                    # Update or add the new value for each field in the fieldlist
                    for i, field in enumerate(fieldlist):
                        row[field] = newvalues[i]

            writer.writerow(row)

    s = 'Report written to %s in %s.' % (reportfile, functionname)
    logging.debug(s)
    return True
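# Usage sketch (illustrative; not part of the original module). It shows how one
# might standardize a single field against a vocabulary; the report keeps each
# original value in country_orig. The paths are hypothetical, and the vocabulary
# file is assumed to have 'country' and 'standard' columns.
def example_term_standardizer_report():
    success = term_standardizer_report(
        './data/occurrences.csv',        # inputfile
        './workspace/standardized.txt',  # reportfile
        './vocabularies/country.txt',    # vocabfile
        'country')                       # key field to standardize
    if success:
        print('Standardization report written.')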
def uuid_term_appender(inputfile, outputfile, key, guidtype=None, encoding=None, \
    format=None):
    ''' Write a file adding a field populated by globally unique identifiers (GUIDs) to
        the fields in the input file.
    parameters:
        inputfile - full path to the input file (required)
        outputfile - full path to the output file (required)
        key - field or separator-separated fields to set (required)
        guidtype - type of GUID to use to populate the key (optional; default 'uuid')
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - string signifying the csv.dialect of the report file ('csv' or 'txt')
            (optional; default 'txt')
    returns:
        success - True if the output file was written, else False
    '''
    functionname = 'uuid_term_appender()'

    if outputfile is None or len(outputfile) == 0:
        s = 'No outputfile name given in %s.' % functionname
        logging.debug(s)
        return False

    if inputfile is None or len(inputfile) == 0:
        s = 'No inputfile given in %s.' % functionname
        logging.debug(s)
        return False

    if os.path.isfile(inputfile) == False:
        s = 'Inputfile %s not found in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    # Determine the dialect of the input file
    inputdialect = csv_file_dialect(inputfile)

    # Determine the encoding of the input file
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    # Read the header from the input file
    inputheader = read_header(inputfile, dialect=inputdialect, encoding=encoding)

    if inputheader is None:
        s = 'Unable to read header from input file %s in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    if key is None or len(key.strip()) == 0:
        s = 'No key given in %s.' % functionname
        logging.debug(s)
        return False

    # Abort if the key already exists in the inputheader
    if key in inputheader:
        s = 'field %s ' % key
        s += 'already exists in file %s ' % inputfile
        s += 'in %s.' % functionname
        logging.debug(s)
        return False

    if format is None or format.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    # Make an outputheader that is a copy of the inputheader plus the new field to
    # hold the GUID.
    outputheader = inputheader + [key]

    # Create the outputfile and write the new header to it
    write_header(outputfile, outputheader, outputdialect)

    # Check to see if the outputfile was created
    if os.path.isfile(outputfile) == False:
        s = 'outputfile %s was not created in %s.' % (outputfile, functionname)
        logging.debug(s)
        return False

    # Open the outputfile to append rows with the appended GUID field
    with open(outputfile, 'a') as outfile:
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8', \
            fieldnames=outputheader)

        # Iterate through all rows in the input file
        for row in read_csv_row(inputfile, dialect=inputdialect, encoding=encoding, \
            header=True, fieldnames=inputheader):
            # Create a GUID based on the selected guidtype
            guid = get_guid(guidtype)

            # Set the value of the key field to the GUID
            row[key] = guid

            # Write the updated row to the outputfile
            writer.writerow(row)

    s = 'Output file written to %s in %s.' % (outputfile, functionname)
    logging.debug(s)
    return True
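# Usage sketch (illustrative; not part of the original module). It shows how one
# might append an occurrenceID field populated with UUIDs. The paths are
# hypothetical; the call fails if the key field already exists in the input.
def example_uuid_term_appender():
    success = uuid_term_appender(
        './data/occurrences.csv',      # inputfile
        './workspace/with_guids.txt',  # outputfile
        'occurrenceID',                # key field to add
        guidtype='uuid')
    if success:
        print('GUID-appended file written.')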
def term_setter_report(inputfile, reportfile, key, constantvalues=None, separator=None, \
    encoding=None, format=None):
    ''' Write a file in which the fields named in the key are set to constant values,
        overwriting those fields where they already exist in the input file and adding
        them where they do not. Field name matching is exact.
    parameters:
        inputfile - full path to the input file (required)
        reportfile - full path to the output file (required)
        key - field or separator-separated fields to set (required)
        constantvalues - value or separator-separated values to set the field(s) to
            (required)
        separator - string to use as the key and value separator (optional; default '|')
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - string signifying the csv.dialect of the report file ('csv' or 'txt')
            (optional; default 'txt')
    returns:
        success - True if the report was written, else False
    '''
    functionname = 'term_setter_report()'

    if reportfile is None or len(reportfile) == 0:
        s = 'No reportfile name given in %s.' % functionname
        logging.debug(s)
        return False

    if inputfile is None or len(inputfile) == 0:
        s = 'No inputfile given in %s.' % functionname
        logging.debug(s)
        return False

    if os.path.isfile(inputfile) == False:
        s = 'Inputfile %s not found in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    # Determine the dialect of the input file
    inputdialect = csv_file_dialect(inputfile)

    # Determine the encoding of the input file
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    # Read the header from the input file
    inputheader = read_header(inputfile, dialect=inputdialect, encoding=encoding)

    if inputheader is None:
        s = 'Unable to read header from input file %s in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    if key is None or len(key.strip()) == 0:
        s = 'No key given in %s.' % functionname
        logging.debug(s)
        return False

    if constantvalues is None or len(constantvalues) == 0:
        s = 'No constantvalues given in %s.' % functionname
        logging.debug(s)
        return False

    # Make sure there is a separator for the next step
    if separator is None or len(separator) == 0:
        separator = '|'

    # Get the fields to set by splitting the key with the separator
    fields = key.split(separator)

    # Get the values to set by splitting the constantvalues with the separator
    addedvalues = constantvalues.split(separator)

    # Abort if there is a mismatch in the lengths of the field and constants lists
    if len(fields) != len(addedvalues):
        s = 'length of field list: %s ' % key
        s += 'does not match length of constants list: %s ' % constantvalues
        s += 'in %s.' % functionname
        logging.debug(s)
        return False

    if format is None or format.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    # Make an outputheader that is a copy of the inputheader
    outputheader = inputheader

    # Add to the output header any fields that are not in the inputheader
    for field in fields:
        if field not in outputheader:
            outputheader = outputheader + [field]

    # Create the reportfile and write the new header to it
    write_header(reportfile, outputheader, outputdialect)

    # Check to see if the reportfile was created
    if os.path.isfile(reportfile) == False:
        s = 'reportfile %s was not created in %s.' % (reportfile, functionname)
        logging.debug(s)
        return False

    # Open the reportfile to append rows with fields set to constant values
    with open(reportfile, 'a') as outfile:
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8', \
            fieldnames=outputheader)

        # Iterate through all rows in the input file
        for row in read_csv_row(inputfile, dialect=inputdialect, encoding=encoding, \
            header=True, fieldnames=inputheader):
            # Set the value of the ith field to the ith constant
            for i in range(len(fields)):
                row[fields[i]] = addedvalues[i]

            # Write the updated row to the outputfile
            writer.writerow(row)

    s = 'Report written to %s in %s.' % (reportfile, functionname)
    logging.debug(s)
    return True
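# Usage sketch (illustrative; not part of the original module). It shows how one
# might set two fields to constant values in every row, one field per constant.
# The paths and values are hypothetical.
def example_term_setter_report():
    success = term_setter_report(
        './data/occurrences.csv',          # inputfile
        './workspace/with_constants.txt',  # reportfile
        'institutionCode|collectionCode',  # key fields to set
        constantvalues='MVZ|Mammals',
        separator='|')
    if success:
        print('Term setter report written.')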
def darwinize_header(options):
    ''' Translate field names from the input file to Darwin Core field names in the
        outputfile using a Darwin Cloud vocabulary lookup.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional)
        inputfile - full path to the input file (required)
        dwccloudfile - full path to the vocabulary file containing the Darwin Cloud
            terms (required)
        outputfile - name of the output file, without path (required)
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - output file format (e.g., 'csv' or 'txt') (optional; default None,
            in which case the dialect of the input file is used)
        namespace - prepend namespace to fields that were darwinized
            (optional; default 'n') (e.g., 'y', 'n')
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    #print '%s options: %s' % (__version__, options)
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputfile = None
    dwccloudfile = None
    outputfile = None
    encoding = None
    namespace = 'n'
    format = None

    ### Required inputs ###
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile = options['inputfile']
    except:
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if os.path.isfile(inputfile) == False:
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    try:
        dwccloudfile = options['dwccloudfile']
    except:
        pass

    if dwccloudfile is None or len(dwccloudfile) == 0:
        message = 'No Darwin Cloud vocabulary file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if os.path.isfile(dwccloudfile) == False:
        message = 'Darwin Cloud vocabulary file not found. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    try:
        outputfile = options['outputfile']
    except:
        pass

    if outputfile is None or len(outputfile) == 0:
        message = 'No output file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    try:
        encoding = options['encoding']
    except:
        pass

    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    try:
        namespace = options['namespace']
    except:
        pass

    inputdialect = csv_file_dialect(inputfile)

    try:
        format = options['format']
    except:
        pass

    if format is None or len(format) == 0:
        outputdialect = inputdialect
    elif format.lower() == 'csv':
        outputdialect = csv_dialect()
    else:
        outputdialect = tsv_dialect()

    header = read_header(inputfile, dialect=inputdialect, encoding=encoding)
    dwcheader = darwinize_list(header, dwccloudfile, namespace)

    if dwcheader is None:
        message = 'Unable to create darwinized header. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Write the new header to the outputfile
    if write_header(outputfile, dwcheader, dialect=outputdialect) == False:
        message = 'Unable to write header to output file. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Read the rows of the input file and append them to the output file after the
    # header, with columns in the same order.
    with open(outputfile, 'a') as outfile:
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8', \
            fieldnames=header)
        for row in read_csv_row(inputfile, inputdialect, encoding):
            writer.writerow(row)

    success = True
    artifacts['darwinized_header_file'] = outputfile
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
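# Usage sketch (illustrative; not part of the original module). It shows how one
# might darwinize the header of an input file using a Darwin Cloud vocabulary,
# prepending namespaces to translated fields. The paths are hypothetical.
def example_darwinize_header():
    options = {
        'workspace': './workspace',
        'inputfile': './data/occurrences.csv',
        'dwccloudfile': './vocabularies/darwin_cloud.txt',
        'outputfile': 'occurrences_dwc.txt',
        'namespace': 'y',
        'format': 'txt'}
    results = darwinize_header(options)
    if results['success']:
        print('Darwinized file written to %s' % results['outputfile'])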
def composite_header_constructor(options):
    ''' Create a file with a header that contains the distinct union of column names
        from two input files.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the output file (optional; default './')
        inputfile1 - full path to one of the input files (optional)
        inputfile2 - full path to the second input file (optional)
        outputfile - name of the output file, without path (required)
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
    returns a dictionary with information about the results
        compositeheader - header combining two inputs
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    #print '%s options: %s' % (__version__, options)
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'compositeheader', 'outputfile', 'success', 'message', \
        'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputfile1 = None
    inputfile2 = None
    outputfile = None
    format = 'txt'
    compositeheader = None

    ### Required inputs ###
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile1 = options['inputfile1']
    except:
        pass

    try:
        inputfile2 = options['inputfile2']
    except:
        pass

    try:
        outputfile = options['outputfile']
    except:
        pass

    if outputfile is None or len(outputfile) == 0:
        message = 'No output file given. %s' % __version__
        returnvals = [workspace, compositeheader, outputfile, success, message, artifacts]
        return response(returnvars, returnvals)

    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    try:
        format = options['format']
    except:
        pass

    # Read the headers of the two files and let read_header figure out the dialects
    # and encodings.
    header1 = read_header(inputfile1)
    header2 = read_header(inputfile2)

    compositeheader = merge_headers(header1, header2)

    if format is None or format.lower() == 'txt':
        dialect = tsv_dialect()
    else:
        dialect = csv_dialect()

    # Write the resulting header into outputfile
    success = write_header(outputfile, compositeheader, dialect)

    if success == False:
        message = 'Header was not written. %s' % __version__
        returnvals = [workspace, compositeheader, outputfile, success, message, artifacts]
        return response(returnvars, returnvals)

    if compositeheader is not None:
        compositeheader = list(compositeheader)

    artifacts['composite_header_file'] = outputfile
    returnvals = [workspace, compositeheader, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
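# Usage sketch (illustrative; not part of the original module). It shows how one
# might construct a composite header from two files. The paths are hypothetical.
def example_composite_header_constructor():
    options = {
        'workspace': './workspace',
        'inputfile1': './data/occurrences_2019.csv',
        'inputfile2': './data/occurrences_2020.csv',
        'outputfile': 'composite_header.txt'}
    results = composite_header_constructor(options)
    if results['success']:
        print('Composite header: %s' % results['compositeheader'])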
def term_token_count_from_file(inputfile, termname, dialect=None, encoding=None):
    ''' Make a dictionary of the tokens found in a given term in a file, along with
        the number of times each occurs.
    parameters:
        inputfile - full path to the input file (required)
        termname - name of the term whose values are to be tokenized and counted
            (required)
        dialect - csv.dialect object with the attributes of the input file. If not
            given, the dialect will be detected (optional; default None)
        encoding - a string designating the input file encoding (optional;
            default None) (e.g., 'utf-8', 'mac_roman', 'latin_1', 'cp1252')
    returns:
        tokens - a dictionary containing the tokens and their counts
    '''
    functionname = 'term_token_count_from_file()'

    if inputfile is None or len(inputfile) == 0:
        s = 'No input file given in %s.' % functionname
        logging.debug(s)
        return 0

    if os.path.isfile(inputfile) == False:
        s = 'File %s not found in %s.' % (inputfile, functionname)
        logging.debug(s)
        return 0

    if termname is None or len(termname) == 0:
        s = 'No term name given in %s.' % functionname
        logging.debug(s)
        return 0

    # Determine the dialect of the input file
    if dialect is None:
        dialect = csv_file_dialect(inputfile)
        # csv_file_dialect() always returns a dialect if there is an input file.
        # No need to check.

    # Determine the encoding of the input file
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)
        # csv_file_encoding() always returns an encoding if there is an input file.
        # No need to check.

    inputheader = read_header(inputfile, dialect=dialect, encoding=encoding)

    if termname not in inputheader:
        s = 'Term %s not found in file %s ' % (termname, inputfile)
        s += 'in %s.' % functionname
        logging.debug(s)
        return None

    rowcount = 0
    tokencount = 0
    populatedrowcount = 0
    tokens = {'tokenlist': {}}

    for row in read_csv_row(inputfile, dialect, encoding):
        try:
            value = row[termname]
        except:
            value = None

        if value is not None and len(value.strip()) > 0:
            # Count the occurrences of each token within this row
            rowdict = {}
            wordlist = re.sub(r"[^\w]", " ", value).split()

            for token in wordlist:
                if token in rowdict:
                    rowdict[token]['totalcount'] += 1
                else:
                    rowdict[token] = {}
                    rowdict[token]['rowcount'] = 1
                    rowdict[token]['totalcount'] = 1

            populatedrowcount += 1

            # Fold this row's token counts into the running totals
            for key, value in rowdict.items():
                tokenlist = tokens['tokenlist']
                if key in tokenlist:
                    tokenlist[key]['rowcount'] += value['rowcount']
                    tokenlist[key]['totalcount'] += value['totalcount']
                else:
                    tokenlist[key] = {}
                    tokenlist[key]['rowcount'] = value['rowcount']
                    tokenlist[key]['totalcount'] = value['totalcount']

            rowcount += 1
            tokencount += len(wordlist)

    tokens['rowcount'] = rowcount
    tokens['tokencount'] = tokencount
    tokens['input'] = inputfile
    tokens['term'] = termname
    return tokens
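# Usage sketch (illustrative; not part of the original module). It shows how one
# might count the tokens in the locality field of a file and inspect the counts
# for one token. The path and field name are hypothetical.
def example_term_token_count():
    tokens = term_token_count_from_file('./data/occurrences.csv', 'locality')
    if not isinstance(tokens, dict):
        print('Token counting failed.')
        return
    print('%s rows had a populated term.' % tokens['rowcount'])
    # Each entry in tokenlist has a rowcount and totalcount for that token
    print("Counts for 'River': %s" % tokens['tokenlist'].get('River'))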