コード例 #1
0
def terms_not_in_darwin_cloud(checklist,
                              dwccloudfile,
                              encoding=None,
                              vetted=True,
                              casesensitive=False):
    ''' Get the list of distinct values in a checklist that are not in the Darwin Cloud
        vocabulary. Verbatim values in the Darwin Cloud vocabulary should be lower-case and
        stripped already, so that is what must be matched here. The Darwin Cloud vocabulary
        should have the case-sensitive standard value.
    parameters:
        checklist - list of values to check against the target list (required)
        dwccloudfile - the vocabulary file for the Darwin Cloud (required)
        encoding - a string designating the input file encoding (optional; default None)
            (e.g., 'utf-8', 'mac_roman', 'latin_1', 'cp1252'). When None or blank, the
            encoding is determined with csv_file_encoding().
        vetted - when True the vocabulary is loaded with vetted_vocab_dict_from_file(),
            otherwise with vocab_dict_from_file() (default True)
        casesensitive - when True the checklist is compared to the vocabulary keys as
            they are; otherwise both sides are normalized with ustripstr
            (default False)
    returns:
        None if no checklist was given, otherwise the result of not_in_list(): the
        values from the checklist not found in the Darwin Cloud vocabulary
        (NOTE(review): described by the original author as a sorted list of distinct
        values - confirm against not_in_list())
    '''
    functionname = 'terms_not_in_darwin_cloud()'

    if checklist is None or len(checklist) == 0:
        logging.debug('No checklist given in %s.' % functionname)
        return None

    dialect = csv_file_dialect(dwccloudfile)

    # Try to determine the encoding of the vocabulary file if none was given.
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(dwccloudfile)

    # No need to check if dwccloudfile is given and exists, vocab_dict_from_file() and
    # vetted_vocab_dict_from_file() do that.
    if vetted == True:
        loader = vetted_vocab_dict_from_file
    else:
        loader = vocab_dict_from_file
    darwinclouddict = loader(dwccloudfile,
                             'fieldname',
                             dialect=dialect,
                             encoding=encoding)

    # Only the keys are needed. Iterating the dictionary directly yields them and works
    # the same way in Python 2 and Python 3 (iteritems() exists only in Python 2).
    dwcloudlist = list(darwinclouddict)

    if casesensitive == True:
        return not_in_list(dwcloudlist, checklist)

    # Case-insensitive comparison: normalize the vocabulary keys here and let
    # not_in_list() normalize the checklist values with the same function.
    lowerdwclist = [ustripstr(term) for term in dwcloudlist]
    return not_in_list(lowerdwclist, checklist, function=ustripstr)
コード例 #2
0
def text_file_field_stripper(options):
    ''' Filter a text file into a new file based on matching a list of fields to keep.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - the directory in which the output will be written (optional;
            default './')
        inputfile - full path to the input file (required)
        outputfile - name of the output file, without path (required)
        separator - string that separates the values in termlist (e.g., '|')
            (optional; default None)
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
        termlist - fields to extract from the input file, given as one string that is
            split on separator when a separator is provided; without a separator the
            whole value is treated as a single field name (required)
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output tsv file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    setup_actor_logging(options)

    logging.debug( 'Started %s' % __version__ )
    logging.debug( 'options: %s' % options )

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    # Make a dictionary for artifacts left behind
    artifacts = {}

    def _option(name, default=None):
        ''' Get options[name], or default when the option is missing or options is not
            a usable dictionary. Only lookup failures are absorbed.'''
        try:
            return options[name]
        except (KeyError, TypeError):
            return default

    def _fail(message):
        ''' Log the reason for failure and build the unsuccessful response from the
            current values of workspace and outputfile.'''
        logging.debug('message:\n%s' % message)
        returnvals = [workspace, outputfile, False, message, artifacts]
        return response(returnvars, returnvals)

    ### Establish variables ###
    # outputfile stays None until it has been validated, so early failure responses
    # report it as None.
    outputfile = None
    workspace = _option('workspace', './')

    ### Required inputs ###
    inputfile = _option('inputfile')
    if inputfile is None or len(inputfile)==0:
        return _fail('No input file given. %s' % __version__)

    if not os.path.isfile(inputfile):
        return _fail('Input file %s not found. %s' % (inputfile, __version__))

    termlist = _option('termlist')
    if termlist is None or len(termlist)==0:
        return _fail('No termlist given. %s' % __version__)

    separator = _option('separator')
    encoding = _option('encoding')

    # Turn the termlist into the list of fields to keep
    if separator is None or len(separator.strip())==0:
        theterms = [termlist]
    else:
        theterms = termlist.split(separator)

    # Determine the file dialect
    inputdialect = csv_file_dialect(inputfile)

    # Determine the file encoding
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)
        # csv_file_encoding() always returns an encoding if there is an input file.
        # No need to check.

    # Read the header of the inputfile and make a clean version of it
    header = read_header(inputfile, dialect=inputdialect, encoding=encoding)
    cleaninputheader = clean_header(header)

    outputformat = _option('format', 'txt')

    outputfile = _option('outputfile')
    if outputfile is None or len(outputfile)==0:
        return _fail('No output file given. %s' % __version__)

    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Prepare the outputfile
    if outputformat is None or outputformat.lower()=='txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    # Make a clean version of the output header
    cleanoutputheader = clean_header(theterms)

    # Create the outputfile and write the new header to it
    write_header(outputfile, cleanoutputheader, outputdialect)

    # Check to see that the file was created
    if not os.path.isfile(outputfile):
        return _fail('Outputfile %s was not created. %s' % (outputfile, __version__))

    # Open the outputfile to start writing matching rows
    with open(outputfile, 'a') as outfile:
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8',
            fieldnames=cleanoutputheader)

        # Iterate through all rows in the input file, keeping only the wanted fields
        for row in read_csv_row(inputfile, dialect=inputdialect, encoding=encoding,
            header=True, fieldnames=cleaninputheader):
            writer.writerow(extract_fields_from_row(row, cleanoutputheader))

    artifacts['stripped_file'] = outputfile

    # Prepare the response dictionary
    returnvals = [workspace, outputfile, True, None, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
コード例 #3
0
def term_standardizer_report(inputfile,
                             reportfile,
                             vocabfile,
                             key,
                             separator=None,
                             encoding=None,
                             format=None):
    ''' Write a file with substitutions from a vocabfile for fields in a key and appended
        terms showing the original values.
    parameters:
        inputfile - full path to the input file (required)
        reportfile - full path to the output file (required)
        vocabfile - path to the vocabulary file (required)
        key - field or separator-separated fields to set (required)
        separator - string to use as the key and value separator (optional; default '|')
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - string signifying the csv.dialect of the report file ('csv' or 'txt')
            (optional; default: txt)
    returns:
        success - True if the report was written, else False
    '''
    functionname = 'term_standardizer_report()'

    ### Validate the arguments before doing any work on the file contents ###
    if reportfile is None or len(reportfile) == 0:
        s = 'No reportfile name given in %s.' % functionname
        logging.debug(s)
        return False

    if inputfile is None or len(inputfile) == 0:
        s = 'No inputfile file given in %s.' % functionname
        logging.debug(s)
        return False

    if not os.path.isfile(inputfile):
        s = 'Inputfile file %s not found in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    if key is None or len(key.strip()) == 0:
        s = 'No key given in %s.' % functionname
        logging.debug(s)
        return False

    if vocabfile is None or len(vocabfile) == 0:
        # The message has to be formatted before it is handed to logging.debug();
        # applying % to the return value of logging.debug() raises a TypeError.
        s = 'No vocabulary file given in %s.' % functionname
        logging.debug(s)
        return False

    if not os.path.isfile(vocabfile):
        s = 'Vocabulary file %s not found in %s.' % (vocabfile, functionname)
        logging.debug(s)
        return False

    # Determine the dialect of the input file
    inputdialect = csv_file_dialect(inputfile)

    # Determine the encoding of the input file
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    # Read the header from the input file
    inputheader = read_header(inputfile,
                              dialect=inputdialect,
                              encoding=encoding)

    if inputheader is None:
        s = 'Unable to read header from input file %s in %s.' % (inputfile,
                                                                 functionname)
        logging.debug(s)
        return False

    # Make sure there is a separator for the next step
    if separator is None or len(separator) == 0:
        separator = '|'

    # Make a list of the fields in the key by splitting it on the separator
    fieldlist = key.split(separator)

    # Search the cleaned up header for any field from the key
    cleanedinputheader = strip_list(inputheader)
    if not any(field in cleanedinputheader for field in fieldlist):
        s = 'No field from %s found ' % fieldlist
        s += 'in input file %s in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    # Get the vocabulary dictionary, but convert all entries using ustripstr. Assume
    # vocabulary file is encoded as utf-8.
    vocabdict = vocab_dict_from_file(vocabfile, key, encoding='utf-8',
        separator=separator, function=ustripstr)
    if len(vocabdict) == 0:
        s = 'Vocabulary file %s ' % vocabfile
        s += 'had zero recommendations in %s.' % functionname
        logging.debug(s)
        return False

    if format is None or format.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    # Create an output header that is the same as the input header with fields
    # appended to hold the original values of the key fields. A key field that is not
    # yet in the header is appended under its own name instead. Work on a copy so
    # that the input header is left as it was read.
    outputheader = list(cleanedinputheader)
    for field in fieldlist:
        if field in outputheader:
            outputheader.append(field + '_orig')
        else:
            outputheader.append(field)

    # Create the outputfile and write the new header to it
    write_header(reportfile, outputheader, outputdialect)

    # Check to see if the outputfile was created
    if not os.path.isfile(reportfile):
        s = 'reportfile: %s not created in %s.' % (reportfile, functionname)
        logging.debug(s)
        return False

    # Open the outputfile to append rows having the added fields
    with open(reportfile, 'a') as outfile:
        writer = csv.DictWriter(outfile,
                                dialect=outputdialect,
                                encoding='utf-8',
                                fieldnames=outputheader)
        # Iterate through all rows in the input file
        for row in read_csv_row(inputfile,
                                dialect=inputdialect,
                                encoding=encoding,
                                header=True,
                                fieldnames=cleanedinputheader):
            # Set the _orig values for every field in the field list that exists in
            # the row
            for field in fieldlist:
                if field in row:
                    row[field + '_orig'] = row[field]

            # Construct a composite field value for the row to match a key in the
            # vocabulary file
            rowkey = extract_values_from_row(row, fieldlist, separator)

            # Get dictionary for recommended value for the ustripstr(rowkey)
            newvaluedict = recommended_value(vocabdict, ustripstr(rowkey))

            # Only make changes if there is a standardized value found
            if newvaluedict is not None:
                # Treat standard value that is None or only whitespace as ''
                standard = newvaluedict['standard']
                if standard is None or len(standard.strip()) == 0:
                    standard = ''

                # Make a list of values given in standard
                newvalues = standard.split(separator)

                # Only make changes if the number of recommendation fields is the
                # same as the number of fields in the key
                if len(newvalues) == len(fieldlist):
                    for field, newvalue in zip(fieldlist, newvalues):
                        row[field] = newvalue

            writer.writerow(row)

    s = 'Report written to %s in %s.' % (reportfile, functionname)
    logging.debug(s)
    return True
コード例 #4
0
def uuid_term_appender(inputfile,
                       outputfile,
                       key,
                       guidtype=None,
                       encoding=None,
                       format=None):
    ''' Copy the input file to the output file with one extra field appended to every
        row, filled with an identifier obtained from get_guid().
    parameters:
        inputfile - full path to the input file (required)
        outputfile - full path to the output file (required)
        key - name of the field to add; it must not already be in the input header
            (required)
        guidtype - passed as-is to get_guid() to select the kind of identifier
            (optional; default None)
        encoding - string signifying the encoding of the input file. When None or
            blank it is determined with csv_file_encoding() (optional; default None)
            (e.g., 'utf-8')
        format - string signifying the csv.dialect of the output file ('csv' or 'txt')
            (optional; default: txt)
    returns:
        success - True if the output file was written, else False
    '''
    functionname = 'uuid_term_appender()'

    def _abort(reason):
        ''' Log the reason for giving up and report failure.'''
        logging.debug(reason)
        return False

    if outputfile is None or len(outputfile) == 0:
        return _abort('No outputfile name given in %s.' % functionname)

    if inputfile is None or len(inputfile) == 0:
        return _abort('No inputfile file given in %s.' % functionname)

    if not os.path.isfile(inputfile):
        return _abort('Inputfile file %s not found in %s.' % (inputfile, functionname))

    # Sniff the dialect of the source, and its encoding unless the caller supplied one
    sourcedialect = csv_file_dialect(inputfile)
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    sourceheader = read_header(inputfile, dialect=sourcedialect, encoding=encoding)
    if sourceheader is None:
        return _abort('Unable to read header from input file %s in %s.' % (inputfile,
                                                                            functionname))

    if key is None or len(key.strip()) == 0:
        return _abort('No key given in %s.' % functionname)

    # The new field must not collide with one already in the file
    if key in sourceheader:
        reason = 'field %s ' % key
        reason += 'already exists in file %s ' % inputfile
        reason += 'in %s.' % functionname
        return _abort(reason)

    # Tab-separated output unless some other format was requested
    usetabs = format is None or format.lower() == 'txt'
    targetdialect = tsv_dialect() if usetabs else csv_dialect()

    # The output header is the input header followed by the identifier field
    targetheader = sourceheader + [key]

    # Start the output file with its header, then make sure the file is really there
    write_header(outputfile, targetheader, targetdialect)
    if not os.path.isfile(outputfile):
        return _abort('outputfile: %s was not created in %s.' % (outputfile,
                                                                 functionname))

    # Append every input row, extended with a freshly obtained identifier
    with open(outputfile, 'a') as outfile:
        writer = csv.DictWriter(outfile,
                                dialect=targetdialect,
                                encoding='utf-8',
                                fieldnames=targetheader)

        for row in read_csv_row(inputfile,
                                dialect=sourcedialect,
                                encoding=encoding,
                                header=True,
                                fieldnames=sourceheader):
            row[key] = get_guid(guidtype)
            writer.writerow(row)

    logging.debug('Output file written to %s in %s.' % (outputfile, functionname))
    return True
コード例 #5
0
def term_setter_report(inputfile,
                       reportfile,
                       key,
                       constantvalues=None,
                       separator=None,
                       encoding=None,
                       format=None):
    ''' Write a file substituting constants for fields that already exist in an input file
        and with added fields with constants for fields that do not already exist in an
        inputfile. Field name matching is exact.
    parameters:
        inputfile - full path to the input file (required)
        reportfile - full path to the output file (required)
        key - field or separator-separated fields to set (required)
        constantvalues - value or separator-separated values to set the field(s) to;
            there must be exactly one value per field in the key (required)
        separator - string to use as the key and value separator (optional; default '|')
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - string signifying the csv.dialect of the report file ('csv' or 'txt')
            (optional; default: txt)
    returns:
        success - True if the report was written, else False
    '''
    functionname = 'term_setter_report()'

    if reportfile is None or len(reportfile) == 0:
        s = 'No reportfile name given in %s.' % functionname
        logging.debug(s)
        return False

    if inputfile is None or len(inputfile) == 0:
        s = 'No inputfile file given in %s.' % functionname
        logging.debug(s)
        return False

    if not os.path.isfile(inputfile):
        s = 'Inputfile file %s not found in %s.' % (inputfile, functionname)
        logging.debug(s)
        return False

    # Determine the dialect of the input file
    inputdialect = csv_file_dialect(inputfile)

    # Determine the encoding of the input file
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    # Read the header from the input file
    inputheader = read_header(inputfile,
                              dialect=inputdialect,
                              encoding=encoding)

    if inputheader is None:
        s = 'Unable to read header from input file %s in %s.' % (inputfile,
                                                                 functionname)
        logging.debug(s)
        return False

    if key is None or len(key.strip()) == 0:
        s = 'No key given in %s.' % functionname
        logging.debug(s)
        return False

    if constantvalues is None or len(constantvalues) == 0:
        s = 'No constantvalues given in %s.' % functionname
        logging.debug(s)
        return False

    # Make sure there is a separator for the next step
    if separator is None or len(separator) == 0:
        separator = '|'

    # Get the fields to set by splitting the key with the separator
    fields = key.split(separator)

    # Get the values to set by splitting the constantvalues with the separator
    addedvalues = constantvalues.split(separator)

    # Abort if there is a mismatch in the lengths of the field and constants lists
    if len(fields) != len(addedvalues):
        s = 'length of field list: %s ' % key
        s += 'does not match length of constants list: %s ' % constantvalues
        s += 'in %s.' % functionname
        logging.debug(s)
        return False

    if format is None or format.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    # Make an outputheader that is a copy of the inputheader, then add the fields
    # that are not in it yet. The inputheader itself is left as it was read because
    # it is still needed to read the rows of the input file.
    outputheader = list(inputheader)
    for field in fields:
        if field not in outputheader:
            outputheader.append(field)

    # Create the outputfile and write the new header to it
    write_header(reportfile, outputheader, outputdialect)

    # Check to see if the outputfile was created
    if not os.path.isfile(reportfile):
        # Report the reportfile; there is no variable named outputfile in this function.
        s = 'reportfile: %s was not created in %s.' % (reportfile,
                                                       functionname)
        logging.debug(s)
        return False

    # Open the outputfile to append rows with fields set to constant values
    with open(reportfile, 'a') as outfile:
        writer = csv.DictWriter(outfile,
                                dialect=outputdialect,
                                encoding='utf-8',
                                fieldnames=outputheader)

        # Iterate through all rows in the input file
        for row in read_csv_row(inputfile,
                                dialect=inputdialect,
                                encoding=encoding,
                                header=True,
                                fieldnames=inputheader):
            # Set every field in the key to its constant; the lists have equal length
            for field, constant in zip(fields, addedvalues):
                row[field] = constant
            # Write the updated row to the outputfile
            writer.writerow(row)

    s = 'Report written to %s in %s.' % (reportfile, functionname)
    logging.debug(s)
    return True
コード例 #6
0
def darwinize_header(options):
    ''' Translate field names from input file to Darwin Core field names in outputfile
        using a Darwin Cloud vocabulary lookup.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional; default './')
        inputfile - full path to the input file (required)
        dwccloudfile - full path to the vocabulary file containing the Darwin Cloud
           terms (required)
        outputfile - name of the output file, without path (required)
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - output file format: 'csv' for comma-separated, any other non-empty
            value for tab-separated (optional; when missing or empty the output uses
            the dialect detected for the input file)
        namespace - prepend namespace to fields that were darwinized
            (optional; default 'n') (e.g., 'y', 'n')
    returns a dictionary with information about the results
        workspace - path to the directory for the outputfile
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    # Make a dictionary for artifacts left behind
    artifacts = {}

    def _option(name, default=None):
        ''' Get options[name], or default when the option is missing or options is not
            a usable dictionary. Only lookup failures are absorbed.'''
        try:
            return options[name]
        except (KeyError, TypeError):
            return default

    def _fail(message):
        ''' Log the reason for failure and build the unsuccessful response from the
            current values of workspace and outputfile.'''
        logging.debug('message:\n%s' % message)
        returnvals = [workspace, outputfile, False, message, artifacts]
        return response(returnvars, returnvals)

    ### Establish variables ###
    # outputfile stays None until it has been validated, so early failure responses
    # report it as None.
    outputfile = None
    workspace = _option('workspace', './')

    ### Required inputs ###
    inputfile = _option('inputfile')
    if inputfile is None or len(inputfile) == 0:
        return _fail('No input file given. %s' % __version__)

    if not os.path.isfile(inputfile):
        return _fail('Input file %s not found. %s' % (inputfile, __version__))

    dwccloudfile = _option('dwccloudfile')
    if dwccloudfile is None or len(dwccloudfile) == 0:
        return _fail('No Darwin Cloud vocabulary file given. %s' % __version__)

    if not os.path.isfile(dwccloudfile):
        return _fail('Darwin Cloud vocabulary file not found. %s' % __version__)

    outputfile = _option('outputfile')
    if outputfile is None or len(outputfile) == 0:
        return _fail('No output file given. %s' % __version__)

    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    ### Optional inputs ###
    encoding = _option('encoding')
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)

    namespace = _option('namespace', 'n')

    inputdialect = csv_file_dialect(inputfile)

    # Without a requested format the output keeps the dialect of the input
    outputformat = _option('format')
    if outputformat is None or len(outputformat) == 0:
        outputdialect = inputdialect
    elif outputformat.lower() == 'csv':
        outputdialect = csv_dialect()
    else:
        outputdialect = tsv_dialect()

    header = read_header(inputfile, dialect=inputdialect, encoding=encoding)
    dwcheader = darwinize_list(header, dwccloudfile, namespace)

    if dwcheader is None:
        return _fail('Unable to create darwinized header. %s' % __version__)

    # Write the new header to the outputfile. Only an explicit False counts as failure.
    if write_header(outputfile, dwcheader, dialect=outputdialect) == False:
        return _fail('Unable to write header to output file. %s' % __version__)

    # Read the rows of the input file, append them to the output file after the
    # header with columns in the same order. The rows are written with the original
    # header as fieldnames; the darwinized names appear only in the header line
    # that was already written.
    with open(outputfile, 'a') as outfile:
        writer = csv.DictWriter(outfile,
                                dialect=outputdialect,
                                encoding='utf-8',
                                fieldnames=header)
        for row in read_csv_row(inputfile, inputdialect, encoding):
            writer.writerow(row)

    artifacts['darwinized_header_file'] = outputfile
    returnvals = [workspace, outputfile, True, None, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
コード例 #7
0
def text_file_aggregator(options):
    ''' Join the contents of the files matching a path pattern into a single file with
        one header line. Headers and encodings of the input files are not assumed to be
        the same: the dialect and encoding of every input file is detected separately and
        the output is written as utf-8 using the composite header of all input files.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional; default './')
        inputpath - path pattern, as expanded by glob.glob(), of the files to aggregate
            Example: ./workspace/*.txt (required)
        outputfile - name of the output file, without path (optional; default
            'aggregate_<uuid>.<format>')
        format - output file format; 'txt' writes with tsv_dialect(), any other value
            writes with csv_dialect() (optional; default 'txt')
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output file
        aggregaterowcount - the number of rows in the aggregated file, not counting header
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    setup_actor_logging(options)

    logging.debug( 'Started %s' % __version__ )
    logging.debug( 'options: %s' % options )

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'aggregaterowcount', 'success', 'message', 
        'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    ### Custom outputs ###
    aggregaterowcount = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    def getoption(name, default=None):
        # A missing key, or options not being subscriptable by name, yields the default.
        try:
            return options[name]
        except (KeyError, TypeError):
            return default

    ### Establish variables ###
    workspace = getoption('workspace', './')
    inputpath = getoption('inputpath')
    outputfile = getoption('outputfile')
    outputformat = getoption('format')

    # An explicit None or empty workspace would otherwise fail on rstrip() or resolve
    # to the file system root below.
    if workspace is None or len(workspace)==0:
        workspace = './'

    if inputpath is None or len(inputpath)==0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, aggregaterowcount, success, message,
            artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if outputformat is None or len(outputformat)==0:
        outputformat = 'txt'

    if outputformat.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    if outputfile is None or len(outputfile)==0:
        # The format doubles as the file extension, so it needs the '.' separator.
        outputfile = 'aggregate_%s.%s' % (uuid.uuid1(), outputformat)

    # Construct the output file path in the workspace
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Create the composite header. Let composite_header determine the dialects and 
    # encodings of the files to aggregate.
    aggregateheader = composite_header(inputpath)

    if aggregateheader is None:
        # Without field names there is nothing meaningful to write.
        message = 'Unable to create a composite header for %s. ' % inputpath
        message += '%s' % __version__
        returnvals = [workspace, outputfile, aggregaterowcount, success, message,
            artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    aggregaterowcount = 0

    # Open a file to write the aggregated results in chosen format and utf-8.
    with open(outputfile, 'w') as outfile:
        # extrasaction='ignore' drops any field that is not in the composite header.
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8', 
            fieldnames=aggregateheader, extrasaction='ignore')
        writer.writeheader()
        for sourcefile in glob.glob(inputpath):
            # Each source file is read with its own detected dialect and encoding; these
            # are kept separate from the dialect of the output file.
            sourcedialect = csv_file_dialect(sourcefile)
            sourceencoding = csv_file_encoding(sourcefile)
            with open(sourcefile, 'rU') as infile:
                reader = csv.DictReader(utf8_data_encoder(infile, sourceencoding), 
                    dialect=sourcedialect, encoding=sourceencoding)
                for line in reader:
                    try:
                        writer.writerow(line)
                    except Exception:
                        # Stop at the first row that can not be written. The rows
                        # written so far remain in the output file.
                        message = 'failed to write line:\n%s\n' % line
                        message += 'to file %s. %s' % (sourcefile, __version__)
                        returnvals = [workspace, outputfile, aggregaterowcount, success, 
                            message, artifacts]
                        logging.debug('message:\n%s' % message)
                        return response(returnvars, returnvals)
                    aggregaterowcount += 1

    success = True
    artifacts['aggregated_file'] = outputfile
    returnvals = [workspace, outputfile, aggregaterowcount, success, message,
        artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
コード例 #8
0
def vocab_dict_from_file(vocabfile,
                         key,
                         separator=None,
                         dialect=None,
                         encoding=None,
                         function=None,
                         *args,
                         **kwargs):
    ''' Load a vocabulary file into a dictionary keyed on the values of the key field.
    parameters:
        vocabfile - path to the vocabulary file (required)
        key - the field or separator-separated fieldnames that hold the distinct values 
            in the vocabulary file (required)
        separator - string to use as the value separator in the string 
            (optional; default None)
        dialect - a csv.dialect object with the attributes of the vocabulary file
            (default None, in which case vocab_dialect() is used)
        encoding - a string designating the input file encoding (optional; default None,
            in which case it is detected with csv_file_encoding()) 
            (e.g., 'utf-8', 'mac_roman', 'latin_1', 'cp1252')
        function - function to call on each key value before it is used as a dictionary
            key (default None, in which case the key value is used as found)
        args - unnamed parameters to function as tuple (optional)
        kwargs - named parameters to function as dictionary (optional)
    Example:
       vocab_dict_from_file(v,k,function=ustripstr) would return all of the stripped, 
       uppercased keys and their values from the vocabfile v.
    returns:
        vocabdict - dictionary in which every entry maps a (possibly transformed) key
            value to a copy of its row without the key field, or None if the key or the
            vocabulary file is missing. Later rows replace earlier rows that produce the
            same dictionary key.
    '''
    functionname = 'vocab_dict_from_file()'

    # Guard clauses: nothing can be loaded without a key and an existing file.
    if key is None or not key.strip():
        logging.debug('No key given in %s.' % functionname)
        return None

    if vocabfile is None or len(vocabfile) < 1:
        logging.debug('No vocabulary file given in %s.' % functionname)
        return None

    if not os.path.isfile(vocabfile):
        logging.debug('Vocabulary file %s not found in %s.' % (vocabfile, functionname))
        return None

    # Fall back to the vocabulary dialect when the caller did not give one.
    if dialect is None:
        dialect = vocab_dialect()

    # Detect the encoding when the caller did not give a usable one.
    if encoding is None or not encoding.strip():
        encoding = csv_file_encoding(vocabfile)

    # Field names for the rows, built from the key by vocabheader()
    fieldnames = vocabheader(key, separator)

    vocabdict = {}

    records = read_csv_row(vocabfile,
                           dialect,
                           encoding,
                           header=True,
                           fieldnames=fieldnames)

    for record in records:
        # Work on a copy so that the row delivered by the reader stays intact.
        remainder = copy.deepcopy(record)
        rawvalue = record[key]
        # The key value becomes the dictionary key, so take it out of the stored record.
        remainder.pop(key)
        if function is None:
            # No transformation requested, use the key value exactly as found.
            lookup = rawvalue
        else:
            lookup = function(rawvalue, *args, **kwargs)
        vocabdict[lookup] = remainder

    return vocabdict
コード例 #9
0
def term_token_count_from_file(inputfile,
                               termname,
                               dialect=None,
                               encoding=None):
    ''' Make a dictionary of tokens for a given term in a file along with the number of 
       times each occurs. A token is a maximal run of word characters (regular
       expression \\w) in the value of the term.
    parameters:
        inputfile - full path to the input file (required)
        termname - term for which to count rows (required)
        dialect - csv.dialect object with the attributes of the input files, which must
           all have the same dialect if dialect is given, otherwise it will be detected
           (default None)
        encoding - a string designating the input file encoding (optional; default None) 
            (e.g., 'utf-8', 'mac_roman', 'latin_1', 'cp1252')
    returns:
        tokens - a dictionary containing the tokens and their counts:
            'tokenlist' - dictionary with, for every token, 'rowcount' (number of rows 
                in which the token occurs) and 'totalcount' (number of occurrences)
            'rowcount' - number of rows read from the input file
            'tokencount' - total number of tokens found in all rows
            'input' - the inputfile
            'term' - the termname
        0 is returned instead if the input file or the term name is not given or the
        input file does not exist, None if the term is not in the header of the file.
    '''
    functionname = 'term_token_count_from_file()'

    if inputfile is None or len(inputfile) == 0:
        s = 'No input file given in %s.' % functionname
        logging.debug(s)
        return 0

    if os.path.isfile(inputfile) == False:
        s = 'File %s not found in %s.' % (inputfile, functionname)
        logging.debug(s)
        return 0

    if termname is None or len(termname) == 0:
        s = 'No term name given in %s.' % functionname
        logging.debug(s)
        return 0

    # Determine the dialect of the input file
    if dialect is None:
        dialect = csv_file_dialect(inputfile)
        # csv_file_dialect() always returns a dialect if there is an input file.
        # No need to check.

    # Determine the encoding of the input file
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)
        # csv_file_encoding() always returns an encoding if there is an input file.
        # No need to check.

    inputheader = read_header(inputfile, dialect=dialect, encoding=encoding)

    if termname not in inputheader:
        s = 'Term %s not found in file %s ' % (termname, inputfile)
        s += 'in %s.' % functionname
        logging.debug(s)
        return None

    rowcount = 0
    tokencount = 0
    tokenlist = {}

    for row in read_csv_row(inputfile, dialect, encoding):
        rowcount += 1

        # Look the value up afresh for every row so that nothing is carried over from
        # the previous row when the term is missing from this one.
        termvalue = row.get(termname)

        if termvalue is None or len(termvalue.strip()) == 0:
            # A row without a value has no tokens to contribute.
            continue

        # Everything that is not a word character separates tokens.
        wordlist = re.sub(r"[^\w]", " ", termvalue).split()
        tokencount += len(wordlist)

        # Number of occurrences of each distinct token within this row
        rowtotals = {}
        for token in wordlist:
            rowtotals[token] = rowtotals.get(token, 0) + 1

        # Fold the row into the totals. A token counts once per row for 'rowcount' no
        # matter how often it occurs in that row.
        for token, total in rowtotals.items():
            entry = tokenlist.setdefault(token, {'rowcount': 0, 'totalcount': 0})
            entry['rowcount'] += 1
            entry['totalcount'] += total

    tokens = {'tokenlist': tokenlist}
    tokens['rowcount'] = rowcount
    tokens['tokencount'] = tokencount
    tokens['input'] = inputfile
    tokens['term'] = termname

    return tokens