Ejemplo n.º 1
0
def darwinize_header(options):
    ''' Translate field names from input file to Darwin Core field names in outputfile
        using a Darwin Cloud vocabulary lookup.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional)
        inputfile - full path to the input file (required)
        dwccloudfile - full path to the vocabulary file containing the Darwin Cloud 
           terms (required)
        outputfile - name of the output file, without path (required)
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
        namespace - prepend namespace to fields that were darwinized 
        (optional; default 'no') (e.g., 'y', 'n')
    returns a dictionary with information about the results
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
    '''
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputfile = None
    dwccloudfile = None
    outputfile = None
    encoding = None
    namespace = 'n'
    format = None

    ### Required inputs ###
    # Bare except/pass is the option-reading convention throughout this module:
    # a missing option simply leaves the default in place.
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile = options['inputfile']
    except:
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if os.path.isfile(inputfile) == False:
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    try:
        dwccloudfile = options['dwccloudfile']
    except:
        pass

    if dwccloudfile is None or len(dwccloudfile) == 0:
        message = 'No Darwin Cloud vocabulary file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if os.path.isfile(dwccloudfile) == False:
        message = 'Darwin Cloud vocabulary file not found. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    try:
        outputfile = options['outputfile']
    except:
        pass

    if outputfile is None or len(outputfile) == 0:
        message = 'No output file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # The output file is always written inside the workspace directory.
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    try:
        encoding = options['encoding']
    except:
        pass

    # Detecting the encoding can be slow (see docstring), so only do it when
    # no usable encoding was supplied.
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)
    try:
        namespace = options['namespace']
    except:
        pass

    # Determine the CSV dialect of the input file.
    inputdialect = csv_file_dialect(inputfile)

    try:
        format = options['format']
    except:
        pass

    # Choose the output dialect: default to the input's dialect, otherwise
    # CSV or TSV depending on the requested format.
    if format is None or len(format) == 0:
        outputdialect = inputdialect
    elif format.lower() == 'csv':
        outputdialect = csv_dialect()
    else:
        outputdialect = tsv_dialect()

    # Translate the input header to Darwin Core terms via the vocabulary file.
    header = read_header(inputfile, dialect=inputdialect, encoding=encoding)
    dwcheader = darwinize_list(header, dwccloudfile, namespace)

    if dwcheader is None:
        message = 'Unable to create darwinized header. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Write the new header to the outputfile
    if write_header(outputfile, dwcheader, dialect=outputdialect) == False:
        message = 'Unable to write header to output file. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Read the rows of the input file, append them to the output file after the
    # header with columns in the same order.
    # Append mode: write_header above presumably created the file already.
    # NOTE(review): DictWriter's encoding kwarg implies this module uses
    # unicodecsv (the stdlib csv module has no such parameter) — confirm the
    # module's imports.
    with open(outputfile, 'a') as outfile:
        # fieldnames=header keys the rows by their ORIGINAL field names, so
        # the column order matches the darwinized header written above.
        writer = csv.DictWriter(outfile,
                                dialect=outputdialect,
                                encoding='utf-8',
                                fieldnames=header)
        for row in read_csv_row(inputfile, inputdialect, encoding):
            writer.writerow(row)
            #print 'row: %s' % row

    success = True
    artifacts['darwinized_header_file'] = outputfile
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def term_value_count_reporter(options):
    ''' Extract a list of the distinct values of a given term in a text file along with 
        the number of times each occurs.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the output file (optional; default './')
        inputfile - full path to the input file (required)
        outputfile - name of the output file, without path (optional)
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'csv')
        termlist - list of fields in the field combination to count (required)
        separator - string that separates the values in the output (e.g., '|') 
            (optional; default '|')
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output tsv file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputfile = None
    outputfile = None
    format = None
    termlist = None
    separator = '|'
    encoding = None

    ### Required inputs ###
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile = options['inputfile']
    except:
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Look to see if the input file is at the absolute path or in the workspace.
    if os.path.isfile(inputfile) == False:
        if os.path.isfile(workspace + '/' + inputfile) == True:
            inputfile = workspace + '/' + inputfile
        else:
            message = 'Input file %s not found. %s' % (inputfile, __version__)
            returnvals = [workspace, outputfile, success, message, artifacts]
            logging.debug('message:\n%s' % message)
            return response(returnvars, returnvals)

    try:
        termlist = options['termlist']
    except:
        pass

    if termlist is None or len(termlist) == 0:
        message = 'No field list given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    ### Optional inputs ###
    try:
        separator = options['separator']
    except:
        pass

    try:
        format = options['format']
    except:
        pass

    # Fall back to the default for a missing OR blank format. (Previously only
    # None was tested, so format='' yielded an output file name ending in '.')
    if format is None or len(format.strip()) == 0:
        format = 'csv'

    try:
        encoding = options['encoding']
    except:
        pass

    try:
        outputfile = options['outputfile']
    except:
        pass

    # rootname (underscore-joined) is used in file and artifact names;
    # termname (separator-joined) names the composite term being counted.
    rootname = '_'.join(termlist)
    termname = separator.join(termlist)

    if outputfile is None or len(outputfile) == 0:
        outputfile = '%s_count_report_%s.%s' % (rootname, str(uuid.uuid1()), format)

    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Get the list of values for the field given by termname along with their counts.
    counts = extract_value_counts_from_file(inputfile, termlist, separator=separator, 
        encoding=encoding)

    # Try to create the report for the term value counts.
    success = term_value_count_report(outputfile, counts, termname=termname, format=format)

    if success == False:
        message = 'No count report created for %s from %s. ' % (termname, outputfile)
        message += '%s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    s = '%s_count_report_file' % rootname
    artifacts[s] = outputfile
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
Ejemplo n.º 3
0
def dataset_term_standardizer(options):
    ''' Create an output file replacing values from an input file for fields given in key 
        with standard values and adding new fields to hold the original values.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the output file (optional; default './')
        inputfile - path to the input file. Either full path or path within the workspace
            (required)
        outputfile - name of the output file, without path (optional)
        vocabfile - path to the vocabulary file. Either full path or path within the
           workspace (required)
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
        key - field or separator-separated fields whose values are to be standardized
            (required)
        separator - string to use as the key and value separator (optional; default '|')
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output report file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    # Debug print commented out for consistency with the other actors here.
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputfile = None
    outputfile = None
    vocabfile = None
    format = 'txt'
    key = None
    separator = '|'
    encoding = None

    ### Required inputs ###
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile = options['inputfile']
    except:
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Look to see if the input file is at the absolute path or in the workspace.
    if os.path.isfile(inputfile) == False:
        if os.path.isfile(workspace + '/' + inputfile) == True:
            inputfile = workspace + '/' + inputfile
        else:
            message = 'Input file %s not found. %s' % (inputfile, __version__)
            # Bug fix: this branch previously reported success=True for a
            # missing input file.
            returnvals = [workspace, outputfile, success, message, artifacts]
            logging.debug('message:\n%s' % message)
            return response(returnvars, returnvals)

    try:
        vocabfile = options['vocabfile']
    except:
        pass

    if vocabfile is None or len(vocabfile) == 0:
        message = 'No vocab file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Look to see if vocab file is at the absolute path or in the workspace.
    # NOTE(review): the workspace-relative path is not checked for existence;
    # presumably term_standardizer_report reports a missing vocab file — confirm.
    if os.path.isfile(vocabfile) == False:
        vocabfile = workspace + '/' + vocabfile

    try:
        key = options['key']
    except:
        pass

    if key is None or len(key) == 0:
        message = 'No key given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    ### Optional inputs ###
    try:
        separator = options['separator']
    except:
        pass

    try:
        format = options['format']
    except:
        pass

    try:
        encoding = options['encoding']
    except:
        pass

    try:
        outputfile = options['outputfile']
    except:
        pass

    # Construct a default output file name in the workspace if none was given.
    if outputfile is None or len(outputfile.strip()) == 0:
        outputfile = '%s/%s_changed_report_%s.%s' % \
          (workspace.rstrip('/'), slugify(key), str(uuid.uuid1()), format)
    else:
        outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Create the standardization report. (A dead local that pre-split the key
    # into fields was removed; the key is passed through whole.)
    success = term_standardizer_report(inputfile, outputfile, vocabfile, key, \
        separator=separator, encoding=encoding, format=format)

    # outputfile is guaranteed non-None at this point, so only existence of
    # the file needs checking.
    if not os.path.isfile(outputfile):
        message = 'Failed to write results to output file %s. ' % outputfile
        message += '%s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    s = '%s_change_report_file' % slugify(key)
    artifacts[s] = outputfile
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
Ejemplo n.º 4
0
def csv_field_selector(options):
    ''' Create a new file by selecting only fields in a termlist in the order given in
        that list.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory to work in (optional)
        inputfile - full path to the input file (required)
        outputfile - name of the output file, without path (required)
        termlist - list of fields to extract from the input file (required)
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
    returns a dictionary with information about the results
        outputfile - actual full path to the output file
        workspace - path to a directory for the output artifacts
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputfile = None
    outputfile = None
    termlist = None
    encoding = None
    format = 'txt'

    ### Required inputs ###
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile = options['inputfile']
    except:
        pass

    try:
        outputfile = options['outputfile']
    except:
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if os.path.isfile(inputfile) == False:
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    try:
        termlist = options['termlist']
    except:
        pass

    if termlist is None or len(termlist) == 0:
        message = 'No termlist given. %s' % __version__
        # Bug fix: this branch previously referenced the undefined name
        # 'extractedvalues' (raising NameError) and supplied only four values
        # for the five fields named in returnvars.
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    ### Optional inputs ###
    try:
        format = options['format']
    except:
        pass

    try:
        encoding = options['encoding']
    except:
        pass

    if outputfile is None or len(outputfile) == 0:
        message = 'No output file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # The output file is always written inside the workspace directory.
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Do the field selection. Let the selector figure out the input dialect.
    success = csv_select_fields(inputfile, outputfile, termlist, dialect=None, 
        encoding=encoding, format=format)

    if success == False:
        message = 'Unable to select fields from %s. %s' % (inputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    artifacts['selected_field_file'] = outputfile
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
Ejemplo n.º 5
0
def utf8_encoder(options):
    ''' Translate input file from its current encoding to utf8.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional)
        inputfile - full path to the input file (required)
        outputfile - name of the output file, without path (required)
        encoding - the encoding of the input file (optional)
    returns a dictionary with information about the results
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
    '''
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Fields of the response dictionary.
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    # Persistent files created by this actor.
    artifacts = {}

    def pull(key, fallback=None):
        # Bare except mirrors the option-reading convention of this module:
        # any problem fetching an option yields the fallback value.
        try:
            return options[key]
        except:
            return fallback

    workspace = pull('workspace', './')
    inputfile = pull('inputfile')
    outputfile = pull('outputfile')

    def bail(message):
        # Assemble a failure response (success=False) and log the reason.
        logging.debug('message:\n%s' % message)
        returnvals = [workspace, outputfile, False, message, artifacts]
        return response(returnvars, returnvals)

    if inputfile is None or len(inputfile) == 0:
        return bail('No input file given. %s' % __version__)

    if not os.path.isfile(inputfile):
        return bail('Input file %s not found. %s' % (inputfile, __version__))

    if outputfile is None or len(outputfile) == 0:
        return bail('No output file given. %s' % __version__)

    # The output file is always written inside the workspace directory.
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    encoding = pull('encoding')

    # Do the actual translation to utf8.
    success = utf8_file_encoder(inputfile, outputfile, encoding)

    if success == False:
        return bail('Unable to translate %s to utf8 encoding. %s' % \
            (inputfile, __version__))

    artifacts['utf8_encoded_file'] = outputfile
    returnvals = [workspace, outputfile, success, None, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
                workspace, outputfile, rowcount, success, message, artifacts
            ]
            logging.debug('message:\n%s' % message)
            return response(returnvars, returnvals)
    # Read the Darwin Core archive and write its core rows to the output file.
    else:
        try:
            with DwCAReader(inputfile) as dwcareader:
                rowcount = write_core_csv_file(dwcareader, outputfile)
        # Python 2 except syntax: any failure reading the archive becomes an
        # error response instead of an uncaught exception.
        except Exception, e:
            message = 'Error %s reading archive %s. %s' % (e, inputfile,
                                                           __version__)
            returnvals = [
                workspace, outputfile, rowcount, success, message, artifacts
            ]
            logging.debug('message:\n%s' % message)
            return response(returnvars, returnvals)

    # Zero rows written means the output file could not be produced.
    if rowcount == 0:
        message = 'Unable to create outputfile %s. %s' % (outputfile,
                                                          __version__)
        returnvals = [
            workspace, outputfile, rowcount, success, message, artifacts
        ]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    success = True
    # NOTE(review): this guard is always True immediately after the line
    # above; kept as-is to avoid behavior changes in this fragment.
    if success == True:
        artifacts['dwca_core_to_tsv_outputfile'] = outputfile

    returnvals = [workspace, outputfile, rowcount, success, message, artifacts]
Ejemplo n.º 7
0
def text_file_splitter(options):
    ''' Split a text file into chunks with headers. Put the chunk files in the workspace.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - the directory in which the output will be written (optional)
        inputfile - full path to the input file (required)
        chunksize - the maximum number of records in an output file (optional;
            default 10000)
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        filepattern - the pattern for the split file names
        chunks - the number of files created from the split
        rowcount - the number of rows in the file that was split, not counting header
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
    '''
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = [
        'workspace', 'filepattern', 'chunks', 'rowcount', 'success', 'message'
    ]

    ### Standard outputs ###
    success = False
    message = None

    ### Custom outputs ###
    filepattern = None
    chunks = None
    rowcount = None

    ### Establish variables ###
    workspace = './'
    inputfile = None
    chunksize = 10000

    ### Required inputs ###
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile = options['inputfile']
    except:
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [
            workspace, filepattern, chunks, rowcount, success, message
        ]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if not os.path.isfile(inputfile):
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [
            workspace, filepattern, chunks, rowcount, success, message
        ]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    ### Optional inputs ###
    # Coerce to int so a chunksize passed as a numeric string still works;
    # a missing or unusable value silently keeps the default, in line with
    # the option-reading convention of this module.
    try:
        chunksize = int(options['chunksize'])
    except:
        pass

    # filepattern is the base name used for the chunk files; fileext is the
    # extension they share with the input file.
    path, fileext, filepattern = split_path(inputfile)

    rowcount = 0
    chunks = 0
    # dest is the currently open chunk file, if any.
    dest = None

    # Open the file in universal-newline mode. The try/finally guarantees the
    # input and the current chunk file are closed even if an error occurs
    # (previously both handles leaked on any exception in the loop).
    infile = open(inputfile, 'rU')
    try:
        # The first line of the file is the header, repeated in every chunk.
        try:
            header = next(infile)
        except StopIteration:
            # Robustness fix: an empty input file previously raised an
            # uncaught StopIteration instead of returning an error response.
            message = 'Input file %s is empty. %s' % (inputfile, __version__)
            returnvals = [
                workspace, filepattern, chunks, rowcount, success, message
            ]
            logging.debug('message:\n%s' % message)
            return response(returnvars, returnvals)

        # Iterate though the entire input file
        for line in infile:
            # Start a new chunk file every chunksize rows, with a header.
            if rowcount % chunksize == 0:
                # Close the old chunk file, if there is one
                if dest:
                    dest.close()
                destfile = workspace + '/' + filepattern + '-' + str(
                    chunks) + '.' + fileext
                dest = open(destfile, 'w')
                dest.write(header)
                chunks += 1
            # Write a line to the current chunk and keep going
            dest.write(line)
            rowcount += 1
    finally:
        if dest:
            dest.close()
        infile.close()

    success = True

    # Prepare the response dictionary
    returnvals = [workspace, filepattern, chunks, rowcount, success, message]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def darwin_cloud_collector(options):
    ''' Get field names from inputfile and put any that are not Simple Darwin Core into 
       outputfile.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional)
        inputfile - full path to the input file (required)
        outputfile - name of the output file, without path (required)
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
    returns a dictionary with information about the results
        addedvalues - new values added to the output file
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
    '''
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Fields of the response dictionary.
    returnvars = ['workspace', 'addedvalues', 'outputfile', 'success', 'message', \
        'artifacts']

    ### Standard outputs ###
    message = None

    ### Custom outputs ###
    addedvalues = None

    # Persistent files created by this actor.
    artifacts = {}

    def grab(key, fallback=None):
        # Bare except mirrors the option-reading convention of this module:
        # any problem fetching an option yields the fallback value.
        try:
            return options[key]
        except:
            return fallback

    workspace = grab('workspace', './')
    inputfile = grab('inputfile')
    outputfile = grab('outputfile')

    def refuse(message):
        # Assemble a failure response (success=False) and log the reason.
        logging.debug('message:\n%s' % message)
        returnvals = [workspace, addedvalues, outputfile, False, message, artifacts]
        return response(returnvars, returnvals)

    if inputfile is None or len(inputfile) == 0:
        return refuse('No input file given. %s' % __version__)

    if not os.path.isfile(inputfile):
        return refuse('Input file %s not found. %s' % (inputfile, __version__))

    if outputfile is None or len(outputfile) == 0:
        return refuse('No output file given. %s' % __version__)

    # The output file is always written inside the workspace directory.
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    encoding = grab('encoding')

    # Read the header and let read_header figure out the dialect and encoding.
    header = read_header(inputfile, encoding=encoding)
    nondwc = terms_not_in_dwc(header, casesensitive=False)

    # Add the distinct non-Darwin Core field names to the vocabulary file.
    dialect = vocab_dialect()
    addedvalues = distinct_vocabs_to_file(outputfile,
                                          nondwc,
                                          'fieldname',
                                          dialect=dialect)
    success = True

    if addedvalues is not None:
        artifacts['darwin_cloud_collector_file'] = outputfile

    returnvals = [workspace, addedvalues, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def term_unknown_reporter(options):
    ''' Report a list of values from a field in an input file that are not in a given 
        vocabulary.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the tsvfile (optional; default './')
        inputfile - path to the input file. Either full path or path within the workspace
            (required)
        vocabfile - path to the vocabulary file. Either full path or path within the
           workspace (required)
        outputfile - name of the output file, without path (optional)
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
        key - the field or separator-separated fieldnames that hold the distinct values 
              in the vocabulary file (required)
        separator - string to use as the value separator in the string 
            (optional; default None)
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output report file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    setup_actor_logging(options)

    # Keys of the response dictionary, in order.
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Dictionary of persistent objects created by this actor.
    artifacts = {}

    ### Inputs, with defaults where the option is absent ###
    workspace = options.get('workspace', './')
    inputfile = options.get('inputfile')
    vocabfile = options.get('vocabfile')
    outputfile = options.get('outputfile')
    format = options.get('format', 'txt')
    key = options.get('key')
    separator = options.get('separator')
    encoding = options.get('encoding')

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Look to see if the input file is at the absolute path or in the workspace.
    if not os.path.isfile(inputfile):
        if os.path.isfile(workspace + '/' + inputfile):
            inputfile = workspace + '/' + inputfile
        else:
            message = 'Input file %s not found. %s.' % (inputfile, __version__)
            returnvals = [workspace, outputfile, success, message, artifacts]
            logging.debug('message:\n%s' % message)
            return response(returnvars, returnvals)

    if vocabfile is None or len(vocabfile) == 0:
        message = 'No vocab file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Look to see if vocab file is at the absolute path or in the workspace.
    # NOTE(review): existence of the workspace-relative path is not verified here;
    # a missing vocab file surfaces later in missing_vocab_list_from_file().
    if not os.path.isfile(vocabfile):
        vocabfile = workspace + '/' + vocabfile

    if key is None or len(key) == 0:
        message = 'No key in term_unknown_reporter. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Default the report name to a slug of the key plus a UUID so that parallel
    # runs do not collide on the same file.
    if outputfile is None or len(outputfile.strip()) == 0:
        outputfile = '%s/%s_standardization_report_%s.%s' % \
          (workspace.rstrip('/'), slugify(key), str(uuid.uuid1()), format)
    else:
        outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Get a list of distinct values of the term in the input file. A composite
    # key is split into its component fields.
    if separator is None or len(separator) == 0:
        fields = [key]
    else:
        fields = key.split(separator)

    # Let extract_values_from_file figure out the dialect of inputfile. Pass
    # separator by keyword for consistency with vocab_extractor().
    checklist = extract_values_from_file(inputfile,
                                         fields,
                                         separator=separator,
                                         encoding=encoding,
                                         function=ustripstr)

    if checklist is None or len(checklist) == 0:
        message = 'No values of %s from %s. %s' % (key, inputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    # Get a dictionary of checklist values not found in the vocabfile, which is assumed
    # to be in utf-8 encoding.
    missingvocablist = missing_vocab_list_from_file(checklist,
                                                    vocabfile,
                                                    key,
                                                    separator=separator,
                                                    encoding='utf-8')

    # Nothing missing is a successful, if empty, result; no report is written.
    if missingvocablist is None or len(missingvocablist) == 0:
        message = 'No missing values of %s from %s ' % (key, inputfile)
        message += 'found in %s. %s' % (vocabfile, __version__)
        success = True
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # TODO: Use Allan's DQ report framework
    # Validation, Improvement, Measure
    # Create a series of term reports
    success = term_list_report(outputfile,
                               missingvocablist,
                               key,
                               format=format)

    # outputfile is always a non-empty path here; just confirm it was written.
    if not os.path.isfile(outputfile):
        message = 'Failed to write results to output file %s.' % outputfile
        message += '%s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    artifacts['%s_unknown_report_file' % key] = outputfile
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
Ejemplo n.º 10
0
def composite_header_constructor(options):
    ''' Create a file with a header that contains the distinct union of column names from 
        two input files.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the output file (optional; default './')
        inputfile1 - full path to one of the input files (optional)
        inputfile2 - full path to the second input file (optional)
        outputfile - name of the output file, without path (required)
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
    returns a dictionary with information about the results
        compositeheader - header combining two inputs
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Keys of the response dictionary, in order.
    returnvars = ['workspace', 'compositeheader', 'outputfile', 'success', 'message', \
        'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Dictionary of persistent objects created by this actor.
    artifacts = {}

    ### Inputs, with defaults where the option is absent ###
    workspace = options.get('workspace', './')
    inputfile1 = options.get('inputfile1')
    inputfile2 = options.get('inputfile2')
    outputfile = options.get('outputfile')
    format = options.get('format', 'txt')
    compositeheader = None

    if outputfile is None or len(outputfile) == 0:
        message = 'No output file given. %s' % __version__
        returnvals = [workspace, compositeheader, outputfile, success, message, artifacts]
        # Log the failure, consistent with the other actors in this module.
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Read the headers of the two files and let read_header figure out the dialects and
    # encodings.
    header1 = read_header(inputfile1)
    header2 = read_header(inputfile2)

    compositeheader = merge_headers(header1, header2)

    # 'txt' output is tab-separated; anything else is comma-separated.
    if format is None or format.lower() == 'txt':
        dialect = tsv_dialect()
    else:
        dialect = csv_dialect()

    # Write the resulting header into outputfile.
    success = write_header(outputfile, compositeheader, dialect)
    if not success:
        message = 'Header was not written. %s' % __version__
        returnvals = [workspace, compositeheader, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Make the header JSON-serializable for the response.
    if compositeheader is not None:
        compositeheader = list(compositeheader)

    artifacts['composite_header_file'] = outputfile

    returnvals = [workspace, compositeheader, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def term_token_reporter(options):
    ''' Get a dictionary of counts of tokens for a given term in an input file.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional)
        inputfile - full path to the input file (required)
        outputfile - name of the output file, without path (optional)
        termname - the name of the term for which to count rows (required)
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output report file
        tokens - a dictionary of tokens from the term in the inputfile
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Keys of the response dictionary, in order.
    returnvars = [
        'workspace', 'outputfile', 'tokens', 'success', 'message', 'artifacts'
    ]

    # Standard outputs.
    success = False
    message = None

    # Custom outputs.
    tokens = None

    # Persistent objects created by this actor.
    artifacts = {}

    # Pull the inputs out of the options dictionary, with defaults.
    workspace = options.get('workspace', './')
    inputfile = options.get('inputfile')
    outputfile = options.get('outputfile')
    termname = options.get('termname')
    encoding = options.get('encoding')

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        logging.debug('message:\n%s' % message)
        returnvals = [
            workspace, outputfile, tokens, success, message, artifacts
        ]
        return response(returnvars, returnvals)

    if not os.path.isfile(inputfile):
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        logging.debug('message:\n%s' % message)
        returnvals = [
            workspace, outputfile, tokens, success, message, artifacts
        ]
        return response(returnvars, returnvals)

    if termname is None or len(termname) == 0:
        message = 'No term given. %s' % __version__
        logging.debug('message: %s' % message)
        returnvals = [
            workspace, outputfile, tokens, success, message, artifacts
        ]
        return response(returnvars, returnvals)

    # Default the report name to a UUID-based file so runs do not collide.
    if outputfile is None or len(outputfile) == 0:
        outputfile = '%s_token_report_%s.txt' % (termname, str(uuid.uuid1()))
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Count the tokens for the term in the input file and write the report.
    tokens = term_token_count_from_file(inputfile, termname, encoding=encoding)
    success = token_report(outputfile, tokens)

    # Only register the artifact if the report was actually written.
    if success:
        artifacts['%s_token_report_file' % termname] = outputfile

    returnvals = [workspace, outputfile, tokens, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
Ejemplo n.º 12
0
def vocab_counter(options):
    ''' Extract a dictionary of the distinct values of a given term in a text file along 
        with the number of times each occurs.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the output artifacts (optional)
        inputfile - full path to the input file (required)
        termname - the name of the term for which to find distinct values (required)
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
    returns a dictionary with information about the results
        workspace - path to a directory for the output artifacts
        extractedvalues - a list of distinct values of the term in the inputfile, with a
           count of the number of times it occurs
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
    '''
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Keys of the response dictionary, in order.
    returnvars = ['workspace', 'extractedvalues', 'success', 'message']

    ### Standard outputs ###
    success = False
    message = None

    ### Custom outputs ###
    extractedvalues = None

    ### Inputs, with defaults where the option is absent ###
    workspace = options.get('workspace', './')
    inputfile = options.get('inputfile')
    termname = options.get('termname')
    encoding = options.get('encoding')

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, extractedvalues, success, message]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    if not os.path.isfile(inputfile):
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, extractedvalues, success, message]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    if termname is None or len(termname) == 0:
        message = 'No term given. %s' % __version__
        returnvals = [workspace, extractedvalues, success, message]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    # Let extract_value_counts_from_file figure out the dialect of the input file.
    extractedvalues = extract_value_counts_from_file(inputfile, [termname],
                                                     encoding=encoding)

    success = True
    returnvals = [workspace, extractedvalues, success, message]
    logging.debug('Finishing %s' % __version__)
    # Build the response once; stash it in options for downstream consumers and
    # return the same object (the original constructed it twice).
    result = response(returnvars, returnvals)
    options['vocab_counter_response'] = result
    return result
Ejemplo n.º 13
0
def vocab_extractor(options):
    ''' Extract a list of the distinct values of a set of terms in a text file.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory to work in (optional)
        inputfile - full path to the input file (required)
        termlist - list of fields to extract from the input file (required)
        separator - string that separates the values in termlist (e.g., '|') 
            (optional; default None)
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
    returns a dictionary with information about the results
        workspace - path to a directory worked in
        extractedvalues - a list of distinct values of the term in the inputfile
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
    '''
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Keys of the response dictionary, in order. This actor creates no artifacts;
    # the unused artifacts dictionary from earlier revisions was removed.
    returnvars = ['workspace', 'extractedvalues', 'success', 'message']

    ### Standard outputs ###
    success = False
    message = None

    ### Custom outputs ###
    extractedvalues = None

    ### Inputs, with defaults where the option is absent ###
    workspace = options.get('workspace', './')
    inputfile = options.get('inputfile')
    termlist = options.get('termlist')
    separator = options.get('separator')
    encoding = options.get('encoding')

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, extractedvalues, success, message]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    if not os.path.isfile(inputfile):
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, extractedvalues, success, message]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    if termlist is None or len(termlist) == 0:
        message = 'No termlist given. %s' % __version__
        returnvals = [workspace, extractedvalues, success, message]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    # A composite termlist is split on the separator; otherwise it is one term.
    if separator is None or len(separator.strip()) == 0:
        theterms = [termlist]
    else:
        theterms = termlist.split(separator)

    # Extract the distinct values from the inputfile, applying the function to strip
    # white space and make lower case.
    # Let extract_values_from_file figure out the dialect and encoding of inputfile.
    extractedvalues = extract_values_from_file(inputfile,
                                               theterms,
                                               separator=separator,
                                               encoding=encoding,
                                               function=ustripstr)

    success = True
    returnvals = [workspace, extractedvalues, success, message]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def text_file_field_stripper(options):
    ''' Filter a text file into a new file based on matching a list of fields to keep.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - the directory in which the output will be written (optional)
        inputfile - full path to the input file (required)
        outputfile - name of the output file, without path (required)
        separator - string that separates the values in termlist (e.g., '|') 
            (optional; default None)
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
        termlist - list of fields to extract from the input file (required)
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output tsv file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Keys of the response dictionary, in order.
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Dictionary of persistent objects created by this actor.
    artifacts = {}

    ### Inputs, with defaults where the option is absent ###
    workspace = options.get('workspace', './')
    inputfile = options.get('inputfile')
    outputfile = options.get('outputfile')
    format = options.get('format', 'txt')
    termlist = options.get('termlist')
    separator = options.get('separator')
    encoding = options.get('encoding')

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if not os.path.isfile(inputfile):
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if termlist is None or len(termlist) == 0:
        message = 'No termlist given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    # Validate the output file before doing any expensive work on the input.
    if outputfile is None or len(outputfile) == 0:
        message = 'No output file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # The fields to keep: a composite termlist is split on the separator.
    # (Computed once here; the original computed this twice.)
    if separator is None or len(separator.strip()) == 0:
        theterms = [termlist]
    else:
        theterms = termlist.split(separator)

    # Determine the input file dialect.
    inputdialect = csv_file_dialect(inputfile)

    # Determine the input file encoding, if not supplied.
    if encoding is None or len(encoding.strip()) == 0:
        encoding = csv_file_encoding(inputfile)
        # csv_file_encoding() always returns an encoding if there is an input file.
        # No need to check.

    # Read the header of the input file and make a clean version of it.
    header = read_header(inputfile, dialect=inputdialect, encoding=encoding)
    cleaninputheader = clean_header(header)

    # Prepare the output dialect: 'txt' is tab-separated, anything else is csv.
    if format is None or format.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    # Make a clean version of the output header.
    cleanoutputheader = clean_header(theterms)

    # Create the outputfile and write the new header to it.
    write_header(outputfile, cleanoutputheader, outputdialect)

    # Check to see that the file was created.
    if not os.path.isfile(outputfile):
        message = 'Outputfile %s was not created. %s' % (outputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        return response(returnvars, returnvals)

    # Append rows containing only the kept fields to the output file.
    with open(outputfile, 'a') as outfile:
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8', 
            fieldnames=cleanoutputheader)

        # Iterate through all rows in the input file.
        for row in read_csv_row(inputfile, dialect=inputdialect, encoding=encoding, 
            header=True, fieldnames=cleaninputheader):
            newrow = extract_fields_from_row(row, cleanoutputheader)
            writer.writerow(newrow)

    success = True
    artifacts['stripped_file'] = outputfile

    # Prepare the response dictionary.
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
Ejemplo n.º 15
0
def text_file_aggregator(options):
    ''' Join the contents of files in a given path. Headers and encodings are not assumed 
        to be the same. Write a file containing the joined files with one header line.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional)
        inputpath - glob pattern matching the input file set
            (e.g., ./workspace/thefiles*.txt) (required)
        outputfile - name of the output file, without path (optional; a UUID-based
            name with an extension matching the format is generated if absent)
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output file
        aggregaterowcount - the number of rows in the aggregated file, not counting header
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Keys of the response dictionary, in order.
    returnvars = ['workspace', 'outputfile', 'aggregaterowcount', 'success', 'message', 
        'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    ### Custom outputs ###
    aggregaterowcount = None

    # Dictionary of persistent objects created by this actor.
    artifacts = {}

    ### Inputs, with defaults where the option is absent ###
    workspace = options.get('workspace', './')
    inputpath = options.get('inputpath')
    outputfile = options.get('outputfile')
    format = options.get('format')

    if inputpath is None or len(inputpath) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, aggregaterowcount, success, message,
            artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if format is None or len(format) == 0:
        format = 'txt'

    # 'txt' output is tab-separated; anything else is comma-separated.
    if format.lower() == 'txt':
        outputdialect = tsv_dialect()
    else:
        outputdialect = csv_dialect()

    if outputfile is None or len(outputfile) == 0:
        # Include the '.' before the format extension; the original concatenation
        # omitted it, producing names like 'aggregate_<uuid>txt'.
        outputfile = 'aggregate_%s.%s' % (str(uuid.uuid1()), format)

    # Construct the output file path in the workspace
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Create the composite header. Let composite_header determine the dialects and 
    # encodings of the files to aggregate.
    aggregateheader = composite_header(inputpath)
    aggregaterowcount = 0

    # Open a file to write the aggregated results in chosen format and utf-8.
    with open(outputfile, 'w') as outfile:
        writer = csv.DictWriter(outfile, dialect=outputdialect, encoding='utf-8', 
            fieldnames=aggregateheader, extrasaction='ignore')
        writer.writeheader()
        for file in glob.glob(inputpath):
            # Each input file may have its own dialect and encoding; use distinct
            # names so the writer's dialect is not shadowed.
            inputdialect = csv_file_dialect(file)
            inputencoding = csv_file_encoding(file)
            with open(file, 'rU') as inputfile:
                reader = csv.DictReader(utf8_data_encoder(inputfile, inputencoding), 
                    dialect=inputdialect, encoding=inputencoding)
                for line in reader:
                    try:
                        writer.writerow(line)
                        aggregaterowcount += 1
                    except:
                        message = 'failed to write line:\n%s\n' % line
                        message += 'to file %s. %s' % (file, __version__)
                        returnvals = [workspace, outputfile, aggregaterowcount, success, 
                            message, artifacts]
                        logging.debug('message:\n%s' % message)
                        return response(returnvars, returnvals)

    success = True
    artifacts['aggregated_file'] = outputfile
    # Make the header JSON-serializable for the response.
    if aggregateheader is not None:
        aggregateheader = list(aggregateheader)
    # Always build returnvals; the original built it only when aggregateheader was
    # not None, raising UnboundLocalError at the return below otherwise.
    returnvals = [workspace, outputfile, aggregaterowcount, success, message,
        artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
Ejemplo n.º 16
0
def parse_dynamic_properties(options):
    ''' Parse the values of the dynamic properties column of the input file and write
        each property as a separate column in a new csv file.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional)
        inputfile - full path to the input file (required)
        outputfile - name of the output file, without path (optional)
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the results
        artifacts - a dictionary of persistent objects created
    '''
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list of keys in the response dictionary
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    ### Custom outputs ###
    # Initialize any other output variables here so that the response knows about them

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    inputfile = None
    outputfile = None

    ### Required inputs ###
    # Catch only the missing-key case; a bare except would also hide real errors.
    try:
        workspace = options['workspace']
    except KeyError:
        workspace = './'

    try:
        inputfile = options['inputfile']
    except KeyError:
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if os.path.isfile(inputfile) == False:
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    try:
        outputfile = options['outputfile']
    except KeyError:
        pass

    if outputfile is None or len(outputfile) == 0:
        outputfile = 'parsed_props_' + str(uuid.uuid1()) + '.csv'

    # Construct the output file path in the workspace
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    ### Optional inputs ###
    # TODO: output format (csv or tsv)

    # Do the actual work now that the preparation is complete
    success = parse_props(inputfile, outputfile)

    # Add artifacts to the output dictionary if all went well
    if success == True:
        artifacts['parsed_props_output_file'] = outputfile

    # Prepare the response dictionary
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
Ejemplo n.º 17
0
def dataset_guid_setter(options):
    ''' Create an output file adding a field populated by global unique identifiers.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the output file (optional; default './')
        inputfile - path to the input file. Either full path or path within the workspace
            (required)
        outputfile - name of the output file, without path (optional)
        format - output file format (e.g., 'csv' or 'txt') (optional; default 'txt')
        key - field whose values are to be set to GUID values (required)
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
        guidtype - type of GUID to use to populate the key (optional; default 'uuid')
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output report file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    # Commented out for consistency with the other actors, which only log options.
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list of keys in the response dictionary
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    ### Custom outputs ###

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputfile = None
    outputfile = None
    format = 'txt'
    key = None
    guidtype = 'uuid'
    encoding = None

    ### Required inputs ###
    # Catch only the missing-key case; a bare except would also hide real errors.
    try:
        workspace = options['workspace']
    except KeyError:
        pass

    try:
        inputfile = options['inputfile']
    except KeyError:
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Look to see if the input file is at the absolute path or in the workspace.
    if os.path.isfile(inputfile) == False:
        if os.path.isfile(workspace + '/' + inputfile) == True:
            inputfile = workspace + '/' + inputfile
        else:
            message = 'Input file %s not found. %s' % (inputfile, __version__)
            # Bug fix: this failure path previously reported success=True.
            returnvals = [workspace, outputfile, success, message, artifacts]
            logging.debug('message:\n%s' % message)
            return response(returnvars, returnvals)

    try:
        key = options['key']
    except KeyError:
        pass

    if key is None or len(key) == 0:
        message = 'No key given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    ### Optional inputs ###
    try:
        format = options['format']
    except KeyError:
        pass

    try:
        encoding = options['encoding']
    except KeyError:
        pass

    try:
        guidtype = options['guidtype']
    except KeyError:
        pass

    try:
        outputfile = options['outputfile']
    except KeyError:
        pass

    if outputfile is None or len(outputfile.strip()) == 0:
        outputfile = '%s/%s_corrected_report_%s.%s' % \
          (workspace.rstrip('/'), slugify(key), str(uuid.uuid1()), format)
    else:
        outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Run the core operation
    success = uuid_term_appender(inputfile,
                                 outputfile,
                                 key,
                                 guidtype=guidtype,
                                 encoding=encoding,
                                 format=format)

    # Check to see if the outputfile was created
    if outputfile is not None and not os.path.isfile(outputfile):
        message = 'Failed to write results to output file %s. ' % outputfile
        message += '%s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Add artifacts to the output dictionary if all went well
    s = '%s_setter_report_file' % slugify(key)
    artifacts[s] = outputfile

    # Prepare the response dictionary
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def outcomestats(options):
    """Write outcome statistics from the input file to an xlsx file.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional)
        inputfile - full path to the input file (required)
        outputfile - name of the output file, without path (optional)
        configfile - full path to the stats configuration file (required)
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the results
        artifacts - a dictionary of persistent objects created
    """
    setup_actor_logging(options)

    logging.debug( 'Started %s' % __version__ )
    logging.debug( 'options: %s' % options )

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    # Make a dictionary for artifacts left behind
    artifacts = {}

    # outputs
    workspace = None
    outputfile = None
    success = False
    message = None

    # inputs
    # Catch only the missing-key case; a bare except would also hide real errors.
    try:
        workspace = options['workspace']
    except KeyError:
        workspace = None

    if workspace is None or len(workspace)==0:
        workspace = './'

    try:
        inputfile = options['inputfile']
    except KeyError:
        inputfile = None

    if inputfile is None or len(inputfile)==0:
        message = 'No input file given'
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if os.path.isfile(inputfile) == False:
        message = 'Input file not found'
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    try:
        outputfile = options['outputfile']
    except KeyError:
        outputfile = None
    if outputfile is None or len(outputfile)==0:
        outputfile='outcomeStats_'+str(uuid.uuid1())+'.xlsx'

    try:
        configfile = options['configfile']
    except KeyError:
        configfile = None
    if configfile is None or len(configfile)==0:
        message = 'No config file given'
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        # Bug fix: previously fell through and called stats_to_xlsx with a
        # missing config file; return the error response instead.
        return response(returnvars, returnvals)

    # Construct the output file path in the workspace
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    # Do the actual work now that the preparation is complete
    success = stats_to_xlsx(inputfile, outputfile, configfile)

    # Add artifacts to the output dictionary if all went well
    if success==True:
        artifacts['output_file'] = outputfile

    # Prepare the response dictionary
    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
    def do_stuffer(options):
        ''' Run do_stuff() with parameters gathered by name from the options dictionary.
        options - a dictionary of parameters
            loglevel - level at which to log (e.g., DEBUG) (optional)
            workspace - path to a directory for the outputfile (optional)
            inputfile - full path to the input file (required)
            outputfile - name of the output file, without path (optional)
            (every other argument of do_stuff must be present in options by name)
        returns a dictionary with information about the results
            workspace - actual path to the directory where the outputfile was written
            outputfile - actual full path to the output file
            success - True if process completed successfully, otherwise False
            message - an explanation of the results
            artifacts - a dictionary of persistent objects created
        raises KeyError if an argument of do_stuff is missing from options.
        '''
        print("path" + os.getcwd())
        setup_actor_logging(options)

        logging.debug('options: %s' % options)

        # Make a list of keys in the response dictionary
        returnvars = [
            'workspace', 'outputfile', 'success', 'message', 'artifacts'
        ]

        ### Standard outputs ###
        success = False
        message = None

        ### Custom outputs ###

        # Make a dictionary for artifacts left behind
        artifacts = {}

        ### Establish variables ###
        inputfile = None
        outputfile = None

        ### Required inputs ###
        # Catch only the missing-key case; a bare except would also hide real errors.
        try:
            workspace = options['workspace']
        except KeyError:
            workspace = './'

        try:
            inputfile = options['inputfile']
        except KeyError:
            pass

        if inputfile is None or len(inputfile) == 0:
            message = 'No input file given.'
            returnvals = [workspace, outputfile, success, message, artifacts]
            logging.debug('message:\n%s' % message)
            return response(returnvars, returnvals)

        if os.path.isfile(inputfile) == False:
            message = 'Input file %s not found.' % inputfile
            returnvals = [workspace, outputfile, success, message, artifacts]
            logging.debug('message:\n%s' % message)
            return response(returnvars, returnvals)

        try:
            outputfile = options['outputfile']
        except KeyError:
            pass

        if outputfile is None or len(outputfile) == 0:
            outputfile = 'dwca_' + str(uuid.uuid1()) + '.zip'

        # Construct the output file path in the workspace
        outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

        ### Optional inputs ###
        # Build the positional argument list for do_stuff from its own signature.
        # NOTE(review): inspect.getargspec is Python-2-era API (removed in modern
        # Python 3); confirm target interpreter before migrating to getfullargspec.
        params = []
        argspec = inspect.getargspec(do_stuff)

        for arg in argspec.args:
            if arg == 'inputfile':
                params.append(inputfile)
            elif arg == 'outputfile':
                params.append(outputfile)
            elif arg == 'workspace':
                params.append(workspace)
            else:
                if (arg in options):
                    params.append(options[arg])
                else:
                    # __name__ is portable; func_name was Python-2-only.
                    raise KeyError(
                        '%s not supplied as a parameter of %s in yaml config' %
                        (arg, do_stuff.__name__))

        # Do the actual work now that the preparation is complete
        success = do_stuff(*params)

        # Add artifacts to the output dictionary if all went well
        if success == True:
            artifacts['template_output_file'] = outputfile

        # Prepare the response dictionary
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('Finishing')
        return response(returnvars, returnvals)
Ejemplo n.º 20
0
def csv_fieldcount_checker(options):
    ''' Get the first row in a csv file where the number of fields is less than the number
        of fields in the header.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the output artifacts (optional)
        inputfile - full path to the input file (required)
    returns a dictionary with information about the results
        workspace - path to a directory for the output artifacts
        firstbadrowindex - the line number of the first row in the inputfile where the 
            field count does not match
        row - the content of the first line in the inputfile where the field count does
            not match.
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
    '''
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    logging.debug( 'Started %s' % __version__ )
    logging.debug( 'options: %s' % options )

    # Make a list for the response
    returnvars = ['workspace', 'firstbadrowindex', 'row', 'success', 'message']

    ### Standard outputs ###
    success = False
    message = None

    ### Custom outputs ###
    firstbadrowindex = 0
    row = None

    ### Establish variables ###
    workspace = './'
    inputfile = None

    ### Required inputs ###
    # Catch only the missing-key case; a bare except would also hide real errors.
    try:
        workspace = options['workspace']
    except KeyError:
        pass

    try:
        inputfile = options['inputfile']
    except KeyError:
        pass

    if inputfile is None or len(inputfile)==0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, firstbadrowindex, row, success, message]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if os.path.isfile(inputfile) == False:
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, firstbadrowindex, row, success, message]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # csv_field_checker returns None if every row matches the header field count.
    result = csv_field_checker(inputfile)

    if result is not None:
        firstbadrowindex = result[0]
        row = result[1]
        message = 'Row with incorrect number fields found. %s' % __version__
        returnvals = [workspace, firstbadrowindex, row, success, message]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    success = True
    returnvals = [workspace, firstbadrowindex, row, success, message]
    # Use debug level for consistency with the other actors in this module.
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
Ejemplo n.º 21
0
def term_counter(options):
    ''' Get a count of the rows that are populated for a given term.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the output artifacts (optional)
        inputfile - full path to the input file (required)
        termname - the name of the term for which to count rows (required)
        encoding - string signifying the encoding of the input file. If known, it speeds
            up processing a great deal. (optional; default None) (e.g., 'utf-8')
    returns a dictionary with information about the results
        workspace - path to a directory for the output artifacts
        rowcount - the number of rows in the inputfile that have a value for the term
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
    '''
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'rowcount', 'success', 'message']

    ### Standard outputs ###
    success = False
    message = None

    ### Custom outputs ###
    rowcount = None

    ### Establish variables ###
    workspace = './'
    inputfile = None
    termname = None
    encoding = None

    ### Required inputs ###
    # Catch only the missing-key case; a bare except would also hide real errors.
    try:
        workspace = options['workspace']
    except KeyError:
        pass

    try:
        inputfile = options['inputfile']
    except KeyError:
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [workspace, rowcount, success, message]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    if os.path.isfile(inputfile) == False:
        message = 'Input file %s not found. %s' % (inputfile, __version__)
        returnvals = [workspace, rowcount, success, message]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    try:
        termname = options['termname']
    except KeyError:
        pass

    if termname is None or len(termname) == 0:
        message = 'No term given. %s' % __version__
        returnvals = [workspace, rowcount, success, message]
        logging.debug('message: %s' % message)
        return response(returnvars, returnvals)

    ### Optional inputs ###
    try:
        encoding = options['encoding']
    except KeyError:
        pass

    rowcount = term_rowcount_from_file(inputfile, termname, encoding=encoding)

    success = True
    returnvals = [workspace, rowcount, success, message]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)
def dwca_core_to_tsv(options):
    ''' Save the core of the archive to a tsv file with DwC term names as headers.
    options - a dictionary of parameters
        loglevel - the level at which to log (e.g., DEBUG)
        workspace - path to a directory for the outputfile (optional)
        inputfile - full path to the input Darwin Core archive file (required)
        outputfile - file name of the tsv output file, no path (optional)
        archivetype - archive type ('standard' or 'gbif') (optional; default 'standard')
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output tsv file
        rowcount - the number of rows in the Darwin Core archive file
        success - True if process completed successfully, otherwise False
        message - an explanation of the reason if success=False
        artifacts - a dictionary of persistent objects created
    '''
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = [
        'workspace', 'outputfile', 'rowcount', 'success', 'message',
        'artifacts'
    ]

    ### Standard outputs ###
    success = False
    message = None

    ### Custom outputs ###
    rowcount = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    inputfile = None
    outputfile = None
    archivetype = 'standard'

    ### Required inputs ###
    # NOTE(review): bare excepts here swallow all errors, not just a missing
    # option key; kept as-is to preserve byte-identical behavior.
    try:
        workspace = options['workspace']
    except:
        pass

    try:
        inputfile = options['inputfile']
    except:
        pass

    if inputfile is None or len(inputfile) == 0:
        message = 'No input file given. %s' % __version__
        returnvals = [
            workspace, outputfile, rowcount, success, message, artifacts
        ]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    # Look to see if the input file is at the absolute path or in the workspace.
    if os.path.isfile(inputfile) == False:
        if os.path.isfile(workspace + '/' + inputfile) == True:
            inputfile = workspace + '/' + inputfile
        else:
            message = 'Input file %s not found. %s' % (inputfile, __version__)
            returnvals = [
                workspace, outputfile, rowcount, success, message, artifacts
            ]
            logging.debug('message:\n%s' % message)
            return response(returnvars, returnvals)

    try:
        outputfile = options['outputfile']
    except:
        pass

    # Default to a unique tsv output file name in the workspace.
    if outputfile is None or len(outputfile) == 0:
        outputfile = 'dwca_%s.txt' % str(uuid.uuid1())
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    try:
        archivetype = options['archivetype']
    except:
        pass

    # Note: The DwCAReader creates a temporary directory of its own and cleans it up
    # Make a reader based on whether the archive is standard or a GBIF download.
    dwcareader = None
    if archivetype is not None and archivetype.lower() == 'gbif':
        try:
            with GBIFResultsReader(inputfile) as dwcareader:
                rowcount = write_core_csv_file(dwcareader, outputfile)

        # Python-2-only except syntax; the whole module uses Python 2 idioms.
        except Exception, e:
            message = 'Error %s ' % e
            message += 'reading GBIF archive: %s. %s' % (inputfile,
                                                         __version__)
            returnvals = [
                workspace, outputfile, rowcount, success, message, artifacts
            ]
            logging.debug('message:\n%s' % message)
            return response(returnvars, returnvals)
    # NOTE(review): the 'standard' archive path and the final success return are
    # not visible here — this function appears truncated; confirm against the
    # complete original source before relying on non-GBIF behavior.
Ejemplo n.º 23
0
def downloader(options):
    ''' Download a file from a URL.
    options - a dictionary of parameters
        loglevel - level at which to log (e.g., DEBUG) (optional)
        workspace - path to a directory for the outputfile (optional)
        url - URL to the file to download (required)
        outputfile - name of the output file, without path (optional)
    returns a dictionary with information about the results
        workspace - actual path to the directory where the outputfile was written
        outputfile - actual full path to the output file
        success - True if process completed successfully, otherwise False
        message - an explanation of the results
        artifacts - a dictionary of persistent objects created
    '''
    #print '%s options: %s' % (__version__, options)

    setup_actor_logging(options)

    logging.debug('Started %s' % __version__)
    logging.debug('options: %s' % options)

    # Make a list for the response
    returnvars = ['workspace', 'outputfile', 'success', 'message', 'artifacts']

    ### Standard outputs ###
    success = False
    message = None

    # Make a dictionary for artifacts left behind
    artifacts = {}

    ### Establish variables ###
    workspace = './'
    url = None
    outputfile = None

    ### Required inputs ###
    # Catch only the missing-key case; a bare except would also hide real errors.
    try:
        workspace = options['workspace']
    except KeyError:
        pass

    try:
        url = options['url']
    except KeyError:
        pass

    # The url is documented as required; validate it like the other actors do
    # instead of passing None through to download_file.
    if url is None or len(url) == 0:
        message = 'No URL given. %s' % __version__
        returnvals = [workspace, outputfile, success, message, artifacts]
        logging.debug('message:\n%s' % message)
        return response(returnvars, returnvals)

    try:
        outputfile = options['outputfile']
    except KeyError:
        pass

    if outputfile is None or len(outputfile) == 0:
        outputfile = 'dwca_' + str(uuid.uuid1()) + '.zip'

    # Construct the output file path in the workspace
    outputfile = '%s/%s' % (workspace.rstrip('/'), outputfile)

    success = download_file(url, outputfile)

    # Add artifacts to the output dictionary if all went well
    if success == True:
        artifacts['downloaded_file'] = outputfile

    returnvals = [workspace, outputfile, success, message, artifacts]
    logging.debug('Finishing %s' % __version__)
    return response(returnvars, returnvals)