Example #1
0
def json_dump(json_data_object, output_filename):
    """Write "json_data_object" dict to JSON file.

    Args:
        json_data_object (dict): A data dict. It should have metaData and data keys.
        output_filename (str): The name to use for the output file.

    Returns:
        None. It just writes out the dict to a JSON file.

    """
    log.info('TIME: {}. Writing data to output JSON file.'.format(timenow()))

    check_data_object(json_data_object)

    # The "with" block closes the file on exit; no explicit close() is needed.
    with open(output_filename, 'w') as outfile:
        json.dump(json_data_object,
                  outfile,
                  sort_keys=True,
                  indent=2,
                  separators=(',', ': '))

    log.info('TIME: {}. Done writing data to output file.'.format(timenow()))

    return
def write_proforma_line(field_key, field_value, proforma, outfile):
    """Evaluate a specific key:value pair in a data object and write proforma line(s) to file.

    Args:
        field_key (str): The proforma field key.
        field_value: The value in the data object for a given key. Different object types are processed.
        proforma (dict): The dict representing a specific proforma type (keys match field prefixes: e.g., "G1a").
        outfile (_io.TextIOWrapper): An output file object for writing to.

    Returns:
        None.

    """
    log.debug('TIME: {}. Writing proforma line to file.'.format(timenow()))
    # Make sure data_object key is also in proforma key. Other keys are ignored.
    if field_key not in proforma.keys():
        log.warning('Ignoring {}: {}. Proforma key not recognized.'.format(
            field_key, field_value))
    # Action depends on type of value for a given key.
    # Value can be str, int, float, dict or monotypic list/tuple of any of these.
    # Simple case is when value is single str, int or float.
    elif type(field_value) in [str, int, float]:
        outfile.write(proforma[field_key])
        # Cast to str: int/float values would otherwise raise TypeError on
        # concatenation with '\n', despite the type check allowing them.
        outfile.write(str(field_value) + '\n')
    # If it's a list, evaluate each element in turn.
    elif type(field_value) in [list, tuple]:
        # Determine if list/tuple is monotypic.
        # Code doesn't handle a list of different element types.
        list_types = list(set(type(i) for i in field_value))
        if len(list_types) != 1:
            log.warning(
                'Ignoring {}: {}. Cannot handle empty or mixed lists.'.format(
                    field_key, field_value))
        # If it's a list of str/int/float, just write each element of the list out after the proforma leader.
        # e.g., list multiple synonyms on separate lines after writing G1b leader once.
        elif list_types[0] in [str, int, float]:
            log.debug('Handling {}: {} as list/tuple of {} objects.'.format(
                field_key, field_value, list_types[0]))
            outfile.write(proforma[field_key])
            for item in field_value:
                # Same str() cast as above for non-string elements.
                outfile.write(str(item) + '\n')
        # If it's a list of dicts, just write out each element of each dict in turn.
        # In this way, one can write multiple sets of proforma fields: e.g., LC99a/LC99b accession/db pairs.
        #     { "LC99a": [ { "LC99a": "acc1", "LC99b": "db1" }, { "LC99a": "acc2", "LC99b": "db2" },...] }
        elif list_types[0] == dict:
            log.debug(
                'Handling {}: {} as a list/tuple of dict objects.'.format(
                    field_key, field_value))
            for element_dict in field_value:
                for element_key, element_value in element_dict.items():
                    write_proforma_line(element_key, element_value, proforma,
                                        outfile)
        else:
            log.warning('Ignoring {}: {}. Cant handle value type: {}.'.format(
                field_key, field_value, list_types[0]))
    else:
        log.warning('Ignoring {}: {}. Cant handle value of type: {}.'.format(
            field_key, field_value, type(field_value)))

    return
Example #3
0
def check_data_object(data_object):
    """Check the structure of the input data dict object before writing to file.

    Args:
        data_object (dict): A dictionary that includes a "metaData" key, and a "data" key with a list type value.

    Returns:
        None. Simply a check.

    Warnings:
        Will raise a warning if the "data_object" dict has no "metaData" key.

    Raises:
        TypeError: If the "data_object" is not a dict.
        KeyError: If the "data_object" has no "data" key.
        TypeError: If the "data_object["data"]" object is not a list.
        ValueError: If the "data_object["data"]" list is empty.
        TypeError: If the objects in the "data_object["data"]" list are not of type dict.

    """
    log.info('TIME: {}. Checking format of input data.'.format(timenow()))

    # Check that the "data_object" is a dict.
    # Exceptions carry the message so callers see the reason, not a bare class.
    if not isinstance(data_object, dict):
        log.error('The "data_object" is not of the expected type "dict".')
        raise TypeError(
            'The "data_object" is not of the expected type "dict".')

    # Check that the "data_object" has a "metaData" key (warn only; not fatal).
    if 'metaData' not in data_object:
        log.warning(
            'The "data_object" is missing the expected "metaData" key.')

    # Check that the "data_object" has a "data" key.
    if 'data' not in data_object:
        log.error('The "data_object" is missing the expected "data" key.')
        raise KeyError(
            'The "data_object" is missing the expected "data" key.')

    # Check that the "data_object["data"]" value is a list.
    if not isinstance(data_object['data'], list):
        log.error(
            'The "data_object["data"]" object is not of the expected type "list".'
        )
        raise TypeError(
            'The "data_object["data"]" object is not of the expected type "list".'
        )

    # Check that the "data_object["data"]" list is not empty.
    if not data_object['data']:
        log.error('The "data_object["data"]" object is empty.')
        raise ValueError('The "data_object["data"]" object is empty.')

    # Check that the "data_object["data"]" list elements are themselves dicts.
    for datum in data_object['data']:
        if not isinstance(datum, dict):
            log.error(
                'Elements of the data_object["data"] list are not of expected type dict.'
            )
            raise TypeError(
                'Elements of the data_object["data"] list are not of expected type dict.'
            )
def detect_proforma_type(data_object, field_to_proforma_dict):
    """Detect the proforma type for a given data object (dict) by that object's keys.

    Args:
        data_object (dict): The data object which encodes values for proforma record, keyed by proforma field prefix.
        field_to_proforma_dict (dict): The field-prefix-to-proforma-type dict generated by get_distinct_proforma_field_prefixes().

    Returns:
        str: Will be "undetermined", or the name of the proforma type (extracted from SVN proforma template file name).
             If a name, it will match the keys in the master proforma dict generated by get_proforma_masters().
             The "undetermined" is returned if: 1) "data_object" is not a dict; 2) 0 or many types detected.

    Warnings:
        Will raise a warning if a data_object key does not match any known proforma field prefix.
    """
    log.debug('TIME: {}. Detecting proforma type for this object: {}'.format(
        timenow(), data_object))
    # First make sure it's a dictionary.
    # Return early as documented: previously the code fell through and
    # iterated the non-dict object, crashing or clobbering the result.
    if not isinstance(data_object, dict):
        log.warning('Data object is not of expected type dictionary.')
        return 'undetermined'
    # Scan the dictionary keys.
    proforma_types_detected = []
    for key in data_object:
        # Hoist the regex match so it is only computed once per key.
        key_match = re.match(r'[A-Z]{1,3}', key)
        if key_match:
            try:
                key_prefix = key_match.group(0)
                proforma_type = field_to_proforma_dict[key_prefix]
                log.debug('Key "{}" corresponds to proforma type: {}'.format(
                    key, proforma_type))
                proforma_types_detected.append(proforma_type)
            except KeyError:
                log.warning(
                    'Key "{}" looks like a proforma field prefix but is unknown.'
                    .format(key))
        else:
            log.debug('Ignoring key "{}"'.format(key))
    # Unique the types of proforma detected for data object.
    # We expect that a data_object dict corresponds to only one type of proforma; otherwise, code does nothing.
    proforma_types_detected = list(set(proforma_types_detected))
    cnt_types = len(proforma_types_detected)
    if cnt_types == 0:
        log.warning('Detected no proforma types for this object.')
        resolved_proforma_type = 'undetermined'
    elif cnt_types > 1:
        log.warning(
            'Cannot process data object representing many proforma types.')
        for i in proforma_types_detected:
            log.warning(i)
        resolved_proforma_type = 'undetermined'
    else:
        resolved_proforma_type = proforma_types_detected[0]
        log.debug(
            'This data object corresponds to this proforma type: {}'.format(
                resolved_proforma_type))

    return resolved_proforma_type
def get_distinct_proforma_field_prefixes(all_proforma_dict):
    """Build a mapping of field tag prefix to proforma type for proforma type detection.

    Args:
        all_proforma_dict (dict): The nested proforma dict (proforma_type > field prefix) generated by get_proforma_masters().

    Returns:
        dict: A dictionary of distinct field tag prefix to proforma type. Used for detect_proforma_type().
              e.g., {"G": "gene", "GA": "allele", ...}

    Warnings:
        Will raise a warning if a given proforma type has > 1 field prefix string: e.g., expect only "G" for GENE.

    """
    log.info(
        'TIME: {}. Identifying unique field prefix tag for proforma masters.'.
        format(timenow()))
    field_to_proforma_dict = {}
    # Inspect each proforma type in turn.
    for proforma_name, proforma_fields in all_proforma_dict.items():
        log.debug('Assessing proforma type: {}'.format(proforma_name))
        # Collect the field prefix (e.g., "G" from "G1a") of every field key.
        prefixes_found = []
        for field_tag in proforma_fields.keys():
            tag_match = re.match(r'[A-Z]{1,3}', field_tag)
            if tag_match:
                prefixes_found.append(tag_match.group(0))
            else:
                # Non-field keys (e.g., "header", "end") are skipped.
                log.debug('Ignoring this proforma key: {}'.format(field_tag))
        # Reduce to the distinct set of prefixes for this proforma.
        unique_prefixes = list(set(prefixes_found))
        # Exactly one prefix is expected per proforma type (e.g., "GA" for ALLELE).
        if len(unique_prefixes) == 1:
            prefix = unique_prefixes[0]
            log.debug(
                'The {} master has this distinct field tag prefix: {}'.format(
                    proforma_name, prefix))
        else:
            # Ambiguous prefixes are recorded under a placeholder key so the
            # problem is visible in the output dict and the log.
            prefix = 'undetermined for ' + proforma_name
            log.warning(
                'The {} master has {} distinct field tag prefixes.'.format(
                    proforma_name, len(unique_prefixes)))
        field_to_proforma_dict[prefix] = proforma_name
    # For debugging, print this dict to the log file.
    for prefix, proforma_name in field_to_proforma_dict.items():
        log.debug(
            'Field tag prefix {} corresponds to this proforma type: {}'.format(
                prefix, proforma_name))

    return field_to_proforma_dict
def write_proforma_stanza(data_object, proforma, outfile):
    """Write one complete proforma stanza for a given data_object.

    Args:
        data_object (dict): The data object to write.
        proforma (dict): The proforma type that corresponds to the data object.
        outfile (_io.TextIOWrapper): An output file object for writing to.

    Returns:
        None.

    """
    log.debug('TIME: {}. Writing proforma stanza to file.'.format(timenow()))
    # Header first, then one proforma line per field, then the stanza closer.
    outfile.write(proforma['header'] + '\n')
    for field_key, field_value in data_object.items():
        write_proforma_line(field_key, field_value, proforma, outfile)
    outfile.write(proforma['end'] + '\n')

    return
def write_proforma_record(data_list, output_filename, svn_username,
                          svn_password):
    """Write full proforma record for a list of dicts representing data objects.

    A wrapper of several smaller functions in the "write_proforma" module.
    Expects that the data_list will be a list of dicts.
    List order determines write order.
    Each dict is expected to represent only a single proforma type; otherwise skipped.

    Args:
        data_list (list): The list of data objects to write to file.
        output_filename (str): The filename of the output file.
        svn_username (str): The SVN username.
        svn_password (str): The SVN password.

    Returns:
        None.

    """
    log.info('TIME: {}. Writing proforma record to "{}".'.format(
        timenow(), output_filename))

    master_proforma_dict = get_proforma_masters(svn_username, svn_password)
    field_to_proforma_dict = get_distinct_proforma_field_prefixes(
        master_proforma_dict)
    # Use a context manager so the output file is always closed and flushed
    # (the previous open() was never closed).
    with open(output_filename, 'wt') as outfile:
        write_record_curation_header(svn_username, outfile)
        for datum in data_list:
            proforma_type = detect_proforma_type(datum, field_to_proforma_dict)
            if proforma_type != 'undetermined':
                data_type_specific_proforma_dict = master_proforma_dict[
                    proforma_type]
                write_proforma_stanza(datum, data_type_specific_proforma_dict,
                                      outfile)
        write_record_end(outfile)

    return
Example #8
0
def extract_date_from_filename(filename):
    """Extract YYMMDD/YYYYMMDD date from input string (usually a filename).

    Args:
        filename (str): The string to check for YYMMDD/YYYYMMDD date stamps (between periods or underscores).

    Returns:
        str: The YYMMDD/YYYYMMDD date stamp, or 'date_undetermined' if no matches found.

    Warnings:
        Raises a warning if one date stamp isn't found.

    """
    log.info('TIME: {}. Checking input file name for date stamp.'.format(
        timenow()))
    # Six or eight digits bounded by "." or "_" on both sides.
    date_regex = r'(?<=(\.|_))([0-9]{6}|[0-9]{8})(?=(\.|_))'
    date_match = re.search(date_regex, filename)
    if date_match:
        file_date = date_match.group(0)
    else:
        log.warning(
            'Could not find datestamp in filename: "{}"'.format(filename))
        file_date = 'date_undetermined'

    return file_date
Example #9
0
def extract_data_from_tsv(input_filename, **kwargs):
    """Extract data from a csv/tsv file and return it as a list of dicts.

    Detects delimiter and header info.

    Args:
        input_filename (str): The input filename.
        **kwargs: optional "delimiter" (needed if CSV Sniffer doesn't work)

    Returns:
        list: A list of dicts where dict keys match the header row (if present), or keys are generic "col0" labels.

    Raises:
        Will raise an exception if CSV Sniffer can't detect delimiter and the kwarg delimiter is not specified.

    """
    log.info('TIME: {}. Opening input file: {}.'.format(
        timenow(), input_filename))

    # Check filename, open the file, and detect delimiter.
    check_tsv_filename(input_filename)
    # Context manager ensures the input file is closed (previously leaked).
    with open(input_filename, 'r') as file_input:
        try:
            delimiter_detected = kwargs['delimiter']
            log.info('{}: Will use delimiter specified : "{}"'.format(
                input_filename, delimiter_detected))
        except KeyError:
            log.info('{}: No delimiter specified. Will try CSV Sniffer.'.format(
                input_filename))
            try:
                csv_sniffer = csv.Sniffer().sniff(file_input.read(1024))
                delimiter_detected = csv_sniffer.delimiter
                log.info('{}: CSV Sniffer detected this type of delimiter: "{}".'.
                         format(input_filename, delimiter_detected))
            except ValueError:
                log.error(
                    '{}: No delimiter specified and CSV Sniffer could not detect delimiter either.'
                    .format(input_filename))
                raise ValueError

        # Reset the file object iterator, open the file, scan for headers.
        file_input.seek(0)
        csv_input = csv.reader(file_input, delimiter=delimiter_detected)
        headers = find_headers(input_filename, csv_input, delimiter_detected)

        # Reset the file object iterator, then process into a dict.
        # Use of csv.DictReader was avoided because it does not handle zero or multiple leading comments very well.
        file_input.seek(0)
        # BUG FIX: format args were swapped (filename printed in TIME slot).
        log.info('TIME: {}. {}: Processing rows of input file.'.format(
            timenow(), input_filename))
        data_input = []
        row_cnt = 1
        for row in csv_input:
            log.debug('{}: Processing row {}:\n\t{}'.format(
                input_filename, row_cnt, row))
            if len(row) > 0:
                # Lines starting with "#" are comments/headers; skip them.
                if not row[0].startswith('#'):
                    if len(row) == len(headers):
                        row_data = {}
                        for i in range(0, len(headers)):
                            row_data[headers[i]] = row[i]
                        data_input.append(row_data)
                    else:
                        log.warning(
                            '{}: Line {} has {} part(s) instead of {} part(s).'.
                            format(input_filename, row_cnt, len(row),
                                   len(headers)))
            row_cnt += 1

    return data_input
def get_proforma_masters(svn_username, svn_password):
    """Get all proforma masters from SVN and return a dictionary keyed by proforma type, then by field.

    Args:
        svn_username (str): SVN username.
        svn_password (str): SVN password.

    Returns:
        dict: A nested dict with level one keys corresponding to proforma types (extracted from filenames).
              The level two keys correspond to proforma field prefixes: e.g., "G1a", "SF2b".
              e.g., {
                        'gene':
                            {
                                'G1h': '! G1h.  FlyBase gene ID (FBgn) *z :',
                                'G1b': '! G1a.  Gene symbol to use in FlyBase  *a :',
                                ...
                            },
                        'allele':
                            {
                                'GA1a': ...
                    }
    """
    log.info('TIME: {}. Retrieving proforma masters.'.format(timenow()))
    svn_url = 'https://svn.flybase.org/documents/curation/proformae/'
    r = svn.remote.RemoteClient(svn_url,
                                username=svn_username,
                                password=svn_password)
    local_svn_path = '/tmp/working/'
    r.checkout(local_svn_path)
    svn_contents = os.scandir(local_svn_path)
    proforma_master_dict = {}
    for item in svn_contents:
        if item.name.endswith('_master.pro'):
            pro_path = local_svn_path + item.name
            pro_name = item.name.split('_')[0]
            log.debug('Assessing the {} proforma now.'.format(pro_name))
            proforma_master_dict[pro_name] = {}
            line_counter = 0
            # Context manager closes each master file (previously leaked in the loop).
            with open(pro_path, 'rt') as pro_contents:
                for line in pro_contents:
                    # Look for header
                    if re.match(r'! [A-Z]{3}.*Version.*[0-9]{4}\n$', line):
                        proforma_master_dict[pro_name]['header'] = line.rstrip()
                        log.debug('Found "header": {}'.format(
                            proforma_master_dict[pro_name]['header']))
                        line_counter += 1
                    # Look for proforma field leader.
                    elif re.match(r'!\s{1,6}[A-Z]{1,3}[0-9]{1,2}[a-z]{0,1}\.',
                                  line):
                        left_line = re.match(
                            r'!\s{1,6}[A-Z]{1,3}[0-9]{1,2}[a-z]{0,1}\.',
                            line).group(0)
                        field_tag = re.search(
                            r'(?<=\s)[A-Z]{1,3}[0-9]{1,2}[a-z]{0,1}',
                            left_line).group(0)
                        field = line.split(':')[0] + ':'
                        proforma_master_dict[pro_name][field_tag] = field
                        log.debug('Found tag: {}, line: {}'.format(
                            field_tag, field))
                        line_counter += 1
                    # Look for end of proforma. If found, stop reading.
                    # e.g., EXPRESSION proforma has curation manual after proforma template.
                    elif re.match(r'!!', line):
                        proforma_master_dict[pro_name]['end'] = 80 * '!'
                        log.debug('Found end of proforma template.')
                        line_counter += 1
                        break
                    # Ignore other stuff in proforma.
                    else:
                        log.debug('Ignore this proforma master line: {}'.format(
                            line.rstrip()))
                        line_counter += 1
            # Get distinguishing field tag prefix for this proforma master (e.g., "GA" for ALLELE).
            field_tag_count = len(proforma_master_dict[pro_name].keys())
            log.debug('Found {} field tags in {} lines for {}'.format(
                field_tag_count, line_counter, pro_name))
    # Check the master proforma if in debug.
    log.debug('Checking master proforma.')
    for pro_type in proforma_master_dict.keys():
        for k, v in proforma_master_dict[pro_type].items():
            log.debug('{}; {}; {}'.format(pro_type, k, v))

    return proforma_master_dict
Example #11
0
def tsv_report_dump(tsv_data_object, output_filename, **kwargs):
    """Write "tsv_data_object" dict to TSV file.

    Args:
        tsv_data_object (dict): A data dict. It should have metaData and data keys.
        output_filename (str): The name to use for the output file.
        **kwargs (list): An optional list of headers under the "headers" key: e.g., headers=['a', 'b', 'c']

    Returns:
        None. It just writes out the dict to a TSV file.

    Raises:
        Raises an exception if no "headers" list is supplied AND first element of "data" list has no keys itself.

    """
    log.info('TIME: {}. Writing data to output TSV file.'.format(timenow()))

    check_data_object(tsv_data_object)

    # Can supply a list of headers under the keyword 'headers'.
    if 'headers' in kwargs:
        headers = kwargs['headers']
    # Otherwise, it just takes the dictionary keys from the first data object.
    else:
        try:
            headers = tsv_data_object['data'][0].keys()
        except AttributeError:
            log.error(
                'The first element of the tsv_data_object["data"] has no dict keys.'
            )
            # Bare re-raise preserves the original traceback.
            raise

    # Open the output file in a context manager so it is always closed and
    # flushed (previously the handle was never closed).
    with open(output_filename, 'w') as output_file:

        # Write metaData comment lines if the metaData key exists.
        try:
            output_file.write('## {}\n'.format(
                tsv_data_object['metaData']['title']))
            output_file.write('## Generated: {}\n'.format(
                tsv_data_object['metaData']['dateProduced']))
            output_file.write('## Using datasource: {}\n'.format(
                tsv_data_object['metaData']['database']))
            if 'note' in tsv_data_object['metaData'].keys():
                output_file.write('## Note: {}\n'.format(
                    tsv_data_object['metaData']['note']))
            output_file.write('##\n')
        except KeyError:
            log.debug('The "tsv_data_object" has no "metaData" key.')

        # Regardless of presence/absence of metaData, write out headers.
        output_file.write('#')
        csv_writer = csv.DictWriter(output_file,
                                    fieldnames=headers,
                                    delimiter='\t',
                                    extrasaction='ignore')
        csv_writer.writeheader()

        for data_item in tsv_data_object['data']:
            csv_writer.writerow(data_item)

        # Footer line; falls back to a generic message without metaData.
        try:
            output_file.write('## Finished {}.'.format(
                tsv_data_object['metaData']['title']))
        except KeyError:
            output_file.write('## Finished report.')

    log.info('TIME: {}. Done writing data to output file.'.format(timenow()))

    return