def json_dump(json_data_object, output_filename):
    """Write "json_data_object" dict to JSON file.

    Args:
        json_data_object (dict): A data dict. It should have "metaData" and "data" keys.
        output_filename (str): The name to use for the output file.

    Returns:
        None. It just writes out the dict to a JSON file.

    """
    log.info('TIME: {}. Writing data to output JSON file.'.format(timenow()))
    check_data_object(json_data_object)
    # The "with" block closes the file on exit; the original's explicit
    # outfile.close() inside the block was redundant and has been removed.
    with open(output_filename, 'w') as outfile:
        json.dump(json_data_object, outfile, sort_keys=True, indent=2,
                  separators=(',', ': '))
    log.info('TIME: {}. Done writing data to output file.'.format(timenow()))
    return
def write_proforma_line(field_key, field_value, proforma, outfile):
    """Evaluate a key:value pair in a data object and write proforma line(s) to file.

    Args:
        field_key (str): The proforma field key.
        field_value: The value in the data object for a given key. May be a
            str, int, float, dict, or a monotypic list/tuple of any of these.
        proforma (dict): The dict representing a specific proforma type
            (keys match field prefixes: e.g., "G1a").
        outfile (_io.TextIOWrapper): An output file object for writing to.

    Returns:
        None.

    """
    log.debug('TIME: {}. Writing proforma line to file.'.format(timenow()))
    # Make sure data_object key is also in proforma key. Other keys are ignored.
    if field_key not in proforma.keys():
        log.warning('Ignoring {}: {}. Proforma key not recognized.'.format(
            field_key, field_value))
    # Action depends on type of value for a given key.
    # Value can be str, int, float, dict or monotypic list/tuple of any of these.
    # Simple case is when value is single str, int or float.
    elif type(field_value) in [str, int, float]:
        outfile.write(proforma[field_key])
        # BUG FIX: cast to str so int/float values do not raise a TypeError
        # on concatenation with the newline.
        outfile.write(str(field_value) + '\n')
    # If it's a list, evaluate each element in turn.
    elif type(field_value) in [list, tuple]:
        # Determine if list/tuple is monotypic.
        # Code doesn't handle a list of different element types.
        list_types = list(set(type(i) for i in field_value))
        if len(list_types) != 1:
            log.warning(
                'Ignoring {}: {}. Cannot handle empty or mixed lists.'.format(
                    field_key, field_value))
        # If it's a list of str/int/float, just write each element of the list
        # out after the proforma leader: e.g., list multiple synonyms on
        # separate lines after writing the G1b leader once.
        elif list_types[0] in [str, int, float]:
            log.debug('Handling {}: {} as list/tuple of {} objects.'.format(
                field_key, field_value, list_types[0]))
            outfile.write(proforma[field_key])
            for item in field_value:
                # Cast to str for the same reason as the scalar case above.
                outfile.write(str(item) + '\n')
        # If it's a list of dicts, just write out each element of each dict in turn.
        # In this way, one can write multiple sets of proforma fields:
        # e.g., LC99a/LC99b accession/db pairs.
        # { "LC99a": [ { "LC99a": "acc1", "LC99b": "db1" },
        #              { "LC99a": "acc2", "LC99b": "db2" }, ... ] }
        elif list_types[0] == dict:
            log.debug(
                'Handling {}: {} as a list/tuple of dict objects.'.format(
                    field_key, field_value))
            for element_dict in field_value:
                for element_key, element_value in element_dict.items():
                    # Recurse so each nested key:value pair is written with
                    # its own proforma leader.
                    write_proforma_line(element_key, element_value, proforma,
                                        outfile)
        else:
            log.warning('Ignoring {}: {}. Cant handle value type: {}.'.format(
                field_key, field_value, list_types[0]))
    else:
        log.warning('Ignoring {}: {}. Cant handle value of type: {}.'.format(
            field_key, field_value, type(field_value)))
    return
def check_data_object(data_object):
    """Check the structure of the input data dict object before writing to file.

    Args:
        data_object (dict): A dictionary that includes a "metaData" key, and a
            "data" key with a list type value.

    Returns:
        None. Simply a check.

    Warnings:
        Will raise a warning if the "data_object" dict has no "metaData" key.

    Raises:
        TypeError: If the "data_object" is not a dict.
        KeyError: If the "data_object" has no "data" key.
        TypeError: If the "data_object["data"]" object is not a list.
        ValueError: If the "data_object["data"]" list is empty.
        TypeError: If objects in the "data_object["data"]" list are not dicts.

    """
    log.info('TIME: {}. Checking format of input data.'.format(timenow()))
    # Check that the "data_object" is a dict (isinstance is the idiomatic
    # type check and also admits dict subclasses).
    if not isinstance(data_object, dict):
        log.error('The "data_object" is not of the expected type "dict".')
        raise TypeError
    # Check that the "data_object" has a "metaData" key (warn only).
    if 'metaData' not in data_object:
        log.warning(
            'The "data_object" is missing the expected "metaData" key.')
    # Check that the "data_object" has a "data" key (required).
    if 'data' not in data_object:
        log.error('The "data_object" is missing the expected "data" key.')
        raise KeyError
    # Check that the "data_object["data"]" value is a list.
    if not isinstance(data_object['data'], list):
        log.error(
            'The "data_object["data"]" object is not of the expected type "list".'
        )
        raise TypeError
    # Check that the "data_object["data"]" list is not empty.
    if len(data_object['data']) == 0:
        log.error('The "data_object["data"]" object is empty.')
        raise ValueError
    # Check that the "data_object["data"]" list elements are themselves dicts.
    for datum in data_object['data']:
        if not isinstance(datum, dict):
            log.error(
                'Elements of the data_object["data"] list are not of expected type dict.'
            )
            raise TypeError
def detect_proforma_type(data_object, field_to_proforma_dict):
    """Detect the proforma type for a given data object (dict) by that object's keys.

    Args:
        data_object (dict): The data object which encodes values for a proforma
            record, keyed by proforma field prefix.
        field_to_proforma_dict (dict): The field-prefix-to-proforma-type dict
            generated by get_distinct_proforma_field_prefixes().

    Returns:
        str: Will be "undetermined", or the name of the proforma type
            (extracted from the SVN proforma template file name). If a name, it
            will match the keys in the master proforma dict generated by
            get_proforma_masters(). "undetermined" is returned if:
            1) "data_object" is not a dict; 2) zero or many types detected.

    Warnings:
        Will raise a warning if a data_object key does not match any known
        proforma field prefix.

    """
    log.debug('TIME: {}. Detecting proforma type for this object: {}'.format(
        timenow(), data_object))
    # First make sure it's a dictionary.
    if not isinstance(data_object, dict):
        log.warning('Data object is not of expected type dictionary.')
        # BUG FIX: return early. The original fell through and iterated the
        # non-dict object anyway, which could crash or mis-detect a type.
        return 'undetermined'
    # Scan the dictionary keys.
    proforma_types_detected = []
    for key in data_object:
        # Match the leading field prefix once and reuse the match object
        # (the original ran the same regex twice per key).
        prefix_match = re.match(r'[A-Z]{1,3}', key)
        if prefix_match:
            key_prefix = prefix_match.group(0)
            try:
                proforma_type = field_to_proforma_dict[key_prefix]
            except KeyError:
                log.warning(
                    'Key "{}" looks like a proforma field prefix but is unknown.'
                    .format(key))
            else:
                log.debug('Key "{}" corresponds to proforma type: {}'.format(
                    key, proforma_type))
                proforma_types_detected.append(proforma_type)
        else:
            log.debug('Ignoring key "{}"'.format(key))
    # Unique the types of proforma detected for data object.
    # We expect that a data_object dict corresponds to only one type of
    # proforma; otherwise, code does nothing.
    proforma_types_detected = list(set(proforma_types_detected))
    cnt_types = len(proforma_types_detected)
    if cnt_types == 0:
        log.warning('Detected no proforma types for this object.')
        resolved_proforma_type = 'undetermined'
    elif cnt_types > 1:
        log.warning(
            'Cannot process data object representing many proforma types.')
        for i in proforma_types_detected:
            log.warning(i)
        resolved_proforma_type = 'undetermined'
    else:
        resolved_proforma_type = proforma_types_detected[0]
        log.debug(
            'This data object corresponds to this proforma type: {}'.format(
                resolved_proforma_type))
    return resolved_proforma_type
def get_distinct_proforma_field_prefixes(all_proforma_dict):
    """Build a field tag prefix to proforma type dict for proforma type detection.

    Args:
        all_proforma_dict (dict): The nested proforma dict (proforma_type >
            field prefix) generated by get_proforma_masters().

    Returns:
        dict: A dictionary of distinct field tag prefix to proforma type, used
            by detect_proforma_type(): e.g., {"G": "gene", "GA": "allele", ...}

    Warnings:
        Will raise a warning if a given proforma type has > 1 field prefix
        string: e.g., expect only "G" for GENE.

    """
    log.info(
        'TIME: {}. Identifying unique field prefix tag for proforma masters.'.
        format(timenow()))
    field_to_proforma_dict = {}
    # Assess each type of proforma in the input "all_proforma_dict".
    for pro_type, pro_dict in all_proforma_dict.items():
        log.debug('Assessing proforma type: {}'.format(pro_type))
        # Collect every field prefix seen for this proforma type:
        # e.g., "G" for all GENE fields.
        prefixes_seen = []
        for field_name in pro_dict.keys():
            prefix_match = re.match(r'[A-Z]{1,3}', field_name)
            if prefix_match is None:
                # Non-field keys (e.g., "header", "end") carry no prefix.
                log.debug('Ignoring this proforma key: {}'.format(field_name))
            else:
                prefixes_seen.append(prefix_match.group(0))
        # Reduce to the unique set of field prefixes for this proforma.
        unique_prefixes = list(set(prefixes_seen))
        # We expect exactly one field prefix per proforma type:
        # e.g., "G" for GENE, "GA" for ALLELE.
        if len(unique_prefixes) == 1:
            field_prefix = unique_prefixes[0]
            log.debug(
                'The {} master has this distinct field tag prefix: {}'.format(
                    pro_type, field_prefix))
        else:
            # With multiple field prefixes we can't make the correspondence.
            # Record it in the dict anyway - would probably warrant a fix to
            # the code or the proforma template.
            field_prefix = 'undetermined for ' + pro_type
            log.warning(
                'The {} master has {} distinct field tag prefixes.'.format(
                    pro_type, len(unique_prefixes)))
        field_to_proforma_dict[field_prefix] = pro_type
    # For debugging, print this dict to the log file.
    for k, v in field_to_proforma_dict.items():
        log.debug(
            'Field tag prefix {} corresponds to this proforma type: {}'.format(
                k, v))
    return field_to_proforma_dict
def write_proforma_stanza(data_object, proforma, outfile):
    """Write a proforma stanza for a given data_object.

    Args:
        data_object (dict): The data object to write.
        proforma (dict): The proforma type that corresponds to the data object.
        outfile (_io.TextIOWrapper): An output file object for writing to.

    Returns:
        None.

    """
    log.debug('TIME: {}. Writing proforma stanza to file.'.format(timenow()))
    # Write out header.
    outfile.write(proforma['header'] + '\n')
    # Write out lines of proforma stanza.
    # Iterate items() directly instead of indexing each key separately.
    for field_key, field_value in data_object.items():
        write_proforma_line(field_key, field_value, proforma, outfile)
    # Write stanza closer.
    outfile.write(proforma['end'] + '\n')
    return
def write_proforma_record(data_list, output_filename, svn_username, svn_password):
    """Write full proforma record for a list of dicts representing data objects.

    A wrapper of several smaller functions in the "write_proforma" module.
    Expects that the data_list will be a list of dicts. List order determines
    write order. Each dict is expected to represent only a single proforma
    type; otherwise it is skipped.

    Args:
        data_list (list): The list of data objects to write to file.
        output_filename (str): The filename of the output file.
        svn_username (str): The SVN username.
        svn_password (str): The SVN password.

    Returns:
        None.

    """
    log.info('TIME: {}. Writing proforma record to "{}".'.format(
        timenow(), output_filename))
    # BUG FIX: use a context manager so the output file is closed even on
    # error; the original opened the file and never closed it.
    with open(output_filename, 'wt') as outfile:
        master_proforma_dict = get_proforma_masters(svn_username, svn_password)
        field_to_proforma_dict = get_distinct_proforma_field_prefixes(
            master_proforma_dict)
        write_record_curation_header(svn_username, outfile)
        for datum in data_list:
            proforma_type = detect_proforma_type(datum, field_to_proforma_dict)
            # Skip data objects whose proforma type could not be resolved.
            if proforma_type != 'undetermined':
                data_type_specific_proforma_dict = master_proforma_dict[
                    proforma_type]
                write_proforma_stanza(datum, data_type_specific_proforma_dict,
                                      outfile)
        write_record_end(outfile)
    return
def extract_date_from_filename(filename):
    """Extract a YYMMDD/YYYYMMDD date from an input string (usually a filename).

    Args:
        filename (str): The string to check for YYMMDD/YYYYMMDD date stamps
            (between periods or underscores).

    Returns:
        str: The YYMMDD/YYYYMMDD date stamp, or 'date_undetermined' if no
            match is found.

    Warnings:
        Raises a warning if one date stamp isn't found.

    """
    log.info('TIME: {}. Checking input file name for date stamp.'.format(
        timenow()))
    # A 6- or 8-digit run bounded by a period or underscore on each side.
    date_regex = r'(?<=(\.|_))([0-9]{6}|[0-9]{8})(?=(\.|_))'
    date_match = re.search(date_regex, filename)
    if date_match:
        return date_match.group(0)
    log.warning(
        'Could not find datestamp in filename: "{}"'.format(filename))
    return 'date_undetermined'
def extract_data_from_tsv(input_filename, **kwargs):
    """Extract data from a csv/tsv file and return it as a list of dicts.

    Detects delimiter and header info.

    Args:
        input_filename (str): The input filename.
        **kwargs: optional "delimiter" (needed if CSV Sniffer doesn't work).

    Returns:
        list: A list of dicts where dict keys match the header row (if
            present), or keys are generic "col0" labels.

    Raises:
        ValueError: If CSV Sniffer can't detect the delimiter and the kwarg
            delimiter is not specified.

    """
    log.info('TIME: {}. Opening input file: {}.'.format(
        timenow(), input_filename))
    # Check filename, open the file, and detect delimiter.
    check_tsv_filename(input_filename)
    # BUG FIX: use a context manager so the input file is always closed;
    # the original leaked the file handle.
    with open(input_filename, 'r') as file_input:
        try:
            delimiter_detected = kwargs['delimiter']
            log.info('{}: Will use delimiter specified : "{}"'.format(
                input_filename, delimiter_detected))
        except KeyError:
            log.info('{}: No delimiter specified. Will try CSV Sniffer.'.format(
                input_filename))
            try:
                csv_sniffer = csv.Sniffer().sniff(file_input.read(1024))
                delimiter_detected = csv_sniffer.delimiter
                log.info('{}: CSV Sniffer detected this type of delimiter: "{}".'.
                         format(input_filename, delimiter_detected))
            except ValueError:
                log.error(
                    '{}: No delimiter specified and CSV Sniffer could not detect delimiter either.'
                    .format(input_filename))
                raise ValueError
        # Reset the file object iterator, then scan for headers.
        file_input.seek(0)
        csv_input = csv.reader(file_input, delimiter=delimiter_detected)
        headers = find_headers(input_filename, csv_input, delimiter_detected)
        # Reset the file object iterator, then process into a dict.
        # Use of csv.DictReader was avoided because it does not handle zero or
        # multiple leading comments very well.
        file_input.seek(0)
        # BUG FIX: timenow() and input_filename were passed in the wrong
        # order, so the "TIME:" slot showed the filename.
        log.info('TIME: {}. {}: Processing rows of input file.'.format(
            timenow(), input_filename))
        data_input = []
        row_cnt = 1
        for row in csv_input:
            log.debug('{}: Processing row {}:\n\t{}'.format(
                input_filename, row_cnt, row))
            # Skip blank rows and "#" comment rows.
            if row and not row[0].startswith('#'):
                if len(row) == len(headers):
                    # Map each column value to its header label.
                    data_input.append(dict(zip(headers, row)))
                else:
                    log.warning(
                        '{}: Line {} has {} part(s) instead of {} part(s).'.
                        format(input_filename, row_cnt, len(row), len(headers)))
            row_cnt += 1
    return data_input
def get_proforma_masters(svn_username, svn_password):
    """Get all proforma masters from SVN, keyed by proforma type, then by field.

    Args:
        svn_username (str): SVN username.
        svn_password (str): SVN password.

    Returns:
        dict: A nested dict with level one keys corresponding to proforma
            types (extracted from filenames). The level two keys correspond to
            proforma field prefixes: e.g., "G1a", "SF2b". e.g.,
            {
                'gene': {
                    'G1h': '! G1h. FlyBase gene ID (FBgn) *z :',
                    'G1a': '! G1a. Gene symbol to use in FlyBase *a :',
                    ...
                },
                'allele': { 'GA1a': ... }
            }

    """
    log.info('TIME: {}. Retrieving proforma masters.'.format(timenow()))
    svn_url = 'https://svn.flybase.org/documents/curation/proformae/'
    r = svn.remote.RemoteClient(svn_url, username=svn_username,
                                password=svn_password)
    local_svn_path = '/tmp/working/'
    r.checkout(local_svn_path)
    svn_contents = os.scandir(local_svn_path)
    proforma_master_dict = {}
    for item in svn_contents:
        if item.name.endswith('_master.pro'):
            pro_path = local_svn_path + item.name
            # Proforma type is the filename stem before the first underscore.
            pro_name = item.name.split('_')[0]
            log.debug('Assessing the {} proforma now.'.format(pro_name))
            proforma_master_dict[pro_name] = {}
            line_counter = 0
            # BUG FIX: use a context manager so each master file is closed
            # after parsing; the original left one handle open per file.
            with open(pro_path, 'rt') as pro_contents:
                for line in pro_contents:
                    # Look for header.
                    if re.match(r'! [A-Z]{3}.*Version.*[0-9]{4}\n$', line):
                        proforma_master_dict[pro_name]['header'] = line.rstrip()
                        log.debug('Found "header": {}'.format(
                            proforma_master_dict[pro_name]['header']))
                        line_counter += 1
                    # Look for proforma field leader.
                    elif re.match(r'!\s{1,6}[A-Z]{1,3}[0-9]{1,2}[a-z]{0,1}\.',
                                  line):
                        left_line = re.match(
                            r'!\s{1,6}[A-Z]{1,3}[0-9]{1,2}[a-z]{0,1}\.',
                            line).group(0)
                        field_tag = re.search(
                            r'(?<=\s)[A-Z]{1,3}[0-9]{1,2}[a-z]{0,1}',
                            left_line).group(0)
                        # Keep everything up to the first colon as the leader.
                        field = line.split(':')[0] + ':'
                        proforma_master_dict[pro_name][field_tag] = field
                        log.debug('Found tag: {}, line: {}'.format(
                            field_tag, field))
                        line_counter += 1
                    # Look for end of proforma. If found, stop reading.
                    # e.g., EXPRESSION proforma has curation manual after the
                    # proforma template.
                    elif re.match(r'!!', line):
                        proforma_master_dict[pro_name]['end'] = 80 * '!'
                        log.debug('Found end of proforma template.')
                        line_counter += 1
                        break
                    # Ignore other stuff in proforma.
                    else:
                        log.debug('Ignore this proforma master line: {}'.format(
                            line.rstrip()))
                        line_counter += 1
            # Report the field tags found for this proforma master.
            field_tag_count = len(proforma_master_dict[pro_name].keys())
            log.debug('Found {} field tags in {} lines for {}'.format(
                field_tag_count, line_counter, pro_name))
    # Check the master proforma if in debug.
    log.debug('Checking master proforma.')
    for pro_type in proforma_master_dict.keys():
        for k, v in proforma_master_dict[pro_type].items():
            log.debug('{}; {}; {}'.format(pro_type, k, v))
    return proforma_master_dict
def tsv_report_dump(tsv_data_object, output_filename, **kwargs):
    """Write "tsv_data_object" dict to TSV file.

    Args:
        tsv_data_object (dict): A data dict. It should have "metaData" and
            "data" keys.
        output_filename (str): The name to use for the output file.
        **kwargs (list): An optional list of headers under the "headers" key:
            e.g., headers=['a', 'b', 'c']

    Returns:
        None. It just writes out the dict to a TSV file.

    Raises:
        AttributeError: If no "headers" list is supplied AND the first element
            of the "data" list has no keys itself.

    """
    log.info('TIME: {}. Writing data to output TSV file.'.format(timenow()))
    check_data_object(tsv_data_object)
    # Can supply a list of headers under the keyword 'headers'.
    if 'headers' in kwargs.keys():
        headers = kwargs['headers']
    # Otherwise, it just takes the dictionary keys from the first data object.
    else:
        try:
            headers = tsv_data_object['data'][0].keys()
        except AttributeError:
            log.error(
                'The first element of the tsv_data_object["data"] has no dict keys.'
            )
            raise AttributeError
    # BUG FIX: use a context manager so the output file is closed even on
    # error; the original opened the file and never closed it.
    with open(output_filename, 'w') as output_file:
        # Write "##" metadata comment lines if a "metaData" key exists.
        try:
            output_file.write('## {}\n'.format(
                tsv_data_object['metaData']['title']))
            output_file.write('## Generated: {}\n'.format(
                tsv_data_object['metaData']['dateProduced']))
            output_file.write('## Using datasource: {}\n'.format(
                tsv_data_object['metaData']['database']))
            if 'note' in tsv_data_object['metaData'].keys():
                output_file.write('## Note: {}\n'.format(
                    tsv_data_object['metaData']['note']))
            output_file.write('##\n')
        except KeyError:
            log.debug('The "tsv_data_object" has no "metaData" key.')
        # Regardless of presence/absence of metaData, write out headers.
        output_file.write('#')
        csv_writer = csv.DictWriter(output_file, fieldnames=headers,
                                    delimiter='\t', extrasaction='ignore')
        csv_writer.writeheader()
        for data_item in tsv_data_object['data']:
            csv_writer.writerow(data_item)
        # Close out the report with a footer line.
        try:
            output_file.write('## Finished {}.'.format(
                tsv_data_object['metaData']['title']))
        except KeyError:
            output_file.write('## Finished report.')
    log.info('TIME: {}. Done writing data to output file.'.format(timenow()))
    return