def map_iris(iri_dict, target_ontologies, distance, use_paxo, oxo_inner_url, query_size, verbose):
    quantified_url = "%s?size=%d" % (oxo_inner_url, query_size)
    data = {'ids': list(iri_dict.keys()), 'mappingTarget': target_ontologies}
    json_strings = []
    # data['distance'] = '1' if threshold >= 100 else '2' if threshold >= 75 else '3' if threshold >= 50 else ''
    data['distance'] = distance
    """ If boundary less than 50%, throw 'confidence too low' error: need to code! """
    oxo_hit_counter = 0
    while quantified_url is not None:
        reply = requests.post(quantified_url, data)
        oxo_hit_counter += 1
        json_content = reply.content
        json_string = json.loads(json_content)
        newsflash(json_string, verbose)
        json_strings.append(json_string)
        try:
            these_results = json_string["_embedded"]["searchResults"]
        except KeyError:
            newsflash("IRI map load aborted: OxO hit %d times, with %d query terms"
                      % (oxo_hit_counter, oxo_hit_counter * query_size))
            raise
        for this_result in these_results:
            source_label = this_result["label"]
            hits = this_result["mappingResponseList"]
            ontology_dict = {}
            for hit in hits:
                target_ontology = hit['targetPrefix']
                """ Create one key per target ontology, then append individual hits
                    to the associated list """
                ontology_dict.setdefault(target_ontology, []).append({
                    'curie': hit['curie'],
                    'target_label': hit['label'],
                    'distance': hit['distance']
                })
            # for okey in ontology_dict:
            #     ontology_dict[okey] = ', '.join(ontology_dict[okey])
            iri_dict[this_result["queryId"]] = {
                'source_label': source_label,
                'ontodict': ontology_dict
            }
        try:
            quantified_url = json_string["_links"]["next"]["href"]
        except KeyError:
            newsflash("Stopped: all good!")
            quantified_url = None
    newsflash("No. of iterative calls to OxO web API was %d" % len(json_strings))
    """ Passed-in dictionary object is mutated in situ: no need to return it """
    return None
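
# Illustrative usage sketch for map_iris (not part of the pipeline). The OxO
# endpoint URL and the EFO CURIE below are assumptions for demonstration only.
# Note that iri_dict is mutated in place, so results are read back out of the
# same dictionary afterwards.
def _demo_map_iris():
    iri_dict = {'EFO:0000400': None}  # hypothetical source term
    map_iris(iri_dict,
             target_ontologies=['MeSH'],  # target ontology prefixes
             distance=1,                  # direct mappings only
             use_paxo=False,              # Paxo switch currently has no effect
             oxo_inner_url='https://www.ebi.ac.uk/spot/oxo/api/search',  # assumed endpoint
             query_size=100,
             verbose=False)
    for source, mapping in iri_dict.items():
        if mapping:
            print(source, '->', mapping['ontodict'])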
def main():
    """ First of all, check configuration file """
    parser1 = argparse.ArgumentParser(description='Config filepath', add_help=False)
    parser1.add_argument('-g', '--config', help='filepath of config file (ini format)')
    namespace, extra = parser1.parse_known_args()
    config_file = namespace.config
    """ Second of all, get configuration info from configuration file (if specified) """
    ontoconfig = configparser.ConfigParser(os.environ)
    ontoconfig.optionxform = str
    column_list = None
    if config_file is not None:
        ontoconfig.read(config_file)
        columns_plus = ontoconfig.items('Columns')
        spurious_columns = ontoconfig.items('DEFAULT')
        real_columns = dict(set(columns_plus) - set(spurious_columns))
        column_list = list(real_columns.values())
    """ Third of all, parse the rest of the switches, possibly using defaults
        from the configuration file """
    parser2 = argparse.ArgumentParser(
        prog='Ontomapper',
        description="%s%s" % ('Allows the generation of a spreadsheet subsample. Specifically, ',
                              'samples rows at random and columns by parameterised request.'),
        parents=[parser1])
    cfg_sect_lookup = config_or_bust(ontoconfig, 'Params')
    parser2.add_argument('-i', '--input-file', default=cfg_sect_lookup('input_file', 'string'),
                         help='location of input spreadsheet: accepts filepath or URL')
    parser2.add_argument('-o', '--output', default=cfg_sect_lookup('output', 'string'),
                         help='output spreadsheet filepath **NO CURRENT EFFECT**')
    parser2.add_argument('-f', '--file-format', choices=['csv', 'tsv'],
                         default=cfg_sect_lookup('file_format', 'string'),
                         help='file format (both input and output)')
    parser2.add_argument('-s', '--sample-size', type=int,
                         default=cfg_sect_lookup('sample_size', 'int'),
                         help='number of records to return in randomly sampled spreadsheet')
    parser2.add_argument('-c', '--column-name', nargs='+', default=column_list,
                         help='space-separated list of column names required in sampled output')
    if len(sys.argv) < 2:
        parser2.print_help(sys.stderr)
        newsflash()
        sys.exit(0)
    args = parser2.parse_args()
    """ vars returns a dictionary from the Namespace object """
    arg_dict = vars(args)
    newsflash()
    newsflash("These are your opening parameters:")
    newsflash()
    # newsflash("Length of args dictionary is %d" % len(arg_dict))
    newsflash(pd.Series(arg_dict))
    newsflash()
    """ config is still in arg_dict at this point """
    arg_dict.pop('config')
    """ Don't check values of reserved options, which have no effect at the moment """
    active_arg_dict = arg_dict.copy()
    for inactive_arg in ['output']:
        active_arg_dict.pop(inactive_arg)
    if None in active_arg_dict.values():
        newsflash()
        newsflash("Please set values for the following parameters, on the command line or in the config file!")
        newsflash()
        for cfg_key in active_arg_dict:
            if active_arg_dict[cfg_key] is None:
                newsflash("\t%s" % cfg_key)
        newsflash()
        sys.exit(1)
    """ '**' unpacks a dictionary """
    sample_ss(**arg_dict)
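
# Illustrative config sketch: the main() above expects an ini file with a
# 'Params' section (read via config_or_bust) and a 'Columns' section whose
# values become the default column list. The option names under [Columns]
# and all values here are assumptions for demonstration:
#
#   [Params]
#   input_file = https://example.org/associations.tsv
#   output = sample.tsv
#   file_format = tsv
#   sample_size = 100
#
#   [Columns]
#   col1 = STUDY ACCESSION
#   col2 = MAPPED_TRAIT_URI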
def sample_ss(input_file, output, file_format, sample_size, column_name):
    try:
        separator = "\t" if file_format == 'tsv' else ","
        url_bool = re.compile(r'[a-zA-Z](\w|[-+.])*://.*')
        if url_bool.match(input_file):
            t0 = time.time()
            newsflash("Getting spreadsheet from URL ...")
            r = requests.get(input_file, allow_redirects=True)
            t1 = time.time()
            newsflash("It took %.2f seconds to retrieve the spreadsheet" % float(t1 - t0))
            filestuff = io.StringIO(r.content.decode('utf-8'))
        else:
            filestuff = input_file
        newsflash("Pandafying spreadsheet ...")
        source_df = pd.read_csv(filestuff, sep=separator, low_memory=False, keep_default_na=False)
        newsflash("Generating random sample of records ...")
        output_df = source_df.sample(n=sample_size).loc[:, column_name]
        print(output_df.to_csv(index=False, sep=separator))
    except requests.exceptions.InvalidSchema as is_error:
        """ pandas should have coped with distinguishing text file from URL already """
        newsflash('Error retrieving spreadsheet?')
        newsflash(is_error)
    return None
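
# Illustrative call sketch for sample_ss; the file and column names are
# hypothetical. Draws 50 random rows from a local TSV and prints the two
# requested columns to standard output as TSV.
def _demo_sample_ss():
    sample_ss(input_file='associations.tsv',  # assumed local file
              output=None,                    # reserved: no current effect
              file_format='tsv',
              sample_size=50,
              column_name=['STUDY ACCESSION', 'MAPPED_TRAIT_URI'])  # assumed headers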
def parse_ss(spreadsheet, separator, column_dict):
    ss_dict = {}
    iri_map = {}
    try:
        # newsflash("Spreadsheet location is %s" % spreadsheet)
        # newsflash("Column index is %d" % column_dict['index'])
        url_bool = re.compile(r'[a-zA-Z](\w|[-+.])*://.*')
        if url_bool.match(spreadsheet):
            t0 = time.time()
            newsflash("Getting spreadsheet from URL ...")
            r = requests.get(spreadsheet, allow_redirects=True)
            t1 = time.time()
            newsflash("It took %.2f seconds to retrieve the spreadsheet" % float(t1 - t0))
            filestuff = io.StringIO(r.content.decode('utf-8'))
        else:
            filestuff = spreadsheet
        newsflash("Pandafying spreadsheet ...")
        source_df = pd.read_csv(filestuff, sep=separator, low_memory=False, keep_default_na=False)
        if column_dict['index'] is None:
            column_dict['index'] = source_df.columns.get_loc(column_dict['name'])
        else:
            column_dict['name'] = source_df.columns[column_dict['index']]
        colno = column_dict['index']
        newsflash("Getting source terms ...")
        source_iris = source_df.iloc[:, colno]
        """ 1. Return empty list if empty string.
            2. Split by comma only, then remove spaces afterwards: more robust! """
        newsflash("Breaking source term strings into lists ...")
        # iri_lists = source_iris.apply(lambda x: [] if x == '' else x.split(", "))
        iri_lists = source_iris.apply(
            lambda x: [] if x == '' else list(map(lambda w: w.strip(), x.split(","))))
        """ Use nested lambdas to collect source IRIs from Series of lists """
        newsflash("Generate dictionary keyed on unique source terms ...")
        iri_lists.apply(lambda x, y: list(map(lambda z: y.update({z: None}), x)), args=[iri_map])
        newsflash("Dictionary generated!")
        ss_dict.update({'pandafued': source_df})
        ss_dict.update({'unique_iris': iri_map})
        newsflash("Returning from function 'parse_ss' ...")
    except requests.exceptions.InvalidSchema as is_error:
        """ pandas should have coped with distinguishing text file from URL already """
        newsflash('Error retrieving spreadsheet?')
        newsflash(is_error)
    return ss_dict
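
# The nested-lambda iri_lists.apply(...) call above is a terse way of recording
# every IRI that occurs in any cell as a key of iri_map. A plain-loop rendering
# of the same logic, for reference:
def _collect_unique_iris(iri_lists, iri_map):
    for iri_list in iri_lists:  # one list of IRIs per spreadsheet row
        for iri in iri_list:
            iri_map[iri] = None  # dict keys de-duplicate for free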
def main():
    """ First of all, check configuration file """
    parser1 = argparse.ArgumentParser(description='Config filepath', add_help=False)
    parser1.add_argument('-g', '--config', help='filepath of config file (ini format)')
    namespace, extra = parser1.parse_known_args()
    config_file = namespace.config
    """ Second of all, get configuration info from configuration file (if specified) """
    ontoconfig = configparser.ConfigParser(os.environ)
    ontoconfig.optionxform = str
    target_list = None
    if config_file is not None:
        ontoconfig.read(config_file)
        targets_plus = ontoconfig.items('Targets')
        spurious_targets = ontoconfig.items('DEFAULT')
        real_targets = dict(set(targets_plus) - set(spurious_targets))
        # newsflash(pd.Series(real_targets))
        target_list = list(real_targets.values())
    """ Third of all, parse the rest of the switches, possibly using defaults
        from the configuration file """
    parser2 = argparse.ArgumentParser(
        prog='Ontomapper',
        description="%s%s%s" % ('Takes source ontology (e.g. EFO) terms from an input spreadsheet, ',
                                'and generates an output spreadsheet with equivalent terms from ',
                                'another ontology or ontologies.'),
        parents=[parser1])
    cfg_sect_lookup = config_or_bust(ontoconfig, 'Params')
    parser2.add_argument('-i', '--input-file', default=cfg_sect_lookup('input_file', 'string'),
                         help='location of input spreadsheet: accepts filepath or URL')
    parser2.add_argument('-o', '--output', default=cfg_sect_lookup('output', 'string'),
                         help='output spreadsheet filepath **NO CURRENT EFFECT**')
    parser2.add_argument('-f', '--file-format', choices=['csv', 'tsv'],
                         default=cfg_sect_lookup('file_format', 'string'),
                         help='file format (both input and output)')
    parser2.add_argument('-l', '--layout',
                         choices=['in-situ', 'uni-column', 'multi-column', 'uni-row', 'multi-row'],
                         default=cfg_sect_lookup('layout', 'string'),
                         help="%s%s" % ('whether new ontology terms are required in multiple rows, ',
                                        'multiple columns, a single row, a single column, or the originating cell'))
    cmeg = parser2.add_mutually_exclusive_group(required=False)
    cmeg.add_argument('-x', '--column-index', type=int,
                      # default=cfg_sect_lookup('column_index', 'int'),
                      help='zero-based index of column containing source ontology terms')
    cmeg.add_argument('-c', '--column-name', default=cfg_sect_lookup('column_name', 'string'),
                      help='name or heading of column containing source ontology terms')
    kmeg = parser2.add_mutually_exclusive_group(required=False)
    kmeg.add_argument('-k', '--keep', dest='keep', action='store_true',
                      help='retain source ontology terms')
    kmeg.add_argument('-e', '--no-keep', dest='keep', action='store_false',
                      help='ditch source ontology terms')
    parser2.add_argument('-t', '--target', nargs='+', default=target_list,
                         help='space-separated list of target ontology prefixes')
    parser2.add_argument('-u', '--uri-format', choices=['long', 'short', 'curie'],
                         default=cfg_sect_lookup('uri_format', 'string'),
                         help='format of target ontology term identifiers **NO CURRENT EFFECT**')
    parser2.add_argument('-d', '--distance', type=int, default=cfg_sect_lookup('distance', 'int'),
                         choices=[1, 2, 3],
                         help='stepwise OxO distance (ontology to ontology)')
    parser2.add_argument('-r', '--oxo-url', default=cfg_sect_lookup('oxo_url', 'string'),
                         help='OxO (or Paxo) web service URL')
    pmeg = parser2.add_mutually_exclusive_group(required=False)
    pmeg.add_argument('-p', '--paxo', dest='paxo', action='store_true',
                      help='use Paxo rather than OxO **NO CURRENT EFFECT**')
    pmeg.add_argument('-z', '--no-paxo', dest='paxo', action='store_false',
                      help='do not use Paxo: use OxO')
    parser2.add_argument('-n', '--number', type=int,
                         default=cfg_sect_lookup('query_term_number', 'int'),
                         help='number of query terms to chunk, per HTTP request on the OxO web service')
    vmeg = parser2.add_mutually_exclusive_group(required=False)
    vmeg.add_argument('-v', '--verbose', dest='verbose', action='store_true',
                      help="%s%s" % ('send verbose progress reports to standard error: ',
                                     'not recommended for regular use'))
    vmeg.add_argument('-q', '--quiet', dest='verbose', action='store_false',
                      help='suppress verbose output')
    parser2.add_argument('-m', '--mapping-file',
                         # default=cfg_sect_lookup('mapping_file', 'string'),
                         help='optional extra output file with tab-separated list of source to target term mappings')
    # parser2.add_argument('-b', '--boundary', type=int, default=cfg_sect_lookup('boundary', 'int'),
    #                      help="%s%s" % ('minimum percentage confidence threshold of target ontology term matches ',
    #                                     '**NO CURRENT EFFECT: ENFORCE 100%% CONFIDENCE (OxO distance=1)**'))
    parser2.set_defaults(keep=cfg_sect_lookup('keep', 'boolean'),
                         paxo=cfg_sect_lookup('paxo', 'boolean'),
                         verbose=cfg_sect_lookup('verbose', 'boolean'))
    parser2.add_argument('--version', action='version', version='%(prog)s 1.0')
    if len(sys.argv) < 2:
        parser2.print_help(sys.stderr)
        newsflash()
        sys.exit(0)
    args = parser2.parse_args()
    """ vars returns a dictionary from the Namespace object """
    arg_dict = vars(args)
    newsflash()
    newsflash("These are your opening parameters:")
    newsflash()
    # newsflash("Length of args dictionary is %d" % len(arg_dict))
    newsflash(pd.Series(arg_dict))
    newsflash()
    """ config is still in arg_dict at this point """
    arg_dict.pop('config')
    """ Don't check values of reserved options, which have no effect at the moment;
        also, column_index may legitimately be null """
    active_arg_dict = arg_dict.copy()
    for inactive_arg in ['output', 'paxo', 'uri_format', 'column_index', 'mapping_file']:
        active_arg_dict.pop(inactive_arg)
    if None in active_arg_dict.values():
        newsflash()
        newsflash("Please set values for the following parameters, on the command line or in the config file!")
        newsflash()
        for cfg_key in active_arg_dict:
            if active_arg_dict[cfg_key] is None:
                newsflash("\t%s" % cfg_key)
        newsflash()
        sys.exit(1)
    newsflash(arg_dict, arg_dict['verbose'])
    """ '**' unpacks a dictionary """
    re_ontologise(**arg_dict)
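
# Illustrative invocation of this script; the filename 'ontomapper.py' and all
# argument values are assumptions, but the switches match the parser above:
#
#   python ontomapper.py -g config.ini -i associations.tsv -f tsv \
#       -c MAPPED_TRAIT_URI -t MeSH DOID -d 1 -l uni-column -k -q \
#       -m mappings.tsv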
def re_ontologise(input_file, output, layout, file_format, column_index, column_name,
                  keep, target, uri_format, distance, paxo, oxo_url, number, verbose,
                  mapping_file):
    target = sorted(target)
    field_separator = ',' if file_format == 'csv' else '\t'
    ss_column = {'index': column_index, 'name': column_name}
    ss_dict = parse_ss(input_file, field_separator, ss_column)
    column_index = ss_column['index']
    column_name = ss_column['name']
    iri_map = ss_dict['unique_iris']
    panda_original = ss_dict['pandafued']
    newsflash("Calling map_iris with url = '%s' ..." % oxo_url)
    map_iris(iri_map, target, distance, paxo, oxo_url, number, verbose)
    """ Print a tab-separated list of source and target terms, if the
        --mapping-file switch was specified """
    if mapping_file is not None:
        with open(mapping_file, 'w') as emf:
            for efo_iri in iri_map.keys():
                if iri_map[efo_iri] is None:
                    """ No OxO result came back for this source term """
                    continue
                for efo_map in iri_map[efo_iri]['ontodict'].values():
                    for efo_single in efo_map:
                        print("%s\t%s\t%s\t%s\t%d" % (efo_iri,
                                                      iri_map[efo_iri]['source_label'],
                                                      efo_single['curie'],
                                                      efo_single['target_label'],
                                                      efo_single['distance']),
                              file=emf)
    newsflash("Calling augment ...")
    ontologically_enriched = augment(panda_original, iri_map, layout, column_index,
                                     keep, uri_format)
    newsflash("No. of unique IRIs: %d" % len(iri_map))
    newsflash('', verbose)
    newsflash(ss_dict['unique_iris'], verbose)
    newsflash("Outputting ontologically enriched spreadsheet ...")
    print(ontologically_enriched.to_csv(index=False, sep=field_separator))
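
# Mapping-file format sketch: each line written under --mapping-file holds five
# tab-separated fields, namely source IRI, source label, target CURIE, target
# label and OxO distance. A hypothetical example line (<TAB> marks tabs):
#
#   http://www.ebi.ac.uk/efo/EFO_0000400 <TAB> diabetes mellitus <TAB> MeSH:D003920 <TAB> Diabetes Mellitus <TAB> 1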
def augment(panda_input, iri_map, table_layout, colno, keep_original, iri_format):
    colname = panda_input.columns[colno]
    prev_colname = panda_input.columns[colno - 1]
    """ Guard against colno pointing at the final column, where there is no next column """
    next_colname = panda_input.columns[colno + 1] if colno + 1 < len(panda_input.columns) else None
    out_columns = panda_input.columns
    newsflash()
    newsflash('These are the column headers of your pandas DataFrame:')
    newsflash()
    newsflash(out_columns)
    newsflash()
    in_tuple_counter = 0
    out_dict_list = []
    if table_layout in {'uni-column', 'multi-column'}:
        extra_col_dict_list = []
    tt0 = time.time()
    tt1 = tt0
    newsflash('Processing input records ...')
    for in_tuple in panda_input.itertuples():
        """ Convert back to a regular tuple, from the pandafied named tuple
            with its extra leading index """
        in_supple = tuple(in_tuple[1:])
        source_string = in_supple[colno]
        source_terms = list(map(lambda x: x.strip(), source_string.split(",")))
        target_groups = {}
        if table_layout == 'in-situ' and keep_original:
            """ Key '00source00' is a lazy, collational way of placing source terms
                at the top of the list, where we want them """
            target_groups.setdefault('00source00', source_terms)
        for source_term in source_terms:
            map_dict = iri_map.get(source_term)
            if map_dict:
                for m in map_dict['ontodict']:
                    """ target_groups is assigned one key per target ontology ---
                        NOT per source term in the source cell! """
                    for n in map_dict['ontodict'][m]:
                        target_groups.setdefault(m, []).append(n['curie'])
        for target_group in target_groups:
            target_groups[target_group] = ', '.join(target_groups[target_group])
        tg_series = pd.Series(target_groups)
        if table_layout in {'in-situ', 'uni-row', 'uni-column'}:
            target_string = ', '.join(tg_series.values)
        if table_layout in {'in-situ', 'uni-row', 'multi-row'}:
            out_dict = dict(zip(out_columns, in_supple))
            if table_layout == 'in-situ' or keep_original:
                if len(tg_series) > 0 or keep_original:
                    if table_layout == 'in-situ':
                        out_dict[colname] = target_string
                    out_dict_list.append(out_dict)
            if table_layout == 'uni-row' and len(tg_series) > 0:
                out_dict_extra = dict(out_dict)
                out_dict_extra[colname] = target_string
                out_dict_list.append(out_dict_extra)
            elif table_layout == 'multi-row':
                for hit in tg_series.values:
                    out_dict_iter = dict(out_dict)
                    out_dict_iter[colname] = hit
                    out_dict_list.append(out_dict_iter)
        elif table_layout in {'uni-column', 'multi-column'} and (len(tg_series) > 0 or keep_original):
            """ Now handle uni- and multi-column outputs """
            out_dict = dict(zip(out_columns, in_supple))
            out_dict_list.append(out_dict)
            extra_col_dict = {}
            if table_layout == 'multi-column':
                extra_col_dict = dict(tg_series)
            elif table_layout == 'uni-column':
                # extra_col_dict = dict({'all_ontologies': ', '.join(tg_series.values)})
                extra_col_dict = dict({'EQUIVALENT_TRAIT_URIS': target_string})
            extra_col_dict_list.append(extra_col_dict)
        in_tuple_counter += 1
        if in_tuple_counter % 4000 == 0:
            tt2 = time.time()
            newsflash("Processed %d thousand input records: took %.2f s (increment of %.2f s) ..."
                      % (int(in_tuple_counter / 1000), float(tt2 - tt0), float(tt2 - tt1)))
            tt1 = tt2
    tt2 = time.time()
    newsflash("Processed a total of %d input records, in %.2f seconds"
              % (in_tuple_counter, float(tt2 - tt0)))
    newsflash("No. of records in output spreadsheet is %d" % len(out_dict_list))
    newsflash()
    panda_output = pd.DataFrame(out_dict_list, columns=out_columns)
    if table_layout in {'uni-column', 'multi-column'}:
        newsflash("Adding new columns ...")
        # tg_series is out of scope here, so let pandas infer the columns
        extra_columns_df = pd.DataFrame(extra_col_dict_list)
        panda_output = pd.concat([
            None if colno == 0 and not keep_original
            else panda_output.loc[:, :colname if keep_original else prev_colname],
            extra_columns_df,
            None if colno == len(panda_input.columns) - 1
            else panda_output.loc[:, next_colname:]
        ], axis=1)
    return panda_output
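
# Illustrative augment() sketch with hypothetical data, showing the
# 'uni-column' layout: the source column is kept and a single
# EQUIVALENT_TRAIT_URIS column of comma-joined target CURIEs is appended.
# Relies on the bounds guard on next_colname above, since the source column
# here is the final one.
def _demo_augment():
    df = pd.DataFrame({'STUDY': ['GCST1', 'GCST2'],
                       'TRAIT_URI': ['EFO:0000400', '']})
    # Hand-built map of the shape produced by map_iris
    iri_map = {'EFO:0000400': {'source_label': 'diabetes mellitus',
                               'ontodict': {'MeSH': [{'curie': 'MeSH:D003920',
                                                      'target_label': 'Diabetes Mellitus',
                                                      'distance': 1}]}}}
    print(augment(df, iri_map, 'uni-column', 1, True, 'curie'))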