def main():
    import argparse
    usage = "%(prog)s -v"
    parser = argparse.ArgumentParser(description=desc, epilog=epilog,  # usage=usage,
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--version', action='version', version='1.0b')
    parser.add_argument("-v", "--verbose", default=False, action="store_true",
                        help="verbose")
    parser.add_argument('-t', '--taxids', nargs="+", type=int,
                        help="group taxid(s) [%(default)s]")
    parser.add_argument("--taxadb", default="/users/tg/lpryszcz/cluster/rapsi/taxonomy.db3",
                        help="taxonomy path [%(default)s]")
    o = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\n" % str(o))
    # init taxonomy
    taxa = Taxonomy(o.taxadb)
    # init metaphors connection
    cur = _getConnection()
    cur.execute("select taxid, name from species")
    species = {}
    for taxid, name in cur.fetchall():
        species[taxid] = (taxid, name)
    if o.verbose:
        sys.stderr.write("%s species in database\n" % len(species))
    # process taxa groups
    for taxid in o.taxids:
        # fetch proteins from given taxa
        taxid2proteomes(cur, species, taxa, taxid, o.verbose)
def main():
    import argparse
    usage = "%(prog)s -v"
    parser = argparse.ArgumentParser(description=desc, epilog=epilog,  # usage=usage,
                                     formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--version', action='version', version='1.0b')
    parser.add_argument("-v", "--verbose", default=False, action="store_true",
                        help="verbose")
    parser.add_argument('-d', '--db', default="metaphors_201405",
                        help="database name [%(default)s]")
    parser.add_argument('-t', '--taxids', nargs="+", type=int,
                        help="group taxid(s) [%(default)s]")
    parser.add_argument("--taxadb", default="/users/tg/lpryszcz/cluster/rapsi/taxonomy.db3",
                        help="taxonomy path [%(default)s]")
    o = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\n" % str(o))
    # init taxonomy
    taxa = Taxonomy(o.taxadb)
    # init metaphors connection
    m = dbClient.metaphors(o.db)
    if o.verbose:
        sys.stderr.write("%s species in %s database\n" % (len(m.species), o.db))
    # process taxa groups
    for taxid in o.taxids:
        # fetch proteins from given taxa
        taxid2proteomes(m, taxa, taxid, o.verbose)
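# Hypothetical invocation of the CLI above; the script name and paths are
# illustrative, not from the source (NCBI taxid 4751 = Fungi):
#   python taxid2proteomes.py -v -d metaphors_201405 -t 4751 --taxadb taxonomy.db3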
def read_taxonomy(tax_f):
    root = TNode('*', [])
    tax = Taxonomy(tax_f, root)
    with open(tax_f) as f:
        for line in f:
            node_name, ph_str = line.strip('\r\n').split('\t')
            node = TNode(node_name, ph_str.split(','))
            tax.add_node(node)
    return tax
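# A minimal usage sketch for read_taxonomy(), assuming the input format implied
# by the parser: one node per line, "<name>\t<phrase>,<phrase>,...". The file
# name and contents below are made up for illustration.
example = "mammals\tcat,dog,horse\nbirds\tsparrow,eagle\n"
with open("animals.tsv", "w") as f:  # hypothetical file
    f.write(example)
tax = read_taxonomy("animals.tsv")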
def load_from_json(cls, name):
    """Loads a taxonomy object from the given json file and name."""
    name_with_extension = str(name) + ".json"
    file_name = "taxonomies/" + name_with_extension
    with open(file_name) as file_object:
        graph_json = json.load(file_object)
    graph = json_graph.node_link_graph(graph_json)
    return Taxonomy(graph)
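# A sketch of the inverse operation, assuming the Taxonomy object keeps its
# networkx graph in a .graph attribute; the save_to_json name and that
# attribute are assumptions, not from the source:
def save_to_json(taxonomy, name):
    with open("taxonomies/" + str(name) + ".json", "w") as file_object:
        json.dump(json_graph.node_link_data(taxonomy.graph), file_object)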
def read(self, parsed_taxonomies):
    """Read in taxonomies for a given code table.

    params: parsed_taxonomies (dict{id: Taxonomy})
    """
    for key, taxonomy in parsed_taxonomies.iteritems():
        synonym_phrases = [Phrase(synonym) for synonym in taxonomy.synonyms]
        head_phrase = Phrase(taxonomy.head)
        self.taxonomies[key] = Taxonomy(key, head_phrase, synonym_phrases)
def taxonomies_to_str(self):
    """Convert taxonomies to strings, to be written to a file.

    returns: taxonomies_as_str (dict{id: Taxonomy}) with string fields
    """
    taxonomies_as_str = {}
    for key, taxonomy in self.taxonomies.iteritems():
        taxonomy_head = taxonomy.head.raw_form
        synonyms = [synonym.raw_form for synonym in taxonomy.synonyms]
        taxonomies_as_str[key] = Taxonomy(key, taxonomy_head, synonyms)
    return taxonomies_as_str
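# read() and taxonomies_to_str() round-trip between string and Phrase
# representations; a hedged sketch, assuming Phrase(s).raw_form == s:
#   bank.read(parsed)                  # strings -> Phrase objects
#   as_str = bank.taxonomies_to_str()  # Phrase objects -> strings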
def find_cites(contribs):
    tax = Taxonomy()
    successes = 0
    failures = 0
    citations = []
    for contrib in contribs:
        row = None
        if "Rationale" in contrib:
            row = contrib["Rationale"]
        if "Text" in contrib:
            row = contrib["Text"]
        if row is not None:
            cites = search_for_citations(tax, row)
            entry = {"ID": contrib["ID"], "Citations": cites}
            citations.append(entry)
            successes += 1
        else:
            failures += 1
    return citations
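# Illustrative input for find_cites(); the key names follow the lookups above,
# but the values are invented:
contribs = [{"ID": 1, "Text": "See RFC 2119 for requirement levels."},
            {"ID": 2, "Rationale": "Aligned with the ISO 8601 date format."}]
# find_cites(contribs) would return one entry per matched row, e.g.
# [{"ID": 1, "Citations": [...]}, {"ID": 2, "Citations": [...]}]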
def hits2taxa(input, out, db, verbose, limit=0):
    """Process fastq from input file.
    You may play with bufsize so the process runs without waiting.
    """
    # init taxonomy
    taxa = Taxonomy(db)
    # handle gzipped/bzip2 stream
    if input.name.endswith('.gz'):
        input = gzip.open(input.name)
    elif input.name.endswith('.bz2'):
        import bz2
        input = bz2.BZ2File(input.name)
    # get match generator
    if input == sys.stdin:
        line0 = input.readline()
        if line0.startswith('@'):
            mGenerator = get_matches_sam(input, verbose)
        else:
            mGenerator = get_matches_blast8(input, verbose)
    # get sam stream
    elif input.name.endswith(('.sam', '.sam.gz')):
        mGenerator = get_matches_sam(input, verbose)
    else:
        mGenerator = get_matches_blast8(input, verbose)
    # process reads
    if verbose:
        sys.stderr.write("[%s] Processing reads from %s ...\n" % (datetime.ctime(datetime.now()), input.name))
    # get taxa and genes
    taxid2reads = {}
    taxid2matches = {}
    k = 0
    for i, (rname, hits) in enumerate(mGenerator, 1):
        if limit and i > limit:
            break
        if not rname:
            continue
        # print info
        if verbose and i % 1e4 == 1:
            sys.stderr.write(" %s parsed. %.2f%s with taxa \r" % (i, k * 100.0 / i, '%'))
        # get taxa
        taxid, matches = get_taxa(hits, taxa, verbose)
        if not taxid:
            continue
        k += 1
        if taxid not in taxid2reads:
            taxid2reads[taxid] = 0
        # store read name & genes
        taxid2reads[taxid] += 1
    # report
    if not taxid2reads:
        sys.exit("No matches found!")
    # foreign reads
    freads = sum(reads for taxid, reads in taxid2reads.iteritems())
    header = "#name\ttaxid\treads\t%\n"
    out.write(header)
    out.write("%s\t%s\t%s\t%.2f\n" % ("unknown", "-", i - freads, 100.0 * (i - freads) / i))
    for taxid, reads in sorted(taxid2reads.iteritems(), key=lambda x: x[1], reverse=True)[:10]:
        out.write("%s\t%s\t%s\t%.2f\n" % (taxa[taxid][1], taxid, reads, 100.0 * reads / i))
    # print summary
    sys.stderr.write("[hits2taxa] %s entries processed!\n" % (i, ))
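# Hedged usage sketch for hits2taxa(): summarize a BLAST tabular (blast8) file
# to stdout over the first 100000 hits; the file and database paths are
# illustrative only.
#   hits2taxa(open('hits.blast8'), sys.stdout, 'taxonomy.db3', verbose=1, limit=100000)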
NOUNSET_BANK = NounSetBank(DATA_DIR + 'nounsets.yml')
NOUN_FORMS = {lang: NounFormBank(DATA_DIR + 'nouns_{}.yml'.format(lang))
              for lang in LANGUAGES}
PREPSET_BANK = PrepositionSetBank(DATA_DIR + 'prepsets.yml')
PRONSET_BANK = PronounSetBank(DATA_DIR + 'pronsets.yml')
PRONOUN_FORMS = {lang: PronounFormBank(DATA_DIR + 'prons_{}.yml'.format(lang))
                 for lang in LANGUAGES}
TAXONOMY = Taxonomy(DATA_DIR + 'taxonomy.yml')
#VERBSET_BANK = VerbSetBank(DATA_DIR + 'verbsets/')
VERBSET_BANK = VerbSetBank(DATA_DIR + 'verbsets.yml')
VERB_FORMS = {lang: VerbFormBank(DATA_DIR + 'verbs_{}.yml'.format(lang))
              for lang in LANGUAGES}

TEMPLATE_DIR = DATA_DIR + 'templates/'
ADJP_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'adjp_templates.yml')
ADVP_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'advp_templates.yml')
CLAUSE_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'clause_templates.yml')
#CUSTOM_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'custom_templates.yml')
CUSTOM_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'custom_postedited.yml')
NP_TEMPLATE_BANK = TemplateBank(TEMPLATE_DIR + 'np_templates.yml')
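# The bank definitions above assume module-level constants declared earlier in
# the file; the values here are illustrative only:
#   DATA_DIR = 'data/'
#   LANGUAGES = ['en', 'zh']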
#
# TODO: list-types is missing
#

INFO = """
This is the CLI for the Orange Button Core library.  Information is available at the following URL's:

Orange Button Overview: https://sunspec.org/orange-button-initiative/
Orange Button GitHub: https://github.com/SunSpecOrangeButton
Orange Button CLI GitHub: https://github.com/SunSpecOrangeButton/core
"""

DASHES = "---------------------------------------------------------------------------------------"

taxonomy = Taxonomy()
csv = False
json = False
xml = False


def info(args):
    print(INFO)


def convert(args):
    p = Parser(taxonomy)
    ff = None
    if json:
# instantiate Page and Taxonomy objects from the filtered information
page = tk.result_list_page
tax = tk.result_list_tax
# lists of target objects
aim_page = []
aim_tax = []
for item in page:
    # instantiate a page
    aim_page.append(Page(item=item))
for item in tax:
    # instantiate a taxonomy entry
    aim_tax.append(Taxonomy(tax_name=item['type'], item=item))

# ----------------------------------- XiaoAi ------------------------------------------
xiaoai = XiaoAi()
# add the tasks to XiaoAi's queue
for item in aim_page:
    xiaoai.add_task(item)
for item in aim_tax:
    xiaoai.add_task(item)
# start automated generation
def assign_taxonomy(self, key, output_dir, dna_region, names_file, ref_taxa):
    from taxonomy import Taxonomy, consensus
    results = {}
    try:
        self.runobj.run_status_file_h.write(
            json.dumps({'status': "STARTING_ASSIGN_TAXONOMY: " + key}) + "\n")
    except:
        pass
    # open gast_file to get results
    tagtax_terse_filename = os.path.join(output_dir, "tagtax_terse")
    tagtax_long_filename = os.path.join(output_dir, "tagtax_long")
    tagtax_terse_fh = open(tagtax_terse_filename, 'w')
    tagtax_long_fh = open(tagtax_long_filename, 'w')
    tagtax_long_fh.write("\t".join(["read_id", "taxonomy", "distance", "rank",
                                    "refssu_count", "vote", "minrank", "taxa_counts",
                                    "max_pcts", "na_pcts", "refhvr_ids"]) + "\n")
    gast_file = os.path.join(output_dir, "gast" + dna_region)
    if not os.path.exists(gast_file):
        logging.info("gast:assign_taxonomy: Could not find gast file: " + gast_file + ". Returning")
        return results
    for line in open(gast_file, 'r'):
        # must split on tab because the last field may be empty and must be maintained as blank
        data = line.strip().split("\t")
        if len(data) == 3:
            data.append("")
        # 0=id, 1=ref, 2=dist, 3=align, 4=frequency
        read_id = data[0]
        if read_id in results:
            results[read_id].append([data[1].split('|')[0], data[2], data[3], data[4]])
        else:
            results[read_id] = [[data[1].split('|')[0], data[2], data[3], data[4]]]
    for line in open(names_file, 'r'):
        data = line.strip().split("\t")
        dupes = data[1].split(",")
        read_id = data[0]
        taxObjects = []
        distance = 0
        frequency = 0
        refs_for = {}
        if read_id not in results:
            results[read_id] = ["Unknown", '1', "NA", '0', '0', "NA",
                                "0;0;0;0;0;0;0;0", "0;0;0;0;0;0;0;0",
                                "100;100;100;100;100;100;100;100"]
            refs_for[read_id] = ["NA"]
        else:
            # read_id has gast results: collect a taxonomy object per reference hit
            for i in range(0, len(results[read_id])):
                ref = results[read_id][i][0]
                if ref in ref_taxa:
                    for tax in ref_taxa[ref]:
                        for t in tax:
                            taxObjects.append(Taxonomy(t))
                if read_id in refs_for:
                    if results[read_id][i][0] not in refs_for[read_id]:
                        refs_for[read_id].append(results[read_id][i][0])
                else:
                    refs_for[read_id] = [results[read_id][i][0]]
                # should all be the same distance for the duplicates
                distance = results[read_id][i][1]
                frequency = results[read_id][i][3]
            # look up the consensus taxonomy for the array
            taxReturn = consensus(taxObjects, C.majority)
            # 0=taxObj, 1=winning vote, 2=minrank, 3=rankCounts, 4=maxPcts, 5=naPcts
            taxon = taxReturn[0].taxstring()
            rank = taxReturn[0].depth()
            if not taxon:
                taxon = "Unknown"
            # (taxonomy, distance, rank, refssu_count, vote, minrank, taxa_counts, max_pcts, na_pcts)
            results[read_id] = [taxon, str(distance), rank, str(len(taxObjects)),
                                str(taxReturn[1]), taxReturn[2], taxReturn[3],
                                taxReturn[4], taxReturn[5]]
        # example output lines:
        # read_id  taxonomy  distance  rank  refssu_count  vote  minrank  taxa_counts  max_pcts  na_pcts  refhvr_ids
        # D4ZHLFP1:25:B022DACXX:3:1101:12919:40734 1:N:0:TGACCA|frequency:162  Bacteria;Proteobacteria;Gammaproteobacteria  0.117  class  2  100  genus  1;1;1;2;2;2;0;0  100;100;100;50;50;50;0;0  0;0;0;0;0;0;100;100  v6_CI671
        # D4ZHLFP1:25:B022DACXX:3:1101:10432:76870 1:N:0:TGACCA|frequency:105  Bacteria;Proteobacteria;Gammaproteobacteria  0.017  class  1  100  class  1;1;1;0;0;0;0;0  100;100;100;0;0;0;0;0  0;0;0;100;100;100;100;100  v6_BW306
        # replace the hash with the final taxonomy results, for each copy of the sequence
        for d in dupes:
            d = d.strip()
            tagtax_long_fh.write(d + "\t" + "\t".join(results[read_id]) + "\t" +
                                 ', '.join(sorted(refs_for[read_id])) + "\n")
            tagtax_terse_fh.write(d + "\t" + results[read_id][0] + "\t" +
                                  results[read_id][2] + "\t" + results[read_id][3] + "\t" +
                                  ', '.join(sorted(refs_for[read_id])) + "\t" +
                                  results[read_id][1] + "\t" + str(frequency) + "\n")
    tagtax_terse_fh.close()
    tagtax_long_fh.close()
    return results
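# Illustrative call (the object, key, region, and paths are hypothetical); the
# returned dict maps each read id to [taxonomy, distance, rank, refssu_count,
# vote, minrank, taxa_counts, max_pcts, na_pcts]:
#   results = gast.assign_taxonomy('lane_1', 'out/', 'v6', 'out/names.txt', ref_taxa)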
def main():
    logger.warning("Start building taxonomy")
    # Load input: this includes reading the network, the text, and
    # a background corpus for contrastive analysis
    logger.info("Loading graph from file")
    A, node_info = utils.load_graph(args.data_dir, remove_citation=True,
                                    force_undirected=True)
    logger.info("Create HIN")
    G = HIN(A, node_info)
    logger.info("Load text")
    corpus = utils.load_documents(args.data_dir)
    motif_matchers = [Motif_KPV(), Motif_KPA(), Motif_KP(),
                      Motif_KPVY(), Motif_KPAA()]
    intermediate_dir = plib.Path(args.data_dir, "intermediate")
    if not intermediate_dir.is_dir():
        logger.warning(f"Creating intermediate dir {intermediate_dir}")
        intermediate_dir.mkdir(parents=False)
    # collect all phrases
    T = []  # terms / phrases
    for info in node_info.values():
        if info.node_type == "K":
            T.append(info.entity_id)
    D = corpus
    tf_bg, idf_bg = utils.get_tf_idf_from_file(
        plib.Path(args.data_dir, "background_documents.txt"), T)
    taxo = Taxonomy(D, T, G)
    builder = NetTaxo(motif_matchers,
                      tf_lift=args.tf_lift,
                      idf_lift=args.idf_lift,
                      damping=args.damping,
                      conf_motif=Motif_KPA().motif_name)
    # set background corpus for contrastive analysis
    builder.set_background(tf_bg, idf_bg)
    builder.build(taxo, args.levels)
    # save
    output_dir = plib.Path(args.output_dir, config.unique_id)
    if not output_dir.is_dir():
        output_dir.mkdir(parents=True)
    logger.info(f"Saving to {output_dir}")
    taxo.save(output_dir)
    logger.info("Saving complete")
    # generate output
    taxo.visualize(plib.Path(output_dir, "vis.pdf"))
    taxo.save_readable(output_dir)
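# Hypothetical command line for the driver above; the flag names mirror the
# args attributes used in main() (data_dir, output_dir, tf_lift, idf_lift,
# damping, levels), but the actual argument parser is not shown in this snippet:
#   python build_taxonomy.py --data_dir data/dblp --output_dir output \
#       --tf_lift 1.0 --idf_lift 1.0 --damping 0.85 --levels 2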