def main():
    """Print the taxonomic lineage of every taxid given on the command line.

    One line is written to stdout per argument: the taxid, a tab, and the
    string form of whatever ``ncbi_taxonomy.climb_tax_tree`` returns for it.
    """
    nodes_path = "../taxonomy/nodes.dmp"
    # get_levels=False: the parser's result is used directly as the tree.
    tree = ncbi_taxonomy.parse_taxtree(nodes_path, False)

    for query in sys.argv[1:]:
        ancestors = ncbi_taxonomy.climb_tax_tree(query, tree)
        print("\t".join((str(query), str(ancestors))))
def main():
    """Print the species-level result for each taxid listed in a file.

    The path of the input file is the first command-line argument and holds
    one taxid per line.  For each line the taxid, a tab, and the string form
    of the ``climb_tax_tree_to_level(..., "species")`` result are printed.
    """
    nodes_path = "../taxonomy/nodes.dmp"
    # get_levels=True: the parser returns both the tree and the level table.
    tree, ranks = ncbi_taxonomy.parse_taxtree(nodes_path, True)

    with open(sys.argv[1], 'r') as handle:
        for raw in handle:
            taxid = raw.rstrip('\n')
            found = ncbi_taxonomy.climb_tax_tree_to_level(
                taxid, tree, ranks, "species")
            print("\t".join((str(taxid), str(found))))
def main():
    """Print the "Family"-level result for every taxid on the command line.

    One line is written to stdout per argument: the taxid, a tab, and the
    string form of the ``climb_tax_tree_to_level`` result for level "Family".
    """
    nodes_path = "../taxonomy/nodes.dmp"
    tree, ranks = ncbi_taxonomy.parse_taxtree(nodes_path, True)

    target_rank = "Family"

    # Taxids of selected virus groups.  This list is not read anywhere in
    # this function; it is kept as-is from the original implementation.
    stopcriteria = [
        944644,  # marseille
        549779,  # mimi
        10486,   # irido
        10501,   # phycodna
        43682,   # asco
        10240,   # pox
        137992,  # asfar
    ]

    for query in sys.argv[1:]:
        found = ncbi_taxonomy.climb_tax_tree_to_level(
            query, tree, ranks, target_rank)
        print("\t".join((str(query), str(found))))
def main():
    """Build the initial clade assignments and write them to disk.

    Pipeline (all inputs and the output are fixed relative paths):
      1. map isolates to species and parse the reviewed UniProt virus entries,
      2. choose the isolates to keep and collect their proteins,
      3. cleave polyproteins using the UniProt chain entries,
      4. convert host names to taxids,
      5. generate clades at level "Family" and print them to
         ``data_out/initial_clade_assignments``.
    """
    # --- input / output locations -------------------------------------
    uniprot_path = "data_in/uniprotkb-viruses-reviewed-date-keywords-chain.txt"
    categories_path = "../taxonomy/viruses.categories.txt"
    categories_edit_path = "../taxonomy/dengue-edit"
    names_path = "../taxonomy/names.dmp"
    nodes_path = "../taxonomy/nodes.dmp"
    simap_filename = "../SIMAP/virus_vs_all/homologs"  # not used below
    clades_out_path = "data_out/initial_clade_assignments"

    # TODO read in the object output by uniprot_to_payload
    # but for now, we do it this way

    # Isolate -> species mapping, then the UniProt records themselves.
    isolate_to_species = parse_taxids(categories_path, categories_edit_path)
    records = parse_uniprot(uniprot_path, isolate_to_species)

    # Pick the isolates that will be in STRING and gather their proteins.
    selected_isolates = choose_isolates(records)
    proteins = get_chosen_proteins(records, selected_isolates)

    # Cleave any polyproteins based on the chain entries in UniProt.
    proteins = cleave_polyproteins(proteins)

    # Host names -> taxids, via the name -> taxid mapping from names.dmp.
    name_to_taxid, taxid_to_name = ncbi_taxonomy.parse_ncbi_taxonomy_names(
        names_path, True)
    proteins = convert_host_name_to_taxids(proteins, name_to_taxid)

    tree, ranks = ncbi_taxonomy.parse_taxtree(nodes_path, True)

    # TODO all the above should be encapsulated
    family_clades = generate_clades(proteins, tree, ranks, "Family")
    print_clades(clades_out_path, family_clades)
def main():
    """Parse the UniProt proteomes and write all derived output files.

    If the cache file ``data_out/parsed-proteomes`` exists, the parsed
    proteins are loaded from it.  Otherwise the proteomes are parsed from
    ``data_in/proteomes``, polyproteins are cleaved, entries are
    de-duplicated, too-small genomes are removed, and the result is pickled
    to the cache file for the next run.

    Afterwards the FASTA, Pfam, GO, PDB, UniProt, host, species, synonym,
    function and pVOG outputs are written under ``data_out/``.
    """
    pickle_file = "data_out/parsed-proteomes"
    proteomes_list = "data_in/proteomes-all.tab"
    mapfile_proteomes = "data_in/proteomes.entryids"
    dirname_proteomes = "data_in/proteomes"
    ncbi_tax_file = "../taxonomy/names.dmp"
    taxtree_filename = "../taxonomy/nodes.dmp"
    pogs_file = "data_in/pVOGs"

    out_fasta_dir = "data_out/fastafiles"
    out_pfam_file = "data_out/pfam"
    out_go_file = "data_out/goterms"
    out_functions_file = "data_out/functions"
    out_pdb_file = "data_out/pdb"
    out_uniprot_file = "data_out/uniprot"
    out_host_file = "data_out/hosts"
    out_species_file = "data_out/string_v11.species.tsv"
    out_synonyms_file = "data_out/synonyms"
    out_best_synonyms_file = "data_out/synonyms_best"
    out_pogs_file = "data_out/pogs"

    if os.path.isfile(pickle_file):
        # NOTE: unpickling can execute arbitrary code; only reuse a cache
        # file that was written by this script.
        with open(pickle_file, "rb") as cache:
            chosen_proteins = pickle.load(cache)
    else:
        # read proteome to entry mapping
        proteomes = parse_proteome_mapping(proteomes_list, mapfile_proteomes)

        # generate taxmap based on all entries in ncbi taxonomy so that we
        # can map isolate -> species
        get_levels = True
        taxtree, taxlevel = ncbi_taxonomy.parse_taxtree(
            taxtree_filename, get_levels)
        # The parsed names are not used below; the call is kept unchanged.
        unused, taxnames = ncbi_taxonomy.parse_ncbi_taxonomy_names(
            ncbi_tax_file, True)

        taxmap = {}
        for taxid in taxtree:
            species = ncbi_taxonomy.climb_tax_tree_to_level(
                taxid, taxtree, taxlevel, "species")
            if species == -1:
                # Above species level: skip it.  The original used a bare
                # `next`, which is a no-op expression in Python, so these
                # taxids were stored with the value -1 instead of skipped.
                continue
            taxmap[taxid] = species

        # read the tax and uniprot data
        chosen_proteins = parse_uniprot_promeome_dir(
            proteomes, dirname_proteomes, taxmap, True)  # isolate_is_species
        sys.stderr.write("parsed uniprot xml\n")

        chosen_proteins = cleave_polyproteins(chosen_proteins)
        chosen_proteins = dedup(chosen_proteins)
        print("have " + str(len(chosen_proteins)) + " left")
        chosen_proteins = remove_too_small_genomes(chosen_proteins)

        with open(pickle_file, "wb") as cache:
            pickle.dump(chosen_proteins, cache)

    if False:
        # Run this when getting names for textmining.  It takes a really long
        # time because it blasts Swiss-Prot against (some of) TrEMBL to find
        # more synonyms.
        # NOTE: `taxmap` only exists when the cache above was NOT loaded, so
        # enabling this branch requires removing the cache file first.
        filename_uniprot_xml_all = "data_in/uniprot_viruses.xml"
        allspecies = parse_uniprot_xml(filename_uniprot_xml_all, taxmap)
        allspecies = cleave_polyproteins(allspecies)
    else:
        allspecies = []

    # print simple derivatives of the uniprot data
    # False, True -> no name details, don't print polyproteins
    print_fasta(out_fasta_dir, chosen_proteins, False, True)
    print_pfam(out_pfam_file, chosen_proteins)
    print_go(out_go_file, chosen_proteins)
    print_pdb(out_pdb_file, chosen_proteins)
    print_uniprot(out_uniprot_file, chosen_proteins)
    print_hosts(out_host_file, chosen_proteins)
    print_species(out_species_file, chosen_proteins)
    print_synonyms(out_synonyms_file, out_best_synonyms_file,
                   chosen_proteins, allspecies)
    print_functions(out_functions_file, chosen_proteins)
    map_pogs(pogs_file, out_pogs_file, chosen_proteins)

    sys.stderr.write("done\n")
def main():
    """Classify each host/pathogen pair in a file into coarse domains.

    The first command-line argument names a file with one tab-separated
    ``host<TAB>pathogen`` taxid pair per line.  For each pair the lineage of
    both taxids is looked up and the first matching group label is chosen.
    Output per line: host, host label, pathogen, pathogen label (tab
    separated); the label is "none" when no group taxid is in the lineage.
    """
    # Ordered (taxid, label) tables.  The FIRST taxid found in the lineage
    # decides the label, so the order of entries is significant.
    # NOTE(review): if 6960 sits below 33208 in the tree, the "insects"
    # entry can never be selected because "animals" is tested first --
    # confirm the intended order.
    host_groups = (
        (2, "bacteria"),
        (2157, "archaea"),
        (9606, "human"),
        (7742, "vertebrates"),
        (33208, "animals"),
        (3193, "plants"),
        (6960, "insects"),
        (4751, "fungi"),
        (2759, "eukaryota"),
    )
    pathogen_groups = (
        (2, "bacteria"),
        (2157, "archaea"),
        (5794, "apicomplexa (protists)"),
        (5653, "kinetoplastida (protists)"),
        (554915, "amoebozoa"),
        (6340, "annelida (worms)"),
        (6157, "platyhelminthes (worms)"),
        (6231, "nematoda (worms)"),
        (10232, "acanthocephala (worms)"),
        (33340, "neoptera (fleas, louse)"),
        (4751, "fungi"),
        (7742, "vertebrates"),
        (2759, "eukaryota"),
        (10239, "viruses"),
    )

    def label_for(lineage, groups):
        """Return the label of the first group whose taxid is in lineage."""
        for group_taxid, label in groups:
            if group_taxid in lineage:
                return label
        return "none"

    nodes_path = "../taxonomy/nodes.dmp"
    tree = ncbi_taxonomy.parse_taxtree(nodes_path, False)

    with open(sys.argv[1], 'r') as handle:
        for record in handle:
            host, pathogen = record.rstrip('\n').split('\t')

            host_lineage = ncbi_taxonomy.climb_tax_tree(host, tree)
            host_domain = label_for(host_lineage, host_groups)

            pathogen_lineage = ncbi_taxonomy.climb_tax_tree(pathogen, tree)
            pathogen_domain = label_for(pathogen_lineage, pathogen_groups)

            print("\t".join((host, host_domain, pathogen, pathogen_domain)))