Ejemplo n.º 1
def main():
    """Print the lineage of every taxid given on the command line.

    The taxonomy tree is parsed from ``../taxonomy/nodes.dmp``. Each
    command-line argument is handed to ``ncbi_taxonomy.climb_tax_tree`` and
    one ``taxid<TAB>lineage`` line is written to stdout per argument.
    """
    taxtree_filename = "../taxonomy/nodes.dmp"
    # With get_levels=False the result of parse_taxtree is used as the tree
    # alone (no second return value is unpacked).
    get_levels = False
    taxtree = ncbi_taxonomy.parse_taxtree(taxtree_filename, get_levels)

    for taxid in sys.argv[1:]:
        lineage = ncbi_taxonomy.climb_tax_tree(taxid, taxtree)
        # A single parenthesised argument prints identically on Python 2
        # and Python 3, unlike the bare print statement.
        print(str(taxid) + "\t" + str(lineage))
Ejemplo n.º 2
def main():
    """Print the species-level result for every taxid listed in a file.

    The first command-line argument names a text file with one taxid per
    line. Each taxid is passed to ``ncbi_taxonomy.climb_tax_tree_to_level``
    with the stop level ``"species"`` and one ``taxid<TAB>result`` line is
    written to stdout.
    """
    taxtree_filename = "../taxonomy/nodes.dmp"
    # get_levels=True makes parse_taxtree return the tree and the level map.
    get_levels = True
    taxtree, taxlevel = ncbi_taxonomy.parse_taxtree(taxtree_filename, get_levels)

    species_file = sys.argv[1]
    with open(species_file, 'r') as f:
        for line in f:
            taxid = line.rstrip('\n')
            lineage = ncbi_taxonomy.climb_tax_tree_to_level(
                taxid, taxtree, taxlevel, "species")
            # A single parenthesised argument prints identically on Python 2
            # and Python 3, unlike the bare print statement.
            print(str(taxid) + "\t" + str(lineage))
Ejemplo n.º 3
def main():
    """Print the ``"Family"``-level result for each taxid on the command line.

    Each command-line argument is passed to
    ``ncbi_taxonomy.climb_tax_tree_to_level`` with the stop level
    ``"Family"`` and one ``taxid<TAB>result`` line is written to stdout.
    """
    taxtree_filename = "../taxonomy/nodes.dmp"
    # get_levels=True makes parse_taxtree return the tree and the level map.
    get_levels = True
    taxtree, taxlevel = ncbi_taxonomy.parse_taxtree(taxtree_filename, get_levels)
    stoplevel = "Family"
    # NOTE(review): this list was not closed in the copy under review and is
    # not referenced by any visible code. The closing bracket is restored so
    # the function parses; confirm whether further entries or the code that
    # consumed the list are missing.
    stopcriteria = [
        944644,  # marseille
        549779,  # mimi
        10486,  # irido
        10501,  # phycodna
        43682,  # asco
        10240,  # pox
        137992,  # asfar
    ]

    species = sys.argv[1:]
    for taxid in species:
        family = ncbi_taxonomy.climb_tax_tree_to_level(
            taxid, taxtree, taxlevel, stoplevel)
        # A single parenthesised argument prints identically on Python 2
        # and Python 3, unlike the bare print statement.
        print(str(taxid) + "\t" + str(family))
Ejemplo n.º 4
def main():
    """Assign the chosen virus proteins to clades and write the result.

    Steps, in order: parse the taxid mapping and the UniProt export, choose
    isolates, collect and cleave their proteins, convert host names to
    taxids, parse the taxonomy tree, then call ``generate_clades`` with the
    level ``"Family"`` and pass the result to ``print_clades`` for
    ``data_out/initial_clade_assignments``.
    """
    filename_uniprot = "data_in/uniprotkb-viruses-reviewed-date-keywords-chain.txt"
    taxfilename = "../taxonomy/viruses.categories.txt"
    taxfilename_edit = "../taxonomy/dengue-edit"
    ncbi_tax_file = "../taxonomy/names.dmp"
    taxtree_filename = "../taxonomy/nodes.dmp"
    simap_filename = "../SIMAP/virus_vs_all/homologs"  # not used below

    out_clades_file = "data_out/initial_clade_assignments"

    # TODO read in the object output by uniprot_to_payload
    # but for now, we do it this way

    # read the tax and uniprot data
    taxmap = parse_taxids(taxfilename,
                          taxfilename_edit)  # mapping isolates to species
    uniprot = parse_uniprot(filename_uniprot, taxmap)

    # choose the isolates that will be in STRING
    chosen = choose_isolates(uniprot)
    # then get all the proteins that correspond to these isolates
    chosen_proteins = get_chosen_proteins(uniprot, chosen)
    # cleave any polyproteins based on the chain entries in uniprot
    chosen_proteins = cleave_polyproteins(chosen_proteins)

    # convert host name to taxid
    ncbi_tax, taxid_to_name = ncbi_taxonomy.parse_ncbi_taxonomy_names(
        ncbi_tax_file, True)  # mapping name to taxids
    chosen_proteins = convert_host_name_to_taxids(chosen_proteins, ncbi_tax)

    # The call below was left unclosed (its second argument was missing);
    # get_levels is the flag prepared for it on the line above.
    get_levels = True
    taxtree, taxlevel = ncbi_taxonomy.parse_taxtree(taxtree_filename,
                                                    get_levels)

    # TODO all the above should be encapsulated

    clades = generate_clades(chosen_proteins, taxtree, taxlevel, "Family")
    print_clades(out_clades_file, clades)
Ejemplo n.º 5
def main():
    """Parse (or load from cache) the proteomes and write the derived files.

    If ``data_out/parsed-proteomes`` exists it is unpickled and used as the
    protein set. Otherwise the proteome directory is parsed, polyproteins
    are cleaved, the set is de-duplicated, too-small genomes are removed,
    and the result is pickled to that path for the next run. The protein
    set is then handed to the ``print_*`` / ``map_pogs`` writers.
    """
    pickle_file = "data_out/parsed-proteomes"

    proteomes_list = "data_in/proteomes-all.tab"
    mapfile_proteomes = "data_in/proteomes.entryids"
    dirname_proteomes = "data_in/proteomes"

    ncbi_tax_file = "../taxonomy/names.dmp"
    taxtree_filename = "../taxonomy/nodes.dmp"

    pogs_file = "data_in/pVOGs"

    out_isolates = "data_out/isolates_amb"
    out_isolates_all = "data_out/isolates_all"

    out_fasta_dir = "data_out/fastafiles"
    out_pfam_file = "data_out/pfam"
    out_go_file = "data_out/goterms"
    out_functions_file = "data_out/functions"
    out_pdb_file = "data_out/pdb"
    out_uniprot_file = "data_out/uniprot"
    out_host_file = "data_out/hosts"
    out_species_file = "data_out/string_v11.species.tsv"
    out_synonyms_file = "data_out/synonyms"
    out_best_synonyms_file = "data_out/synonyms_best"
    out_pogs_file = "data_out/pogs"

    if os.path.isfile(pickle_file):
        # Only unpickle files this script wrote itself; pickle is not safe
        # on untrusted data.
        with open(pickle_file, "rb") as cache:
            chosen_proteins = pickle.load(cache)
    else:
        # The else branch is restored: without it the cached result was
        # loaded and then thrown away, and chosen_proteins was undefined
        # whenever the cache file did not exist.

        # read proteome to entry mapping
        proteomes = parse_proteome_mapping(proteomes_list, mapfile_proteomes)

        # generate taxmap based on all entries in ncbi taxonomy so that we can map isolate -> species
        get_levels = True
        taxtree, taxlevel = ncbi_taxonomy.parse_taxtree(
            taxtree_filename, get_levels)
        unused, taxnames = ncbi_taxonomy.parse_ncbi_taxonomy_names(
            ncbi_tax_file, True)

        taxmap = {}
        for taxid in taxtree:
            species = ncbi_taxonomy.climb_tax_tree_to_level(
                taxid, taxtree, taxlevel, "species")
            if species == -1:
                # A bare `next` is a no-op expression in Python; `continue`
                # actually skips the entry.
                continue  # above species
            taxmap[taxid] = species

        # read the tax and uniprot data
        chosen_proteins = parse_uniprot_promeome_dir(
            proteomes, dirname_proteomes, taxmap, True)  # isolate_is_species
        sys.stderr.write("parsed uniprot xml\n")

        chosen_proteins = cleave_polyproteins(chosen_proteins)

        chosen_proteins = dedup(chosen_proteins)
        print("have " + str(len(chosen_proteins)) + " left")
        chosen_proteins = remove_too_small_genomes(chosen_proteins)

        with open(pickle_file, "wb") as cache:
            pickle.dump(chosen_proteins, cache)

    if False:
        # run this when getting names for textmining
        # takes a really long time because it blasts swiss-prot against (some of) tremble to find more synonyms
        # NOTE(review): taxmap is only built when the cache was missing, so
        # enabling this branch with an existing cache raises NameError.
        filename_uniprot_xml_all = "data_in/uniprot_viruses.xml"
        allspecies = parse_uniprot_xml(filename_uniprot_xml_all, taxmap)
        allspecies = cleave_polyproteins(allspecies)
    else:
        # NOTE(review): this `else:` is a reconstruction. In the copy under
        # review the assignment sat inside the disabled branch, leaving
        # allspecies undefined; confirm against the project history.
        allspecies = []

    # print simple derivatives of the uniprot data
    # NOTE(review): the line opening this call was missing; `print_fasta` is
    # inferred from the sibling print_* writers and the out_fasta_dir
    # argument. Confirm the function name.
    print_fasta(
        out_fasta_dir, chosen_proteins, False,
        True)  # False, True -> no name details, don't print polyproteins

    print_pfam(out_pfam_file, chosen_proteins)
    print_go(out_go_file, chosen_proteins)
    print_pdb(out_pdb_file, chosen_proteins)
    print_uniprot(out_uniprot_file, chosen_proteins)
    print_hosts(out_host_file, chosen_proteins)
    print_species(out_species_file, chosen_proteins)
    # NOTE(review): the final argument of this call was missing; allspecies
    # is presumed from the text-mining branch above. Confirm.
    print_synonyms(out_synonyms_file, out_best_synonyms_file, chosen_proteins,
                   allspecies)
    print_functions(out_functions_file, chosen_proteins)
    map_pogs(pogs_file, out_pogs_file, chosen_proteins)

    # print output that is needed for text mining
    #print_synonyms(out_synonyms_file, out_best_synonyms_file, chosen_proteins, chosen_proteins)

Ejemplo n.º 6
# Ordered (taxid, label) rules for hosts: the first taxid found in the
# lineage decides the label.
# NOTE(review): "insects" is tested after "animals"; if an insect lineage
# also contains 33208 the insects rule can never fire. The original order is
# kept here; confirm the intended precedence.
_HOST_DOMAIN_RULES = (
    (2, "bacteria"),
    (2157, "archaea"),
    (9606, "human"),
    (7742, "vertebrates"),
    (33208, "animals"),
    (3193, "plants"),
    (6960, "insects"),
    (4751, "fungi"),
    (2759, "eukaryota"),
)

# Ordered (taxid, label) rules for pathogens, same first-match semantics.
_PATHOGEN_DOMAIN_RULES = (
    (2, "bacteria"),
    (2157, "archaea"),
    (5794, "apicomplexa (protists)"),
    (5653, "kinetoplastida (protists)"),
    (554915, "amoebozoa"),
    (6340, "annelida (worms)"),
    (6157, "platyhelminthes (worms)"),
    (6231, "nematoda (worms)"),
    (10232, "acanthocephala (worms)"),
    (33340, "neoptera (fleas, louse)"),
    (4751, "fungi"),
    (7742, "vertebrates"),
    (2759, "eukaryota"),
    (10239, "viruses"),
)


def _first_matching_domain(lineage, rules):
    """Return the label of the first rule whose taxid is in *lineage*, else "none"."""
    for taxid, label in rules:
        if taxid in lineage:
            return label
    return "none"


def main():
    """Label the host and pathogen of each input pair with a coarse domain.

    The first command-line argument names a file of ``host<TAB>pathogen``
    lines. For each line both lineages are obtained from
    ``ncbi_taxonomy.climb_tax_tree`` and matched against the ordered rule
    tables above; ``"none"`` is used when no rule matches. One
    ``host<TAB>host_domain<TAB>pathogen<TAB>pathogen_domain`` line is
    written to stdout per input line.
    """
    taxtree_filename = "../taxonomy/nodes.dmp"
    get_levels = False
    taxtree = ncbi_taxonomy.parse_taxtree(taxtree_filename, get_levels)

    host_pathogen_list = sys.argv[1]
    with open(host_pathogen_list, 'r') as f:
        for line in f:
            host, pathogen = line.rstrip('\n').split('\t')

            # The original ladders had lost their final `else:`, so "none"
            # overwrote "eukaryota" and unmatched lineages stayed ''. The
            # helper restores "none" as the fallback only.
            host_lineage = ncbi_taxonomy.climb_tax_tree(host, taxtree)
            host_domain = _first_matching_domain(
                host_lineage, _HOST_DOMAIN_RULES)

            pathogen_lineage = ncbi_taxonomy.climb_tax_tree(pathogen, taxtree)
            pathogen_domain = _first_matching_domain(
                pathogen_lineage, _PATHOGEN_DOMAIN_RULES)

            # A single parenthesised argument prints identically on Python 2
            # and Python 3, unlike the bare print statement.
            print(host + "\t" + host_domain + "\t" + pathogen + "\t" + pathogen_domain)