コード例 #1
0
ファイル: Parser.py プロジェクト: moritzbuck/mOTUlizer
    def convert(self, infile, count=False):
        """Parse an eggNOG-mapper annotation table into per-genome eggNOGs.

        Each gene is reduced to a single eggNOG: among the comma-separated
        ``<og>@<taxid>`` entries of its "eggNOG OGs" column, the entry whose
        taxon has the shortest NCBI lineage (i.e. is closest to the root) is
        kept. Taxids missing from the taxonomy database are ranked last.

        If ``self.gene_id2genome`` is empty it is filled in here by dropping
        the last "_"-separated token of every gene id.

        Args:
            infile: Path to a tab-separated annotation file. One of its first
                15 lines must be a header starting with "#query_name" and
                containing an "eggNOG OGs" column.
            count: If true, return how often each eggNOG occurs per genome
                instead of a de-duplicated list.

        Returns:
            ``{genome: {eggNOG: occurrences}}`` if ``count`` is true,
            otherwise ``{genome: [eggNOG, ...]}`` with every eggNOG once.

        Raises:
            SystemExit: With a non-zero status (message on stderr) when no
                header line is found.
        """
        from collections import Counter

        with open(infile) as handle:
            header = None
            # The column header has to show up within the first 15 lines.
            for _ in range(15):
                fields = handle.readline().rstrip().split("\t")
                if fields[0] == "#query_name":
                    header = fields
                    break
            if not header:
                # A string argument makes sys.exit print to stderr and exit
                # with status 1, so calling scripts can detect the failure.
                sys.exit(
                    "\nYou sure this file is good? Like, is it the '.emapper.annotations' you got from running eggnoggmapper?\n"
                )

            idx = [i for i, v in enumerate(header) if v == "eggNOG OGs"][0]
            print("Loading eggNOGs from file.", file=sys.stderr)
            gene_id2eggs = {}
            for line in tqdm(handle.readlines()):
                if line.startswith("#"):
                    continue
                # Strip the line ending so it cannot leak into the value when
                # "eggNOG OGs" happens to be the last column.
                fields = line.rstrip("\r\n").split("\t")
                gene_id2eggs[fields[0]] = fields[idx]

        # Open the taxonomy database only once the input file was accepted.
        from ete3 import NCBITaxa
        tax_db = NCBITaxa()

        print("Parsing taxonomies, and simplifying to deepest eggNOG.",
              file=sys.stderr)
        taxos = {
            egg.split("@")[1]
            for eggs in gene_id2eggs.values() for egg in eggs.split(",")
        }
        # Lineage length per taxid: the shorter, the closer to the root.
        tax2level = {
            taxid: len(lineage)
            for taxid, lineage in tax_db.get_lineage_translator(
                list(taxos)).items()
        }

        def lowest_level(og_and_taxid):
            # Taxids unknown to the database get a large sentinel level.
            return tax2level.get(int(og_and_taxid[1]), 1000)

        gene_id2deepest_egg = {
            gene: min([egg.split('@') for egg in eggs.split(",")],
                      key=lowest_level)[0]
            for gene, eggs in tqdm(gene_id2eggs.items())
        }

        print("Stratify it to genome", file=sys.stderr)
        if not self.gene_id2genome:
            self.gene_id2genome = {
                gene: "_".join(gene.split("_")[:-1])
                for gene in gene_id2deepest_egg
            }

        genome2nog = {
            genome: []
            for genome in set(self.gene_id2genome.values())
        }
        for gene, egg in gene_id2deepest_egg.items():
            genome2nog[self.gene_id2genome[gene]].append(egg)

        if count:
            # Counter tallies in one pass instead of list.count per value.
            return {
                genome: dict(Counter(eggs))
                for genome, eggs in genome2nog.items()
            }
        return {genome: list(set(eggs)) for genome, eggs in genome2nog.items()}
コード例 #2
0
def findclade(namelist, ranks='family|genus'):
    """Yield, for every taxon name, the names of its clades at given ranks.

    Args:
        namelist: Taxon names to look up in the NCBI taxonomy. The first
            taxid returned for each name is used.
        ranks: '|'-separated rank names (default: family and genus).

    Yields:
        ``(name, clade_names)`` pairs in the order of ``namelist``;
        ``clade_names`` has one entry per requested rank, with '' where the
        lineage of the name has no node of that rank.
    """
    ncbi = NCBITaxa()
    name2taxid = ncbi.get_name_translator(namelist)
    first_taxids = [taxids[0] for taxids in name2taxid.values()]
    lineages = ncbi.get_lineage_translator(first_taxids)

    cladetaxids = []
    for name in namelist:
        lineage = lineages[name2taxid[name][0]]
        # Invert taxid -> rank so a rank name can be looked up directly.
        rank2clade = {}
        for taxid, rk in ncbi.get_rank(lineage).items():
            rank2clade[rk] = taxid
        # 0 marks a rank that is absent from this lineage.
        wanted = []
        for rank in ranks.split('|'):
            wanted.append(rank2clade.get(rank, 0))
        cladetaxids.append(wanted)

    # Translate all collected clade taxids to names with a single query.
    taxid2clade = ncbi.get_taxid_translator(
        chain.from_iterable(cladetaxids))

    for name, taxidlist in zip(namelist, cladetaxids):
        clade_names = [taxid2clade.get(t, '') for t in taxidlist]
        yield name, clade_names
コード例 #3
0
    # Collect every file ending in ".emapper" found below the directory that
    # contains `genus` (symlinked directories are followed), skipping file
    # names that contain a double underscore.
    emapper_outs = []
    for v in os.walk(os.path.dirname(genus), followlinks=True):
        for vv in v[2]:
            if vv.endswith(".emapper") and not "__" in vv:
                emapper_outs += [pjoin(v[0], vv)]

    for f in emapper_outs:
        with open(f) as handle:
            # The file is read as JSON; presumably it maps a gene id to its
            # annotation record -- TODO confirm against the code writing it.
            res = json.load(handle)
            # File name without its 8-character ".emapper" suffix.
            gid = os.path.basename(f)[:-8]
            for k,v in res.items():
                # Falsy (e.g. empty) records are ignored.
                if v :
                    # Raw comma-separated list of "<og>@<taxid>" entries.
                    nogs = v["eggNOG OGs"]
                    # nogs2lowest caches the chosen OG per raw string so the
                    # taxonomy lookup runs once per distinct value.
                    if nogs not in nogs2lowest:
                        taxos = [v.split("@")[1] for v in nogs.split(",")]
                        # Lineage length per taxid (shorter = nearer the root).
                        tax2level = {k : len(v) for k,v in tax_db.get_lineage_translator(list(taxos)).items()}
                        # Keep the OG whose taxon has the shortest lineage;
                        # taxids missing from tax2level rank last (1000).
                        lowest_nog = min([vv.split('@') for vv in nogs.split(",")], key = lambda x : tax2level.get(int(x[1]),1000))[0]
                        nogs2lowest[nogs] = lowest_nog
                    else :
                        lowest_nog = nogs2lowest[nogs]
                    # Append this record's annotations to the chosen OG.
                    nog2ko[lowest_nog] += [v['KEGG_ko']]
                    nog2cogcat[lowest_nog] += [v['COG Functional cat.']]
                    nog2ec[lowest_nog] += [v['EC']]
                    nog2cazy[lowest_nog] += [v['CAZy']]

                    # Record key k is looked up in gid2abinit; when present,
                    # the same annotations are also appended under the
                    # mapped value.
                    if k in gid2abinit:
                        abinit2ko[gid2abinit[k]] += [v['KEGG_ko']]
                        abinit2cogcat[gid2abinit[k]] += [v['COG Functional cat.']]
                        abinit2ec[gid2abinit[k]] += [v['EC']]
                        abinit2cazy[gid2abinit[k]] += [v['CAZy']]
コード例 #4
0
# Step 1: per genome id, the set of distinct raw "eggNOG OGs" strings read
# from the JSON file "<g>.emapper" inside that genome's folder under
# clade_folder. Falsy (e.g. empty) records are skipped.
gid2eggs = {}
for g in tqdm(gids):
    patty = pjoin(clade_folder, g, g + ".emapper")
    with open(patty) as handle:
        gid2eggs[g] = {
            v["eggNOG OGs"]
            for k, v in json.load(handle).items() if v
        }

# Step 2: every taxid referenced by any "<og>@<taxid>" entry, across genomes.
taxos = {
    vvv.split("@")[1]
    for v in gid2eggs.values() for vv in v for vvv in vv.split(",")
}
# Lineage length per taxid (shorter = nearer the root), fetched in one call.
tax2level = {
    k: len(v)
    for k, v in tax_db.get_lineage_translator(list(taxos)).items()
}
# Step 3: replace each raw string by the single OG whose taxon has the
# shortest lineage (taxids missing from tax2level rank last via 1000), then
# de-duplicate per genome. The result is a list so it can be JSON-serialised.
for g in tqdm(gids):
    gid2eggs[g] = list({
        min([vv.split('@') for vv in v.split(",")],
            key=lambda x: tax2level.get(int(x[1]), 1000))[0]
        for v in gid2eggs[g]
    })

# Build the "--checkm <file>" command-line switch only if checkm_file exists;
# otherwise the switch is an empty string.
completeness_switch = "--checkm " + checkm_file if os.path.exists(
    checkm_file) else ""

print("executing mOTUpan")
# Write the genome -> eggNOG mapping as JSON to a named temporary file with
# a ".gid2cog" suffix; with the default settings used here the file is
# removed again when the with-block closes it.
with tempfile.NamedTemporaryFile(mode="w", suffix=".gid2cog") as temp:
    json.dump(gid2eggs, temp, indent=4, sort_keys=True)
def create_CAMI_profile(data_file, sample_id):
    """
    Convert the rows of one sample in a CSV file to the CAMI profiling
    format.

    Input:
        data_file: CSV file (anything pandas.read_csv accepts) with the
            columns "sample", "Assignment" (taxon name) and
            "percentage_of_total_reads".
        sample_id: value of the "sample" column selecting the rows to use;
            it is also written to the @SampleID header field.
    Output: tuple (header, output_list)
        header: the CAMI profile header as one string; its @Ranks field
            holds the longest rank path seen in this sample (empty if the
            sample has no rows).
        output_list: one tab-separated string per selected row with taxid,
            rank, taxid path, scientific-name path and percentage.

    A name that the NCBI taxonomy lookup does not return raises KeyError.
    """
    dataframe = pd.read_csv(data_file)
    subset = dataframe[dataframe["sample"] == sample_id]
    ncbi = NCBITaxa()

    rank_list_list = []  #save all taxonomies to find the longest
    #I use the longest, because virus taxonomy is diverse...
    output_list = []  #stores the CAMI profiles as strings

    # Walk names and percentages in lockstep so that every row reports its
    # own percentage, even when an assignment occurs more than once.
    for name, percentage in zip(subset["Assignment"],
                                subset["percentage_of_total_reads"]):
        #remove names that have some addition in brackets,
        # like " (segment 1)"
        ncbi_name = name.partition(' (')[0]

        #ncbi.get_name_translator() returns a dictionary { 'taxon' : [id]}
        taxids = ncbi.get_name_translator([ncbi_name])[ncbi_name]
        taxid_nr = taxids[0]

        #ncbi.get_rank() requires a list of IDs, and returns a dictionary:
        # {id: 'rank'}
        rank = ncbi.get_rank(taxids)[taxid_nr]

        #ncbi.get_lineage_translator() requires a list of IDs, and returns
        # a dictionary {leaf_id: [root_id, node_id, leaf_id]}
        #The first entry (the root) is dropped from the path.
        tax_path = ncbi.get_lineage_translator(taxids)[taxid_nr][1:]

        #One query per lineage for names and one for ranks; the returned
        # dictionaries are unordered, so they are indexed in path order.
        if tax_path:
            path_names = ncbi.get_taxid_translator(tax_path)
            path_ranks = ncbi.get_rank(tax_path)
        else:
            path_names, path_ranks = {}, {}
        tax_path_sn = [path_names[node] for node in tax_path]
        rank_list = [path_ranks[node] for node in tax_path]

        #Since the path differs between branches, keep all of them and
        # look for the longest afterwards
        rank_list_list.append(rank_list)

        tax_path_string = '|'.join(map(str, tax_path))
        tax_path_sn_string = '|'.join(tax_path_sn)

        output_line = "%s\t%s\t%s\t%s\t%s" % (taxid_nr, rank, tax_path_string,
                                              tax_path_sn_string, percentage)

        output_list.append(output_line)

    #default=[] keeps a sample without any rows from raising ValueError
    longest_taxonomy = '|'.join(max(rank_list_list, key=len, default=[]))

    #Read the specification for details about this header:
    #https://github.com/bioboxes/rfc/blob/60263f34c57bc4137deeceec4c68a7f9f810f6a5/data-format/profiling.mkd
    header = """# Taxonomic Profiling Output
@SampleID:%s
@Version:0.9.3
@Ranks:%s\t#the longest path in this sample: virus taxonomy is messy
@TaxonomyID:ncbi-taxonomy_2018-05-25
@@TAXID\tRANK\tTAXPATH\tTAXPATHSN\tPERCENTAGE
""" % (sample_id, longest_taxonomy)

    return (header, output_list)
コード例 #6
0
    # Rewrite the metadata record of entry k in place, using fields of the
    # assembly metadata ass_md.
    mag2md[k]['sample derived from'] = ass_md['sample_accession']
    mag2md[k]['ENA-CHECKLIST'] = 'ERC000047'
    # The assembly's taxonomy string minus its last whitespace-separated word.
    mag2md[k]['isolation_source'] = " ".join(ass_md['taxonomy'].split()[:-1])
    mag2md[k]['metagenomic source'] = ass_md['ncbi_taxid']
    # These two keys are removed unconditionally, so a record lacking either
    # one raises KeyError here.
    del mag2md[k]['ncbi_taxid']
    del mag2md[k]['taxonomy']
    # The remaining keys are optional and only removed when present.
    if 'sample_accession' in mag2md[k]:
        del mag2md[k]['sample_accession']
    if 'Run' in mag2md[k]:
        del mag2md[k]['Run']
    if 'Lake_code' in mag2md[k]:
        del mag2md[k]['Lake_code']

def get_parent(taxid):
    """Return the ancestor of `taxid` that has the longest lineage.

    The lineage of `taxid` is fetched, every node on it other than `taxid`
    itself is mapped to its own lineage, and the node with the most lineage
    entries is returned. Raises ValueError (from max) when the lineage holds
    no node besides `taxid`.
    """
    lineages = ncbi.get_lineage_translator(ncbi.get_lineage(taxid))
    ancestors = [(node, path) for node, path in lineages.items()
                 if node != taxid]
    best_node, _ = max(ancestors, key=lambda entry: len(entry[1]))
    return best_node
# Maps a taxon name to an (NCBI name, taxid) pair; in the lines below only
# the 'root' entry is filled in.
tax2uncul = {}
# Each distinct value of found_taxo is unpacked into two items, of which only
# the first (k, used as a taxon name) is read below.
for k, v in set(found_taxo.values()):
    # First try the "uncultured <k> bacterium" / "... archaeon" names.
    # NOTE(review): this lookup also runs for 'root', whose result is then
    # discarded by the special case right below.
    options = ncbi.get_name_translator(
        ["uncultured " + k + " bacterium", "uncultured " + k + " archaeon"])
    if k == 'root':
        # Fixed entry for the root of the taxonomy.
        tax2uncul[k] = ('uncultured prokaryote', 198431)
        continue
    # Fallbacks, tried in order while nothing has been found yet.
    if not options:
        options = ncbi.get_name_translator(["uncultured " + k + " sp."])
    if not options:
        options = ncbi.get_name_translator(
            ["uncultured " + k + " cyanobacterium"])