Code example #1
def funGenusLocalBlast(sFastaFileName, sGBKFileName, dbName):
    """Import packages used """
    from Bio.Blast.Applications import NcbiblastnCommandline
    from Bio import SeqIO
    from Bio.SeqUtils import GC
    import subprocess
    import xlsxwriter
    from funReadBlast import funReadBlast
    from funANICalc import funANICalc
    from collections import Counter
    from ete3 import NCBITaxa

    #sFastaFileName = "TestFolderFasta/AMERTCC_31.fasta"
    #sGBKFileName = "TestFolderGenBank/AMERTCC_31.annotation.20161209.gbk"
    #dbName = "ref_prok_rep_genomes"

    columnTitleRow = [
        "FDAARGOS_ID",  #0
        "Num_Contig",  #1 
        "Assembly_Size",  #2
        "N_50",  #3
        "Largest_Contig_Size",  #4
        "Contig_ID",  #5
        "Contig_Length",  #6
        "Contig_GC",  #7
        "Proposed Organism",  #8
        "Blast_Hit",  #9
        "ACCESSION",  #10
        "Score",  #11
        "Percent_Query_Identity",  #12
        "Percent_Query_Coverage",  #13
        "Scientific_Name",  #14
        "Query_ANI_Coverage",  #15
        "Subject_ANI_Coverage",  #16
        "Query_ANI_Length",  #17
        "Subject_ANI_Length",  #18
        "Query_ANI_HD",  #19
        "Subject_ANI_HD",  #20
        "Query_ANI_Identity",  #21
        "Subject_ANI_Identity",  #22
        "Query_ANI_SE",  #23
        "Subject_ANI_SE"
    ]

    sFileName = sFastaFileName[0:-6] + '_Genus_Blast.xlsx'
    workbook = xlsxwriter.Workbook(sFileName)

    lARGOSID = sFastaFileName.split("/")
    sARGOSID = lARGOSID[-1][0:-6]
    """Import Fasta sequence from assembly file"""

    lSeqRecord = []
    for seq_record in SeqIO.parse(sFastaFileName, "fasta"):
        lSeqRecord.append(seq_record)
    """Import Annotation"""
    all_species = []

    if sGBKFileName == "N/A":
        for seq_record in lSeqRecord:
            all_species.append('N/A-N/A')
    else:
        f = open(sGBKFileName, 'r', errors='ignore')
        for line in f:
            if "ORGANISM" in line:
                print(line)
                sSpecie = line
                all_species.append(sSpecie)
        f.close()
    """Calculate Contig Statistics"""
    lSize = []
    lGC = []

    for seq_record in lSeqRecord:
        lSize.append(len(seq_record.seq))
        lGC.append(GC(seq_record.seq))

    nTotalAssemblySize = sum(lSize)
    nNumContig = len(lSize)
    nLargestContig = max(lSize)

    #nTotalGC = np.multiply(lGC,lSize)
    #nTotalPercentGC = sum(nTotalGC)/nTotalAssemblySize

    nThreshold = 0.5 * nTotalAssemblySize
    lTempSize = sorted(lSize, reverse=True)

    nSize = 0
    count = 0

    while nSize <= nThreshold:
        nSize = nSize + lTempSize[count]
        out = count
        count = count + 1

    nN50 = lTempSize[out]

    #Run Blast

    sOutFileName = sARGOSID + ".txt"
    ncbi = NCBITaxa()

    Genus = all_species[0].split()
    Genus = Genus[1]
    name2taxid = ncbi.get_name_translator([Genus])
    sOrganism = "\"txid" + str(name2taxid[Genus][0]) + " [ORGN]\""

    blastn_cline = NcbiblastnCommandline(
        task="megablast",
        query=sFastaFileName,
        db="nt",
        evalue=0.001,
        max_target_seqs=5,
        outfmt="\"6 qseqid qlen sscinames sacc stitle length score pident qcovs\"",
        entrez_query=sOrganism,
        remote=1,
        out=sOutFileName)


    process = subprocess.Popen(
        "export BLASTDB=/Users/yi.yan/Documents/db/:$BLASTDB"
        + "&&/usr/local/ncbi/blast/bin/" + str(blastn_cline),
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE)
    proc_out, proc_err = process.communicate()


    tblComplete = funReadBlast(sOutFileName, all_species, sARGOSID, nNumContig,
                               nTotalAssemblySize, nN50, nLargestContig, lGC, lSize)
    #Run ANI

    FinalTbl = funANICalc(tblComplete, lSeqRecord, 'nt')

    s = sorted(FinalTbl, key=lambda x: (x[6], x[11]), reverse=True)

    ContigNames = [i[5] for i in s]
    lContigName = list(set(ContigNames))
    SummaryTbl = []
    for Name in lContigName:
        SummaryTbl.append(s[ContigNames.index(Name)])

    SummaryTbl = sorted(SummaryTbl, key=lambda x: (x[6], x[11]), reverse=True)

    worksheet = workbook.add_worksheet('NT_Genus')

    for i in range(0, len(s[0])):
        worksheet.write(0, i, columnTitleRow[i])

    for i in range(0, len(s)):
        for j in range(0, len(s[0])):
            worksheet.write(i + 1, j, s[i][j])
    # Species distribution
    nRowStart = i + 5
    lSciNames = [i[14] for i in s]
    lGenus = []
    lSpecies = []
    for i in lSciNames:
        temp = i.replace('[', '')
        temp = temp.replace(']', '')
        lGenus.append(temp.split()[0])
        lSpecies.append(temp.split()[0] + ' ' + temp.split()[1])

    c = Counter(lGenus)

    nItem = len(lGenus)
    Genus = list(c.keys())

    tempHeader = ['Genus', 'Count', 'Percentage']

    for i in range(0, 3):
        worksheet.write(nRowStart, i, tempHeader[i])

    for i in range(0, len(Genus)):
        # write data starting one row below the header row
        worksheet.write(nRowStart + 1 + i, 0, Genus[i])
        worksheet.write(nRowStart + 1 + i, 1, c.get(Genus[i]))
        worksheet.write(nRowStart + 1 + i, 2, c.get(Genus[i]) / nItem)

    nRowStart = nRowStart + i + 5

    c = Counter(lSpecies)

    nItem = len(lSpecies)
    Species = list(c.keys())

    tempHeader = ['Species', 'Count', 'Percentage']

    for i in range(0, 3):
        worksheet.write(nRowStart, i, tempHeader[i])

    for i in range(0, len(Species)):
        # write data starting one row below the header row
        worksheet.write(nRowStart + 1 + i, 0, Species[i])
        worksheet.write(nRowStart + 1 + i, 1, c.get(Species[i]))
        worksheet.write(nRowStart + 1 + i, 2, c.get(Species[i]) / nItem)
#Summary
    worksheet = workbook.add_worksheet('NT_Summary_Genus')

    for i in range(0, len(s[0])):
        worksheet.write(0, i, columnTitleRow[i])

    for i in range(0, len(SummaryTbl)):
        for j in range(0, len(s[0])):
            worksheet.write(i + 1, j, SummaryTbl[i][j])

# Species distribution
    nRowStart = i + 5
    lSciNames = [i[14] for i in SummaryTbl]
    lGenus = []
    lSpecies = []
    for i in lSciNames:
        temp = i.replace('[', '')
        temp = temp.replace(']', '')
        lGenus.append(temp.split()[0])
        lSpecies.append(temp.split()[0] + ' ' + temp.split()[1])

    c = Counter(lGenus)

    nItem = len(lGenus)
    Genus = list(c.keys())

    tempHeader = ['Genus', 'Count', 'Percentage']

    for i in range(0, 3):
        worksheet.write(nRowStart, i, tempHeader[i])

    for i in range(0, len(Genus)):
        # write data starting one row below the header row
        worksheet.write(nRowStart + 1 + i, 0, Genus[i])
        worksheet.write(nRowStart + 1 + i, 1, c.get(Genus[i]))
        worksheet.write(nRowStart + 1 + i, 2, c.get(Genus[i]) / nItem)

    nRowStart = nRowStart + i + 5

    c = Counter(lSpecies)

    nItem = len(lSpecies)
    Species = list(c.keys())

    tempHeader = ['Species', 'Count', 'Percentage']

    for i in range(0, 3):
        worksheet.write(nRowStart, i, tempHeader[i])

    for i in range(0, len(Species)):
        # write data starting one row below the header row
        worksheet.write(nRowStart + 1 + i, 0, Species[i])
        worksheet.write(nRowStart + 1 + i, 1, c.get(Species[i]))
        worksheet.write(nRowStart + 1 + i, 2, c.get(Species[i]) / nItem)

    workbook.close()
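
For reference, a call that mirrors the commented-out test values near the top of the function might look like the following sketch. The paths and database name are taken from those comments; funReadBlast, funANICalc, a local BLAST+ install, and the ete3 taxonomy database are assumed to be available.

funGenusLocalBlast("TestFolderFasta/AMERTCC_31.fasta",
                   "TestFolderGenBank/AMERTCC_31.annotation.20161209.gbk",
                   "ref_prok_rep_genomes")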
Code example #2
"""
Get the protein sequences for a list of genera.

The module uses a text file with a number of TaxIDs to download the protein sequences belonging to
the respective genera. The taxonomic lineage will be added to the protein header.
"""

import logging
import os
import re
import urllib.parse
import urllib.request
from ete3 import NCBITaxa
from mptk import general_functions

logger = logging.getLogger("pies.use_amplicon")
NCBI = NCBITaxa()


def get_taxid(input_file):
    """
    Return a list of tax IDs based on tax names.

    The input_file has one tax name per line. The function `get_taxid` returns a
    list of tax IDs of the same length.

    Parameters
    ----------
      input_file: file with tax names on each line

    Returns
    -------
      tax_id_list: list of tax IDs, one per input line
    """
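    # The original listing is truncated here. The body below is a sketch
    # consistent with the docstring (assumes one tax name per line and uses
    # the module-level NCBI object defined above).
    tax_id_list = []
    with open(input_file) as handle:
        for line in handle:
            name = line.strip()
            translated = NCBI.get_name_translator([name])  # {name: [taxid, ...]}
            tax_id_list.append(translated.get(name, [None])[0])
    return tax_id_list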
Code example #3
File: fNCBItax.py  Project: maryletteroa/CCMetagen
def lineage_extractor(query_taxid, TaxInfo_object):
    list_of_taxa_ranks = [
        'superkingdom', 'kingdom', 'phylum', 'class', 'order', 'family',
        'genus', 'species'
    ]
    ncbi = NCBITaxa()
    lineage = ncbi.get_lineage(query_taxid)
    ranks = ncbi.get_rank(lineage)
    names = ncbi.get_taxid_translator(lineage)

    # get known data

    for key, val in ranks.items():

        if val == list_of_taxa_ranks[0]:
            TaxInfo_object.Superkingdom = names[key]
            TaxInfo_object.Superkingdom_TaxId = key

        elif val == list_of_taxa_ranks[1]:
            TaxInfo_object.Kingdom = names[key]
            TaxInfo_object.Kingdom_TaxId = key

        elif val == list_of_taxa_ranks[2]:
            TaxInfo_object.Phylum = names[key]
            TaxInfo_object.Phylum_TaxId = key

        elif val == list_of_taxa_ranks[3]:
            TaxInfo_object.Class = names[key]
            TaxInfo_object.Class_TaxId = key

        elif val == list_of_taxa_ranks[4]:
            TaxInfo_object.Order = names[key]
            TaxInfo_object.Order_TaxId = key

        elif val == list_of_taxa_ranks[5]:
            TaxInfo_object.Family = names[key]
            TaxInfo_object.Family_TaxId = key

        elif val == list_of_taxa_ranks[6]:
            TaxInfo_object.Genus = names[key]
            TaxInfo_object.Genus_TaxId = key

        elif val == list_of_taxa_ranks[7]:
            TaxInfo_object.Species = names[key]
            TaxInfo_object.Species_TaxId = key

    # fill in the blanks
    if TaxInfo_object.Superkingdom is None:
        TaxInfo_object.Superkingdom = "unk_sk"

    if TaxInfo_object.Kingdom is None:
        TaxInfo_object.Kingdom = "unk_k"

    if TaxInfo_object.Phylum is None:
        TaxInfo_object.Phylum = "unk_p"

    if TaxInfo_object.Class is None:
        TaxInfo_object.Class = "unk_c"

    if TaxInfo_object.Order is None:
        TaxInfo_object.Order = "unk_o"

    if TaxInfo_object.Family is None:
        TaxInfo_object.Family = "unk_f"

    if TaxInfo_object.Genus is None:
        TaxInfo_object.Genus = "unk_g"

    if TaxInfo_object.Species is None:
        TaxInfo_object.Species = "unk_s"

    return TaxInfo_object
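
The rank-by-rank if/elif chain above can be collapsed into a table-driven loop. The following sketch is intended to be behavior-equivalent, under the assumption that TaxInfo_object exposes exactly the attributes used above, initialized to None.

def lineage_extractor_compact(query_taxid, TaxInfo_object):
    # rank -> (attribute name, fallback used when the rank is missing)
    rank_map = {
        'superkingdom': ('Superkingdom', 'unk_sk'), 'kingdom': ('Kingdom', 'unk_k'),
        'phylum': ('Phylum', 'unk_p'), 'class': ('Class', 'unk_c'),
        'order': ('Order', 'unk_o'), 'family': ('Family', 'unk_f'),
        'genus': ('Genus', 'unk_g'), 'species': ('Species', 'unk_s'),
    }
    ncbi = NCBITaxa()
    lineage = ncbi.get_lineage(query_taxid)
    ranks = ncbi.get_rank(lineage)
    names = ncbi.get_taxid_translator(lineage)
    for taxid, rank in ranks.items():
        if rank in rank_map:
            attr = rank_map[rank][0]
            setattr(TaxInfo_object, attr, names[taxid])
            setattr(TaxInfo_object, attr + '_TaxId', taxid)
    for attr, fallback in rank_map.values():
        if getattr(TaxInfo_object, attr) is None:
            setattr(TaxInfo_object, attr, fallback)
    return TaxInfo_object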
Code example #4
File: CCMetagen.py  Project: carden24/CCMetagen
#q = 50
#d = 0.2
#p = 0.05
#st = 99
#gt = 98
#ft = 95
#ot = 80
#ct = 0
#pt = 0
#du = 'kma'


##### Checks:

# Implicitly run ete3.NCBITaxa.__init__() to check for a valid taxonomy database
NCBITaxa()

# Warning if RefDatabase is unknown
if ref_database not in ("UNITE", "RefSeq", "nt"):
    print("""Reference database (-r) must be either UNITE, RefSeq or nt.
           The input is case sensitive and the default is nt.""")
    sys.exit("Try again.")


##### Read input files and output a pandas dataframe
print("")
print("Reading file %s" % f)
print("")

df = pd.read_csv(f, sep='\t', index_col=0, encoding='latin1')
Code example #5
import sys
import time
import random
import os

import pandas as pd
import numpy as np
import sqlite3
import sqlalchemy

from ete3 import NCBITaxa
ncbi_taxdump_names = ["taxdump_2019-01-01.tar.gz", "taxdump_2019-06-01.tar"]
ncbi_dbs = []
if not os.path.exists(ncbi_taxdump_names[0].replace(".tar.gz", ".sqlite")):
    ncbi_dbs.append(
        NCBITaxa(ncbi_taxdump_names[0].replace(".tar.gz", ".sqlite"),
                 taxdump_file=ncbi_taxdump_names[0]))
else:
    ncbi_dbs.append(
        NCBITaxa(ncbi_taxdump_names[0].replace(".tar.gz", ".sqlite")))
if not os.path.exists(ncbi_taxdump_names[1].replace(".tar", ".sqlite")):
    ncbi_dbs.append(
        NCBITaxa(ncbi_taxdump_names[1].replace(".tar", ".sqlite"),
                 taxdump_file=ncbi_taxdump_names[1]))
else:
    ncbi_dbs.append(NCBITaxa(ncbi_taxdump_names[1].replace(".tar", ".sqlite")))
ncbi_new = NCBITaxa()
#ncbi_new.update_taxonomy_database()
ncbi_dbs.append(ncbi_new)

from lca_functions import *
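
With several NCBITaxa instances pinned to different taxdump snapshots, the same query can be compared across taxonomy versions. A small sketch; taxid 9606 (Homo sapiens) is chosen purely for illustration:

for db in ncbi_dbs:
    lineage = db.get_lineage(9606)  # 9606 = Homo sapiens (illustrative)
    print(len(lineage), db.get_taxid_translator([9606])[9606])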
Code example #6
def get_tax_lineage(taxonid, source, tax_rank_id={}):
    """Return taxonomy lineage information

    This function uses either Biopython library to connect
    NCBI database and search for taxonomy information
    or searches the information locally by using ete3 taxdump
    file or taxit program to create sql version of it.

    Parameters
    -------------
    taxonid : string
        Taxonomic id of the species
    source : string
        Source to be used to collect the info about the taxonid
    tax_rank_id: dict
        Taxonomic rank and id

    Returns
    -------------
    lineage: dict
        Species lineage

    """

    if taxonid not in LINEAGES:
        if source == "taxdump":
            ncbi_taxdump = NCBITaxa()
            lineage_ids = ncbi_taxdump.get_lineage(taxonid)
            ranks = ncbi_taxdump.get_rank(lineage_ids)
            names = ncbi_taxdump.get_taxid_translator(lineage_ids)
            lineage = {ranks[i]: names[i] for i in lineage_ids}

            LINEAGES[taxonid] = lineage
            return LINEAGES[taxonid]

        if source == "taxit":
            lineage = {
                level: tax_rank_id[tax_rank_id[taxonid][level]]["tax_name"]
                for level in TAX_LEVELS
            }

            LINEAGES[taxonid] = lineage
            return LINEAGES[taxonid]

        while True:
            data = ""
            try:
                Entrez.email = "*****@*****.**"
                handle = Entrez.efetch(id=taxonid,
                                       db="taxonomy",
                                       retmode="xml")
                data = Entrez.read(handle)
                handle.close()
            except Exception as e:
                with open(LOG, "a") as log:
                    print("Error when searching information about {}".format(
                        taxonid),
                          file=log)

            if data:
                break

        lineage = {
            d["Rank"]: d["ScientificName"]
            for d in data[0]["LineageEx"]
        }
        lineage[data[0]["Rank"]] = data[0]["ScientificName"]
        LINEAGES[taxonid] = lineage

    return LINEAGES[taxonid]
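
A usage sketch for get_tax_lineage, assuming the module-level LINEAGES cache is a plain dict; taxid 562 (Escherichia coli) is chosen for illustration:

LINEAGES = {}
lineage = get_tax_lineage(562, "taxdump")
print(lineage.get("genus"), lineage.get("species"))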
Code example #7
def create_CAMI_profile(data_file, sample_id):
    """
    CSV Parser for converting information to the CAMI profiling
    format.
    
    Input: csv file with the required information and the sample ID
    Output: header and contents of the CAMI profile file
        (see format linked above)
    """
    dataframe = pd.read_csv(data_file)
    subset = dataframe[dataframe["sample"] == sample_id]
    taxa = subset["Assignment"]
    total_percentages = subset["percentage_of_total_reads"]
    ncbi = NCBITaxa()

    rank_list_list = []  #save all taxonomies to find the longest
    #I use the longest, because virus taxonomy is diverse...
    output_list = []  #stores the CAMI profiles as strings

    for name in taxa:
        #remove names that have some addition in brackets,
        # like " (segment 1)"
        if ' (' in name:
            ncbi_name = name[:name.index(' (')]
        else:
            ncbi_name = name

        taxon_and_id = ncbi.get_name_translator([ncbi_name])
        #ncbi.get_name_translator() returns a dictionary { 'taxon' : [id]}
        taxid = taxon_and_id[ncbi_name]
        #taxid is a list with one number
        taxid_nr = taxid[0]

        rank_dict = ncbi.get_rank(taxid)
        #ncbi.get_rank() requires a list of IDs, and returns a dictionary:
        # {id: 'rank'}
        rank = rank_dict[taxid_nr]

        tax_path_dict = ncbi.get_lineage_translator(taxid)  #[taxid_nr]
        #ncbi.get_lineage_translator() requires a list of IDs, and returns
        # a dictionary {leaf_id: [root_id, node_id, leaf_id]}
        tax_path = tax_path_dict[taxid_nr][1:]

        tax_path_sn = []
        #with a for-loop you can translate the taxids in the list
        # 'tax_path' to their corresponding scientific names (sn)
        for t in tax_path:
            tax_path_sn.append(ncbi.get_taxid_translator([t])[t])

        rank_list = []
        #Making this list requires using a for-loop;
        # using the function on a list makes an UNORDERED dictionary
        #Also, since the path differs between branches, I will look
        # for the longest using a list of lists
        for taxid in tax_path:
            rank_dict = ncbi.get_rank([taxid])
            rank = rank_dict[taxid]
            rank_list.append(rank)

        rank_list_list.append(rank_list)

        tax_path_string = '|'.join(map(str, tax_path))
        tax_path_sn_string = '|'.join(tax_path_sn)

        percentage = subset.loc[subset["Assignment"] ==
                                name]["percentage_of_total_reads"].values[0]

        output_line = "%s\t%s\t%s\t%s\t%s" % (taxid_nr, rank, tax_path_string,
                                              tax_path_sn_string, percentage)

        output_list.append(output_line)

    longest_taxonomy = '|'.join(max(rank_list_list, key=len))

    #Read the specification for details about this header:
    #https://github.com/bioboxes/rfc/blob/60263f34c57bc4137deeceec4c68a7f9f810f6a5/data-format/profiling.mkd
    header = """# Taxonomic Profiling Output
@SampleID:%s
@Version:0.9.3
@Ranks:%s\t#the longest path in this sample: virus taxonomy is messy
@TaxonomyID:ncbi-taxonomy_2018-05-25
@@TAXID\tRANK\tTAXPATH\tTAXPATHSN\tPERCENTAGE
""" % (sample_id, longest_taxonomy)

    return (header, output_list)
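
A usage sketch for create_CAMI_profile; the input CSV name and sample ID are hypothetical, and pandas is assumed to be imported as pd:

header, lines = create_CAMI_profile("assignments.csv", "sample_01")
with open("sample_01.profile.txt", "w") as out:
    out.write(header)
    out.write("\n".join(lines) + "\n")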
Code example #8
def tdb_from_hits(hits, minPerc=50, testing=False):
    '''
    Determines the lowest taxonomic level with at least minPerc certainty

    For every hit:
        reconstruct the lineage (kingdom, phylum, class, etc.)
        add a count to every rank in the lineage

    For every rank:
        see if the number of hits matching one taxa at that rank is above the minPerc
        the denominator for this equation is the number of hits that have a phyla rank

    * Note: this is complicated because some lower ranks lack higher ranks.
        For example, species [Eubacterium] rectale (taxID 39491) has no genus.
        Also, species [artificial construct] (taxID 32630) has no rank other than species.

    '''

    from ete3 import NCBITaxa
    ncbi = NCBITaxa()

    Levels = [
        'superkingdom', 'phylum', 'class', 'order', 'family', 'genus',
        'species'
    ]

    # generate nested dictionary for levels
    countDic = {}
    for level in Levels:
        countDic[level] = {}

    # fill in nested dictionary
    for t in hits['taxID'].tolist():
        if t == 0:
            continue

        # This try / except block catches sporadic errors such as:
        # sqlite3.OperationalError: disk I/O error
        try:
            lin = ncbi.get_lineage(t)
            lin2name = ncbi.get_taxid_translator(lin)
            name2rank = ncbi.get_rank(lin)
        except Exception:
            time.sleep(1)
            lin = ncbi.get_lineage(t)
            lin2name = ncbi.get_taxid_translator(lin)
            name2rank = ncbi.get_rank(lin)

        for i in lin:
            rank = name2rank[i]
            name = lin2name[i]
            if rank in countDic:
                countDic[rank][i] = countDic[rank].get(i, 0) + 1

    # make the table
    total = sum(countDic['phylum'].values())
    table = {
        'tax_ID': [],
        'tax_confidence': [],
        'tax_level': [],
        'taxonomy': []
    }
    count = None

    for level in Levels:
        dic = countDic[level]
        for name in sorted(dic, key=dic.get, reverse=True):
            count = dic[name]
            break

        if count is None:
            table['tax_ID'].append(None)
            table['tax_confidence'].append(0)
            table['tax_level'].append(level)
            table['taxonomy'].append('unk')

        else:
            lin = ncbi.get_lineage(name)
            lin2name = ncbi.get_taxid_translator(lin)
            name2rank = ncbi.get_rank(lin)
            rank2name = {v: k for k, v in name2rank.items()}
            tax = (lin2name[rank2name[level]])

            table['tax_ID'].append(name)
            table['tax_confidence'].append(((count / total) * 100))
            table['tax_level'].append(level)
            table['taxonomy'].append(tax)

        count = None
    tdb = pd.DataFrame(table)

    # find and mark the best hit
    best = tdb['tax_ID'][tdb['tax_confidence'] >= minPerc].tolist()[-1]
    tdb['best_hit'] = [True if i == best else False for i in tdb['tax_ID']]

    # get the full taxonomy for the best hit
    tdb['full_tax'] = [lineage_from_taxId(t) if b else False
                       for t, b in zip(tdb['tax_ID'], tdb['best_hit'])]

    return tdb
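
A usage sketch for tdb_from_hits: it only needs a DataFrame with a 'taxID' column; pandas is assumed to be imported as pd, and lineage_from_taxId is assumed to be defined in the same module. The taxids are illustrative.

hits = pd.DataFrame({'taxID': [562, 562, 561, 0]})
tdb = tdb_from_hits(hits, minPerc=50)
print(tdb[tdb['best_hit']])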
Code example #9
# BOILER PLATE ------------------------------------------------------------------------------------------------------------

import sqlite3
from ete3 import NCBITaxa
ncbi_taxa = NCBITaxa()

db = sqlite3.connect("/u/home/c/cloeffle/scratch/sql/S_aureus.db")
cur = db.cursor()

db.execute(
    "CREATE TABLE TAX_INFO(DBNAME TEXT, GIVENTAXID INT, GENUSTAXID INT, SPECIESTAXID INT, STRAINTAXID INT);"
)
db.commit()

# FUNCTIONS ---------------------------------------------------------------------------------------------------------------


# Takes a list of lists and prints each list
# used in debugging
def look_at(entry):
    for i in entry:
        print(i)
    return


# manually handle a taxid where taxid lineage could not be obtained using NCBITaxa
# mainly used for debugging
def handle(taxid):
    entry = []
    print("This is the taxid: " + str(taxid))
    taxid = int(taxid)
Code example #10
File: UpdateDatabase.py  Project: smangul1/CMR
def get_lineage(taxId):
    ncbi_taxa = NCBITaxa()
    lineage = ncbi_taxa.get_lineage(taxId)
    return lineage
Code example #11
File: UpdateDatabase.py  Project: smangul1/CMR
def get_name(taxId):
    ncbi_taxa = NCBITaxa()
    name_dict = ncbi_taxa.get_taxid_translator([taxId])
    for taxid, name in name_dict.items():
        # print(name)
        return name
Code example #12
File: UpdateDatabase.py  Project: smangul1/CMR
def get_rank(taxId):
    ncbi_taxa = NCBITaxa()
    rank_dict = ncbi_taxa.get_rank([taxId])
    for taxid, rank in rank_dict.items():
        # print(rank)
        return rank
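
Each of the three helpers above builds a fresh NCBITaxa, which reopens the taxonomy SQLite database on every call. A sketch of the same helpers sharing one module-level instance (taxId is assumed to be an int, matching the keys ete3 returns):

from ete3 import NCBITaxa

_NCBI = NCBITaxa()  # constructed once, reused by every call

def get_lineage(taxId):
    return _NCBI.get_lineage(taxId)

def get_name(taxId):
    return _NCBI.get_taxid_translator([taxId]).get(taxId)

def get_rank(taxId):
    return _NCBI.get_rank([taxId]).get(taxId)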
Code example #13
def taxId2Species(taxid):
    return NCBITaxa().get_taxid_translator([taxid])
Code example #14
File: taxonomy.py  Project: AuReMe/metage2metabo
def extract_taxa(mpwt_taxon_file,
                 taxon_output_file,
                 tree_output_file,
                 taxonomy_level="phylum"):
    """From NCBI taxon ID, extract taxonomy rank and create a tree file

    Args:
        mpwt_taxon_file (str): mpwt taxon file for species in sbml folder
        taxon_output_file (str): path to taxonomy output file
        tree_output_file (str): path to tree output file
        taxonomy_level (str): taxonomy level, must be: phylum, class, order, family, genus or species.
    """
    ncbi = NCBITaxa()

    # Map the taxonomy level to the taxonomy index in the ranks list.
    map_taxonomy_index = {
        'phylum': 0,
        'class': 1,
        'order': 2,
        'family': 3,
        'genus': 4,
        'species': 5
    }
    taxonomy_index = map_taxonomy_index[taxonomy_level]

    taxon_ids = []

    taxon_count = {}
    taxonomy_file_datas = []

    with open(mpwt_taxon_file, "r") as taxon_file:
        csvfile = csv.reader(taxon_file, delimiter="\t")
        next(csvfile)
        for line in csvfile:
            taxon_ids.append(line[1])
            lineage = ncbi.get_lineage(line[1])
            lineage2ranks = ncbi.get_rank(lineage)
            names = ncbi.get_taxid_translator(lineage)
            ranks2lineage = dict((rank, names[taxid])
                                 for (taxid, rank) in lineage2ranks.items())
            ranks = [
                ranks2lineage.get(rank, "unknown") for rank in
                ["phylum", "class", "order", "family", "genus", "species"]
            ]
            if ranks[taxonomy_index] != "unknown":
                taxon = ranks[taxonomy_index]
            else:
                taxon = "unknown"

            taxon = taxon.replace(' ', '_').replace('.', '')
            if taxon not in taxon_count:
                taxon_count[taxon] = 1
            elif taxon == "unknown":
                taxon_count[taxon] = ""
            else:
                taxon_count[taxon] += 1

            row = ([line[0], line[1]] +
                   [taxon + '__' + str(taxon_count[taxon])] + ranks)
            taxonomy_file_datas.append(row)

    with open(taxon_output_file, "w") as taxonomy_file:
        csvwriter = csv.writer(taxonomy_file, delimiter="\t")
        csvwriter.writerow([
            "organism_id", "taxid", "taxon_number", "phylum", "class", "order",
            "family", "genus", "species"
        ])
        for taxonomy_file_data in taxonomy_file_datas:
            csvwriter.writerow(taxonomy_file_data)

    tree = ncbi.get_topology(taxon_ids)

    with open(tree_output_file, "w") as tree_file:
        tree_file.write(tree.get_ascii(attributes=["sci_name", "rank"]))
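
A usage sketch for extract_taxa; all file paths are hypothetical:

extract_taxa("taxon_id.tsv", "taxonomy.tsv", "taxonomy_tree.txt",
             taxonomy_level="genus")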
Code example #15
File: emapper.py  Project: ropolomx/pipeline-v5
def parse_args(parser):
    args = parser.parse_args()

    if args.version:
        print(get_version())
        sys.exit(0)

    if args.data_dir:
        set_data_path(args.data_dir)

    if not args.no_annot and not pexists(get_eggnogdb_file()):
        print(colorify(
            'Annotation database data/eggnog.db not present. Use download_eggnog_database.py to fetch it',
            'red'))
        raise emapperException()

    if args.mode == 'diamond':
        dmnd_db = args.dmnd_db if args.dmnd_db else get_eggnog_dmnd_db()
        if not pexists(dmnd_db):
            print(colorify(
                'DIAMOND database %s not present. Use download_eggnog_database.py to fetch it'
                % dmnd_db, 'red'))
            raise emapperException()

    if args.cpu == 0:
        args.cpu = multiprocessing.cpu_count()

    # No --servermode available for diamond
    if args.mode == 'diamond' and args.servermode:
        parser.error(
            '--mode [diamond] and --servermode are mutually exclusive')

    # Output file required unless running in servermode
    if not args.servermode and not args.output:
        parser.error('An output project name is required (-o)')

    # Servermode implies using mem-based databases
    if args.servermode:
        args.usemem = True

    # Direct annotation implies no searches
    if args.annotate_hits_table:
        args.no_search = True
        args.no_annot = False

    # Sets GO evidence bases
    if args.go_evidence == 'experimental':
        args.go_evidence = set(["EXP", "IDA", "IPI", "IMP", "IGI", "IEP"])
        args.go_excluded = set(["ND", "IEA"])

    elif args.go_evidence == 'non-electronic':
        args.go_evidence = None
        args.go_excluded = set(["ND", "IEA"])
    else:
        raise ValueError('Invalid --go_evidence value')

    # Check inputs for running sequence searches
    if not args.no_search and not args.servermode:
        if not args.input:
            parser.error('An input fasta file is required (-i)')

        # HMM
        if args.mode == 'hmmer':
            if not args.db and not args.guessdb:
                parser.error(
                    'HMMER mode requires specifying a target database (i.e. -d, --guessdb ))'
                )
            if args.db and args.guessdb:
                parser.error('-d and --guessdb options are mutually exclusive')

            if args.guessdb:
                from ete3 import NCBITaxa
                ncbi = NCBITaxa()
                lineage = ncbi.get_lineage(args.guessdb)
                for tid in reversed(lineage):
                    if tid in TAXID2LEVEL:
                        print(tid, TAXID2LEVEL[tid])
                        args.db = TAXID2LEVEL[tid]
                        break
        # DIAMOND
        elif args.mode == 'diamond':
            #if args.db or args.guessdb:
            #    parser.error('diamond mode does not require -d or --guessdb options')
            pass

    return args
Code example #16
def connect_ncbitaxa():
    # ncbi = NCBITaxa("/data/collaborations/spongilla_web/webplugin_py2/ete3_webserver/taxa.sqlite")
    ncbi = NCBITaxa("./taxa.sqlite")
    return ncbi
Code example #17
def get_child_taxa(taxid):
    """get child taxids using """
    ncbi = NCBITaxa(dbfile=app.config['TAXA_SQLITE'])
    child_taxids = ncbi.get_descendant_taxa(int(taxid),
                                            intermediate_nodes=True)
    return child_taxids
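
A usage sketch, assuming the Flask-style app.config above points at a valid taxa.sqlite; taxid 543 (Enterobacteriaceae) is chosen for illustration:

children = get_child_taxa(543)
print(len(children))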
Code example #18
    def __init__(self,
                 file_loc,
                 input_type,
                 artifact_threshold=0,
                 verbose=0,
                 extension=None,
                 include_strains=False):
        """
        Creates OTUdata object for manipulation/transformation

        Parameters
        ------------
        file_loc: str,
            location/file name of the input file (taxa profiling output file)

        input_type: str,
            what type of taxa profiling tool was used to generate the file, e.g. "kaiju"

        Returns
        ------------
        N/A

        """

        self.file_id = ''
        self.include_strains = include_strains
        self.verbose = verbose
        self.ncbi = NCBITaxa()
        self.cumulated = False
        self.basic_ranks = [
            'superkingdom', 'phylum', 'class', 'order', 'family', 'genus',
            'species'
        ]
        self.ranks = {
            'superkingdom': set(),
            'phylum': set(),
            'class': set(),
            'order': set(),
            'family': set(),
            'genus': set(),
            'species': set()
        }
        self.superkingdom = {
            'virus': set(),
            'bacteria': set(),
            'eukaryote': set(),
            'archaea': set()
        }

        if extension is None:
            self.file_id = self.process_file_name(file_loc)
        else:
            self.file_id = self.process_file_name_with_known_extension(
                file_loc, extension)

        if input_type.lower() not in ['kaiju', 'old kaiju', 'old kaiju v2']:
            print(
                "[ERROR] only 'kaiju','old kaiju','old kaiju v2' input type supported"
            )
        if input_type.lower() == 'kaiju':
            self.otufile = read_kj(file_loc, artifact_threshold)

        if input_type.lower() == 'old kaiju':
            self.otufile = read_old_kj(file_loc, artifact_threshold)

        if input_type.lower() == 'old kaiju v2':
            self.otufile = read_old_kjV2(file_loc, artifact_threshold)

        #root_id = [-1, 1, 131567]
        root_id = [-1]
        for root in root_id:
            if root in self.otufile:
                del self.otufile[root]

        ### These are problematic NCBI taxa IDs that have been deleted; replace them with their updated taxa IDs.

        fix_taxa = get_dict()
        for k, v in fix_taxa.items():
            if k in self.otufile:
                self.otufile[v] = self.otufile[k]
                del self.otufile[k]

        self.update_ranks()
        self.update_superkingdom()
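
A construction sketch for the class this __init__ belongs to (called OTUdata in the docstring); the file name is hypothetical, and read_kj must be importable:

otu = OTUdata("sample_01.kaiju.out", "kaiju", artifact_threshold=10)
print(sorted(otu.ranks["genus"])[:5])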
Code example #19
def run(args):
    # add lineage profiles/stats

    import re
    from ete3 import PhyloTree, NCBITaxa

    # dump tree by default
    if not args.tree and not args.info and not args.descendants:
        args.tree = True

    ncbi = NCBITaxa()

    all_taxids = {}
    all_names = set()
    queries = []
    name2realname = {}  # filled below when --fuzzy matching is used
    name2score = {}  # filled below when --fuzzy matching is used

    if not args.search:
        log.error('Search terms should be provided (i.e. --search) ')
        sys.exit(-1)
    for n in args.search:
        queries.append(n)
        try:
            all_taxids[int(n)] = None
        except ValueError:
            all_names.add(n.strip())

    # translate names
    name2tax = ncbi.get_name_translator(all_names)
    all_taxids.update([(v[0], None) for v in list(name2tax.values())])  # each value is a list of taxids

    not_found_names = all_names - set(name2tax.keys())
    if args.fuzzy and not_found_names:
        log.warn("%s unknown names", len(not_found_names))
        for name in not_found_names:
            # enable extension loading
            tax, realname, sim = ncbi.get_fuzzy_name_translation(
                name, args.fuzzy)
            if tax:
                all_taxids[tax] = None
                name2tax[name] = tax
                name2realname[name] = realname
                name2score[name] = "Fuzzy:%0.2f" % sim

    if not_found_names:
        log.warn("[%s] could not be translated into taxids!" %
                 ','.join(not_found_names))

    if args.tree:
        if len(all_taxids) == 1:
            target_taxid = next(iter(all_taxids.keys()))
            log.info("Dumping NCBI descendants tree for %s" % (target_taxid))
            t = ncbi.get_descendant_taxa(
                target_taxid,
                collapse_subspecies=args.collapse_subspecies,
                rank_limit=args.rank_limit,
                return_tree=True)
        else:
            log.info("Dumping NCBI taxonomy of %d taxa..." % (len(all_taxids)))
            t = ncbi.get_topology(list(all_taxids.keys()),
                                  intermediate_nodes=args.full_lineage,
                                  rank_limit=args.rank_limit,
                                  collapse_subspecies=args.collapse_subspecies)

        id2name = ncbi.get_taxid_translator([n.name for n in t.traverse()])
        for n in t.traverse():
            n.add_features(taxid=n.name)
            n.add_features(sci_name=str(id2name.get(int(n.name), "?")))
            n.name = "%s - %s" % (id2name.get(int(n.name), n.name), n.name)
            lineage = ncbi.get_lineage(n.taxid)
            n.add_features(
                named_lineage='|'.join(ncbi.translate_to_names(lineage)))
        dump(t,
             features=[
                 "taxid", "name", "rank", "bgcolor", "sci_name",
                 "collapse_subspecies", "named_lineage"
             ])
    elif args.descendants:
        log.info("Dumping NCBI taxonomy of %d taxa..." % (len(all_taxids)))
        print('# ' + '\t'.join([
            "Taxid", "Sci.Name", "Rank", "descendant_taxids",
            "descendant_names"
        ]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid in all_taxids:
            descendants = ncbi.get_descendant_taxa(
                taxid,
                collapse_subspecies=args.collapse_subspecies,
                rank_limit=args.rank_limit)
            print('\t'.join([
                str(taxid),
                translator.get(taxid, taxid),
                ranks.get(taxid, ''), '|'.join(map(str, descendants)),
                '|'.join(map(str, ncbi.translate_to_names(descendants)))
            ]))

    elif args.info:
        print('# ' + '\t'.join(
            ["Taxid", "Sci.Name", "Rank", "Named Lineage", "Taxid Lineage"]))
        translator = ncbi.get_taxid_translator(all_taxids)
        ranks = ncbi.get_rank(all_taxids)
        for taxid, name in six.iteritems(translator):
            lineage = ncbi.get_lineage(taxid)
            named_lineage = ','.join(ncbi.translate_to_names(lineage))
            lineage_string = ','.join(map(str, lineage))
            print('\t'.join([
                str(taxid), name,
                ranks.get(taxid, ''), named_lineage, lineage_string
            ]))
Code example #20
#!/usr/bin/env python

import sys
import argparse
import logging
from collections import Counter, defaultdict
from ete3 import NCBITaxa

# Constants
ncbi = NCBITaxa(
    dbfile="/apps/etetoolkit/taxa.sqlite")  # location of ete3 database

# Parse and check arguments
parser = argparse.ArgumentParser(
    description='Parse eggNOG file and determine OGs')
parser.add_argument(
    '-min_occurence',
    metavar="PERCENT",
    action="store",
    type=float,
    default=0,
    help=
    "Minimum occurence (percent of genomes where gene is present); should be 0-100, default=0"
)
parser.add_argument(
    '-min_uniqueness',
    metavar="PERCENT",
    action="store",
    type=float,
    default=0,
    help=
    # the help text below is reconstructed from the flag name; the original listing is truncated here
    "Minimum uniqueness (percent); should be 0-100, default=0"
)
Code example #21
def contig_tax(annot_df, ncbi_db, min_prot, prop_annot, tax_thres):
    '''This function takes the annotation table generated by viral_contig_maps.py and generates a table that
       provides the taxonomic lineage of each viral contig, based on the corresponding ViPhOG annotations'''

    ncbi = NCBITaxa(dbfile=ncbi_db)
    tax_rank_order = ["genus", "subfamily", "family", "order"]
    contig_list = list(annot_df["Contig"].value_counts().index)
    df_rows = []

    def get_tax_rank(label):
        try:
            tax_id = ncbi.get_name_translator([label])[label]
            tax_rank = ncbi.get_rank(tax_id)[tax_id[0]]
        except (KeyError, IndexError):
            tax_rank = ""
        return tax_rank

    for contig in contig_list:
        assigned_taxa = []
        assigned_taxa.append(contig)
        contig_df = annot_df[annot_df["Contig"] == contig]
        filtered_df = contig_df[contig_df["Label"].notnull()]
        filtered_df = filtered_df.reset_index(drop=True)
        total_annot_prot = len(filtered_df)
        if total_annot_prot < max(min_prot, prop_annot * len(contig_df)):
            assigned_taxa.extend([""]*4)
        else:
            filtered_df["Rank"] = filtered_df["Label"].apply(get_tax_rank)
            for item in tax_rank_order:
                tax_hits = {}
                if item == "genus":
                    for row, column in filtered_df.iterrows():
                        if column["Rank"] == item:
                            if column["Label"] not in tax_hits.keys():
                                tax_hits[column["Label"]] = 1
                            else:
                                tax_hits[column["Label"]] += 1
                    if len(tax_hits) < 1:
                        assigned_taxa.append("")
                    else:
                        annot_ratio = max(tax_hits.items(), key=operator.itemgetter(1))[
                            1]/total_annot_prot
                        if annot_ratio < tax_thres:
                            assigned_taxa.append(str(annot_ratio))
                        else:
                            max_tax = []
                            for key, value in tax_hits.items():
                                if value == max(tax_hits.items(), key=operator.itemgetter(1))[1]:
                                    max_tax.append(key)
                            if len(max_tax) > 1:
                                assigned_taxa.append("-".join(max_tax))
                            else:
                                assigned_taxa.append(max_tax[0])
                else:
                    for row, column in filtered_df.iterrows():
                        if column["Rank"] == item:
                            if column["Label"] not in tax_hits.keys():
                                tax_hits[column["Label"]] = 1
                            else:
                                tax_hits[column["Label"]] += 1
                        else:
                            try:
                                name2taxid = ncbi.get_name_translator(
                                    [column["Label"]])
                                label_lineage = ncbi.get_lineage(
                                    name2taxid[column["Label"]][0])
                                lineage_names = ncbi.get_taxid_translator(
                                    label_lineage)
                                lineage_ranks = ncbi.get_rank(label_lineage)
                                if item in lineage_ranks.values():
                                    for x, y in lineage_ranks.items():
                                        if y == item:
                                            if lineage_names[x] not in tax_hits.keys():
                                                tax_hits[lineage_names[x]] = 1
                                            else:
                                                tax_hits[lineage_names[x]] += 1
                                            break
                            except Exception:
                                continue

                    if len(tax_hits) < 1:
                        assigned_taxa.append("")
                    else:
                        annot_ratio = max(tax_hits.items(), key=operator.itemgetter(1))[
                            1]/total_annot_prot
                        if annot_ratio < tax_thres:
                            assigned_taxa.append(str(annot_ratio))
                        else:
                            max_tax = []
                            for key, value in tax_hits.items():
                                if value == max(tax_hits.items(), key=operator.itemgetter(1))[1]:
                                    max_tax.append(key)
                            if len(max_tax) > 1:
                                assigned_taxa.append("-".join(max_tax))
                            else:
                                assigned_taxa.append(max_tax[0])
        df_rows.append(assigned_taxa)
    final_df = pd.DataFrame(
        df_rows, columns=["contig_ID", "genus", "subfamily", "family", "order"])
    return final_df
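
A usage sketch for contig_tax; the annotation table, database path, and thresholds are hypothetical, with pandas assumed to be imported as pd:

annot_df = pd.read_csv("viphog_annotations.tsv", sep="\t")
tax_df = contig_tax(annot_df, "/path/to/taxa.sqlite",
                    min_prot=2, prop_annot=0.1, tax_thres=0.6)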
Code example #22
File: mob_init.py  Project: yemilawal/mob-suite
def main():
    args = arguments()


    database_directory = os.path.abspath(args.database_directory)


    if not os.path.exists(database_directory):
        os.mkdir(database_directory)
    else:
        logger.info("Database directory folder already exists at {}".format(database_directory))

    # Helper function to simplify adding database_directory to everything
    prepend_db_dir = functools.partial(os.path.join, database_directory)

    lockfilepath=os.path.join(database_directory,".lock")
    status_file = prepend_db_dir('status.txt')

    if not os.path.exists(lockfilepath):
        try:
            open(file=lockfilepath, mode="w").close()
            logger.info("Placed lock file at {}".format(lockfilepath))
        except Exception as e:
            logger.error("Failed to place a lock file at {}. Database diretory can not be accessed. Wrong path?".format(lockfilepath))
            logger.error("{}".format(e))
            exit(-1)
    else:
        while os.path.exists(lockfilepath):
            elapsed_time = time.time() - os.path.getmtime(lockfilepath)
            logger.info("Lock file found at {}. Waiting for other processes to finish database init ...".format(lockfilepath))
            logger.info("Elapsed time {} min. Will continue processing after 16 min mark.".format(int(elapsed_time/60)))
            if elapsed_time >= 1000:
                logger.info("Elapsed time {} min. Assuming previous process completed all init steps. Continue ...".format(int(elapsed_time/60)))
                try: #if previous process failed, no processes are running and > 16 min passed since the lock was created
                    os.remove(lockfilepath)
                except: #continue if file was removed by other process
                    pass
                break
            time.sleep(60) #recheck every 1 min if lock file was removed
        logger.info("Lock file no longer exists. Assuming init process completed successfully")
        return 0



    logger.info('Initializing databases...this will take some time')
    # Find available threads and use the maximum number available for mash sketch but cap it at 32
    num_threads = min(multiprocessing.cpu_count(), 32)


    if not os.path.exists(database_directory):
        os.makedirs(database_directory)

    zip_file = prepend_db_dir('data.tar.gz')
    plasmid_database_fasta_file = prepend_db_dir('ncbi_plasmid_full_seqs.fas')
    repetitive_fasta_file = prepend_db_dir('repetitive.dna.fas')
    mash_db_file =  prepend_db_dir('ncbi_plasmid_full_seqs.fas.msh')

    logger.info('Downloading databases...this will take some time')

    for db_mirror in config['db_mirrors']:
        try:
            logger.info('Trying mirror {}'.format(db_mirror))
            download_to_file(db_mirror, zip_file)
            break
        except Exception as e:
            logger.error("Download failed with error {}. Removing lock file".format(str(e)))
            os.remove(lockfilepath)
            sys.exit(-1)


    logger.info("Downloading databases successful, now building databases at {}".format(database_directory))
    extract(zip_file, database_directory)

    files = [prepend_db_dir(f)
             for f in os.listdir(database_directory)
             if f.endswith('.gz')]

    for file in files:

        extract(file, database_directory)

    #Initialize blast and mash databases
    try:
        logger.info('Building repetitive mask database')
        blast_runner = BlastRunner(repetitive_fasta_file, database_directory)
        blast_runner.makeblastdb(repetitive_fasta_file, 'nucl',logger)

        logger.info('Building complete plasmid database')
        blast_runner = BlastRunner(plasmid_database_fasta_file, database_directory)
        blast_runner.makeblastdb(plasmid_database_fasta_file, 'nucl',logger,True)

        logger.info('Sketching complete plasmid database')
        mObj = mash()
        mObj.mashsketch(plasmid_database_fasta_file,
                        mash_db_file,
                        num_threads=num_threads)
    except Exception as e:
        logger.error('Downloading databases failed, please check your internet connection and retry')
        logger.error("Process failed with error {}. Removing lock file".format(e))
        os.remove(lockfilepath)
        sys.exit(-1)

    try:
        logger.info("Init ete3 library ...")
        ete3taxadbpath = os.path.abspath(os.path.join(database_directory,"taxa.sqlite"))
        ncbi = NCBITaxa()
        ncbi.dbfile=ete3taxadbpath
        ncbi.update_taxonomy_database()
    except Exception as e:
        logger.error("Init of ete3 library failed with error {}. Removing lock file".format(e))
        os.remove(lockfilepath)
        sys.exit(-1)

    try:
        os.remove(os.path.join(os.getcwd(), "taxdump.tar.gz"))
        logger.info("Removed residual taxdump.tar.gz as ete3 is not doing proper cleaning job.")
    except:
        pass

    with open(status_file, 'w') as f:
        download_date = datetime.datetime.today().strftime('%Y-%m-%d')
        f.write("Download date: {}. Removing lock file.".format(download_date))
        try:
            os.remove(lockfilepath)
        except:
            logger.warning("Lock file is already removed by some other process.")
            pass
    logger.info("MOB init completed successfully")
    return 0
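
Elsewhere on this page the database path is passed straight to the NCBITaxa constructor; the ete3 init step above could likely be simplified to the following sketch (same ete3taxadbpath as in the code):

ncbi = NCBITaxa(dbfile=ete3taxadbpath)
ncbi.update_taxonomy_database()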
Code example #23
File: taxa.py  Project: austinv11/ERC-Pipeline
def get_taxa_info() -> NCBITaxa:
    return NCBITaxa(dbfile=DEFAULT_TAXADB, taxdump_file=None if os.path.exists(DEFAULT_TAXADB) else TAXA_DUMP)
Code example #24
def analysis():
    args = setting()
    cwd = args.workdir #os.getcwd()
    ncbi = NCBITaxa()
    home = str(Path.home())
    pathogens = args.pathogens_species.split(",")
    file_combined_fastq = os.path.join(os.getcwd(), args.fastq)
    if not os.path.isfile(file_combined_fastq):
        fastq_files = [os.path.join(file_combined_fastq, f) for f in listdir(file_combined_fastq) if isfile(join(file_combined_fastq, f)) and f.endswith("fastq")]
        k = file_combined_fastq.rfind("/")
        file_combined_fastq = file_combined_fastq[:k] + ".fastq" + file_combined_fastq[k + 1:]
        with open(file_combined_fastq, 'wb') as wfd:
            for file in fastq_files:
                with open(file, 'rb') as fd:
                    shutil.copyfileobj(fd, wfd, 1024 * 1024 * 10)

    reads_fastq = []
    if file_combined_fastq.endswith("fastq") or file_combined_fastq.endswith("fq"):
        for record in SeqIO.parse(file_combined_fastq, "fastq"):
            reads_fastq.append(str(record.id))
    elif file_combined_fastq.endswith("fasta") or file_combined_fastq.endswith("fa"):
        for record in SeqIO.parse(file_combined_fastq, "fasta"):
            reads_fastq.append(str(record.id))
    else:
        print("Not known reads file format")

    number_reads = len(reads_fastq)

    if args.host_specie == "" and args.pathogens_species == "":
        species = ""
    elif args.host_specie == "" and not args.pathogens_species == "":
        species = pathogens
    elif not args.host_specie == "" and args.pathogens_species == "":
        species = [args.host_specie]
    else:
        species = [args.host_specie] + pathogens

    species.sort()
    name_database = "_".join(species).replace(" ", "_")
    genome_db = os.path.join(cwd, name_database + ".fasta")
    genome_db_id = os.path.join(cwd, name_database + ".txt")
    all_genomes = False
    if "refseq" in args.NCBIdatabase:
        table_file = "assembly_summary_refseq.txt"
    if "assembly" in args.NCBIdatabase:
        all_genomes = True
        table_file = "assembly_summary_genbank.txt"
    if os.path.exists(os.path.join(cwd,table_file)):
        os.remove(os.path.join(cwd,table_file))
    cmd = WGET % table_file
    wget = sb.Popen(cmd, shell=True, stdout=sb.PIPE, stderr=sb.PIPE, cwd=cwd)
    wget.communicate()



    sys.stdout.write("### UPDATING THE DATABASE\n")
    # This part checks for a new version of taxdump.tar.gz; the check runs once a month (when the month of the local copy differs from the current month).
    ete = os.path.expanduser("~/.etetoolkit/taxa.sqlite.traverse.pkl")
    modified = os.path.getmtime(ete)
    modificationTime = time.strftime('%m', time.localtime(modified))
    today = datetime.date.today()
    month = today.strftime("%m")
    if modificationTime != month:
        ncbi.update_taxonomy_database()
    dict_species = {}

    # Here we decide whether we are dealing with an unknown pathogen or already have an idea of which pathogens to investigate.
    with open(os.path.join(cwd, table_file), "r") as fh:
        descendants_all = []
        for specie in species:
            name2taxid = ncbi.get_name_translator([specie])
            if args.host_specie in specie:
                plant = name2taxid[specie]
            for key in name2taxid[specie]:
                descendants = ncbi.get_descendant_taxa(key, collapse_subspecies=True)
                for sstaxa in descendants:
                    descendants_all.append(str(sstaxa))
        for line in fh:
            if not line.startswith("#"):
                if line.split("\t")[6] in descendants_all:# and "subsp" in line:
                    ssname = " ".join([line.split("\t")[7].split(" ")[0], line.split("\t")[7].split(" ")[1]])
                    tax = line.split("\t")[6]
                    ftp = line.split("\t")[19]
                    genome = ftp.split("/")[-1] + "_genomic.fna.gz"
                    ftp_genome = os.path.join(ftp, genome)
                    path_genome = os.path.join(cwd, genome)
                    #species_assembly = " ".join([line.split("\t")[7]].split(" ")[0], [line.split("\t")[7]].split(" ")[1])
                    if ssname in dict_species:
                        dict_species[ssname] = dict_species[ssname] + [(ftp_genome, path_genome, tax, genome, ssname)]
                    else:
                        dict_species[ssname] = [(ftp_genome, path_genome, tax, genome, ssname)]
    db_file = os.path.join(home, ".db_monica." + name_database)
    if all_genomes:
        print("DOWNLOADING MULTIPLE GENOMES FOR THE SAME SPECIES")
        genomes_select = [name for specie in dict_species for name in dict_species[specie]]
    else:
        print("DOWNLOADING ONE GENOME FOR SPECIES")
        genomes_select = [dict_species[specie][-1] for specie in dict_species]
    print("I WILL DOWNLOAD %s GENOMES" % str(len(genomes_select)))
    if not os.path.exists(db_file) or not os.path.exists(genome_db):
        with open(genome_db, "w") as output_handle, open(genome_db_id, "w") as output_handle_id:
            with open(db_file, "w") as fh:
                for names in genomes_select:
                    ftp_genome, path_genome, tax, genome, ssname = names
                    if genome.startswith("GC"):
                        genome_used = cwd + genome + "\n"
                        fh.write(genome_used)
                        if not os.path.exists(path_genome):
                            cmd = WGET_GENOME % ftp_genome
                            wget_gen = sb.Popen(cmd, shell=True, stdout=sb.PIPE, stderr=sb.PIPE, cwd=cwd)
                            wget_gen.communicate()
                        with gzip.open(path_genome, "rt") as handle:
                            print("PARSING " + genome + " GENOME")
                            for record in SeqIO.parse(handle, "fasta"):
                                record.id = tax + "_" + str(record.id)
                                record.description = genome.split(".")[0]
                                SeqIO.write(record, output_handle, "fasta")
                                output_handle_id.write(str(record.name) + "%" + str(record.description) + "\n")

    sys.stdout.write("### PREPARING FOR MAPPING\n")
    genome_to_contig = {}
    with open(genome_db_id, "r") as fhtxt:
        for record in fhtxt: #txt SeqIO.parse(genome_db, "fasta"):
            line = record.split("%")
            genome_to_contig[line[0]] = line[1].rsplit()
    genome_to_species= {}
    with open(os.path.join(cwd, table_file), "r") as fh:
        for line in fh:
            line = line.rstrip().split("\t")
            genome = line[0].split(".")[0]
            if len(line) > 9 and not line[0].startswith("#"):
                subspecies = line[7].split(" ")[:2]
                subspecie = "_".join(subspecies) #+ " " + line[8].split("=")[1:]
                tribu = "_".join(line[8].split("=")[1:])
                genome_to_species[genome] = subspecie + "-" + tribu
    sam_output = file_combined_fastq + ".sam"
    cmd = MINIMAP % (str(args.threads), genome_db, file_combined_fastq, sam_output)
    sys.stdout.write("RUNNING MINIMAP2\n")
    minimap = sb.Popen(cmd, shell=True, cwd=cwd)
    minimap.communicate()
    reads_dict = {}
    count = 0
    with open(sam_output) as fh:
        for sam in fh:
            if sam != "" and not sam.startswith("@"):
                fields = sam.split("\t")
                if not fields[2] == "*":
                    for entry in fields:
                        if entry.startswith("MD"):
                            md = entry.split(":")[-1]
                            mismatch = len(re.findall("[A-Z]", md))
                            match = sum([int(number) for number in re.sub(r'[A-Z]|\^', ',', md).split(",") if number != "" and number.isdigit()])
                            if match > 0:
                                if mismatch > 0:
                                    iden = (match - mismatch) / match * 100
                                    if fields[0] in reads_dict:
                                        if iden == reads_dict[fields[0]][0]:
                                            if reads_dict[fields[0]][1].startswith(fields[2].split("_")[0]):
                                                continue
                                            else:
                                                count += 1
                                                reads_dict.pop(fields[0], None)
                                        elif iden > reads_dict[fields[0]][0]:
                                            reads_dict[fields[0]] = (iden, fields[2], fields[0])
                                    else:
                                        reads_dict[fields[0]] = (iden, fields[2], fields[0])
                                else:
                                    iden = 100
                                    if fields[0] in reads_dict:
                                        if iden == reads_dict[fields[0]][0]:
                                            if reads_dict[fields[0]][1].startswith(fields[2].split("_")[0]):
                                                continue
                                            else:
                                                count += 1
                                                reads_dict.pop(fields[0], None)
                                        elif iden > reads_dict[fields[0]][0]:
                                            reads_dict[fields[0]] = (iden, fields[2], fields[0])
                                    else:
                                        reads_dict[fields[0]] = (iden, fields[2], fields[0])

    out_file = file_combined_fastq + ".reads.txt"
    with open(out_file, "w") as csv:
        for key in reads_dict:
            csv.write("\t".join([reads_dict[key][1], reads_dict[key][2]]) + " \n")
    print(count)  # number of reads dropped because two genomes had equally good best hits
    count = {}
    number_reads_mapped = 0
    for read in reads_dict:
        match = reads_dict[read][1].split("_")
        if len(match) > 1:
            number_reads_mapped += 1
            if all_genomes:
                contig = match[1] #+ "_" + match[2]
            else:
                contig = match[1] + "_" + match[2]
            genome_map = genome_to_contig[contig]
            species_ss = genome_to_species[genome_map[0]]
            uniq_name = match[0] + "_" + species_ss
            if not uniq_name in count:
                count[uniq_name] = 1
            else:
                count[uniq_name] = count[uniq_name] + 1
    print("Name sample: " + file_combined_fastq)
    print("Number reads:" + str(number_reads))
    print("Number reads mapped:" + str(number_reads_mapped) + "\nPercentage of reads mapped:" + str(
        number_reads_mapped/number_reads * 100) + " %\n")
    header = []
    reads_mapped = []
    partial_tree = []
    for clade in types:
        header.append(clade[0])
        reads_mapped.append("")
    header.append("A")
    reads_mapped.append(str(number_reads-number_reads_mapped))
    total = [header] + [reads_mapped]
    tribu_dict = {}
    sorted_list = []
    for value in count:
        key = value.split("_")[0]
        if not str(key).startswith(str(plant[0])):
            sorted_list.append((value[1],(count[value]/number_reads_mapped*100)))
            lineage = ncbi.get_lineage(int(key))
            a = ncbi.get_rank(lineage)
            tribu = value.split("-")[1]
            tribu_dict["tribu"] = tribu
            tree = []
            for match in types:
                combination = [match[1]]
                if match[0] in tribu_dict:
                    combination.append("".join([tribu_dict[match[0]]]))
                else:
                    for tax in a:
                        if match[0].startswith(a[tax]) and match[0].endswith(a[tax]):
                            combination.append(ncbi.get_taxid_translator([int(tax)])[tax].replace(" ","_"))
                tree.append("".join(combination))
            tree.append(str(count[value]))
            partial_tree = partial_tree + [tree]
    partial_tree.sort()
    total = total + partial_tree
    out_file = file_combined_fastq + ".txt"
    with open(out_file, "w") as csv:
        for line in total:
            csv.write(",".join(line) + " \n")
    plot_circ(out_file, file_combined_fastq)
    print("done")
Code Example #25
def build_tree(seqs, taxa2acc, red_factor, root, log):
    # Important: you should update ETE DB before running this script.
    # This is done automatically only if it has not been downloaded yet.
    ncbi = NCBITaxa()
    taxa = []
    for s in seqs.values():
        try:
            taxa.append(s['taxid'])
        except KeyError:
            continue
    built = False
    while not built:
        try:
            t = ncbi.get_topology(taxa, intermediate_nodes=True)
            built = True
        except KeyError as e:
            taxid_not_found = int(e.args[0])
            taxa.remove(taxid_not_found)
            if log:
                print(
                    '[prophyle_ncbi_tree] ERROR: TaxID ' + str(taxid_not_found) +
                    ' not found in ETE DB (try updating it)', file=log
                )

    # [Issue #153] Ignore internal nodes with fasta associated till we find a solution for it
    if log:
        internal_with_fasta = 0
        for node in t.traverse('postorder'):
            if not node.is_leaf() and node.taxid in taxa:
                internal_with_fasta += len([acc for acc in taxa2acc[node.taxid] if acc in seqs.keys()])
        print(
            '[prophyle_ncbi_tree] ' + str(internal_with_fasta) + ' sequences' +
            ' ignored because they are associated with internal nodes (see issue #153)', file=log
        )
    leaves_taxa = [leaf.taxid for leaf in t if leaf.taxid in taxa2acc]
    t = ncbi.get_topology(leaves_taxa, intermediate_nodes=True)

    if red_factor:
        i = 0
        red_taxa = []
        for leaf in t:
            if i % red_factor == 0:
                red_taxa.append(leaf.taxid)
            i += 1
        t = ncbi.get_topology(red_taxa, intermediate_nodes=True)

    if root:
        taxa_to_keep = []
        for leaf in t:
            if root in leaf.named_lineage:
                taxa_to_keep.append(leaf.taxid)
        t = ncbi.get_topology(taxa_to_keep, intermediate_nodes=True)

    node_count = len(t.get_descendants()) + 1
    seq_count = 0

    for node in t.traverse('postorder'):
        node.name = node.taxid
        if node.is_leaf():
            first = True
            for acc in taxa2acc[node.taxid]:
                try:
                    s = seqs[acc]
                    if first:
                        accession = '@'.join([acc] * (s['offset'].count('@') + 1))
                        path = s['fn']
                        base_len = s['seqlen']
                        infasta_offset = s['offset']
                        first = False
                    else:
                        accession += ('@' + acc) * (s['offset'].count('@') + 1)
                        path += '@' + s['fn']
                        base_len += '@' + s['seqlen']
                        infasta_offset += '@' + s['offset']
                    seq_count += 1
                except KeyError:
                    pass
            node.add_features(path=path, base_len=base_len, infasta_offset=infasta_offset, accession=accession)

    if not hasattr(t, 'taxid'):
        t.add_features(taxid=0)
    t.name = t.taxid

    return t, seq_count, node_count
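For context, ncbi.get_topology returns the minimal ete3 tree connecting a set of taxids; it is what the retry loop above keeps calling while pruning taxids missing from the local database. A quick hedged example with three well-known taxids (9606 human, 10090 mouse, 9031 chicken):

from ete3 import NCBITaxa

ncbi = NCBITaxa()
# intermediate_nodes=True keeps the internal taxa on the paths between the leaves
tree = ncbi.get_topology([9606, 10090, 9031], intermediate_nodes=True)
print(tree.get_ascii(attributes=["sci_name", "taxid"]))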
Code Example #26
def plot_phylum_counts(domain_id,
                       rank='phylum',
                       colapse_low_species_counts=4,
                       remove_unlassified=True):
    '''
    1. get the phylum-level tree
    2. for each species, get its phylum
    3. build a phylum2count dictionary
    4. plot the counts as a barchart

    # merge eukaryotes into 5 main clades
    # merge viruses into a single clade

    ATTENTION: watch out for no-rank groups and no-rank species.
    '''

    import MySQLdb
    import os
    from chlamdb.biosqldb import manipulate_biosqldb
    from ete3 import NCBITaxa, Tree, TextFace, TreeStyle, StackedBarFace
    ncbi = NCBITaxa()

    sqlpsw = os.environ['SQLPSW']
    conn = MySQLdb.connect(
        host="localhost",  # your host, usually localhost
        user="******",  # your username
        passwd=sqlpsw,  # your password
        db="interpro")  # name of the data base
    cursor = conn.cursor()

    sql = 'select * from pfam.leaf2n_genomes_%s' % rank

    cursor.execute(sql, )
    leaf_taxon2n_species = manipulate_biosqldb.to_dict(cursor.fetchall())

    leaf_taxon2n_species_with_domain = get_domain_taxonomy(domain_id, rank)

    sql = 'select phylogeny from pfam.phylogeny where rank="%s"' % (rank)

    cursor.execute(sql, )

    tree = Tree(cursor.fetchall()[0][0], format=1)

    sql = 'select * from pfam.taxid2label_%s' % rank
    cursor.execute(sql, )

    taxon_id2scientific_name_and_rank = manipulate_biosqldb.to_dict(
        cursor.fetchall())
    taxon_id2scientific_name_and_rank = {
        str(k): v
        for k, v in taxon_id2scientific_name_and_rank.items()
    }

    tss = TreeStyle()
    tss.draw_guiding_lines = True
    tss.guiding_lines_color = "blue"

    keep = []
    for lf in tree.iter_leaves():
        # n genomes

        if remove_unlassified:
            label = taxon_id2scientific_name_and_rank[str(lf.name)][0]
            if 'unclassified' in label:
                continue

        n_genomes = int(leaf_taxon2n_species[lf.name])
        if n_genomes > colapse_low_species_counts:
            keep.append(lf.name)
    print('number of leaves:', len(keep))

    tree.prune(keep)

    header_list = ['Rank', 'N genomes', 'N with %s' % domain_id, 'Percentage']
    for col, header in enumerate(header_list):

        n = TextFace('%s' % (header))
        n.margin_top = 0
        n.margin_right = 1
        n.margin_left = 20
        n.margin_bottom = 1
        n.rotation = 270
        n.hz_align = 2
        n.vt_align = 2
        n.inner_background.color = "white"
        n.opacity = 1.
        tss.aligned_header.add_face(n, col)

    for lf in tree.iter_leaves():
        # n genomes

        n_genomes = int(leaf_taxon2n_species[lf.name])
        if n_genomes <= colapse_low_species_counts:
            continue

        n = TextFace('  %s ' % str(leaf_taxon2n_species[lf.name]))
        n.margin_top = 1
        n.margin_right = 1
        n.margin_left = 0
        n.margin_bottom = 1
        n.fsize = 7
        n.inner_background.color = "white"
        n.opacity = 1.
        lf.add_face(n, 2, position="aligned")

        # n genomes with domain
        m = TextFace('  %s ' % str(leaf_taxon2n_species_with_domain[lf.name]))
        m.margin_top = 1
        m.margin_right = 1
        m.margin_left = 0
        m.margin_bottom = 1
        m.fsize = 7
        m.inner_background.color = "white"
        m.opacity = 1.
        lf.add_face(m, 3, position="aligned")

        # rank
        ranks = ncbi.get_rank([lf.name])
        try:
            r = ranks[max(ranks.keys())]
        except Exception:
            r = '-'
        n = TextFace('  %s ' % r, fsize=14, fgcolor='red')
        n.margin_top = 1
        n.margin_right = 1
        n.margin_left = 0
        n.margin_bottom = 1
        n.fsize = 7
        n.inner_background.color = "white"
        n.opacity = 1.
        lf.add_face(n, 1, position="aligned")

        # percent with target domain
        percentage = (float(leaf_taxon2n_species_with_domain[lf.name]) /
                      float(leaf_taxon2n_species[lf.name])) * 100

        m = TextFace('  %s ' % str(round(percentage, 2)))
        m.margin_top = 1
        m.margin_right = 1
        m.margin_left = 0
        m.margin_bottom = 1
        m.fsize = 7
        m.inner_background.color = "white"
        m.opacity = 1.
        lf.add_face(m, 4, position="aligned")

        b = StackedBarFace([percentage, 100 - percentage],
                           width=100,
                           height=10,
                           colors=["#7fc97f", "white"])
        b.rotation = 0
        b.inner_border.color = "grey"
        b.inner_border.width = 0
        b.margin_right = 15
        b.margin_left = 0
        lf.add_face(b, 5, position="aligned")

        n = TextFace('%s' % taxon_id2scientific_name_and_rank[str(lf.name)][0],
                     fgcolor="black",
                     fsize=9)  # , fstyle = 'italic'

        lf.name = " %s (%s)" % (taxon_id2scientific_name_and_rank[str(
            lf.name)][0], str(lf.name))
        n.margin_right = 10
        lf.add_face(n, 0)

    tss.show_leaf_name = False

    for node in tree.traverse("postorder"):
        try:
            r = taxon_id2scientific_name_and_rank[str(node.name)][1]
        except:
            pass
        try:
            if r in ['phylum', 'superkingdom', 'class', 'subphylum'
                     ] or taxon_id2scientific_name_and_rank[str(
                         node.name)][0] in ['FCB group']:

                hola = TextFace(
                    "%s" %
                    (taxon_id2scientific_name_and_rank[str(node.name)][0]))
                node.add_face(hola, column=0, position="branch-top")
        except:
            pass
    return tree, tss
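plot_phylum_counts returns the annotated tree together with its TreeStyle but does not draw anything itself; a hypothetical call site (the Pfam accession PF00005 and the output filename are only illustrative, not taken from this script) would render it like this:

# Hypothetical usage of the function above; ete3's render() accepts a tree_style.
tree, tss = plot_phylum_counts('PF00005', rank='phylum')
tree.render('PF00005_phylum_counts.svg', tree_style=tss)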
Code Example #27
from ete3 import NCBITaxa


ncbiTaxa = NCBITaxa()
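Minimal as it is, this example triggers the most important side effect of NCBITaxa: on first instantiation it downloads the NCBI taxdump and builds a local SQLite database (by default under ~/.etetoolkit/taxa.sqlite), which every other example here queries. A hedged sketch of the typical follow-up calls:

from ete3 import NCBITaxa

ncbiTaxa = NCBITaxa()                  # first run: downloads taxdump.tar.gz, builds the local DB
# ncbiTaxa.update_taxonomy_database()  # refresh the local copy when NCBI publishes updates
name2taxid = ncbiTaxa.get_name_translator(["Homo sapiens"])  # {'Homo sapiens': [9606]}
lineage = ncbiTaxa.get_lineage(9606)                         # taxids from root to species
print(ncbiTaxa.get_taxid_translator(lineage))                # taxid -> scientific name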
Code Example #28
def get_rank_summary_statistics(rank='phylum'):
    '''
    Get the phylogeny from the NCBI taxonomy database for the taxon list stored in the table pfam.refseq_ref_repres_genomes.
    Store the rank-limited phylogeny in the table pfam.phylogeny.
    Calculate genome counts for each taxon at the specified rank and save taxid2count in the table pfam.leaf2n_genomes_<rank>.

    :param rank: taxonomic rank used to limit the phylogeny (e.g. 'phylum')
    :return: None (results are written to the database)
    '''

    import MySQLdb
    import os
    from ete3 import NCBITaxa, Tree, TextFace, TreeStyle, StackedBarFace
    ncbi = NCBITaxa()

    sqlpsw = os.environ['SQLPSW']
    conn = MySQLdb.connect(
        host="localhost",  # your host, usually localhost
        user="******",  # your username
        passwd=sqlpsw,  # your password
        db="pfam")  # name of the data base
    cursor = conn.cursor()

    sql = 'create table if not exists pfam.phylogeny (rank varchar(400), phylogeny TEXT)'
    cursor.execute(sql, )
    conn.commit()

    sql2 = 'CREATE table if not exists pfam.leaf2n_genomes_%s(taxon_id INT, n_genomes INT)' % rank
    cursor.execute(sql2, )
    conn.commit()

    sql_taxid_list = 'select taxid from pfam.refseq_ref_repres_genomes'
    cursor.execute(sql_taxid_list, )
    taxid_list = [i[0] for i in cursor.fetchall()]

    tree = ncbi.get_topology(taxid_list, rank_limit=rank)

    taxon_id_list = [int(i.name) for i in tree.traverse("postorder")]
    taxon_id2scientific_name = ncbi.get_taxid_translator(taxon_id_list)

    sql = 'CREATE table if not exists pfam.taxid2label_%s(taxon_id INT, scientific_name TEXT, rank TEXT)' % (
        rank)
    cursor.execute(sql, )

    taxon_id2rank = {}
    for taxon in taxon_id2scientific_name:
        ranks = ncbi.get_rank([taxon])

        try:
            r = ranks[max(ranks.keys())]
        except Exception:
            r = '-'
        taxon_id2rank[taxon] = r

    for taxon in taxon_id2scientific_name:
        sql = 'insert into taxid2label_%s values(%s, "%s", "%s")' % (
            rank, taxon, taxon_id2scientific_name[taxon], taxon_id2rank[taxon])

        cursor.execute(sql, )
    conn.commit()

    collapse = [
        'Opisthokonta', 'Alveolata', 'Amoebozoa', 'Stramenopiles',
        'Viridiplantae', 'Rhodophyta', 'Trypanosomatidae', 'Viruses',
        'unclassified Bacteria', 'Leptospiraceae',
        'unclassified Gammaproteobacteria', 'unclassified Alphaproteobacteria',
        'unclassified Epsilonproteobacteria',
        'unclassified Deltaproteobacteria',
        'unclassified Cyanobacteria (miscellaneous)',
        'unclassified Firmicutes sensu stricto',
        'unclassified Actinobacteria (class) (miscellaneous)',
        'unclassified Tissierellia', 'Dehalogenimonas'
    ]
    #def collapsed_leaf(node):
    #    collapse = ['Opisthokonta', 'Alveolata','Amoebozoa','Stramenopiles','Viridiplantae','Rhodophyta', 'Trypanosomatidae', 'Viruses']
    #    name = taxon_id2scientific_name[int(node.name)]
    #    if name in collapse:
    #       return True
    #    else:
    #       return False

    # collapse the major eukaryotic clades

    for node in tree.traverse("postorder"):
        name = taxon_id2scientific_name[int(node.name)]
        to_detach = []
        if name in collapse:
            to_detach.extend(node.children)
            print('ok-------------------', node.name)
        for n in to_detach:
            n.detach()
    leaves_list = [i.name for i in tree.iter_leaves()]
    leaf_taxon2n_species = {}
    leaf_taxon2n_species_with_domain = {}
    for leaf_taxon in leaves_list:
        print('leaf', leaf_taxon)
        leaf_taxon2n_species[leaf_taxon] = 0
        leaf_taxon2n_species_with_domain[leaf_taxon] = 0
        for taxon in taxid_list:
            lineage = ncbi.get_lineage(taxon)
            if int(leaf_taxon) in lineage:
                leaf_taxon2n_species[leaf_taxon] += 1
                #if taxon in taxid_with_domain_list:
                #    leaf_taxon2n_species_with_domain[leaf_taxon]+=1
    for leaf_taxon in leaf_taxon2n_species:
        sql = 'insert into pfam.leaf2n_genomes_%s values(%s, %s)' % (
            rank, leaf_taxon, leaf_taxon2n_species[leaf_taxon])
        cursor.execute(sql, )
    conn.commit()

    sql = 'insert into pfam.phylogeny values("%s","%s")' % (
        rank, tree.write(format=1))
    cursor.execute(sql, )
    conn.commit()
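One performance note on the counting loop above: ncbi.get_lineage(taxon) is recomputed for every (leaf, genome) pair even though a genome's lineage does not depend on the leaf. A minimal sketch of the same counts with lineages computed once, assuming the same ncbi, taxid_list and leaves_list:

# Compute each genome's lineage once, then count membership per leaf taxon.
lineages = {taxon: set(ncbi.get_lineage(taxon)) for taxon in taxid_list}
leaf_taxon2n_species = {
    leaf: sum(1 for lin in lineages.values() if int(leaf) in lin)
    for leaf in leaves_list
}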
Code Example #29
import os
import sys

from ete3 import NCBITaxa

ncbi = NCBITaxa()

# lineage = ncbi.get_lineage(9606)
# print (ncbi.get_rank(lineage))
# print (ncbi.get_taxid_translator(lineage))

acc_list = {}
with open(sys.argv[1]) as f:
    for line in f:
        val = line.strip()
        acc_list[val] = 1

levels = {
    'superkingdom': 1,
    'phylum': 1,
    'class': 1,
    'order': 1,
    'family': 1,
    'genus': 1,
    'species': 1
}
with open(sys.argv[2]) as f:
    for line in f:
        if line.startswith('#'):
            continue
        val = line.strip().split('\t')
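The script is truncated at this point, so what happens to val is not shown; as a hedged guess at the usual pattern, the levels dict above is commonly used to reduce a lineage to the seven canonical ranks:

# Hypothetical continuation: collapse a taxid's lineage to the canonical ranks.
taxid = 9606  # example only; the real script presumably extracts taxids from val
lineage = ncbi.get_lineage(taxid)
ranks = ncbi.get_rank(lineage)
names = ncbi.get_taxid_translator(lineage)
canonical = {ranks[t]: names[t] for t in lineage if ranks[t] in levels}
print(canonical)  # e.g. {'superkingdom': 'Eukaryota', ..., 'species': 'Homo sapiens'}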
Code Example #30
def __init__(self, category):
    self.ncbi = NCBITaxa()
    self.species = list(
        self.ncbi.get_descendant_taxa(category, collapse_subspecies=True))
    self.ranks = self.ncbi.get_rank(self.species)
    # filter() is lazy in Python 3; materialise it so the result can be reused
    self.taxas = list(
        filter(lambda x: self.ranks[x] == 'species', self.species))
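A hedged usage sketch for this constructor fragment: get_descendant_taxa accepts a scientific name or a taxid and returns the taxa beneath it, so a standalone equivalent of the body above (the family Felidae is just an illustrative input) looks like:

from ete3 import NCBITaxa

ncbi = NCBITaxa()
species = list(ncbi.get_descendant_taxa('Felidae', collapse_subspecies=True))
ranks = ncbi.get_rank(species)
species_only = [t for t in species if ranks[t] == 'species']
print(len(species_only))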