Code example #1
def activate_ncbi(update=True):
    print("\nActivating NCBI taxonomy database...")
    ncbi = NCBITaxa()
    if update:
        print("\tUpdating database...")
        ncbi.update_taxonomy_database()
    return ncbi
Code example #2
def get_taxonomy(species_name,
                 name_format="Genus species",
                 ranks=None,
                 update_db=False):
    species_name = str(species_name)
    ncbi = NCBITaxa()
    if update_db:
        ncbi.update_taxonomy_database()
    if name_format == "Genus species":
        species_name = species_name
    if name_format == "Genus_species":
        species_name = species_name.replace("_", " ")
    species_id = ncbi.get_name_translator([species_name])
    if len(species_id) == 0 and ranks is None:
        return ['unknown']
    if len(species_id) == 0 and ranks is not None:
        return ['unknown'] * len(ranks)
    lineage_ids = ncbi.get_lineage(species_id[species_name][0])
    names = ncbi.get_taxid_translator(lineage_ids)
    if ranks is None:
        return names
    lineage_rk = ncbi.get_rank(lineage_ids)
    parsed_names = []
    for rk in ranks:
        for rk_id, rk_rk in lineage_rk.items():
            if rk_rk == rk:
                parsed_names.append(ncbi.get_taxid_translator([rk_id])[rk_id])
    return parsed_names
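A minimal call sketch for the function above, assuming ete3 is installed and the local taxonomy database (by default ~/.etetoolkit/taxa.sqlite) has already been built; the species name and ranks are illustrative, not part of the original snippet:

from ete3 import NCBITaxa  # required by get_taxonomy()

# Full lineage as a {taxid: name} dict
print(get_taxonomy("Homo sapiens"))

# Only the requested ranks, accepting an underscore-separated name
print(get_taxonomy("Homo_sapiens",
                   name_format="Genus_species",
                   ranks=["family", "genus", "species"]))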
Code example #3
def initNCBI():
    """
    Build the database (if not built before), and update its contents
    :return: connection
    """
    ncbi = NCBITaxa()
    ncbi.update_taxonomy_database()
    return ncbi
Code example #4
def update_db(update):
    ncbi = NCBITaxa()
    message = "ete3 taxonomy database loaded\n"
    if update:
        ncbi.update_taxonomy_database()
        message = "ete3 taxonomy database updated\n"
    with open("db_update_status.txt", 'w') as f:
        f.write(message)
Code example #5
def main():
    """Make queries against NCBI Taxa databases"""
    # Get commandline args
    args = get_args()

    # Instantiate the ete NCBI taxa object
    ncbi = NCBITaxa()

    if args.verbose > 1:
        print("Taxa database is stored under ~/.etetoolkit/taxa.sqlite")

    # Update the database if required.
    if args.update:
        if args.verbose > 1:
            print(
                "Updating the taxonomy database. This may take several minutes..."
            )
        ncbi.update_taxonomy_database()

    # If a name was provided instead of a TaxID, convert and store it.
    if args.name:
        args.taxid = ncbi.get_name_translator([args.name])[args.name][0]

    if args.verbose > 0:
        tax_dict = {}
        # If a name was provided, simply add it to dict
        if args.name:
            tax_dict['Name'] = args.name
        # If not, do the opposite conversion to the above and store that
        else:
            tax_dict['Name'] = ncbi.get_taxid_translator([args.taxid])[args.taxid]

        # Continue to populate the taxa dict with other information
        tax_dict['TaxID'] = args.taxid
        tax_dict['Rank'] = ncbi.get_rank([args.taxid])
        tax_dict['Lineage'] = ncbi.get_taxid_translator(
            ncbi.get_lineage(args.taxid))

        print("Information about your selected taxa:")
        pretty(tax_dict)

    # Main feature of the script is to get all taxa within a given group.
    descendent_taxa = ncbi.get_descendant_taxa(args.taxid)
    descendent_taxa_names = ncbi.translate_to_names(descendent_taxa)
    print("Descendent taxa for TaxID: %s" % (args.taxid))

    # Under Python 3, zip is lazy (equivalent to izip). Under Python 2 this list could be
    # very large and memory-intensive, so running the script with Python 3 is suggested.
    if args.verbose > 0:
        for dtn, dt in zip(descendent_taxa_names, descendent_taxa):
            print("%s\t%s" % (dtn, dt))

    if args.outfile:
        with open(args.outfile, 'w') as ofh:
            for taxid in descendent_taxa:
                ofh.write(str(taxid) + '\n')
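The main() above also depends on get_args() and a pretty() helper that are not shown in the snippet. A hypothetical sketch of such a recursive pretty-printer (the name and behaviour are assumptions, not the original implementation):

def pretty(d, indent=0):
    # Print nested dicts one key per line, indenting one tab per level
    for key, value in d.items():
        if isinstance(value, dict):
            print("\t" * indent + str(key) + ":")
            pretty(value, indent + 1)
        else:
            print("\t" * indent + "{}: {}".format(key, value))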
Code example #6
def get_taxonomy(updateBool, spName):
    ncbi = NCBITaxa()

    #add update condition
    if updateBool:
        ncbi.update_taxonomy_database()

    #get only genus name
    genus = spName.partition('_')[0]

    name2taxid = ncbi.get_name_translator([genus])

    lineage = ncbi.get_lineage(name2taxid[genus][0])

    return lineage[2:]
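A hypothetical call of the function above (the species name is illustrative); note that only the genus part before the underscore is looked up:

lineage_taxids = get_taxonomy(False, "Arabidopsis_thaliana")
print(lineage_taxids)  # taxids of the genus lineage, with the first two levels stripped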
Code example #7
File: check_db_age.py  Project: wangdi2014/FANCy
def check_taxa_db_age(dbLocation, sqliteLoc):
    # if file doesn't exist, catch the error and run the update, as it will create the file.
    ncbi = NCBITaxa(sqliteLoc)

    try:
        filetime = datetime.fromtimestamp(path.getctime(dbLocation))
        one_month_ago = datetime.now() - timedelta(days=30)
        if filetime < one_month_ago:
            # File older than 1 month, update it:
            print('<> NCBITaxa Database older than 1 month, updating it <>')
            ncbi.update_taxonomy_database()
        else:
            print('<> NCBITaxa Database up to date <>')
    except:
        print("<> NCBITaxa Database didn't exist, downloading it <>")
        ncbi.update_taxonomy_database()
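check_taxa_db_age() relies on `from os import path`, `from datetime import datetime, timedelta` and `from ete3 import NCBITaxa` being in scope. A hypothetical call, assuming the default ete3 location is used for both the age check and the sqlite file:

from os import path

db = path.expanduser("~/.etetoolkit/taxa.sqlite")
check_taxa_db_age(db, db)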
Code example #8
def main(args):

    # STEP 1: Set up logger
    log = logging.getLogger(__name__)
    coloredlogs.install(fmt='%(asctime)s [%(levelname)s] %(message)s', level='DEBUG', logger=log)

    # STEP 2: Retrieve and/or update localized NCBI Taxonomy database
    ncbi = NCBITaxa()
    if (time.time() - os.path.getmtime(os.path.join(Path.home(), ".etetoolkit/taxa.sqlite"))) > 604800:
        ncbi.update_taxonomy_database()

    # STEP 3: Prune species-level tree to family-level

        # Step 3.1 Read tree from input file
    log.debug("Loading Tree...")
    t = Tree(args.infn, format=5)
        # STEP 3.2: Add species names to species_set_from_tree set
    log.debug("Gathering species(leaf) names...")
    species_set_from_tree = set()
    for leaf in t.iter_leaves():
        species_set_from_tree.add(leaf.name.replace("_"," "))
        # STEP 3.3: Assign species to families
    log.debug("Constructing dict of species in family...")
    species_in_family = get_species_in_family(species_set_from_tree, ncbi)
        # STEP 3.4: Prune the tree
    log.debug("Pruning Tree to family level...")
    prune_to_family(t, species_in_family)

    # STEP 4: Calculate counts of species per family and plastid genome entries per family and attach them to the tree leaves

        # STEP 4.1: Read plastid genome information from input table
    species_list_from_table = get_species_list_from_table(args.tablefn)
        # STEP 4.2: Count plastid genome entries per family
    log.debug("Counting plastid genome entries per family...")
    genome_count_per_family = get_genome_count_per_family(species_list_from_table, species_in_family)
        # STEP 4.3: Attach counts to tree leaves
    log.debug("Attaching counts to Tree...")
    attach_counts_to_tree(t, genome_count_per_family, get_species_count_per_family(species_in_family))

    # STEP 5: Set TreeStyle and render tree
    ts = TreeStyle()
    ts.mode = "c"
    ts.draw_guiding_lines = True
    ts.show_leaf_name = False
    log.debug("Rendering Tree...")
    t.render(args.outfn, w=10000, h=10000, tree_style=ts)
Code example #9
def main():
    """Make queries against NCBI Taxa databases
    """
    # Get commandline args		
    args = get_args()
	
    # Instantiate the ete NCBI taxa object
    ncbi = NCBITaxa(dbfile=args.database)
    ## dbfile location
    if args.verbose > 1:
        sys.stderr.write('Taxa database is stored at {}\n'.format(ncbi.dbfile))

    # Update the database if required.
    if args.update:
        if args.verbose > 1:
            msg = 'Updating the taxonomy database. This may take several minutes...\n'
            sys.stderr.write(msg)
        ncbi.update_taxonomy_database()
            
    # If names were provided in taxid list, convert to taxids
    args.taxid = args.taxid.replace('"', '').replace("'", '').split(',')
    args.taxid = name2taxid(args.taxid, ncbi)

    # Output
    if args.outfile is None:
        outFH = sys.stdout
    else:
        outFH = open(args.outfile, 'w')
    ## header
    if args.taxon_info:
        outFH.write('\t'.join(['name', 'taxid', 'rank', 'lineage']) + '\n')
    elif not args.just_taxids:
        outFH.write('\t'.join(['parent_taxid',
                               'descendent_taxid',
                               'descendent_name']) + '\n')
    ## body
    for taxid in args.taxid:
        if args.taxon_info:
            taxon_info(taxid, ncbi, outFH)
        else:
            desc_taxa(taxid, ncbi,  outFH, args.just_taxids)
            
    outFH.close()
Code example #10
ファイル: utils.py プロジェクト: yemilawal/mob-suite
def initETE3Database(database_directory, ETE3DBTAXAFILE, logging):
    lockfilepath = os.path.join(database_directory, ".lock")

    if not os.path.exists(lockfilepath):
        open(file=lockfilepath, mode="w").close()
        logging.info("Placed lock file at {}".format(lockfilepath))
    else:
        while os.path.exists(lockfilepath):
            elapsed_time = time.time() - os.path.getmtime(lockfilepath)
            logging.info("Lock file found at {}. Waiting for other processes to finish ete3 database init ...".format(
                lockfilepath))
            logging.info(
                "Elapsed time {} min. Will continue processing after 16 min mark.".format(int(elapsed_time / 60)))
            if elapsed_time >= 1000:
                logging.info(
                    "Elapsed time {} min. Assuming previous process completed all init steps. Continue ...".format(
                        int(elapsed_time / 60)))
                try:  # if previous process failed, no processes are running and > 16 min passed since the lock was created
                    os.remove(lockfilepath)
                except:  # continue if file was removed by other process
                    pass
                break
            time.sleep(60)  # recheck every 1 min if lock file was removed by other process
        logging.info("Lock file no longer exists. Assuming init process completed successfully")

    ncbi = NCBITaxa()
    ncbi.dbfile = ETE3DBTAXAFILE
    ncbi.update_taxonomy_database()

    try:
        os.remove(lockfilepath)
        logging.info("Lock file removed.")
    except:
        logging.warning("Lock file is already removed by some other process.")
        pass

    try:
        os.remove(os.path.join(os.getcwd(), "taxdump.tar.gz"))
        logging.info("Removed residual taxdump.tar.gz as ete3 is not doing proper cleaning job.")
    except:
        pass
    logging.info("ETE3 database init completed successfully.")
Code example #11
def setup_database(force_update=False):
    """ Setup a local sqllite copy of the NCBI Taxonomy database. If :obj:`force_update` is `False`, then
    only download the content from NCBI and build the sqllite database, if a local database doesn't already
    exist. If :obj:`force_update` is `True`, then always download the content from NCBI and rebuild the
    sqllite copy of the database.

    Args:
        force_update (:obj:`bool`, optional):

            * :obj:`False`: only download the content for the database and build a local sqllite database
                if a local sqllite copy of the database doesn't already exist
            * :obj:`True`: always download the content for the database from NCBI and rebuild a local sqllite
                database
    """
    ncbi_taxa = NCBITaxa()
    if force_update:
        # force downloading of latest content from NCBI and (re)building of local sqlite database
        ncbi_taxa.update_taxonomy_database()
    else:
        # run an operation on the local sqlite database to trigger NCBITaxa to set up a local sqlite
        # database if one doesn't already exist
        ncbi_taxa.get_descendant_taxa('Homo')
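A short usage sketch (assumes ete3 is installed; the first call downloads the NCBI taxdump archive only when no local copy of the database exists yet):

setup_database()                   # build the local copy only if it is missing
setup_database(force_update=True)  # always re-download and rebuild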
Code example #12
File: tax_collector.py  Project: samodha/tRep
def main(**args):
    out_base = args.get('out_loc')
    skip_scaffs = args.get('SkipScaffolds', False)
    skip_genes = args.get('SkipGenes', False)
    stb = args.get('scaffold2bin', None)
    update = args.get('update', False)

    if update:
        from ete3 import NCBITaxa
        ncbi = NCBITaxa()
        ncbi.update_taxonomy_database()

    # Make Tdb (gene_level taxonomy)
    Tdb = tRep.controller.convert_b6_to_Tdb(args, save=False)
    if not skip_genes:
        Tdb.to_csv(os.path.join(out_base) + '_fullGeneTaxonomy.tsv', \
            index=False, sep='\t')

    # Make genome level taxonomy
    if stb is None:
        pass
    else:
        if stb == 'ALL':
            Tdb['bin'] = 'genome'
        else:
            Tdb = tRep.add_bin_to_tdb(Tdb, stb)
        gdb = tRep.gen_taxonomy_table(Tdb, on='bin')
        gdb.to_csv(os.path.join(out_base) + '_fullGenomeTaxonomy.tsv', \
            index=False, sep='\t')

    # Make scaffold level taxonomy
    if not skip_scaffs:
        try:
            sdb = tRep.gen_taxonomy_table(Tdb, on='scaffold')
            sdb.to_csv(os.path.join(out_base) + '_fullScaffoldTaxonomy.tsv', \
                index=False, sep='\t')
        except:
            print('unable to parse scaffold information- skipping')
Code example #13
from ete3 import NCBITaxa
ncbi = NCBITaxa()
ncbi.update_taxonomy_database()
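Once updated, the same NCBITaxa object can be queried directly. A sketch of typical follow-up calls (not part of the original snippet; taxid 9606 is Homo sapiens):

name2taxid = ncbi.get_name_translator(["Homo sapiens"])  # {'Homo sapiens': [9606]}
lineage = ncbi.get_lineage(9606)                         # taxids from root to species
names = ncbi.get_taxid_translator(lineage)               # {taxid: scientific name}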
Code example #14
def analysis():
    args = setting()
    cwd = args.workdir #os.getcwd()
    ncbi = NCBITaxa()
    home = str(Path.home())
    pathogens = args.pathogens_species.split(",")
    file_combined_fastq = os.path.join(os.getcwd(), args.fastq)
    if not os.path.isfile(file_combined_fastq):
        fastq_files = [os.path.join(file_combined_fastq, f) for f in listdir(file_combined_fastq) if isfile(join(file_combined_fastq, f)) and f.endswith("fastq")]
        k = file_combined_fastq.rfind("/")
        file_combined_fastq = file_combined_fastq[:k] + ".fastq" + file_combined_fastq[k + 1:]
        with open(file_combined_fastq, 'wb') as wfd:
            for file in fastq_files:
                with open(file, 'rb') as fd:
                    shutil.copyfileobj(fd, wfd, 1024 * 1024 * 10)

    reads_fastq = []
    if file_combined_fastq.endswith("fastq") or file_combined_fastq.endswith("fq"):
        for record in SeqIO.parse(file_combined_fastq, "fastq"):
            reads_fastq.append(str(record.id))
    elif file_combined_fastq.endswith("fasta") or file_combined_fastq.endswith("fa"):
        for record in SeqIO.parse(file_combined_fastq, "fasta"):
            reads_fastq.append(str(record.id))
    else:
        print("Not known reads file format")

    number_reads = len(reads_fastq)

    if args.host_specie == "" and args.pathogens_species == "":
        species = ""
    elif args.host_specie == "" and not args.pathogens_species == "":
        species = pathogens
    elif not args.host_specie == "" and args.pathogens_species == "":
        species = [args.host_specie]
    else:
        species = [args.host_specie] + pathogens

    species.sort()
    name_database = "_".join(species).replace(" ", "_")
    genome_db = os.path.join(cwd, name_database + ".fasta")
    genome_db_id = os.path.join(cwd, name_database + ".txt")
    all_genomes = False
    if "refseq" in args.NCBIdatabase:
        table_file = "assembly_summary_refseq.txt"
    if "assembly" in args.NCBIdatabase:
        all_genomes = True
        table_file = "assembly_summary_genbank.txt"
    if os.path.exists(os.path.join(cwd,table_file)):
        os.remove(os.path.join(cwd,table_file))
    cmd = WGET % table_file
    wget = sb.Popen(cmd, shell=True, stdout=sb.PIPE, stderr=sb.PIPE, cwd=cwd)
    wget.communicate()



    sys.stdout.write("### UPDATING THE DATABASE\n")
    # This part checks for a new version of taxdump.tar.gz; the database is updated when its modification month differs from the current month
    ete = os.path.expanduser("~/.etetoolkit/taxa.sqlite.traverse.pkl")
    modified = os.path.getmtime(ete)
    modificationTime = time.strftime('%m', time.localtime(modified))
    today = datetime.date.today()
    month = today.strftime("%m")
    if modificationTime != month:
        ncbi.update_taxonomy_database()
    dict_species = {}

    # here we determine whether we are dealing with an unknown pathogen or we already have an idea of which pathogens to investigate
    with open(os.path.join(cwd, table_file), "r") as fh:
        descendants_all = []
        for specie in species:
            name2taxid = ncbi.get_name_translator([specie])
            if args.host_specie in specie:
                plant = name2taxid[specie]
            for key in name2taxid[specie]:
                descendants = ncbi.get_descendant_taxa(key, collapse_subspecies=True)
                for sstaxa in descendants:
                    descendants_all.append(str(sstaxa))
        for line in fh:
            if not line.startswith("#"):
                if line.split("\t")[6] in descendants_all:# and "subsp" in line:
                    ssname = " ".join([line.split("\t")[7].split(" ")[0], line.split("\t")[7].split(" ")[1]])
                    tax = line.split("\t")[6]
                    ftp = line.split("\t")[19]
                    genome = ftp.split("/")[-1] + "_genomic.fna.gz"
                    ftp_genome = os.path.join(ftp, genome)
                    path_genome = os.path.join(cwd, genome)
                    #species_assembly = " ".join([line.split("\t")[7]].split(" ")[0], [line.split("\t")[7]].split(" ")[1])
                    if ssname in dict_species:
                        dict_species[ssname] = dict_species[ssname] + [(ftp_genome, path_genome, tax, genome, ssname)]
                    else:
                        dict_species[ssname] = [(ftp_genome, path_genome, tax, genome, ssname)]
    db_file = os.path.join(home, ".db_monica." + name_database)
    if all_genomes:
        print("DOWNLOADING MULTIPLE GENOMES FOR THE SAME SPECIES")
        genomes_select = [name for specie in dict_species for name in dict_species[specie]]
    else:
        print("DOWNLOADING ONE GENOME FOR SPECIES")
        genomes_select = [dict_species[specie][-1] for specie in dict_species]
    print("I WILL DOWNLOAD %s GENOMES" % str(len(genomes_select)))
    if not os.path.exists(db_file) or not os.path.exists(genome_db):
        with open(genome_db, "w") as output_handle, open(genome_db_id, "w") as output_handle_id:
            with open(db_file, "w") as fh:
                for names in genomes_select:
                    ftp_genome, path_genome, tax, genome, ssname = names
                    if genome.startswith("GC"):
                        genome_used = cwd + genome + "\n"
                        fh.write(genome_used)
                        if not os.path.exists(path_genome):
                            cmd = WGET_GENOME % ftp_genome
                            wget_gen = sb.Popen(cmd, shell=True, stdout=sb.PIPE, stderr=sb.PIPE, cwd=cwd)
                            wget_gen.communicate()
                        with gzip.open(path_genome, "rt") as handle:
                            print("PARSING " + genome + " GENOME")
                            for record in SeqIO.parse(handle, "fasta"):
                                record.id = tax + "_" + str(record.id)
                                record.description = genome.split(".")[0]
                                SeqIO.write(record, output_handle, "fasta")
                                output_handle_id.write(str(record.name) + "%" + str(record.description) + "\n")

    sys.stdout.write("### PREPARING FOR MAPPING\n")
    genome_to_contig = {}
    with open(genome_db_id, "r") as fhtxt:
        for record in fhtxt: #txt SeqIO.parse(genome_db, "fasta"):
            line = record.split("%")
            genome_to_contig[line[0]] = line[1].rsplit()
    genome_to_species= {}
    with open(os.path.join(cwd, table_file), "r") as fh:
        for line in fh:
            line = line.rstrip().split("\t")
            genome = line[0].split(".")[0]
            if len(line) > 9 and not line[0].startswith("#"):
                subspecies = line[7].split(" ")[:2]
                subspecie = "_".join(subspecies) #+ " " + line[8].split("=")[1:]
                tribu = "_".join(line[8].split("=")[1:])
                genome_to_species[genome] = subspecie + "-" + tribu
    sam_output = file_combined_fastq + ".sam"
    cmd = MINIMAP % (str(args.threads), genome_db, file_combined_fastq, sam_output)
    sys.stdout.write("RUNNING MINIMAP2\n")
    minimap = sb.Popen(cmd, shell=True, cwd=cwd)
    minimap.communicate()
    reads_dict = {}
    count = 0
    with open(sam_output) as fh:
        for sam in fh:
            if sam != "" and not sam.startswith("@"):
                fields = sam.split("\t")
                if not fields[2] == "*":
                    for entry in fields:
                        if entry.startswith("MD"):
                            md = entry.split(":")[-1]
                            mismatch = len(re.findall("[A-Z]", md))
                            match = sum([int(number) for number in re.sub(r'[A-Z]|\^', ',', md).split(",") if number != "" and number.isdigit()])
                            if match > 0:
                                if mismatch > 0:
                                    iden = (match - mismatch) / match * 100
                                    if fields[0] in reads_dict:
                                        if iden == reads_dict[fields[0]][0]:
                                            if reads_dict[fields[0]][1].startswith(fields[2].split("_")[0]):
                                                continue
                                            else:
                                                count += 1
                                                reads_dict.pop(fields[0], None)
                                        elif iden > reads_dict[fields[0]][0]:
                                            reads_dict[fields[0]] = (iden, fields[2], fields[0])
                                    else:
                                        reads_dict[fields[0]] = (iden, fields[2], fields[0])
                                else:
                                    iden = 100
                                    if fields[0] in reads_dict:
                                        if iden == reads_dict[fields[0]][0]:
                                            if reads_dict[fields[0]][1].startswith(fields[2].split("_")[0]):
                                                continue
                                            else:
                                                count += 1
                                                reads_dict.pop(fields[0], None)
                                        elif iden > reads_dict[fields[0]][0]:
                                            reads_dict[fields[0]] = (iden, fields[2], fields[0])
                                    else:
                                        reads_dict[fields[0]] = (iden, fields[2], fields[0])

    out_file = file_combined_fastq + ".reads.txt"
    with open(out_file, "w") as csv:
        for key in reads_dict:
            csv.write("\t".join([reads_dict[key][1], reads_dict[key][2]]) + " \n")
    print(count)
    count = {}
    number_reads_mapped = 0
    for read in reads_dict:
        match = reads_dict[read][1].split("_")
        if len(match) > 1:
            number_reads_mapped += 1
            if all_genomes:
                contig = match[1] #+ "_" + match[2]
            else:
                contig = match[1] + "_" + match[2]
            genome_map = genome_to_contig[contig]
            species_ss = genome_to_species[genome_map[0]]
            uniq_name = match[0] + "_" + species_ss
            if not uniq_name in count:
                count[uniq_name] = 1
            else:
                count[uniq_name] = count[uniq_name] + 1
    print("Name sample: " + file_combined_fastq)
    print("Number reads:" + str(number_reads))
    print("Number reads mapped:" + str(number_reads_mapped) + "\nPercentage of reads mapped:" + str(
        number_reads_mapped/number_reads * 100) + " %\n")
    header = []
    reads_mapped = []
    partial_tree = []
    for clade in types:
        header.append(clade[0])
        reads_mapped.append("")
    header.append("A")
    reads_mapped.append(str(number_reads-number_reads_mapped))
    total = [header] + [reads_mapped]
    tribu_dict = {}
    sorted_list = []
    for value in count:
        key = value.split("_")[0]
        if not str(key).startswith(str(plant[0])):
            sorted_list.append((value[1],(count[value]/number_reads_mapped*100)))
            lineage = ncbi.get_lineage(int(key))
            a = ncbi.get_rank(lineage)
            tribu = value.split("-")[1]
            tribu_dict["tribu"] = tribu
            tree = []
            for match in types:
                combination = [match[1]]
                if match[0] in tribu_dict:
                    combination.append("".join([tribu_dict[match[0]]]))
                else:
                    for tax in a:
                        if match[0].startswith(a[tax]) and match[0].endswith(a[tax]):
                            combination.append(ncbi.get_taxid_translator([int(tax)])[tax].replace(" ","_"))
                tree.append("".join(combination))
            tree.append(str(count[value]))
            partial_tree = partial_tree + [tree]
    partial_tree.sort()
    total = total + partial_tree
    out_file = file_combined_fastq + ".txt"
    with open(out_file, "w") as csv:
        for line in total:
            csv.write(",".join(line) + " \n")
    plot_circ(out_file, file_combined_fastq)
    print("done")
Code example #15
parser.add_option('-s', '--species', dest="input_species_filename",
                  help='Species list in text format one species in each line')

parser.add_option('-f', '--format', type='choice', choices=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '100'], dest="format",
                  default='8', help='output format for tree')

parser.add_option('-t', '--treebest', type='choice', choices=['yes', 'no'], dest="treebest",
                  default='no', help='To be used in TreeBest')

parser.add_option('-d', '--database', type='choice', choices=['yes', 'no'], dest="database",
                  default='no', help='Update database')

options, args = parser.parse_args()

if options.database == "yes":
    try:
        ncbi.update_taxonomy_database()
    except:
        pass

if options.input_species_filename is None:
    raise Exception('-s option must be specified, Species list in text format one species in each line')

with open(options.input_species_filename) as f:
    species_name = [_.strip().replace('_', ' ') for _ in f.readlines()]

name2taxid = ncbi.get_name_translator(species_name)

taxid = [name2taxid[_][0] for _ in species_name]

tree = ncbi.get_topology(taxid)
Code example #16
File: mob_init.py  Project: phac-nml/mob-suite
def main():
    args = arguments()

    database_directory = os.path.abspath(args.database_directory)

    if not os.path.exists(database_directory):
        os.mkdir(database_directory)
    else:
        logger.info("Database directory folder already exists at {}".format(
            database_directory))

    # Helper function to simplify adding database_directory to everything
    prepend_db_dir = functools.partial(os.path.join, database_directory)

    lockfilepath = os.path.join(database_directory, ".lock")
    status_file = prepend_db_dir('status.txt')

    if not os.path.exists(lockfilepath):
        try:
            open(file=lockfilepath, mode="w").close()
            logger.info("Placed lock file at {}".format(lockfilepath))
        except Exception as e:
            logger.error(
                "Failed to place a lock file at {}. Database diretory can not be accessed. Wrong path?"
                .format(lockfilepath))
            logger.error("{}".format(e))
            pass
    else:
        while os.path.exists(lockfilepath):
            elapsed_time = time.time() - os.path.getmtime(lockfilepath)
            logger.info(
                "Lock file found at {}. Waiting for other processes to finish database init ..."
                .format(lockfilepath))
            logger.info(
                "Elapsed time {} min. Will continue processing after 16 min mark."
                .format(int(elapsed_time / 60)))
            if elapsed_time >= 1000:
                logger.info(
                    "Elapsed time {} min. Assuming previous process completed all init steps. Continue ..."
                    .format(int(elapsed_time / 60)))
                try:  #if previous process failed, no processes are running and > 16 min passed since the lock was created
                    os.remove(lockfilepath)
                except:  #continue if file was removed by other process
                    pass
                break
            time.sleep(60)  #recheck every 1 min if lock file was removed
        logger.info(
            "Lock file no longer exists. Assuming init process completed successfully"
        )
        return 0

    logger.info('Initializing databases...this will take some time')
    # Find available threads and use the maximum number available for mash sketch but cap it at 32
    num_threads = min(multiprocessing.cpu_count(), 32)

    if not os.path.exists(database_directory):
        os.makedirs(database_directory)

    zip_file = prepend_db_dir('data.tar.gz')
    plasmid_database_fasta_file = prepend_db_dir('ncbi_plasmid_full_seqs.fas')
    repetitive_fasta_file = prepend_db_dir('repetitive.dna.fas')
    mash_db_file = prepend_db_dir('ncbi_plasmid_full_seqs.fas.msh')

    logger.info('Downloading databases...this will take some time')

    for db_mirror in config['db_mirrors']:
        try:
            logger.info('Trying mirror {}'.format(db_mirror))
            download_to_file(db_mirror, zip_file)
            break
        except Exception as e:
            logger.error(
                "Download failed with error {}. Removing lock file".format(
                    str(e)))
            os.remove(lockfilepath)
            sys.exit(-1)

    logger.info(
        "Downloading databases successful, now building databases at {}".
        format(database_directory))
    extract(zip_file, database_directory)

    files = [
        prepend_db_dir(f) for f in os.listdir(database_directory)
        if f.endswith('.gz')
    ]

    for file in files:

        extract(file, database_directory)

    #Initialize blast and mash databases
    try:
        logger.info('Building repetitive mask database')
        blast_runner = BlastRunner(repetitive_fasta_file, database_directory)
        blast_runner.makeblastdb(repetitive_fasta_file, 'nucl', logger)

        logger.info('Building complete plasmid database')
        blast_runner = BlastRunner(plasmid_database_fasta_file,
                                   database_directory)
        blast_runner.makeblastdb(plasmid_database_fasta_file, 'nucl', logger,
                                 True)

        logger.info('Sketching complete plasmid database')
        mObj = mash()
        mObj.mashsketch(plasmid_database_fasta_file,
                        mash_db_file,
                        num_threads=num_threads)
    except Exception as e:
        logger.error(
            'Downloading databases failed, please check your internet connection and retry'
        )
        logger.error(
            "Process failed with error {}. Removing lock file".format(e))
        os.remove(lockfilepath)
        sys.exit(-1)

    try:
        logger.info("Init ete3 library ...")
        ete3taxadbpath = os.path.abspath(
            os.path.join(database_directory, "taxa.sqlite"))
        ncbi = NCBITaxa()
        ncbi.dbfile = ete3taxadbpath
        ncbi.update_taxonomy_database()
    except Exception as e:
        logger.error(
            "Init of ete3 library failed with error {}. Removing lock file".
            format(e))
        os.remove(lockfilepath)
        sys.exit(-1)

    try:
        os.remove(os.path.join(os.getcwd(), "taxdump.tar.gz"))
        logger.info(
            "Removed residual taxdump.tar.gz as ete3 does not clean it up properly."
        )
    except:
        pass

    with open(status_file, 'w') as f:
        download_date = datetime.datetime.today().strftime('%Y-%m-%d')
        f.write("Download date: {}. Removing lock file.".format(download_date))
        try:
            os.remove(lockfilepath)
        except:
            logger.warning(
                "Lock file is already removed by some other process.")
            pass

    logger.info("MOB init completed successfully")
    return 0
Code example #17
def main(args):

    ## STEP 1. Set up logger
    log = logging.getLogger(__name__)
    coloredlogs.install(fmt='%(asctime)s [%(levelname)s] %(message)s',
                        level='INFO',
                        logger=log)

    ## STEP 2. Initialize variables
    ncbi = NCBITaxa()
    # Update database if it is older than one month
    if (time.time() - os.path.getmtime(
            os.path.join(Path.home(), ".etetoolkit/taxa.sqlite"))) > 2592000:
        ncbi.update_taxonomy_database()
    blocklist = set()
    blocklist_existing = set()

    ## STEP 3. Read blocklist if the file already exists
    if os.path.isfile(args.file_blocklist):
        log.info("Reading existing blocklist ...")
        blocklist_existing = read_blocklist(args.file_blocklist)

    ## STEP 4a. Collect genus names of IRL clade of Fabaceae
    log.info("Fetching genus names of taxa in 'IRL clade' of Fabaceae ...")
    try:
        irl_clade_genera = get_irl_clade_genera(ncbi)
    except:
        irl_clade_genera = set()
    log.info("Adding new genus names to blocklist ...")
    blocklist = irl_clade_genera.union(blocklist_existing)

    ## STEP 4b. Collect species names of IRL clade of Fabaceae
    log.info("Fetching species names of taxa in 'IRL clade' of Fabaceae ...")
    try:
        irl_clade_species = get_irl_clade_species(ncbi)
    except:
        irl_clade_species = set()
    log.info("Adding new species names to blocklist ...")
    blocklist = irl_clade_species.union(blocklist)

    ## STEP 5. Conduct the search on NCBI PubMed
    if args.query and args.mail:
        log.info("Querying NCBI Pubmed for taxon names ...")
        try:
            irl_clade_genera = set()
            am = AM.ArticleMining(log)
            ei = EI.EntrezInteraction(log)
            if ei.internet_on():  # Check if internet connection active
                articles = ei.fetch_pubmed_articles(args.mail, args.query)
            else:  # If no internet connection, raise error
                raise Exception("ERROR: No internet connection.")
            ncbi = NCBITaxa()
            # Update database if it is older than 1 month
            if (time.time() - os.path.getmtime(
                    os.path.join(Path.home(),
                                 ".etetoolkit/taxa.sqlite"))) > 2592000:
                ncbi.update_taxonomy_database()
            article_genera = set()
            for article in articles:
                article_genera = article_genera.union(
                    am.get_genera_from_pubmed_article(article, ncbi))
        except:
            article_genera = set()
        blocklist = blocklist.union(article_genera)

    ## STEP 6. Keeping only species name if corresponding genus name present
    log.info("Removing genus names if individual species of genus in list ...")
    species_names = list()
    genus_names = set()
    genus_epithets = set()
    for line in blocklist:
        first, *rest = line.split()
        if rest:
            species_names.append(line)
            genus_epithets.add(first)
        else:
            genus_names.add(first)
    species_names.extend(genus_names - genus_epithets)
    blocklist = set(
        species_names)  # Making sure that no two lines are identical

    ## STEP 7. Write updated blocklist to file or replace old blocklist
    log.info("Writing updated blocklist to file ...")
    #append_blocklist(args.file_blocklist, blocklist)
    write_blocklist(args.file_blocklist, sorted(blocklist))
Code example #18
def main(args):

  # STEP 1. Set up logger
    log = logging.getLogger(__name__)
    if args.verbose:
        coloredlogs.install(fmt='%(asctime)s [%(levelname)s] %(message)s', level='DEBUG', logger=log)
    else:
        coloredlogs.install(fmt='%(asctime)s [%(levelname)s] %(message)s', level='INFO', logger=log)
    mail = args.mail
    query = args.query
    iro = ir_operations.IROperations(log)
    EI = entrez_interaction.EntrezInteraction(log)

  # STEP 2. Read in accession numbers to loop over
    tio = table_io.TableIO(args.infn, args.outfn, args.blocklist, logger = log)
    tio.remove_blocklisted_entries()

    accessions = list(tio.entry_table["ACCESSION"].values)
    if len(accessions) > 0:
        if not os.path.exists(args.recordsdir):
            os.makedirs(args.recordsdir)
        if not os.path.exists(args.datadir):
            os.makedirs(args.datadir)

  # STEP 3. Loop over accession in inlist
    for accession in accessions:
        acc_folder = os.path.join(args.datadir, str(accession))
        if not os.path.exists(acc_folder):
            os.makedirs(acc_folder)
        else:
            log.warning("Folder for accession `%s` already exists. Skipping this accession." % (str(accession)))
            continue

        # Step 3.1. Get flatfile
        if not os.path.isfile(os.path.join(args.recordsdir, accession + ".tar.gz")):
            log.info("Saving GenBank flat file for accession `%s`." % (str(accession)))
            if EI.internet_on():  # Check if internet connection active
                try:
                    fp_entry = EI.fetch_gb_entry(accession, acc_folder)
                except:
                    log.warning("Error retrieving accession `%s`. Skipping this accession." % (str(accession)))
                    os.rmdir(acc_folder)
                    continue
            else:  # If no internet connection, raise error
                raise Exception("ERROR: No internet connection.")
        else:
            log.info("GenBank flat file for accession `%s` already exists. Extracting existing file." % (str(accession)))
            tar = tarfile.open(os.path.join(args.recordsdir, accession + ".tar.gz"), "r:gz")
            tar.extractall(acc_folder)
            tar.close()
            fp_entry = os.path.join(acc_folder, accession + ".gb")

        # Step 3.2. Parse and analyze flatfile
        try:
            try:
                rec = SeqIO.read(fp_entry, "genbank")
            except Exception as err:
                raise Exception("Error while parsing record of accession `%s`: `%s`. Skipping this accession." %
                (str(accession), str(err)))

            rec_id = str(rec.id).split('.')[0]
            # Note: This internal check ensures that we are actually dealing with the record that was
            # intended to be downloaded via efetch.
            if not rec_id == str(accession):
                log.warning("Accession number mismatch. Expected: `%s`. Retrieved: `%s`. Skipping this accession." % \
                  (str(accession), rec_id))
                continue
            log.info("Writing sequence as FASTA for accession `%s`." % (str(accession)))
            iro.write_sequence_to_fasta(str(rec.seq), ">" + str(accession) + "_completeSequence", os.path.join(acc_folder, str(accession) + "_completeSeq.fasta"))

            ira_feature = None
            irb_feature = None
            if not str(accession) in tio.ir_table.index:
                tio.ir_table = tio.ir_table.append(pd.Series(name=str(accession), dtype='float64'))
            try:
                ira_feature, irb_feature = iro.identify_inverted_repeats(rec, 1000)
                rev_comp = False
                if ira_feature and irb_feature:
                    score_noRC = fuzz.ratio(ira_feature.extract(rec).seq,
                                            irb_feature.extract(rec).seq)
                    score_RC = fuzz.ratio(ira_feature.extract(rec).seq,
                                          irb_feature.extract(rec).seq.reverse_complement())
                    if score_noRC < score_RC:
                        rev_comp = True
                ir_info = iro.collect_info_from_features(ira_feature, irb_feature)
                tio.ir_table.loc[accession] = ir_info
                tio.append_ir_info_to_table(ir_info, accession, args.outfn)
            except Exception as err:
                ir_info = iro.collect_info_from_features(ira_feature, irb_feature)
                tio.ir_table.loc[accession] = ir_info
                tio.append_ir_info_to_table(ir_info, accession, args.outfn)
                raise Exception("Error while extracting IRs for accession `%s`: `%s`. Skipping further processing of this accession." % (str(accession), str(err)))
            tio.ir_table.loc[accession] = iro.collect_info_from_features(ira_feature, irb_feature)
            # TODO: Currently, nothing is done with the table object, since the file is written entry-wise. Remove full table from use?
            iro.write_irs_to_fasta(rec, ira_feature, irb_feature, acc_folder, rev_comp)
        except Exception as err:
            log.warning(str(err))
        finally:
            if not os.path.isfile(os.path.join(args.recordsdir, accession + ".tar.gz")):
                tar = tarfile.open(os.path.join(args.recordsdir, accession + ".tar.gz"), "w:gz")
                tar.add(fp_entry, os.path.basename(fp_entry))
                tar.close()
            os.remove(fp_entry)

  # STEP 4. Check every accession for IR loss in literature and remove from outlist if so published
    if EI.internet_on():  # Check if internet connection active
        am = article_mining.ArticleMining(log)
        articles = EI.fetch_pubmed_articles(mail, query)
        ncbi = NCBITaxa()
        # Update database if it is older than one month
        if (time.time() - os.path.getmtime(os.path.join(Path.home(), ".etetoolkit/taxa.sqlite"))) > 2592000:
            ncbi.update_taxonomy_database()
        article_genera = set()
        for article in articles:
            article_genera = article_genera.union(am.get_genera_from_pubmed_article(article, ncbi))
        tio.read_ir_table(args.outfn)
        tio.remove_naturally_irl_genera(article_genera)
        tio.write_ir_table(args.outfn)
    else:  # If no internet connection, skip article mining
        log.warning("No internet connection. Skipping PubMed article mining.")