def activate_ncbi(update=True):
    print("\nActivating NCBI taxonomy database...")
    ncbi = NCBITaxa()
    if update is True:
        print("\tUpdating database...")
        ncbi.update_taxonomy_database()
    return ncbi
def get_taxonomy(species_name, name_format="Genus species", ranks=None, update_db=False):
    species_name = str(species_name)
    ncbi = NCBITaxa()
    if update_db:
        ncbi.update_taxonomy_database()
    if name_format == "Genus_species":
        species_name = species_name.replace("_", " ")
    species_id = ncbi.get_name_translator([species_name])
    if len(species_id) == 0 and ranks is None:
        return ['unknown']
    if len(species_id) == 0 and ranks is not None:
        return ['unknown'] * len(ranks)
    lineage_ids = ncbi.get_lineage(species_id[species_name][0])
    names = ncbi.get_taxid_translator(lineage_ids)
    if ranks is None:
        return names
    lineage_rk = ncbi.get_rank(lineage_ids)
    parsed_names = []
    for rk in ranks:
        for rk_id, rk_rk in lineage_rk.items():
            if rk_rk == rk:
                parsed_names.append(ncbi.get_taxid_translator([rk_id])[rk_id])
    return parsed_names
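# Hedged usage sketch for get_taxonomy() above; the species names and rank list are
# illustrative assumptions, and a local ete3 taxonomy database (~/.etetoolkit/taxa.sqlite)
# is assumed to already exist so that no download is triggered.
if __name__ == "__main__":
    full_lineage = get_taxonomy("Homo sapiens")  # {taxid: name} dict for the whole lineage
    selected_ranks = get_taxonomy("Arabidopsis_thaliana", name_format="Genus_species",
                                  ranks=["family", "genus", "species"])
    print(full_lineage)
    print(selected_ranks)  # e.g. ['Brassicaceae', 'Arabidopsis', 'Arabidopsis thaliana']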
def initNCBI():
    """
    Build the database (if not built before), and update its contents.
    :return: NCBITaxa connection
    """
    ncbi = NCBITaxa()
    ncbi.update_taxonomy_database()
    return ncbi
def update_db(update):
    ncbi = NCBITaxa()
    message = "ete3 taxonomy database loaded\n"
    if update:
        ncbi.update_taxonomy_database()
        message = "ete3 taxonomy database updated\n"
    with open("db_update_status.txt", 'w') as f:
        f.write(message)
def main():
    """Make queries against NCBI Taxa databases"""
    # Get commandline args
    args = get_args()

    # Instantiate the ete NCBI taxa object
    ncbi = NCBITaxa()
    if args.verbose > 1:
        print("Taxa database is stored under ~/.etetoolkit/taxa.sqlite")

    # Update the database if required.
    if args.update is True:
        if args.verbose > 1:
            print("Updating the taxonomy database. This may take several minutes...")
        ncbi.update_taxonomy_database()

    # If a name was provided instead of a TaxID, convert and store it.
    if args.name:
        args.taxid = ncbi.get_name_translator([args.name])[args.name][0]

    if args.verbose > 0:
        tax_dict = {}
        # If a name was provided, simply add it to the dict
        if args.name:
            tax_dict['Name'] = args.name
        # If not, do the opposite conversion to the above and store that
        else:
            tax_dict['Name'] = ncbi.get_taxid_translator([args.taxid])[args.taxid]

        # Continue to populate the taxa dict with other information
        tax_dict['TaxID'] = args.taxid
        tax_dict['Rank'] = ncbi.get_rank([args.taxid])
        tax_dict['Lineage'] = ncbi.get_taxid_translator(ncbi.get_lineage(args.taxid))

        print("Information about your selected taxa:")
        pretty(tax_dict)

    # Main feature of the script is to get all taxa within a given group.
    descendent_taxa = ncbi.get_descendant_taxa(args.taxid)
    descendent_taxa_names = ncbi.translate_to_names(descendent_taxa)
    print("Descendent taxa for TaxID: %s" % (args.taxid))

    # Under Python 3, zip is lazy (like izip). Under Python 2 this list could be very
    # large and memory-intensive, so the script is best run with Python 3.
    if args.verbose > 0:
        for dtn, dt in zip(descendent_taxa_names, descendent_taxa):
            print("%s\t%s" % (dtn, dt))

    if args.outfile:
        with open(args.outfile, 'w') as ofh:
            for id in descendent_taxa:
                ofh.write(str(id) + '\n')
def get_taxonomy(updateBool, spName):
    ncbi = NCBITaxa()
    # Update the database if requested
    if updateBool is True:
        ncbi.update_taxonomy_database()
    # Use only the genus name
    genus = spName.partition('_')[0]
    name2taxid = ncbi.get_name_translator([genus])
    lineage = ncbi.get_lineage(name2taxid[genus][0])
    return lineage[2:]
def check_taxa_db_age(dbLocation, sqliteLoc):
    # If the file doesn't exist, catch the error and run the update, which will create it.
    ncbi = NCBITaxa(sqliteLoc)
    try:
        filetime = datetime.fromtimestamp(path.getctime(dbLocation))
        one_month_ago = datetime.now() - timedelta(days=30)
        if filetime < one_month_ago:
            # File older than 1 month, update it:
            print('<> NCBITaxa Database older than 1 month, updating it <>')
            ncbi.update_taxonomy_database()
        else:
            print('<> NCBITaxa Database up to date <>')
    except OSError:
        print("<> NCBITaxa Database didn't exist, downloading it <>")
        ncbi.update_taxonomy_database()
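# Hedged usage sketch for check_taxa_db_age(); the path below is an illustrative
# assumption matching ete3's default database location.
from pathlib import Path

taxa_sqlite = str(Path.home() / ".etetoolkit" / "taxa.sqlite")
check_taxa_db_age(dbLocation=taxa_sqlite, sqliteLoc=taxa_sqlite)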
def main(args):
    # STEP 1: Set up logger
    log = logging.getLogger(__name__)
    coloredlogs.install(fmt='%(asctime)s [%(levelname)s] %(message)s', level='DEBUG', logger=log)

    # STEP 2: Retrieve and/or update localized NCBI Taxonomy database
    ncbi = NCBITaxa()
    # Update the database if it is older than one week (604800 seconds)
    if (time.time() - os.path.getmtime(os.path.join(Path.home(), ".etetoolkit/taxa.sqlite"))) > 604800:
        ncbi.update_taxonomy_database()

    # STEP 3: Prune species-level tree to family level
    # STEP 3.1: Read tree from input file
    log.debug("Loading Tree...")
    t = Tree(args.infn, format=5)

    # STEP 3.2: Add species names to species_set_from_tree set
    log.debug("Gathering species (leaf) names...")
    species_set_from_tree = set()
    for leaf in t.iter_leaves():
        species_set_from_tree.add(leaf.name.replace("_", " "))

    # STEP 3.3: Assign species to families
    log.debug("Constructing dict of species in family...")
    species_in_family = get_species_in_family(species_set_from_tree, ncbi)

    # STEP 3.4: Prune the tree
    log.debug("Pruning Tree to family level...")
    prune_to_family(t, species_in_family)

    # STEP 4: Calculate counts of species per family and plastid genome entries per family
    #         and attach them to the tree leaves
    # STEP 4.1: Read plastid genome information from input table
    species_list_from_table = get_species_list_from_table(args.tablefn)

    # STEP 4.2: Count plastid genome entries per family
    log.debug("Counting plastid genome entries per family...")
    genome_count_per_family = get_genome_count_per_family(species_list_from_table, species_in_family)

    # STEP 4.3: Attach counts to tree leaves
    log.debug("Attaching counts to Tree...")
    attach_counts_to_tree(t, genome_count_per_family, get_species_count_per_family(species_in_family))

    # STEP 5: Set TreeStyle and render tree
    ts = TreeStyle()
    ts.mode = "c"
    ts.draw_guiding_lines = True
    ts.show_leaf_name = False
    log.debug("Rendering Tree...")
    t.render(args.outfn, w=10000, h=10000, tree_style=ts)
def main():
    """Make queries against NCBI Taxa databases"""
    # Get commandline args
    args = get_args()

    # Instantiate the ete NCBI taxa object
    ncbi = NCBITaxa(dbfile=args.database)  # dbfile location
    if args.verbose > 1:
        sys.stderr.write('Taxa database is stored at {}\n'.format(ncbi.dbfile))

    # Update the database if required.
    if args.update is True:
        if args.verbose > 1:
            msg = 'Updating the taxonomy database. This may take several minutes...\n'
            sys.stderr.write(msg)
        ncbi.update_taxonomy_database()

    # If names were provided in the taxid list, convert them to taxids
    args.taxid = args.taxid.replace('"', '').replace("'", '').split(',')
    args.taxid = name2taxid(args.taxid, ncbi)

    # Output
    if args.outfile is None:
        outFH = sys.stdout
    else:
        outFH = open(args.outfile, 'w')
    ## header
    if args.taxon_info:
        outFH.write('\t'.join(['name', 'taxid', 'rank', 'lineage']) + '\n')
    elif not args.just_taxids:
        outFH.write('\t'.join(['parent_taxid', 'descendent_taxid', 'descendent_name']) + '\n')
    ## body
    for taxid in args.taxid:
        if args.taxon_info:
            taxon_info(taxid, ncbi, outFH)
        else:
            desc_taxa(taxid, ncbi, outFH, args.just_taxids)

    outFH.close()
def initETE3Database(database_directory, ETE3DBTAXAFILE, logging):
    lockfilepath = os.path.join(database_directory, ".lock")

    if not os.path.exists(lockfilepath):
        open(file=lockfilepath, mode="w").close()
        logging.info("Placed lock file at {}".format(lockfilepath))
    else:
        while os.path.exists(lockfilepath):
            elapsed_time = time.time() - os.path.getmtime(lockfilepath)
            logging.info("Lock file found at {}. Waiting for other processes to finish ete3 database init ...".format(lockfilepath))
            logging.info("Elapsed time {} min. Will continue processing after 16 min mark.".format(int(elapsed_time / 60)))
            if elapsed_time >= 1000:
                logging.info("Elapsed time {} min. Assuming previous process completed all init steps. Continue ...".format(int(elapsed_time / 60)))
                try:
                    # If the previous process failed, no processes are running, and > 16 min
                    # have passed since the lock was created, remove the stale lock.
                    os.remove(lockfilepath)
                except OSError:
                    # Continue if the file was removed by another process
                    pass
                break
            time.sleep(60)  # recheck every 1 min whether the lock file was removed by another process
        logging.info("Lock file no longer exists. Assuming init process completed successfully")

    ncbi = NCBITaxa()
    ncbi.dbfile = ETE3DBTAXAFILE
    ncbi.update_taxonomy_database()

    try:
        os.remove(lockfilepath)
        logging.info("Lock file removed.")
    except OSError:
        logging.warning("Lock file is already removed by some other process.")

    try:
        os.remove(os.path.join(os.getcwd(), "taxdump.tar.gz"))
        logging.info("Removed residual taxdump.tar.gz as ete3 is not doing proper cleaning job.")
    except OSError:
        pass

    logging.info("ETE3 database init completed successfully.")
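# Hedged usage sketch for initETE3Database(); the database directory is an illustrative
# assumption, and the standard `logging` module stands in for the logger-like object
# the function expects.
import os
import logging as example_logging

example_db_dir = "/tmp/ete3_db"  # hypothetical directory
os.makedirs(example_db_dir, exist_ok=True)
initETE3Database(example_db_dir, os.path.join(example_db_dir, "taxa.sqlite"), example_logging)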
def setup_database(force_update=False):
    """ Set up a local SQLite copy of the NCBI Taxonomy database. If :obj:`force_update` is `False`,
    then only download the content from NCBI and build the SQLite database if a local database
    doesn't already exist. If :obj:`force_update` is `True`, then always download the content
    from NCBI and rebuild the SQLite copy of the database.

    Args:
        force_update (:obj:`bool`, optional):
            * :obj:`False`: only download the content for the database and build a local SQLite
              database if a local SQLite copy of the database doesn't already exist
            * :obj:`True`: always download the content for the database from NCBI and rebuild
              a local SQLite database
    """
    ncbi_taxa = NCBITaxa()

    if force_update:
        # force downloading of the latest content from NCBI and (re)building of the local SQLite database
        ncbi_taxa.update_taxonomy_database()
    else:
        # run an operation on the local SQLite database to trigger NCBITaxa to set up a local
        # SQLite database if one doesn't already exist
        ncbi_taxa.get_descendant_taxa('Homo')
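# Hedged usage sketch for setup_database(); both calls may download several hundred
# megabytes of taxonomy data from NCBI.
setup_database()                   # build the local copy only if it does not exist yet
setup_database(force_update=True)  # always re-download and rebuild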
def main(**args):
    out_base = args.get('out_loc')
    skip_scaffs = args.get('SkipScaffolds', False)
    skip_genes = args.get('SkipGenes', False)
    stb = args.get('scaffold2bin', None)
    update = args.get('update', False)

    if update:
        from ete3 import NCBITaxa
        ncbi = NCBITaxa()
        ncbi.update_taxonomy_database()

    # Make Tdb (gene-level taxonomy)
    Tdb = tRep.controller.convert_b6_to_Tdb(args, save=False)
    if not skip_genes:
        Tdb.to_csv(os.path.join(out_base) + '_fullGeneTaxonomy.tsv',
                   index=False, sep='\t')

    # Make genome-level taxonomy
    if stb is None:
        pass
    else:
        if stb == 'ALL':
            Tdb['bin'] = 'genome'
        else:
            Tdb = tRep.add_bin_to_tdb(Tdb, stb)
        gdb = tRep.gen_taxonomy_table(Tdb, on='bin')
        gdb.to_csv(os.path.join(out_base) + '_fullGenomeTaxonomy.tsv',
                   index=False, sep='\t')

    # Make scaffold-level taxonomy
    if not skip_scaffs:
        try:
            sdb = tRep.gen_taxonomy_table(Tdb, on='scaffold')
            sdb.to_csv(os.path.join(out_base) + '_fullScaffoldTaxonomy.tsv',
                       index=False, sep='\t')
        except Exception:
            print('unable to parse scaffold information - skipping')
from ete3 import NCBITaxa

ncbi = NCBITaxa()
ncbi.update_taxonomy_database()
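# Hedged sketch of a conditional variant of the snippet above: trigger the (slow)
# download only when the local database is missing or older than 30 days. The default
# ete3 location ~/.etetoolkit/taxa.sqlite is an assumption.
import os
import time
from pathlib import Path

from ete3 import NCBITaxa

ncbi = NCBITaxa()  # builds the database on first use if it is missing
db_path = Path.home() / ".etetoolkit" / "taxa.sqlite"
if not db_path.exists() or (time.time() - os.path.getmtime(db_path)) > 30 * 86400:
    ncbi.update_taxonomy_database()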
def analysis():
    args = setting()
    cwd = args.workdir  # os.getcwd()
    ncbi = NCBITaxa()
    home = str(Path.home())
    pathogens = args.pathogens_species.split(",")
    file_combined_fastq = os.path.join(os.getcwd(), args.fastq)
    if not os.path.isfile(file_combined_fastq):
        fastq_files = [os.path.join(file_combined_fastq, f) for f in listdir(file_combined_fastq)
                       if isfile(join(file_combined_fastq, f)) and f.endswith("fastq")]
        k = file_combined_fastq.rfind("/")
        file_combined_fastq = file_combined_fastq[:k] + ".fastq" + file_combined_fastq[k + 1:]
        with open(file_combined_fastq, 'wb') as wfd:
            for file in fastq_files:
                with open(file, 'rb') as fd:
                    shutil.copyfileobj(fd, wfd, 1024 * 1024 * 10)
    reads_fastq = []
    if file_combined_fastq.endswith("fastq") or file_combined_fastq.endswith("fq"):
        for record in SeqIO.parse(file_combined_fastq, "fastq"):
            reads_fastq.append(str(record.id))
    elif file_combined_fastq.endswith("fasta") or file_combined_fastq.endswith("fa"):
        for record in SeqIO.parse(file_combined_fastq, "fasta"):
            reads_fastq.append(str(record.id))
    else:
        print("Unknown reads file format")
    number_reads = len(reads_fastq)
    if args.host_specie == "" and args.pathogens_species == "":
        species = ""
    elif args.host_specie == "" and not args.pathogens_species == "":
        species = pathogens
    elif not args.host_specie == "" and args.pathogens_species == "":
        species = [args.host_specie]
    else:
        species = [args.host_specie] + pathogens
    species.sort()
    name_database = "_".join(species).replace(" ", "_")
    genome_db = os.path.join(cwd, name_database + ".fasta")
    genome_db_id = os.path.join(cwd, name_database + ".txt")
    all_genomes = False
    if "refseq" in args.NCBIdatabase:
        table_file = "assembly_summary_refseq.txt"
    if "assembly" in args.NCBIdatabase:
        all_genomes = True
        table_file = "assembly_summary_genbank.txt"
    if os.path.exists(os.path.join(cwd, table_file)):
        os.remove(os.path.join(cwd, table_file))
    cmd = WGET % table_file
    wget = sb.Popen(cmd, shell=True, stdout=sb.PIPE, stderr=sb.PIPE, cwd=cwd)
    wget.communicate()
    sys.stdout.write("### UPDATING THE DATABASE\n")
    # This part checks for a new version of taxdump.tar.gz; the update is triggered at most
    # once per month (the modification month of the local file is compared with the current month).
    ete = os.path.expanduser("~/.etetoolkit/taxa.sqlite.traverse.pkl")
    modified = os.path.getmtime(ete)
    modificationTime = time.strftime('%m', time.localtime(modified))
    today = datetime.date.today()
    month = today.strftime("%m")
    if modificationTime != month:
        ncbi.update_taxonomy_database()
    dict_species = {}
    # Here we determine whether we are dealing with an unknown pathogen or have an idea
    # of which pathogen(s) to investigate
    with open(os.path.join(cwd, table_file), "r") as fh:
        descendants_all = []
        for specie in species:
            name2taxid = ncbi.get_name_translator([specie])
            if args.host_specie in specie:
                plant = name2taxid[specie]
            for key in name2taxid[specie]:
                descendants = ncbi.get_descendant_taxa(key, collapse_subspecies=True)
                for sstaxa in descendants:
                    descendants_all.append(str(sstaxa))
        for line in fh:
            if not line.startswith("#"):
                if line.split("\t")[6] in descendants_all:  # and "subsp" in line:
                    ssname = " ".join([line.split("\t")[7].split(" ")[0], line.split("\t")[7].split(" ")[1]])
                    tax = line.split("\t")[6]
                    ftp = line.split("\t")[19]
                    genome = ftp.split("/")[-1] + "_genomic.fna.gz"
                    ftp_genome = os.path.join(ftp, genome)
                    path_genome = os.path.join(cwd, genome)
                    # species_assembly = " ".join([line.split("\t")[7]].split(" ")[0], [line.split("\t")[7]].split(" ")[1])
                    if ssname in dict_species:
                        dict_species[ssname] = dict_species[ssname] + [(ftp_genome, path_genome, tax, genome, ssname)]
                    else:
                        dict_species[ssname] = [(ftp_genome, path_genome, tax, genome, ssname)]
    db_file = os.path.join(home, ".db_monica." + name_database)
    if all_genomes:
        print("DOWNLOADING MULTIPLE GENOMES FOR THE SAME SPECIES")
        genomes_select = [name for specie in dict_species for name in dict_species[specie]]
    else:
        print("DOWNLOADING ONE GENOME PER SPECIES")
        genomes_select = [dict_species[specie][-1] for specie in dict_species]
    print("I WILL DOWNLOAD %s GENOMES" % str(len(genomes_select)))
    if not os.path.exists(db_file) or not os.path.exists(genome_db):
        with open(genome_db, "w") as output_handle, open(genome_db_id, "w") as output_handle_id:
            with open(db_file, "w") as fh:
                for names in genomes_select:
                    ftp_genome, path_genome, tax, genome, ssname = names
                    if genome.startswith("GC"):
                        genome_used = cwd + genome + "\n"
                        fh.write(genome_used)
                        if not os.path.exists(path_genome):
                            cmd = WGET_GENOME % ftp_genome
                            wget_gen = sb.Popen(cmd, shell=True, stdout=sb.PIPE, stderr=sb.PIPE, cwd=cwd)
                            wget_gen.communicate()
                        with gzip.open(path_genome, "rt") as handle:
                            print("PARSING " + genome + " GENOME")
                            for record in SeqIO.parse(handle, "fasta"):
                                record.id = tax + "_" + str(record.id)
                                record.description = genome.split(".")[0]
                                SeqIO.write(record, output_handle, "fasta")
                                output_handle_id.write(str(record.name) + "%" + str(record.description) + "\n")
    sys.stdout.write("### PREPARING FOR MAPPING\n")
    genome_to_contig = {}
    with open(genome_db_id, "r") as fhtxt:
        for record in fhtxt:  # txt SeqIO.parse(genome_db, "fasta"):
            line = record.split("%")
            genome_to_contig[line[0]] = line[1].rsplit()
    genome_to_species = {}
    with open(os.path.join(cwd, table_file), "r") as fh:
        for line in fh:
            line = line.rstrip().split("\t")
            genome = line[0].split(".")[0]
            if len(line) > 9 and not line[0].startswith("#"):
                subspecies = line[7].split(" ")[:2]
                subspecie = "_".join(subspecies)  # + " " + line[8].split("=")[1:]
                tribu = "_".join(line[8].split("=")[1:])
                genome_to_species[genome] = subspecie + "-" + tribu
    sam_output = file_combined_fastq + ".sam"
    cmd = MINIMAP % (str(args.threads), genome_db, file_combined_fastq, sam_output)
    sys.stdout.write("RUNNING MINIMAP2\n")
    minimap = sb.Popen(cmd, shell=True, cwd=cwd)
    minimap.communicate()
    reads_dict = {}
    count = 0
    with open(sam_output) as fh:
        for sam in fh:
            if sam != "" and not sam.startswith("@"):
                fields = sam.split("\t")
                if not fields[2] == "*":
                    for entry in fields:
                        if entry.startswith("MD"):
                            md = entry.split(":")[-1]
                            mismatch = len(re.findall("[A-Z]", md))
                            match = sum([int(number) for number in re.sub(r'[A-Z]|\^', ',', md).split(",")
                                         if number != "" and number.isdigit()])
                            if match > 0:
                                if mismatch > 0:
                                    iden = (match - mismatch) / match * 100
                                    if fields[0] in reads_dict:
                                        if iden == reads_dict[fields[0]][0]:
                                            if reads_dict[fields[0]][1].startswith(fields[2].split("_")[0]):
                                                continue
                                            else:
                                                count += 1
                                                reads_dict.pop(fields[0], None)
                                        elif iden > reads_dict[fields[0]][0]:
                                            reads_dict[fields[0]] = (iden, fields[2], fields[0])
                                    else:
                                        reads_dict[fields[0]] = (iden, fields[2], fields[0])
                                else:
                                    iden = 100
                                    if fields[0] in reads_dict:
                                        if iden == reads_dict[fields[0]][0]:
                                            if reads_dict[fields[0]][1].startswith(fields[2].split("_")[0]):
                                                continue
                                            else:
                                                count += 1
                                                reads_dict.pop(fields[0], None)
                                        elif iden > reads_dict[fields[0]][0]:
                                            reads_dict[fields[0]] = (iden, fields[2], fields[0])
                                    else:
                                        reads_dict[fields[0]] = (iden, fields[2], fields[0])
    out_file = file_combined_fastq + ".reads.txt"
    with open(out_file, "w") as csv:
        for key in reads_dict:
            csv.write("\t".join([reads_dict[key][1], reads_dict[key][2]]) + " \n")
    print(count)
    count = {}
    number_reads_mapped = 0
    for read in reads_dict:
        match = reads_dict[read][1].split("_")
        if len(match) > 1:
            number_reads_mapped += 1
            if all_genomes:
                contig = match[1]  # + "_" + match[2]
            else:
                contig = match[1] + "_" + match[2]
            genome_map = genome_to_contig[contig]
            species_ss = genome_to_species[genome_map[0]]
            uniq_name = match[0] + "_" + species_ss
            if not uniq_name in count:
                count[uniq_name] = 1
            else:
                count[uniq_name] = count[uniq_name] + 1
    print("Name sample: " + file_combined_fastq)
    print("Number reads:" + str(number_reads))
    print("Number reads mapped:" + str(number_reads_mapped) + "\nPercentage of reads mapped:" +
          str(number_reads_mapped / number_reads * 100) + " %\n")
    header = []
    reads_mapped = []
    partial_tree = []
    for clade in types:
        header.append(clade[0])
        reads_mapped.append("")
    header.append("A")
    reads_mapped.append(str(number_reads - number_reads_mapped))
    total = [header] + [reads_mapped]
    tribu_dict = {}
    sorted_list = []
    for value in count:
        key = value.split("_")[0]
        if not str(key).startswith(str(plant[0])):
            sorted_list.append((value[1], (count[value] / number_reads_mapped * 100)))
            lineage = ncbi.get_lineage(int(key))
            a = ncbi.get_rank(lineage)
            tribu = value.split("-")[1]
            tribu_dict["tribu"] = tribu
            tree = []
            for match in types:
                combination = [match[1]]
                if match[0] in tribu_dict:
                    combination.append("".join([tribu_dict[match[0]]]))
                else:
                    for tax in a:
                        if match[0].startswith(a[tax]) and match[0].endswith(a[tax]):
                            combination.append(ncbi.get_taxid_translator([int(tax)])[tax].replace(" ", "_"))
                tree.append("".join(combination))
            tree.append(str(count[value]))
            partial_tree = partial_tree + [tree]
    partial_tree.sort()
    total = total + partial_tree
    out_file = file_combined_fastq + ".txt"
    with open(out_file, "w") as csv:
        for line in total:
            csv.write(",".join(line) + " \n")
    plot_circ(out_file, file_combined_fastq)
    print("done")
                  help='Species list in text format one species in each line')
parser.add_option('-f', '--format', type='choice',
                  choices=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '100'],
                  dest="format", default='8',
                  help='output format for tree')
parser.add_option('-t', '--treebest', type='choice', choices=['yes', 'no'],
                  dest="treebest", default='no',
                  help='To be used in TreeBest')
parser.add_option('-d', '--database', type='choice', choices=['yes', 'no'],
                  dest="database", default='no',
                  help='Update database')
options, args = parser.parse_args()

if options.database == "yes":
    try:
        ncbi.update_taxonomy_database()
    except Exception:
        pass

if options.input_species_filename is None:
    raise Exception('-s option must be specified, Species list in text format one species in each line')

with open(options.input_species_filename) as f:
    species_name = [_.strip().replace('_', ' ') for _ in f.readlines()]

name2taxid = ncbi.get_name_translator(species_name)
taxid = [name2taxid[_][0] for _ in species_name]

tree = ncbi.get_topology(taxid)
def main():
    args = arguments()

    database_directory = os.path.abspath(args.database_directory)

    if not os.path.exists(database_directory):
        os.mkdir(database_directory)
    else:
        logger.info("Database directory folder already exists at {}".format(database_directory))

    # Helper function to simplify adding database_directory to everything
    prepend_db_dir = functools.partial(os.path.join, database_directory)

    lockfilepath = os.path.join(database_directory, ".lock")
    status_file = prepend_db_dir('status.txt')

    if not os.path.exists(lockfilepath):
        try:
            open(file=lockfilepath, mode="w").close()
            logger.info("Placed lock file at {}".format(lockfilepath))
        except Exception as e:
            logger.error("Failed to place a lock file at {}. Database directory can not be accessed. Wrong path?".format(lockfilepath))
            logger.error("{}".format(e))
            pass
    else:
        while os.path.exists(lockfilepath):
            elapsed_time = time.time() - os.path.getmtime(lockfilepath)
            logger.info("Lock file found at {}. Waiting for other processes to finish database init ...".format(lockfilepath))
            logger.info("Elapsed time {} min. Will continue processing after 16 min mark.".format(int(elapsed_time / 60)))
            if elapsed_time >= 1000:
                logger.info("Elapsed time {} min. Assuming previous process completed all init steps. Continue ...".format(int(elapsed_time / 60)))
                try:
                    # If the previous process failed, no processes are running, and > 16 min
                    # have passed since the lock was created, remove the stale lock.
                    os.remove(lockfilepath)
                except OSError:
                    # Continue if the file was removed by another process
                    pass
                break
            time.sleep(60)  # recheck every 1 min whether the lock file was removed
        logger.info("Lock file no longer exists. Assuming init process completed successfully")
        return 0

    logger.info('Initializing databases...this will take some time')

    # Find available threads and use the maximum number available for mash sketch, but cap it at 32
    num_threads = min(multiprocessing.cpu_count(), 32)

    if not os.path.exists(database_directory):
        os.makedirs(database_directory)

    zip_file = prepend_db_dir('data.tar.gz')
    plasmid_database_fasta_file = prepend_db_dir('ncbi_plasmid_full_seqs.fas')
    repetitive_fasta_file = prepend_db_dir('repetitive.dna.fas')
    mash_db_file = prepend_db_dir('ncbi_plasmid_full_seqs.fas.msh')

    logger.info('Downloading databases...this will take some time')

    for db_mirror in config['db_mirrors']:
        try:
            logger.info('Trying mirror {}'.format(db_mirror))
            download_to_file(db_mirror, zip_file)
            break
        except Exception as e:
            logger.error("Download failed with error {}. Removing lock file".format(str(e)))
            os.remove(lockfilepath)
            sys.exit(-1)

    logger.info("Downloading databases successful, now building databases at {}".format(database_directory))

    extract(zip_file, database_directory)
    files = [prepend_db_dir(f) for f in os.listdir(database_directory) if f.endswith('.gz')]

    for file in files:
        extract(file, database_directory)

    # Initialize blast and mash databases
    try:
        logger.info('Building repetitive mask database')
        blast_runner = BlastRunner(repetitive_fasta_file, database_directory)
        blast_runner.makeblastdb(repetitive_fasta_file, 'nucl', logger)

        logger.info('Building complete plasmid database')
        blast_runner = BlastRunner(plasmid_database_fasta_file, database_directory)
        blast_runner.makeblastdb(plasmid_database_fasta_file, 'nucl', logger, True)

        logger.info('Sketching complete plasmid database')
        mObj = mash()
        mObj.mashsketch(plasmid_database_fasta_file, mash_db_file, num_threads=num_threads)
    except Exception as e:
        logger.error('Downloading databases failed, please check your internet connection and retry')
        logger.error("Process failed with error {}. Removing lock file".format(e))
        os.remove(lockfilepath)
        sys.exit(-1)

    try:
        logger.info("Init ete3 library ...")
        ete3taxadbpath = os.path.abspath(os.path.join(database_directory, "taxa.sqlite"))
        ncbi = NCBITaxa()
        ncbi.dbfile = ete3taxadbpath
        ncbi.update_taxonomy_database()
    except Exception as e:
        logger.error("Init of ete3 library failed with error {}. Removing lock file".format(e))
        os.remove(lockfilepath)
        sys.exit(-1)

    try:
        os.remove(os.path.join(os.getcwd(), "taxdump.tar.gz"))
        logger.info("Removed residual taxdump.tar.gz as ete3 is not doing proper cleaning job.")
    except OSError:
        pass

    with open(status_file, 'w') as f:
        download_date = datetime.datetime.today().strftime('%Y-%m-%d')
        f.write("Download date: {}. Removing lock file.".format(download_date))

    try:
        os.remove(lockfilepath)
    except OSError:
        logger.warning("Lock file is already removed by some other process.")
        pass

    logger.info("MOB init completed successfully")
    return 0
def main(args):
    ## STEP 1. Set up logger
    log = logging.getLogger(__name__)
    coloredlogs.install(fmt='%(asctime)s [%(levelname)s] %(message)s', level='INFO', logger=log)

    ## STEP 2. Initialize variables
    ncbi = NCBITaxa()
    # Update database if it is older than one month (2592000 seconds)
    if (time.time() - os.path.getmtime(os.path.join(Path.home(), ".etetoolkit/taxa.sqlite"))) > 2592000:
        ncbi.update_taxonomy_database()
    blocklist = set()
    blocklist_existing = set()

    ## STEP 3. Read blocklist if the file already exists
    if os.path.isfile(args.file_blocklist):
        log.info("Reading existing blocklist ...")
        blocklist_existing = read_blocklist(args.file_blocklist)

    ## STEP 4a. Collect genus names of IRL clade of Fabaceae
    log.info("Fetching genus names of taxa in 'IRL clade' of Fabaceae ...")
    try:
        irl_clade_genera = get_irl_clade_genera(ncbi)
    except Exception:
        irl_clade_genera = set()
    log.info("Adding new genus names to blocklist ...")
    blocklist = set(list(irl_clade_genera.union(blocklist_existing)))

    ## STEP 4b. Collect species names of IRL clade of Fabaceae
    log.info("Fetching species names of taxa in 'IRL clade' of Fabaceae ...")
    try:
        irl_clade_species = get_irl_clade_species(ncbi)
    except Exception:
        irl_clade_species = set()
    log.info("Adding new species names to blocklist ...")
    blocklist = set(list(irl_clade_species.union(blocklist)))

    ## STEP 5. Conduct the search on NCBI PubMed
    if args.query and args.mail:
        log.info("Querying NCBI Pubmed for taxon names ...")
        try:
            irl_clade_genera = set()
            am = AM.ArticleMining(log)
            ei = EI.EntrezInteraction(log)
            if ei.internet_on():  # Check if internet connection active
                articles = ei.fetch_pubmed_articles(args.mail, args.query)
            else:  # If no internet connection, raise error
                raise Exception("ERROR: No internet connection.")
            ncbi = NCBITaxa()
            # Update database if it is older than 1 month
            if (time.time() - os.path.getmtime(os.path.join(Path.home(), ".etetoolkit/taxa.sqlite"))) > 2592000:
                ncbi.update_taxonomy_database()
            article_genera = set()
            for article in articles:
                article_genera.update(am.get_genera_from_pubmed_article(article, ncbi))
        except Exception:
            article_genera = set()
        blocklist = blocklist.union(article_genera)

    ## STEP 6. Keep only the species name if the corresponding genus name is also present
    log.info("Removing genus names if individual species of genus in list ...")
    species_names = list()
    genus_names = set()
    genus_epithets = set()
    for line in blocklist:
        first, *rest = line.split()
        if rest:
            species_names.append(line)
            genus_epithets.add(first)
        else:
            genus_names.add(first)
    species_names.extend(genus_names - genus_epithets)
    blocklist = set(species_names)  # Making sure that no two lines are identical

    ## STEP 7. Write updated blocklist to file or replace old blocklist
    log.info("Writing updated blocklist to file ...")
    #append_blocklist(args.file_blocklist, blocklist)
    write_blocklist(args.file_blocklist, sorted(blocklist))
def main(args):
    # STEP 1. Set up logger
    log = logging.getLogger(__name__)
    if args.verbose:
        coloredlogs.install(fmt='%(asctime)s [%(levelname)s] %(message)s', level='DEBUG', logger=log)
    else:
        coloredlogs.install(fmt='%(asctime)s [%(levelname)s] %(message)s', level='INFO', logger=log)
    mail = args.mail
    query = args.query
    iro = ir_operations.IROperations(log)
    EI = entrez_interaction.EntrezInteraction(log)

    # STEP 2. Read in accession numbers to loop over
    tio = table_io.TableIO(args.infn, args.outfn, args.blocklist, logger=log)
    tio.remove_blocklisted_entries()
    accessions = list(tio.entry_table["ACCESSION"].values)
    if len(accessions) > 0:
        if not os.path.exists(args.recordsdir):
            os.makedirs(args.recordsdir)
        if not os.path.exists(args.datadir):
            os.makedirs(args.datadir)

    # STEP 3. Loop over accessions in inlist
    for accession in accessions:
        acc_folder = os.path.join(args.datadir, str(accession))
        if not os.path.exists(acc_folder):
            os.makedirs(acc_folder)
        else:
            log.warning("Folder for accession `%s` already exists. Skipping this accession." % (str(accession)))
            continue

        # STEP 3.1. Get flatfile
        if not os.path.isfile(os.path.join(args.recordsdir, accession + ".tar.gz")):
            log.info("Saving GenBank flat file for accession `%s`." % (str(accession)))
            if EI.internet_on():  # Check if internet connection active
                try:
                    fp_entry = EI.fetch_gb_entry(accession, acc_folder)
                except Exception:
                    log.warning("Error retrieving accession `%s`. Skipping this accession." % (str(accession)))
                    os.rmdir(acc_folder)
                    continue
            else:  # If no internet connection, raise error
                raise Exception("ERROR: No internet connection.")
        else:
            log.info("GenBank flat file for accession `%s` already exists. Extracting existing file." % (str(accession)))
            tar = tarfile.open(os.path.join(args.recordsdir, accession + ".tar.gz"), "r:gz")
            tar.extractall(acc_folder)
            tar.close()
            fp_entry = os.path.join(acc_folder, accession + ".gb")

        # STEP 3.2. Parse and analyze flatfile
        try:
            try:
                rec = SeqIO.read(fp_entry, "genbank")
            except Exception as err:
                raise Exception("Error while parsing record of accession `%s`: `%s`. Skipping this accession." % (str(accession), str(err)))
            rec_id = str(rec.id).split('.')[0]
            # Note: This internal check ensures that we are actually dealing with the record that was
            # intended to be downloaded via efetch.
            if not rec_id == str(accession):
                log.warning("Accession number mismatch. Expected: `%s`. Retrieved: `%s`. Skipping this accession." %
                            (str(accession), rec_id))
                continue
            log.info("Writing sequence as FASTA for accession `%s`." % (str(accession)))
            iro.write_sequence_to_fasta(str(rec.seq), ">" + str(accession) + "_completeSequence",
                                        os.path.join(acc_folder, str(accession) + "_completeSeq.fasta"))
            ira_feature = None
            irb_feature = None
            if not str(accession) in tio.ir_table.index:
                tio.ir_table = tio.ir_table.append(pd.Series(name=str(accession), dtype='float64'))
            try:
                ira_feature, irb_feature = iro.identify_inverted_repeats(rec, 1000)
                rev_comp = False
                if ira_feature and irb_feature:
                    score_noRC = fuzz.ratio(ira_feature.extract(rec).seq, irb_feature.extract(rec).seq)
                    score_RC = fuzz.ratio(ira_feature.extract(rec).seq, irb_feature.extract(rec).seq.reverse_complement())
                    if score_noRC < score_RC:
                        rev_comp = True
                ir_info = iro.collect_info_from_features(ira_feature, irb_feature)
                tio.ir_table.loc[accession] = ir_info
                tio.append_ir_info_to_table(ir_info, accession, args.outfn)
            except Exception as err:
                ir_info = iro.collect_info_from_features(ira_feature, irb_feature)
                tio.ir_table.loc[accession] = ir_info
                tio.append_ir_info_to_table(ir_info, accession, args.outfn)
                raise Exception("Error while extracting IRs for accession `%s`: `%s`. Skipping further processing of this accession." % (str(accession), str(err)))
            tio.ir_table.loc[accession] = iro.collect_info_from_features(ira_feature, irb_feature)
            # TODO: Currently, nothing is done with the table object, since the file is written entry-wise. Remove full table from use?
            iro.write_irs_to_fasta(rec, ira_feature, irb_feature, acc_folder, rev_comp)
        except Exception as err:
            log.warning(str(err))
        finally:
            if not os.path.isfile(os.path.join(args.recordsdir, accession + ".tar.gz")):
                tar = tarfile.open(os.path.join(args.recordsdir, accession + ".tar.gz"), "w:gz")
                tar.add(fp_entry, os.path.basename(fp_entry))
                tar.close()
            os.remove(fp_entry)

    # STEP 4. Check every accession for IR loss in the literature and remove it from the outlist if such loss was published
    if EI.internet_on():  # Check if internet connection active
        am = article_mining.ArticleMining(log)
        articles = EI.fetch_pubmed_articles(mail, query)
        ncbi = NCBITaxa()
        # Update database if it is older than one month (2592000 seconds)
        if (time.time() - os.path.getmtime(os.path.join(Path.home(), ".etetoolkit/taxa.sqlite"))) > 2592000:
            ncbi.update_taxonomy_database()
        article_genera = set()
        for article in articles:
            article_genera.update(am.get_genera_from_pubmed_article(article, ncbi))
        tio.read_ir_table(args.outfn)
        tio.remove_naturally_irl_genera(article_genera)
        tio.write_ir_table(args.outfn)
    else:  # If no internet connection, skip article mining
        log.warning("No internet connection. Skipping PubMed article mining.")