def main():
    ## this code runs when called as a single script

    ## check if options are provided, otherwise print help
    if len(sys.argv) > 1:
        print("")
    else:
        help_options()
        exit()

    ## get arguments provided
    ID_file = os.path.abspath(sys.argv[1])
    folder = os.path.abspath(sys.argv[2])

    ## get file information
    strains2get = HCGB.functions.main_functions.readList_fromFile(ID_file)

    ## debug messages
    Debug = False
    ## guard against a missing third argument before checking its value
    if (len(sys.argv) > 3 and sys.argv[3] == "True"):
        print('*******************************')
        debug_message("Mode ON")
        print('*******************************')
        Debug = True

    data = NCBI_download_list(strains2get, folder, Debug)
    print("+ Data has been retrieved.\n")
def parse_taxid(tax_id, ncbi, option, debug):
    """Function to parse according to option: provide info or return unravelled data """

    ############
    ## debug messages
    ############
    if debug:
        debug_message('parse_taxid:', "yellow")
        if tax_id.isdigit():
            debug_message('tax_id: ' + str(tax_id), "yellow")
        else:
            debug_message('tax_id: ' + tax_id, "yellow")
            debug_message('conversion needed: ', "yellow")
        debug_message('option: ' + option, "yellow")

    ############
    ## convert to tax ID
    ############
    if not tax_id.isdigit():
        ## convert name to taxid integer
        print("+ Convert to NCBI taxonomy ID")
        print("\tSource: " + tax_id)
        tax_id = name2taxid([tax_id], ncbi)
        (tax_name, taxid, rank, lineage) = taxon_info(tax_id, ncbi, debug)
        print("\tRank: " + rank)
        print("\tID: " + str(taxid))

    ############
    ## parse accordingly
    ############
    if (option == "info"):
        print()
        (tax_name, taxid, rank, lineage) = taxon_info(tax_id, ncbi, debug)
        print("----------------------------------------------")
        print("Result:")
        print("Name: " + tax_name)
        print("Rank: " + rank)
        print("Taxid: " + str(taxid))
        list_lineage = lineage.split(";")
        print("Lineage:")
        for tax in list_lineage:
            tax_split = tax.split(":")
            print("\t" + '{}\t{}'.format(tax_split[0], tax_split[1]))
        print("----------------------------------------------")
        print()

        ## return info
        return (tax_name, taxid, rank, lineage)

    ############
    ### call unravel taxid information
    ############
    elif (option == "unravel"):
        return (unravel_taxid(tax_id, ncbi, debug))
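## Hedged usage sketch (not part of the original module): it assumes the `ncbi` object
## expected by parse_taxid()/taxon_info() is an ete3 NCBITaxa instance, as suggested by the
## get_descendant_taxa()/translate_to_names() calls used elsewhere in this file.
# from ete3 import NCBITaxa
# ncbi = NCBITaxa()
# tax_name, taxid, rank, lineage = parse_taxid("1280", ncbi, "info", debug=False)
# children_taxids = parse_taxid("Staphylococcus", ncbi, "unravel", debug=False)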
def retrieve_genes_ids_sequences(profile, gene_ID, debug):
    """Retrieve the sequence ID and sequence matching gene_ID from the
    assembled_genes.fa file within the given profile folder."""

    ## given a profile folder
    if debug:
        HCGB_aes.debug_message('profile: ', 'yellow')
        print (profile)
        HCGB_aes.debug_message('gene_id: ', 'yellow')
        print (gene_ID)

    ##
    assembled_genes_list = HCGB_main.retrieve_matching_files(profile, "assembled_genes.fa", debug)
    assembled_genes_list = [s for s in assembled_genes_list if 'ariba.tmp' not in s]

    if debug:
        HCGB_aes.debug_message('assembled_genes_list: ', 'yellow')
        print(assembled_genes_list)

    if os.path.isfile(assembled_genes_list[0]):
        for record in SeqIO.parse(assembled_genes_list[0], "fasta"):
            if debug:
                HCGB_aes.debug_message('record.description: ', 'yellow')
                print(record.description)

            search_ID = re.search(gene_ID, record.description)
            if (search_ID):
                return (record.id, str(record.seq))

    return('','')
def NCBI_get_info_GenbankID(data_folder, acc_ID, debug, assembly_level_given='complete'):

    section_given = get_section(acc_ID, debug)

    if debug:
        debug_message("-----------------------------------------")
        debug_message("NCBI_get_info_GenbankID function call", color="yellow")

    ## import module and class
    import ncbi_genome_download
    from ncbi_genome_download.config import NgdConfig

    tries = ['bacteria', 'archaea']
    for entry_tried in tries:
        if debug:
            debug_message("Trying with: " + entry_tried, color="yellow")

        ngd_config = NgdConfig.from_kwargs(section=section_given,
                                           file_formats='genbank',
                                           assembly_accessions=acc_ID,
                                           output=data_folder,
                                           dry_run=True,
                                           groups=entry_tried)

        info = ncbi_genome_download.core.select_candidates(ngd_config)
        if info:
            if debug:
                debug_message("It worked!", color="yellow")
            return (entry_tried)

    ## raising a plain string is not valid in Python 3: raise a proper exception instead
    raise RuntimeError("**** ERROR: Something happened while connecting to NCBI... ***")
def desc_taxa(taxid, ncbi, debug):
    """Write descendent taxa for taxid

    Created by Joe R. J. Healey; Nick Youngblut
    Slightly modified.

    Returns python dictionary with descendant taxid and name.
    """
    ## debug messages
    if debug:
        debug_message("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
        debug_message('desc_taxa: ' + str(taxid), "yellow")

    # Main feature of the script is to get all taxa within a given group.
    descendent_taxa = ncbi.get_descendant_taxa(taxid)
    descendent_taxa_names = ncbi.translate_to_names(descendent_taxa)

    dict_Descent = {}
    for dtn, dt in zip(descendent_taxa_names, descendent_taxa):
        dict_Descent[dt] = dtn

    ## debug messages
    if debug:
        debug_message('dict_Descent: ', "yellow")
        print(dict_Descent)
        debug_message("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")

    return dict_Descent
def get_superKingdom(tax_id, ncbi, debug):
    """For a given tax_id get superkingdom from NCBI taxonomy ID """

    if debug:
        debug_message("get_superKingdom: ", 'yellow')
        debug_message("tax_id: " + str(tax_id), 'yellow')

    (tax_name2, taxid2, rank2, lineage2) = taxon_info(tax_id, ncbi, debug)
    list_lineage = lineage2.split(";")
    for tax3 in list_lineage:
        tax_split = tax3.split(":")
        ## check the rank provided: add also species or serotype
        (tax_name3, taxid3, rank3, lineage3) = taxon_info(tax_split[0], ncbi, debug)
        if (rank3 == "superkingdom"):
            return (tax_name3.lower())
def check_annot_table(annot_table, file, format, debug):
    '''Check annotation table provided matches the BLAST or protein fasta file provided'''

    ## read annotation
    annotation_table = pd.read_csv(annot_table, sep=",", index_col=0)
    #annotation_table = annotation_table.drop("Unnamed: 0",axis=1)

    ## debug messages
    if debug:
        debug_message('check_annot_table function:', 'yellow')
        debug_message("list(annotation_table.columns)", 'yellow')
        print(list(annotation_table.columns))
        debug_message("BacDup_functions.columns_annot_table()", 'yellow')
        print(BacDup_functions.columns_annot_table())

    ## skip if not OK
    if not (list(annotation_table.columns) == BacDup_functions.columns_annot_table()):
        print('ERROR: Annotation table does NOT match desired input format')
        return(False)

    ## read BLAST sequence results
    if format=="blast":
        ## TODO
        print()

    ## protein fasta file
    elif(format=="fasta"):
        with open (file) as in_handle:
            ref_recs = SeqIO.to_dict(SeqIO.parse(in_handle, "fasta"))

        protein_IDs = list(ref_recs.keys())        ## get sequence headers
        index_annot = list(annotation_table.index) ## get annotation labels

        ## debug messages
        if debug:
            debug_message("list(annotation_table.index).sort()", 'yellow')
            print(index_annot)
            debug_message("list(ref_recs.keys()).sort()", 'yellow')
            print(protein_IDs)

        if (index_annot == protein_IDs):
            return(True)
        else:
            return(False)
def update_db(ncbi_db, db_folder, debug):
    """Update database

    Created by Joe R. J. Healey; Nick Youngblut
    Original code.
    """
    ## debug messages
    if debug:
        debug_message('Update database at {}\n'.format(ncbi_db.dbfile), "yellow")

    print('Updating the taxonomy database. This may take several minutes...\n')
    ncbi_db.update_taxonomy_database()

    ## print timestamp
    filename_stamp_parse = os.path.abspath(db_folder + '/timestamp_db.txt')
    time_functions.print_time_stamp(filename_stamp_parse)

    return ncbi_db
def get_data(sample, file_data, format, out_folder, debug):
    '''Function to get BLAST results'''

    file_data = os.path.abspath(file_data)

    ## debug messages
    if (debug):
        debug_message('dup_searcher.get_data:', 'yellow')
        debug_message('file_data:' + file_data, 'yellow')
        debug_message('format:' + format, 'yellow')

    ## check file is readable
    BacDup_functions.file_readable_check(file_data)

    ## parse accordingly
    if (format=='blast_raw'):
        ## FIXME
        raw_blast = pd.read_csv(file_data, sep="\t", header=None,
                                names=BacDup_functions.columns_rawBLAST_table())
    elif (format=='fasta'):
        raw_blast = create_blast_results(sample, file_data, out_folder, debug)
        raw_blast = pd.read_csv(raw_blast, sep="\t", header=None,
                                names=BacDup_functions.columns_rawBLAST_table())

    return (raw_blast)
def assembly_stats_caller(fasta_file, out_file, debug):

    contig_lens, scaffold_lens, gc_cont = assembly_stats.read_genome(fasta_file)

    ## debug messages
    if debug:
        HCGB_aes.debug_message("contig_lens", "yellow")
        print(contig_lens)
        HCGB_aes.debug_message("scaffold_lens", "yellow")
        print(scaffold_lens)
        HCGB_aes.debug_message("gc_cont", "yellow")
        print(gc_cont)

    ## get stats
    contig_stats = assembly_stats.calculate_stats(contig_lens, gc_cont)
    scaffold_stats = assembly_stats.calculate_stats(scaffold_lens, gc_cont)

    ## debug messages
    if debug:
        HCGB_aes.debug_message("contig_stats", "yellow")
        print(contig_stats)
        HCGB_aes.debug_message("scaffold_stats", "yellow")
        print(scaffold_stats)

    stat_output = {
        'Contig Stats': contig_stats,
        'Scaffold Stats': scaffold_stats
    }

    ## save results in file
    HCGB_main.printDict2file(out_file + '-contigs.csv', contig_stats, ",")
    HCGB_main.printDict2file(out_file + '-scaffolds.csv', scaffold_stats, ",")

    ## create stats in excel file
    assembly_stats_file = out_file + '_stats.xlsx'
    parse_stats(stat_output, assembly_stats_file, debug)

    return (stat_output, assembly_stats_file)
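## Hedged usage sketch (illustrative only): it assumes `assembly_stats` is the PyPI
## assembly-stats package imported at module level; the file prefix below is hypothetical.
# stats_dict, xlsx_file = assembly_stats_caller("sample1_assembly.fna",
#                                               "sample1_assembly", debug=False)
# print(stats_dict['Contig Stats'])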
def agrvate_caller(dict_assemblies, dict_folders, debug=False):
    """Create agrvate call and control for parameters"""

    ## ATTENTION: agrvate needs to chdir to output folder
    path_here = os.getcwd()

    print ("+ Checking agr genes for each sample retrieved...")

    agrvate_results = pd.DataFrame()

    ## No need to optimize. There is a problem with the working dir of agrvate and we
    ## need to change it every time.
    for name, assembly_file in dict_assemblies.items():
        sample_folder = HCGB_files.create_folder(dict_folders[name])

        ## check if previously done and succeeded
        filename_stamp = sample_folder + '/.success'
        if os.path.isfile(filename_stamp):
            stamp = HCGB_time.read_time_stamp(filename_stamp)
            print (colored("\tA previous command generated results on: %s [%s]" %(stamp, name), 'yellow'))
        else:
            os.chdir(sample_folder)
            info_sample = agrvate_call(name, assembly_file, sample_folder, debug)
            agrvate_results = pd.concat([agrvate_results, info_sample], join='outer')

            if (info_sample.shape[0] == 0):
                print("+ Some error occurred with sample %s. Please re-run analysis or check log files." %name)
            else:
                ## success
                HCGB_time.print_time_stamp(filename_stamp)

    print ("+ Jobs finished\n+ Collecting information for all samples...")
    os.chdir(path_here)

    ## debug messages
    if debug:
        HCGB_aes.debug_message('agrvate_results', 'yellow')
        HCGB_main.print_all_pandaDF(agrvate_results)

    return(agrvate_results)
def NCBI_get_info(section_given, data_folder, tax_ID_list, debug,
                  assembly_level_given='complete', group_given='bacteria'):
    '''This function uses ncbi_genome_download to create a dry run and
    return information of each entry provided
    '''

    ## import module and class
    import ncbi_genome_download
    from ncbi_genome_download.config import NgdConfig

    try:
        ngd_config = NgdConfig.from_kwargs(section=section_given,
                                           file_formats='genbank',
                                           taxids=tax_ID_list,
                                           output=data_folder,
                                           dry_run=True,
                                           assembly_levels=assembly_level_given,
                                           groups=group_given)

        info = ncbi_genome_download.core.select_candidates(ngd_config)
    except:
        ## raising a plain string is not valid in Python 3: raise a proper exception instead
        raise RuntimeError("**** ERROR: Something happened while connecting to NCBI... ***")

    ####
    if (len(info)) < 1:
        print(colored("No entries matched your filter. Please check the input options provided", 'yellow'))
        exit()

    ## fill dictionary to simplify
    dict_entries = {}
    for entry, _ in info:
        strain_name = ncbi_genome_download.core.get_strain(entry)

        ## debug messages
        if debug:
            debug_message("", 'yellow')
            print(entry)
            string = entry['assembly_accession'] + '\t' + entry['organism_name'] + '\t' + strain_name
            debug_message(string, 'yellow')
            debug_message(".....................................................................\n", 'yellow')

        ## fill dictionary
        dict_entries[entry['assembly_accession']] = (entry['organism_name'], strain_name)

    ## return
    return (dict_entries)
def unravel_taxid(tax_id, ncbi, debug):
    """This function unravels information and obtains children taxids for each taxid.

    If the taxid corresponds to a serotype or species, no further processing is done.
    On the other hand, if a genus, family, order or any other rank is provided, all
    subranks are retrieved. It also takes into account serotypes and accommodates
    the information.

    It returns a list of all taxids included within the tax_id provided.
    """
    ## check the rank provided
    (tax_name, taxid, rank, lineage) = taxon_info(tax_id, ncbi, debug)

    ## debug messages
    if debug:
        debug_message('tax_name: ' + tax_name, "yellow")
        debug_message('taxid:' + str(taxid), "yellow")
        debug_message('rank: ' + rank, "yellow")
        debug_message('lineage: ' + lineage, "yellow")

    ##
    list_taxids = []

    ## taxid provided is either a serotype or strain: directly to retrieve
    if (rank in ("species", "serotype", "strain")):
        list_taxids.append(taxid)
    else:
        ## get descendant
        dict_descent = desc_taxa(taxid, ncbi, debug)
        for tax, name in dict_descent.items():
            ## add taxa retrieved
            list_taxids.append(tax)

            ## check the rank provided and decompose
            (tax_name2, taxid2, rank2, lineage2) = taxon_info(tax, ncbi, debug)
            list_lineage = lineage2.split(";")
            for tax3 in list_lineage:
                tax_split = tax3.split(":")
                ## check the rank provided: add also species or serotype
                (tax_name3, taxid3, rank3, lineage3) = taxon_info(tax_split[0], ncbi, debug)
                if (rank3 in ("species", "serotype")):
                    list_taxids.append(taxid3)

    ## return uniq list of ids
    return (list(set(list_taxids)))
def create_blast_results(sample, fasta_file, outdir, debug):
    '''Creates BLAST results for each fasta vs. itself'''
    #phr is the header file, pin is the index file, psq is the sequence file

    ## debug messages
    if debug:
        debug_message('create_blast_results function call:', 'yellow')
        debug_message('sample: ' + sample, 'yellow')
        debug_message('fasta_file: ' + fasta_file, 'yellow')
        debug_message('outdir: ' + outdir, 'yellow')

    ## output file
    raw_blast = os.path.abspath(os.path.join(outdir, "BLAST_raw_results.tsv"))

    ## timestamps
    db_timestamp = os.path.join(outdir, '.db_success')
    search_timestamp = os.path.join(outdir, '.blast_success')

    if (not HCGB.functions.files_functions.is_non_zero_file(search_timestamp)):

        ## get binaries
        (makeblastdb_exe, blastp_exe) = BacDup.modules.config.get_exe('BLAST', debug)
        ## NOTE: hard-coded system paths kept commented out below; they override the
        ## configuration lookup above and look like a debugging leftover.
        #makeblastdb_exe = "/usr/bin/makeblastdb"
        #blastp_exe = "/usr/bin/blastp"

        ## check if db is indexed already
        db_path_name = os.path.join(os.path.abspath(outdir), sample + '_db')
        if (not HCGB.functions.files_functions.is_non_zero_file(db_timestamp)):
            ## generate blastdb for genome
            HCGB.functions.blast_functions.makeblastdb(db_path_name, fasta_file, makeblastdb_exe, 'prot') # HCGB function

            ## print time stamp
            HCGB_time.print_time_stamp(db_timestamp)
        else:
            ## read the db timestamp before printing it (read_time was undefined here)
            read_time = HCGB_time.read_time_stamp(db_timestamp)
            print (colored("\t+ BLAST database already available for sample %s [%s]" %(sample, read_time), 'green'))

        ## create blastp outfile
        HCGB.functions.blast_functions.blastp(blastp_exe, raw_blast, db_path_name, fasta_file, 1) # HCGB function

        ## print time stamp
        HCGB_time.print_time_stamp(search_timestamp)
    else:
        read_time = HCGB_time.read_time_stamp(search_timestamp)
        print (colored("\t+ Duplicate search already available for sample %s [%s]" %(sample, read_time), 'green'))

    return (raw_blast)
def get_genes_profile(samples_info, gene_names, debug, option):
    """Retrieve the profile of the given gene names or IDs for each sample in samples_info."""

    ## search by group id or gene name
    print ('\n+ Retrieve selected genes profile for each sample.')

    results_profileIDs = pd.DataFrame()
    sample_frame = samples_info.groupby(["name"])
    for g in gene_names:
        #print ("\t+", g)
        for name, cluster_df in sample_frame:
            my_list_profiles = cluster_df.loc[cluster_df['tag'] == 'profile']['ext'].to_list()
            if debug:
                HCGB_aes.debug_message('name: ' + name, 'yellow')
                HCGB_aes.debug_message('my_list_profiles: ', 'yellow')
                print (my_list_profiles)
                HCGB_aes.debug_message('cluster_df: ', 'yellow')
                print (cluster_df)

            ## skip files
            if name == 'report':
                continue

            fill=False
            for p in my_list_profiles:
                profile_csv = cluster_df.loc[cluster_df['ext'] == p]['sample'].to_list()[0]
                ## skip files
                if not profile_csv.endswith('report_summary.csv'):
                    if debug:
                        HCGB_aes.debug_message('profile_csv: ' + profile_csv, 'yellow')

                    value = retrieve_genes_ids_profile(profile_csv, g, debug, option)
                    ## save results
                    if (not value.empty):
                        for Name, Data in value.iterrows():
                            results_profileIDs.loc[name,Name] = Data['Status']
                        fill=True

            if not fill:
                results_profileIDs.loc[name, g] = 'no'

    return (results_profileIDs)
def get_gbk_information(gbk, debug):
    ## read Genbank file to retrieve information for each sample
    ## https://biopython.org/wiki/SeqRecord

    # get
    for index, record in enumerate(SeqIO.parse(gbk, "genbank")):
        if (index == 0):
            ## only for first entry == Main chromosome
            if debug:
                debug_message("******************************************")
                debug_message("SeqIO.read(gbk, 'genbank') info:", color="yellow")
                debug_message("record", color="yellow")
                debug_message(record, color="yellow")

            organism = record.annotations['source']
            taxonomy = record.annotations['taxonomy']

    ## TODO: get plasmid information

    ##
    return (taxonomy, organism)
def parse_annot_file(name, folder_out_input, annot_file, output_path, Debug, ref_file=""):
    """
    This function checks the type of input for each annotation file provided and
    calls the appropriate parser: gbf_parser or gff_parser
    """
    ## debug messages
    if (Debug):
        debug_message('+++++++++++++++++++++++++++++++')
        debug_message('check_annot_file function call:', 'yellow')
        debug_message('name: ' + name, 'yellow')
        debug_message('annot_file: ' + annot_file, 'yellow')

    ## check file integrity: exists & non-zero
    if (BacDup_functions.file_readable_check(annot_file)):
        ## check format; call parser
        format = format_checker.is_format(annot_file, Debug)

        ## debug messages
        if (Debug):
            debug_message('\nformat_checker.is_format function call:', 'yellow')
            debug_message('format: ' + format, 'yellow')

        ## parse gbk or gff
        if (format == 'gbk'):
            print(colored('\t* GenBank format file:........[OK]', 'green'))

            ## TODO: print details available within GenBank:
            # Accession, Bioproject,
            # Reference, Authors, Title, Journal,
            # Comment

            return (gbf_parser.gbf_parser_caller(annot_file, output_path, Debug))

        elif (format == 'gff'):
            print(colored('\t* GFF format file:.......[OK]', 'green'))
            if (HCGB_files.is_non_zero_file(ref_file)):
                return (gff_parser.gff_parser_caller(annot_file, ref_file, output_path, Debug))
            else:
                print(colored("ERROR: No genome reference file provided for this GFF annotation. Check input options provided.", "red"))
                exit()

        ## not valid via this option
        else:
            print(colored("ERROR: not valid via this option", "red"))
            exit()

    ## not accessible for this sample
    else:
        return (False)
def ngd_download(section_given, acc_ID, data_folder, debug, section='genbank',
                 assembly_level='complete', group_given='bacteria'):
    '''
    Function that calls and retrieves data from NCBI using the python package ngd.

    :param acc_ID:
    :param data_folder: Folder to store data.
    :param debug: True/False for debugging messages

    :attention: Module ngd requires data to be downloaded into a bacteria/archaea
    subfolder under the genbank or refseq folder.
    '''
    ##################################
    ## check if necessary to download
    ##################################

    ## get path
    print('+ Check data for ID: ', acc_ID)
    dir_path = os.path.join(data_folder, section_given, group_given, acc_ID)

    ## check if previously downloaded
    download = False
    if os.path.exists(dir_path):
        print('+ Folder already exists: ', dir_path)
        ## get files downloaded
        (genome, prot, gff, gbk) = BacDup.scripts.functions.get_files_annotation(dir_path, debug)
        if (gbk):
            ## Only the genbank format file is required
            download = False
        else:
            print('+ Not all necessary data is available. Download it again.')
            download = True
    else:
        download = True

    ## download data
    if download:
        print("\n+ Downloading data for: " + colored(acc_ID, 'green'))

        ## download in data folder provided
        if (debug):
            debug_message("ngd.download call", color="yellow")
            debug_message("dir_path: " + dir_path, color="yellow")
            debug_message("section_given: " + section_given, color="yellow")

        ## download
        if debug:
            debug_message("section='%s', file_formats='genbank', assembly_level=%s, assembly_accessions=%s, output=%s, groups=%s"
                          % (section_given, assembly_level, acc_ID, data_folder, group_given), color="yellow")

        try:
            ngd.download(section=section_given, file_formats='genbank',
                         assembly_levels=assembly_level, assembly_accessions=acc_ID,
                         output=data_folder, groups=group_given)
        except:
            ## raising a plain string is not valid in Python 3: raise a proper exception instead
            raise RuntimeError("A problem occurred when contacting NCBI for downloading id (%s) from %s"
                               % (acc_ID, section_given))

        ## return empty
        if not os.path.isdir(dir_path):
            return False

        ## check if files are gunzipped
        files = os.listdir(dir_path)
        files_list = []
        for f in files:
            if f.endswith('gz'):
                files_list.append(f)
                print("\t- Extracting files: ", f)
                HCGB.functions.files_functions.extract(dir_path + '/' + f, dir_path)
                #os.remove(dir_path + '/' + f)

    ## skip
    else:
        print('\t+ Data is already available, no need to download it again')

    print()

    ## return path where data is
    return (dir_path)
def retrieve_genes_ids_profile(profile, gene_ID, debug, option):
    """Retrieve entries matching gene_ID from a profile CSV report,
    searching either by gene name (prefix match) or by ID."""

    ## read data
    get_csv_data = HCGB_main.get_data(profile, ',', '')

    if option == 'name':
        list_Genes = get_csv_data['Genes'].to_list()
        get_csv_data.index = get_csv_data['Genes']
    elif option == 'ID':
        list_Genes = get_csv_data['ID'].to_list()
        get_csv_data.index = get_csv_data['ID']

    ## debug messages
    if debug:
        HCGB_aes.debug_message('profile: ' + profile, 'yellow')
        HCGB_aes.debug_message('gene_id: ' + str(gene_ID), 'yellow')
        HCGB_aes.debug_message('data: ', 'yellow')
        print(get_csv_data)
        HCGB_aes.debug_message('Option: ' + option, 'yellow')
        HCGB_aes.debug_message('Genes: ', 'yellow')
        print (list_Genes)

    ## search accordingly
    if option == 'name':
        regex_search = re.compile("^" + gene_ID + ".*")
        filtered_genes = list(filter(regex_search.match, list_Genes))

        ## debug messages
        if debug:
            HCGB_aes.debug_message('filtered_genes: ', 'yellow')
            print (filtered_genes)
            HCGB_aes.debug_message('filtered_genes.loc[filtered_genes]: ', 'yellow')
            print (get_csv_data.loc[filtered_genes])

        return (get_csv_data.loc[filtered_genes])
    else:
        if gene_ID in list_Genes:
            ## debug messages
            if debug:
                HCGB_aes.debug_message('gene_id: ' + gene_ID, 'yellow')
                print (get_csv_data.loc[gene_ID].to_frame().transpose())

            return (get_csv_data.loc[gene_ID].to_frame().transpose())
        else:
            return(pd.DataFrame())
def run_assembly(options):
    """Main function of the assemble module.

    It assembles each sample using SPADES_ and checks quality using BUSCO_ software and database.

    .. seealso:: This function depends on other BacterialTyper and HCGB functions called:

        - :func:`BacterialTyper.scripts.BUSCO_caller.print_help_BUSCO`

        - :func:`BacterialTyper.scripts.multiQC_report.multiqc_help`

        - :func:`BacterialTyper.modules.qc.BUSCO_check`

        - :func:`HCGB.sampleParser`

        - :func:`HCGB.functions.aesthetics_functions`

        - :func:`HCGB.functions.time_functions`

        - :func:`HCGB.functions.main_functions`

        - :func:`HCGB.functions.file_functions`

    .. include:: ../../links.inc
    """
    ## init time
    start_time_total = time.time()

    ## debugging messages
    global Debug
    if (options.debug):
        Debug = True
    else:
        Debug = False

    ##################################
    ### show help messages if desired
    ##################################
    if (options.help_format):
        ## help_format option
        help_info.help_fastq_format()
        exit()
    elif (options.help_BUSCO):
        ## information for BUSCO
        BUSCO_caller.print_help_BUSCO()
        exit()
    elif (options.help_project):
        ## information for project
        help_info.project_help()
        exit()
    elif (options.help_multiqc):
        ## information for Multiqc
        multiQC_report.multiqc_help()
        exit()

    ### set as default paired_end mode
    if (options.single_end):
        options.pair = False
    else:
        options.pair = True

    ## message header
    HCGB_aes.pipeline_header("BacterialTyper", ver=pipeline_version)
    HCGB_aes.boxymcboxface("Assembly module")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## absolute path for in & out
    input_dir = os.path.abspath(options.input)
    outdir = ""

    ## Project mode as default
    project_mode = True
    if (options.detached):
        options.project = False
        project_mode = False
        outdir = os.path.abspath(options.output_folder)
    else:
        options.project = True
        outdir = input_dir

    ## get files
    pd_samples_retrieved = sampleParser.files.get_files(options, input_dir, "trim", ['_trim'], options.debug)

    ## debug message
    if (Debug):
        print(colored("**DEBUG: pd_samples_retrieve **", 'yellow'))
        print(pd_samples_retrieved)

    ## generate output folder, if necessary
    print("\n+ Create output folder(s):")
    if not options.project:
        HCGB_files.create_folder(outdir)

    outdir_dict = HCGB_files.outdir_project(outdir, options.project, pd_samples_retrieved,
                                            "assemble", options.debug)

    ### call assemble using spades
    start_time_partial = start_time_total
    start_time_partial_assembly = start_time_partial

    ## optimize threads
    name_list = set(pd_samples_retrieved["name"].tolist())
    threads_job = HCGB_main.optimize_threads(options.threads, len(name_list)) ## threads optimization
    max_workers_int = int(options.threads / threads_job)

    ## debug message
    if (Debug):
        HCGB_aes.debug_message("options.threads: " + str(options.threads), "yellow")
        HCGB_aes.debug_message("max_workers: " + str(max_workers_int), "yellow")
        HCGB_aes.debug_message("cpu_here: " + str(threads_job), "yellow")

    # Group dataframe by sample name
    sample_frame = pd_samples_retrieved.groupby(["name"])

    # We can use a with statement to ensure threads are cleaned up promptly
    print('+ Running modules SPADES...')
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers_int) as executor:
        ## send for each sample
        commandsSent = { executor.submit(check_sample_assembly, name, outdir_dict[name],
                                         sorted(cluster["sample"].tolist()), threads_job): name
                         for name, cluster in sample_frame }

        for cmd2 in concurrent.futures.as_completed(commandsSent):
            details = commandsSent[cmd2]
            try:
                data = cmd2.result()
            except Exception as exc:
                print('***ERROR:')
                print(cmd2)
                print('%r generated an exception: %s' % (details, exc))

    ## functions.timestamp
    print("\n+ Assembly of all samples finished: ")
    start_time_partial = HCGB_time.timestamp(start_time_partial_assembly)

    ##
    if (assembly_stats):
        ###################
        if Debug:
            HCGB_aes.debug_message("assembly_stats dictionary", "yellow")
            print(assembly_stats)

        ## create single file
        get_assembly_stats_all(assembly_stats, outdir, Debug)

    ### symbolic links
    print("+ Retrieve all genomes assembled...")

    ### BUSCO check assembly
    if (options.no_BUSCO):
        print()
    else:
        results = qc.BUSCO_check(outdir, outdir, options, start_time_partial, "genome")

    ## print to file results
    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Assembly module.")
    return ()
def get_files_annotation(folder, debug):
    '''
    Code retrieved from the BacterialTyper database_generator.py script
    '''
    ## check files downloaded in the folder
    files = os.listdir(folder)
    genome = ""
    prot = ""
    gff = ""
    gbk = ""
    for f in files:
        if f.endswith('.fna'):
            genome = os.path.join(folder, f)
        elif f.endswith('.gff'):
            gff = os.path.join(folder, f)
        elif f.endswith('.gbk'):
            gbk = os.path.join(folder, f)
        elif f.endswith('.gbff'):
            gbk = os.path.join(folder, f)
        elif f.endswith('.faa'):
            prot = os.path.join(folder, f)

    ## debug messages
    if debug:
        debug_message("-----------------------------------------")
        debug_message("Return info get_files_download", color="yellow")
        debug_message("genome: " + genome, color="yellow")
        debug_message("prot: " + prot, color="yellow")
        debug_message("gff: " + gff, color="yellow")
        debug_message("gbk: " + gbk, color="yellow")

    return (genome, prot, gff, gbk)
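## Hedged usage sketch (illustrative only): the folder layout is the one ngd_download()
## produces; the accession and path below are hypothetical.
# genome, prot, gff, gbk = get_files_annotation("db/genbank/bacteria/GCA_000013425.1", debug=False)
# if not gbk:
#     print("+ No GenBank file found; data may need to be downloaded again")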
def get_assembly_stats_all(assembly_stats_dict, outdir, debug):

    ## get all assembly stats
    outdir_report = HCGB_files.create_subfolder("report", outdir)
    final_dir = HCGB_files.create_subfolder("assembly_stats", outdir_report)
    final_sub_dir = HCGB_files.create_subfolder("samples", final_dir)

    #### summary and information
    results_summary_toPrint_all = pd.DataFrame()
    column_names = ("Type", "Sample", "Total Sequences", "GC% Content",
                    "Longest sequence", "Shortest sequence", "Median length", "Mean length",
                    "Total Length (bp)", "L10", "N10", "L20", "N20", "L30", "N30",
                    "L40", "N40", "L50", "N50")

    ## debugging messages
    if debug:
        HCGB_aes.debug_message("Create assembly statistic for all samples")

    ## use the parameter passed in (the body previously referenced the global assembly_stats)
    for sample_name in assembly_stats_dict:
        excel_file_stats = assembly_stats_dict[sample_name][1]
        if debug:
            HCGB_aes.debug_message("sample_name: " + sample_name, 'yellow')
            HCGB_aes.debug_message("excel: " + excel_file_stats, 'yellow')
            HCGB_aes.debug_message("contig stats dictionary: ", 'yellow')
            print(assembly_stats_dict[sample_name][0]['Contig Stats'])
            HCGB_aes.debug_message("scaffold stats dictionary: ", 'yellow')
            print(assembly_stats_dict[sample_name][0]['Scaffold Stats'])

        # get contig
        contig_stats = pd.DataFrame.from_dict(assembly_stats_dict[sample_name][0]['Contig Stats'],
                                              orient='index').transpose()
        contig_stats['type'] = 'contigs'
        contig_stats['sample_name'] = sample_name

        # get scaffold
        scaff_stats = pd.DataFrame.from_dict(assembly_stats_dict[sample_name][0]['Scaffold Stats'],
                                             orient='index').transpose()
        scaff_stats['type'] = 'scaffolds'
        scaff_stats['sample_name'] = sample_name

        ## copy individual excel file
        shutil.copy(excel_file_stats, final_sub_dir)

        ## add all data
        results_summary_toPrint_all = pd.concat([results_summary_toPrint_all, contig_stats, scaff_stats],
                                                ignore_index=True)

    ## reorder columns: move 'sample_name' and 'type' to the front
    cols = results_summary_toPrint_all.columns.tolist()
    cols = cols[-1:] + cols[:-1]
    cols = cols[-1:] + cols[:-1]
    results_summary_toPrint_all = results_summary_toPrint_all[cols]

    ## write to excel
    name_excel_summary = final_dir + '/summary_stats.xlsx'
    writer_summary = pd.ExcelWriter(name_excel_summary, engine='xlsxwriter') ## open excel handle

    ## set the important column names
    results_summary_toPrint_all = results_summary_toPrint_all.set_axis(column_names, axis=1)

    ## save in excel
    results_summary_toPrint_all.to_excel(writer_summary, sheet_name="all_data") ## write excel handle

    writer_summary.save() ## close excel handle
def check_sample_assembly(name, sample_folder, files, threads):
    """Checks if sample is assembled.

    It checks whether a sample is assembled or not by reading file
    *sample_folder/.success_all*. If the file is not available (no previous assembly or
    it did not succeed) it calls :func:`BacterialTyper.scripts.spades_assembler.run_module_assembly`
    to generate the assembly for the sample specified.

    :param name: Sample name or tag to identify sample.
    :param sample_folder: directory to generate assembly output. It must exist.
    :param files: List containing files (fastq R1 & R2) for the sample to be assembled.
    :param threads: Number of CPUs to use.
    :type name: string
    :type sample_folder: string
    :type files: list
    :type threads: integer

    :return: Populates dictionary assembly_stats with assembly stats dictionary information
    :rtype: Dataframe

    .. seealso:: This function depends on other BacterialTyper and HCGB functions called:

        - :func:`BacterialTyper.scripts.spades_assembler.run_module_assembly`
    """
    ## check if previously assembled and succeeded
    filename_stamp = sample_folder + '/.success_all'

    if os.path.isfile(filename_stamp):
        stamp = HCGB_time.read_time_stamp(filename_stamp)
        print(colored("\tA previous command generated results on: %s [%s]" % (stamp, name), 'yellow'))

        ## Get information
        stat_output = {
            'Contig Stats': HCGB_main.file2dictionary(sample_folder + '/' + name + '_assembly-contigs.csv', ','),
            'Scaffold Stats': HCGB_main.file2dictionary(sample_folder + '/' + name + '_assembly-scaffolds.csv', ',')
        }

        ## populate main dictionary
        assembly_stats[name] = [stat_output, sample_folder + '/' + name + '_assembly_stats.xlsx']
    else:
        ## debug message
        if (Debug):
            HCGB_aes.debug_message("spades_assembler.run_module_assembly call:", "yellow")
            print("spades_assembler.run_module_assembly " + name + "\t" + sample_folder + "\t" +
                  files[0] + "\t" + files[1] + "\t" + str(threads) + "\n")

        # Call spades_assembler
        code = spades_assembler.run_module_assembly(name, sample_folder, files[0], files[1], threads)

        if (code != 'FAIL'):
            ## success stamps
            filename_stamp = sample_folder + '/.success_all'
            stamp = HCGB_time.print_time_stamp(filename_stamp)
            assembly_stats[name] = code # list containing dictionary of data and excel
        else:
            print("Some error occurred for sample %s while generating the assembly." % name)
def parse_search_options(arg_dict): ## outdir = os.path.abspath(arg_dict.input_folder) ## --------------------------------------- ## ## Project containing data ## --------------------------------------- ## if (arg_dict.project): print(colored('\t* BacDup project folder:.......[OK]', 'green')) ## set missing options arg_dict.pair = False arg_dict.include_all = True arg_dict.include_lane = True ## find samples previously parsed and prepared within a BacDup project structure pd_proteins = sampleParser.files.get_files(arg_dict, outdir, "parse", ["fa"], arg_dict.debug) pd_proteins = pd_proteins.drop(["dirname", "name", "ext", "tag"], axis=1) pd_proteins = pd_proteins.rename(index=str, columns={'sample': 'file_data'}) pd_proteins['format'] = 'fasta' pd_annot = sampleParser.files.get_files(arg_dict, outdir, "parse", ["annot_df.csv"], arg_dict.debug) pd_annot = pd_annot.drop(["dirname", "name", "ext", "tag"], axis=1) pd_annot = pd_annot.rename(index=str, columns={'sample': 'annot_table'}) ## merge into pd_samples_retrieved pd_samples_retrieved = pd.merge(pd_proteins, pd_annot) ## debug messages if (arg_dict.debug): debug_message('pd_proteins:', 'yellow') HCGB_main.print_all_pandaDF(pd_proteins) debug_message('pd_annot:', 'yellow') HCGB_main.print_all_pandaDF(pd_annot) debug_message('pd_samples_retrieved:', 'yellow') HCGB_main.print_all_pandaDF(pd_samples_retrieved) ## --------------------------------------- ## ## data on multiple sources ## --------------------------------------- ## elif (arg_dict.detached): print(colored('\t* Detached mode:.......[OK]', 'green')) ## parse samples provided print() ######################################################### ## BLAST raw results provided: either batch or single ######################################################### if (arg_dict.text_file): print( colored('\t* BLAST raw results provided:.......[OK]', 'green')) print() # *************************** ## ## Batch file provided # *************************** ## if (arg_dict.batch): ## debug messages if (arg_dict.debug): debug_message('+++++++++++++++++++++++++++++++') debug_message( 'Multiple BLAST results file provided option:', 'yellow') debug_message('arg_dict.text_file: ' + arg_dict.text_file, 'yellow') ## check if ok BacDup_functions.file_readable_check(arg_dict.text_file) print( colored( '\t* Multiple BLAST results files provided .......[OK]', 'green')) dict_entries = HCGB_main.file2dictionary( arg_dict.text_file, ',') ## check file is readable BacDup_functions.file_readable_check(arg_dict.annot_table) dict_entries_annot = HCGB_main.file2dictionary( arg_dict.annot_table, ',') ## Check dictionaries contain same information if (dict_entries.keys() == dict_entries_annot.keys()): for sample, files in dict_entries.items(): ## check annot_table and fasta_file headers are the same ## return_code = dup_searcher.check_annot_table( dict_entries_annot[sample], files, 'BLAST', arg_dict.debug) if not (return_code): print( 'Process will continue but sample %s would be discarded' % sample) else: print() ## fill dataframe pd_samples_retrieved # *************************** ## ## single file provided # *************************** ## else: ## check annot_table and fasta_file headers are the same ## return_code = dup_searcher.check_annot_table( arg_dict.annot_table, arg_dict.text_file, 'BLAST', arg_dict.debug) if not (return_code): print('Process will stop here. 
Please check input files') exit() else: print() ## fill dataframe pd_samples_retrieved ######################################################### ## annotations file provided: either batch or single ######################################################### elif (arg_dict.annot_file): ## debug messages if (arg_dict.debug): debug_message('Multiple BLAST results file provided option:', 'yellow') debug_message('arg_dict.annot_file: ' + arg_dict.annot_file, 'yellow') ## get input info df_accID = input_parser.parse_options(arg_dict) if (arg_dict.debug): debug_message('df_accID', 'yellow') print(df_accID) ## parse info input_parser.parse_information(arg_dict, df_accID, outdir) ## set missing options arg_dict.pair = False arg_dict.include_all = True arg_dict.include_lane = True ## find samples previously parsed and prepared within a BacDup project structure pd_proteins = sampleParser.files.get_files(arg_dict, outdir, "parse", ["fa"], arg_dict.debug) pd_annot = sampleParser.files.get_files(arg_dict, outdir, "parse", ["annot_df.csv"], arg_dict.debug) ## merge into pd_samples_retrieved frames = [pd_proteins, pd_annot] pd_samples_retrieved = pd.concat(frames, sort=True, join='outer') if (arg_dict.debug): debug_message('pd_samples_retrieved', 'yellow') print(pd_samples_retrieved) ######################################################### ## CDS fasta and annotations provided: either batch or single ######################################################### elif arg_dict.fasta_prot: # *************************** ## ## Batch file provided # *************************** ## if (arg_dict.batch): print( colored('\t* Multiple FASTA files provided .......[OK]', 'green')) ## debug messages if (arg_dict.debug): debug_message('+++++++++++++++++++++++++++++++') debug_message( 'Multiple Protein FASTA files provided option:', 'yellow') debug_message( 'arg_dict.fasta_prot: ' + arg_dict.fasta_prot, 'yellow') ## check if ok BacDup_functions.file_readable_check(arg_dict.fasta_prot) dict_entries = HCGB_main.file2dictionary( arg_dict.fasta_prot, ',') ## check file is readable BacDup_functions.file_readable_check(arg_dict.annot_table) print( colored( '\t* Multiple annotation tables provided .......[OK]', 'green')) dict_entries_annot = HCGB_main.file2dictionary( arg_dict.annot_table, ',') ## Check dictionaries contain right information if (dict_entries.keys() == dict_entries_annot.keys()): for sample, files in dict_entries.items(): ## check annot_table and fasta_file headers are the same ## return_code = dup_searcher.check_annot_table( dict_entries_annot[sample], files, 'fasta', arg_dict.debug) if not (return_code): print( 'Process will continue but sample %s would be discarded' % sample) else: print() ## fill dataframe pd_samples_retrieved # *************************** ## ## single file provided # *************************** ## else: print( colored('\t* Protein FASTA file provided .......[OK]', 'green')) BacDup_functions.file_readable_check(arg_dict.fasta_prot) ## check file is readable print( colored('\t* An annotation table provided .......[OK]', 'green')) BacDup_functions.file_readable_check(arg_dict.annot_table) ## check annot_table and fasta_file headers are the same ## return_code = dup_searcher.check_annot_table( arg_dict.annot_table, arg_dict.fasta_prot, 'fasta', arg_dict.debug) if not (return_code): print('Process will stop here. Please check input files') exit() else: print() ## fill dataframe pd_samples_retrieved exit() ### What?? 
else: ## Nespresso print() ## return information pd_samples_retrieved = pd_samples_retrieved.set_index('new_name') return (pd_samples_retrieved)
def run_search(arg_dict): """Main function of the search module in BacDup package. This module searches and create gene duplication analysis. It allows the user to provide either a previous parsed data project (NCBI Genbank IDs, taxonomy or user annotation data) or a single or multiple samples. """ ## help message if (arg_dict.input_help): help_input() exit() if (arg_dict.blast_help): info.blast_help() exit() if (arg_dict.project_help): info.project_help() exit() if (arg_dict.detached_mode_help): info.detached_mode() exit() ### Start the analysis BacDup_functions.pipeline_header('BacDup') HCGB_aes.boxymcboxface("Search module") print("--------- Starting Process ---------") HCGB_time.print_time() ## init time start_time_total = time.time() ## absolute path for in & out outdir = os.path.abspath(arg_dict.input_folder) ## project or detached? if arg_dict.detached: arg_dict.project = False ## output folder print("\n+ Create output folder(s):") HCGB.functions.files_functions.create_folder(outdir) else: arg_dict.project = True ## debug messages if (arg_dict.debug): debug_message('+++++++++++++++++++++++++++++++') debug_message('Project/Detached option:', 'yellow') debug_message('arg_dict.detached: ' + str(arg_dict.detached), 'yellow') debug_message('arg_dict.project: ' + str(arg_dict.project), 'yellow') debug_message('outdir:' + outdir, 'yellow') debug_message('+++++++++++++++++++++++++++++++') ## get files print() HCGB_aes.print_sepLine("-", 50, False) print('+ Getting information provided... ') print('+ Several options available:') print('\t* BacDup project folder with initiated data') print('\t* Single/Multiple Annotation file:') print('\t |-- GenBank format files') print('\t |-- GFF files + Reference fasta files required') print('\t* Single/Multiple raw BLAST results files') print('\t* Single/Multiple fasta proteins + annotation table') print("""\n\n**** NOTE: **** For additional options (e.g. 
Single/Multiple NCBI GenBank or taxonomy IDs) use the input module to accommodate accordingly """) time.sleep(1) print() ## parse options pd_samples_retrieved = parse_search_options(arg_dict) ## time stamp start_time_partial = HCGB_time.timestamp(start_time_total) ## for each sample dict_search_folders = HCGB.functions.files_functions.outdir_project( outdir, arg_dict.project, pd_samples_retrieved, "search", arg_dict.debug) dict_dup_folders = HCGB.functions.files_functions.outdir_project( outdir, arg_dict.project, pd_samples_retrieved, "dups", arg_dict.debug) dict_parse_folders = HCGB.functions.files_functions.outdir_project( outdir, arg_dict.project, pd_samples_retrieved, "parse", arg_dict.debug) ## create results data2add = pd.DataFrame(columns=BacDup_functions.columns_dup_table()) for sample, folder in dict_search_folders.items(): annot_timestamp = os.path.join(dict_dup_folders[sample], '.annot_success') dup_annot_file = os.path.join(dict_dup_folders[sample], 'dup_annot.csv') ## annotation annot_table_file = pd_samples_retrieved.loc[sample, 'annot_table'] if (not HCGB.functions.files_functions.is_non_zero_file( annot_timestamp)): ## get results file_data = pd_samples_retrieved.loc[sample, 'file_data'] format = pd_samples_retrieved.loc[sample, 'format'] filtered_data = dup_searcher.filter_data( sample, file_data, format, arg_dict.pident, arg_dict.evalue, arg_dict.percentage, arg_dict.bitscore, folder, arg_dict.debug) ## timestamps filter_timestamp = os.path.join(dict_dup_folders[sample], '.filter_success') if (not HCGB.functions.files_functions.is_non_zero_file( filter_timestamp)): #save results as a .csv file sort_csv = os.path.abspath( os.path.join(dict_dup_folders[sample], 'filtered_results.csv')) filtered_data.to_csv(sort_csv, header=True, index=False) ## print time stamp HCGB_time.print_time_stamp(filter_timestamp) else: read_time = HCGB_time.read_time_stamp(filter_timestamp) print( colored( "\t+ Filter results already available for sample %s [%s]" % (sample, read_time), 'green')) ## get annotation (dup_annot_df, data2add_entry) = dup_searcher.get_dupannot( sample, filtered_data, annot_table_file, arg_dict.debug) ## info_dup_file = os.path.join(dict_dup_folders[sample], 'info_dup.csv') data2add_entry.to_csv(info_dup_file, header=True, index=False) ## save into file dup_annot_df.to_csv(dup_annot_file, header=True) ## print time stamp HCGB_time.print_time_stamp(annot_timestamp) else: read_time = HCGB_time.read_time_stamp(annot_timestamp) print( colored( "\t+ Duplicate annotation already available for sample %s [%s]" % (sample, read_time), 'green')) ## add info for each dup_annot_df = HCGB_main.get_data(dup_annot_file, ',', "index_col=0") annot_table = HCGB_main.get_data(annot_table_file, ',', "index_col=0") data2add_entry = dup_searcher.get_dup_stats( sample, dup_annot_df, annot_table, arg_dict.debug) ## add genome length data data2add_entry['genome_len'] = '' len_df_file = os.path.join(dict_parse_folders[sample], 'length_df.csv') if os.path.isfile(len_df_file): len_data = HCGB_main.get_data(len_df_file, ',', "header=None") data2add_entry['genome_len'] = len_data[1].sum() ## merge data #data2add_entry = data2add_entry.reset_index() data2add = data2add.append(data2add_entry, ignore_index=False) ### report generation HCGB_aes.boxymcboxface("Summarizing duplicated search") outdir_report = HCGB.functions.files_functions.create_subfolder( "report", outdir) dups_report = HCGB.functions.files_functions.create_subfolder( "dups", outdir_report) ## add data2add 
data2add.to_csv(os.path.join(dups_report, 'info_annot.csv'), index=True, header=True) ## maybe add a summary of the files? print("\n*************** Finish *******************") start_time_partial = HCGB_time.timestamp(start_time_total) print("+ Exiting search module.") return ()
def agrvate_call(sample, assembly_file, folder, debug=False):
    """agrvate call and check results."""

    ## prepare call
    log_call = os.path.join(folder, "agrvate_cmd.log")
    err_call = os.path.join(folder, "agrvate_cmd.err")
    agrvate_bin = set_config.get_exe('agrvate')

    ## system call
    cmd_call = "%s -i %s -m -f > %s 2> %s " %(agrvate_bin, assembly_file,
                                              log_call, err_call) ## use mummer (-m) and force results folder (-f)
    status = HCGB_sys.system_call(cmd_call)

    ## check results
    ## see https://github.com/VishnuRaghuram94/AgrVATE#results for additional details
    results = pd.DataFrame()

    ## check folder is created
    assembly_file_name = os.path.basename(assembly_file).split('.fna')[0]
    original_results_folder = os.path.join(folder, assembly_file_name + '-results')
    results_folder = os.path.join(folder, 'agrvate_results')

    if os.path.isdir(original_results_folder):
        print("+ Results folder generated OK")
        print("+ Check results generated:")

        ## rename folder
        os.rename(original_results_folder, results_folder)
        os.rename(os.path.join(folder, assembly_file_name + '.fna-error-report.tab'),
                  os.path.join(results_folder, 'error_report.tab'))

        ## write to excel
        file_name_Excel = os.path.join(folder, sample + '_agr_results.xlsx')
        writer_Excel = pd.ExcelWriter(file_name_Excel, engine='xlsxwriter') ## open excel handle

        ## get all files
        list_files = HCGB_main.get_fullpath_list(results_folder)

        ## summary tab
        summary_tab_file = [s for s in list_files if s.endswith("summary.tab")][0]
        summary_tab = HCGB_main.get_data(summary_tab_file, '\t', options="")
        summary_tab['sample'] = sample

        ## columns
        # agr_group: gp1/gp2/gp3/gp4. 'u' means unknown.
        #            If multiple agr groups were found (col 5 = m),
        #            the displayed agr group is the majority/highest confidence.
        # match_score: maximum 15; 0 means untypeable; < 5 means low confidence.
        # canonical_agrD: 1 means canonical; 0 means non-canonical; u means unknown.
        # multiple_agr: s means single, m means multiple, u means unknown.
        #            Multiple groups are likely found due to multiple S. aureus isolates in sequence.
        # frameshifts: Number found in CDS of extracted agr operon ('u' if agr operon not extracted)

        ## debug messages
        if debug:
            HCGB_aes.debug_message("agrvate results: Summary tab file", 'yellow')
            print(summary_tab_file)
            print(summary_tab)

        ## add summary results to all results
        del summary_tab['#filename']
        results = summary_tab.copy()

        ## save summary_tab into excel
        ## tab summary
        summary_tab.to_excel(writer_Excel, sheet_name='summary') ## write excel handle

        ## agr_gp tab
        agr_gp_tab_file = [s for s in list_files if s.endswith("agr_gp.tab")][0]
        if HCGB_files.is_non_zero_file(agr_gp_tab_file):
            agr_gp_tab = HCGB_main.get_data(agr_gp_tab_file, '\t', options='header=None')
            agr_gp_tab.columns = ['contig', 'agr', 'evalue', 'identity', 'start', 'end']
            agr_gp_tab['sample'] = sample

            ## columns
            ## Assembly Contig ID
            ## ID of matched agr group kmer
            ## evalue
            ## Percentage identity of match
            ## Start position of kmer alignment on input sequence
            ## End position of kmer alignment on input sequence

            ## debug messages
            if debug:
                HCGB_aes.debug_message("agrvate results: agr_gp file", 'yellow')
                print(agr_gp_tab_file)
                print(agr_gp_tab)

            ## save agr_gp_tab file into excel
            ## tab operon
            agr_gp_tab.to_excel(writer_Excel, sheet_name='operon') ## write excel handle

        ## agr_operon fna
        try:
            agr_operon_fna_file = [s for s in list_files if s.endswith("agr_operon.fna")][0]
            ## debug messages
            if debug:
                HCGB_aes.debug_message("agrvate results: agr_operon file", 'yellow')
                print(agr_operon_fna_file)

            results['operon_fna'] = agr_operon_fna_file
        except:
            results['operon_fna'] = ''

        ## error report tab
        error_report_file = [s for s in list_files if s.endswith("error_report.tab")][0]
        error_report = HCGB_main.get_data(error_report_file, '\t', options="")
        del error_report['#input_name']

        ## debug messages
        if debug:
            HCGB_aes.debug_message("agrvate results: error_report.tab file", 'yellow')
            print(error_report_file)
            print(error_report)

        ## save error_report file into excel
        ## tab steps
        error_report.to_excel(writer_Excel, sheet_name='steps') ## write excel handle

        ## merge results
        results = pd.concat([results, error_report], axis=1)

        ## close xlsx file
        writer_Excel.save() ## close excel handle

        ## add to pandas dataframe
        results['agr_operon_xlsx'] = file_name_Excel

    ## debug messages
    if debug:
        HCGB_aes.debug_message("agrvate results", 'yellow')
        HCGB_main.print_all_pandaDF(results)

    return (results)
def gbf_parser(gbf_file, list_out_files, debug=False):

    ## create dataframe.
    ## get common column names
    columns = columns_annot_table()
    annot_df = pd.DataFrame(data=None, columns=columns)
    genome_length = pd.DataFrame(data=None, columns=["length"])

    for rec in SeqIO.parse(gbf_file, "genbank"):
        # get genome length for BioCircos plotting
        ID = rec.id
        genome_length.loc[ID, ["length"]] = [len(rec.seq)]

        ## debug messages
        if (debug):
            debug_message('GenBank record', 'yellow')
            print(rec)

        ## loop through features
        for feature in rec.features:

            # sort by CDS type. Duplicate genes analysis needs coding regions to proteins.
            if feature.type == "CDS":
                genome_seq = rec.seq[feature.location.nofuzzy_start:feature.location.nofuzzy_end]

                if int(feature.strand) > 0:
                    strand = "pos"
                else:
                    strand = "neg"
                    genome_seq = genome_seq.reverse_complement()

                # we create an ID for each entry
                protID = feature.type + "_" + rec.id + "_" + str(feature.location.nofuzzy_start) + "_" + \
                         str(feature.location.nofuzzy_end) + "_" + strand
                annot_df.loc[protID, ["rec_id", "start", "end", "strand"]] = [
                    ID, feature.location.nofuzzy_start, feature.location.nofuzzy_end, strand]

                qualif = feature.qualifiers
                pseudo = False

                ## Debug messages
                if (debug):
                    debug_message('protID: ' + protID, 'yellow')
                    debug_message('qualif: ', 'yellow')
                    print(qualif)
                    debug_message('feature: ', 'yellow')
                    print(feature)
                    debug_message('genome_seq: ', 'yellow')
                    print(genome_seq)

                pseudo_seq = ""

                ## fill dataframe
                for keys, values in qualif.items():
                    if keys not in columns:
                        continue

                    ## Save keys into dataframe
                    annot_df.loc[protID, [keys]] = [values[0]]

                    ####################################
                    ## Pseudogenes:
                    ####################################
                    if keys == "pseudo":
                        pseudo = True ## set pseudo True/False
                        annot_df.loc[protID, ["pseudo"]] = ["True"]

                        table_code = feature.qualifiers["transl_table"][0]
                        pseudo_seq = genome_seq.translate(table=table_code, to_stop=False)
                        if pseudo_seq.endswith("*"):
                            pseudo_seq = pseudo_seq[:-1]

                        ## Debug messages
                        if (debug):
                            print("***************************************")
                            debug_message('Pseudogene: ', 'yellow')
                            print("***************************************")
                            debug_message('feature.location.nofuzzy_start: ', 'yellow')
                            print(feature.location.nofuzzy_start)
                            debug_message('feature.location.nofuzzy_end: ', 'yellow')
                            print(feature.location.nofuzzy_end)
                            debug_message('Translation table code: ', 'yellow')
                            print(table_code)
                            debug_message('genome_seq: ', 'yellow')
                            print(genome_seq)
                            debug_message('pseudo_seq: ', 'yellow')
                            print(pseudo_seq)

                ## create a sequence fasta entry
                if (pseudo):
                    # Pseudogenes have no translation item
                    # set translated CDS even including *
                    if len(pseudo_seq) != 0:
                        gene_seq = pseudo_seq
                    else:
                        ## sometimes it might fail
                        gene_seq = Seq.Seq('***')
                else:
                    ## CDS provided by genbank
                    gene_seq = Seq.Seq(feature.qualifiers["translation"][0])

                yield (SeqRecord(gene_seq, protID, "", ""))

    ## print to file
    annot_df.to_csv(list_out_files[1], header=True)
    genome_length.to_csv(list_out_files[2], header=False)

    ## debug messages
    if (debug):
        debug_message('annot_df: ', 'yellow')
        print(annot_df)

    return ()
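## Hedged usage sketch (illustrative only): gbf_parser() is a generator yielding protein
## SeqRecords, so it is typically consumed with SeqIO.write(); list_out_files is assumed to
## hold (protein fasta, annotation csv, length csv) paths, matching the indexing used above.
# list_out_files = ["sample_proteins.fa", "sample_annot_df.csv", "sample_length_df.csv"]
# with open(list_out_files[0], "w") as out_handle:
#     SeqIO.write(gbf_parser("sample.gbk", list_out_files, debug=False), out_handle, "fasta")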
def run_input(arg_dict):
    """Main function of the input_parser module in BacDup package.

    This module prepares data for later gene duplication analysis.

    It allows the user to provide either a single sample, multiple samples, NCBI
    GenBank IDs or NCBI taxonomy IDs to retrieve and obtain the annotation data.
    """

    ## help message
    if (arg_dict.input_help):
        help_input()
        exit()

    BacDup_functions.pipeline_header('BacDup')
    HCGB_aes.boxymcboxface("Preparing input files")
    print("--------- Starting Process ---------")
    HCGB_time.print_time()

    ## init time
    start_time_total = time.time()

    ## absolute path for in & out
    #input_dir = os.path.abspath(options.input)
    outdir = os.path.abspath(arg_dict.output_folder)

    ## output folder
    print("\n+ Create output folder(s):")
    HCGB_files.create_folder(outdir)

    ## set defaults
    if not (arg_dict.assembly_level):
        arg_dict.assembly_level = 'complete'
    if not (arg_dict.section):
        arg_dict.section = 'genbank'

    ## project or detached?
    if arg_dict.detached:
        arg_dict.project = False
        final_dir = outdir
        data_dir = outdir
    else:
        arg_dict.project = True
        print("+ Generate a directory containing information within the project folder provided")
        final_dir = HCGB_files.create_subfolder("info", outdir)

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        debug_message('Project/Detached option:', 'yellow')
        debug_message('arg_dict.detached: ' + str(arg_dict.detached), 'yellow')
        debug_message('arg_dict.project: ' + str(arg_dict.project), 'yellow')
        debug_message('outdir:' + outdir, 'yellow')
        debug_message('final_dir:' + final_dir, 'yellow')
        debug_message('+++++++++++++++++++++++++++++++')

    ## get files
    print()
    HCGB_aes.print_sepLine("-", 50, False)
    print('+ Getting input information provided... ')
    print('+ Several options available:')
    print('\t* Single/Multiple Annotation file:')
    print('\t |-- GenBank format files')
    print('\t |-- GFF files + Reference fasta files required')
    print('\n\t* Single/Multiple NCBI GenBank IDs')
    print('\n\t* Single/Multiple NCBI taxonomy IDs + Options')
    print('\n\t* A previous BacDup project folder')
    print('\n+ Check the option provided...')
    time.sleep(1)

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_total)

    #################################################
    ## Parse and obtain the type of input information provided
    #################################################
    df_accID = parse_options(arg_dict)
    ## pd.DataFrame: 'new_name','folder','genus',
    ##               'species','taxonomy','genome',
    ##               'annot_file','format_annot_file', 'proteins',
    ##               'plasmids_number','plasmids_ID'

    ## time stamp
    start_time_partial = HCGB_time.timestamp(start_time_partial)

    ## parse information accordingly
    parse_information(arg_dict, df_accID, outdir)

    ### report generation
    HCGB_aes.boxymcboxface("Summarizing input files")
    outdir_report = HCGB_files.create_subfolder("report", outdir)
    input_report = HCGB_files.create_subfolder("input", outdir_report)

    ## add df_accID.loc[sample,] information as csv into input folder
    df_accID.to_csv(os.path.join(input_report, 'info.csv'), index=True, header=True)

    ## maybe add a summary of the files?

    print("\n*************** Finish *******************")
    start_time_partial = HCGB_time.timestamp(start_time_total)

    print("+ Exiting Input module.")
    return ()
def parse_options(arg_dict):

    outdir = os.path.abspath(arg_dict.output_folder)

    ## TODO: Now set as mutually_exclusive group. It might be set to multiple options
    ## ATTENTION: df_accID merge generated dataframe

    ## --------------------------------------- ##
    ## GFF or GBF file
    ## --------------------------------------- ##
    if (arg_dict.annot_file):
        arg_dict.annot_file = os.path.abspath(arg_dict.annot_file)

        # *************************** ##
        ## multiple files provided
        # *************************** ##
        if (arg_dict.batch):
            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Multiple annotation file provided option:', 'yellow')
                debug_message('arg_dict.annot_file: ' + arg_dict.annot_file, 'yellow')

            ## check if ok
            BacDup_functions.file_readable_check(arg_dict.annot_file)

            print(colored('\t* Multiple annotation files provided .......[OK]', 'green'))
            dict_entries = HCGB_main.file2dictionary(arg_dict.annot_file, ',')

            ## debug messages
            if (arg_dict.debug):
                debug_message('dict_entries: ', 'yellow')
                debug_message(dict_entries, 'yellow')
                debug_message('+++++++++++++++++++++++++++++++\n\n')

        # *************************** ##
        ## single file provided
        # *************************** ##
        else:
            dict_entries = {}
            print(colored('\t* Annotation file:.......[OK]', 'green'))
            if (arg_dict.sample_name):
                sample_name = arg_dict.sample_name
            else:
                sample_name = "sample"

            dict_entries[sample_name] = arg_dict.annot_file

        ## create dataframe df_accID to match other formats
        df_accID = pd.DataFrame(columns=(BacDup_functions.columns_accID_table()))

        for name, file_annot in dict_entries.items():
            file_annot = os.path.abspath(file_annot)

            ## init all
            genome = ""
            prot = ""
            gff = ""
            gbk = ""
            plasmid_count = ""
            plasmid_id = ""

            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('dict_entries check annotation files provided option:', 'yellow')
                debug_message('name: ' + name, 'yellow')
                debug_message('file_annot: ' + file_annot, 'yellow')

            ## check file is valid
            BacDup_functions.file_readable_check(file_annot)

            ## get format
            format = format_checker.is_format(file_annot, arg_dict.debug)

            if (arg_dict.debug):
                debug_message('format: ' + format, 'yellow')

            ## parse accordingly
            taxonomy = ""
            organism = ""
            taxonomy_string = ""
            genus = ""
            if (format == 'gbk'):
                ## get information from each sample
                (taxonomy, organism) = BacDup.scripts.functions.get_gbk_information(file_annot, arg_dict.debug)
                ## plasmid_count, plasmid_id not available

            elif (format == 'gff'):
                if (arg_dict.ref_file):
                    arg_dict.ref_file = os.path.abspath(arg_dict.ref_file)
                    BacDup_functions.file_readable_check(arg_dict.ref_file)

                    if (arg_dict.batch):
                        ref_entries = HCGB_main.file2dictionary(arg_dict.ref_file, ',')
                        genome = ref_entries[name]
                    else:
                        genome = arg_dict.ref_file

            ## save into dataframe
            if len(taxonomy) > 1:
                genus = taxonomy[-1]
                taxonomy_string = ";".join(taxonomy)

            dir_path = os.path.abspath(os.path.dirname(file_annot))
            df_accID.loc[len(df_accID)] = (name, dir_path, genus, organism,
                                           taxonomy_string, genome, file_annot,
                                           format, prot, plasmid_count,
                                           ";".join(plasmid_id))

    ## --------------------------------------- ##
    ## NCBI RefSeq/GenBank IDs: GCA_XXXXXXXX.1; GCF_XXXXXXXXX.1
    ## --------------------------------------- ##
    elif (arg_dict.GenBank_id):
        ## get database path
        if (arg_dict.db_folder):
            db_folder = HCGB_files.create_folder(os.path.abspath(arg_dict.db_folder))
        else:
            db_folder = HCGB_files.create_subfolder("db", os.path.abspath(arg_dict.output_folder))

        ## debug messages
        if (arg_dict.debug):
            debug_message('+++++++++++++++++++++++++++++++')
            debug_message('GenBank ID option:', 'yellow')
            debug_message('db_folder: ' + db_folder, 'yellow')

        # *************************** ##
        ## batch file
        # *************************** ##
        if (arg_dict.batch):
            arg_dict.GenBank_id = os.path.abspath(arg_dict.GenBank_id)

            ## debug messages
            if (arg_dict.debug):
                debug_message('GenBank ID batch file provided:', 'yellow')
                debug_message('arg_dict.GenBank_id: ' + arg_dict.GenBank_id, 'yellow')

            ## check is a file and readable
            BacDup_functions.file_readable_check(arg_dict.GenBank_id)

            print(colored('\t* Multiple NCBI GenBank IDs in a file .......[OK]', 'green'))
            print()

            ## read IDs into a list, discarding empty lines
            strains2get = HCGB_main.readList_fromFile(arg_dict.GenBank_id)
            strains2get = list(filter(None, strains2get))

            ## debug messages
            if (arg_dict.debug):
                debug_message('strains2get: ' + str(strains2get), 'yellow')

            ## call NCBI_downloader
            df_accID = BacDup.scripts.NCBI_downloader.NCBI_download_list(strains2get, db_folder,
                                                                         arg_dict.debug,
                                                                         arg_dict.assembly_level)

        # *************************** ##
        ## single GenBank ID
        # *************************** ##
        else:
            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Single NCBI GenBank ID provided option:', 'yellow')
                debug_message('arg_dict.GenBank_id: ' + arg_dict.GenBank_id, 'yellow')
                debug_message('db_folder: ' + db_folder, 'yellow')
                debug_message('+++++++++++++++++++++++++++++++')

            ## download
            print(colored('\t* An NCBI GenBank ID:.......[OK]', 'green'))
            print()
            HCGB_aes.print_sepLine("+", 75, False)
            df_accID = BacDup.scripts.NCBI_downloader.NCBIdownload(arg_dict.GenBank_id,
                                                                   db_folder, arg_dict.debug)

    ## --------------------------------------- ##
    ## NCBI Taxonomy ID
    ## --------------------------------------- ##
    elif (arg_dict.tax_id):
        #################
        ## get tax ids
        #################
        if (arg_dict.batch):
            print(colored('\t* Multiple NCBI Taxonomy IDs in a file .......[OK]', 'green'))

            ## debug messages
            if (arg_dict.debug):
                debug_message('+++++++++++++++++++++++++++++++')
                debug_message('Multiple NCBI Taxonomy IDs provided option:', 'yellow')

            ## check is a file and readable
            BacDup_functions.file_readable_check(arg_dict.tax_id)

            ## get IDs into a list
            taxIDs2get = HCGB_main.readList_fromFile(arg_dict.tax_id)

        else:
            print(colored('\t* An NCBI Taxonomy ID:.......[OK]', 'green'))
            taxIDs2get = [arg_dict.tax_id]

        print()

        ##################################
        ## init ete NCBI taxonomy database
        ##################################
        print('+ Initiate NCBI taxonomy database...')
        ncbi = taxonomy_retrieval.init_db_object(arg_dict.debug)

        string_info_total = []
        for taxid in taxIDs2get:
            ## parse
            info = taxonomy_retrieval.parse_taxid(taxid, ncbi, 'unravel', arg_dict.debug)
            print()

            ## debug messages
            if arg_dict.debug:
                debug_message("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
                debug_message('info\n', "yellow")
                print(info)

            ## append results for each taxid provided
            string_info_total.extend(info)

        ## convert to list of strings
        string_info_total = [str(entry) for entry in string_info_total]

        ## assume all belong to the same superkingdom if children of the same tax_id
        group_obtained = taxonomy_retrieval.get_superKingdom(string_info_total[0], ncbi, arg_dict.debug)

        #################
        ## get database path
        #################
        if (arg_dict.db_folder):
            db_folder = HCGB_files.create_folder(os.path.abspath(arg_dict.db_folder))
        else:
            db_folder = HCGB_files.create_subfolder("db", outdir)

        ## debug messages
        if arg_dict.debug:
            debug_message("+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
            debug_message('group_obtained: ' + group_obtained, "yellow")
            debug_message('db_folder: ' + db_folder, "yellow")
            debug_message('arg_dict.assembly_level: ' + arg_dict.assembly_level, "yellow")
            debug_message('arg_dict.section: ' + arg_dict.section, "yellow")

        ##################################
        ## get GenBank entries selected
        ##################################
        (strains2get, allstrains_available) = taxonomy_retrieval.get_GenBank_ids(
            db_folder, string_info_total, int(arg_dict.k_random), arg_dict.debug,
            assembly_level_given=arg_dict.assembly_level,
            group_given=group_obtained,
            section_given=arg_dict.section)

        ## print list and dictionary of possible and selected taxIDs
        outdir = os.path.abspath(arg_dict.output_folder)
        info_dir = HCGB_files.create_subfolder("info", outdir)
        input_info_dir = HCGB_files.create_subfolder("input", info_dir)
        HCGB_main.printList2file(os.path.join(input_info_dir, 'Downloaded.txt'), strains2get)
        HCGB_main.printList2file(os.path.join(input_info_dir, 'all_entries.txt'), allstrains_available)

        ## save into file
        file_info = os.path.join(input_info_dir, 'info.txt')

        ## stop here if dry_run
        if arg_dict.dry_run:
            print()
            HCGB_aes.print_sepLine("*", 75, False)
            print("ATTENTION: Dry run mode selected. Stopping the process here.")
            HCGB_aes.print_sepLine("*", 75, False)
            print("+ All available entries listed and printed in file:\n\t" +
                  os.path.join(input_info_dir, 'all_entries.txt'))
            print("+ Subset of entries generated and printed in file:\n\t" +
                  os.path.join(input_info_dir, 'Downloaded.txt'))
            print("\n\nIf random selection was used, bear in mind that re-running this process might produce different results.\n")
            HCGB_aes.print_sepLine("*", 75, False)
            print()
            exit()

        #################
        ## call NCBI_downloader
        #################
        df_accID = BacDup.scripts.NCBI_downloader.NCBI_download_list(strains2get, db_folder,
                                                                     arg_dict.debug,
                                                                     arg_dict.assembly_level)

    ## --------------------------------------- ##
    ## Previous BacDup analysis folder
    ## --------------------------------------- ##
    ## TODO
    elif (arg_dict.project):
        print(colored('\t* A previous BacDup analysis project folder:.......[OK]', 'green'))
        ## create df_accID to store data
        ## TODO

    ## Returns dataframe with information
    df_accID = df_accID.set_index('new_name')
    return (df_accID)
def parse_information(arg_dict, df_accID, outdir):

    ### Parse df_accID
    dict_input_folders = HCGB_files.outdir_project(outdir, arg_dict.project, df_accID, "input", arg_dict.debug)
    dict_parse_folders = HCGB_files.outdir_project(outdir, arg_dict.project, df_accID, "parse", arg_dict.debug)

    ## debug messages
    if (arg_dict.debug):
        debug_message('+++++++++++++++++++++++++++++++')
        print("dict_input_folders")
        print(dict_input_folders)
        print("dict_parse_folders")
        print(dict_parse_folders)

    ## parse each sample retrieved
    for sample, folder_input in dict_input_folders.items():
        if (arg_dict.debug):
            debug_message('sample: ' + sample, 'yellow')
            debug_message('folder_input: ' + folder_input, 'yellow')
            debug_message('folder_parse: ' + dict_parse_folders[sample], 'yellow')
            debug_message('annot_file: ' + df_accID.loc[sample, 'annot_file'], 'yellow')
            debug_message('genome: ' + df_accID.loc[sample, 'genome'], 'yellow')

        ## timestamps
        input_timestamp = os.path.join(folder_input, '.success')
        parse_timestamp = os.path.join(dict_parse_folders[sample], '.success')

        print()
        print("\t+ Parsing sample: " + sample)

        if (not HCGB_files.is_non_zero_file(parse_timestamp) and
                not HCGB_files.is_non_zero_file(input_timestamp)):

            ## TODO: Set threads to use in parallel
            process_OK = parse_annot_file(sample, folder_input,
                                          df_accID.loc[sample, 'annot_file'],
                                          dict_parse_folders[sample],
                                          arg_dict.debug,
                                          df_accID.loc[sample, 'genome'])

            if (process_OK):
                ## link or copy annotation file into folder_input
                HCGB_files.get_symbolic_link_file(df_accID.loc[sample, 'annot_file'], folder_input)

                ## add df_accID.loc[sample,] information as csv into input folder
                df_accID.loc[sample, ].to_csv(os.path.join(folder_input, 'info.csv'),
                                              index=True, header=True)

                ## print time stamps
                HCGB_time.print_time_stamp(input_timestamp)
                HCGB_time.print_time_stamp(parse_timestamp)

            else:
                print(colored("\t+ An error occurred for sample %s while parsing input options" % sample, 'red'))

                ## print failure time stamps
                HCGB_time.print_time_stamp(os.path.join(folder_input, '.fail'))
                HCGB_time.print_time_stamp(os.path.join(dict_parse_folders[sample], '.fail'))
        else:
            read_time = HCGB_time.read_time_stamp(parse_timestamp)
            print(colored("\t+ Input parsing already available for sample %s [%s]" % (sample, read_time), 'green'))
            print()
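## Note: parse_information() re-parses a sample only when neither the input
## nor the parse '.success' marker exists; otherwise the stored timestamp is
## reported and the sample is skipped. Failures are flagged with '.fail'
## markers, which do not block a later re-parse. The per-sample folder layout
## itself is determined by HCGB_files.outdir_project().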