def download_compressed(zip_uri, species_name, schema_name,
                        download_folder, headers_get):
    """Download and extract a ZIP archive with a ready-to-use schema.

    Downloads a compressed version of a schema from the Chewie-NS,
    extracts it and deletes the ZIP archive afterwards.

    Parameters
    ----------
    zip_uri : str
        Endpoint URL to make the request to download
        the compressed schema.
    species_name : str
        Scientific name of the schema species.
    schema_name : str
        Name of the schema in the Chewie-NS.
    download_folder : str
        Path to the directory inside which the schema
        directory will be created.
    headers_get : dict
        HTTP headers for GET requests.

    Returns
    -------
    schema_path : str
        Path to the directory the ZIP archive contents
        were extracted to.
    """
    # basename is the lowercased first letter of the species name, the
    # last word of the species name and the schema name, e.g.:
    # 'Streptococcus agalactiae', 'wgMLST' -> 'sagalactiae_wgMLST'
    schema_basename = '{0}{1}_{2}'.format(species_name[0].lower(),
                                          species_name.split(' ')[-1],
                                          schema_name)
    # build both names from the basename instead of splitting the ZIP
    # name on '.zip', which truncated names containing '.zip' internally
    zip_name = schema_basename + '.zip'
    schema_path = os.path.join(download_folder, schema_basename)
    fo.create_directory(schema_path)

    # download ZIP archive
    _, zip_response = cr.simple_get_request(
        zip_uri, headers_get, parameters={'request_type': 'download'})

    # context manager guarantees the archive is flushed and closed
    # before it is unpacked and removed
    zip_path = os.path.join(schema_path, zip_name)
    with open(zip_path, 'wb') as zip_file:
        zip_file.write(zip_response.content)

    # uncompress
    print('Decompressing schema...')
    shutil.unpack_archive(zip_path, extract_dir=schema_path)

    # delete ZIP
    os.remove(zip_path)

    return schema_path
def main(input_files, output_directory, protein_table, blast_score_ratio,
         cpu_cores, taxa, proteome_matches, no_cleanup, blast_path):
    """Annotate schema loci and write the results to a table.

    Annotations are retrieved from UniProt's SPARQL endpoint and,
    if `taxa` is provided, from alignment against UniProt's
    reference proteomes. Information from a "cds_info" table can
    be merged into the final table.

    Parameters
    ----------
    input_files : str
        Input passed to `pv.check_input_type` to obtain the list of
        loci files. It is also used as the directory in which the
        '.schema_config' file is searched for.
    output_directory : str
        Path to the output directory (created by this function).
    protein_table : str or None
        Path to the "cds_info.tsv" file created by CreateSchema.
        Skipped if None.
    blast_score_ratio : float
        BLAST Score Ratio value passed to `proteome_annotations`.
    cpu_cores : int
        Number of cores/threads passed to the annotation steps.
    taxa : list or None
        Taxa names used to select reference proteomes. The proteome
        step is skipped if None.
    proteome_matches : int
        Maximum number of proteome matches to report.
    no_cleanup : bool
        If False, the temporary directory is deleted at the end.
    blast_path : str
        Path to BLAST executables.
    """
    # create output directory
    fo.create_directory(output_directory)

    # create temp directory
    temp_directory = fo.join_paths(output_directory, ['temp'])
    fo.create_directory(temp_directory)

    # validate input files
    genes_list = fo.join_paths(temp_directory, ['listGenes.txt'])
    genes_list = pv.check_input_type(input_files, genes_list)
    loci_paths = fo.read_lines(genes_list)

    schema_directory = os.path.dirname(loci_paths[0])
    schema_basename = fo.file_basename(schema_directory)
    print('Schema: {0}'.format(schema_directory))
    print('Number of loci: {0}'.format(len(loci_paths)))

    # find annotations based on reference proteomes for species
    proteome_results = {}
    if taxa is not None:
        proteome_results = proteome_annotations(schema_directory,
                                                temp_directory,
                                                taxa,
                                                blast_score_ratio,
                                                cpu_cores,
                                                proteome_matches,
                                                blast_path)

    # find annotations in SPARQL endpoint
    print('\nQuerying UniProt\'s SPARQL endpoint...')
    # child components are passed as a list, as in every other
    # `fo.join_paths` call; a bare string is not a list of components
    config_file = fo.join_paths(input_files, ['.schema_config'])
    if os.path.isfile(config_file) is True:
        config = fo.pickle_loader(config_file)
        translation_table = config.get('translation_table', [11])[0]
    else:
        # fall back to genetic code 11 when the schema has no config
        translation_table = 11

    sparql_results = sparql_annotations(loci_paths,
                                        translation_table,
                                        cpu_cores)

    loci_info = {}
    if protein_table is not None:
        # read "cds_info.tsv" file created by CreateSchema
        table_lines = fo.read_tabular(protein_table)
        for line in table_lines[1:]:
            # create locus identifier based on genome identifier and
            # cds identifier in file
            locus_id = line[0].replace('_', '-')
            locus_id = locus_id + '-protein{0}'.format(line[-2])
            loci_info[locus_id] = line

    annotations = join_annotations(sparql_results,
                                   proteome_results,
                                   loci_info)

    # `loci_info` can only be non-empty if `protein_table` was read,
    # so `table_lines` is defined whenever it is used below
    has_loci_info = len(loci_info) > 0

    # table header
    header = ['Locus_ID']
    if has_loci_info:
        header += table_lines[0]

    header += ['Uniprot_Name', 'UniProt_URL']

    if len(proteome_results) > 0:
        header.extend(['Proteome_ID', 'Proteome_Product',
                       'Proteome_Gene_Name', 'Proteome_Species',
                       'Proteome_BSR'])

    output_table = create_annotations_table(annotations,
                                            output_directory,
                                            header,
                                            schema_basename,
                                            has_loci_info)

    if no_cleanup is False:
        shutil.rmtree(temp_directory)

    print('\n\nThe table with new information can be found at:'
          '\n{0}'.format(output_table))
def main(input_files, output_directory, cpu_cores, blast_score_ratio,
         minimum_length, translation_table, ptf_path, size_threshold,
         blast_path):
    """Adapt an external schema so that it can be used with chewBBACA.

    Lists the schema loci files, splits them into groups that are
    processed in parallel by `adapt_loci` and writes three report
    files next to the output schema directory:
    '<schema>_invalid_alleles.txt', '<schema>_invalid_genes.txt'
    and '<schema>_summary_stats.txt'.

    Parameters
    ----------
    input_files : str
        Input passed to `pv.check_input_type` to obtain the list
        of loci files to adapt.
    output_directory : str
        Path to the directory where the adapted schema is created.
    cpu_cores : int
        Number of processes used for parallel steps.
    blast_score_ratio : float
        BLAST Score Ratio value passed to `adapt_loci`.
    minimum_length : int
        Minimum sequence length value passed to `adapt_loci`.
    translation_table : int
        Genetic code passed to `adapt_loci`.
    ptf_path : str
        Path to the Prodigal training file (only reported here).
    size_threshold : float
        Sequence size variation threshold passed to `adapt_loci`.
    blast_path : str
        Path to the directory with the BLAST executables.

    Raises
    ------
    Exception
        Any exception raised by a worker while computing the loci
        statistics is re-raised instead of being silently ignored.
    """
    print('Adapting schema in the following '
          'directory:\n{0}'.format(os.path.abspath(input_files)))
    print('Prodigal training file:\n{0}'.format(ptf_path))
    print('Number of cores: {0}'.format(cpu_cores))
    print('BLAST Score Ratio: {0}'.format(blast_score_ratio))
    print('Translation table: {0}'.format(translation_table))
    print('Minimum accepted sequence length: {0}'.format(minimum_length))
    print('Size threshold: {0}'.format(size_threshold))

    # define output paths
    schema_path = os.path.abspath(output_directory)
    schema_short_path = fo.join_paths(schema_path, ['short'])

    # create output directories
    fo.create_directory(schema_path)
    fo.create_directory(schema_short_path)

    # list schema gene files
    genes_file = pv.check_input_type(input_files,
                                     os.path.join(output_directory,
                                                  'schema_genes.txt'))

    # import list of schema files
    with open(genes_file, 'r') as gf:
        genes_list = [line.rstrip('\n') for line in gf]
    os.remove(genes_file)

    print('Number of genes to adapt: {0}\n'.format(len(genes_list)))

    print('Determining the total number of alleles and '
          'allele mean length per gene...\n')

    # count number of sequences and mean length per gene
    # the context manager releases the worker processes, and `map`
    # propagates worker exceptions; `map_async` + `wait` left the list
    # empty on failure and the run reported a bogus success
    with multiprocessing.Pool(processes=cpu_cores) as genes_pool:
        genes_info = genes_pool.map(fao.gene_seqs_info, genes_list)

    # split files according to number of sequences and sequence mean length
    # in each file to pass even groups of sequences to all cores
    even_genes_groups = mo.split_genes_by_core(genes_info,
                                               cpu_cores*4,
                                               'seqcount')
    # with few inputs, some sublists might be empty
    even_genes_groups = [i for i in even_genes_groups if len(i) > 0]

    # add common arguments; the last element is the function that
    # will process each group
    blastp_path = os.path.join(blast_path, ct.BLASTP_ALIAS)
    makeblastdb_path = os.path.join(blast_path, ct.MAKEBLASTDB_ALIAS)
    even_genes_groups = [[i, schema_path, schema_short_path,
                          blast_score_ratio, minimum_length,
                          translation_table, size_threshold,
                          blastp_path, makeblastdb_path,
                          adapt_loci] for i in even_genes_groups]

    print('Adapting {0} genes...\n'.format(len(genes_list)))

    # each result is [invalid_alleles, invalid_genes, summary_stats]
    invalid_data = mo.map_async_parallelizer(even_genes_groups,
                                             mo.function_helper,
                                             cpu_cores,
                                             show_progress=True)

    # define paths and write files with list of invalid
    # alleles and invalid genes
    output_schema_basename = os.path.basename(output_directory.rstrip('/'))
    schema_parent_directory = os.path.dirname(schema_path)

    # write file with alleles that were determined to be invalid
    invalid_alleles = list(itertools.chain.from_iterable(
        sub[0] for sub in invalid_data))
    invalid_alleles_file = os.path.join(
        schema_parent_directory,
        '{0}_{1}'.format(output_schema_basename, 'invalid_alleles.txt'))

    with open(invalid_alleles_file, 'w') as inv:
        inv.writelines('{0}: {1}\n'.format(allele[0], allele[1])
                       for allele in invalid_alleles)

    # write file with identifiers of genes that had no valid alleles
    invalid_genes = list(itertools.chain.from_iterable(
        sub[1] for sub in invalid_data))
    invalid_genes_file = os.path.join(
        schema_parent_directory,
        '{0}_{1}'.format(output_schema_basename, 'invalid_genes.txt'))

    with open(invalid_genes_file, 'w') as inv:
        inv.write('\n'.join(invalid_genes))

    # write file with per-gene summary statistics
    stats_lines = itertools.chain.from_iterable(
        sub[2] for sub in invalid_data)
    stats_lines = ['\t'.join(line) for line in stats_lines]
    stats_genes_file = os.path.join(
        schema_parent_directory,
        '{0}_{1}'.format(output_schema_basename, 'summary_stats.txt'))

    with open(stats_genes_file, 'w') as stats:
        stats.write('Gene\tTotal_alleles\tValid_alleles\t'
                    'Number_representatives\n')
        stats.write('\n'.join(stats_lines))

    print('\n\nNumber of invalid genes: {0}'.format(len(invalid_genes)))
    print('Number of invalid alleles: {0}'.format(len(invalid_alleles)))

    print('\nSuccessfully adapted {0}/{1} genes present in the '
          'input schema.'.format(len(genes_list)-len(invalid_genes),
                                 len(genes_list)))
def proteome_annotations(schema_directory, temp_directory, taxa,
                         blast_score_ratio, cpu_cores, proteome_matches,
                         blast_path):
    """Determine loci annotations based on UniProt's reference proteomes.

    Translates the representative alleles of the schema, downloads
    the reference proteomes that match the provided taxa and aligns
    the translated representatives against those proteomes with
    BLASTp.

    Parameters
    ----------
    schema_directory : str
        Path to the schema's directory.
    temp_directory : str
        Path to the temporary directory where intermediate files
        will be written to.
    taxa : list
        List of taxa scientific names. The process will search for
        reference proteomes whose README entry contains any of the
        provided taxa names.
    blast_score_ratio : float
        BLAST Score Ratio value. Hits with a BSR value >= this
        value will be considered as high scoring hits that can be
        included in the final table according to the maximum number
        of matches to report.
    cpu_cores : int
        Number of threads used to run BLASTp.
    proteome_matches : int
        Maximum number of proteome matches to report.
    blast_path : str
        Path to BLAST executables.

    Returns
    -------
    proteome_results : dict
        Dictionary with loci identifiers as keys and a list with
        information about loci retrieved from the most similar
        records in UniProt's reference proteomes. Empty if no
        reference proteome matched the provided taxa.
    """
    # get paths to files with representative sequences
    short_directory = fo.join_paths(schema_directory, ['short'])
    reps_paths = [fo.join_paths(short_directory, [file])
                  for file in os.listdir(short_directory)
                  if file.endswith('.fasta')]

    print('Translating representative sequences...', end='')
    # translate representatives for all loci (genetic code 11)
    translated_reps = fo.join_paths(temp_directory, ['translated_reps'])
    fo.create_directory(translated_reps)
    reps_protein_files = fao.translate_fastas(reps_paths,
                                              translated_reps,
                                              11)
    print('done.')

    print('Downloading list of reference proteomes...', end='')
    remote_readme = fo.join_paths(ct.UNIPROT_PROTEOMES_FTP, ['README'])
    local_readme = fo.join_paths(temp_directory,
                                 ['reference_proteomes_readme.txt'])

    # get README file with list of reference proteomes
    fo.download_file(remote_readme, local_readme)
    print('done.')

    # get lines with proteomes info for species of interest
    readme_lines = fo.read_lines(local_readme, strip=False)
    selected_proteomes = im.contained_terms(readme_lines, taxa)
    selected_proteomes = [line.strip('\n').split('\t')
                          for line in selected_proteomes]
    print('Found {0} reference proteomes for '
          '{1}.'.format(len(selected_proteomes), taxa))

    proteome_results = {}
    if len(selected_proteomes) > 0:
        # create directory to store proteomes
        proteomes_directory = fo.join_paths(temp_directory, ['proteomes'])
        fo.create_directory(proteomes_directory)

        proteomes_files = ur.get_proteomes(selected_proteomes,
                                           proteomes_directory)

        # uncompress files and concatenate into single FASTA
        uncompressed_proteomes = [fo.unzip_file(file)
                                  for file in proteomes_files]
        proteomes_concat = fo.join_paths(proteomes_directory,
                                         ['full_proteome.fasta'])
        proteomes_concat = fo.concatenate_files(uncompressed_proteomes,
                                                proteomes_concat)

        # get self-scores
        # concatenate protein files
        reps_concat = fo.concatenate_files(
            reps_protein_files,
            fo.join_paths(temp_directory, ['reps_concat.fasta']))

        print('\nDetermining self-score of representatives...', end='')
        # resolve the executables from `blast_path` once and use them
        # for every BLAST step, so a user-provided BLAST location is
        # honoured for DB creation and alignment, not only self-scores
        blastp_path = os.path.join(blast_path, ct.BLASTP_ALIAS)
        makeblastdb_path = os.path.join(blast_path, ct.MAKEBLASTDB_ALIAS)
        self_scores = fao.get_self_scores(reps_concat, temp_directory,
                                          cpu_cores, blastp_path,
                                          makeblastdb_path)
        print('done.')

        # create BLASTdb with proteome sequences
        proteome_blastdb = fo.join_paths(proteomes_directory,
                                         ['proteomes_db'])
        bw.make_blast_db(makeblastdb_path, proteomes_concat,
                         proteome_blastdb, 'prot')

        # BLASTp to determine annotations
        # one input list per file; the last element is the function
        # that `mo.function_helper` is given to run
        blast_inputs = [[blastp_path, proteome_blastdb, file,
                         file+'_blastout.tsv', 1, 1, None, None,
                         proteome_matches, None, bw.run_blast]
                        for file in reps_protein_files]

        print('\nBLASTing representatives against proteomes...')
        mo.map_async_parallelizer(blast_inputs,
                                  mo.function_helper,
                                  cpu_cores,
                                  show_progress=True)

        # results are collected from disk rather than from the
        # values returned by the parallel step
        blastout_files = [fo.join_paths(translated_reps, [file])
                          for file in os.listdir(translated_reps)
                          if 'blastout' in file]

        # index proteome file
        indexed_proteome = SeqIO.index(proteomes_concat, 'fasta')

        # process results for each BLASTp
        proteome_results = extract_annotations(blastout_files,
                                               indexed_proteome,
                                               self_scores,
                                               blast_score_ratio,
                                               proteome_matches)

    return proteome_results
def adapt_loci(genes, schema_path, schema_short_path, bsr, min_len,
               table_id, size_threshold, blastp_path, makeblastdb_path):
    """Adapt a set of loci from an external schema for chewBBACA.

    For each locus file, removes invalid alleles and iteratively
    selects representative alleles through BLASTp. Writes one FASTA
    file with all valid alleles to `schema_path` and one FASTA file
    with the representative alleles to `schema_short_path`.

    Parameters
    ----------
    genes : list
        List with paths to the loci files to be processed.
    schema_path : str
        Path to the schema directory. Per-locus temporary
        directories are also created (and removed) inside it.
    schema_short_path : str
        Path to the "short" directory.
    bsr : float
        BLAST Score Ratio value.
    min_len : int
        Minimum sequence length value.
    table_id : int
        Genetic code.
    size_threshold : float
        Sequence size variation threshold.
    blastp_path : str
        Path to the BLASTp executable.
    makeblastdb_path : str
        Path to the makeblastdb executable.

    Returns
    -------
    list
        A list with three elements:

        - invalid_alleles : list with the information about the
          alleles that were determined to be invalid.
        - invalid_genes : list with the identifiers of the loci
          that had no valid alleles.
        - summary_stats : list with one sublist per processed
          locus. Each sublist has four elements (all str):
          the identifier of the locus, the number of alleles in
          the external file, the number of lines written to the
          FASTA file with valid alleles and the number of lines
          written to the FASTA file with representatives.

    Raises
    ------
    ValueError
        If `bw.run_blast` returns a non-empty stderr.
    """
    # accumulators shared by all loci processed in this call
    summary_stats = []
    invalid_genes = []
    invalid_alleles = []
    for gene in genes:

        representatives = []
        final_representatives = []

        # get gene basename and identifier
        # the identifier is everything before the first '.f'
        gene_basename = os.path.basename(gene)
        gene_id = gene_basename.split('.f')[0]

        # create paths to gene files in new schema
        gene_file = fo.join_paths(schema_path,
                                  ['{0}{1}'.format(gene_id, '.fasta')])

        gene_short_file = fo.join_paths(schema_short_path,
                                        ['{0}{1}'.format(gene_id, '_short.fasta')])

        # create path to temp working directory for current gene
        gene_temp_dir = fo.join_paths(schema_path,
                                      ['{0}{1}'.format(gene_id, '_temp')])

        # create temp directory for the current gene
        fo.create_directory(gene_temp_dir)

        # dictionaries mapping gene identifiers to DNA sequences
        # and Protein sequences
        gene_seqs, prot_seqs, gene_invalid, seqids_map, total_sequences = \
            sm.get_seqs_dicts(gene, gene_id, table_id, min_len, size_threshold)
        invalid_alleles.extend(gene_invalid)

        # if locus has no valid CDS sequences,
        # continue to next locus
        if len(prot_seqs) == 0:
            shutil.rmtree(gene_temp_dir)
            invalid_genes.append(gene_id)
            summary_stats.append([gene_id, str(total_sequences), '0', '0'])
            continue

        if len(gene_seqs) > 1:
            # identify DNA sequences that code for same protein
            equal_prots = sm.determine_duplicated_seqs(prot_seqs)

            # get only one identifier per protein
            ids_to_blast = [protids[0] for protein, protids in equal_prots.items()]

            # get longest sequence as first representative
            longest = sm.determine_longest(ids_to_blast, prot_seqs)
            representatives.append(longest)
            final_representatives.append(longest)

            # create FASTA file with distinct protein sequences
            protein_file = fo.join_paths(gene_temp_dir,
                                         ['{0}_protein.fasta'.format(gene_id)])
            protein_lines = fao.fasta_lines(ids_to_blast, prot_seqs)
            fo.write_list(protein_lines, protein_file)

            # create blastdb with all distinct proteins
            blastp_db = os.path.join(gene_temp_dir, gene_id)
            bw.make_blast_db(makeblastdb_path, protein_file, blastp_db, 'prot')

            # determine appropriate blastp task (proteins < 30aa need blastp-short)
            blastp_task = bw.determine_blast_task(equal_prots)

            # cycles to BLAST representatives against non-representatives until
            # all non-representatives have a representative
            while len(set(ids_to_blast) - set(representatives)) != 0:

                # create FASTA file with representative sequences
                rep_file = fo.join_paths(gene_temp_dir,
                                         ['{0}_rep_protein.fasta'.format(gene_id)])
                rep_protein_lines = fao.fasta_lines(representatives, prot_seqs)
                fo.write_list(rep_protein_lines, rep_file)

                # create file with seqids to BLAST against
                ids_str = im.concatenate_list([str(i) for i in ids_to_blast], '\n')
                ids_file = fo.join_paths(gene_temp_dir,
                                         ['{0}_ids.txt'.format(gene_id)])
                fo.write_to_file(ids_str, ids_file, 'w', '')

                # BLAST representatives against non-represented
                blast_output = fo.join_paths(gene_temp_dir,
                                             ['{0}_blast_out.tsv'.format(gene_id)])
                # set max_target_seqs to huge number because BLAST only
                # returns 500 hits by default
                blast_stderr = bw.run_blast(blastp_path, blastp_db, rep_file,
                                            blast_output, 1, 1, ids_file,
                                            blastp_task, 100000,
                                            ignore=ct.IGNORE_RAISED)
                # abort on any stderr content that was not ignored
                if len(blast_stderr) > 0:
                    raise ValueError(blast_stderr)

                # import BLAST results
                blast_results = fo.read_tabular(blast_output)

                # get self-score for representatives
                # (rows where the first and second fields are equal)
                rep_self_scores = {res[1]: res[2] for res in blast_results
                                   if res[0] == res[1]}

                # divide results into high, low and hot BSR values
                # NOTE(review): `bsr` and `bsr+0.1` are presumably the lower
                # and upper bounds of the "hot" interval -- confirm against
                # `bsr_categorizer`
                hitting_high, hitting_low, hotspots, high_reps, low_reps, hot_reps = \
                    bsr_categorizer(blast_results, representatives,
                                    rep_self_scores, bsr, bsr+0.1)

                excluded_reps = []

                # remove high BSR hits that have representative
                hitting_high = set(hitting_high)
                ids_to_blast = [i for i in ids_to_blast if i not in hitting_high]

                # remove representatives that led to high BSR with subjects that were removed
                prunned_high_reps = {k: [r for r in v if r in ids_to_blast]
                                     for k, v in high_reps.items()}
                reps_to_remove = [k for k, v in prunned_high_reps.items() if len(v) == 0]

                excluded_reps.extend(reps_to_remove)

                # determine smallest set of representatives that allow to get all cycle candidates
                # greedy pass: a representative is kept only if it still
                # covers at least one hit not covered by a previous one
                excluded = []
                hotspot_reps = set(im.flatten_list(list(hot_reps.values())))
                for rep, hits in hot_reps.items():
                    common = hotspot_reps.intersection(set(hits))
                    if len(common) > 0:
                        hotspot_reps = hotspot_reps - common
                    else:
                        excluded.append(rep)

                excluded_reps.extend(excluded)

                # remove representatives that only led to low BSR
                excluded_reps.extend(low_reps)

                # excluded representatives are dropped from the next
                # cycle; `final_representatives` is not modified here
                representatives = [rep for rep in representatives if rep not in excluded_reps]
                ids_to_blast = [i for i in ids_to_blast if i not in excluded_reps]

                # determine next representative from candidates
                rep_candidates = list(set(hotspots) - hitting_high)
                # sort to guarantee reproducible results with same datasets
                rep_candidates = sorted(rep_candidates, key=lambda x: int(x))

                representatives, final_representatives = select_candidate(rep_candidates,
                                                                          prot_seqs,
                                                                          ids_to_blast,
                                                                          representatives,
                                                                          final_representatives)

                # remove files created for current gene iteration
                os.remove(rep_file)
                os.remove(blast_output)
                os.remove(ids_file)

        else:
            # single valid allele: it is its own representative
            final_representatives = list(prot_seqs.keys())

        # write schema file with all alleles
        gene_lines = fao.fasta_lines(list(gene_seqs.keys()), gene_seqs)
        fo.write_list(gene_lines, gene_file)

        # get total number of valid sequences
        valid_sequences = len(gene_lines)

        # write schema file with representatives
        # NOTE(review): `seqids_map` presumably maps the identifiers used in
        # `prot_seqs` to the keys of `gene_seqs` -- confirm in `sm.get_seqs_dicts`
        final_representatives = [seqids_map[rep] for rep in final_representatives]
        gene_rep_lines = fao.fasta_lines(final_representatives, gene_seqs)
        fo.write_list(gene_rep_lines, gene_short_file)

        # get number of representatives
        representatives_number = len(gene_rep_lines)

        summary_stats.append([gene_id,
                              str(total_sequences),
                              str(valid_sequences),
                              str(representatives_number)])

        shutil.rmtree(gene_temp_dir)

    return [invalid_alleles, invalid_genes, summary_stats]