def reciprocal_blast(good_proteins_fasta, fasta_files):
    """Create blast database for good_proteins_fasta, blast all fasta_files against this database & return hits.

    good_proteins_fasta -- fasta file handed to _create_blast_database to build the remote database from
    fasta_files -- iterable of fasta files; one blast job is submitted per file

    Returns the path of a new temporary '.tsv' file to which the individual hits files were concatenated.
    The individual hits files and the remote database are removed again, also when one of the steps fails.
    """
    # Create blast database, retrieve path & name
    database_url = _create_blast_database(good_proteins_fasta)

    # Collected one by one, so files retrieved before a failure can still be cleaned up
    x_vs_all_hits = []
    try:
        # Submit job for each fasta files, and store jobid & hits file name tuples
        jobids_and_hits_filenames = [_submit_blast_run(database_url, fasta_file) for fasta_file in fasta_files]

        # Retrieve all results, which should only take negligibly longer than waiting for the slowest results
        for jobid, hits_file in jobids_and_hits_filenames:
            x_vs_all_hits.append(_retrieve_blast_hits(jobid, hits_file))

        # Concatenate the individual blast result files into one
        # mkstemp returns an already opened file descriptor: close it, as only the path is used from here on
        allvsall_fd, allvsall = tempfile.mkstemp(suffix='.tsv', prefix='all_vs_all')
        os.close(allvsall_fd)
        concatenate(allvsall, x_vs_all_hits)
    finally:
        # Clean up local and remote
        try:
            for x_vs_all in x_vs_all_hits:
                os.remove(x_vs_all)
        finally:
            # Remote database_url port :444 requires certificate authentication,
            # which we remove to use the basic authentication
            send_request(database_url.replace(':444', ''), method='DELETE')

    return allvsall
def reciprocal_blast(good_proteins_fasta, fasta_files):
    """Create blast database for good_proteins_fasta, blast all fasta_files against this database & return hits.

    good_proteins_fasta -- fasta file handed to _create_blast_database to build the database from
    fasta_files -- iterable of fasta files, each of which is blasted against the database

    Returns the path of a new temporary '.tsv' file to which the individual hits files were concatenated.
    The temporary run directory is removed again, also when one of the steps fails.
    """
    # Function-scope import: needed to close the mkstemp file descriptor below
    import os

    run_dir = tempfile.mkdtemp(prefix='reciprocal_blast_')
    try:
        # Create blast database, retrieve path & name
        db_dir, db_name = _create_blast_database(run_dir, good_proteins_fasta)

        # Blast individual fasta files against the made blast databank,
        # instead of the much larger good_proteins_fasta
        x_vs_all_hits = [_blast_file_against_database(db_dir, db_name, fasta) for fasta in fasta_files]

        # Concatenate the individual blast result files into one
        # mkstemp returns an already opened file descriptor: close it, as only the path is used from here on
        allvsall_fd, allvsall = tempfile.mkstemp(suffix='.tsv', prefix='all-vs-all_')
        os.close(allvsall_fd)
        concatenate(allvsall, x_vs_all_hits)
    finally:
        # Clean up
        shutil.rmtree(run_dir)

    return allvsall
def _translate_genome(tuples_of_gbk_and_ptt_files):
    """Translate all files for genome and concatenate them into single DNA and Protein fasta files.

    tuples_of_gbk_and_ptt_files -- non-empty sequence of (project_id, gbk_file, ptt_file) tuples

    Returns a tuple of the DNA ('.ffn') and protein ('.faa') concatemer paths inside the output directory.
    Raises an AssertionError when no tuples were provided, either as None or as an empty sequence.
    """
    # Reject an empty sequence as well as None, as indexing the first tuple below would otherwise
    # fail with an IndexError that does not mention the actual problem
    assert tuples_of_gbk_and_ptt_files, 'No genbank files were provided'

    # The output directory is named after the project id of the first tuple
    out_dir = create_directory('translations/' + tuples_of_gbk_and_ptt_files[0][0])

    dna_files = []
    protein_files = []
    for project_id, gbk_file, ptt_file in tuples_of_gbk_and_ptt_files:
        dna_file, protein_file = _extract_gene_and_protein(out_dir, project_id, gbk_file, ptt_file)
        dna_files.append(dna_file)
        protein_files.append(protein_file)

    #Concatenate files into one
    #project_id is still bound to the value from the last tuple here, which names the concatemers
    #NOTE(review): presumably all tuples share the same project id - confirm against callers
    dna_concatemer = os.path.join(out_dir, '{pid}.ffn'.format(pid=project_id))
    protein_concatemer = os.path.join(out_dir, '{pid}.faa'.format(pid=project_id))
    concatenate(dna_concatemer, dna_files)
    concatenate(protein_concatemer, protein_files)
    return dna_concatemer, protein_concatemer
def reciprocal_blast(good_proteins_fasta, fasta_files):
    """Create blast database for good_proteins_fasta, blast all fasta_files against this database & return hits.

    good_proteins_fasta -- fasta file handed to _create_blast_database to build the database from
    fasta_files -- iterable of fasta files, each of which is blasted against the database

    Returns the path of a new temporary '.tsv' file to which the individual hits files were concatenated.
    The temporary run directory is removed again, also when one of the steps fails.
    """
    # Function-scope import: needed to close the mkstemp file descriptor below
    import os

    run_dir = tempfile.mkdtemp(prefix='reciprocal_blast_')
    try:
        # Create blast database, retrieve path & name
        db_dir, db_name = _create_blast_database(run_dir, good_proteins_fasta)

        # Blast individual fasta files against the made blast databank,
        # instead of the much larger good_proteins_fasta
        x_vs_all_hits = [
            _blast_file_against_database(db_dir, db_name, fasta)
            for fasta in fasta_files
        ]

        # Concatenate the individual blast result files into one
        # mkstemp hands back an open file descriptor besides the path; close it to prevent leaking it
        allvsall_fd, allvsall = tempfile.mkstemp(suffix='.tsv', prefix='all-vs-all_')
        os.close(allvsall_fd)
        concatenate(allvsall, x_vs_all_hits)
    finally:
        # Clean up
        shutil.rmtree(run_dir)

    return allvsall
def _translate_genome(tuples_of_gbk_and_ptt_files):
    """Translate all files for genome and concatenate them into single DNA and Protein fasta files.

    tuples_of_gbk_and_ptt_files -- non-empty sequence of (project_id, gbk_file, ptt_file) tuples

    Returns a tuple of the DNA ('.ffn') and protein ('.faa') concatemer paths inside the output directory.
    Raises an AssertionError when no tuples were provided, either as None or as an empty sequence.
    """
    # An empty sequence is rejected together with None, instead of failing on the index lookup below
    assert tuples_of_gbk_and_ptt_files, 'No genbank files were provided'

    # The output directory is named after the project id of the first tuple
    first_project_id = tuples_of_gbk_and_ptt_files[0][0]
    out_dir = create_directory('translations/' + first_project_id)

    dna_files = []
    protein_files = []
    for project_id, gbk_file, ptt_file in tuples_of_gbk_and_ptt_files:
        dna_file, protein_file = _extract_gene_and_protein(
            out_dir, project_id, gbk_file, ptt_file)
        dna_files.append(dna_file)
        protein_files.append(protein_file)

    #Concatenate files into one
    #The concatemers are named after the project id of the last tuple, which the loop left bound
    #NOTE(review): presumably all tuples share the same project id - confirm against callers
    dna_concatemer = os.path.join(out_dir, '{pid}.ffn'.format(pid=project_id))
    protein_concatemer = os.path.join(out_dir, '{pid}.faa'.format(pid=project_id))
    concatenate(dna_concatemer, dna_files)
    concatenate(protein_concatemer, protein_files)
    return dna_concatemer, protein_concatemer
def reciprocal_blast(good_proteins_fasta, fasta_files):
    """Create blast database for good_proteins_fasta, blast all fasta_files against this database & return hits.

    good_proteins_fasta -- fasta file handed to _create_blast_database to build the remote database from
    fasta_files -- iterable of fasta files; one blast job is submitted per file

    Returns the path of a new temporary ".tsv" file to which the individual hits files were concatenated.
    The individual hits files and the remote database are removed again, also when one of the steps fails.
    """
    # Create blast database, retrieve path & name
    database_url = _create_blast_database(good_proteins_fasta)

    # Filled incrementally, so whatever was retrieved before a failure is still known during clean up
    x_vs_all_hits = []
    try:
        # Submit job for each fasta files, and store jobid & hits file name tuples
        jobids_and_hits_filenames = [_submit_blast_run(database_url, fasta_file) for fasta_file in fasta_files]

        # Retrieve all results, which should only take negligibly longer than waiting for the slowest results
        for jobid, hits_file in jobids_and_hits_filenames:
            x_vs_all_hits.append(_retrieve_blast_hits(jobid, hits_file))

        # Concatenate the individual blast result files into one
        # mkstemp hands back an open file descriptor besides the path; close it to prevent leaking it
        allvsall_fd, allvsall = tempfile.mkstemp(suffix=".tsv", prefix="all_vs_all")
        os.close(allvsall_fd)
        concatenate(allvsall, x_vs_all_hits)
    finally:
        # Clean up local and remote
        try:
            for x_vs_all in x_vs_all_hits:
                os.remove(x_vs_all)
        finally:
            # Remote database_url port :444 requires certificate authentication,
            # which we remove to use the basic authentication
            send_request(database_url.replace(":444", ""), method="DELETE")

    return allvsall
def calculate_tables(genome_ids_a, genome_ids_b, sico_files, oddeven=False):
    """Compute a spreadsheet of data points each for A and B based the SICO files, without duplicating computations.

    genome_ids_a, genome_ids_b -- genome ids of both groups; a sequence is assigned to a group when the part of
        its id before the first '|' is contained in that group
    sico_files -- fasta alignment files; the file name up to the first '.' is used as the ortholog name
    oddeven -- when true, tables are calculated for the odd and even codon alignments as well

    Returns the tables for A and B as produced by _tables_for_split_alignments. With oddeven these are instead
    two new temporary '.tsv' files, to which the full, odd & even tables were concatenated in that order.
    """
    #Convert file names into identifiers while preserving filenames, as filenames are used both for BioPython & PhiPack
    orth_files = [(os.path.split(sico_file)[1].split('.')[0], sico_file) for sico_file in sico_files]

    #Find PhiPack values for each sico file
    orth_phipack_values = _phipack_values_for_sicos(orth_files)

    #Convert list of sico files into ortholog name mapped to BioPython Alignment object
    sico_alignments = [(ortholog, AlignIO.read(sico_file, 'fasta')) for ortholog, sico_file in orth_files]

    #Only retrieve genomes once which we'll use to link gene names to orthologs
    all_genome_ids = list(genome_ids_a)
    all_genome_ids.extend(genome_ids_b)
    genomes = select_genomes_by_ids(all_genome_ids).values()

    #For each ortholog, determine the newest gene name across taxa so unannotated taxa also get gene names
    ortholog_gene_names = dict((ortholog, get_most_recent_gene_name(genomes, alignmnt))
                               for ortholog, alignmnt in sico_alignments)

    #Split individual sico alignments into separate alignments for each of the clades per ortholog
    #These split alignments can later be reversed and/or subselections can be made to calculate for alternate alignments
    split_alignments = [(ortholog,
                         MultipleSeqAlignment(seqr for seqr in alignmnt if seqr.id.split('|')[0] in genome_ids_a),
                         MultipleSeqAlignment(seqr for seqr in alignmnt if seqr.id.split('|')[0] in genome_ids_b))
                        for ortholog, alignmnt in sico_alignments]

    #Calculate tables for normal sico alignments
    log.info('Starting calculations for full alignments')
    table_a, table_b = _tables_for_split_alignments(split_alignments, ortholog_gene_names, orth_phipack_values)

    if not oddeven:
        return table_a, table_b

    #As an alternate method of calculating number of substitutions for independent X-axis of eventual graph:
    #split each alignment for a and b into two further alignments of odd and even codons
    odd_even_split_orth_alignments = [(orthologname,
                                       _every_other_codon_alignments(alignment_x),
                                       _every_other_codon_alignments(alignment_y))
                                      for orthologname, alignment_x, alignment_y in split_alignments]

    #Recover odd alignments as first from each pair of alignments, and calculate tables for them
    odd_split_alignments = [(orthologname, odd_even_x[0], odd_even_y[0])
                            for orthologname, odd_even_x, odd_even_y in odd_even_split_orth_alignments]
    table_a_odd, table_b_odd = _tables_for_codon_subset('odd', odd_split_alignments, ortholog_gene_names)

    #Recover even alignments as second from each pair of alignments, and calculate tables for them
    even_split_alignments = [(orthologname, odd_even_x[1], odd_even_y[1])
                             for orthologname, odd_even_x, odd_even_y in odd_even_split_orth_alignments]
    table_a_even, table_b_even = _tables_for_codon_subset('even', even_split_alignments, ortholog_gene_names)

    #Concatenate tables and return their values
    #mkstemp returns an already opened file descriptor: close it, as only the path is used from here on
    table_a_fd, table_a_full = tempfile.mkstemp(suffix='.tsv', prefix='table_a_full_')
    os.close(table_a_fd)
    table_b_fd, table_b_full = tempfile.mkstemp(suffix='.tsv', prefix='table_b_full_')
    os.close(table_b_fd)
    concatenate(table_a_full, [table_a, table_a_odd, table_a_even])
    concatenate(table_b_full, [table_b, table_b_odd, table_b_even])
    return table_a_full, table_b_full


def _tables_for_codon_subset(label, subset_alignments, ortholog_gene_names):
    """Run PhiPack on files written for the 'odd' or 'even' codon alignments, and return their tables for a and b."""
    #Create files for all the codon alignments, so we can run PhiPack for them
    alignments_dir = tempfile.mkdtemp(prefix=label + '_codon_alignments_')
    try:
        subset_files = dict((ortholog, os.path.join(alignments_dir, ortholog + '.ffn'))
                            for ortholog, _, _ in subset_alignments)
        for ortholog, alignment_x, alignment_y in subset_alignments:
            AlignIO.write([alignment_x, alignment_y], subset_files[ortholog], 'fasta')
        phipack_vals = _phipack_values_for_sicos(subset_files.items())
    finally:
        #Remove the temporary alignment files, also when writing them or running PhiPack failed
        shutil.rmtree(alignments_dir)

    #Calculate tables for the codon sico alignments
    log.info('Starting calculations for {0} alignments'.format(label))
    return _tables_for_split_alignments(subset_alignments, ortholog_gene_names, phipack_vals)
def calculate_tables(genome_ids_a, genome_ids_b, sico_files, oddeven=False):
    """Compute a spreadsheet of data points each for A and B based the SICO files, without duplicating computations.

    genome_ids_a, genome_ids_b -- genome ids of both groups; a sequence is assigned to a group when the part of
        its id before the first '|' is contained in that group
    sico_files -- fasta alignment files; the file name up to the first '.' is used as the ortholog name
    oddeven -- when true, tables are calculated for the odd and even codon alignments as well

    Returns the tables for A and B as produced by _tables_for_split_alignments. With oddeven these are instead
    two new temporary '.tsv' files, to which the full, odd & even tables were concatenated in that order.
    """
    #Convert file names into identifiers while preserving filenames, as filenames are used both for BioPython & PhiPack
    orth_files = [(os.path.split(sico_file)[1].split('.')[0], sico_file) for sico_file in sico_files]

    #Find PhiPack values for each sico file
    orth_phipack_values = _phipack_values_for_sicos(orth_files)

    #Convert list of sico files into ortholog name mapped to BioPython Alignment object
    sico_alignments = [(ortholog, AlignIO.read(sico_file, 'fasta')) for ortholog, sico_file in orth_files]

    #Only retrieve genomes once which we'll use to link gene names to orthologs
    all_genome_ids = list(genome_ids_a)
    all_genome_ids.extend(genome_ids_b)
    genomes = select_genomes_by_ids(all_genome_ids).values()

    #For each ortholog, determine the newest gene name across taxa so unannotated taxa also get gene names
    ortholog_gene_names = dict((ortholog, get_most_recent_gene_name(genomes, alignmnt))
                               for ortholog, alignmnt in sico_alignments)

    #Split individual sico alignments into separate alignments for each of the clades per ortholog
    #These split alignments can later be reversed and/or subselections can be made to calculate for alternate alignments
    split_alignments = [(ortholog,
                         MultipleSeqAlignment(seqr for seqr in alignmnt if seqr.id.split('|')[0] in genome_ids_a),
                         MultipleSeqAlignment(seqr for seqr in alignmnt if seqr.id.split('|')[0] in genome_ids_b))
                        for ortholog, alignmnt in sico_alignments]

    #Calculate tables for normal sico alignments
    log.info('Starting calculations for full alignments')
    table_a, table_b = _tables_for_split_alignments(split_alignments, ortholog_gene_names, orth_phipack_values)

    if not oddeven:
        return table_a, table_b

    #As an alternate method of calculating number of substitutions for independent X-axis of eventual graph:
    #split each alignment for a and b into two further alignments of odd and even codons
    odd_even_split_orth_alignments = [(orthologname,
                                       _every_other_codon_alignments(alignment_x),
                                       _every_other_codon_alignments(alignment_y))
                                      for orthologname, alignment_x, alignment_y in split_alignments]

    #Recover odd alignments as first from each pair of alignments, and calculate tables for them
    odd_split_alignments = [(orthologname, odd_even_x[0], odd_even_y[0])
                            for orthologname, odd_even_x, odd_even_y in odd_even_split_orth_alignments]
    table_a_odd, table_b_odd = _tables_for_codon_subset('odd', odd_split_alignments, ortholog_gene_names)

    #Recover even alignments as second from each pair of alignments, and calculate tables for them
    even_split_alignments = [(orthologname, odd_even_x[1], odd_even_y[1])
                             for orthologname, odd_even_x, odd_even_y in odd_even_split_orth_alignments]
    table_a_even, table_b_even = _tables_for_codon_subset('even', even_split_alignments, ortholog_gene_names)

    #Concatenate tables and return their values
    #mkstemp hands back an open file descriptor besides the path; close it to prevent leaking it
    table_a_fd, table_a_full = tempfile.mkstemp(suffix='.tsv', prefix='table_a_full_')
    os.close(table_a_fd)
    table_b_fd, table_b_full = tempfile.mkstemp(suffix='.tsv', prefix='table_b_full_')
    os.close(table_b_fd)
    concatenate(table_a_full, [table_a, table_a_odd, table_a_even])
    concatenate(table_b_full, [table_b, table_b_odd, table_b_even])
    return table_a_full, table_b_full


def _tables_for_codon_subset(label, subset_alignments, ortholog_gene_names):
    """Run PhiPack on files written for the 'odd' or 'even' codon alignments, and return their tables for a and b."""
    #Create files for all the codon alignments, so we can run PhiPack for them
    alignments_dir = tempfile.mkdtemp(prefix=label + '_codon_alignments_')
    try:
        subset_files = dict((ortholog, os.path.join(alignments_dir, ortholog + '.ffn'))
                            for ortholog, _, _ in subset_alignments)
        for ortholog, alignment_x, alignment_y in subset_alignments:
            AlignIO.write([alignment_x, alignment_y], subset_files[ortholog], 'fasta')
        phipack_vals = _phipack_values_for_sicos(subset_files.items())
    finally:
        #Remove the temporary alignment files, also when writing them or running PhiPack failed
        shutil.rmtree(alignments_dir)

    #Calculate tables for the codon sico alignments
    log.info('Starting calculations for {0} alignments'.format(label))
    return _tables_for_split_alignments(subset_alignments, ortholog_gene_names, phipack_vals)