def run(self):
    """Split the input FASTA into fragment files and write a default profile.

    The input file is passed through barrnap, the boundaries read from the
    barrnap output are used to compute fragment coordinates, the fragments
    are populated from the chromosome and written out as FASTA files in
    ``self.output_directory``. Finally a default profile is written by
    ``ProfileGenerator``.

    Raises:
        subprocess.CalledProcessError: if the barrnap command exits with a
            non-zero status.
    """
    # Local import: only needed to release the descriptor from mkstemp().
    import os

    # todo: filter fasta to keep just the largest contig.

    # run the fasta through barrnap
    fd, barrnap_outputfile = mkstemp()
    self.files_to_cleanup.append(barrnap_outputfile)
    # mkstemp() hands back an already-open descriptor. Only the path is
    # passed on below, so close the descriptor here instead of leaking it.
    os.close(fd)

    b = Barrnap(self.input_file, self.threads)
    # NOTE: the command string is executed through the shell (shell=True).
    subprocess.check_output(
        b.construct_barrnap_command(barrnap_outputfile), shell=True)
    boundries = b.read_barrnap_output(barrnap_outputfile)

    f = Fasta(self.input_file)
    fragments = f.calc_fragment_coords(boundries)
    f.populate_fragments_from_chromosome(fragments, self.max_bases_from_ends)

    ff = FragmentFiles(fragments,
                       self.output_directory,
                       fragment_order=self.fragment_order)
    ff.create_fragment_fastas()

    # create a default profile.txt file
    default_profile = ProfileGenerator(self.output_directory,
                                       len(ff.ordered_fragments),
                                       self.dnaa_fasta,
                                       self.threads)
    default_profile.write_output_file()
def test_calc_fragment_coords_gz(self):
    # Same coordinate check as the plain FASTA case, but reading the
    # '.fa.gz' fixture.
    fasta_path = os.path.join(data_dir, 'calc_fragment_coords.fa.gz')
    fasta = Fasta(fasta_path, False)

    fragments = fasta.calc_fragment_coords([[45, 55], [90, 110], [150, 180]])

    expected_coords = [[[180, 200], [0, 45]], [[55, 90]], [[110, 150]]]
    actual_coords = [fragment.coords for fragment in fragments]
    self.assertEqual(actual_coords, expected_coords)
def test_chop_from_ends(self):
    # Populating with a limit of 5 is expected to give each fragment as
    # 5 bases, 'NNN', then 5 bases.
    fasta = Fasta(os.path.join(data_dir, 'calc_fragment_coords.fa'), False)
    fragments = fasta.calc_fragment_coords(
        [[45, 55], [90, 110], [150, 180]])

    # The sequences were previously also collected before populating, but
    # that result was overwritten without being checked, so it is dropped.
    fasta.populate_fragments_from_chromosome(fragments, 5)

    # A distinct comprehension variable avoids shadowing the Fasta object.
    sequences = [str(fragment.sequence) for fragment in fragments]
    self.assertEqual(sequences,
                     ['TTTTTNNNAAAAA', 'CCCCCNNNCCCCC', 'GGGGGNNNGGGGG'])
def test_calc_fragment_coords(self):
    # Three boundaries on the fixture should yield three fragments, the
    # first of which is made of two coordinate pairs ([180, 200] and [0, 45]).
    fasta_path = os.path.join(data_dir, 'calc_fragment_coords.fa')
    fasta = Fasta(fasta_path, False)

    fragments = fasta.calc_fragment_coords([[45, 55], [90, 110], [150, 180]])

    expected_coords = [[[180, 200], [0, 45]], [[55, 90]], [[110, 150]]]
    self.assertEqual([fragment.coords for fragment in fragments],
                     expected_coords)

    # Populating with None as the second argument must also run cleanly;
    # no assertion is made on its result here.
    fasta.populate_fragments_from_chromosome(fragments, None)
def run_analysis(self, input_file, p, d):
    """Type one input FASTA against a database and return the result line.

    The input is run through barrnap, split into fragments, and each
    fragment FASTA is blasted against ``d.db_prefix``. The top hit for each
    fragment (or ``'?'`` when there is none) is collected into a
    ``GATProfile``, which is then looked up with ``TypeGenerator``.

    Args:
        input_file: path of the FASTA file to analyse.
        p: passed straight to ``TypeGenerator`` together with the profile.
        d: database object; only ``d.db_prefix`` is read here.

    Returns:
        The string ``<type>\\t<profile>`` built from
        ``tg.calculate_type()`` and ``str(gat_profile)``.

    Raises:
        subprocess.CalledProcessError: if the barrnap command exits with a
            non-zero status.
    """
    # Local import: only needed to release the descriptor from mkstemp().
    import os

    # run the fasta through barrnap
    fd, barrnap_outputfile = mkstemp()
    # mkstemp() hands back an already-open descriptor. Only the path is
    # used below, so close the descriptor here instead of leaking it.
    os.close(fd)

    b = Barrnap(input_file, self.threads)
    # NOTE: the command string is executed through the shell (shell=True).
    subprocess.check_output(
        b.construct_barrnap_command(barrnap_outputfile), shell=True)
    boundries = b.read_barrnap_output(barrnap_outputfile)

    f = Fasta(input_file, is_circular=self.is_circular)
    fragments = f.calc_fragment_coords(boundries)
    f.populate_fragments_from_chromosome(fragments, self.max_bases_from_ends)

    tmpdir = mkdtemp()
    self.dirs_to_cleanup.append(tmpdir)

    ff = FragmentFiles(fragments, tmpdir)
    ff.create_fragment_fastas()

    # take each fasta file and blast it against the database
    blast = Blast(d.db_prefix, self.threads)
    gat_profile = GATProfile(fragments=[])
    for fasta_file in ff.output_filenames:
        blast_results = blast.run_blast(fasta_file)
        fb = FilterBlast(blast_results, self.min_bit_score,
                         self.min_alignment_length)
        top_result = fb.return_top_result()

        if top_result is None:
            # No acceptable hit: mark the fragment as unknown and append its
            # sequence file to the new-fragments file.
            gat_profile.fragments.append('?')
            with open(fasta_file, "r") as fasta_file_fh, \
                    open(self.new_fragments, "a+") as newfrag_fh:
                newfrag_fh.write(fasta_file_fh.read())
            continue

        self.top_results.append(top_result)
        if top_result.is_forward():
            gat_profile.fragments.append(str(top_result.subject))
        else:
            # Hits that are not forward get a trailing apostrophe.
            gat_profile.fragments.append(str(top_result.subject) + '\'')

    gat_profile.orientate_for_dnaA()

    # lookup the gat_profile to get the number
    tg = TypeGenerator(p, gat_profile)
    type_output_string = tg.calculate_type() + "\t" + str(gat_profile)

    if not tg.has_previously_seen:
        # Record profiles not seen before in the novel-profiles file.
        with open(self.novel_profiles, "a+") as output_fh:
            output_fh.write(self.db_dir + "\t" + type_output_string + "\n")

    return type_output_string
def test_populate_fragments_from_chromosome(self):
    # Fragments start out with empty sequences and only carry bases once
    # they have been populated (here with None as the second argument).
    fasta_path = os.path.join(data_dir, 'calc_fragment_coords.fa')
    fasta = Fasta(fasta_path, False)
    fragments = fasta.calc_fragment_coords([[45, 55], [90, 110], [150, 180]])

    before = [str(fragment.sequence) for fragment in fragments]
    self.assertEqual(before, ["", "", ""])

    fasta.populate_fragments_from_chromosome(fragments, None)

    after = [str(fragment.sequence) for fragment in fragments]
    expected = [
        "TTTTTTTTTTTTTTTTTTTTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
        "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC",
        "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG",
    ]
    self.assertEqual(after, expected)
def test_chop_from_ends(self):
    # Populating with a limit of 5 is expected to give each fragment as
    # 5 bases, 'NNN', then 5 bases.
    fasta = Fasta(os.path.join(data_dir, 'calc_fragment_coords.fa'), False)
    boundries = [
        Operon(45, 55, True),
        Operon(90, 110, False),
        Operon(150, 180, True),
    ]
    fragments = fasta.calc_fragment_coords(boundries)

    # The sequences were previously also collected before populating, but
    # that result was overwritten without being checked, so it is dropped.
    fasta.populate_fragments_from_chromosome(fragments, 5)

    # A distinct comprehension variable avoids shadowing the Fasta object.
    sequences = [str(fragment.sequence) for fragment in fragments]
    self.assertEqual(sequences,
                     ['TTTTTNNNAAAAA', 'CCCCCNNNCCCCC', 'GGGGGNNNGGGGG'])
def shrink_files(self):
    """Write a reduced copy of the database and return the compressed files.

    The two profile files are copied unchanged. Each database FASTA whose
    chromosome is shorter than ``self.target_bases`` is copied as-is
    (keeping a ``.gz`` suffix for compressed inputs); longer ones are
    rebuilt from the regions returned by ``FilterBlast.identify_regions``.
    The list of written files is handed to ``self.compress_files`` and its
    result is returned.
    """
    written = []

    # copy profile files to destination
    for profile_name in ('profile.txt', 'profile.txt.yml'):
        shutil.copy(os.path.join(self.input_database, profile_name),
                    self.output_database)

    compressed_names = self.get_database_files_compressed()
    uncompressed_names = self.get_database_files()
    fastas = [Fasta(name) for name in uncompressed_names + compressed_names]

    for fasta in fastas:
        blast_filter = FilterBlast(self.blast_results, 1, 1)
        destination = os.path.join(self.output_database,
                                   str(fasta.fragment_number()) + '.fa')

        if len(fasta.chromosome.seq) < self.target_bases:
            # Already small enough: copy verbatim, keeping the .gz suffix
            # for inputs that came from the compressed listing.
            if fasta.input_file in compressed_names:
                target = destination + '.gz'
            else:
                target = destination
            shutil.copy(fasta.input_file, target)
            written.append(target)
            continue

        # Too large: stitch together only the selected regions.
        blocks = blast_filter.identify_regions(fasta.fragment_number(),
                                               self.target_bases)
        sequence = ""
        for block in blocks:
            sequence += fasta.chromosome.seq[(block[0]):(block[1])]

        records = [SeqRecord(sequence, str(fasta.fragment_number()), '', '')]
        SeqIO.write(records, destination, "fasta")
        written.append(destination)

    return self.compress_files(written)
def find_boundries(self, coords):
    """Pair start and end coordinates into ``[start, end]`` boundaries.

    Each entry of ``coords`` is indexed as ``c[0]`` (start), ``c[1]`` (end),
    ``c[2]`` (compared against ``16`` and the value returned by
    ``self.five_or_23s``) and ``c[3]`` (``'+'`` or ``'-'``).

    A start and an end are paired when the end lies after the start and
    less than ``self.len_70s`` away. Leftover starts and ends are then
    checked for a pair that wraps around the end of the chromosome.

    Each start is paired with at most one end: once a pair is recorded the
    search for that start stops. Previously the inner loop carried on, so a
    start already marked as consumed could be paired again with later ends.

    Args:
        coords: iterable of indexable records as described above.

    Returns:
        A list of ``[start, end]`` lists; for wrap-around pairs ``start``
        is greater than ``end``.
    """
    boundries = []
    starting_coords = []
    ending_coords = []
    variable_s = self.five_or_23s(coords)

    for c in coords:
        kind, strand = c[2], c[3]
        if (kind == 16 and strand == '+') or (kind == variable_s and strand == '-'):
            # start of ribo
            starting_coords.append(c[0])
        elif (kind == 16 and strand == '-') or (kind == variable_s and strand == '+'):
            # end of ribo
            ending_coords.append(c[1])

    starting_coords = self.filter_out_close_start_coords(starting_coords)
    ending_coords = self.filter_out_close_end_coords(ending_coords)

    # Consumed coordinates are overwritten with -1 so they are skipped later.
    for start_index, start in enumerate(starting_coords):
        if start < 0:
            continue
        for end_index, end in enumerate(ending_coords):
            if end < 0:
                continue
            if 0 < end - start < self.len_70s:
                boundries.append([start, end])
                ending_coords[end_index] = -1
                starting_coords[start_index] = -1
                break  # this start is used up; move on to the next one

    # check for 70S that goes over the end of the genome and for errors
    remaining_start_coords = [s for s in starting_coords if s >= 0]
    remaining_end_coords = [e for e in ending_coords if e >= 0]

    if remaining_start_coords and remaining_end_coords:
        chromosome_length = self.chromosome_length
        if chromosome_length <= 0:
            # No usable length was supplied, so read it from the input file.
            chromosome_length = len(
                Fasta(self.input_file, self.verbose).chromosome)

        for start_index, start in enumerate(remaining_start_coords):
            bases_to_end = chromosome_length - start
            for end_index, end in enumerate(remaining_end_coords):
                if end < 0:
                    continue
                if (bases_to_end < self.len_70s
                        and end < self.len_70s
                        and bases_to_end + end < self.len_70s):
                    boundries.append([start, end])
                    remaining_end_coords[end_index] = -1
                    remaining_start_coords[start_index] = -1
                    break  # one wrap-around pair per start

    return boundries
def test_populate_fragments_from_chromosome(self):
    # Fragments built from Operon boundaries start out with empty sequences
    # and only carry bases once they have been populated (here with None as
    # the second argument).
    fasta_path = os.path.join(data_dir, 'calc_fragment_coords.fa')
    fasta = Fasta(fasta_path, False)
    operons = [
        Operon(45, 55, True),
        Operon(90, 110, False),
        Operon(150, 180, True),
    ]
    fragments = fasta.calc_fragment_coords(operons)

    before = [str(fragment.sequence) for fragment in fragments]
    self.assertEqual(before, ["", "", ""])

    fasta.populate_fragments_from_chromosome(fragments, None)

    after = [str(fragment.sequence) for fragment in fragments]
    expected = [
        "TTTTTTTTTTTTTTTTTTTTAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
        "CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC",
        "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG",
    ]
    self.assertEqual(after, expected)
def test_fasta_get_largest_contig(self):
    # The record returned for the fixture should hold a 60-base sequence.
    fasta_path = os.path.join(data_dir, 'get_largest_contig.fa')
    fasta = Fasta(fasta_path, False)

    chromosome_record = fasta.get_chromosome_from_fasta()

    self.assertEqual(len(chromosome_record.seq), 60)
def populate_fragments_from_chromosome(self, input_file, boundries):
    """Build fragments for ``input_file`` and fill in their sequences.

    Args:
        input_file: path handed to ``Fasta``.
        boundries: boundaries handed to ``Fasta.calc_fragment_coords``.

    Returns:
        The fragments from ``calc_fragment_coords`` after they have been
        passed through ``Fasta.populate_fragments_from_chromosome`` with
        ``self.max_bases_from_ends``.
    """
    fasta = Fasta(input_file, self.verbose, is_circular=self.is_circular)
    populated = fasta.calc_fragment_coords(boundries)
    fasta.populate_fragments_from_chromosome(populated,
                                             self.max_bases_from_ends)
    return populated