Esempio n. 1
0
 def test_blast_record_set(self):
     """Blast a set of query records against a freshly built database.

     Writes the database records to a fasta file, checks that they
     round-trip unchanged, builds the blast database, then runs
     ``blasting.blast_record_set`` and checks that the first match
     reported for every query is the query itself at 100% identity.
     """
     # prepare database
     seqfile_ops.write_fasta(self.db_file, self.db_records)
     db_records_list = seqfile_ops.load_multifasta(self.db_file)
     # guard against the loop below passing vacuously on an empty load
     self.assertEqual(len(db_records_list), len(self.db_records))
     for index, record in enumerate(db_records_list):
         expected = self.db_records[index]
         self.assertEqual(record.id, expected.id)
         self.assertEqual(str(record.seq), str(expected.seq))
     # make database
     self.dbfile_path, db_report = blasting.make_blastDB(self.temp_dir,
                                                         self.db_name,
                                                         self.db_file,
                                                         'nucl')
     # compare by value: identity of ints is an implementation detail
     self.assertEqual(db_report['status'], 0)
     self.assertEqual(db_report['message'], 'database exists')
     # run local blast batch (with multiple queries)
     matches_multi = blasting.blast_record_set(self.dbfile_path,
                                               self.multi_records,
                                               self.prefs)
     self.assertEqual(len(matches_multi), 3)
     for record in self.multi_records:
         first_match = matches_multi[record.id][0]
         self.assertEqual(first_match['contig_id'], record.id)
         self.assertEqual(first_match['details']['match_p100'], 100)
Esempio n. 2
0
 def test_local_blastn(self):
     """Run a single-query local blastn and parse its tabular output.

     Checks that the query and database fasta files round-trip
     unchanged, that the database build reports success, that
     ``blasting.local_blastn`` returns empty output and no error, and
     that the parsed result holds exactly one match: the query itself
     at 100% identity.
     """
     # prepare query
     seqfile_ops.write_fasta(self.single_q_file, self.single_record)
     query_record = seqfile_ops.load_fasta(self.single_q_file)
     self.assertEqual(query_record.id, self.record_1.id)
     self.assertEqual(str(query_record.seq), str(self.record_1.seq))
     # prepare database
     seqfile_ops.write_fasta(self.db_file, self.db_records)
     records_list = seqfile_ops.load_multifasta(self.db_file)
     # guard against the loop below passing vacuously on an empty load
     self.assertEqual(len(records_list), len(self.db_records))
     for index, record in enumerate(records_list):
         expected = self.db_records[index]
         self.assertEqual(record.id, expected.id)
         self.assertEqual(str(record.seq), str(expected.seq))
     # make database
     self.dbfile_path, db_report = blasting.make_blastDB(self.temp_dir,
                                                         self.db_name,
                                                         self.db_file,
                                                         'nucl')
     # compare by value: identity of ints is an implementation detail
     self.assertEqual(db_report['status'], 0)
     self.assertEqual(db_report['message'], 'database exists')
     # run local blast with single query
     self.status = blasting.local_blastn(self.single_q_file,
                                         self.single_out_file,
                                         self.dbfile_path,
                                         self.prefs)
     self.assertEqual(self.status['output'], '')
     self.assertIsNone(self.status['error'])
     # parse blast output
     matches_single = blasting.parse_blast_out6(self.single_out_file,
                                                self.prefs)
     self.assertEqual(len(matches_single), 1)
     first_match = matches_single[0]
     self.assertEqual(first_match['contig_id'], self.single_record.id)
     self.assertEqual(first_match['details']['match_p100'], 100)
Esempio n. 3
0
 def test_seq_subset_load_from_chop_by_size(self):
     """Chop the single query record into fixed-size segments.

     Uses the 'size' subset mode with 5-base segments in 'exact_size'
     chop mode and expects 10 segments, the first covering 0-5.
     """
     seqfile_ops.write_fasta(self.single_q_file, self.single_record)
     subset_mode = 'size'
     subset_args = {'size': 5, 'chop_mode': 'exact_size'}
     subset, subset_file = dataset_load.seq_subset_load(self.single_q_file,
                                                        subset_mode,
                                                        subset_args)
     # compare by value: identity of ints is an implementation detail
     self.assertEqual(len(subset), 10)
     self.assertEqual(subset[0].id, 'temp_1_0-5')
Esempio n. 4
0
def seq_subset_load(infile, subset_mode, subset_args):
    """Load a subset of sequence segments from a sequence file.

    Arguments:
    infile -- path of the sequence file to read.
    subset_mode -- how the segments are obtained:
        'flatfile'    infile is a multifasta file whose records are the
                      subset; nothing is written and infile is returned
                      as the subset file.
        'coordinates' segment coordinates are read from a separate file.
        'features'    segment coordinates come from features collected
                      from infile.
        'size'        the sequence is chopped into segments by size.
    subset_args -- mode-specific arguments:
        'coordinates' dict with keys 'file', 'header' and 'columns',
                      passed on to adaptive_list_load.
        'features'    passed unchanged to feat_collect as its mode.
        'size'        dict with keys 'size' and 'chop_mode', passed on
                      to coord_chop.
        'flatfile'    not used.

    Returns a (subset, subset_file) pair. For every mode except
    'flatfile' the subset is also written to '<record id>_subset.fas'
    (a relative path) and that name is returned as subset_file.

    Exceptions raised by the helper functions propagate unchanged.
    """
    from analysis.sequence_ops import feat_collect, feature_coords, \
        coord_chop, get_seq_subset_by_coords
    from analysis.seqfile_ops import load_multifasta, surefmt_load, \
        write_fasta
    from analysis.text_manipulation import adaptive_list_load
    # Modes are compared with == : 'is' on strings only works when both
    # objects happen to be interned, which callers cannot guarantee.
    if subset_mode == 'flatfile':
        # in this case the sequence file MUST be multifasta
        subset = load_multifasta(infile)
        print("set of %d sequence segments" % len(subset))
        return subset, infile
    # load the query single sequence file (convert format if necessary)
    seq_record = surefmt_load(infile, 'fasta', 'generic_dna')
    print("query sequence loaded from %s" % (infile,))
    # load or generate coordinate pairs for target segments
    if subset_mode == 'coordinates':
        coords_file = subset_args['file']
        header = subset_args['header']
        columns = subset_args['columns']
        coords_list = adaptive_list_load(coords_file, header, columns)
        # report the file the coordinates were actually read from
        print("%d segments loaded from %s" % (len(coords_list),
                                              coords_file))
    elif subset_mode == 'features':
        feat_mode = subset_args
        features = feat_collect(infile, feat_mode)
        coords_list = feature_coords(features)
        print(coords_list)
        print("%d features loaded from %s" % (len(coords_list), infile))
    elif subset_mode == 'size':
        size = subset_args['size']
        chop_mode = subset_args['chop_mode']
        coords_list = coord_chop(len(seq_record.seq), size, chop_mode)
        print("%d segments generated to fit %s" % (len(coords_list), size))
    else:
        print("ERROR: A mode MUST be specified.")
        # NOTE(review): execution continues with no coordinates, as in
        # the original; what get_seq_subset_by_coords does with None is
        # not visible here -- consider raising ValueError instead.
        coords_list = None
    # collect subset of sequence segments using resulting coords_list
    subset = get_seq_subset_by_coords(seq_record, coords_list)
    print("subset of %d sequence segments" % len(subset))
    # save subset to multifasta file for later use or reference
    subset_file = seq_record.id+'_subset.fas'
    write_fasta(subset_file, subset)
    print("subset written to fasta file %s" % (subset_file,))
    return subset, subset_file
Esempio n. 5
0
 def test_seq_subset_load_from_multifasta(self):
     """Load a subset straight from a multifasta file ('flatfile' mode).

     The subset must contain the five records in their original order,
     and the returned subset file must be the input file itself.
     """
     seqfile_ops.write_fasta(self.multi_q_file, self.multi_records)
     subset_mode = 'flatfile'
     subset_args = None
     subset, subset_file = dataset_load.seq_subset_load(self.multi_q_file,
                                                        subset_mode,
                                                        subset_args)
     # compare by value: identity of ints/strings is not guaranteed
     self.assertEqual(len(subset), 5)
     for index, record in enumerate(subset):
         self.assertEqual(record.id, self.multi_records[index].id)
     self.assertEqual(subset_file, self.multi_q_file)
Esempio n. 6
0
 def test_seq_subset_load_from_coords(self):
     """Load a subset using coordinates read from a delimited file.

     Writes the fixture's coordinate table to disk, runs the
     'coordinates' subset mode with one header line and columns 1 and 2,
     and expects three segments.
     """
     seqfile_ops.write_fasta(self.single_q_file, self.single_record)
     # context manager closes the file even if the write fails
     with open(self.coords_file, 'w') as temp_file:
         temp_file.write(self.str_contents)
     subset_mode = 'coordinates'
     subset_args = {'file': self.coords_file, 'header': 1,
                    'columns': (1, 2)}
     subset, subset_file = dataset_load.seq_subset_load(self.single_q_file,
                                                        subset_mode,
                                                        subset_args)
     # compare by value: identity of ints is an implementation detail
     self.assertEqual(len(subset), 3)
     self.assertEqual(subset[0].id, 'temp_1_0-10')
     self.assertEqual(str(subset[2].seq), 'TTTGGCGCTCGCGGCGGG')
Esempio n. 7
0
 def test_write_and_load_multifasta(self):
     """Write three records to a multifasta file and read them back.

     Every record is compared; the original loop used range(0,2) and so
     never checked the third one.
     """
     count = seqfile_ops.write_fasta(self.fas_filename,
                                     self.three_records)
     # compare by value: identity of ints is an implementation detail
     self.assertEqual(count, 3)
     fas_records = seqfile_ops.load_multifasta(self.fas_filename)
     # zip() stops at the shorter input, so check the lengths first
     self.assertEqual(len(fas_records), len(self.three_records))
     for loaded, expected in zip(fas_records, self.three_records):
         self.assertEqual(loaded.id, expected.id)
Esempio n. 8
0
def blast_record_set(dbfile_path, fasta_records, blast_prefs):
    """Loop through fasta entries and blast each against the database.

    Arguments:
    dbfile_path -- blast database path, passed on to local_blastn.
    fasta_records -- iterable of query records; each must have an 'id'.
    blast_prefs -- preferences passed to local_blastn and to
                   parse_blast_out6.

    Returns a dict mapping each query record id to the matches parsed
    from that query's blast output.

    Each query is written to 'temp.fas' and blasted into 'temp.blast',
    both in the current working directory; both files are removed after
    every query, whether or not it succeeded. Exceptions raised while
    writing, blasting or parsing propagate to the caller.
    """
    import os
    from analysis.seqfile_ops import write_fasta
    from analysis.blasting import local_blastn
    query_file = 'temp.fas'
    out_file = 'temp.blast'
    matches = {}
    for query_record in fasta_records:
        try:
            write_fasta(query_file, query_record)
            local_blastn(query_file, out_file, dbfile_path, blast_prefs)
            matches[query_record.id] = parse_blast_out6(out_file,
                                                        blast_prefs)
        finally:
            # Only remove what exists: if blasting failed before the
            # output was created, an unconditional os.remove would raise
            # and hide the original error.
            for temp_path in (query_file, out_file):
                if os.path.exists(temp_path):
                    os.remove(temp_path)
    return matches
Esempio n. 9
0
 def setUp(self):
     """Create the temp directory, genome fasta files and genomes list.

     Builds nine sequence records grouped into three genome record
     sets, writes each set to its own fasta file inside the temp
     directory, and writes a tab-separated genomes list (a header line
     plus one 'name<TAB>file' line per genome).
     """
     # create temp directory
     self.temp_dir = "tests/temp_data/"
     os.mkdir(self.temp_dir)
     # assign database directory (but don't create it yet)
     self.db_path = "tests/temp_data/temp_db/"
     # define file names
     self.gen_filename_1 = "temp_gen1.fas"
     self.gen_filename_2 = "temp_gen2.fas"
     self.gen_filename_3 = "temp_gen3.fas"
     self.genomes_list = self.temp_dir+"temp_genomes.txt"
     # create content for the genomes file
     self.header_contents = ['genome_name', 'file']
     self.line_1_contents = ['genome_1', self.gen_filename_1]
     self.line_2_contents = ['genome_2', self.gen_filename_2]
     self.line_3_contents = ['genome_3', self.gen_filename_3]
     self.raw_contents = [self.header_contents, self.line_1_contents,
                          self.line_2_contents, self.line_3_contents]
     # transform into a single string (one tab-separated line per entry)
     self.str_contents = "".join("\t".join(line_set)+"\n"
                                 for line_set in self.raw_contents)
     # create some sequence records
     self.seq_1 = Seq('AATTTAATGGCGCAGGCTAGGAGAGAGATTTTTGGCGCTCGCGGCGGGG')
     self.seq_2 = Seq('GGATTATACCAAAGGCTTAAACTATAGGCTAGGAGAGATAGACG')
     self.seq_3 = Seq('GGAATATACCTTAGGCTTAAACTATAGGCTAGGAGAGGCTCG')
     self.seq_4 = Seq('GGGGATTACAGCCATAGTAACCAGATATTAaGACG')
     self.seq_5 = Seq('GGAACCGCTGATACATGATTATAGATCTATAGGGTCTAAAACATCG')
     self.seq_6 = Seq('AGGTCATGTACGATGCAGAATTTGTCGTACGATGTTAGTACGATGGTA')
     self.seq_7 = Seq('TTTTTCGCGCGCTTAGACCCAAAATATATTGTCGCTATAGGTCCCTCT')
     self.seq_8 = Seq('ACCGTGTGGCATTTATATTACACCACACACAGATTGGGTGTGCCAATCAG')
     self.seq_9 = Seq('ACCGTACGTACCATATTATTATATAGGATAGATATTTAGAGGATTTAGAT')
     self.record_1 = SeqRecord(self.seq_1, id='temp_1')
     self.record_2 = SeqRecord(self.seq_2, id='temp_2')
     self.record_3 = SeqRecord(self.seq_3, id='temp_3')
     self.record_4 = SeqRecord(self.seq_4, id='temp_4')
     self.record_5 = SeqRecord(self.seq_5, id='temp_5')
     # NOTE(review): records 6-9 all reuse seq_5 while seq_6..seq_9 are
     # never used -- looks like a copy-paste slip; left unchanged because
     # other tests may depend on the current data. TODO confirm.
     self.record_6 = SeqRecord(self.seq_5, id='temp_6')
     self.record_7 = SeqRecord(self.seq_5, id='temp_7')
     self.record_8 = SeqRecord(self.seq_5, id='temp_8')
     self.record_9 = SeqRecord(self.seq_5, id='temp_9')
     # create record sets
     self.gen1_records = [self.record_1, self.record_2, self.record_3]
     self.gen2_records = [self.record_4, self.record_5, self.record_6]
     self.gen3_records = [self.record_7, self.record_8, self.record_9]
     # create all primary files
     seqfile_ops.write_fasta(self.temp_dir+self.gen_filename_1,
                             self.gen1_records)
     seqfile_ops.write_fasta(self.temp_dir+self.gen_filename_2,
                             self.gen2_records)
     seqfile_ops.write_fasta(self.temp_dir+self.gen_filename_3,
                             self.gen3_records)
     # context manager closes the file even if the write fails
     with open(self.genomes_list, 'w') as seq_file:
         seq_file.write(self.str_contents)
Esempio n. 10
0
 def test_surefmt_load_fas2fas(self):
     """Load a fasta file through surefmt_load, asking for fasta."""
     count = seqfile_ops.write_fasta(self.fas_filename, self.record)
     # compare by value: identity of ints is an implementation detail
     self.assertEqual(count, 1)
     fas_record = seqfile_ops.surefmt_load(self.fas_filename,
                                           'fasta', generic_dna)
     self.assertEqual(fas_record.id, self.record.id)
Esempio n. 11
0
 def test_seqfile_format_fas(self):
     """Check that seqfile_format reports 'fasta' for a fasta file."""
     count = seqfile_ops.write_fasta(self.fas_filename, self.record)
     # compare by value: identity of ints is an implementation detail
     self.assertEqual(count, 1)
     # second returned value is not checked by this test
     file_format, _name = seqfile_ops.seqfile_format(self.fas_filename)
     self.assertEqual(file_format, 'fasta')
Esempio n. 12
0
 def test_load_agnostic_fas(self):
     """Check that load_agnostic loads a fasta file and reports 'fasta'."""
     count = seqfile_ops.write_fasta(self.fas_filename, self.record)
     # compare by value: identity of ints is an implementation detail
     self.assertEqual(count, 1)
     fas_record, file_type = seqfile_ops.load_agnostic(self.fas_filename)
     self.assertEqual(fas_record.id, self.record.id)
     self.assertEqual(file_type, 'fasta')
Esempio n. 13
0
 def test_write_and_load_single_fasta(self):
     """Write one record to a fasta file and read it back."""
     count = seqfile_ops.write_fasta(self.fas_filename, self.record)
     # compare by value: identity of ints is an implementation detail
     self.assertEqual(count, 1)
     fas_record = seqfile_ops.load_fasta(self.fas_filename)
     self.assertEqual(fas_record.id, self.record.id)
Esempio n. 14
0
 def test_surefmt_load_fas2gbk(self):
     """Load a fasta file through surefmt_load, asking for genbank."""
     count = seqfile_ops.write_fasta(self.fas_filename, self.record)
     # compare by value: identity of ints is an implementation detail
     self.assertEqual(count, 1)
     gbk_record = seqfile_ops.surefmt_load(self.fas_filename,
                                           'genbank', generic_dna)
     self.assertEqual(gbk_record.id, self.record.id)