Example #1
0
def parse_blast_out6(out_file, blast_prefs):
    """Parse BLAST results formatted in Tabular format (outfmt=6).

    Args:
        out_file: path to the BLAST tabular (-outfmt 6) output file.
        blast_prefs: dict of acceptance thresholds with keys
            'score' (minimum bitscore) and 'length' (minimum
            alignment length).

    Returns:
        List of dicts, one per accepted hit:
        {'contig_id': <matched subject id>, 'details': {...}} where
        details holds match_p100, length, m_orient ('+'/'-'),
        q_start, q_end, s_start, s_end, evalue (kept as the string
        BLAST reported) and bitscore (float).
    """
    from analysis.text_manipulation import adaptive_list_load
    min_bitscore = blast_prefs['score']
    min_length = blast_prefs['length']
    query_matches = []
    # default blast outfmt yields
    # 0=qseqid 1=sseqid 2=pident 3=length 4=mismatch 5=gapopen
    # 6=qstart 7=qend 8=sstart 9=send 10=evalue 11=bitscore
    results = adaptive_list_load(out_file, 0, (1, 2, 3, 6, 7, 8, 9, 10, 11))
    for line in results:
        contig_ID = line[0]
        match_p100 = float(line[1])
        length = int(line[2])
        q_start = int(line[3])
        q_end = int(line[4])
        s_start = int(line[5])
        s_end = int(line[6])
        evalue = line[7]  # left as string (may be scientific notation)
        bitscore = float(line[8])
        # Guard clauses: skip hits that fail either acceptance threshold.
        if bitscore < min_bitscore:
            continue
        if length < min_length:
            continue
        # Query coordinates reported end-before-start indicate a
        # reverse-orientation match.
        if q_start > q_end:
            m_orient = '-'
        else:
            m_orient = '+'
        match_details = {'match_p100': match_p100,
                         'length': length, 'm_orient': m_orient,
                         'q_start': q_start, 'q_end': q_end,
                         's_start': s_start, 's_end': s_end,
                         'evalue': evalue, 'bitscore': bitscore}
        match = {'contig_id': contig_ID, 'details': match_details}
        query_matches.append(match)
    return query_matches
Example #2
0
def genome_sets_load(genomes_path, input_file, input_prefs, db_path):
    """Load genome datasets listed in an input file."""
    import os, sys
    from classes.analysis_obj import GenomeSet
    from analysis.seqfile_ops import ensure_fasta
    from analysis.text_manipulation import adaptive_list_load
    from analysis.blasting import make_blastDB
    header = input_prefs['header']
    columns = input_prefs['columns']
    genomes_list = adaptive_list_load(input_file, header, columns)
    print "prepping BLAST databases"
    genome_sets = []
    for line in genomes_list:
        genome_name = line[0]
        seq_file = os.path.join(genomes_path, line[1])
        try: db_infile = ensure_fasta(seq_file)
        except: raise
        else: print "genome FASTA sequence available in", db_infile
        dbfile_path, DB_report = make_blastDB(db_path, genome_name,
                                              seq_file, 'nucl') 
        if DB_report['status'] is 1:
            print genome_name, ":", DB_report['message']['error']
            sys.exit()
        elif DB_report['status'] is 0:
            print genome_name, ":", DB_report['message']
        new_genome_set = GenomeSet(db_infile, genome_name)
        genome_sets.append(new_genome_set)
    print "   ", len(genome_sets),"databases ready to search"
    return genome_sets
Example #3
0
def seq_subset_load(infile, subset_mode, subset_args):
    """Load a subset of sequence segments from a sequence file."""
    from analysis.sequence_ops import feat_collect, feature_coords, \
        coord_chop, get_seq_subset_by_coords 
    from analysis.seqfile_ops import load_multifasta, surefmt_load, \
        write_fasta
    from analysis.text_manipulation import adaptive_list_load
    if subset_mode is 'flatfile':
        # in this case the sequence file MUST be multifasta
        try: subset = load_multifasta(infile)
        except: raise
        else:
            print "set of", len(subset), "sequence segments"
            subset_file = infile
    else:
        # load the query single sequence file (convert format if necessary)
        try: seq_record = surefmt_load(infile, 'fasta', 'generic_dna')
        except: raise
        else: print "query sequence loaded from", infile
        # load or generate coordinate pairs for target segments
        if subset_mode is 'coordinates':
            try:
                coords_file = subset_args['file']
                header = subset_args['header']
                columns = subset_args['columns']
                coords_list = adaptive_list_load(coords_file, header, columns)
            except: raise
            else: print len(coords_list), "segments loaded from", infile
        elif subset_mode is 'features':
            try:
                feat_mode = subset_args
                features = feat_collect(infile, feat_mode)
                coords_list = feature_coords(features)
                print coords_list
            except: raise
            else: print len(coords_list),"features loaded from", infile
        elif subset_mode is 'size':
            try:
                size = subset_args['size']
                chop_mode = subset_args['chop_mode']
                coords_list = coord_chop(len(seq_record.seq), size, chop_mode)
            except: raise
            else: print len(coords_list), "segments generated to fit", size
        else:
            print "ERROR: A mode MUST be specified."
            coords_list = None
        # collect subset of sequence segments using resulting coords_list
        try: subset = get_seq_subset_by_coords(seq_record, coords_list)
        except: raise
        else: print "subset of", len(subset), "sequence segments"
        # save subset to multifasta file for later use or reference
        subset_file = seq_record.id+'_subset.fas'
        try: write_fasta(subset_file, subset)
        except: raise
        else: print "subset written to fasta file", subset_file
    return subset, subset_file
 def test_adaptive_list_load(self):
     """adaptive_list_load keeps only the requested columns of each row."""
     # Load self.filename, skipping 0 header rows and keeping only
     # columns 1 and 3 of each line.
     trimlines = text_manipulation.adaptive_list_load(self.filename, 0,
                                                      (1,3))
     # presumably the fixture file has 4 data rows -- verify against setUp
     self.assertEqual(len(trimlines), 4)
     # the second kept column (index 1) maps back to source column 3
     self.assertEqual(trimlines[1][1], self.line_1_contents[3])