Example #1
0
def getHits(gene):
    ''' BLAST parser using Biopython

    Reads the standard-format BLAST report named by the module-level option
    ``o.blast`` and, for every query that is a key of ``gene``, writes to the
    module-level handle ``out`` a header line followed by up to three
    tab-separated hit lines (label, subject title, e-value, quality).

    Input: ``gene`` - dict keyed by query id (the first space-delimited
        token of the report's query line); the values are the labels
        written to ``out``.  Each entry is deleted once its query has been
        processed, so on return ``gene`` holds only the queries that never
        appeared in the report.
    Outputs: hit lines on ``out``; progress (when ``o.verbose`` is set) and
        the list of genes not found on stderr.
    '''
    lg = len(gene)
    if o.verbose:
        sys.stderr.write("\nGetting hits...\n")

    with open(o.blast, 'rU') as inf:
        error_parser = NCBIStandalone.BlastErrorParser(inf)
        iterator = NCBIStandalone.Iterator(inf, error_parser)

        ## *** Parsing *** ##
        for record in iterator:
            query = record.query.split(" ")[0]
            if query not in gene:
                continue

            label = gene[query]
            out.write("%s\n" % label)
            if not record.alignments:
                # An identity test against a fresh [] can never succeed, so
                # emptiness is tested by truth value instead.
                out.write("%s\tNA\tNA\tNA\n" % label)
            else:
                # flag counts the lines written for this query (3 at most).
                flag = 0
                for alignment in record.alignments:
                    for hsp in alignment.hsps:
                        #-->## ** Selection Process **##
                        expect = float(hsp.expect)
                        if flag < 3 and expect < 5.0:
                            if expect < 0.0001:
                                quality = "High"
                            elif expect < 1.0:
                                quality = "Low"
                            else:
                                quality = "Scare"
                            out.write(
                                "%s\t%s\t%s\t%s\n" %
                                (label, alignment.title.split(">")[1],
                                 expect, quality))
                            flag += 1
                        elif expect > 1.0 and flag < 1:
                            # Nothing usable has been written yet for this
                            # query: record a single placeholder line.
                            out.write("%s\tNA\tNA\tNA\n" % label)
                            flag += 1
            del gene[query]

            if o.verbose:
                # Rewrite the same terminal line with the percentage done.
                sys.stderr.write('\r')
                sys.stderr.write(str(int((lg - len(gene)) * 100 / lg)) + '%')
                sys.stderr.flush()

    # Whatever is left in gene was never seen in the report.
    if gene:
        sys.stderr.write("\nGenes not found:\n%s" % list(gene))
Example #2
0
def blast2data(filehandle):  ###This should be for blast-txt
    """BLAST output to data dict

    Parses the plain-text BLAST report readable from ``filehandle`` and
    groups its HSPs by subject identifier.

    The identifier is the second ``|``-separated field of the alignment
    title when the title contains ``|``, otherwise the first
    space-separated token (the leading character of the title is dropped in
    both cases).  HSPs whose e-value exceeds the module-level
    ``EVALUE_CUTOFF`` are skipped.

    Side effects: every newly seen identifier is appended to the
    module-level ``references`` list and passed to ``refgenome2json``, and
    its length is cached in ``refLengths`` via ``gi2length``.

    Returns:
        dict mapping identifier -> list of
        ``[start, identity, end, readname]`` where ``start <= end`` are the
        subject coordinates, ``identity`` is the percent identity rounded
        to one decimal and ``readname`` is the first token of the query.
    """
    data = {}
    blast_parser = NCBIStandalone.BlastParser()
    blast_iterator = NCBIStandalone.Iterator(filehandle, blast_parser)
    for blast_record in blast_iterator:
        readname = blast_record.query.split()[0]
        for alignment in blast_record.alignments:
            title = alignment.title[1:].strip()
            # Keep each branch's result: the fallback must not overwrite the
            # identifier taken from a pipe-delimited title.
            if '|' in alignment.title:
                refgi = title.split('|')[1]
            else:
                refgi = title.split(' ')[0]
            for hsp in alignment.hsps:
                if hsp.expect > EVALUE_CUTOFF:
                    continue
                identity = round(
                    float(hsp.identities[0]) * 100 / hsp.identities[1], 1)
                start = hsp.sbjct_start
                end = hsp.sbjct_end
                if start > end:
                    # Store coordinates in ascending order.
                    start, end = end, start
                if refgi not in references:
                    references.append(refgi)
                    refgenome2json(refgi)
                if refgi not in refLengths:
                    refLengths[refgi] = gi2length(refgi)
                data.setdefault(refgi, []).append(
                    [start, identity, end, readname])
    return data
def getCoordinatesFromBlo(bloFname, padding):
    '''
    # Extract coordinates from blo file

    Parses the plain-text BLAST report ``bloFname`` and records, per subject
    title (with every '>' removed), one ``[sbjct_start, sbjct_end]`` pair.
    An HSP replaces the stored pair unless it lies entirely inside it.

    bloFname: path of the BLAST report to read.
    padding:  accepted for interface compatibility; not used here.

    Returns a dict mapping subject title -> [sbjct_start, sbjct_end].
    '''
    coord = {}

    blast_parser = NCBIStandalone.BlastParser()
    # Read the report that was asked for, and close it when done.
    # (A SearchIO.parse(handle, 'blast-txt') variant was noted in the
    # original as a possible replacement that was not working yet.)
    with open(bloFname) as blo_handle:
        blast_iterator = NCBIStandalone.Iterator(blo_handle, blast_parser)
        for hit in blast_iterator:
            for alignment in hit.alignments:
                fullName = alignment.title.replace('>', '')
                for hsp in alignment.hsps:
                    known = coord.get(fullName)
                    if (known is not None
                            and hsp.sbjct_start >= known[0]
                            and hsp.sbjct_end <= known[1]):
                        # Already covered by the stored interval.
                        continue
                    coord[fullName] = [hsp.sbjct_start, hsp.sbjct_end]
    return coord
Example #4
0
def blast_parse(file, e, output):
    """Write every HSP with an e-value below ``e`` to a tab-separated file.

    Parameters:
        file: path of the plain-text BLAST report to read.
        e: e-value threshold; only HSPs with ``hsp.expect < e`` are kept.
        output: path of the report to write (overwritten).

    The output starts with a header line, followed by one line per kept
    HSP: the first 18 characters of the query, the alignment title, the
    alignment length and the e-value.
    """
    blast_parser = NCBIStandalone.BlastParser()

    with open(file) as result_handle:
        with open(output, 'w') as out_handle:
            out_handle.write('query title\tdescription\tlength\te value' + '\n')
            # Iterate from the very first record; pulling one record off the
            # iterator beforehand would drop it from the report.
            for blast_record in NCBIStandalone.Iterator(result_handle,
                                                        blast_parser):
                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if hsp.expect < e:
                            out_handle.write(
                                '%s \t%s\t%s\t%s\n' %
                                (blast_record.query[:18], alignment.title,
                                 alignment.length, hsp.expect))
Example #5
0
 def __init__(self, handle):
     """Keep ``handle`` and wrap it in a standalone-BLAST record iterator."""
     self.handle = handle
     self.blast_iter = NCBIStandalone.Iterator(
         handle, NCBIStandalone.BlastParser())
Example #6
0
# Command-line arguments: database FASTA and output file.  queryFile, used
# below, is not assigned in this part of the script.
dbFile = argv[2]
outFile = argv[3]
#padding = int(argv[4])

# Format dbFile
# The presence of "<dbFile>.nin" is taken as the sign that the database has
# already been formatted.
if os.path.exists("%s.nin" % dbFile):
    print "--[WARNING]blastdb already formated, using the existing one."
else:
    print "Formatting database..."
    os.system('formatdb -i %s -p F' % dbFile)

# Run BLAST
# The report is written to temp.blo in the current directory.
# NOTE(review): the paths are interpolated into a shell command unquoted, so
# names containing spaces or shell metacharacters will break it; the exit
# status of os.system is not checked.
os.system(
    'blastall -p blastn -i %s -d %s -e 1e-10 -v 100000 -b 100000 -m 0 -o temp.blo -q -2'
    % (queryFile, dbFile))

# Extract sequences from blo file
# One entry per HSP: "<title>_<start>-<end>", the subject sequence, and a
# blank line.
# NOTE(review): neither outf nor the temp.blo handle is closed in the lines
# shown here.
outf = open(outFile, 'w')
blast_parser = NCBIStandalone.BlastParser()
blast_iterator = NCBIStandalone.Iterator(open('temp.blo'), blast_parser)
#blast_iterator = SearchIO.parse(open('temp.blo'),'blast-txt') #if switch to SearchIO, this is the way to go (not working yet)
for hit in blast_iterator:
    for alignment in hit.alignments:
        for hsp in alignment.hsps:
            #print alignment.title
            #print hsp.sbjct_start, hsp.sbjct_end
            #print hsp.sbjct
            outf.write(
                "%s_%s-%s\n%s\n\n" %
                (alignment.title, hsp.sbjct_start, hsp.sbjct_end, hsp.sbjct))
Example #7
0
# biopython
from Bio.Blast import NCBIStandalone

my_blast_db = os.path.join(os.getcwd(), 'at-est', 'a_cds-10-7.fasta')
my_blast_file = os.path.join(os.getcwd(), 'at-est', 'test_blast',
                             'sorghum_est-test.fasta')
my_blast_exe = os.path.join(os.getcwd(), 'blast', 'blastall')

print 'Running blastall...'
blast_out, error_info = NCBIStandalone.blastall(my_blast_exe, 'blastn',
                                                my_blast_db, my_blast_file)


b_parser = NCBIStandalone.BlastParser()

b_iterator = NCBIStandalone.Iterator(blast_out, b_parser)

while 1:
    b_record = b_iterator.next()

    if b_record is None:
        break
    
    E_VALUE_THRESH = 0.04
    for alignment in b_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < E_VALUE_THRESH:
                print '****Alignment****'
                print 'sequence:', alignment.title
                print 'length:', alignment.length
                print 'e value:', hsp.expect
Example #8
0
 def __init__(self, handle):
     """Remember ``handle`` and build a BLAST parsing iterator over it."""
     record_parser = NCBIStandalone.BlastParser()
     self.blast_iter = NCBIStandalone.Iterator(handle, record_parser)
     self.handle = handle
Example #9
0
import string
from Bio.Seq import Seq
from Bio.Blast import NCBIStandalone

import fileinput
import glob

OutFile = r'Blast\out\02.blast_result_total.txt'

f = open(OutFile, 'r')

Mismatch_total_file = open('Mismatch_total_LOD v3_171103.txt', 'w')

blast_parser = NCBIStandalone.BlastParser()
print blast_parser
iterator = NCBIStandalone.Iterator(f, blast_parser)

for record in iterator:
    for alignment in record.alignments:
        for hsp in alignment.hsps:
            mismatch_Number = (hsp.identities[1] -
                               (hsp.identities[0] + hsp.gaps[0]))
            Gaps_Number = hsp.gaps[0]
            if (mismatch_Number != 0):
                Mismatch_total_file.write('%s\t%s\t%s\t%s\n' %
                                          (record.query, alignment.title,
                                           mismatch_Number, Gaps_Number))

f.close()
Mismatch_total_file.close()
def blast(blastRootDirectory):
    """Run ``blastall`` (blastp) on ``filetoblast.txt`` and collect matches.

    The query file is ``<blastRootDirectory>/filetoblast.txt``.  On win32
    the database and executable are also taken from ``blastRootDirectory``
    (converted to short path names); on other platforms the database path
    is ``/tmp/BLAST/blastDB.fasta`` and the executable is looked up at
    ``../../BLAST/bin/blastall`` relative to the current working directory.

    Returns:
        ``(nonmatchingQueries, results)``: ``results`` is a list of
        ``(query, alignment title, e-value)`` tuples for HSPs with an
        e-value below 0.001 whose subject title differs from the query;
        ``nonmatchingQueries`` lists the queries that had no such HSP.
    """
    if sys.platform == 'win32':
        blast_db = os.path.join(blastRootDirectory, 'blastDB.fasta')
    else:
        # Make sure /tmp/BLAST exists and holds a copy of formatdb.
        # NOTE(review): nothing in this function creates
        # /tmp/BLAST/blastDB.fasta itself — presumably the caller does;
        # confirm.
        if not os.path.isdir('/tmp/BLAST'):
            print "making directory '/tmp/BLAST'"
            os.mkdir('/tmp/BLAST/')
        if not os.path.exists('/tmp/BLAST/formatdb'):
            shutil.copy(os.path.join(blastRootDirectory, 'formatdb'),
                        '/tmp/BLAST')
            print "copying 'formatdb' to '/tmp/BLAST/'"
        blast_db = os.path.join('/tmp/BLAST', 'blastDB.fasta')
    #print 'path to blastDB.fasta:', blast_db

    blast_file = os.path.join(blastRootDirectory, 'filetoblast.txt')
    #print 'path to filetoblast.txt:', blast_file

    if sys.platform == 'win32':
        blastall_name = 'Blastall.exe'
        blast_exe = os.path.join(blastRootDirectory, blastall_name)
    else:
        blastall_name = 'blastall'
        blast_exe = os.path.join(os.getcwd(), '../../BLAST/bin/',
                                 blastall_name)

    #print 'path to blastall:', blast_exe

    if sys.platform == 'win32':
        # Convert all three paths to their short (8.3) form.
        import win32api
        blast_db = win32api.GetShortPathName(blast_db)
        blast_file = win32api.GetShortPathName(blast_file)
        blast_exe = win32api.GetShortPathName(blast_exe)

    #cont = raw_input('blah')
    #try:
    blast_out, error_info = NCBIStandalone.blastall(blast_exe,
                                                    'blastp',
                                                    blast_db,
                                                    blast_file,
                                                    align_view=7)
    #except:
    #  f = open(blast_file, 'r')
    #  s = file.read()
    #  print s

    #print 'done BLASTing'

    # NOTE(review): both handles are read to the end here, before blast_out
    # is handed to the parser below — presumably nothing is left for the
    # parser to consume; confirm.
    print 'errors:', error_info.read()
    print 'blast output:', blast_out.read()

    # NOTE(review): an NCBIXML parser is paired with the NCBIStandalone
    # iterator, and the b_record returned by parse() is overwritten on the
    # first loop iteration — confirm this combination is intended.
    b_parser = NCBIXML.BlastParser()
    #print 'got parser'

    b_record = b_parser.parse(blast_out)
    b_iterator = NCBIStandalone.Iterator(blast_out, b_parser)
    #print 'got iterator'
    results = []
    # NOTE(review): recordnumber is incremented but never read in this
    # function.
    recordnumber = 0
    nonmatchingQueries = []
    while 1:
        recordnumber += 1
        b_record = b_iterator.next()

        # A falsy record ends the loop.
        if not b_record: break
        print 'query:', b_record.query
        # Unreachable: a None record has already left the loop via the
        # "not b_record" test above.
        if b_record is None:
            break
        e_value_thresh = 0.001
        print 'number of alignments:', len(b_record.alignments)
        # Becomes True once any HSP below the threshold hits a subject whose
        # title differs from the query.
        significant = False
        for alignment in b_record.alignments:
            for hsp in alignment.hsps:
                if hsp.expect < e_value_thresh:
                    # The title is cleaned in place, so later HSPs of the
                    # same alignment see it without '>'.
                    alignment.title = alignment.title.replace(">", "")
                    if b_record.query != alignment.title:
                        significant = True
                        print 'adding', b_record.query, 'and', alignment.title, 'to the list of matches'
                        results.append(
                            (b_record.query, alignment.title, hsp.expect))
        print b_record.query, significant
        if not significant:
            print 'adding', b_record.query, 'to the list of queries without matches'
            nonmatchingQueries.append(b_record.query)

    return nonmatchingQueries, results
Example #11
0
argparser.add_argument('-d', '--dump', type=argparse.FileType('w'),
                       dest='dump_file',
                       help='pickle intermediate results in tempfile')
argparser.add_argument('-l', '--load', type=argparse.FileType('r'),
                       dest='load_file',
                       help='depickle intermediate results from tempfile')
argparser.add_argument('-o', '--outfile', type=argparse.FileType('w'), default=sys.stdout)

args = argparser.parse_args()
# Loading and dumping are mutually exclusive.  This is reported through
# argparse rather than an assert, which is stripped when Python runs with -O.
if args.load_file and args.dump_file:
    argparser.error('-l/--load and -d/--dump cannot be used together')

parser1 = NCBIStandalone.BlastParser()
parser2 = NCBIStandalone.BlastParser()

# PXL: PMZ(Q) x Lamp3(S), LXP: Lamp3(Q) x PMZ(S)
pxl_records = NCBIStandalone.Iterator(args.blast1, parser1)
lxp_records = NCBIStandalone.Iterator(args.blast2, parser2)

pxl_re = re.compile(r'(PMZ_[^\s]+)')
lxp_re = re.compile(r'(lamp3[^\s]+ [^\s]+ len\d+)')  # matching 'not whitespace' is faster and more robust


def pxl_key_fn(k):
    """Return the first PMZ identifier found in *k*."""
    return pxl_re.findall(k)[0]


def lxp_key_fn(k):
    """Return the first lamp3 identifier found in *k*."""
    return lxp_re.findall(k)[0]


if args.load_file:
    # NOTE: unpickling can execute arbitrary code; only load dump files that
    # this script produced.  The two lookups are read back in this order:
    # PXL first, then LXP.
    pxl_lookup = cPickle.load(args.load_file)
    lxp_lookup = cPickle.load(args.load_file)
else:
    pxl_lookup = make_lookup(pxl_records, pxl_key_fn)
    lxp_lookup = make_lookup(lxp_records, lxp_key_fn)
Example #12
0
    def ReadBlast(self, file, OUT, iszipped=0, is_psiblast=None):
        """Parse a BLAST report and write the accepted hits to ``OUT``.

        For every alignment only the first HSP is examined.  It is kept
        when its e-value, percent positives and query coverage pass
        ``self.HSP_max_evalue``, ``self.HSP_minimal_positives`` and
        ``self.HSP_minimal_coverage``.  Each kept hit is written as one
        tab-separated line (query, hit id, % positives, % coverage, score,
        e-value) and, when ``self.cB`` is true, is also passed to
        ``self.createblastDict``.

        Parsing stops at the first query shorter than ``self.min_size``.

        Parameters:
            file: path of the BLAST report; opened with gzip when the name
                ends in ``.gz`` or ``iszipped`` is true.
            OUT: path of the output file (overwritten).
            iszipped: force gzip decompression regardless of the file name.
            is_psiblast: parse with the PSI-BLAST parser and use only the
                last round of each record.

        Returns:
            ``None`` on success, or the string ``'Error parsing <file>'``
            when fetching the next record raised an exception.
        """
        output = open(OUT, "w")
        handle = None
        try:
            self.selfhits = []
            if is_psiblast:
                sys.stderr.write('Parsing PSI-Blast\n')
                self.parser = NCBIStandalone.PSIBlastParser()
            else:
                self.parser = NCBIStandalone.BlastParser()
            if file[-3:] == '.gz' or iszipped:
                handle = gzip.open(file)
            else:
                handle = open(file)

            self.iter = NCBIStandalone.Iterator(handle=handle,
                                                parser=self.parser)
            self.blastDict = {}

            while 1:
                try:
                    rec = self.iter.next()
                except Exception:
                    sys.stderr.write(
                        'Can\'t iterate on blast records anymore. Abort.\n')
                    import traceback
                    traceback.print_exc()
                    return 'Error parsing %s' % file
                if not rec:
                    break

                self.query = rec.query.split(" ")[0]
                self.length = rec.query_letters

                if self.length < self.min_size:
                    self.printer("Does not meet the minimum length " +
                                 str(self.min_size))
                    break

                if is_psiblast:
                    rec = rec.rounds[-1]

                # each alignment is one potential hit
                for alignment in rec.alignments:
                    hsp = alignment.hsps[0]  #no multiple hsps

                    # Prefer the entry name from an "sp|<ac>|<id> <note>"
                    # title; otherwise fall back to the first token after
                    # '>'.
                    m = re.search(r"sp\|(.+?)\|(.+?) (.+)?", alignment.title)
                    if m:
                        hit_sp_id = m.group(2)
                    elif alignment.title[0] == '>':
                        hit_sp_id = alignment.title[1:].split()[0]
                    else:
                        hit_sp_id = None

                    self.printer(hit_sp_id)
                    similarity = (hsp.positives[0] /
                                  float(hsp.positives[1]) * 100)
                    if (float(hsp.expect) <= float(self.HSP_max_evalue)
                            and similarity >= int(self.HSP_minimal_positives)):
                        coverage = hsp.positives[1] / float(self.length) * 100
                        if coverage >= int(self.HSP_minimal_coverage):
                            hitlist = [
                                hit_sp_id, similarity, coverage, hsp.score,
                                hsp.expect
                            ]
                            if self.cB:
                                # The query id lives on self; there is no
                                # local variable named "query" here.
                                self.createblastDict(self.query, hitlist)
                            # Every field, the last one included, is
                            # followed by a tab.
                            output.write("".join(
                                "%s\t" % field
                                for field in [self.query] + hitlist) + "\n")
        finally:
            # Close both files on every exit path, including the error
            # return above.
            if handle is not None:
                handle.close()
            output.close()
        return None
Example #13
0
    aster = []
    for t, pair in enumerate(zip(query, sbjct), 1):
        q, s = pair
        if q == 'N':
            aster.append([t, q, s])
    return aster


if __name__ == "__main__":

    blast_parser = NCBIStandalone.BlastParser()

    GLB = glob.glob('*.nofmt')
    for glb in GLB:
        handle = open(glb, 'r')
        blast_iterator = NCBIStandalone.Iterator(handle, blast_parser)
        total = 0
        E_VALUE_THRESH = 1
        with open(glb[:-6] + ".csv", 'w') as outcsv:
            for blast_record in blast_iterator:
                print("query:", blast_record.query)
                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if hsp.expect < E_VALUE_THRESH:
                            if hsp.align_length < 100:
                                continue
                            if len(str(hsp.sbjct)) == 0:
                                continue
                            print(hsp.query)
                            print(hsp.match)
                            print(hsp.sbjct)