def getCoordinatesFromBlo(bloFname, padding):
    '''
    Extract subject coordinates from a plain-text BLAST report.

    bloFname -- path of the BLAST text output to parse.
    padding  -- accepted for interface compatibility; not used here.

    Returns a dict mapping each alignment title (with every '>' removed)
    to a two-item list [sbjct_start, sbjct_end]. An HSP that lies entirely
    inside the range already stored for its title is skipped; any other
    HSP replaces the stored range.
    '''
    coord = {}
    blast_parser = NCBIStandalone.BlastParser()
    # Read the report named by the caller and release the handle when done.
    with open(bloFname) as blo_handle:
        for hit in NCBIStandalone.Iterator(blo_handle, blast_parser):
            for alignment in hit.alignments:
                fullName = alignment.title.replace('>', '')
                for hsp in alignment.hsps:
                    known = coord.get(fullName)
                    if (known is not None
                            and hsp.sbjct_start >= known[0]
                            and hsp.sbjct_end <= known[1]):
                        # Already covered by the stored range.
                        continue
                    coord[fullName] = [hsp.sbjct_start, hsp.sbjct_end]
    return coord
Example #2
0
def blast2data(filehandle):  ###This should be for blast-txt
    """Collect BLAST hits per reference from a plain-text BLAST report.

    filehandle -- open handle on the BLAST text output.

    Returns a dict mapping a reference id to a list of
    [start, identity, end, readname] entries, one per HSP whose e-value
    does not exceed the module-level EVALUE_CUTOFF. `start`/`end` are the
    subject coordinates ordered so that start <= end, and `identity` is
    the percent identity rounded to one decimal.

    Side effects: every reference id seen for the first time is appended
    to the module-level `references` list and passed to
    `refgenome2json`; its length is cached in `refLengths` through
    `gi2length`.
    """
    data = {}
    blast_parser = NCBIStandalone.BlastParser()
    blast_iterator = NCBIStandalone.Iterator(filehandle, blast_parser)
    for blast_record in blast_iterator:
        readname = blast_record.query.split()[0]
        for alignment in blast_record.alignments:
            # The first character of the title is dropped
            # (presumably the leading '>' -- TODO confirm).
            title = alignment.title[1:].strip()
            if '|' in alignment.title:
                # Pipe-delimited title: the id is the second field.
                refgi = title.split('|')[1]
            else:
                # Plain title: the id is the first space-separated token.
                refgi = title.split(' ')[0]
            for hsp in alignment.hsps:
                if hsp.expect > EVALUE_CUTOFF:
                    continue
                identity = round(
                    float(hsp.identities[0]) * 100 / hsp.identities[1], 1)
                # Report coordinates in ascending order regardless of strand.
                start, end = sorted((hsp.sbjct_start, hsp.sbjct_end))
                hits = data.setdefault(refgi, [])
                if refgi not in references:
                    references.append(refgi)
                    refgenome2json(refgi)
                if refgi not in refLengths:
                    refLengths[refgi] = gi2length(refgi)
                hits.append([start, identity, end, readname])
    return data
Example #3
0
def getHits(gene):
    ''' BLAST parser using Biopython

    Reads the BLAST text report named by the module-level option
    `o.blast` and writes a summary to the module-level handle `out`.

    Input: gene -- dict keyed by query id (first space-separated token of
           the record's query line); the values are the labels written
           to `out`. Entries are deleted from this dict as their queries
           are found, so the caller's dict is mutated.
    Output (written to `out`), for each query present in `gene`:
           a line with the label, then up to three hit lines
           "label<TAB>hit title<TAB>e-value<TAB>High|Low|Scare"
           (e-value < 0.0001, < 1.0, < 5.0 respectively), or a single
           "label<TAB>NA<TAB>NA<TAB>NA" line when the record has no
           alignments or its first HSP has an e-value of 5.0 or more.
    When `o.verbose` is set, progress is reported on stderr. Queries left
    in `gene` after parsing are listed on stderr.
    '''
    lg = len(gene)
    with open(o.blast, 'rU') as inf:
        error_parser = NCBIStandalone.BlastErrorParser(inf)
        iterator = NCBIStandalone.Iterator(inf, error_parser)

        ## *** Parsing *** ##
        if o.verbose:
            sys.stderr.write("\nGetting hits...\n")
        for record in iterator:
            query = record.query.split(" ")[0]
            if query not in gene:
                continue

            label = gene[query]
            out.write("%s\n" % label)
            if not record.alignments:
                out.write("%s\tNA\tNA\tNA\n" % label)
            else:
                written = 0  # lines emitted for this query (max 3 hits)
                for alignment in record.alignments:
                    for hsp in alignment.hsps:
                        #-->## ** Selection Process **##
                        evalue = float(hsp.expect)
                        if written < 3 and evalue < 5.0:
                            if evalue < 0.0001:
                                quality = "High"
                            elif evalue < 1.0:
                                quality = "Low"
                            else:
                                quality = "Scare"
                            out.write("%s\t%s\t%s\t%s\n" %
                                      (label, alignment.title.split(">")[1],
                                       evalue, quality))
                            written += 1
                        elif evalue > 1.0 and written < 1:
                            # Nothing usable reported yet for this query.
                            out.write("%s\tNA\tNA\tNA\n" % label)
                            written += 1
            del gene[query]
            if o.verbose:
                # Rewrite the percentage in place on the same stderr line.
                sys.stderr.write('\r')
                sys.stderr.write(str(int((lg - len(gene)) * 100 / lg)) + '%')
                sys.stderr.flush()

    if gene:
        sys.stderr.write("\nGenes not found:\n%s" % list(gene))
Example #4
0
 def __init__(self, dbname=None, blastexe=None, mode=None, parser=None):
     """Store the BLAST settings, falling back to module defaults.

     Any argument left as None is replaced by DEFAULT_BLAST_DB,
     DEFAULT_BLAST_EXE, DEFAULT_BLAST_MODE or a fresh
     NCBIStandalone.BlastParser() respectively.
     """
     chosen_db = DEFAULT_BLAST_DB if dbname is None else dbname
     chosen_exe = DEFAULT_BLAST_EXE if blastexe is None else blastexe
     chosen_mode = DEFAULT_BLAST_MODE if mode is None else mode
     chosen_parser = (NCBIStandalone.BlastParser()
                      if parser is None else parser)
     self.dbname = chosen_db
     self.blastexe = chosen_exe
     self.parser = chosen_parser
     self.mode = chosen_mode
Example #5
0
def blastall_seq2db(header,
                    sequence,
                    dbname="",
                    blastprogram="blastp",
                    output="ncbiparsed",
                    extra_blastp_params={
                        'F': 'F',
                        'e': '10'
                    }):
    """Run blastall with one sequence as query against a database.

    header, sequence    -- written as a single FASTA record to a
                           temporary query file in the current directory.
    dbname              -- value passed to blastall's -d option.
    blastprogram        -- one of 'blastp', 'tblastn', 'blastn', 'blastx'.
    output              -- "ncbiparsed" returns the record produced by
                           NCBIStandalone.BlastParser; anything else
                           returns blastall's raw stdout text.
    extra_blastp_params -- dict of extra one-letter blastall options
                           (only read, never modified).

    Returns the BLAST result, or False when running or parsing BLAST
    failed. The temporary query file is always removed.
    Raises ValueError for an unsupported `blastprogram`.
    """
    supported = ('blastp', 'tblastn', 'blastn', 'blastx')
    if blastprogram not in supported:
        raise ValueError("unsupported blastprogram %r; expected one of: %s" %
                         (blastprogram, ", ".join(supported)))

    extra_params = " ".join(
        ["-%s %s" % (k, v) for k, v in extra_blastp_params.items()])
    # generate (semi ;-) unique filename
    # NOTE(review): header/sequence end up in a file name that is
    # interpolated into a shell command -- confirm they are trusted.
    uniquetag = get_random_string_tag()
    fname = "_".join(
        [uniquetag,
         str(header).replace(" ", "_"), sequence[0:10] + ".fa"])
    fname = osPathJoin(OSgetcwd(), fname)
    with open(fname, 'w') as fh:
        fh.write(">%s\n%s\n" % (header, sequence))
    command = "%s -p %s %s -i %s -d %s " % (BLASTALL_PATH, blastprogram,
                                            extra_params, fname, dbname)
    try:
        ci, co, ce = osPopen3(command)
        ci.close()
        if output == "ncbiparsed":
            blastallout = NCBIStandalone.BlastParser().parse(co)
        else:
            blastallout = co.read()
        co.close()
        ce.close()
    except Exception:
        # for some kind of - obvious or freak accident case -
        # Blast or parsing of the blast record failed
        # No debugging here; just report and return False
        print("BLAST CRASHED::")
        print(command)
        blastallout = False
    finally:
        # remove the created Query file, whatever happened above
        osRemove(fname)
    # and return!
    return blastallout
Example #6
0
def blast_parse(file, e, output):
    """Write a tab-separated table of BLAST HSPs below an e-value cutoff.

    file   -- path of the plain-text BLAST report to read.
    e      -- cutoff; only HSPs with expect < e are written.
    output -- path of the table to create (overwritten if it exists).

    The table starts with a header line; each following row holds the
    first 18 characters of the query, the alignment title, the alignment
    length and the HSP e-value. Every record in the report is processed,
    including the first one.
    """
    blast_parser = NCBIStandalone.BlastParser()
    with open(file) as result_handle:
        blast_iterator = NCBIStandalone.Iterator(result_handle, blast_parser)
        with open(output, 'w') as table:
            table.write('query title\tdescription\tlength\te value' + '\n')
            for blast_record in blast_iterator:
                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if hsp.expect < e:
                            table.write("%s \t%s\t%s\t%s\n" %
                                        (blast_record.query[:18],
                                         alignment.title,
                                         alignment.length,
                                         hsp.expect))
Example #7
0
def blastall_file2db(fname,
                     dbname="",
                     blastprogram="blastp",
                     output="ncbiparsed",
                     extra_blastp_params={
                         'F': 'F',
                         'e': '10'
                     }):
    """Run blastall with an existing query file against a database.

    fname               -- path of the query file (never removed here).
    dbname              -- value passed to blastall's -d option.
    blastprogram        -- one of 'blastp', 'tblastn', 'blastn', 'tblastx'.
    output              -- "ncbiparsed" returns the record produced by
                           NCBIStandalone.BlastParser; anything else
                           returns blastall's raw stdout text.
    extra_blastp_params -- dict of extra one-letter blastall options
                           (only read, never modified).

    Raises ValueError for an unsupported `blastprogram` and RuntimeError
    when running or parsing BLAST fails (the command and blastall's
    stderr are printed first).
    """
    supported = ('blastp', 'tblastn', 'blastn', 'tblastx')
    if blastprogram not in supported:
        raise ValueError("unsupported blastprogram %r; expected one of: %s" %
                         (blastprogram, ", ".join(supported)))

    extra_params = " ".join(
        ["-%s %s" % (k, v) for k, v in extra_blastp_params.items()])
    command = "%s -p %s %s -i %s -d %s " % (BLASTALL_PATH, blastprogram,
                                            extra_params, fname, dbname)
    co = ce = None
    try:
        ci, co, ce = osPopen3(command)
        ci.close()
        if output == "ncbiparsed":
            blastallout = NCBIStandalone.BlastParser().parse(co)
        else:
            blastallout = co.read()
        # do NOT remove the input fname
    except Exception:
        error = ""
        # Close stdout before draining stderr so a child still writing
        # to stdout cannot keep us blocked.
        if co is not None:
            co.close()
        if ce is not None:
            error = ce.read().strip()
        print(command)
        print("ERROR: '%s'" % error)
        raise RuntimeError("BLAST CRASHED....")
    finally:
        for pipe in (co, ce):
            if pipe is not None:
                pipe.close()
    # and return!
    return blastallout
Example #8
0
# Push every report through the low-level scanner, first with a consumer
# for all tests, then with a tagging consumer for the detailed tests.
scanner = NCBIStandalone._Scanner()
for test in all_tests:
    print(" ".join(["*" * 50, "TESTING %s" % test]))
    datafile = os.path.join("Blast", test)
    scanner.feed(open(datafile), ParserSupport.AbstractConsumer())

for test in detailed_tests:
    print(" ".join(["*" * 50, "TESTING %s" % test]))
    datafile = os.path.join("Blast", test)
    scanner.feed(open(datafile), ParserSupport.TaggingConsumer())

### BlastParser

print("Running tests on BlastParser")

parser = NCBIStandalone.BlastParser()
pb_parser = NCBIStandalone.PSIBlastParser()
for test in all_tests:
    print(" ".join(["*" * 50, "TESTING %s" % test]))
    datafile = os.path.join("Blast", test)
    try:
        # Start with the regular parser.
        rec = parser.parse(open(datafile))
    except ValueError as x:
        # Only a complaint about PSI-BLAST input is recoverable: retry
        # with the PSI-BLAST parser, otherwise let the error propagate.
        if 'PSI-BLAST data' not in str(x):
            raise
        rec = pb_parser.parse(open(datafile))
Example #9
0
 def __init__(self, handle):
     """Keep the handle and build a BLAST record iterator over it."""
     self.handle = handle
     # The iterator hands each report to a standard BlastParser.
     self.blast_iter = NCBIStandalone.Iterator(
         handle, NCBIStandalone.BlastParser())
Example #10
0
def main():
    """Chain sequences by repeatedly BLASTing for the closest remaining one.

    Command-line options: -i/--input (FASTA of candidate sequences),
    -g/--germline (FASTA of the starting query) and -o/--output (report
    file). The inputs are copied to the working files "workable.fasta"
    and "germ.fasta".

    Each round:
      1. builds a BLAST database from "workable.fasta"
         (via populate_database),
      2. runs blastp with "germ.fasta" as query, writing "blastout",
      3. takes the first HSP of the first alignment, appends its
         query/match/subject lines to the report,
      4. rewrites "germ.fasta" with that subject as the next query, and
      5. removes the matched sequence from "workable.fasta".
    The loop ends when no candidate sequences remain.
    """
    parser = OptionParser()
    parser.add_option("-i",
                      "--input",
                      action="store",
                      dest="input",
                      help="input file to make phylotree")
    parser.add_option("-g",
                      "--germline",
                      action="store",
                      dest="germline",
                      help="germline fasta")
    parser.add_option("-o",
                      "--output",
                      action="store",
                      dest="output",
                      help="the file where you want all your data")
    options, _ = parser.parse_args()
    if len(sys.argv) < 2:
        dowhat()
        parser.print_help()
        sys.exit()

    with open(options.output, 'w') as report:
        report.write("Your Sequence Results:\n\n")
    copy(options.input, "workable.fasta")
    copy(options.germline, "germ.fasta")

    list_of_database_files = SeqIO.to_dict(
        SeqIO.parse("workable.fasta", "fasta"))

    while list_of_database_files:
        list_of_database_files = SeqIO.to_dict(
            SeqIO.parse("workable.fasta", "fasta"))
        populate_database("workable.fasta")
        print("***DatabasePopulated***")

        cline = NcbiblastpCommandline(matrix="PAM30",
                                      evalue="20",
                                      word_size="2",
                                      query="germ.fasta",
                                      cmd='blastp',
                                      db="temporary_database",
                                      out="blastout")

        print("****Cline = *** ---> " + str(cline))

        call_blast(cline)
        print("***Call_blast_successful***")

        with open('blastout') as result_handle:
            print("***result handle successful***")

            blast_parser = NCBIStandalone.BlastParser()
            print("***blast_parser****")

            blast_record = blast_parser.parse(result_handle)
            print("***blast_record***")

        best_alignment = blast_record.alignments[0]
        best_hsp = best_alignment.hsps[0]
        # NOTE(review): title[2:] is assumed to equal the FASTA record id
        # used as key in list_of_database_files -- confirm.
        current_object = best_alignment.title[2:]

        # Close germ.fasta before the next round so BLAST reads the
        # complete new query.
        with open("germ.fasta", 'w') as newsequence_search:
            newsequence_search.write(">" + str(current_object) + "\n" +
                                     str(best_hsp.sbjct))
        print(current_object)

        with open(options.output, 'a') as newfile:
            newfile.write(str(best_hsp.query[:]) + "----> Query\n")
            newfile.write(
                str(best_hsp.match[:]) + "----> Score of: " +
                str(best_hsp.score) + "\n")
            newfile.write(str(best_hsp.sbjct[:]) + "----> Template\n\n")

        list_of_database_files.pop(current_object)
        SeqIO.write(list_of_database_files.values(), "workable.fasta", "fasta")
Example #11
0
 def __init__(self, handle):
     """Keep the handle and build a BLAST record iterator over it."""
     self.handle = handle
     # The iterator hands each report to a standard BlastParser.
     self.blast_iter = NCBIStandalone.Iterator(
         handle, NCBIStandalone.BlastParser())
Example #12
0
def blastall_seq2seq(fastadata=(),
                     filenames=(),
                     output="ncbiparsed",
                     blastprogram="blastp",
                     remove_files=True,
                     extra_blastp_params={
                         'F': 'F',
                         'e': '10'
                     }):
    """BLAST one sequence against another with formatdb + blastall.

    choose proper input (exactly one of the two, as a 2-tuple):
    fastadata   ( ( headerQUERY, seqQUERY ) , ( headerSBJCT, seqSBJCT ) )
     or
    filenames   ( filenameQUERY, filenameSBJCT )

    output              -- "ncbiparsed" returns the record produced by
                           NCBIStandalone.BlastParser; anything else
                           returns blastall's raw stdout text.
    blastprogram        -- one of 'blastp', 'tblastn', 'tblastx', 'blastx'.
    remove_files        -- when true, the query file, the subject file
                           and the subject's formatdb index files are
                           deleted after a successful run (this applies
                           to caller-supplied `filenames` too).
    extra_blastp_params -- dict of extra one-letter blastall options
                           (only read, never modified).

    Raises ValueError for an unsupported `blastprogram` or when the
    input arguments are missing or malformed.
    """
    supported = ('blastp', 'tblastn', 'tblastx', 'blastx')
    if blastprogram not in supported:
        raise ValueError("unsupported blastprogram %r; expected one of: %s" %
                         (blastprogram, ", ".join(supported)))
    # Value for formatdb's -p switch: "F" for the programs that search a
    # nucleotide subject, "T" otherwise.
    dna_or_prot = "F" if blastprogram in ('tblastn', 'tblastx') else "T"

    if (fastadata and isinstance(fastadata, tuple) and len(fastadata) == 2
            and not filenames):
        # input is fasta headers and sequence: write input files
        uniquetag = get_random_string_tag()
        fname_q = "_".join([uniquetag, str(fastadata[0][0]), 'Q.fa'])
        fname_s = "_".join([uniquetag, str(fastadata[1][0]), 'S.fa'])
        with open(fname_q, 'w') as fh:
            fh.write(">%s\n%s" % (fastadata[0][0], fastadata[0][1]))
        with open(fname_s, 'w') as fh:
            fh.write(">%s\n%s" % (fastadata[1][0], fastadata[1][1]))
    elif (filenames and isinstance(filenames, tuple) and len(filenames) == 2
          and not fastadata):
        # input is (supposed to be) filenames
        fname_q, fname_s = filenames
    elif not filenames and not fastadata:
        raise ValueError("no input!")
    else:
        raise ValueError("inproper input!")

    # formatdb
    OSsystem("%s -i %s -p %s" % (FORMATDB_PATH, fname_s, dna_or_prot))
    # and blastall!
    extra_params = " ".join(
        ["-%s %s" % (k, v) for k, v in extra_blastp_params.items()])
    ci, co, ce = osPopen3(
        "%s -p %s %s -i %s -d %s " %
        (BLASTALL_PATH, blastprogram, extra_params, fname_q, fname_s))
    try:
        ci.close()
        if output == "ncbiparsed":
            blastallout = NCBIStandalone.BlastParser().parse(co)
        else:
            blastallout = co.read()
    finally:
        # Release the pipes even when reading or parsing fails.
        co.close()
        ce.close()
    if remove_files:
        # NOTE(review): file names are interpolated into a shell command;
        # confirm headers/paths never contain shell metacharacters.
        OSsystem("rm %s.*" % fname_s)
        osRemove("%s" % fname_s)
        osRemove("%s" % fname_q)
    # and return!
    return blastallout
Example #13
0
# Command-line interface: two BLAST text reports (A-vs-B and B-vs-A) plus
# optional files for dumping/loading pickled intermediate results.
argparser = argparse.ArgumentParser(description='Find reciprocal best hits in two BLAST outputs')
argparser.add_argument('blast1', type=argparse.FileType('r'), help='BLAST results')
argparser.add_argument('blast2', type=argparse.FileType('r'), help='inverted BLAST results')
argparser.add_argument('-d', '--dump', type=argparse.FileType('w'),
                       dest='dump_file',
                       help='pickle intermediate results in tempfile')
argparser.add_argument('-l', '--load', type=argparse.FileType('r'),
                       dest='load_file',
                       help='depickle intermediate results from tempfile')
argparser.add_argument('-o', '--outfile', type=argparse.FileType('w'), default=sys.stdout)

args = argparser.parse_args()
# Loading and dumping intermediate results at the same time is rejected;
# report it as a usage error so the check also runs under `python -O`.
if args.load_file and args.dump_file:
    argparser.error('--load and --dump cannot be used together')

parser1 = NCBIStandalone.BlastParser()
parser2 = NCBIStandalone.BlastParser()

# PXL: PMZ(Q) x Lamp3(S), LXP: Lamp3(Q) x PMZ(S)
pxl_records = NCBIStandalone.Iterator(args.blast1, parser1)
lxp_records = NCBIStandalone.Iterator(args.blast2, parser2)

pxl_re = re.compile(r'(PMZ_[^\s]+)')


def pxl_key_fn(k):
    """Return the first 'PMZ_...' token found in k (IndexError if none)."""
    return pxl_re.findall(k)[0]


lxp_re = re.compile(r'(lamp3[^\s]+ [^\s]+ len\d+)')  # matching 'not whitespace' is faster and more robust


def lxp_key_fn(k):
    """Return the first 'lamp3... ... lenN' key found in k (IndexError if none)."""
    return lxp_re.findall(k)[0]


pxl_lookup, lxp_lookup = None, None

if args.load_file:
    # NOTE(review): unpickling can execute arbitrary code -- only load
    # files produced by this script's own --dump option.
    pxl_lookup = cPickle.load(args.load_file)
Example #14
0
    def ReadBlast(self, file, OUT, iszipped=0, is_psiblast=None):
        """Filter the hits of a BLAST text report into a tab-separated file.

        file        -- path of the BLAST (or PSI-BLAST) text report; read
                       through gzip when it ends in '.gz' or `iszipped`
                       is true.
        OUT         -- path of the output table (overwritten).
        iszipped    -- force gzip reading regardless of the extension.
        is_psiblast -- parse as PSI-BLAST and use the last round of each
                       record.

        Only the first HSP of every alignment is considered. A hit is
        kept when its e-value is <= self.HSP_max_evalue, its percentage
        of positives is >= self.HSP_minimal_positives and the aligned
        fraction of the query is >= self.HSP_minimal_coverage. Each kept
        hit is written as
            query, hit id, %positives, %coverage, score, e-value
        (tab-terminated fields) and, when self.cB is set, also passed to
        self.createblastDict.

        Sets self.selfhits, self.parser, self.iter, self.blastDict,
        self.query and self.length.

        Returns None when parsing ends normally, or the string
        'Error parsing <file>' when reading a record fails. Both files
        are closed in either case.
        """
        output = open(OUT, "w")
        try:
            self.selfhits = []
            if is_psiblast:
                sys.stderr.write('Parsing PSI-Blast\n')
                self.parser = NCBIStandalone.PSIBlastParser()
            else:
                self.parser = NCBIStandalone.BlastParser()
            if file[-3:] == '.gz' or iszipped:
                handle = gzip.open(file)
            else:
                handle = open(file)

            try:
                self.iter = NCBIStandalone.Iterator(handle=handle,
                                                    parser=self.parser)
                self.blastDict = {}

                while 1:
                    try:
                        rec = self.iter.next()
                    except Exception:
                        sys.stderr.write(
                            'Can\'t iterate on blast records anymore. Abort.\n')
                        import traceback
                        traceback.print_exc()
                        return 'Error parsing %s' % file
                    if not rec:
                        break

                    self.query = rec.query.split(" ")[0]
                    self.length = rec.query_letters

                    if self.length < self.min_size:
                        self.printer("Does not meet the minimum length " +
                                     str(self.min_size))
                        # NOTE(review): this stops parsing the whole
                        # report, not just this record -- confirm intent.
                        break

                    if is_psiblast:
                        rec = rec.rounds[-1]

                    # each alignment is one potential hit
                    for alignment in rec.alignments:
                        hsp = alignment.hsps[0]  # no multiple hsps

                        m = re.search(r"sp\|(.+?)\|(.+?) (.+)?",
                                      alignment.title)
                        if m:  # "sp|accession|id description" title
                            hit_sp_id = m.group(2)
                        elif alignment.title[0] == '>':
                            # other databases: first token after '>'
                            hit_sp_id = alignment.title[1:].split()[0]
                        else:
                            hit_sp_id = None

                        self.printer(hit_sp_id)
                        similarity = (hsp.positives[0] /
                                      float(hsp.positives[1]) * 100)
                        if float(hsp.expect) > float(self.HSP_max_evalue):
                            continue
                        if float(similarity) < int(self.HSP_minimal_positives):
                            continue
                        coverage = hsp.positives[1] / float(self.length) * 100
                        if float(coverage) < int(self.HSP_minimal_coverage):
                            continue

                        hitlist = [
                            hit_sp_id, similarity, coverage, hsp.score,
                            hsp.expect
                        ]
                        if self.cB:
                            self.createblastDict(self.query, hitlist)
                        output.write("%s\t" % (self.query))
                        output.write("".join(
                            "%s\t" % element for element in hitlist))
                        output.write("\n")
            finally:
                handle.close()
        finally:
            output.close()
        return None