Esempio n. 1
0
def blast(cmd, query, db, **kwargs):
    """Run a local BLAST+ program and return the path of its tabular report.

    The report is requested in output format 6, limited to the columns named
    by the keys of ``TABULAR_BLAST_FIELDS``, and is written to ``result.tsv``
    in the directory that contains ``query``.

    :param cmd: program to run: 'blastn', 'blastp', 'blastx' or 'tblastn'
    :param query: path object of the query file (its ``.parent`` is used)
    :param db: BLAST database, passed to the wrapper as ``str(db)``
    :param kwargs: extra options forwarded unchanged to the wrapper
    :returns: ``Path`` of the result file
    :raises SystemExit: via ``sys.exit`` when ``cmd`` is not supported
    """
    # The embedded single quotes are part of the option value; presumably
    # they keep the multi-word format spec together as a single argument.
    columns = ' '.join(TABULAR_BLAST_FIELDS.keys())
    outfmt = "'6 {}'".format(columns)

    suffix = '.tsv'
    out_file = Path(query.parent, 'result{}'.format(suffix))

    wrappers = {
        'blastn': NcbiblastnCommandline,
        'blastp': NcbiblastpCommandline,
        'blastx': NcbiblastxCommandline,
        'tblastn': NcbitblastnCommandline,
    }
    wrapper = wrappers.get(cmd)
    if wrapper is None:
        sys.exit(f'Unknown command: {cmd}')

    # Every supported program takes the same set of options.
    blast_on_db = wrapper(query=str(query), db=str(db),
                          out=str(out_file), outfmt=outfmt, **kwargs)
    print(blast_on_db)
    blast_on_db()

    return out_file
Esempio n. 2
0
 def run_local_blast(self, db_type, local_db, local_output, record,
                     record_seq):
     """
     Search one record against a local BLAST database, unless the output
     file is already present.
     :param db_type: string of the database type (only used in the log message)
     :param local_db: name and path of the BLAST database file; the extension is stripped
     :param local_output: BLAST output file; when it exists, BLAST is not run
     :param record: Name of the record; key into self.expect and self.word_size
     :param record_seq: Sequence of the record, supplied to BLAST on stdin
     """
     logging.info('Running {record} local {db_type} BLAST'.format(
         record=record, db_type=db_type))
     # Pick the program and the options that differ between protein and nucleotide queries.
     if self.amino_acid:
         wrapper = NcbitblastnCommandline
         specific = dict(task='tblastn', word_size=3)
     else:
         wrapper = NcbiblastnCommandline
         specific = dict(task='blastn',
                         perc_identity=75,
                         word_size=self.word_size[record])
     # Options shared by both programs.
     database = os.path.splitext(local_db)[0]
     blast = wrapper(db=database,
                     num_alignments=100000000,
                     evalue=self.expect[record],
                     num_threads=self.cpus,
                     outfmt=self.outfmt,
                     out=local_output,
                     **specific)
     if os.path.isfile(local_output):
         return
     # Run BLAST - supply the record sequence as stdin, so BLAST doesn't look for an input file
     blast(stdin=record_seq)
def blasting(db_name=None,
             evalue=1e-5,
             query_name=None,
             gene_name=None,
             make_database=True):
    """Run blastp for a query against a database and return the first hits.

    The search is run with tabular output (``outfmt=6``), the captured
    standard output is parsed into a DataFrame with twelve columns, each hit
    is annotated with the sequence looked up in ``parse_fasta(db_name)`` and
    with the gene name, and the first five rows are returned.

    :param db_name: BLAST database name; also passed to ``parse_fasta``, so
        it is presumably the FASTA file the database was built from. The
        database itself is not created here.
    :param evalue: E-value threshold passed to BLAST
    :param query_name: path of the query file
    :param gene_name: path or name of the gene; only its basename is stored
        in the ``Gene`` column. When ``None`` the column is filled with
        ``None`` instead of raising ``TypeError``.
    :param make_database: accepted for backward compatibility; not used
    :returns: DataFrame holding at most the first five hits
    :raises KeyError: when a subject id is missing from the parsed FASTA
    """
    # Column names are kept exactly as before (including 'qends' and
    # 'start') because callers may index the returned frame by them.
    columns = [
        'qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen',
        'qstart', 'qends', 'start', 'send', 'evalue', 'bitscore'
    ]
    numeric_columns = columns[2:]

    # NOTE(review): the tblastn wrapper class is only used as a launcher
    # here; ``cmd`` overrides the executable, so blastp is what runs.
    blast_cline = NcbitblastnCommandline(cmd="blastp",
                                         query=str(query_name),
                                         db=str(db_name),
                                         evalue=evalue,
                                         outfmt=6)
    print(blast_cline)
    out, _err = blast_cline()

    # One hit per non-empty line, tab-separated fields. Parsing row by row
    # does not depend on the output ending with a newline.
    rows = [line.split('\t') for line in out.split('\n') if line]
    blast_df = pd.DataFrame(rows, columns=columns)

    print("here are the blast results")
    print(blast_df)

    # Everything except the two identifier columns is numeric.
    blast_df[numeric_columns] = blast_df[numeric_columns].apply(pd.to_numeric)

    # Attach the subject sequence of every hit.
    parsed_assembly = parse_fasta(db_name)
    blast_df['sequences'] = [
        parsed_assembly[subject_id] for subject_id in blast_df['sseqid']
    ]

    blast_df['Gene'] = (os.path.basename(gene_name)
                        if gene_name is not None else None)
    return blast_df[:5]
Esempio n. 4
0
 def _tblastn_wrapper(self):
     """Run tblastn of ``self.query2`` against ``self.genome``.

     The tabular report (query id, subject start/end, subject strand and
     E-value) is written to ``tblastn.res`` inside ``self.tmp``.
     """
     options = {
         'query': self.query2,
         'db': self.genome,
         'evalue': 0.001,
         'out': os.path.join(self.tmp, 'tblastn.res'),
         'outfmt': "6 qseqid sstart send sstrand evalue",
     }
     run_tblastn = NcbitblastnCommandline(**options)
     run_tblastn()
Esempio n. 5
0
def tblastn(query, subject_filename):
    """Run tblastn for ``query`` against a subject file and return the report.

    The query is written as FASTA to a temporary file that is removed when
    the search is over. The report is requested with ``outfmt=5`` and is
    returned as a ``StringIO`` wrapping the captured standard output.

    Raises ``Exception`` carrying the captured standard error whenever BLAST
    wrote anything to it.
    """
    with tempfile.NamedTemporaryFile(delete=True) as query_file:
        fasta_path = query_file.name
        SeqIO.write(query, fasta_path, 'fasta')
        search = NcbitblastnCommandline(
            query=fasta_path,
            subject=subject_filename,
            outfmt=5,
            parse_deflines=True,
            seg='no',
            db_gencode=11,
        )
        report, errors = search()
    if not errors:
        return StringIO(report)
    raise Exception(errors)
def tblastn_and_extract(query_fasta_prot, db_nucl, seq_header="query_seq"):
    """Return the longest top-hit subject sequence over all queries.

    tblastn is run for every protein in ``query_fasta_prot`` against
    ``db_nucl`` (XML report written to ``temp.xml`` in the working
    directory). Only the first HSP of the first alignment of each query is
    considered. Starting from the first query, a later query replaces the
    current best when its hit

    1. contains no stop codon (``*``),
    2. passes ``check_blast_colocalization`` against the current best, and
    3. is longer (gaps removed) than the current best.

    :param query_fasta_prot: FASTA file with the protein queries
    :param db_nucl: nucleotide BLAST database
    :param seq_header: id and name given to the returned record
    :returns: ``SeqRecord`` holding the ungapped subject sequence of the
        selected hit
    :raises IndexError: when the report holds no query, or a query without
        any alignment is examined
    """
    from Bio.Blast.Applications import NcbitblastnCommandline
    from Bio.Blast import NCBIXML
    from Bio.Seq import Seq
    from Bio.SeqRecord import SeqRecord

    def ungapped_top_subject(record):
        """Subject string of the record's first HSP with gaps removed."""
        return record.alignments[0].hsps[0].sbjct.replace("-", "")

    tblastn_cline = NcbitblastnCommandline(query=query_fasta_prot, db=db_nucl,
                                           evalue=0.001, outfmt=5,
                                           out="temp.xml")
    tblastn_cline()

    # Read every record while the file is open, then release the handle.
    with open("temp.xml") as result_handle:
        blast_records = list(NCBIXML.parse(result_handle))

    best_hit = blast_records[0]
    best_seq = ungapped_top_subject(best_hit)
    for blast_record in blast_records[1:]:
        candidate = ungapped_top_subject(blast_record)

        if "*" in candidate:
            print("Achtung, stopcodon in the align")
            continue
        if not check_blast_colocalization(best_hit, blast_record):
            print("problem with colocalization")
            continue
        if len(candidate) > len(best_seq):
            best_hit = blast_record
            best_seq = candidate

    descript = "(" + best_hit.query + " vs " + best_hit.alignments[0].hit_def + ")"
    print(descript)

    # Bio.Alphabet was removed in Biopython 1.78; keep the typed alphabet on
    # older releases and fall back to a plain Seq on newer ones.
    try:
        from Bio.Alphabet import IUPAC
    except ImportError:
        sequence = Seq(best_seq)
    else:
        sequence = Seq(best_seq, IUPAC.protein)

    biorecord = SeqRecord(seq=sequence,
                          id=seq_header,
                          name=seq_header,
                          description=descript,
                          dbxrefs=[])
    return biorecord
def perform_local_blast_search(query_filename: str, db_filename: str,
                               out_filename: str) -> None:
    """
    Perform local BLAST (tblastn) search and parse output to .csv file.

    The search keeps a single target per query and writes its XML report to
    the temporary file ``data/tmp.xml``, which is removed afterwards even
    when the search or the parsing fails. For every query that has a hit,
    one line ``<query>,<hit id>,<e-value>`` is written to ``out_filename``,
    using the first HSP of the first alignment. Queries without any hit are
    skipped.

    :param query_filename: FASTA file with the queries
    :param db_filename: BLAST database to search
    :param out_filename: path of the .csv file to write
    """
    tmp_filename = 'data/tmp.xml'

    try:
        # perform search
        NcbitblastnCommandline(query=query_filename,
                               db=db_filename,
                               outfmt=5,
                               out=tmp_filename,
                               max_target_seqs=1)()

        # parse results; both handles are closed before the temp file is removed
        with open(tmp_filename) as xml_file, \
                open(out_filename, 'w', newline='') as csv_file:
            for record in NCBIXML.parse(xml_file):
                if not record.alignments:
                    # a query without hits has nothing to report
                    continue
                alignment = record.alignments[0]
                hsp = alignment.hsps[0]
                # NOTE(review): the slice drops the last 17 characters of the
                # query description - presumably a fixed-length suffix of
                # the FASTA headers; confirm against the input files.
                csv_file.write(
                    f'{record.query[:-17]},{alignment.hit_id},{hsp.expect}\n')
    finally:
        # the file is missing when BLAST failed before writing it
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
Esempio n. 8
0
def run_BLAST(query, database, args, cons_run):
    """
    Given a mfa of query sequences of interest & a database, search for them.

    Important to note:
        * Turns the low-complexity filter off (dust for blastn, seg for
          tblastn/tblastx),
        * Only a single target sequence (top hit),
        * Output in XML format, written below ``BLAST_results/`` (the
          directory is not created here),
        * In the output file name the ``DB=`` label carries the query
          basename and the ``ID=`` label the database basename.

    # TODO: Add  evalue filtering ?
    # TODO: add task='blastn' to use blastn scoring ?

    .. warning:: default is megablast

    .. warning:: tblastx functionality has not been checked

    :param query: the fullpath to the vf.mfa
    :param database: the full path of the database to search for the vf in
    :param args: the arguments parsed to argparse; reads ``reftype``,
                 ``tblastx``, ``short``, ``evalue`` and ``BLAST_THREADS``
    :param cons_run: part of a mapping consensus run (adds the ``cons_``
                     prefix to the output file name)

    :type query: string
    :type database: string
    :type args: argparse args (dictionary)
    :type cons_run: boolean

    :returns: the path of the blast.xml file, joined onto the current
              working directory
    """
    query_stem = os.path.splitext(os.path.basename(query))[0]
    db_stem = os.path.splitext(os.path.basename(database))[0]
    prefix = "cons_" if cons_run else ""
    outfile = os.path.join("BLAST_results/",
                           prefix+"DB="+query_stem+"ID="+db_stem+"_blast.xml")

    # File type not specified, determine using util.is_protein()
    if args.reftype is None:
        protein = SeqFindr.util.is_protein(query) != -1
    else:
        protein = args.reftype == 'prot'
    if protein:
        # Newline-terminated in both cases so the next message starts on
        # its own line.
        sys.stderr.write('%s is protein\n' % (query))

    # Options shared by every program variant.
    common = dict(query=query, db=database, outfmt=5,
                  num_threads=args.BLAST_THREADS, max_target_seqs=1,
                  out=outfile)
    if protein:
        sys.stderr.write('Using tblastn\n')
        run_command = NcbitblastnCommandline(seg='no', evalue=args.evalue,
                                             **common)
    elif args.tblastx:
        sys.stderr.write('Using tblastx\n')
        run_command = NcbitblastxCommandline(seg='no', evalue=args.evalue,
                                             **common)
    else:
        sys.stderr.write('Using blastn\n')
        # The equality test is kept on purpose: any value that does not
        # compare equal to False (including None) selects the short-query
        # settings, exactly as before.
        if args.short == False:  # noqa: E712
            run_command = NcbiblastnCommandline(dust='no',
                                                evalue=args.evalue, **common)
        else:
            sys.stderr.write('Optimising for short query sequences\n')
            run_command = NcbiblastnCommandline(dust='no', word_size=7,
                                                evalue=1000, **common)

    sys.stderr.write(str(run_command)+"\n")
    run_command()
    return os.path.join(os.getcwd(), outfile)