Ejemplo n.º 1
0
def blast_seq():
	"""Run blastn on ``my_module.FILENAME`` and check the resulting report.

	The report is written next to the input as ``<FILENAME>.blastn``.
	Progress and the elapsed wall-clock time are printed to stdout.

	Raises:
		Exception: if ``verify_blast`` returns a truthy value for the report.
	"""
	blast_report = my_module.FILENAME + ".blastn"
	print("INFO: Running BLAST")
	began = timeit.default_timer()
	# Build the command-line wrapper first, then execute it.
	run_blast = blastn(
		cmd=my_module.BLAST_BINARY + "/blastn",
		query=my_module.FILENAME,
		db=my_module.BLAST_DATABASE,
		evalue=1e-20,
		out=blast_report,
		show_gis="true",
		dust='no',
		soft_masking="false",
		num_descriptions=500,
		num_alignments=500,
	)
	run_blast()
	print("INFO: Verifying BLAST")
	# A truthy result from verify_blast signals a failed verification.
	if verify_blast(blast_report):
		raise Exception(""" (%s) BLAST output count is not equal to input sequence count""" % __file__)
	seconds_taken = int(timeit.default_timer() - began)
	print("DONE: BLAST complete (time taken: %d secs)" % seconds_taken)
Ejemplo n.º 2
0
def result():
    """Mask the submitted sequences using a blastn search and return FASTA text.

    Reads the ``seqs`` and ``vector`` fields from ``request.forms``. The
    search runs against the ``custom`` database when ``vector`` is missing
    or equals ``'customdb'``, otherwise against ``UniVec_Core``.

    Returns:
        dict: ``{'finalout': <FASTA text produced from maskseqs()>}``.
    """
    submitted = request.forms.get('seqs')
    wants_custom = request.forms.get('vector', 'customdb') == 'customdb'
    target_db = os.path.join(DB_BASE_PATH,
                             'custom' if wants_custom else 'UniVec_Core')

    # The temporary file is removed when the ``with`` block exits, so every
    # step that needs the file on disk has to happen inside it.
    with NamedTemporaryFile(mode='w') as query_fh:
        query_fh.write(submitted)
        # Push the text to disk so the external program can read it.
        query_fh.flush()
        query_path = query_fh.name

        search = blastn(cmd=BLAST_EXE,
                        query=query_path,
                        db=target_db,
                        evalue=.0005,
                        outfmt=5)
        blast_stdout, _blast_stderr = search()

        # Per the original comments: contamination positions, then the
        # masked sequences derived from them.
        contamination = create_rel(blast_stdout)
        masked = maskseqs(query_path, contamination)

    # Render the masked records as FASTA text in memory.
    with io.StringIO() as fasta_buffer:
        SeqIO.write(masked, fasta_buffer, 'fasta')
        rendered = fasta_buffer.getvalue()
    return {'finalout': rendered}
Ejemplo n.º 3
0
def blast_gene(seq, database):
    """BLAST ``seq`` against ``database`` with blastn and flatten the result.

    The sequence is written to ``temp.fasta`` and the XML report to
    ``temp.xml``, both in the current working directory. These fixed names
    are overwritten on every call.

    Args:
        seq: Record(s) accepted by ``SeqIO.write`` in 'fasta' format.
        database: BLAST database passed to blastn as ``db``.

    Returns:
        list of str: for each description, the first space-separated token
        of its title followed by its e-value; then, for each HSP of each
        alignment, ``frame[1]``, the query string, the match string and
        ``sbjct_start``.
    """
    # Context managers make sure both files are closed even if writing or
    # parsing raises (the original never closed the XML handle).
    with open('temp.fasta', 'w') as tempfasta:
        SeqIO.write(seq, tempfasta, 'fasta')

    run = blastn(query='temp.fasta',
                 db=database,
                 num_descriptions=1,
                 num_threads=6,
                 outfmt=5,
                 word_size=4,
                 evalue=0.01,
                 task="megablast",
                 out='temp.xml')
    run()

    with open('temp.xml') as result_handle:
        result = NCBIXML.read(result_handle)

    rets = []
    for description in result.descriptions:
        title = description.title
        e_value = description.e
        rets.append(title.split(' ')[0])
        rets.append(str(e_value))
    for alignment in result.alignments:
        for hsp in alignment.hsps:
            rets.append(str(hsp.frame[1]))
            rets.append(str(hsp.query))
            rets.append(str(hsp.match))
            rets.append(str(hsp.sbjct_start))
    return rets
Ejemplo n.º 4
0
def result():
    """Run blastn on the posted sequences and return them masked, as FASTA.

    Form fields are read from ``request.forms``: ``seqs`` holds the sequence
    text, and ``vector`` selects the database (``custom`` when it is absent
    or ``'customdb'``, ``UniVec_Core`` otherwise).

    Returns:
        dict: a single key ``'finalout'`` holding the FASTA text.
    """
    posted_seqs = request.forms.get('seqs')
    if request.forms.get('vector', 'customdb') == 'customdb':
        db_name = 'custom'
    else:
        db_name = 'UniVec_Core'
    db_path = os.path.join(DB_BASE_PATH, db_name)

    # Everything that reads the query file stays inside this block: leaving
    # it closes the temporary file, which also deletes it.
    with NamedTemporaryFile(mode='w') as tmp_query:
        tmp_query.write(posted_seqs)
        tmp_query.flush()  # make the content visible to the blastn process
        tmp_name = tmp_query.name
        run_blastn = blastn(cmd=BLAST_EXE, query=tmp_name, db=db_path,
                            evalue=.0005, outfmt=5)
        out_text, _err_text = run_blastn()
        # Original comments: contamination positions kept in a dictionary,
        # then used to mask the sequences.
        positions = create_rel(out_text)
        masked_seqs = maskseqs(tmp_name, positions)

    with io.StringIO() as sink:
        SeqIO.write(masked_seqs, sink, 'fasta')
        fasta_text = sink.getvalue()
    return {'finalout': fasta_text}
Ejemplo n.º 5
0
def psiBlastScoring(f_in='seq3.txt'):
    """
    Run a local blastn search for ``f_in`` and read the first output line.

    Only the blastn call is implemented here; none of the scoring steps
    described below are carried out by this function.

    JB's instructions on psi-blast:

    1. We find all similar interfaces.
    2. Make an MSA (multiple sequence alignment) of structurally aligned
       sequences.
    3. Form a score from the probability of a particular mutation showing
       up in the MSA.
    4. Evaluate the mutation with this score - this is where we need the
       mutation.

    Notes:

    -> Should not require an HPC to run each blast computation

    Some biopython options for blast:
    ---------------------------------------------------------
    blastn  -> nucleotide vs nucleotide
    blastp  -> protein vs protein
    blastx  -> translated nucleotide vs protein
    tblastn -> protein vs translated nucleotide
    tblastx -> translated nucleotide vs translated nucleotide
    ---------------------------------------------------------

    Args:
        f_in: Path of the query file handed to blastn. Defaults to
            'seq3.txt', the value the original code had commented out
            (leaving the name undefined).

    Raises:
        ImportError: if the Biopython blastn wrapper cannot be imported.
    """
    from io import StringIO

    try:
        # The blastn wrapper is NcbiblastnCommandline. The formatter wrapper
        # imported originally does not take query/db/evalue options.
        # The unused Bio.PDB / NCBIXML imports were dropped: nothing below
        # uses them, and any one failing hid the import that is needed.
        from Bio.Blast.Applications import NcbiblastnCommandline as blastn
    except ImportError:
        print("Error - cannot imoort")
        # Nothing below can work without the wrapper, so do not carry on.
        raise

    BLAST_EXE = '/home/oohnohnoh1/Desktop/ACADEMIA/Papermaking/OPTIMUS_BIND/ZIP/ncbi-blast-2.9.0+/bin/blastn'  # The example given is /home/sb/opt/ncbi-blast-2.6.0+/bin/blastn
    b_db = 'db/samples/TAIR8cds'
    blastn_cline = blastn(cmd=BLAST_EXE,
                          query=f_in,
                          db=b_db,
                          evalue=.0005,
                          outfmt=5)
    # Call the configured instance (the original called a non-existent
    # ``cline`` attribute on the class).
    rh, eh = blastn_cline()

    # The wrapper returns stdout/stderr as strings, so wrap stdout in a
    # file-like object before reading it line by line. The first line is
    # read and discarded, as in the original.
    StringIO(rh).readline()
Ejemplo n.º 6
0
def query(fasta, contig, q, db_path, organism):
    """Search ``q`` with blastn and report whether any hit names ``organism``.

    Args:
        fasta: Passed through unchanged as the first element of the result.
        contig: Passed through unchanged as the second element of the result.
        q: Query text, wrapped in ``StringIO`` before being handed to blastn.
        db_path: Database given to blastn as ``db``.
        organism: Name looked for, case-insensitively, in alignment titles.

    Returns:
        tuple: ``(fasta, contig, found)`` where ``found`` is True when at
        least one HSP belongs to an alignment whose title contains
        ``organism``.
    """
    # NOTE(review): if ``blastn`` is Biopython's NcbiblastnCommandline, the
    # ``query`` option expects a file name and calling the wrapper returns
    # strings rather than handles - confirm which ``blastn`` this is.
    run_search = blastn(query=StringIO(q), db=db_path, outfmt=5)
    blast_out, _blast_err = run_search()
    blast_record = NCBIXML.read(blast_out)

    # One entry per HSP, so an alignment without HSPs contributes nothing.
    title_matches = [
        organism.lower() in alignment.title.lower()
        for alignment in blast_record.alignments
        for _hsp in alignment.hsps
    ]
    return (fasta, contig, any(title_matches))
Ejemplo n.º 7
0
def query(fasta, contig, q, db_path, organism):
    """Run blastn for ``q`` and flag whether a hit title mentions ``organism``.

    ``fasta`` and ``contig`` are not inspected; they are returned as given
    so the caller can tell which input the answer belongs to.

    Returns:
        tuple: ``(fasta, contig, matched)``. ``matched`` is True if, for at
        least one HSP, the title of its alignment contains ``organism``
        (compared in lower case).
    """
    # NOTE(review): with Biopython's NcbiblastnCommandline, ``query`` would
    # need to be a file name and the call would return plain strings -
    # verify against the ``blastn`` imported by this module.
    blast_call = blastn(query=StringIO(q), db=db_path, outfmt=5)
    out, _err = blast_call()
    parsed = NCBIXML.read(out)

    per_hsp_flags = []
    for hit in parsed.alignments:
        # The flag is recorded once per HSP of the alignment.
        per_hsp_flags.extend(
            organism.lower() in hit.title.lower() for _hsp in hit.hsps)

    matched = any(per_hsp_flags)
    return (fasta, contig, matched)
Ejemplo n.º 8
0
	def Make_Repeat_Mask_Txt(self, word_size=17, gapopen=5, e_thresh=0.0001, perc_identity=90, gapextend=2,
			min_length=75):
		"""
		Build a repeat mask for the reference from a blastn run.

		Runs blastn with ``<outdir>/<newrefid>.fasta`` as the query against
		the database ``<outdir>/<newrefid>`` (presumably built from the same
		FASTA - confirm) and marks the query positions covered by hits.
		The original note says the parameter defaults follow the NCBI
		recommendations for blastn (not verified here).

		Files written, all prefixed with ``<outdir>/<newrefid>``:
		  _repmask.array     per contig: '>name', then the mask as 0/1
		                     digits, 100 per line
		  _repregions.array  per contig: '>name', then one tab-separated
		                     'start end' line per masked run
		  .stats             a header line and one tab-separated summary row

		Args:
			word_size, gapopen, gapextend, perc_identity: passed to blastn.
			e_thresh: passed to blastn as ``evalue``.
			min_length: hits shorter than this are ignored.

		Raises:
			Exception: if blastn raises AppError or writes anything to stderr.
		"""
		outfastapath = os.path.join(
			self.outdir, '{0}.fasta'.format(self.newrefid))
		prefix = os.path.join(self.outdir, self.newrefid)
		maskpath = prefix + '_repmask.array'
		regionspath = prefix + '_repregions.array'
		statspath = prefix + '.stats'

		blastn_cline = blastn(cmd=COMPASSCFG['tools']['blast']['path'] + "blastn", db=prefix, query=outfastapath,
				dust='no', word_size=word_size, gapopen=gapopen, gapextend=gapextend, evalue=e_thresh,
				perc_identity=perc_identity,
				outfmt='"6 qseqid sseqid pident length qstart qend sstart send"')
		try:
			blast_out, blast_err = blastn_cline()
		except AppError as err:
			raise Exception(
				'Erro: Blast failed during construction of repeat mask : {0}'.format(err))
		# Explicit check instead of ``assert``, which is stripped under -O.
		if blast_err:
			raise Exception(
				'Erro: Blast failed during construction of repeat mask : {0}'.format(blast_err))

		# Group the tabular rows by query id up front. The original read rows
		# lazily and, at every contig boundary, threw away the row that had
		# ended the previous contig's loop - i.e. the first hit of each
		# following contig was silently dropped.
		hits_by_contig = {}
		for line in blast_out.splitlines():
			fields = line.split()
			if fields:
				hits_by_contig.setdefault(fields[0], []).append(fields)

		total_bp = 0
		repetitive_bp = 0
		num_regions = 0
		contig_count = 0  # stays 0 when the FASTA holds no records

		# ``with`` closes both outputs even if an exception occurs mid-way.
		with open(maskpath, 'w') as repmask_fp, open(regionspath, 'w') as repregions_fp:
			prev_header = None
			for contig_count, contig in enumerate(SeqIO.parse(outfastapath, 'fasta'), 1):
				if prev_header != contig.name:
					repregions_fp.write('>{0}\n'.format(contig.name))
					prev_header = contig.name
				total_bp += len(contig)
				# Builtin ``bool``: the ``np.bool`` alias was removed from NumPy.
				repmask = np.zeros(len(contig), dtype=bool)

				for fields in hits_by_contig.get(contig.name, ()):
					contig_name, match_name = fields[:2]
					hit_length, q_start, q_end, s_start, s_end = (
						int(x) for x in fields[3:])
					# The original sorted a tuple (query range) against a
					# list (subject range). CPython 2 orders a list before a
					# tuple, so the test was always "subject range lies
					# inside the query range"; that is spelled out here.
					s_low, s_high = sorted((s_start, s_end))
					subject_inside_query = (q_start <= s_low <= q_end
							and q_start <= s_high <= q_end)
					is_self_hit = contig_name == match_name and subject_inside_query
					if hit_length >= min_length and not is_self_hit:
						# BLAST coordinates are 1-based and inclusive.
						repmask[q_start - 1:q_end] = True

				# write the mask as 1/0 digits, 100 per line
				repmask_fp.write('>{0}\n'.format(contig.name))
				mask_digits = repmask.astype(int)
				for offset in range(0, len(repmask), 100):
					repmask_fp.write('{0}\n'.format(
						''.join(str(bit) for bit in mask_digits[offset:offset + 100])))

				# identify positions of repetitive regions (runs of 1s in the
				# repmask array), 0-based numbering
				region_starts = list(np.where(repmask[1:] > repmask[:-1])[0] + 1)
				region_ends = list(np.where(repmask[1:] < repmask[:-1])[0] + 1)
				# special case: the whole contig is masked
				if repmask.all():
					region_starts = [0]
					region_ends = [len(repmask)]
				# fix ends, in case regions start from the first position in
				# the sequence or end at the last
				if region_starts and ((not region_ends) or (region_starts[-1] > region_ends[-1])):
					region_ends.append(len(repmask))
				if region_ends and ((not region_starts) or (region_starts[0] > region_ends[0])):
					region_starts = [0] + region_starts
				repregions_fp.writelines('{0}\t{1}\n'.format(region_start, region_end)
						for region_start, region_end in zip(region_starts, region_ends))
				repetitive_bp += repmask.sum()
				num_regions += len(region_starts)

		# Guard against an empty FASTA, which used to divide by zero.
		if total_bp:
			pct_repetitive = '{0:.2f}'.format(
				(float(repetitive_bp) / total_bp) * 100)
		else:
			pct_repetitive = '{0:.2f}'.format(0.0)
		LE.debug(
			'Info: Repetitive regions for all of {0}: {1}/{2} bp ({3}%)'.format(self.newrefid, repetitive_bp, total_bp,
					pct_repetitive))

		# save result summary
		statsvalues = '\t'.join((self.newrefid, self.newrefid, str(contig_count), str(total_bp), str(repetitive_bp),
				str(num_regions), pct_repetitive))
		with open(statspath, 'w') as o:
			o.write('refid\trefcd\tcontigs\tnumbp\trepetitivebp\trepregions\trepetitivepct\n{values}\n'.format(
				values=statsvalues))
		return
                newRec = SeqRecord(seq=record.seq,
                                   id=name + "_" + str(idx),
                                   description="")
                fasta.append(newRec)

    with open(outDir + "negative_control_reads.fasta", "w") as out_handle:
        SeqIO.write(fasta, out_handle, "fasta")

os.chdir(outDir)
if os.path.isfile(outDir + "neg_control_hits.csv") is False:
    from Bio.Blast import NCBIWWW
    from Bio.Blast.Applications import NcbiblastnCommandline as blastn
    from Bio.Blast import NCBIXML
    blastn_cm = blastn(query='negative_control_reads.fasta',
                       db='16S_ribosomal_RNA',
                       evalue=0.001,
                       outfmt=5,
                       out='nc_reads_blast.xml')
    stout, sterr = blastn_cm()
    result_handle = open('nc_reads_blast.xml')
    blast_records = NCBIXML.parse(result_handle)
    hits = pd.DataFrame()
    for record in blast_records:
        if len(record.alignments) > 0:
            best_alignment = record.alignments[0]
            print(record.query)
        else:
            print(record.query + " has NO ALIGNMENT")
        for hsp in best_alignment.hsps:
            #if hsp.expect < E_VALUE_THRESH:
            print("****Alignment****")