def blastBACTEUK(arg):
    """BLAST each FASTA record in *arg* against two taxonomic subsets of ``nr``.

    Every record is submitted twice to NCBI ``blastx`` via ``NCBIWWW.qblast``:
    once restricted to Bacteria/Archaea and once restricted to Eukaryota.
    The e-value of the first HSP of the first alignment (or ``no hit``) is
    appended to ``bacterial.txt`` and ``eukaryotic.txt`` respectively, one
    ``<record id>,<value>`` line per record.

    Any failure while processing a record is appended to ``errorlog.txt`` and
    processing continues with the next record.

    Args:
        arg: Path of the FASTA file to read.
    """
    def _result_line(name, blast_record):
        # One CSV-like line describing the best hit, or a "no hit" marker.
        if blast_record.descriptions:
            return name + ',' + str(blast_record.alignments[0].hsps[0].expect) + '\n'
        return name + ', no hit' + '\n'

    with open('bacterial.txt', 'a') as out, open('eukaryotic.txt', 'a') as out2:
        with open(arg) as fasta_handle:
            for record in SeqIO.parse(fasta_handle, format="fasta"):
                try:
                    name = record.id
                    query = record.format("fasta")
                    result_handleB = NCBIWWW.qblast(
                        "blastx", "nr", query, ncbi_gi=False,
                        descriptions="1", alignments="1", format_type="XML",
                        hitlist_size="1",
                        entrez_query='(Bacteria[ORGN] OR Archaea[ORGN])')
                    result_handleE = NCBIWWW.qblast(
                        "blastx", "nr", query, ncbi_gi=False,
                        descriptions="1", alignments="1", format_type="XML",
                        hitlist_size="1",
                        entrez_query='(Eukaryota[ORGN])')

                    blast_recordsB = NCBIXML.read(result_handleB)
                    blast_recordsE = NCBIXML.read(result_handleE)

                    if blast_recordsB.descriptions:
                        # Progress output: only records with a bacterial hit.
                        print(record.id)
                    out.write(_result_line(name, blast_recordsB))
                    out2.write(_result_line(name, blast_recordsE))
                except Exception:
                    # Best effort: log the failing record and keep going.
                    # ``Exception`` (not a bare except) so Ctrl-C still works.
                    with open('errorlog.txt', 'a') as errorout:
                        errorout.write('problem blasting ' + record.id + '\n')
예제 #2
0
def fetch_indentity_from_local(seq):
    """Return ids of local-database hits whose HSPs contain only identical positives.

    The sequence is written to ``tmp.fastaa`` and searched with ``blastp``
    against the ``HUMAN_DB`` and ``RODENTS_DB`` databases under
    ``_data_/_db_/``. For every HSP where ``positives == identities`` the
    protein id extracted from the alignment title is collected (human hits
    first, then rodent hits; one entry per qualifying HSP).

    Args:
        seq: Protein sequence as a plain string.

    Returns:
        The collected ids joined with ``";"`` (empty string when none).
    """
    def extract_prot_id(string):
        # Third '|'-separated field of the title, then its second
        # space-separated token.
        s = string.split('|')[2]
        s = s.split(' ')[1]
        return s

    def collect_hits(xml_path):
        # Parse one BLAST XML report; the handle is closed once it is read.
        with open(xml_path) as result_handle:
            b_record = NCBIXML.read(result_handle)
        return [extract_prot_id(alignment.title)
                for alignment in b_record.alignments
                for hsp in alignment.hsps
                if hsp.positives == hsp.identities]

    record = SeqRecord(Seq(seq), id="tmp", name="", description="")
    SeqIO.write(record, "tmp.fastaa", "fasta")

    searches = (
        ('_data_/_db_/HUMAN_DB', 'blastp_human_output.xml'),
        ('_data_/_db_/RODENTS_DB', 'blastp_rodents_output.xml'),
    )
    # Run both searches first, then parse, as in the original flow.
    for db_path, xml_path in searches:
        NcbiblastpCommandline(query='tmp.fastaa', db=db_path, outfmt=5, out=xml_path)()

    result = []
    for _, xml_path in searches:
        result.extend(collect_hits(xml_path))

    return ";".join(result)
예제 #3
0
def blast_bulk(fasta_file, settings):
    """BLAST every sequence of a FASTA file online, one request per sequence.

    Args:
        fasta_file: FASTA file (anything ``SeqIO.parse`` accepts).
        settings: Indexable with four entries used as
            ``[program, database, hitlist_size, megablast]``.

    Returns:
        A list with one parsed BLAST record per input sequence, in file order.
    """
    # The blast modules are imported from biopython
    from Bio.Blast import NCBIWWW, NCBIXML
    from Bio import SeqIO

    program, database, hitlist_size, megablast = (
        settings[0], settings[1], settings[2], settings[3])

    # Parse the whole file up front so a malformed file fails before any
    # remote query is sent.
    seq_list = list(SeqIO.parse(fasta_file, 'fasta'))

    blast_list = []
    for seq in seq_list:
        # Function-call form works on both Python 2 and Python 3.
        print(seq)
        result_handle = NCBIWWW.qblast(program, database, seq.format('fasta'),
                                       megablast=megablast,
                                       hitlist_size=hitlist_size)
        blast_list.append(NCBIXML.read(result_handle))

    return blast_list
예제 #4
0
def blastTranscript(transcript, blastDB, seqfile):
    """BLAST one protein from *seqfile* against *blastDB* and list its hits.

    Records of *seqfile* whose id occurs in *transcript* are written to a
    temporary FASTA file, searched with ``blastp`` (e-value 1e-10, XML
    output), and the temporary files are removed afterwards.

    Args:
        transcript: Transcript/protein identifier; surrounding whitespace is
            stripped.
        blastDB: BLAST database name; a ``.phr`` suffix is removed.
        seqfile: FASTA file containing the protein sequences.

    Returns:
        One entry per HSP with ``expect < 1e-10``: the second
        space-separated token of the alignment title, cut at the first
        underscore.
    """
    transcript = transcript.strip()
    blastDB = blastDB.replace('.phr', '')
    query_path = "temp" + transcript + ".fa"
    xml_path = "blast" + transcript + ".xml"

    # Make fasta file of the individual protein.
    # NOTE(review): ``seq.id in transcript`` is a substring test, so an id
    # that is a prefix/substring of the transcript name also matches --
    # confirm whether equality was intended.
    with open(seqfile) as seq_handle:
        SeqIO.write((seq for seq in SeqIO.parse(seq_handle, 'fasta')
                     if seq.id in transcript),
                    query_path, "fasta")

    blastp_cline = NcbiblastpCommandline(query=query_path,
                                         db=blastDB,
                                         evalue=1e-10,
                                         outfmt=5,
                                         out=xml_path)
    blastp_cline()

    with open(xml_path) as result_handle:
        blast_record = NCBIXML.read(result_handle)
    E_VALUE_THRESH = 1e-10

    # Tabulate all alignments (one title per qualifying HSP). The previous
    # version re-opened an undefined ``outfile`` here without writing to it.
    titles = [str(alignment.title)
              for alignment in blast_record.alignments
              for hsp in alignment.hsps
              if hsp.expect < E_VALUE_THRESH]

    call(["rm", query_path, xml_path])
    return [title.split(' ')[1].split('_')[0] for title in titles]
def blastinfo(filename):
    '''
    Summarise a BLAST XML report.

    Prints the query id, database, e-value threshold and match/mismatch
    scores stored in the report, then a dict holding the longest alignment
    ("Max length") and the highest HSP score among HSPs with
    expect < 0.05 ("Max score"), each paired with the accession taken from
    the fourth "|"-separated field of the alignment title.

    Returns the accessions of all alignments, in report order.
    '''
    with open(filename) as xml_handle:
        record = NCBIXML.read(xml_handle)

    print("####### BLAST Parameters #######")
    for label, attribute in (("Query ID:", "query_id"),
                             ("Database:", "database"),
                             ("E-value threshold:", "expect"),
                             ("Match score:", "sc_match"),
                             ("Mismatch score:", "sc_mismatch")):
        print(label, getattr(record, attribute))

    best_score = -999
    best_length = -999
    summary = {}
    accession_list = []
    for hit in record.alignments:
        accession = hit.title.split("|")[3]
        accession_list.append(accession)
        if hit.length > best_length:
            best_length = hit.length
            summary["Max length"] = (best_length, accession)
        for hsp in hit.hsps:
            # Only significant HSPs compete for the best score.
            if hsp.expect < 0.05 and hsp.score > best_score:
                best_score = hsp.score
                summary["Max score"] = (best_score, accession)
    print(summary)
    return accession_list
예제 #6
0
def blastSearch(sequence_record, less_than_threshold):
    """Run an online blastn search against ``nt`` for one sequence record.

    Prints how many (alignment, HSP) pairs have ``hsp.expect`` below
    *less_than_threshold*; an alignment is counted once per qualifying HSP.

    Args:
        sequence_record: Object whose ``.seq`` attribute is the query.
        less_than_threshold: Upper bound (exclusive) on ``hsp.expect``.

    Returns:
        The complete parsed BLAST record.
        NOTE(review): the filtered list is only used for the printed count;
        the unfiltered record is what gets returned -- confirm with callers.
    """
    # Import required packages
    from Bio.Seq import Seq
    from Bio.Blast import NCBIWWW, NCBIXML

    # Only the bare sequence is submitted (annotations are dropped).
    query = sequence_record.seq
    handle = NCBIWWW.qblast("blastn", "nt", query)
    parsed = NCBIXML.read(handle)

    # One entry per HSP that beats the threshold.
    passing = [hit
               for hit in parsed.alignments
               for hsp in hit.hsps
               if hsp.expect < less_than_threshold]

    summary = ('Number of alignments with hsp.expect < '
               + str(less_than_threshold) + ' = ' + str(len(passing)))
    print(summary)
    return parsed
예제 #7
0
파일: blast.py 프로젝트: Pfiver/RNA-Seqlyze
def _compare_by_blast(input_ref, xref_db, blast_out, subject_blast=False):
    """Run blastp for *input_ref* against *xref_db* and report the top hit.

    The search writes XML to *blast_out*. If launching BLAST fails, a file
    holding a single newline is written instead so processing can continue.
    Undecodable bytes in the output are replaced by spaces before parsing.

    Returns:
        ``(normalized_id, bits)`` for the first description, where the id is
        the second whitespace-separated token of its title passed through
        ``_normalize_id``; ``("", 0)`` when the output cannot be parsed or
        has no descriptions.
    """
    blast_cmd = NcbiblastpCommandline(query=input_ref,
                                      db=xref_db,
                                      out=blast_out,
                                      outfmt=5,
                                      num_descriptions=1,
                                      num_alignments=0)
    try:
        subprocess.check_call(str(blast_cmd).split())
    except (OSError, subprocess.CalledProcessError):
        # Handle BLAST errors cleanly; write an empty file and keep moving.
        with open(blast_out, "w") as out_handle:
            out_handle.write("\n")

    with codecs.open(blast_out, encoding="utf-8",
                     errors="replace") as blast_handle:
        xml_text = blast_handle.read()
    # U+FFFD marks bytes that could not be decoded as UTF-8.
    xml_text = xml_text.replace(u"\ufffd", " ")

    try:
        rec = NCBIXML.read(StringIO.StringIO(xml_text))
    except (xml.parsers.expat.ExpatError, ValueError):
        rec = None

    if not rec or len(rec.descriptions) == 0:
        return "", 0
    top = rec.descriptions[0]
    id_info = _normalize_id(top.title.split()[1])
    return id_info, top.bits
예제 #8
0
def blast_score(query_cdrs, subject_cdrs):
    """Score three query/subject sequence pairs with pairwise blastp.

    For each index 0..2 the pair ``query_cdrs[i]`` / ``subject_cdrs[i]`` is
    aligned through a bash process substitution and the XML output parsed.

    Args:
        query_cdrs: Indexable with at least three sequence strings.
        subject_cdrs: Indexable with at least three sequence strings.

    Returns:
        A 1-D array of 15 values: for each pair
        ``[score, expect, align_length, alignment.length, bits]`` taken from
        the first HSP of the last alignment reported, or ``[0, .5, 0, 0, 0]``
        when there is no hit.
    """
    blastOptions = "-evalue=200000 -word_size=2 -matrix='PAM30' -comp_based_stats='0' -outfmt=5"
    features = []
    for i in range(3):
        # NOTE(review): the sequences are interpolated into a shell command
        # line; callers must not pass untrusted strings.
        query = "-query <(echo -e '>Name\n" + query_cdrs[i] +"') "
        subject = "-subject <(echo -e '>Name\n" + subject_cdrs[i] +"') "
        blastString = "blastp " + query + subject + blastOptions

        # Run BLAST and parse the output as XML
        process = subprocess.Popen(
            args=blastString,
            stdout=PIPE,
            stderr=subprocess.STDOUT,
            shell=True,
            executable='/bin/bash',
            close_fds=True)

        output = process.communicate()[0]
        blast_result_record = NCBIXML.read(StringIO(output))

        # Default is the "no hit" vector, so a pair always contributes five
        # values even if an alignment carries no HSP.
        row = np.array([0, .5, 0, 0, 0])
        for alignment in blast_result_record.alignments:
            for hsp in alignment.hsps[0:1]:
                row = np.array([hsp.score, hsp.expect, hsp.align_length,
                                alignment.length, hsp.bits])
        features.append(row)

    # The rows are 1-D, so they are joined along axis 0; ``axis=1`` is out
    # of bounds for 1-D input and raises on current NumPy.
    return np.concatenate(features)
예제 #9
0
def blast_gene(seq, database):
    """Run blastp for *seq* against *database* and summarise the descriptions.

    The query is written to ``temp.fasta`` and the XML report to
    ``temp.xml`` in the current directory.

    Args:
        seq: Sequence record(s) accepted by ``SeqIO.write``.
        database: BLAST database name passed to ``blastp``.

    Returns:
        A flat list with three entries per description:
        ``species, description, str(e-value)``. Titles containing ``Tfl|``,
        ``Pfu|`` or ``PMAA_`` map to fixed species names; otherwise the
        species is the text between the first ``[`` and ``]`` of the title.
    """
    with open('temp.fasta', 'w') as tempfasta:
        SeqIO.write(seq, tempfasta, 'fasta')

    run = blastp(query='temp.fasta',
                 db=database,
                 num_descriptions=5,
                 num_threads=6,
                 outfmt=5,
                 out='temp.xml')
    run()

    # Close the report as soon as it is parsed.
    with open('temp.xml') as result_handle:
        result = NCBIXML.read(result_handle)

    rets = []
    for hit in result.descriptions:
        ttl = hit.title
        if 'Tfl|' in ttl:
            species = 'T. flavus'
            d = ttl[ttl.find('Tfl'):]
        elif 'Pfu|' in ttl:
            species = 'P. funiculosum'
            d = ttl[ttl.find('Pfu'):]
        elif 'PMAA_' in ttl:
            species = 'T. marneffei'
            d = ttl[ttl.find('PMAA'):]
        else:
            # Generic titles: "... | description [species]".
            species = ttl[ttl.find('[') + 1:ttl.find(']')]
            d = ttl[ttl.find('| ') + 1:ttl.find('[') - 1]
        rets.extend((species, d, str(hit.e)))
    return rets
예제 #10
0
 def parse(self):
     """
     Parse the BLAST XML report of every sample and collect matching alleles.

     For each sample whose ``blast_outputs`` file exists, every HSP subject
     sequence is added to ``sample.alleleset`` when it has the same length as
     ``sample.records``, contains no gap character, and its identity count
     reaches ``len(sample.records) * (self.cutoff / 100)``.
     """
     logging.info('Parsing outputs')
     for sample in self.samples:
         # Samples without a report are skipped silently
         if not os.path.isfile(sample.blast_outputs):
             continue
         try:
             with open(sample.blast_outputs, 'r') as result_handle:
                 logging.info(
                     'Parsing {sn} nr report'.format(sn=sample.name))
                 blast_record = NCBIXML.read(result_handle)
                 for alignment in blast_record.alignments:
                     for hsp in alignment.hsps:
                         subject = hsp.sbjct
                         # Full-length, gap-free subjects only
                         if len(subject) != len(sample.records) or '-' in subject:
                             continue
                         # Enforce the minimum identity derived from the cutoff
                         if hsp.identities < len(sample.records) * (self.cutoff / 100):
                             continue
                         sample.alleleset.add(subject)
         except FileNotFoundError:
             # The file vanished between the existence check and open()
             pass
예제 #11
0
파일: Blast.py 프로젝트: kietjohn/SeqDB
    def __init__(self, ID, raw_blast_result, blast_object=None):
        """Build the hit table for one BLAST result.

        Args:
            ID: Stored as ``db_index`` and as first field of every hit tuple.
            raw_blast_result: Path of a BLAST XML file; only read when
                *blast_object* is falsy.
            blast_object: Already-parsed BLAST record, if available.

        Sets ``hits`` to one tuple per HSP
        ``(db_index, accession, hit_def, expect, query_start, query_end,
        sbjct_start, sbjct_end, identity)`` where identity is
        ``identities / align_length``, splits the hits into
        ``colonizer_matches`` / ``non_colonizer_matches``, and sets
        ``pursue`` to 1 when there are no non-colonizer matches.
        """
        self.db_index = ID
        self.blast_result = blast_object

        self.pursue = 0
        self.hits = []

        if not self.blast_result:
            with open(raw_blast_result, 'r') as xml_handle:
                self.blast_result = NCBIXML.read(xml_handle)

        self.clone = self.blast_result.query

        for hit in self.blast_result.alignments:
            for hsp in hit.hsps:
                fraction = float(Decimal(hsp.identities) / Decimal(hsp.align_length))
                row = (self.db_index, hit.accession, hit.hit_def, hsp.expect,
                       hsp.query_start, hsp.query_end,
                       hsp.sbjct_start, hsp.sbjct_end, fraction)
                self.hits.append(row)

        # NOTE(review): ``row[2].split(' ')`` is a list, so the second test
        # asks whether that whole list is a member of the collection --
        # confirm this is intended (and that the collection is not a set).
        self.colonizer_matches = [
            row for row in self.hits
            if (row[1] in COLONIZER or row[2].split(' ') in COLONIZER)]
        self.non_colonizer_matches = [
            row for row in self.hits
            if (row[1] in NON_COLONIZER or row[2].split(' ') in NON_COLONIZER)]
        if not self.non_colonizer_matches:
            self.pursue = 1
예제 #12
0
def searchBlast(fastafile, e):
    """
    Submits a FASTA DNA sequence file to the NCBI Blast web service
    to determine the genome that the sequence belongs to.

    Parameters:
        fastafile : Path of a DNA sequence file in FASTA format, a string.
        e : E value threshold, e values of the alignment must be below this value, a float.

    Returns:
        A list with one item per HSP below the threshold, each item being
        ``[title, length, expect, query, match, sbjct]``.
    """
    # Read and close the input file before the (slow) remote query starts.
    with open(fastafile) as fasta_handle:
        fasta_string = fasta_handle.read()

    result_handle = NCBIWWW.qblast("blastn", "nt", fasta_string)
    blast_record = NCBIXML.read(result_handle)

    return [
        [alignment.title, alignment.length, hsp.expect,
         hsp.query, hsp.match, hsp.sbjct]
        for alignment in blast_record.alignments
        for hsp in alignment.hsps
        if hsp.expect < e
    ]
예제 #13
0
파일: worker.py 프로젝트: animesh/scop3d
def downloadUniprotSequences(uniprotID, blastFile, sequencesFile, cutoff,
                             verbose):
    """Download FASTA sequences for the hits of a BLAST XML report.

    Args:
        uniprotID: UniProt id whose sequence is written first, or ``None``
            to skip it.
        blastFile: Path of the BLAST XML report to read.
        sequencesFile: Path of the FASTA file to (over)write.
        cutoff: Minimum percent identity (``identities / align_length``) an
            HSP needs for its hit to be downloaded.
        verbose: When true, progress and warnings are printed.

    The sequence id is taken from the alignment title: field 4 for ``gi|``
    titles, field 2 for ``sp|`` / ``ref|`` titles; other titles are skipped.
    Each qualifying HSP triggers a download from UniProt; if that fails the
    entry is fetched from the NCBI protein database instead.
    """
    print('Obtaining sequences from UniProt...')
    with open(blastFile, 'r') as f:
        records = NCBIXML.read(f)
    if verbose:
        print('Found ' + str(len(records.alignments)) + ' matches')
    with open(sequencesFile, 'w') as f:
        if uniprotID is not None:
            sequence = urllib2.urlopen('http://www.uniprot.org/uniprot/' +
                                       uniprotID + '.fasta')
            f.write(sequence.read() + "\n")
        for alignment in records.alignments:
            for hsp in alignment.hsps:
                words = alignment.title.split('|')
                if words[0] == 'gi':
                    seqID = words[3]
                elif words[0] in ('sp', 'ref'):
                    seqID = words[1]
                else:
                    # Unknown title layout: nothing to download.
                    continue
                if seqID == '':
                    continue

                identityPercent = 100.0 * float(hsp.identities) / float(
                    hsp.align_length)
                if identityPercent < float(cutoff):
                    if verbose:
                        print(seqID + " (identity " +
                              str(identityPercent) + "% < cutoff " +
                              str(cutoff) + "%) - skipping")
                    continue

                try:
                    sequence = urllib2.urlopen(
                        'http://www.uniprot.org/uniprot/' + seqID +
                        '.fasta')
                    f.write(sequence.read() + "\n")
                    if verbose:
                        print(seqID + " (identity " +
                              str(identityPercent) + "% >= cutoff " +
                              str(cutoff) + "%) - adding")
                except Exception as e:
                    if verbose:
                        print("WARNING: unable to download entry " +
                              seqID + " from Uniprot: " + str(e))
                        print("Trying NCBI protein...")
                    handle = Entrez.efetch(db="protein",
                                           id=seqID,
                                           rettype="fasta",
                                           retmode="xml")
                    fetched = 0
                    try:
                        for erecord in Entrez.parse(handle):
                            # NOTE(review): the entries come from the
                            # protein database but are tagged with a DNA
                            # alphabet -- confirm.
                            r = SeqRecord(
                                Seq.Seq(erecord['TSeq_sequence'],
                                        IUPAC.unambiguous_dna),
                                id=erecord['TSeq_accver'],
                                description=erecord['TSeq_defline'])
                            # Write inside the loop: previously the write
                            # used whatever ``r`` was left over, which is
                            # undefined or stale when nothing is returned.
                            SeqIO.write(r, f, "fasta")
                            fetched += 1
                    finally:
                        handle.close()
                    if fetched:
                        f.write("\n")
                    if verbose:
                        if fetched:
                            print("OK")
                        else:
                            print("WARNING: no NCBI protein record for " +
                                  seqID)
예제 #14
0
def blast_seq(seq):
    """BLAST *seq* through the batch runner and return the top hit's name.

    The sequence is written to ``my_fas.fas``, submitted with
    ``blast_runner`` (output ``my_blast.xml``, one hit), and the job is
    awaited with ``check_pbs``.

    Args:
        seq: Sequence string (written below a ``>new seq`` header).

    Returns:
        The fifth ``|``-separated field of the title of the first alignment
        that has an HSP, or ``None`` when the job did not report ``"Done!"``,
        there is no hit, or the title cannot be parsed.
    """
    print("Blasting...")
    # local blast
    with open("my_fas.fas", "w") as my_fasta:
        my_fasta.write(">new seq\n" + seq)
    job_id = blast_runner("my_fas.fas", outfile="my_blast.xml",
                          hitlist_size=1)  # pbs job id

    status = check_pbs(job_id)
    print(status)
    if status != "Done!":
        return None

    with open("my_blast.xml", "r") as xml_file:
        blast_record = NCBIXML.read(xml_file)

    try:
        for alignment in blast_record.alignments:
            if alignment.hsps:
                return str(alignment.title).split("|")[4]
    except (RuntimeError, TypeError, NameError, ValueError, IndexError):
        # IndexError: title with fewer than five '|' fields.
        return None
    return None
예제 #15
0
파일: exons.py 프로젝트: kpiszczek/exons
def compareSequences(seq1, seq2):
    """
    compareSequences(seq1, seq2) -> Alignment

    Create a comparison object for two sequences stored in .fasta files.

    Runs a local ``blastp`` (XML output, Smith-Waterman traceback) with
    *seq1* as query and *seq2* as subject and wraps the first HSP of the
    first alignment in an ``Alignment``.

    Returns ``None`` when BLAST produces no output or reports no alignment.
    If the BLAST application fails, the error is printed and the
    interpreter exits.
    """
    output = ''

    try:
        output = NcbiblastpCommandline(
            query=seq1, subject=seq2, outfmt=5, use_sw_tback=True)()[0]
    except Bio.Application.ApplicationError as err:
        # Message: "Blast program not found on the system path".
        print('Brak programu Blast w ścieżce systemowej')
        print(err)
        exit()

    if not output:
        return None

    blast_result_record = NCBIXML.read(StringIO(output))

    # No hit is a normal outcome, not an error: report it as ``None``
    # instead of failing with IndexError.
    if not blast_result_record.alignments:
        return None
    alignment = blast_result_record.alignments[0]
    if not alignment.hsps:
        return None

    hsp = alignment.hsps[0]
    return Alignment(query=hsp.query, match=hsp.match, subject=hsp.sbjct)
def write_flanks(rbase, flanksfile):
    '''
    Parse the results from BLASTing the F-plasmid against the de novo assemblies.
    For every sub-directory of rbase whose name starts with 'REL' or 'RM',
    read its results.xml, get the query length, and for each BLAST hit with
    e-value <= 1e-10 that ends exactly at the 3'-end of the query, obtain
    the flanking region via get_flank (from scaffolds.fasta in the same
    directory). All flanks are written to flanksfile in FASTA format, each
    with id "<directory>_flank".
    '''
    flank_record_list = []
    ## iterate over BLASTs against de novo assemblies.
    denovo_dirs = [x for x in listdir(rbase) if x.startswith(('REL', 'RM'))]
    for mygenome in denovo_dirs:
        myfulldir = join(rbase, mygenome)
        ## close each report right after parsing instead of leaking one
        ## open handle per genome.
        with open(join(myfulldir, "results.xml")) as result_h:
            blast_record = NCBIXML.read(result_h)
        query_length = int(blast_record.query_letters)
        subject_seq = join(myfulldir, "scaffolds.fasta")
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                if hsp.expect > 0.0000000001:
                    ## skip bad hits.
                    continue
                if hsp.query_end != query_length:
                    ## skip hits that don't match 3' end of F-plasmid query.
                    continue
                my_flank_seq = get_flank(alignment, hsp, subject_seq)
                flank_record_list.append(
                    SeqRecord(seq=my_flank_seq, id=mygenome + '_flank'))
    with open(flanksfile, 'w') as flanks_outhandle:
        SeqIO.write(flank_record_list, flanks_outhandle, format="fasta")
예제 #17
0
def parse_blast(seq, output):
    """Parse BLAST XML text and derive mutations from the top hit.

    Args:
        seq: Not used by this function.
        output: BLAST XML output as a string.

    Returns:
        ``(mutations, 1)`` where ``mutations`` is ``get_muts(query, sbjct)``
        for the first HSP of the first alignment, or ``(output, -1)`` when
        the report contains no alignment.

    Raises:
        ValueError: *output* is not parseable BLAST XML; the raw output is
            echoed to stderr first.
    """
    blast_output = StringIO(output)

    try:
        blast_records = NCBIXML.read(blast_output)
    except ValueError:
        raw = blast_output.getvalue()
        sys.stderr.write("-----Blast output------")
        sys.stderr.write(raw)
        if raw == "BLAST engine error: XML formatting is only supported for a database search":
            sys.stderr.write(
                "Please ensure that you are using the latest blastx version of blastn"
            )
            sys.stderr.write(
                "You may need to update your environment's PATH variable")
        # Bare ``raise`` keeps the original traceback.
        raise

    try:
        alignment = blast_records.alignments[0]
    except IndexError:
        # No hit at all: hand the raw output back with a failure flag.
        return (output, -1)

    hsp = alignment.hsps[0]

    mutations = get_muts(hsp.query, hsp.sbjct)

    return (mutations, 1)
def get_sequences():
    """Export the stored BLAST results of every GenBank record under ``NRPSRoot``.

    For each file found below ``NRPSRoot`` whose name minus its last four
    characters is not yet listed in ``Analysis/BLAST/BLASTStandard``, the
    matching ``Analysis/BLAST/BLASTXML/BLAST-<record name>.xml`` report is
    parsed. Every alignment gets its output directories and a full report
    through ``BLASTWriter``; every HSP with ``expect < 1e-8`` is additionally
    written in standard and FASTA form.
    """
    ana_dir = "Analysis"
    bla_dir = "BLAST"
    root_dir = "NRPSRoot"
    xml_dir = "BLASTXML"
    stan_dir = "BLASTStandard"
    fas_dir = "BLASTFASTA"
    e_threshold = .00000001

    BLASTWriter.simple_dir(bla_dir)
    BLASTWriter.create_dir(xml_dir, bla_dir)
    BLASTWriter.create_dir(stan_dir, bla_dir)

    # NOTE(review): file names are collected from all sub-directories but
    # later opened relative to root_dir itself -- confirm the tree is flat.
    main_dir = []
    for _dirpath, _dirnames, filenames in os.walk(root_dir):
        main_dir.extend(filenames)

    for file in main_dir:
        if file[0:len(file) - 4] in os.listdir(os.path.join(ana_dir, bla_dir, stan_dir)):
            continue
        record = SeqIO.read(os.path.join(root_dir, file), format="gb")
        xml_path = os.path.join(ana_dir, bla_dir, xml_dir,
                                "BLAST-" + record.name + ".xml")
        # Close the report once parsed instead of leaking one handle per file.
        with open(xml_path) as result_handle:
            blast_record = NCBIXML.read(result_handle)

        rec_dir = record.name
        for k, alignment in enumerate(blast_record.alignments, 1):
            # Directory name: title up to (not including) the first space.
            al_dir = alignment.title[:alignment.title.index(" ")]
            BLASTWriter.create_dir(os.path.join(stan_dir, rec_dir, al_dir), bla_dir)
            BLASTWriter.create_dir(os.path.join(fas_dir, rec_dir, al_dir), bla_dir)
            BLASTWriter.write_full_standard(k, alignment, record)
            i = 0  # running number of significant HSPs in this alignment
            for hsp in alignment.hsps:
                if hsp.expect < e_threshold:
                    i += 1
                    BLASTWriter.write_blast_standard(i, alignment, hsp, rec_dir)
                    BLASTWriter.write_blast_fasta(i, alignment, hsp, rec_dir)
예제 #19
0
파일: main.py 프로젝트: kevinxin90/NGS_Wu
def get_seq_pos(fasta_list):
    """Locate two-part hits of each sequence on ``parent.fasta``.

    Every sequence is written to ``seq1.fasta`` and aligned with ``blastn``
    against the subject file ``parent.fasta``. Only sequences whose first
    alignment has exactly two HSPs contribute a result.

    Args:
        fasta_list: Iterable of nucleotide sequence strings.

    Returns:
        A list with, for each contributing sequence, the sorted list of the
        four subject coordinates (start and end of both HSPs).
    """
    seq_pos_list = []
    for i, _seq in enumerate(fasta_list):
        if i % 500 == 0:
            print('{} has been processed!'.format(i))
        seq1 = SeqRecord(Seq(_seq))
        SeqIO.write(seq1, "seq1.fasta", "fasta")
        # Run BLAST and parse the output as XML
        try:
            output = NcbiblastnCommandline(query="seq1.fasta",
                                           subject="parent.fasta",
                                           outfmt=5)()[0]
            blast_result_record = NCBIXML.read(StringIO(output))
            if blast_result_record.alignments:
                hsps = blast_result_record.alignments[0].hsps
                if len(hsps) == 2:
                    results = []
                    for hsp in hsps:
                        results.append(hsp.sbjct_start)
                        results.append(hsp.sbjct_end)
                    seq_pos_list.append(sorted(results))
        except Exception:
            # Best effort per sequence; ``Exception`` rather than a bare
            # except so Ctrl-C / SystemExit can still stop the loop.
            print('failed to blast!')
            continue
    return seq_pos_list
예제 #20
0
파일: RunBlast.py 프로젝트: papoku/LSRs
def globalRun(d_dataset, p_dir_blast, debug=1):
    """Run blastp against ``pdb`` for every conserved entry of *d_dataset*.

    For each key whose entry has ``"conserve" == 1`` the FASTA file at
    ``entry["best"]["fasta"]`` is searched (unless the XML output already
    exists) and the entry is extended in place with:

    * ``"xml"``: path of the XML report (``p_dir_blast + key + ".xml"``),
    * ``"align"``: dict mapping the hit name (first token of the fifth
      ``|``-separated title field) to an HSP e-value; when a hit has
      several HSPs the last one wins.

    Args:
        d_dataset: Dict of per-PDB dicts, modified in place.
        p_dir_blast: Output directory prefix (concatenated as-is).
        debug: When truthy, the BLAST command line is printed.
    """
    for PDB_ID, entry in d_dataset.items():
        if entry["conserve"] != 1:
            continue
        p_fasta = entry["best"]["fasta"]
        p_out_blast = p_dir_blast + PDB_ID + ".xml"
        blastp_cline = NcbiblastpCommandline(query=p_fasta,
                                             db="pdb",
                                             outfmt=5,
                                             out=p_out_blast)
        if debug:
            # Function-call form works on both Python 2 and Python 3.
            print(blastp_cline)
        if not path.exists(p_out_blast):
            blastp_cline()
        entry["xml"] = p_out_blast
        entry["align"] = {}
        # parse blast out; the handle is closed even if parsing fails
        with open(p_out_blast) as result_handle:
            blast_records = NCBIXML.read(result_handle)
        for alignment in blast_records.alignments:
            for hsp in alignment.hsps:
                PDB_find = alignment.title.split("|")[4].split(" ")[0]
                entry["align"][PDB_find] = hsp.expect
예제 #21
0
def blastpSp(sp, db, evalue=0.0001):
    """Run a local blastp for *sp* against *db* and return the hit ids.

    The query sequence is obtained with ``seq(sp)``, written to a FASTA file
    in a fresh temporary directory and searched with ``blastp`` (XML
    output). BLAST's stdout/stderr are echoed. The temporary directory is
    removed afterwards, also when the search or the parsing fails.

    Args:
        sp: Identifier passed to ``seq`` to obtain the query sequence.
        db: BLAST database name.
        evalue: E-value threshold passed to blastp.

    Returns:
        For every alignment, the second ``|``-separated field of its title.
    """
    # Local import keeps this block self-contained.
    import shutil

    directory = tempfile.mkdtemp()
    fastaFile = '%s/seq.fasta' % directory
    resultFile = '%s/result.xml' % directory
    try:
        fasta = '>query\n%s' % seq(sp)
        with open(fastaFile, 'w') as wf:
            wf.write(fasta)
        blastp = NcbiblastpCommandline(query=fastaFile, db=db, evalue=evalue,
                                       outfmt=5, out=resultFile)
        stdout, stderr = blastp()
        print(stdout, end='', sep='')
        print(stderr, end='', sep='')
        with open(resultFile) as result_handle:
            blast_record = NCBIXML.read(result_handle)
    finally:
        # Remove only our own directory; os.removedirs would also prune
        # any parent directory that happens to be empty.
        shutil.rmtree(directory, ignore_errors=True)

    return [align.title.split('|')[1] for align in blast_record.alignments]
예제 #22
0
def write_blast(str1, str2, name1, name2):
    '''
    Align two sequences with blastp and print every HSP.

    The sequences are written to "seq1.fasta" (query) and "seq2.fasta"
    (subject) in the current directory before BLAST is run.

    Input:
        str1: the first sequence string
        str2: the second sequence string
        name1: the first sequence name
        name2: the second sequence name

    Return: None
    '''
    # Build both records before touching the disk.
    records = [SeqRecord(Seq(text), id=label)
               for text, label in ((str1, name1), (str2, name2))]
    for record, fasta_path in zip(records, ("seq1.fasta", "seq2.fasta")):
        SeqIO.write(record, fasta_path, "fasta")

    blast_cmd = NcbiblastpCommandline(query="seq1.fasta", subject="seq2.fasta", outfmt=5)
    xml_text = blast_cmd()[0]
    parsed = NCBIXML.read(StringIO(xml_text))

    for hit in parsed.alignments:
        for hsp in hit.hsps:
            print('****Alignment****')
            for label, value in (('sequence:', hit.title),
                                 ('length:', hit.length),
                                 ('e value:', hsp.expect)):
                print(label, value)
            # Query / match line / subject, one per line.
            for row in (hsp.query, hsp.match, hsp.sbjct):
                print(row)
예제 #23
0
    def run(self, input_seq):
        """Run BLAST on *input_seq* and return the top alignments.

        The command ``<blast_path> -db <blastdb> -outfmt 5`` receives the
        sequence on stdin. Anything BLAST writes to stderr is logged at
        debug level.

        Args:
            input_seq: Query sequence; converted with ``str``.

        Returns:
            A list of ``(hit_id, alignment)`` tuples for the first
            ``self.top_results`` alignments; empty when BLAST printed only
            a newline.
        """
        command = '%s -db %s -outfmt 5' % (self.blast_path, self.blastdb)

        # Windows has problems with Popen and PIPE
        if sys.platform == 'win32':
            # Feed stdin from a real file; the context manager closes (and
            # thereby deletes) it as soon as BLAST has finished.
            with tempfile.NamedTemporaryFile() as tmp:
                logger.debug("Running Blast with sequence: {}".format(input_seq))
                tmp.write(bytes(str(input_seq) + '\n', 'latin1'))
                tmp.seek(0)
                blast = Popen(command, universal_newlines=True, stdin=tmp,
                              stdout=PIPE, stderr=PIPE)
                (blast_out, blast_err) = blast.communicate()
        else:
            # Rest of the world:
            blast = Popen(command, universal_newlines=True, shell=True,
                          stdin=PIPE, stdout=PIPE, stderr=PIPE)
            (blast_out, blast_err) = blast.communicate(input=str(input_seq))

        if len(blast_err) != 0:
            logger.debug(blast_err)

        output = []
        if blast_out != '\n':
            result = NCBIXML.read(StringIO(blast_out))
            for aln in result.alignments[:self.top_results]:
                logger.debug("Looping over alignments, current hit: {}".format(aln.hit_id))
                output.append((aln.hit_id, aln))
        return output
예제 #24
0
def blast_pdb(target_sequence, num_hits=1000):
    """
    Search the PDB with an online NCBI blastp query and wrap every hit in an MSMSeed

    Parameters
    ----------
    target_sequence : String
        Sequence submitted as the BLAST query
    num_hits : int, optional
        Passed to BLAST as ``hitlist_size``. Default: 1000

    Returns
    -------
    msmseeds : list of MSMSeed objects
        One MSMSeed per alignment, built from the target sequence, the
        template FASTA and structure returned by ``_retrieve_chain`` for the
        alignment's accession, and the e-value of the alignment's first HSP.
    """
    from Bio.Blast import NCBIWWW, NCBIXML

    handle = NCBIWWW.qblast("blastp", "pdb", target_sequence,
                            hitlist_size=num_hits)
    record = NCBIXML.read(handle)

    seeds = []
    for hit in record.alignments:
        # Read the e-value first so a hit without HSPs fails before any
        # template retrieval is attempted.
        first_expect = hit.hsps[0].expect
        fasta, structure = _retrieve_chain(hit.accession)
        seed = MSMSeed(target_sequence, fasta, structure, first_expect)
        seeds.append(seed)
    return seeds
예제 #25
0
def blast(dbname, blast_program, query, evalue_threshold=0.001):
    """Run a local BLAST+ search for ``query`` and return its HSPs.

    The query is written to a temporary FASTA file and searched with tblastn
    (when ``blast_program == 'tblastn'``) or blastn (any other value). The
    XML output is parsed into ``Blast_Result`` objects. Both the temporary
    query file and the XML output file are removed before returning, whether
    the search succeeds, fails or parsing raises.

    Args:
        dbname: value passed to the BLAST ``db`` option.
        blast_program: 'tblastn' selects tblastn; anything else runs blastn.
        query: sequence text written under a ``>Query`` FASTA header.
        evalue_threshold: value passed to the BLAST ``evalue`` option.

    Returns:
        A list of ``Blast_Result`` objects, one per HSP kept; an empty list
        when the BLAST executable exits with a non-zero status.
    """
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as query_file:
        infile = query_file.name
        query_file.write(">Query\n%s\n" % query)

    outfile = "%s.out.xml" % infile
    if blast_program == 'tblastn':
        blast_cl = NcbitblastnCommandline(query=infile, db=dbname,
                                          evalue=evalue_threshold,
                                          word_size=6, outfmt=5, out=outfile)
    else:
        blast_cl = NcbiblastnCommandline(query=infile, db=dbname,
                                         evalue=evalue_threshold,
                                         word_size=6, outfmt=5, out=outfile)

    cl = "%s/%s" % (settings.NCBI_BIN_DIR, str(blast_cl))
    results = []
    try:
        try:
            # NOTE(review): splitting on single spaces breaks if any path in
            # the command line contains a space.
            r = subprocess.call(cl.split(" "))
        finally:
            os.unlink(infile)

        if r != 0:
            # Parenthesised single-argument form prints the same text under
            # both the Python 2 print statement and the Python 3 function.
            print("Blast failed: %s" % cl)
            return []

        with open(outfile, "r") as xml_handle:
            blast_record = NCBIXML.read(xml_handle)

        for alignment in blast_record.alignments:
            accession = Blast_Accession(alignment.accession)
            for hsp in alignment.hsps:
                if accession.fragment_length is not None:
                    if hsp.sbjct_start > accession.fragment_length and \
                       hsp.sbjct_end > accession.fragment_length:
                        continue
                    # don't apply '% accession.fragment_length' to
                    # sbjct_start/end. Blast_Result#strand compares sbjct_start
                    # and sbjct_end to determine which strand the hit is on.
                    # Caller should just handle when sbjct_start/end is greater
                    # than fragment length. alternatively, we can store strand
                    # explicit, but that also creates complexity when using
                    # sbjct_start/end coordinates.

                results.append(
                    Blast_Result(fragment_id=accession.fragment_id,
                                 fragment_length=accession.fragment_length,
                                 hit_def=alignment.hit_def,
                                 query_start=hsp.query_start,
                                 query_end=hsp.query_end,
                                 subject_start=hsp.sbjct_start,
                                 subject_end=hsp.sbjct_end,
                                 evalue=hsp.expect,
                                 alignment=dict(query=hsp.query,
                                                match=hsp.match,
                                                matchi=inverse_match(hsp.match),
                                                subject=hsp.sbjct)))
    finally:
        # The output file may not exist if BLAST failed before writing it.
        if os.path.exists(outfile):
            os.unlink(outfile)

    return results
예제 #26
0
def blast_sequences(comp_seq, ref_seq):
    '''
    Align two protein sequences with blastp and map their residue positions.

    Both sequences are written to temporary FASTA files, the comparison
    sequence is used as the query and the reference as the subject, and the
    HSP with the highest score (which must be greater than zero) is walked
    column by column to pair up aligned residues.

    Notes:
        The blastp executable from NCBI BLAST+ has to be runnable through
        NcbiblastpCommandline.

    Args:
        comp_seq (str): A comparison protein sequence.
        ref_seq (str): A reference protein sequence.

    Returns:
        dict: comparison sequence position (key) -> reference sequence
            position (value)
        dict: reference sequence position (key) -> comparison sequence
            position (value)
        Both dictionaries are empty when no HSP scores above zero.
    '''
    with tempfile.NamedTemporaryFile(mode='w') as comp_seq_file, \
         tempfile.NamedTemporaryFile(mode='w') as ref_seq_file:
        # Flush so blastp sees the content while the files are still open.
        for fasta_file, sequence in ((comp_seq_file, comp_seq),
                                     (ref_seq_file, ref_seq)):
            fasta_file.write(">\n" + str(sequence) + "\n")
            fasta_file.flush()
        run_blastp = NcbiblastpCommandline(query=comp_seq_file.name,
                                           subject=ref_seq_file.name,
                                           evalue=0.001,
                                           outfmt=5)
        xml_text, _stderror = run_blastp()
    blast_record = NCBIXML.read(StringIO(xml_text))

    # Keep the first HSP holding the strictly highest score.
    best_hsp = None
    best_score = 0
    all_hsps = (hsp for aln in blast_record.alignments for hsp in aln.hsps)
    for hsp in all_hsps:
        if hsp.score > best_score:
            best_score = hsp.score
            best_hsp = hsp

    pdb_to_ref = {}
    ref_to_pdb = {}
    if best_hsp is None:
        return pdb_to_ref, ref_to_pdb

    query_row = best_hsp.query
    sbjct_row = best_hsp.sbjct
    query_pos = best_hsp.query_start
    ref_pos = best_hsp.sbjct_start
    for column, query_res in enumerate(query_row):
        in_query = query_res.isalpha()
        in_ref = sbjct_row[column].isalpha()
        # Only columns with a residue on both rows produce a mapping; a
        # non-letter (gap) on one row just advances the other counter.
        if in_query and in_ref:
            pdb_to_ref[query_pos] = ref_pos
            ref_to_pdb[ref_pos] = query_pos
        if in_query:
            query_pos += 1
        if in_ref:
            ref_pos += 1
    return pdb_to_ref, ref_to_pdb
예제 #27
0
파일: blast.py 프로젝트: ErillLab/cgb
    def search(self, blast_program, query, e_val):
        """Run a BLAST program on the query against this object's database.

        The query text is written to a temporary file, the program is run
        through the shell with XML output (-outfmt 5), and the output file
        is parsed.

        Args:
            blast_program (string): BLAST flavor to run; must be one of
                'tblastn', 'tblastx' or 'blastx'.
            query (string): query sequence in FASTA format
            e_val (float): E-value threshold
        Returns:
            The record returned by NCBIXML.read for the output file.
        """
        assert blast_program in ['tblastn', 'tblastx', 'blastx']
        # Two automatically named temporary paths: results first, then query.
        output_file = temp_file_name()
        query_file = temp_file_name()
        with open(query_file, 'w') as query_handle:
            query_handle.write(query)

        fields = dict(prog=blast_program, q=query_file, db=self._db_file,
                      e=e_val, out=output_file)
        cmd = '{prog} -query {q} -db {db} -evalue {e} -out {out} -outfmt 5'.format(
            **fields)
        logging.debug(cmd)
        os.system(cmd)

        # Parse results
        with open(output_file) as results_handle:
            return NCBIXML.read(results_handle)
def parse_output(query, output):
    """Record the significant HSPs of the first alignment in a BLAST XML file.

    For every HSP of the first alignment whose e-value is below 0.01, the
    identity percentage relative to the query length is written out through
    write_blast_results(). When that percentage is not exactly 100, the
    query is added via add_to_file() and the BLAST database is rebuilt with
    make_new_blast_db().

    Args:
        query: path of the FASTA file holding the single query record; also
            passed on to organism_finder(), write_blast_results() and
            add_to_file().
        output: path of the BLAST XML result for that query.
    """
    length_of_record = len(SeqIO.read(query, 'fasta'))

    # Close the XML file as soon as the record has been parsed.
    with open(output) as result_handle:
        blast_record = NCBIXML.read(result_handle)

    # Only the first alignment is examined.
    for alignment in blast_record.alignments[:1]:
        for hsp in alignment.hsps:
            if hsp.expect < 0.01:
                # float() keeps this a true division under Python 2 as well.
                percentage = (float(hsp.identities) / length_of_record) * 100
                percentage2 = str(percentage)
                variant = alignment.title.split(' ')[-1]
                e_value = str(hsp.expect)

                organism_one, organism_two = organism_finder(query)

                write_blast_results(organism_one, organism_two, query,
                                    percentage2, variant, e_value)

                if percentage != 100.0:
                    add_to_file(query)
                    make_new_blast_db()
예제 #29
0
def parseRecord(xmlfile,genomePath,debug):
    """Summarise the first hit of a BLAST XML result.

    Written for Python 2 (print statements).

    Parameters:
        xmlfile: not read by this function (see the NOTE below).
        genomePath: passed to getRelativeLocation().
        debug: when truthy, progress and hit details are printed.

    Returns:
        A 6-tuple (num1, direction, pos, hit, e, '') where num1 is the first
        coordinate parsed from the hit definition, direction comes from
        getDirection(), pos from getLocation(), hit is the first alignment's
        hit_def and e is the first description's e-value.

    Raises:
        IndexError: if the result has no alignments or descriptions, or if
            no whitespace-separated token of the hit definition contains
            '..'.
    """
    if debug:
        print "In BLASTing.parseRecord"
    
    # NOTE(review): the xmlfile argument is ignored; the result is always
    # re-read from this fixed path, and the file object is never explicitly
    # closed.
    result = nxml.read(open('Files/extras/temp_blast.xml'))
    hit = result.alignments[0].hit_def
    e = result.descriptions[0].e
    if debug:
        print "Blast match: ",hit
        print "E-value: ",e
        
    # The hit definition is split on whitespace and the first token
    # containing '..' is taken as the coordinate field, parsed below as
    # "<num1>..<num2>(<strand>)".
    hitL = hit.split()
    hitID = hitL[0]  # not used afterwards
    t = [n for n in hitL if '..' in n]
    hitInfo = t[0]
    num1,num2 = hitInfo.split('..')
    # Cut num2 at the opening parenthesis; if there is no '(', find()
    # returns -1 and the last character of num2 is dropped instead.
    num2 = num2[:num2.find('(')]
    num1,num2 = int(num1),int(num2)
    strand = hitInfo[hitInfo.find('('):]  # not used afterwards

    
    # Determine the direction, relative location, and position of the gene
    direction = getDirection(hitInfo)
    termUpper,termLower = getRelativeLocation(genomePath)
    pos = getLocation(num1,termUpper,termLower)

    # TODO
    # Integrate warning for multiple hits
    
    return num1,direction,pos,hit,e,''
예제 #30
0
 def _test_describe():
     """Print the alignment list and the first alignment's title.

     Reads 'blastpdb/S438966_blast.xml' below global_settings.temp_folder.
     """
     xml_path = os.path.join(global_settings.temp_folder, 'blastpdb',
                             'S438966_blast.xml')
     # Context manager so the XML file is closed once parsed.
     with open(xml_path) as handle:
         blast_record = NCBIXML.read(handle)
     print(blast_record.alignments)
     print(blast_record.alignments[0].title)
예제 #31
0
def get_evalues():
    """Collect the e-values of every raw BLAST XML file into two text files.

    Every entry of the module-level ``file_path`` directory is parsed as a
    BLAST XML result. 'blast_eval.txt' receives one line per input file
    with each HSP e-value followed by a comma. 'FINAL_blast_eval.txt'
    receives the same lines with the trailing commas stripped.
    """
    raw_path = 'C:/Users/Alyssa/Desktop/CSE182/project/blast_eval.txt'
    final_path = 'C:/Users/Alyssa/Desktop/CSE182/project/FINAL_blast_eval.txt'

    with open(raw_path, 'w') as outfile:
        # Parse all raw BLAST output files in directory
        for rawblast_file in os.listdir(file_path):
            # file_path is concatenated as-is, so it is expected to end
            # with a path separator.
            with open(file_path + rawblast_file) as result_handle:
                blast_record = NCBIXML.read(result_handle)

            for alignment in blast_record.alignments:
                # Can have more than one result due to protein redundancy
                for hsp in alignment.hsps:
                    outfile.write(str(hsp.expect) + ',')
            outfile.write('\n')

    # Strip the trailing comma in each row
    with open(raw_path, 'r') as readin:
        with open(final_path, 'w') as eval_out:
            for line in readin:
                eval_out.write(line.rstrip(',\n') + '\n')
예제 #32
0
 def onlineParsing(self):
     '''
     Analyze online blast XML output files.

     For every name in self.list_fasta, the file
     "Data/output_blast/Online_output_<name>" is parsed and the title and
     identities/align_length of the first HSP of each of its first ten
     alignments are written to "Online_BLAST_results", tab-separated, under
     a line holding the name and followed by a blank line.
     '''
     print("Online parsing ...")
     with open("Online_BLAST_results", "w") as results:
         for fasta in self.list_fasta:
             xml_path = "Data/output_blast/Online_output_{}".format(fasta)
             with open(xml_path) as results_handle:
                 blast_record = NCBIXML.read(results_handle)
             results.write("{}\n".format(fasta))
             # Only the first 10 alignments are saved; slicing means later
             # alignments are never touched.
             for alignment in blast_record.alignments[:10]:
                 hsp = alignment.hsps[0]
                 identity = str(hsp.identities) + "/" + str(
                     hsp.align_length)
                 results.write("{}\t{}\n".format(
                     alignment.title, identity))
             results.write("\n")
     print("Online parsing done")
예제 #33
0
def init(blast_output_path):
    """Insert a stored BLAST result into the proteins knowledge graph.

    Connects to the Grakn server at localhost:48555, opens a session on the
    "proteins" keyspace and, for every sequence returned by
    query_target_sequences(), parses the BLAST XML file at
    ``blast_output_path`` and passes the record to
    insert_new_proteins_n_alignments().

    Args:
        blast_output_path: path of a BLAST XML output file.
    """
    with GraknClient(uri="localhost:48555") as client:
        with client.session(keyspace="proteins") as session:
            print("Connected to the proteins knowledge graph.")
            print("- - - - - - - - - - - - - - - - - - - - -")
            target_sequences = query_target_sequences(session)

            for sequence in target_sequences:
                print("BLASTing for: ", sequence)
                print("- - - - - - - - - - - - - - - - - - - - -")
                print(
                    "Waiting for BLAST search to complete. This can take a few minutes."
                )
                # NOTE(review): the live search below is disabled, so the
                # same stored file is parsed for every sequence.
                # result_handle = NCBIWWW.qblast(
                #     "blastp",
                #     "nr",
                #     sequence
                # )
                # print("Reading BLAST results")
                # print("- - - - - - - - - - - - - - - - - - - - -")
                # with open('./blast-output.xml', 'w') as output_file:
                #     output_file.write(result_handle.read())
                with open(blast_output_path) as blast_output:
                    blast_record = NCBIXML.read(blast_output)

                print(
                    "Inserting BLAST results into the proteins knowledge graph."
                )
                print("- - - - - - - - - - - - - - - - - - - - -")
                insert_new_proteins_n_alignments(session, sequence,
                                                 blast_record)
예제 #34
0
def searchBlast(fastafile, e):
    """
    Submits the content of a FASTA DNA sequence file to the NCBI Blast web
    service (blastn against nt) and collects the HSPs below an E value
    threshold.

    Parameters:
        fastafile : Path of a DNA sequence file in the FASTA format, a string.
        e : E value threshold, e values of the alignment must be below this value, a float.

    Returns:
        A list with one item per accepted HSP, each item being the list
        [alignment title, alignment length, e value, query, match, sbjct].
    """
    with open(fastafile) as fasta_handle:
        fasta_string = fasta_handle.read()

    result_handle = NCBIWWW.qblast("blastn", "nt", fasta_string)
    try:
        blast_record = NCBIXML.read(result_handle)
    finally:
        result_handle.close()

    results = []
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < e:
                results.append([
                    alignment.title,
                    alignment.length,
                    hsp.expect,
                    hsp.query,
                    hsp.match,
                    hsp.sbjct,
                ])
    return results
예제 #35
0
    def run(self, input_seq):
        """Run the configured BLAST executable on one sequence.

        The command is '<self.blast_path> -db <self.blastdb> -outfmt 5' and
        the sequence is supplied on standard input.

        Args:
            input_seq: query sequence; converted with str() before being
                sent to the process.

        Returns:
            A list of (hit_id, alignment) tuples for at most
            self.top_results alignments; empty when the process printed
            only a newline.
        """
        output = []
        command = '%s -db %s -outfmt 5' % (self.blast_path, self.blastdb)
        #Windows has problems with Popen and PIPE
        if sys.platform == 'win32':
            logger.debug("Running Blast with sequence: {}".format(input_seq))
            # The context manager closes (and thereby removes) the temporary
            # file once BLAST has consumed it as stdin.
            with tempfile.NamedTemporaryFile() as tmp:
                tmp.write(bytes(str(input_seq) + '\n', 'latin1'))
                tmp.seek(0)
                blast = Popen(command,
                              universal_newlines=True,
                              stdin=tmp,
                              stdout=PIPE,
                              stderr=PIPE)
                (blast_out, blast_err) = blast.communicate()
        else:
            #Rest of the world:
            blast = Popen(command,
                          universal_newlines=True,
                          shell=True,
                          stdin=PIPE,
                          stdout=PIPE,
                          stderr=PIPE)
            (blast_out, blast_err) = blast.communicate(input=str(input_seq))
        if blast_err:
            logger.debug(blast_err)
        # A lone newline on stdout is treated as "no result to parse".
        if blast_out != '\n':
            result = NCBIXML.read(StringIO(blast_out))
            for aln in result.alignments[:self.top_results]:
                logger.debug("Looping over alignments, current hit: {}".format(
                    aln.hit_id))
                output.append((aln.hit_id, aln))
        return output
예제 #36
0
def blast_gene(seq, database):
    """Run a local megablast search for one sequence and flatten the result.

    The sequence is written to 'temp.fasta' and the XML result to
    'temp.xml', both in the current working directory.

    Args:
        seq: record(s) accepted by SeqIO.write in FASTA format.
        database: value passed to the BLAST ``db`` option.

    Returns:
        A flat list of strings: for every description, the first
        space-separated word of its title and its e-value; then, for every
        HSP, frame[1], the query row, the match row and sbjct_start.
    """
    with open('temp.fasta', 'w') as tempfasta:
        SeqIO.write(seq, tempfasta, 'fasta')
    run = blastn(query='temp.fasta',
                 db=database,
                 num_descriptions=1,
                 num_threads=6,
                 outfmt=5,
                 word_size=4,
                 evalue=0.01,
                 task="megablast",
                 out='temp.xml')
    run()
    # Close the XML file as soon as it has been parsed.
    with open('temp.xml') as result_handle:
        result = NCBIXML.read(result_handle)

    rets = []
    for description in result.descriptions:
        rets.append(description.title.split(' ')[0])
        rets.append(str(description.e))
    for alignment in result.alignments:
        for hsp in alignment.hsps:
            rets.append(str(hsp.frame[1]))
            rets.append(str(hsp.query))
            rets.append(str(hsp.match))
            rets.append(str(hsp.sbjct_start))
    return rets
예제 #37
0
def draw_blast(path="C:/Users/arvid/Documents/arbeit/Blast.xml",
               max_entry=10,
               yMax=1000,
               xMax=1000):
    """Plot the query ranges covered by the HSPs of a BLAST XML result.

    Each alignment gets its own height; every HSP with an e-value below
    0.01 whose alignment index is below ``max_entry`` is drawn as a green
    horizontal segment from query_start to query_end and labelled with the
    index and a 20-character slice of the title taken after "PREDICTED: ".
    A red line is drawn across the plot at height yMax - yMax / max_entry.

    Args:
        path: BLAST XML file to read.
        max_entry: alignments with a 1-based index >= this value are not
            drawn; also sets the vertical spacing.
        yMax: upper limit of the y axis.
        xMax: right end of the red line.
    """
    with open(path) as handle:
        blast_record = NCBIXML.read(handle)
    E_VALUE_THRESH = 0.01
    dy = yMax / max_entry
    y = yMax - dy

    index = 0
    for alignment in blast_record.alignments:
        y = y - dy / 2
        index = index + 1
        for hsp in alignment.hsps:
            if hsp.expect < E_VALUE_THRESH and index < max_entry:
                # If "PREDICTED: " is absent, find() gives -1 and the slice
                # below becomes title[10:30].
                marker = alignment.title.find("PREDICTED: ")
                plt.text(
                    hsp.query_end, y, "   " + str(index) + " " +
                    alignment.title[marker + 11:marker + 31])
                plt.plot([hsp.query_start, hsp.query_end], [y, y], 'g-')
    plt.plot([0, xMax], [yMax - dy, yMax - dy], 'r-')

    plt.ylim((0, yMax))
    cur_axes = plt.gca()
    cur_axes.axes.get_yaxis().set_visible(False)
    plt.show()
def findOffTargets(refSeq, sgRNAseq):
    """Find HSPs of an sgRNA against 'testdb' that sit next to a GG/CC motif.

    The sgRNA is written to 'temp.fasta', searched with blastn
    (task 'blastn-short') against the database 'testdb', and the HSPs of
    the first alignment are filtered.

    Args:
        refSeq: reference sequence that is sliced around each hit end to
            test for the flanking 'GG' (subject end > start) or 'CC'
            (otherwise) dinucleotide.
        sgRNAseq: guide sequence text used as the BLAST query.

    Returns:
        List of HSP objects that end at query position >= 20, match
        perfectly over their last 5 columns and have the flanking
        dinucleotide; empty when BLAST reports no alignment.
    """
    candidates = []  # Return this list of candidates
    # Text mode: writing a str to a file opened 'wb' fails on Python 3.
    with open('temp.fasta', 'w') as f:
        f.write(sgRNAseq + '\n')

    cline = NcbiblastnCommandline(query="temp.fasta", db="testdb", outfmt=5, out="temp.xml", task='blastn-short')
    cline()
    with open('temp.xml', 'r') as result:
        records = NCBIXML.read(result)
    if len(records.alignments) == 0:
        return candidates

    for record in records.alignments[0].hsps:
        if record.query_end < 20:  # Require ends at the seed
            continue
        if record.match[-5:] != '|' * 5:  # Require 5 bp of seed is perfect match
            continue
        end = record.sbjct_end
        if record.sbjct_end > record.sbjct_start:
            # on the + strand, sequence is from [start,end]
            if refSeq[end + 2:end + 4] == 'GG':
                candidates.append(record)
        else:  # On the - strand
            if refSeq[end - 3:end - 1] == 'CC':
                candidates.append(record)
    return candidates
예제 #39
0
def blast_pdb(target_sequence, num_hits=1000):
    """
    Search the PDB with NCBI blastp and build one MSMSeed per alignment found.

    Parameters
    ----------
    target_sequence : String
        Target sequence sent to BLAST as the query.
    num_hits : int, optional
        Forwarded to qblast as ``hitlist_size``. Default: 1000

    Returns
    -------
    msmseeds : list of MSMSeed objects
        Each MSMSeed is created from the target sequence, the template
        sequence and template structure that ``_retrieve_chain`` returns
        for the alignment's accession, and the e-value of the alignment's
        first HSP.
    """
    from Bio.Blast import NCBIWWW, NCBIXML
    record = NCBIXML.read(
        NCBIWWW.qblast("blastp", "pdb", target_sequence, hitlist_size=num_hits)
    )
    seeds = []
    for hit in record.alignments:
        first_expect = hit.hsps[0].expect
        chain = _retrieve_chain(hit.accession)
        fasta, structure = chain
        seeds.append(MSMSeed(target_sequence, fasta, structure, first_expect))
    return seeds
예제 #40
0
def get_blast_alignments(seq, query):
  """Run a remote blastn search against nr and keep well-covered hits.

  An alignment is kept when the summed align_length of its HSPs is at
  least 80% of len(seq).

  Args:
    seq: query sequence; its length is the coverage denominator.
    query: Entrez query restricting the search (qblast ``entrez_query``).

  Returns:
    The alignments list of the parsed BLAST record, filtered in place.
  """
  ncbi = NCBIWWW.qblast(program="blastn", database="nr",
                        sequence=seq, entrez_query=query, format_type="XML",
                        hitlist_size=500, expect=100.0)
  blast = NCBIXML.read(ncbi)
  query_length = len(seq)

  def covers_query(alignment):
    # Float start value keeps the ratio a true division on Python 2 too.
    overall_length = sum((hsp.align_length for hsp in alignment.hsps), 0.0)
    return (overall_length / query_length) >= 0.8

  # Single-pass, in-place filter: same list object, no repeated remove().
  blast.alignments[:] = [a for a in blast.alignments if covers_query(a)]
  return blast.alignments
예제 #41
0
    def execute_blast(self, id_seq, limit, evalue):
        """Run a remote blastn search for a stored sequence and print its hits.

        The raw XML answer is saved to '.blast_result.xml' in the current
        directory and parsed from there.

        Args:
            id_seq: identifier passed to self.db.getSeq().
            limit: maximum number of hits (qblast ``hitlist_size``).
            evalue: HSPs with an e-value below this are printed in detail;
                for any other HSP a one-line notice is printed.

        Returns:
            None. When the sequence does not exist, only the message from
            Messages.nonexistent_sequence() is printed.
        """
        seq = self.db.getSeq(id_seq)

        if seq is None:
            print(Messages.nonexistent_sequence())
            return

        blast_result = NCBIWWW.qblast("blastn", "nr", seq.getSeq(), hitlist_size=limit)

        # Context managers close the file even if writing or parsing fails.
        with open(".blast_result.xml", "w") as file_blast:
            file_blast.write(blast_result.read())

        with open(".blast_result.xml") as file_blast:
            blast_record = NCBIXML.read(file_blast)

        for alignment in blast_record.alignments:
            for hsps in alignment.hsps:
                if hsps.expect < evalue:
                    print(f"sequence: {alignment.title}")
                    print(f"accession: {alignment.accession}")
                    print(f"length: {alignment.length}")
                    print(f"e value: {hsps.expect}")
                    print(f"score: {hsps.score}")
                    print(f"identities: {hsps.identities}")
                else:
                    print("Inferior to the provided evalue!")
예제 #42
0
 def parse(infolder, outfolder):
     """Convert BLAST XML files into JSON lists of high-scoring matches.

     Both folders are relative to global_settings.temp_folder. The output
     folder is created when missing. Every file in the input folder whose
     name contains '.xml' is parsed, and each HSP with a score above 100
     becomes one dictionary in the JSON file of the same name with '.xml'
     replaced by '.json'.

     A ValueError raised while handling a file is reported with warn() and
     the file is skipped.

     Args:
         infolder: name of the folder holding the BLAST XML files.
         outfolder: name of the folder receiving the JSON files.
     """
     in_dir = os.path.join(global_settings.temp_folder, infolder)
     out_dir = os.path.join(global_settings.temp_folder, outfolder)
     if not os.path.exists(out_dir):
         os.mkdir(out_dir)
     for file in os.listdir(in_dir):
         if '.xml' not in file:
             continue
         try:
             # Context manager so each XML file is closed once parsed.
             with open(os.path.join(in_dir, file)) as handle:
                 blast_record = NCBIXML.read(handle)
             matches = []
             for align in blast_record.alignments:
                 for hsp in align.hsps:
                     if hsp.score <= 100:
                         continue
                     # The title is split on '|': field 3 is used as the
                     # PDB id and the first character of field 4 as chain.
                     title_fields = align.title.split('|')
                     pdb = title_fields[3]
                     chain = title_fields[4][0]
                     match_end = hsp.align_length + hsp.query_start
                     matches.append({
                         'x': int(hsp.query_start),
                         'y': int(match_end),
                         'description': align.hit_def.split('&gt;')[0],
                         'id': 'blastpdb_{p}_{x}_{y}_{c}'.format(
                             p=pdb,
                             c=chain,
                             x=hsp.query_start,
                             y=match_end),
                         'chain': chain,
                         'url': pdb,
                         'offset': hsp.sbjct_start - hsp.query_start,
                         'extra': {
                             'match': align.title[0:50],
                             'match_score': hsp.score,
                             'match_start': hsp.query_start,
                             'match_length': hsp.align_length,
                             'match_identity':
                             hsp.identities / hsp.align_length
                         }
                     })
             json_path = os.path.join(out_dir, file.replace('.xml', '.json'))
             with open(json_path, 'w') as w:
                 json.dump(matches, w)
         except ValueError as err:
             warn(f'Value error for {file}: {err}')
예제 #43
0
def BlastGenome(queryFile,genome,debug,outputFile='Files/extras/temp_blast.xml'):
    """BLAST a query file against a genome database and summarise the top hit.

    Written for Python 2 (print statements).

    Parameters:
        queryFile: path of the query file handed to BLAST.
        genome: genome name; spaces are removed and the database path
            'Files/genome/<genome>/<genome>' is built from it.
        debug: when truthy, a progress line is printed; also forwarded to
            parseRecord().
        outputFile: path where BLAST writes its XML (outfmt 5) output.

    Returns:
        The value of parseRecord() when the result has alignments;
        otherwise the 5-tuple (0, 0, 'Ambiguous', 'No Match', 'N/A').
        NOTE(review): parseRecord() returns six values, so the two branches
        differ in length -- confirm what callers unpack.
    """
    if debug:
        print "In BLASTing.BlastGenome"

    # Modify the genome filename to reflect the path to the genome
    genome = genome.replace(' ','')
    genomePath = 'Files/genome/' + genome + '/' + genome

    ## Call blast+ from python
    # ncl is presumably a Biopython BLAST+ command-line wrapper -- confirm
    # against the file's imports. Its string form is run through the shell.
    cline = ncl(query=queryFile,db=genomePath,out=outputFile,outfmt=5)
    ret_code = subprocess.call(str(cline),shell=True)

    # A failing BLAST run is only reported; parsing is still attempted.
    if ret_code:
        print 'BLASTing file "%s" returned error code %s' % (queryFile,ret_code)

    # First whitespace-separated token of the query file without leading
    # '>' characters. geneID is not used afterwards, and neither file
    # object opened here is explicitly closed.
    temp = open(queryFile).read()
    geneID = temp.split()[0]
    geneID = geneID.lstrip('>')
    result = nxml.read(open(outputFile))
    
    # If the blast returns no results, it will be treated as a gene
    # in the ambiguous region and oligos will be made from both strands
    if result.alignments:
        # The parsed record is passed in the xmlfile position.
        return parseRecord(result,genomePath,debug)
    else:
        return 0,0,'Ambiguous','No Match','N/A'
예제 #44
0
파일: Blast.py 프로젝트: kietjohn/SeqDB
    def __init__(self, ID, raw_blast_result, blast_object=None):
        """Collect the HSPs of one BLAST result and classify them.

        Args:
            ID: stored as self.db_index and placed first in every hit tuple.
            raw_blast_result: path of a BLAST XML file; only read when
                blast_object is falsy.
            blast_object: already parsed BLAST record, used instead of
                reading the file when truthy.

        Sets self.hits to one tuple per HSP:
        (db_index, accession, hit_def, expect, query_start, query_end,
        sbjct_start, sbjct_end, identities / align_length).
        self.colonizer_matches and self.non_colonizer_matches hold the hits
        matched against COLONIZER and NON_COLONIZER. self.pursue is 1 when
        there are no non-colonizer matches, otherwise 0.
        """
        self.db_index = ID
        self.blast_result = blast_object

        self.pursue = 0
        self.hits = []

        if not self.blast_result:
            with open(raw_blast_result, 'r') as xml_handle:
                self.blast_result = NCBIXML.read(xml_handle)

        self.clone = self.blast_result.query

        for alignment in self.blast_result.alignments:
            for hsp in alignment.hsps:
                # Fraction of identical positions, computed with Decimal
                # and then converted to float.
                fraction = Decimal(hsp.identities) / Decimal(hsp.align_length)
                row = (self.db_index, alignment.accession, alignment.hit_def,
                       hsp.expect, hsp.query_start, hsp.query_end,
                       hsp.sbjct_start, hsp.sbjct_end, float(fraction))
                self.hits.append(row)

        # NOTE(review): the second test checks whether the *list* of words
        # of the hit definition is a member of the collection -- confirm
        # this is intended rather than a per-word lookup.
        self.colonizer_matches = []
        for row in self.hits:
            if row[1] in COLONIZER or row[2].split(' ') in COLONIZER:
                self.colonizer_matches.append(row)

        self.non_colonizer_matches = []
        for row in self.hits:
            if row[1] in NON_COLONIZER or row[2].split(' ') in NON_COLONIZER:
                self.non_colonizer_matches.append(row)

        if not self.non_colonizer_matches:
            self.pursue = 1
예제 #45
0
def runBlast(runtype, sequence):
    """Run a remote NCBI BLAST search and format every HSP as text.

    Args:
        runtype: "n" runs blastn against nt, "p" runs blastp against nr.
        sequence: raw sequence; it is wrapped in a FASTA record named Test.

    Returns:
        One string holding a formatted block (title, length, e value and
        the first 75 columns of query, match and subject) per HSP,
        concatenated without separators; empty when there are no HSPs.

    Raises:
        Exception: "INVALID BLAST TYPE" when runtype is neither "n" nor "p".
    """
    #Format sequence using FASTA standard
    fastaFormat = ">Test\n%s\n" % sequence

    # Reject anything but the two supported search types up front. This is
    # a debugging aid: the type is never chosen implicitly by the user.
    if runtype not in ("n", "p"):
        raise Exception("INVALID BLAST TYPE")
    if runtype == "n":
        # Nucleotide
        blastType, db = "blastn", "nt"
    else:
        # Amino Acid / Protein
        blastType, db = "blastp", "nr"

    # Run BLAST query and read the result into a BLAST object
    blast_record = NCBIXML.read(NCBIWWW.qblast(blastType, db, fastaFormat))

    # Format result for display to User
    template = "*****Alignment*****\n sequence: %s\n length: %s\n e value: %s\n %s...\n %s...\n %s... "
    pieces = []
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            fields = (alignment.title, alignment.length, hsp.expect,
                      hsp.query[0:75], hsp.match[0:75], hsp.sbjct[0:75])
            pieces.append(template % fields)

    # Return formatted result
    return "".join(pieces)
예제 #46
0
def blastdemo(genbankID):
    """Run a remote blastp search against swissprot and print the strong hits.

    Written for Python 2 (print statements).

    Every HSP with an e-value below 1e-17 is printed with its alignment
    title, alignment length, e-value and the first 75 columns of the query,
    match and subject rows. Afterwards the length of every alignment is
    printed. Nothing is returned.

    Args:
        genbankID: passed to qblast as the query sequence argument.
    """
    # run blastp on the swissprot database NB to scale this up we must do it locally on cluster
    result_handle = NCBIWWW.qblast("blastp", "swissprot", genbankID)
    # read the results as XML
    blast_record = NCBIXML.read(result_handle)

    # Set this value to ridiculously low
    E_VALUE_THRESH = 0.00000000000000001
    # Display every HSP whose e-value is below the threshold.
    # NOTE(review): with the Python 2 print statement used in this function,
    # the parenthesised two-item prints below output a tuple.
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            if hsp.expect < E_VALUE_THRESH:
                print ("****Alignment****")
                print ("sequence:", alignment.title)
                print ("length:", alignment.length)
                print ("e value:", hsp.expect)
                print (hsp.query[0:75] + "...")
                print (hsp.match[0:75] + "...")
                print (hsp.sbjct[0:75] + "...")
                print "\n"

    ### h is not defined yet, Will (problem from  iPython nb's!)
    # print h.query[0:75] + '...'
    # print h.match[0:75] + '...'
    # print h.sbjct[0:75] + '...'

    # Finally list the length of every alignment, whatever its e-value.
    for a in blast_record.alignments:
        print a.length
예제 #47
0
def downloadNCBISequences(blastFile, sequencesFile, cutoff, verbose):
	"""Download the sequences of sufficiently identical BLAST hits from NCBI.

	The BLAST XML file is parsed, and the id taken from the fourth
	'|'-separated field of the alignment title is collected once for every
	HSP whose identity percentage (identities / align_length * 100) is at
	least ``cutoff``. The ids are fetched from the Entrez "nuccore" database
	and written to ``sequencesFile`` in FASTA format.

	Any exception raised while fetching or writing is reported as a warning
	on stdout and otherwise ignored; the output file then stays as far as
	it was written.

	Args:
		blastFile: path of the BLAST XML result.
		sequencesFile: path of the FASTA file to (over)write.
		cutoff: minimum identity percentage; converted with float().
		verbose: when truthy, the number of matches and the decision for
			every HSP are printed.
	"""
	print('Obtaining sequences from NCBI...')
	with open(blastFile, 'r') as f:
		records = NCBIXML.read(f)
	if verbose:
		print('Found ' + str(len(records.alignments)) + ' matches')
	with open(sequencesFile, 'w') as f:
		sequences = []
		for alignment in records.alignments:
			for hsp in alignment.hsps:
				seqID = alignment.title.split('|')[3]
				identityPercent = 100.0 * float(hsp.identities) / float(hsp.align_length)
				if identityPercent >= float(cutoff):
					sequences.append(seqID)
					if verbose:
						print(seqID + " (identity " + str(identityPercent) + "% >= cutoff " + str(cutoff) + "%) - adding")
				elif verbose:
					print(seqID + " (identity " + str(identityPercent) + "% < cutoff " + str(cutoff) + "%) - skipping")
		try:
			handle = Entrez.efetch(db="nuccore", id=",".join(sequences), rettype="fasta", retmode="xml")
			try:
				# NOTE(review): IUPAC comes from Bio.Alphabet, which newer
				# Biopython releases no longer ship -- confirm the pinned version.
				DNAsequences = [
					SeqRecord(Seq.Seq(record['TSeq_sequence'], IUPAC.unambiguous_dna), id=record['TSeq_accver'], description=record['TSeq_defline'])
					for record in Entrez.parse(handle)
				]
				SeqIO.write(DNAsequences, f, "fasta")
			finally:
				# Close the network handle even when parsing or writing fails.
				handle.close()
		except Exception as e:
			print("WARNING: unable to download this entry: " + str(e))
	print('OK')
예제 #48
0
def read_blast_xml(filename, **kwargs):
    """Read BLAST XML format into a DataFrame with one row per alignment.

    Args:
        filename: path of a BLAST XML file holding a single query result.
        **kwargs: accepted for interface compatibility; not used.

    Returns:
        pandas.DataFrame with the columns accession, hit_def, hit_id,
        title, length, e_value and sequence. e_value and sequence are taken
        from the first HSP of each alignment. The frame is empty when the
        result has no alignments.
    """
    # Read file.
    with open(filename, 'r') as f:
        blast_record = NCBIXML.read(f)

    # Prepare DataFrame fields.
    data = {
        'accession': [],
        'hit_def': [],
        'hit_id': [],
        'title': [],
        'length': [],
        'e_value': [],
        'sequence': []
    }

    # Get alignments from blast result. Values are appended so that every
    # alignment contributes a row.
    for s in blast_record.alignments:
        first_hsp = s.hsps[0]
        data['accession'].append(s.accession)
        data['hit_def'].append(s.hit_def)
        data['hit_id'].append(s.hit_id)
        data['title'].append(s.title)
        data['length'].append(s.length)
        data['e_value'].append(first_hsp.expect)
        data['sequence'].append(first_hsp.sbjct)

    # Port to DataFrame.
    return pd.DataFrame(data)
예제 #49
0
def blast_test():
    '''BLAST result interpretation

    Load stored BLAST result XML files into BlastRecord objects, both from
    a file path and from an already parsed record, and compare match()
    with manually entered expected tuples.
    '''
    blast_dir = 'tests/test_data/blast/'

    # Expected match() fields after the leading database index.
    q8r1_fields = (
        'HM991502',
        'Pseudomonas fluorescens strain Q8r1-96 type III secretion gene cluster, complete sequence',
        0.0, 20, 992, 11854, 10866, 0.98, 1)
    nfm421_fields = (
        'CP002585',
        'Pseudomonas brassicacearum subsp. brassicacearum NFM421, complete genome',
        0.0, 19, 525, 636636, 636111, 0.96, 0)

    from_file1 = BlastRecord(55, blast_dir + 'single_blast1.xml')
    assert_equal(from_file1.match(), (55,) + q8r1_fields)

    from_file2 = BlastRecord(46, blast_dir + 'single_blast2.xml')
    assert_equal(from_file2.match(), (46,) + nfm421_fields)

    # Same data handed over as a parsed record; the path is a placeholder.
    with open(blast_dir + 'single_blast2.xml', 'r') as handle:
        parsed = NCBIXML.read(handle)
    from_object = BlastRecord(65, 'dummy_place_holder', parsed)
    assert_equal(from_object.match(), (65,) + nfm421_fields)
예제 #50
0
def blast(sequence, db):
  """Run blastn on a DNA sequence against a local database and collect features.

  The cleaned sequence is concatenated with itself before the search
  (presumably so hits spanning the origin of a circular sequence are found;
  TODO confirm). Only HSPs whose query start lies within the first copy are
  kept, and their end coordinate is wrapped modulo the original length.

  Args:
    sequence: raw DNA sequence; passed through clean_dna_sequence().
    db: database name, resolved under settings.NCBI_DATA_DIR.

  Returns:
    A list of Aligned_Feature objects, one per HSP with at least 85%
    identity relative to the aligned subject length.

  Raises:
    Exception: if the blastn process exits with a non-zero status.
  """
  feature_list = []
  query = clean_dna_sequence(sequence)
  query_length = len(query)
  doubled_query = query + query

  with tempfile.NamedTemporaryFile(mode='w', delete=False) as query_file:
    infile = query_file.name
    query_file.write(">Query\n%s\n" % (doubled_query,))

  outfile = "%s.out.xml" % (infile,)
  try:
    blast_cl = NcbiblastnCommandline(query=infile, db="%s/%s" % (settings.NCBI_DATA_DIR, db),
                                     evalue=0.001, word_size=6, outfmt=5, out=outfile)
    cl = str(blast_cl)
    cl = "%s/%s" % (settings.NCBI_BIN_DIR, cl)
    r = subprocess.call(cl.split(" "))
    if r != 0:
      raise Exception("Blast failed: %s" % (cl,))

    with open(outfile, "r") as result_handle:
      blast_record = NCBIXML.read(result_handle)

    for alignment in blast_record.alignments:
      accession = Blast_Accession(alignment.accession)
      for hsp in alignment.hsps:
        percent = 100.0*hsp.identities/(1.0*len(hsp.sbjct))
        if percent < 85: # this is somewhat arbitrary...
          continue

        start = hsp.query_start
        end = hsp.query_end
        # Subject coordinates running backwards are treated as the
        # counter-clockwise orientation; normalise so hit_start <= hit_end.
        if hsp.sbjct_end > hsp.sbjct_start:
          clockwise = True
          hit_start = hsp.sbjct_start
          hit_end = hsp.sbjct_end
        else:
          clockwise = False
          hit_end = hsp.sbjct_start
          hit_start = hsp.sbjct_end

        # Annotate partial hits with the matched sub-range of the feature.
        feature = alignment.hit_def
        if hit_start != 1 or hit_end != accession.feature_length:
          feature = '%s (%s-%s/%s)' % (feature, hit_start, hit_end, accession.feature_length)

        # Skip hits that start in the second copy of the doubled query.
        if start <= query_length:
          end = end % query_length
          aligned = Aligned_Feature(feature, alignment.hit_def, start, end, clockwise, accession.type,
                                    hsp.query, hsp.match, hsp.sbjct)
          feature_list.append(aligned)
  finally:
    # Always remove the temporary files, including when blastn fails or the
    # output cannot be parsed; the output file may not exist in that case.
    for path in (outfile, infile):
      if os.path.exists(path):
        os.unlink(path)

  return feature_list
    def get_blast_records(self):
        """Return the parsed BLAST record, using a file on disk as a cache.

        If ``self.search_output_file`` does not exist, an online search is
        run through ``NCBIWWW.qblast`` and its raw output is saved there.
        The file is then parsed with ``NCBIXML.read`` and the record returned.
        """
        if not os.path.isfile(self.search_output_file):
            # NOTE(review): the last three arguments are passed positionally;
            # confirm they line up with qblast's parameter order in the
            # installed Biopython version.
            blast_output = NCBIWWW.qblast(self.program, self.database, self.record.seq, self.entrez_query, 500, 100.0)
            try:
                with open(self.search_output_file, "w") as tempFile:
                    tempFile.write(blast_output.read())
            finally:
                blast_output.close()

        # Close the cached file once parsing is done instead of leaking it.
        with open(self.search_output_file) as blast_file:
            return NCBIXML.read(blast_file)
예제 #52
0
    def run(self, input_seq):
        """BLAST ``input_seq`` online and return the leading hits.

        The search uses ``self.blast_program`` against ``self.db``. The
        result is a list of ``(hit_id, alignment)`` tuples for the first
        ``self.top_results`` alignments, in the order they were reported.
        """
        handle = NCBIWWW.qblast(self.blast_program, self.db, input_seq, auto_format='xml')
        record = NCBIXML.read(handle)

        hits = []
        for alignment in record.alignments[:self.top_results]:
            logger.debug("Looping over alignments, current hit: {}".format(alignment.hit_id))
            hits.append((alignment.hit_id, alignment))
        return hits
예제 #53
0
def blast_test():
    """Compare BlastRecord.match() output with manually entered values.

    Covers two records created from BLAST XML paths and one created from an
    already parsed BLAST record.
    """
    # Everything in the expected tuples after the leading identifier.
    hm991502_fields = (
        "HM991502",
        "Pseudomonas fluorescens strain Q8r1-96 type III secretion gene cluster, complete sequence",
        0.0,
        20,
        992,
        11854,
        10866,
        0.98,
        1,
    )
    cp002585_fields = (
        "CP002585",
        "Pseudomonas brassicacearum subsp. brassicacearum NFM421, complete genome",
        0.0,
        19,
        525,
        636636,
        636111,
        0.96,
        0,
    )
    second_xml = "tests/test_data/blast/single_blast2.xml"

    from_first_file = BlastRecord(55, "tests/test_data/blast/single_blast1.xml")
    assert_equal(from_first_file.match(), (55,) + hm991502_fields)

    from_second_file = BlastRecord(46, second_xml)
    assert_equal(from_second_file.match(), (46,) + cp002585_fields)

    # Third case: hand BlastRecord a pre-parsed record; the path is a dummy.
    with open(second_xml, "r") as xml_handle:
        parsed_record = NCBIXML.read(xml_handle)
    from_parsed = BlastRecord(65, "dummy_place_holder", parsed_record)
    assert_equal(from_parsed.match(), (65,) + cp002585_fields)
예제 #54
0
def get_BLAST(taxid, queryseq):
    '''
    BLAST queryseq against the NCBI "nt" database, restricted to one taxon.

    taxid    -- NCBI taxonomy id to search within; a string or an integer.
    queryseq -- the query sequence handed to NCBIWWW.qblast.

    Returns the record parsed by NCBIXML.read from the online blastn
    (megablast) result.
    '''
    # str() lets callers pass the taxonomy id as an int as well as a string.
    e_query = "txid" + str(taxid) + " [ORGN]"
    # Alternative setting kept for reference: other_advanced='-G 4 -E 1'
    blast_result = NCBIWWW.qblast("blastn", "nt", queryseq, megablast=True, entrez_query=e_query,
        word_size='11', other_advanced='-G 5 -E 2')
    try:
        return NCBIXML.read(blast_result)
    finally:
        # Release the result handle once it has been parsed.
        blast_result.close()
예제 #55
0
def seqdist(seq1,seq2):
    """Return the e-value of the top blastp hit of seq1 against seq2.

    seq1 and seq2 are passed to NcbiblastpCommandline as ``query`` and
    ``subject``. The e-value of the first HSP of the first alignment is
    returned, or -1 when blastp reports no alignment at all.
    """
    blastp = NcbiblastpCommandline(query=seq1, subject=seq2, outfmt=5, evalue=100000000)
    xml_text, _stderr_text = blastp()
    record = NCBIXML.read(StringIO.StringIO(xml_text))

    hits = record.alignments
    if not hits:
        return -1

    top_hsp = hits[0].hsps[0]
    return top_hsp.expect
예제 #56
0
def adaptor_blast(query,dbpatch="adaptor.fasta"):
    """Build a nucleotide BLAST database from a FASTA file and blastn a query against it.

    query   -- query text fed to blastn on standard input.
    dbpatch -- path of the FASTA file used to build the database; the
               database is written next to it under the same name without
               the file extension.

    Returns the single record parsed by NCBIXML.read from the blastn XML output.
    """
    import os  # local import so the block does not rely on a file-level one

    # The database is rebuilt on every call; checking whether it already
    # exists would avoid the repeated work.
    # splitext drops only the final extension, so paths that contain other
    # dots (e.g. "./adaptor.fasta") keep their directory and stem intact.
    db = os.path.splitext(dbpatch)[0]
    print(myexe("makeblastdb -in %s -dbtype nucl -input_type fasta -out %s" % (dbpatch,db)))

    blastn_cline = NcbiblastnCommandline(db=db, outfmt=5)
    out, err = blastn_cline(stdin=query)
    # NCBIXML.read returns one record object, not a generator.
    blast_records = NCBIXML.read(StringIO(out))
    return blast_records
예제 #57
0
    def blast(self, input="search_output.xml"):
        """Run an online BLAST search, save the raw result and return it parsed.

        input -- path of the file the raw BLAST output is written to
                 (the name shadows the builtin but is kept for callers).

        Returns the record parsed by NCBIXML.read from the saved file.
        """
        blastOutput = NCBIWWW.qblast(self.program, self.database, self.record.seq,
                                     entrez_query=self.entrezQuery, format_type=self.formatType)

        # Close the output file before reading it back; otherwise buffered
        # data may not be on disk yet and the parser can see a truncated file.
        try:
            with open(input, "w") as outputFile:
                outputFile.write(blastOutput.read())
        finally:
            blastOutput.close()

        with open(input) as result:
            return NCBIXML.read(result)
def blast_seqs(each_seq):
    ''' Run an online blastn search for one sequence and return the top hit title.

    The title of the first alignment that has at least one HSP is returned;
    None is returned when there is no such alignment.
    '''
    # Search parameters chosen by the original author; they may still need
    # tuning for shorter primers.
    handle = NCBIWWW.qblast("blastn", "nt", each_seq, expect=0.04, hitlist_size = 1000, word_size=7)
    record = NCBIXML.read(handle)
    handle.close()
    return next((hit.title for hit in record.alignments if hit.hsps), None)
def parseBlastFile(xmlfil):
    """
    Input
    -----
    BLAST XML output file, named <id>_<anything>.xml
    parseBlastFile(xmlfil)
    e.g. parseBlastFile('O00238_blast.xml')

    Description
    -----------
    Parses the following information out of the BLAST output XML file,
    one CSV row per HSP:

    Uniprot | PDBid | chain | query_to | query_from | Iter Query Len | e-value | Query Coverage | Sequence Identity

    The id in the first column is the part of ``xmlfil`` before the first
    underscore. PDBid and chain are taken from the 4th and 5th '|'-separated
    fields of the alignment title. query_to / query_from hold the HSP's
    query_end / query_start.

    Output
    -------
    <id>.csv, written relative to the current working directory unless
    ``xmlfil`` carries a directory prefix without underscores.
    """
    NP_id = xmlfil.split('_')[0]

    # The context manager closes the handle even when parsing fails.
    with open(xmlfil) as result_handle:
        blast_record = NCBIXML.read(result_handle)

    outfilname = NP_id+'.csv'

    with open(outfilname,"w") as out_file:
        print("Writing output to %s"%(outfilname))
        out_file.write('Uniprot,PDBid,chain,query_to,query_from,IterQueryLen,e-value,QueryCov,SeqId\n')
        sequencequeryLength = blast_record.query_length
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                # Percent identity relative to the aligned query length,
                # rounded to a whole number.
                first = float(hsp.identities)
                second = len(hsp.query)
                identity = int(round(100*float(first/second),0))
                # Percent of the full query covered by this HSP.
                coverage = round(100*float(hsp.query_end - hsp.query_start)/sequencequeryLength,0)

                title_fields = alignment.title.split('|')
                pdbid = str(title_fields[3])
                chain = title_fields[4].split()

                # Build the whole row first so a failure never leaves a
                # partially written line in the CSV.
                row = [
                    NP_id,
                    pdbid,
                    str(chain[0]),
                    str(hsp.query_end),
                    str(hsp.query_start),
                    str(sequencequeryLength),
                    str(hsp.expect),
                    "%f"%coverage,
                    str(identity),
                ]
                out_file.write(",".join(row) + "\n")
def runBlast(cline, bOutFile, locus_sbjct):
	"""Run a BLAST command line, parse its XML output and delete the work files.

	cline       -- command-line object; its string form is run via os.system.
	bOutFile    -- path of the XML file the command writes; removed after parsing.
	locus_sbjct -- path of a subject file; removed if it exists.

	Returns the record parsed by NCBIXML.read from bOutFile.
	"""
	os.system(str(cline))
	# Close the result file before deleting it; removing a file that is
	# still open fails on some platforms and leaks the handle on others.
	with open(bOutFile) as rec:
		blast_record = NCBIXML.read(rec)

	if os.path.isfile(locus_sbjct):
		os.remove(locus_sbjct)
	os.remove(bOutFile)

	return blast_record