Esempio n. 1
0
def run_blastp(query, subject, outfile, evalue=1e-4):
    """Run blastp for ``query`` against ``subject`` and count the result lines.

    The search is run with ``outfmt=6`` and its results are written to
    ``outfile``.

    :param query: file that contains the query sequences
    :param subject: file that contains the sequences to search against
    :param outfile: path of the file that receives the BLAST results
    :param evalue: e-value threshold passed to blastp, default is 1e-4
    :return: number of lines in ``outfile`` (0 when the file is empty), which
        callers use as the number of BLAST hits
    :raises Exception: if ``outfile`` does not exist after the run
    """
    cmd = blastp(query=query,
                 subject=subject,
                 evalue=evalue,
                 outfmt=6,
                 out=outfile)

    # Results are written to ``outfile``; the value returned by the call is
    # not needed here.
    cmd()

    if not os.path.isfile(outfile):
        raise Exception('No BLAST output')

    if os.path.getsize(outfile) == 0:
        return 0

    # Close the result file deterministically instead of relying on the GC.
    with open(outfile) as handle:
        return len(handle.read().splitlines())
Esempio n. 2
0
def blast_gene(seq, database):
    """BLAST ``seq`` against ``database`` and summarise the hit descriptions.

    ``seq`` is written to ``temp.fasta`` in the working directory, blastp is
    run against ``database`` with XML output in ``temp.xml``, and the report
    is parsed. Both temporary files are left in place.

    :param seq: record(s) accepted by ``SeqIO.write`` to use as the query
    :param database: BLAST database name passed to blastp as ``db``
    :return: flat list with three entries per hit description, in order:
        species, identifier/description text, and the e-value as a string
    """
    with open('temp.fasta', 'w') as tempfasta:
        SeqIO.write(seq, tempfasta, 'fasta')

    run = blastp(query='temp.fasta',
                 db=database,
                 num_descriptions=5,
                 num_threads=6,
                 outfmt=5,
                 out='temp.xml')
    run()

    # Parse inside a context manager so the report handle is always closed.
    with open('temp.xml') as result_handle:
        result = NCBIXML.read(result_handle)

    # (marker that must occur in the title, species label, text from which
    # the identifier is cut). Order matters: the first matching marker wins.
    known_sources = (
        ('Tfl|', 'T. flavus', 'Tfl'),
        ('Pfu|', 'P. funiculosum', 'Pfu'),
        ('PMAA_', 'T. marneffei', 'PMAA'),
    )

    rets = []
    for description in result.descriptions:
        ttl = description.title
        for marker, species, anchor in known_sources:
            if marker in ttl:
                d = ttl[ttl.find(anchor):]
                break
        else:
            # Unknown source: take the species from the bracketed part of the
            # title and the description from between '| ' and '['.
            species = ttl[ttl.find('[') + 1:ttl.find(']')]
            d = ttl[ttl.find('| ') + 1:ttl.find('[') - 1]
        rets.extend([species, d, str(description.e)])
    return rets
Esempio n. 3
0
def Blast(type, protein_sequence, start, end, genomic_sequence):
    """BLAST the ORF found in ``protein_sequence`` and describe its first hit.

    The query is ``protein_sequence`` from its first ``M`` onward with ``*``
    appended. It is written to ``temp.ORF`` and searched with blastp against
    ``./Blast/DB.blast.txt``; the XML report goes to ``<type>.BLAST``. The
    first ``<Hit_def>`` element found in the report names the hit.

    Args:
        type: label for this query; also the stem of the report file name.
            The name shadows the builtin but is kept for existing callers.
        protein_sequence: protein string searched for the first ``M``.
        start: offset in ``genomic_sequence`` matching the first residue of
            ``protein_sequence``; residue ``i`` maps to ``start + 3 * i``.
        end: end offset in ``genomic_sequence`` (converted with ``int``).
        genomic_sequence: nucleotide sequence the ORF is cut from.

    Returns:
        An empty list when there is no ``M`` or no ``<Hit_def>`` in the
        report. For an ``L1`` hit: ``[type, 'L1', location, nucleotides,
        translation]``. For any other hit: ``[type, hit_def, location,
        genomic slice, query]``. ``location`` is a ``"from..to"`` string.
    """
    result = []
    M = re.search('M', protein_sequence)
    if not M:
        return result

    query = protein_sequence[M.start():] + "*"
    with open("temp.ORF", "w") as temp:
        temp.write('>blasting\n')
        temp.write('%s\n' % query)

    cline = blastp(query="'temp.ORF'",
                   db="./Blast/DB.blast.txt",
                   evalue=0.01,
                   outfmt=5,
                   out=type + ".BLAST")
    # NOTE(review): the command goes through the shell and its exit status is
    # ignored; ``type`` becomes part of that command line, so it should only
    # ever be a trusted label.
    os.system(str(cline))

    # Read the report inside a context manager so the handle is closed.
    with open(type + ".BLAST") as blast_out:
        report = blast_out.read()

    DEF = re.search("<Hit_def>((.*))</Hit_def>", report)
    if not DEF:
        return result

    # Each residue before the first M corresponds to three nucleotides.
    real_start = start + 3 * M.start()
    hit_name = DEF.group(1)
    result.append(type)

    if hit_name != 'L1':
        result.append(hit_name)
        result.append(str(real_start + 1) + ".." + str(end))
        result.append(genomic_sequence[int(real_start):int(end)])
        result.append(query)
        return result

    result.append('L1')
    L1_pre = genomic_sequence[real_start:int(end)]
    spliced = re.search('(C|T)(C|T)(A|C|G|T)(C|T)AG(A)TG', str(L1_pre))

    # Default: report the unspliced ORF. Only an in-frame motif changes this.
    L1_post = L1_pre
    location = str(real_start + 1) + ".." + str(end)
    if spliced:
        # Position of the ATG that closes the matched motif.
        start_L1 = int(spliced.start()) + 6
        if start_L1 % 3 == 0:
            if start_L1 > 600:
                # NOTE(review): this location is relative to L1_pre, unlike
                # the other branches, and the sequence is left untrimmed --
                # confirm this is intended.
                location = str(start_L1) + ".." + str(end)
            else:
                L1_post = L1_pre[start_L1:]
                location = (str(int(real_start + 1) + int(start_L1)) +
                            ".." + str(end))

    result.append(location)
    result.append(str(L1_post))
    result.append(Seq(str(L1_post)).translate())
    return result
Esempio n. 4
0
 def do_blast(self, subject):
     """Run blastp for ``subject`` against ``self.standard``.

     The XML report is written to ``subject + '.xml'``. If that file already
     exists the search is not run again. The progress message is printed in
     either case.
     """
     print('Running BLAST...')
     report_path = subject + '.xml'
     search = blastp(db=self.standard,
                     query=subject,
                     outfmt=5,
                     out=report_path,
                     comp_based_stats="0")
     if not os.path.exists(report_path):
         search()
Esempio n. 5
0
 def secondary_blast(self):
     """BLAST ``self.substract`` against a database built from orthologs.faa.

     Does nothing when ``orthologs.faa`` is missing or ``secondary.xml``
     already exists. Otherwise ``makeblastdb`` is run on ``orthologs.faa`` and
     blastp writes its XML report to ``secondary.xml`` using
     ``self.expect_diff`` as the e-value.
     """
     if os.path.exists('orthologs.faa') and not os.path.exists('secondary.xml'):
         os.system('makeblastdb -dbtype prot -in orthologs.faa')
         search = blastp(db='orthologs.faa',
                         query=self.substract,
                         outfmt=5,
                         out='secondary.xml',
                         evalue=self.expect_diff,
                         comp_based_stats="0")
         search()
Esempio n. 6
0
 def primary_blast(self):
     """BLAST ``self.genomes[0][1]`` against ``self.db`` into primary.xml.

     Does nothing when ``primary.xml`` already exists. Otherwise a progress
     message is printed and blastp is run with ``self.expect`` as the e-value.
     """
     if not os.path.exists('primary.xml'):
         print('Running BLAST...')
         search = blastp(db=self.db,
                         query=self.genomes[0][1],
                         outfmt=5,
                         out='primary.xml',
                         evalue=self.expect,
                         comp_based_stats="0")
         search()
Esempio n. 7
0
 def circle_blast(self, loc):
     """Run blastp for every entry of ``self.genomes`` against ``loc/circle``.

     For each genome ``self.build_genome(loc, subject)`` is called first, then
     blastp writes its XML report to ``loc/<subject>.xml``. A genome whose
     report already exists is skipped; the remaining genomes are still
     processed.

     :param loc: directory holding the ``circle`` database and the reports
     """
     print('Running Circualr BLAST...')
     blast = blastp()
     blast.db = loc + '/circle'
     blast.outfmt = 5
     blast.comp_based_stats = "0"
     for subject in self.genomes:
         self.build_genome(loc, subject)
         blast.query = subject
         blast.out = loc + '/' + subject + '.xml'
         if os.path.exists(blast.out):
             # Skip only this genome. Returning here would silently drop
             # every genome after the first one that has a report.
             continue
         blast()
Esempio n. 8
0
def match_fasta_position(query, subject, num=None):
    """Map aligned residue positions between a query and a subject FASTA.

    blastp is run with ``query`` against ``subject`` (one HSP per hit) into a
    temporary XML file named ``<query>_<subject>.xml``, which is removed after
    parsing. For the first HSP of every hit, one row is produced per alignment
    column.

    :param query: path of the query FASTA file
    :param subject: path of the subject FASTA file
    :param num: optional iterable of query position numbers (converted with
        ``int``) to select
    :return: when ``num`` is None, a DataFrame indexed by ``query`` with
        columns ``sequence`` (hit description), ``query_res``, ``query_num``,
        ``hit_res`` and ``hit_num``; position numbers are NaN in gap columns.
        Otherwise a list of record dicts for the rows whose ``query_num`` is
        in ``num``.
    """
    xml_path = query + '_' + subject + '.xml'
    blastp(query=query,
           subject=subject,
           out=xml_path,
           outfmt=5,
           max_hsps=1)()
    try:
        xml = SearchIO.read(xml_path, "blast-xml")
    finally:
        # Do not leave the temporary report behind if parsing fails.
        os.remove(xml_path)

    rows = []
    for hit in xml:
        hsp = hit[0]
        # Start coordinates are shifted by one so numbering starts at start+1
        # and runs through end inclusive.
        hit_num = list(np.arange(hsp.hit_start + 1, hsp.hit_end + 1))
        q_num = list(np.arange(hsp.query_start + 1, hsp.query_end + 1))
        # Insert NaN at every gap column so the numbering stays aligned with
        # the gapped sequences. Ascending order keeps the indices valid.
        for column, residue in enumerate(hsp.hit):
            if residue == '-':
                hit_num.insert(column, np.nan)
        for column, residue in enumerate(hsp.query):
            if residue == '-':
                q_num.insert(column, np.nan)

        n_columns = len(hsp.query)
        rows.extend(
            zip(
                n_columns * [query],
                # One copy of the description per column. Multiplying the
                # bare string made zip walk it character by character.
                n_columns * [hit.description],
                hsp.query,
                q_num,
                hsp.hit,
                hit_num))

    df = pd.DataFrame(rows,
                      columns=[
                          'query', 'sequence', 'query_res', 'query_num',
                          'hit_res', 'hit_num'
                      ])
    df.set_index(['query'], inplace=True)
    if num is None:
        return df
    return df[df.query_num.isin(list(map(int, num)))].to_dict('records')
Esempio n. 9
0
def run_blastp(query, subject, outfile, evalue=1e-4):
    """Run blastp for ``query`` against ``subject`` and count the result lines.

    The command object is printed before it is run. The search uses
    ``outfmt=6`` and writes its results to ``outfile``.

    :param query: file that contains the query sequences
    :param subject: file that contains the sequences to search against
    :param outfile: path of the file that receives the BLAST results
    :param evalue: e-value threshold passed to blastp, default is 1e-4
    :return: number of lines in ``outfile``; 0 when the file is empty
    :raises Exception: if ``outfile`` does not exist after the run
    """
    cmd = blastp(query=query,
                 subject=subject,
                 evalue=evalue,
                 outfmt=6,
                 out=outfile)

    print(cmd)
    # Results are written to ``outfile``; the value returned by the call is
    # not needed here.
    cmd()

    if not os.path.isfile(outfile):
        raise Exception('No BLAST output')

    if os.path.getsize(outfile) == 0:
        return 0

    # Close the result file deterministically instead of relying on the GC.
    with open(outfile) as handle:
        return len(handle.read().splitlines())
Esempio n. 10
0
def blast_gene(seq, database):
    """BLAST ``seq`` against ``database`` and return its best hit.

    ``seq`` is written to ``temp.fasta``, blastp is run with
    ``max_target_seqs=1`` and XML output in ``temp.xml``, and the report is
    parsed. Both temporary files are left in place.

    :param seq: record(s) accepted by ``SeqIO.write`` to use as the query
    :param database: BLAST database name passed to blastp as ``db``
    :return: ``[identifier, evalue]`` for the first hit description, where
        the identifier is the title from its first ``'GQ'`` onward and the
        e-value is a string; ``['none']`` when there are no hits
    """
    with open('temp.fasta', 'w') as tempfasta:
        SeqIO.write(seq, tempfasta, 'fasta')

    run = blastp(query='temp.fasta',
                 db=database,
                 max_target_seqs=1,
                 num_threads=6,
                 outfmt=5,
                 out='temp.xml')
    run()

    # Parse inside a context manager so the report handle is always closed.
    with open('temp.xml') as result_handle:
        result = NCBIXML.read(result_handle)

    if not result.descriptions:
        return ['none']

    best = result.descriptions[0]
    title = best.title
    # NOTE(review): when the title has no 'GQ', find() returns -1 and only the
    # last character of the title is kept -- confirm all titles carry 'GQ'.
    return [title[title.find('GQ'):], str(best.e)]
Esempio n. 11
0
def blast_gene(ids, eval, database, of):
    """BLAST one record of ``database`` against it and export the hits.

    The record whose id equals ``ids`` is written to ``temp.fasta`` and
    searched with blastp against ``database``. The first space-separated
    token of every hit title is printed, and every record of ``database``
    whose id is among those tokens is written to ``of`` in FASTA format.
    ``temp.fasta`` and ``temp.xml`` are removed once the report is parsed.

    :param ids: id of the query record inside ``database``
    :param eval: e-value threshold passed to blastp (the name shadows the
        builtin but is kept for existing callers)
    :param database: path used both as the FASTA file to read records from
        and as the blastp ``db`` argument
    :param of: output path or handle accepted by ``SeqIO.write``
    """
    # NOTE(review): if no record matches ``ids`` nothing is written here and
    # blastp runs on whatever ``temp.fasta`` is (or is not) on disk.
    with open(database) as handle:
        for seq in SeqIO.parse(handle, "fasta"):
            if seq.id == ids:
                SeqIO.write(seq, "temp.fasta", "fasta")

    run = blastp(query='temp.fasta',
                 db=database,
                 num_threads=6,
                 outfmt=5,
                 word_size=4,
                 evalue=eval,
                 out='temp.xml')
    run()

    # Close the report before deleting it.
    with open('temp.xml') as result_handle:
        result = NCBIXML.read(result_handle)
    hit_ids = [description.title.split(' ')[0]
               for description in result.descriptions]

    os.remove('temp.fasta')
    os.remove('temp.xml')

    for hit_id in hit_ids:
        print(hit_id)

    # A set gives constant-time membership tests while scanning the database.
    wanted = set(hit_ids)
    with open(database) as handle:
        seqs = [seq for seq in SeqIO.parse(handle, "fasta")
                if seq.id in wanted]
    SeqIO.write(seqs, of, "fasta")
Esempio n. 12
0
def Blast(type, protein_sequence, start, end, genomic_sequence):
    """BLAST the ORF found in ``protein_sequence`` and describe its first hit.

    The query is ``protein_sequence`` from its first ``M`` onward with ``*``
    appended. It is written to ``temp.ORF`` and searched with blastp against
    ``./Blast/DB.blast.txt``; the XML report goes to ``<type>.BLAST``. The
    first ``<Hit_def>`` element found in the report names the hit.

    Args:
        type: label for this query; also the stem of the report file name.
            The name shadows the builtin but is kept for existing callers.
        protein_sequence: protein string searched for the first ``M``.
        start: offset in ``genomic_sequence`` matching the first residue of
            ``protein_sequence``; residue ``i`` maps to ``start + 3 * i``.
        end: end offset in ``genomic_sequence`` (converted with ``int``).
        genomic_sequence: nucleotide sequence the ORF is cut from.

    Returns:
        An empty list when there is no ``M`` or no ``<Hit_def>`` in the
        report. For an ``L1`` hit: ``[type, 'L1', location, nucleotides,
        translation]``. For any other hit: ``[type, hit_def, location,
        genomic slice, query]``. ``location`` is a ``"from..to"`` string.
    """
    result = []
    M = re.search('M', protein_sequence)
    if not M:
        return result

    query = protein_sequence[M.start():] + "*"
    with open("temp.ORF", "w") as temp:
        temp.write('>blasting\n')
        temp.write('%s\n' % query)

    cline = blastp(query="'temp.ORF'",
                   db="./Blast/DB.blast.txt",
                   evalue=0.01,
                   outfmt=5,
                   out=type + ".BLAST")
    # NOTE(review): the command goes through the shell and its exit status is
    # ignored; ``type`` becomes part of that command line, so it should only
    # ever be a trusted label.
    os.system(str(cline))

    # Read the report inside a context manager so the handle is closed.
    with open(type + ".BLAST") as blast_out:
        report = blast_out.read()

    DEF = re.search("<Hit_def>((.*))</Hit_def>", report)
    if not DEF:
        return result

    # Each residue before the first M corresponds to three nucleotides.
    real_start = start + 3 * M.start()
    hit_name = DEF.group(1)
    result.append(type)

    if hit_name != 'L1':
        result.append(hit_name)
        result.append(str(real_start + 1) + ".." + str(end))
        result.append(genomic_sequence[int(real_start):int(end)])
        result.append(query)
        return result

    result.append('L1')
    L1_pre = genomic_sequence[real_start:int(end)]
    spliced = re.search('(C|T)(C|T)(A|C|G|T)(C|T)AG(A)TG', str(L1_pre))

    # Default: report the unspliced ORF. Only an in-frame motif changes this.
    L1_post = L1_pre
    location = str(real_start + 1) + ".." + str(end)
    if spliced:
        # Position of the ATG that closes the matched motif.
        start_L1 = int(spliced.start()) + 6
        if start_L1 % 3 == 0:
            if start_L1 > 600:
                # NOTE(review): this location is relative to L1_pre, unlike
                # the other branches, and the sequence is left untrimmed --
                # confirm this is intended.
                location = str(start_L1) + ".." + str(end)
            else:
                L1_post = L1_pre[start_L1:]
                location = (str(int(real_start + 1) + int(start_L1)) +
                            ".." + str(end))

    result.append(location)
    result.append(str(L1_post))
    result.append(Seq(str(L1_post)).translate())
    return result
Esempio n. 13
0
def get_bsr(work_path,
            job_name,
            reference_fasta,
            comparison_fastas,
            strain_names,
            force_redo=False,
            matrix='PAM30'):
    """Return a BSR_RecordSet containing BSR_Record objects with
    the results of a BSR analysis.

    Args:
        work_path (str): Prefix of the location where generated files are
            saved. It is concatenated directly with ``job_name``, so it must
            end with a path separator to denote a directory.
        job_name (str): Prefix used for all generated file names, and stored
            in the returned BSR_RecordSet.
        reference_fasta (str): Path of the FastA file used as the reference.
        comparison_fastas (str or iter[str]): A single path, or a list/tuple
            of paths, to the FastA files for which BSR will be generated.
        strain_names (iter[str]): Strain/species names of organisms in the
            analysis; the first entry names the reference and the following
            entries name the comparison files, in order.
        force_redo (bool): If True the analysis will be performed from the
            start, overwriting any files in the work path. Defaults to False.
        matrix (str): Scoring matrix name passed to BLAST, 'PAM30' (default)
            or 'BLOSUM62'. PAM30 is better for more similar sequences.

    Returns:
        BSR_RecordSet. See documentation for this object for more info.

    Raises:
        AssertionError: if the number of comparison files is not one less
            than the number of strain names.

    A series of blastp searches is performed using the reference_fasta as the
    query against the comparison_fastas. Files archiving the process are
    created and saved to disc using pickle. If the process is interrupted it
    can be continued by running the same command.
    """
    # Normalise the input first so a single path is not measured by its
    # character count and a tuple of paths is not wrapped as one element.
    if isinstance(comparison_fastas, (str, os.PathLike)):
        comparison_fastas = [comparison_fastas]
    elif not isinstance(comparison_fastas, list):
        comparison_fastas = list(comparison_fastas)
    assert len(comparison_fastas) == len(strain_names) - 1

    created_files = []
    # Create (or load) the initial BSR_records from the reference fasta.
    fprefix = work_path + job_name
    init_bsr_records_fn = fprefix + ' - ref BSR records.pickle'

    bsr_records = _do_next_step_pickle(
        init_bsr_records_fn,
        get_initial_BSR_Records_with_ref_scores,
        (reference_fasta, strain_names[0],
         matrix),  # Don't forget to pass *args as tuple
        force_redo=force_redo)
    created_files.append(init_bsr_records_fn)

    # Go through each proteome to be compared to the reference and get BSR.
    total = len(comparison_fastas)
    for comparison_num, subject_fasta in enumerate(comparison_fastas,
                                                   start=1):
        # Get a blast of every reference sequence against this proteome.
        blast_results_path = fprefix + ' - BLAST ref vs ' + os.path.split(
            subject_fasta)[1] + '.xml'
        created_files.append(blast_results_path)
        if not os.path.isfile(blast_results_path) or force_redo:
            blast_query = blastp(query=reference_fasta,
                                 subject=subject_fasta,
                                 remote=False,
                                 outfmt=5,
                                 max_hsps=1,
                                 matrix=matrix,
                                 comp_based_stats="0")
            # No ``out`` is given, so the report is taken from the first
            # element of the call's return value and saved here.
            blast_results = blast_query()

            with open(blast_results_path, 'w') as f:
                f.write(blast_results[0])
                print('BLAST results file saved:', blast_results_path)

        bsr_records_path = fprefix + ' - {} of {} BSR calculated.pickle'.format(
            comparison_num, total)
        created_files.append(bsr_records_path)
        # strain_names[0] is the reference, so comparison k uses entry k.
        bsr_records = _do_next_step_pickle(bsr_records_path,
                                           get_bsr_for_strain,
                                           (bsr_records, blast_results_path,
                                            strain_names[comparison_num]),
                                           force_redo=force_redo)

    final_results_path = fprefix + ' - complete BSR results.pickle'
    created_files.append(final_results_path)
    final_results = BSR_RecordSet(job_name, bsr_records, reference_fasta,
                                  comparison_fastas, strain_names,
                                  created_files, matrix)
    fn = fprefix + ' - Final record set.pickle'
    with open(fn, 'wb') as f:
        pickle.dump(final_results, f)
        print('File saved:', fn)

    return final_results