예제 #1
0
 def read_write_and_compare(self, source_file, source_format, out_file,
                            out_format, **kwargs):
     """Round-trip one query result through ``SearchIO.write`` and compare.

     The result read from ``source_file`` is written to ``out_file`` in
     ``out_format``, read back, and checked against the original with
     ``compare_search_obj``. ``kwargs`` are forwarded to every SearchIO call.
     """
     original = SearchIO.read(source_file, source_format, **kwargs)
     SearchIO.write(original, out_file, out_format, **kwargs)
     round_tripped = SearchIO.read(out_file, out_format, **kwargs)
     matches = compare_search_obj(original, round_tripped)
     self.assertTrue(matches)
예제 #2
0
 def fetch_identifier(cls, sequence=None, filename=None):
     """
     Return the first entry found for the identifiers of a BLAST report.

     * ``sequence`` only: run a remote ``blastp`` against ``nr`` and parse
       the response directly.
     * ``sequence`` and ``filename``: run the search, save the XML to
       ``filename`` and parse the saved file.
     * ``filename`` only: parse the existing XML file.

     :param sequence: a protein sequence, or an object exposing it as ``.seq``
     :param filename: the BLAST XML filename.
     :return: the first truthy value returned by ``cls._try_identifer``,
              or ``None`` when no identifier yields one.
     """
     assert sequence or filename, 'You need to specify at least a sequence or a filename'
     if not sequence:
         # No query supplied: reuse the report that is already on disk.
         xml_result = SearchIO.read(filename, 'blast-xml')
     else:
         # Record-like objects carry the residues in ``.seq``.
         query = sequence.seq if hasattr(sequence, 'seq') else sequence
         result_handle = NCBIWWW.qblast("blastp", "nr", query)
         if not filename:
             xml_result = SearchIO.read(result_handle, 'blast-xml')
         else:
             with open(filename, 'w') as w:
                 w.write(result_handle.read())
             xml_result = SearchIO.read(filename, 'blast-xml')

     identifiers = cls._get_identifers(xml_result)
     summary = 'There are {} identifiers: {}'.format(len(identifiers),
                                                     identifiers)
     cls.debugprint(summary)
     for identifier in identifiers:
         entry = cls._try_identifer(identifier)
         if not entry:
             continue
         cls.debugprint('This identifier {} has a hit'.format(identifier))
         cls._fetch_protein(identifier)
         return entry
예제 #3
0
 def read_write_and_compare(self, source_file, source_format, out_file,
         out_format, **kwargs):
     """Check that a query result survives a write/read cycle unchanged.

     Reads ``source_file``, writes the result to ``out_file`` as
     ``out_format``, reads it back and asserts that ``compare_search_obj``
     accepts the pair. ``kwargs`` are passed to each SearchIO call.
     """
     expected = SearchIO.read(source_file, source_format, **kwargs)
     SearchIO.write(expected, out_file, out_format, **kwargs)
     actual = SearchIO.read(out_file, out_format, **kwargs)
     self.assertTrue(compare_search_obj(expected, actual))
예제 #4
0
def get_srch_file_info(search_result_path):
    """Identify the program, version and SearchIO format of a result file.

    The file is scanned line by line until one line matches a signature
    regular expression of a supported format; scanning stops at that first
    match. The file is then parsed once with ``SearchIO.read`` to obtain the
    program name and version.

    :param search_result_path: path to the search result file.
    :return: list ``[program, version, format]`` where ``format`` is the
        SearchIO format string.
    :raises AssertionError: if the format, the program name or the program
        version cannot be determined.
    """
    # Define regular expressions that will only match any line in a file if it
    # is of a certain format. See the SearchIO documentation here:
    # http://biopython.org/DIST/docs/api/Bio.SearchIO-module.html
    fmt_expr_dict = {
        'blast-xml': [re.compile(r'^<!DOCTYPE BlastOutput')],
        'hmmer3-tab': [re.compile(r'^# Option settings: hmmsearch --tblout')],
        'hmmer3-text': [re.compile(r'^# HMMER ')]
    }

    # Stop at the first line that identifies a format, so that later lines
    # can neither override the decision nor force a full read of the file.
    ident_fmt = None
    with open(search_result_path) as srh:
        for line in srh:
            ident_fmt = next(
                (fmt for fmt, exprs in fmt_expr_dict.items()
                 if any(r.search(line) is not None for r in exprs)),
                None)
            if ident_fmt is not None:
                break

    # Check that a format was found.
    assert ident_fmt is not None, """Error: Could not identify format for
    search result file: %s""" % search_result_path

    # Parse once; both the program name and its version come from the same
    # query result object.
    with open(search_result_path) as srh:
        qresult = SearchIO.read(srh, ident_fmt)

    ident_prog = qresult.program
    assert ident_prog is not None, """Could not identify program name from file
    format type."""

    # Check that the program version was found.
    ident_vers = qresult.version
    assert ident_vers is not None, """Error: Could not identify version of program
    used to generate search result file: %s""" % search_result_path

    # Return a list defining the input file format.
    return [ident_prog, ident_vers, ident_fmt]
예제 #5
0
def append_known_inversions(ir):
    """Describe the best ``blastn`` hit of ``ir`` in the ``SWITCH`` database.

    The sequence is upper-cased, written to ``ir.fasta`` and searched with
    the ``blast`` helper, which is asked to write ``ir.xml``.

    :param ir: nucleotide sequence string to search with.
    :return: ``'<blast_id>: <hit_range> eval:<evalue> <strand note>'`` for
        the first HSP of the first hit, or ``'None detected'`` when the
        search produced no hits.
    """
    query_record = SeqRecord(seq=Seq(ir, generic_nucleotide).upper(),
                             id='IRSEQ')
    with open('ir.fasta', 'w') as fasta_handle:
        SeqIO.write(query_record, fasta_handle, 'fasta')

    # BLAST against the db
    blast('ir.fasta', dbname='SWITCH', out_xml='ir.xml', prog='blastn')
    blast_result = SearchIO.read('ir.xml', 'blast-xml')

    if not blast_result.hits:
        return 'None detected'

    top_hit = blast_result.hits[0]
    top_hsp = top_hit.hsps[0]
    # Flag hits that align to the reverse strand of the subject.
    strand_note = '(reverse strand)' if top_hsp.hit_strand == -1 else ''
    return '{0}: {1} eval:{3} {2}'.format(top_hit.blast_id,
                                          top_hsp.hit_range, strand_note,
                                          top_hsp.evalue)
예제 #6
0
 def makeDb(self):
     """Rebuild the per-file BLAST databases from saved BLAST XML results.

     Removes any existing ``<newDb>/<dbName>`` resource directory, then
     walks ``self.filesPath``; every file found is parsed as BLAST XML, its
     sequences are saved under ``<newDb>/<dbName>/<file>`` and a BLAST
     database is built there. Finally the ``/Test/<dbName>`` resource
     directory is deleted.
     """
     #self.sc.createFolder(self.intermediateDb)
     newAmbDb = self.resourcePath("/" + self.newDb + "/" + self.dbName)
     # Start from a clean target directory.
     if os.path.exists(newAmbDb) and os.path.isdir(newAmbDb):
         shutil.rmtree(newAmbDb)
     for bases, dirs, files in os.walk(self.filesPath):
         # If the ambiguous database is not created correctly, consider
         # uncommenting the lines below; they created an extra folder that
         # was unused for the test.
         # newFolderPath = self.newDb + "/" + bases
         #self.sc.createFolder(newFolderPath)
         #print ("crea el directorio  " + newFolderPath)
         for file in files:
             # print(file)
             # For each output file produced by the search, generate a new
             # FASTA sequence for each of the results obtained.
             outputName = bases + "/" + file
             blast_qresult = SearchIO.read(outputName, "blast-xml")
             sequences = self.getSequencesFromBlastResult(blast_qresult)
             sequencePath = self.resourcePath("/" + self.newDb + "/" +
                                              self.dbName + "/" + file)
             self.sc.createFolder(sequencePath)
             self.sc.saveSequencesInFile(sequencePath, sequences, file)
             db = sequencePath
             self.sc.setOutputFile(file)
             # Keep rebuilding until the check passes, to avoid a badly
             # generated database whose error would only surface in later
             # stages.
             # NOTE(review): this loops forever if testDbFails never
             # returns a falsy value - confirm that is acceptable.
             while (self.testDbFails(db, file)):
                 self.sc.makeBlastDb(db)
     shutil.rmtree(self.resourcePath("/Test" + "/" + self.dbName))
예제 #7
0
    def assign_class(self, seq_record):
        """Classify a sequence as BCR, TCR, or MHC.

        The sequence is first scanned with ``run_hmmscan``; when that yields
        both a receptor and a chain type they are returned directly.
        Otherwise the cheaper B2M and IgNAR checks run, followed by the
        MHC-I, MHC-II alpha and MHC-II beta HMMs in that order.

        Args:
            seq_record: A biopython sequence record object.

        Returns:
            A ``(receptor, chain_type, score)`` tuple. ``receptor`` and
            ``chain_type`` are ``None`` when nothing scored at or above
            ``self.hmm_score_threshold``.
        """
        with tempfile.NamedTemporaryFile(mode="w") as hmm_out:
            self.run_hmmscan(seq_record, hmm_out)
            hmmer_query = SearchIO.read(hmm_out.name, 'hmmer3-text')
            hit_table, top_descriptions = self.parse_hmmer_query(hmmer_query)
            try:
                score = int(hit_table[1][3] - 100)
            except Exception:
                # No usable top hit row; behave as if its score were 0.
                # ``Exception`` (not a bare except) so Ctrl-C still works.
                score = -100
            receptor, chain_type = self.get_chain_type(top_descriptions)

            if receptor and chain_type:
                return (receptor, chain_type, score)

            # We have no hits so now we check for MHC and IgNAR.
            # Cheap checks first; this avoids excessive computations.
            if self.is_b2m(seq_record):
                return ("B2M", "-", 0)
            if self.is_ignar(seq_record):
                return ("BCR", "IgNAR", 0)

            threshold = self.hmm_score_threshold
            sequence = str(seq_record.seq)

            # NOTE(review): is_MHC's return contract is not visible here.
            # The MHC-II checks already tolerate a falsy score, so a missing
            # (None) score is tolerated everywhere instead of raising
            # TypeError on comparison.
            mhc_I_score = self.is_MHC(sequence, self.mhc_I_hmm)
            if mhc_I_score is not None and mhc_I_score >= threshold:
                return ('MHC-I', 'alpha', int(mhc_I_score - threshold))

            mhc_II_alpha_score = self.is_MHC(sequence, self.mhc_II_alpha_hmm)
            if mhc_II_alpha_score and mhc_II_alpha_score >= threshold:
                # NOTE(review): unlike the other branches this score is not
                # truncated with int(); kept as-is for callers.
                return ('MHC-II', 'alpha', mhc_II_alpha_score - threshold)

            mhc_II_beta_score = self.is_MHC(sequence, self.mhc_II_beta_hmm)
            if mhc_II_beta_score and mhc_II_beta_score >= threshold:
                return ('MHC-II', 'beta', int(mhc_II_beta_score - threshold))

            # Nothing passed the threshold: report the better MHC-II score,
            # or fall back to the hmmscan score when both are absent/zero.
            alpha = mhc_II_alpha_score or 0
            beta = mhc_II_beta_score or 0
            if alpha == 0 and beta == 0:
                return (None, None, score)
            return (None, None, int(max(alpha, beta) - threshold))
예제 #8
0
def blast():
    """BLAST ``assembly.fasta`` against nr and log the top ten hits.

    Runs a remote ``blastn`` restricted to Herpesviridae, saves the XML
    response to ``blast.xml`` and appends a header line plus one
    space-separated line per hit (at most ten) to ``MiniProject.log``.
    Each line holds the hit description, hit length, number of HSPs and the
    identities, gaps, bit score and e-value of the hit's first HSP.
    """
    with open("assembly.fasta") as fasta_handle:
        fasta = fasta_handle.read()

    # Run blast against the assembled sequence.
    handle = NCBIWWW.qblast("blastn",
                            "nr",
                            fasta,
                            entrez_query='"Herpesviridae"[organism]')
    with open("blast.xml", "w") as out_handle:
        out_handle.write(handle.read())

    blast_qresult = SearchIO.read("blast.xml", "blast-xml")
    with open('MiniProject.log', 'a') as output:
        output.write('seq_title ' + 'align_len ' + 'number_HSPs ' +
                     'topHSP_ident ' + 'topHSP_gaps ' + 'topHSP_bits ' +
                     'topHSP_expect \n')
        # Slicing copes with fewer than ten hits without any length check.
        for hit in blast_qresult.hits[:10]:
            top_hsp = hit[0]
            fields = (hit.description, hit.seq_len, len(hit.hsps),
                      top_hsp.ident_num, top_hsp.gap_num, top_hsp.bitscore,
                      top_hsp.evalue)
            output.write(' '.join(str(field) for field in fields) + '\n')
예제 #9
0
def blaster(fasta_file):
    """
    BLAST the sequence in ``fasta_file`` and save hits from target species.

    A remote ``blastn`` against ``nt`` is run, the XML response is saved to
    ``my_blast.xml``, and for every name in the module-level
    ``target_species`` the first-HSP hit sequence of each hit whose
    description contains that name is collected and written to a FASTA
    file.

    :param fasta_file: path of the FASTA file holding the query.
    :return: name of the FASTA file that was written.
    """
    # One name for the file that is written, reported and returned;
    # previously the function wrote "blast-results.fasta" but returned
    # "blast_results.fasta", which never existed.
    output_fasta = "blast-results.fasta"

    with open(fasta_file) as fasta_handle:
        fasta_string = fasta_handle.read()
    print("BLAST initiated...")

    # qblast opens up the BLAST function in NCBI.
    result_handle = NCBIWWW.qblast("blastn", "nt", fasta_string)

    print("BLAST search done.")
    # Results need to go into an XML file.
    with open("my_blast.xml", "w") as out_handle:
        out_handle.write(result_handle.read())

    blast_result = SearchIO.read("my_blast.xml", "blast-xml")
    print("Writing BLAST results to file..")
    records = []
    for species in target_species:
        # Iterate through the blast result hits.
        for hit in blast_result:
            print(hit)
            if species in hit.description:
                # If the target species is found, keep the hit sequence of
                # its first HSP.
                records.append(hit[0].hit)

    SeqIO.write(records, output_fasta, "fasta")
    print("\nBLAST result file written to %s." % output_fasta)
    return output_fasta
def extract_faa_seqs(HMM_TO_USE):
    """Write the best-scoring hit sequence per bin for one HMM to a FASTA.

    Reads ``<TEMP_FOLDER>/<stem>.out`` as a ``hmmer3-tab`` result, where
    ``stem`` is ``HMM_TO_USE`` up to its first dot. ORF records from
    ``ORF_FILE`` whose id equals a hit id are grouped by the bin id found in
    ``G2B_DICT``; per bin the hit with the highest integer-truncated
    bitscore is kept. The result is written to ``<TEMP_FOLDER>/<stem>.faa``.
    Nothing is written when the result is falsy (no hits).
    """
    stem = TEMP_FOLDER + "/" + HMM_TO_USE.rsplit(".")[0]
    hmm_hits = SearchIO.read(stem + ".out", 'hmmer3-tab')
    if not hmm_hits:
        return

    best_by_bin = {}
    for seq_record in SeqIO.parse(ORF_FILE, "fasta"):
        for hit in hmm_hits:
            # Only ORFs that are themselves a hit are of interest.
            if seq_record.id != hit.id:
                continue
            bin_id = G2B_DICT[hit.id]['binID']
            if bin_id not in best_by_bin:
                # First hit seen for this bin.
                best_by_bin[bin_id] = {
                    "sequence": seq_record.seq.rstrip('*'),
                    "bitscore": hit.bitscore
                }
                continue
            # The bin already has a hit: keep whichever scores higher.
            print(bin_id + " has multiple hits for " + HMM_TO_USE)
            current_best = int(best_by_bin[bin_id]['bitscore'])
            if int(hit.bitscore) > current_best:
                best_by_bin[bin_id] = {
                    "sequence": seq_record.seq.rstrip('*'),
                    "bitscore": hit.bitscore
                }

    with open(stem + ".faa", 'w') as faa_handle:
        for bin_id, best in best_by_bin.items():
            faa_handle.write('>' + bin_id + '\n' + str(best['sequence']) +
                             '\n')
def reciprocal_hmm_search(modelname, modelname_regex, filename, organism, rev_inc_bitscore_percentage, out_filename = None):
    ''' Performs reciprocal hmmer search against the proteome of given organism

    Runs ``phmmer`` with the sequences in ``filename`` against
    ``<proteomes_dir><organism>.fasta`` and reports whether the description
    of the best hit matches ``modelname_regex``.

    modelname -- used to build the default table output name
    modelname_regex -- pattern searched for in the top hit's description
    filename -- query file passed to phmmer
    organism -- proteome name (without ``.fasta``)
    rev_inc_bitscore_percentage -- currently unused; kept for callers
    out_filename -- phmmer ``--tblout`` path; defaults to
        ``<modelname>.rechits_<organism>``

    Returns True when the top hit matches, False otherwise (including when
    the table cannot be read or contains no hits).
    '''
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print("# Reciprocal search...")
    if out_filename is None:
        out_filename = modelname + ".rechits_" + organism
    reciprocal_search_command = "phmmer --noali --tblout " + out_filename + " " + filename + " " + proteomes_dir + organism + ".fasta > hmmer_res"
    os.system(reciprocal_search_command)
    try:
        hits = SearchIO.read(out_filename, "hmmer3-tab")
    except Exception:
        # A missing or empty table is treated as "no hits". ``Exception``
        # rather than a bare except so Ctrl-C is not swallowed.
        hits = []
    is_found = len(hits) > 0 and re.search(modelname_regex, hits[0].description) is not None
    if manual_mode:
        print("%s %s" % (filename, is_found))
        try:
            prompt = raw_input  # Python 2
        except NameError:
            prompt = input  # Python 3
        prompt("Check reciprocal search results...")
    return is_found
예제 #12
0
def process_blastn(filename, loci_name):
    """Summarise the best hit of a BLAST XML file in ``Output_<filename>``.

    The first two words of a hit description are taken to be the genus and
    the species epithet, in that order (the same order the same-genus
    listing below has always used).

    :param filename: BLAST XML file; the part before the first ``_`` is
        reported as the sample name.
    :param loci_name: locus name to print in the summary.
    :return: ``None``. When there are no hits, "No match found" is printed
        and no file is written.
    """
    blastn_qresult = SearchIO.read(filename, 'blast-xml')
    if not blastn_qresult.hits:
        print("No match found")
        return None

    best_hit = blastn_qresult[0]
    # "Genus species ..." -> genus is the first word. These indices used to
    # be swapped, which printed "species Genus" and filtered the same-genus
    # list by the species epithet.
    name_parts = best_hit.description.split(' ')
    best_hit_genus = name_parts[0]
    # Joining a slice also copes with one-word descriptions.
    estimated_species = " ".join(name_parts[:2])
    species_same_genus = {
        " ".join(each_hit.description.split(' ')[:2])
        for each_hit in blastn_qresult.hits
        if best_hit_genus in each_hit.description
    }
    blast_hsp = best_hit[0]
    query_aln, hit_aln = blast_hsp.aln_all[0][0], blast_hsp.aln_all[0][1]
    is_reverse = blast_hsp.hit_strand != blast_hsp.query_strand

    rule = "--------------------------------------------------------------------\n"
    with open("Output_" + filename, 'w') as output_handle:
        output_handle.write(rule)
        output_handle.write("Result Summary\n")
        output_handle.write(rule)
        output_handle.write("Sample name : %s\nLoci name : %s\n" % (filename.split('_')[0], loci_name))
        output_handle.write("Estimated species : %s\n" % estimated_species)
        output_handle.write("Other specie(s) with same genus : ")
        output_handle.write(str(species_same_genus)+'\n')
        if is_reverse:
            output_handle.write("Reverse complemented? : Yes\n")
        else:
            output_handle.write("Reverse complemented? : No\n")
        output_handle.write(rule)
        output_handle.write("Hit summary\n")
        output_handle.write(rule)
        output_handle.write(str(blast_hsp))
        output_handle.write('\n')
        output_handle.write(rule)
        output_handle.write("Alignment detail\n")
        output_handle.write(rule)
        output_handle.write("%s - %s\n" % (query_aln.seq, query_aln.id))
        output_handle.write(blast_hsp.aln_annotation['similarity']+'\n')
        output_handle.write("%s - %s\n" % (hit_aln.seq, hit_aln.id))
예제 #13
0
 def generate_protein_model(self, query: str, template: str,
                            blast_xml_path: str, out_dir: str,
                            template_dir: str):
     """Write a PIR alignment for ``query``/``template`` and run the modeller script.

     The alignment is taken from the first HSP of the single hit in
     ``blast_xml_path`` whose id equals ``template``. The template row is
     passed through ``replace_missing_residues`` together with
     ``<template_dir>/<template>.ent``. The PIR file is written to
     ``<out_dir>/<template>.pir`` and ``modeller_script.py`` (next to this
     module) is launched through ``self.modpysh``; its result is not
     inspected.

     Raises ``AssertionError`` unless exactly one hit matches ``template``.
     """
     qresult = SearchIO.read(blast_xml_path, 'blast-xml')
     matching_hits = [hit for hit in qresult.hits if hit.id == template]
     assert len(matching_hits) == 1
     alignment = matching_hits[0].hsps[0].aln

     template_structure = f'{template_dir}/{template}.ent'
     tseq = replace_missing_residues(str(alignment[1].seq),
                                     template_structure)

     Path(out_dir).mkdir(parents=True, exist_ok=True)
     pir_file = f'{out_dir}/{template}.pir'

     query_record = SeqRecord(Seq(str(alignment[0].seq), generic_protein),
                              id=query,
                              name='',
                              description=f'sequence:{query}::::::::')
     # The sixth character of the template id is used as the chain id.
     chain = template[5].upper()
     template_record = SeqRecord(
         Seq(tseq, generic_protein),
         id=template,
         name='',
         description=f'structureX:{template}::{chain}::{chain}::::')
     SeqIO.write([query_record, template_record], pir_file, 'pir')

     script_path = Path(__file__).parent.resolve() / 'modeller_script.py'
     command = [
         self.modpysh, 'python3', script_path, pir_file, template, query,
         template_dir
     ]
     subprocess.run(command,
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    universal_newlines=True)
예제 #14
0
def read_xml():
    """Collect per-hit statistics from ``blast_result.xml``.

    Only the first HSP of every hit is considered. Six parallel lists are
    returned, one element per hit: bit score, query coverage (%), identity
    (%), positives (%), e-value and hit accession. Percentages are rounded
    to one decimal.
    """
    score, query_cover, identity = [], [], []
    positives, evalue, protein_codes = [], [], []

    blast_qresult = SearchIO.read('blast_result.xml', 'blast-xml')
    for hit in blast_qresult:
        top_hsp = hit[0]
        hit_span = float(top_hsp.hit_span)

        score.append(top_hsp.bitscore)
        # NOTE(review): coverage is computed relative to query_end, not the
        # full query length - confirm this is intended.
        coverage = float(top_hsp.query_span) / float(top_hsp.query_end) * 100
        query_cover.append(round(coverage, 1))
        identity.append(round(float(top_hsp.ident_num) / hit_span * 100, 1))
        positives.append(round(float(top_hsp.pos_num) / hit_span * 100, 1))
        evalue.append(top_hsp.evalue)
        protein_codes.append(hit.accession)

    return score, query_cover, identity, positives, evalue, protein_codes
예제 #15
0
def blast_search(filename, blast_temp_path):
    '''
        Run (or reuse) a tblastn search and return the HSPs that pass the filters.

        ``filename`` is written verbatim into ``<blast_temp_path>/<filename>``
        and used as the query; the XML result is cached at
        ``<blast_temp_path>/<filename>.xml`` and reused when it already exists.

        An HSP is kept when its e-value is below 1e-10, its alignment span
        exceeds 209 and identities / query length exceeds 0.2.

        Returns a list of ``(accession, hit_start, hit_end)`` tuples, where
        the accession is the second-to-last ``|``-separated field of the
        hit id.
    '''
    query_path = '%s/%s' % (blast_temp_path, filename)
    filepath = "%s/%s.xml" % (blast_temp_path, filename)

    if not os.path.exists(filepath):
        with open(query_path, 'w') as f:
            f.write(filename)
        search_cmd = NcbitblastnCommandline(query=query_path,  db="/research/sequences/GenBank/blast/db/refseq_genomic", outfmt=5, out=filepath)
        subprocess.call(str(search_cmd), shell=True)

    result = SearchIO.read(filepath, 'blast-xml')

    iden_cutoff = 0.2
    locations = []
    for hsp in result.hsps:
        # Filter e-value
        if not hsp.evalue < 1e-10:
            continue
        # Filter length
        if not hsp.aln_span > 209:
            continue
        # Filter identity
        if not float(hsp.ident_num)/float(result.seq_len) > iden_cutoff:
            continue
        locations.append((hsp.hit_id.split('|')[-2], hsp.hit_start, hsp.hit_end))
    return locations
예제 #16
0
def analyze_BLAST_result(input_fasta_name_wo_path, result_handle):
    """Save a BLAST response and print the non-"PREDICTED" hit descriptions.

    The XML read from ``result_handle`` is written to
    ``sample/output/retrieved_from_<name>.xml`` (``<name>`` is the input
    name minus its last six characters, i.e. a ``.fasta`` suffix), then
    parsed and every hit whose description does not contain "PREDICTED" is
    printed.

    Side effect: the process working directory is changed to
    ``sample/output`` and left there, as before.

    :param input_fasta_name_wo_path: query FASTA file name without path.
    :param result_handle: readable handle with BLAST XML output.
    """
    show_header("Step 2. Analyzing the BLAST result.")

    output_file_name = "retrieved_from_" + str(
        input_fasta_name_wo_path)[:-6] + ".xml"

    if not os.path.exists('sample/output'):
        os.makedirs('sample/output')

    current_dir = os.getcwd()
    output_folder = os.path.join(current_dir, "sample/output")
    os.chdir(output_folder)

    # Mode 'w' overwrites an existing file ('a' would append instead).
    with open(output_file_name, "w") as output_file:
        output_file.write(result_handle.read())

    blast_qresult = SearchIO.read(output_file_name, "blast-xml")

    # The previous test, `"PREDICTED" in hit.description == False`, is a
    # chained comparison that is always False, so every hit was dropped.
    def is_not_predicted(hit):
        return "PREDICTED" not in hit.description

    filtered_qresult = blast_qresult.hit_filter(is_not_predicted)
    for hit in filtered_qresult:
        print("%s" % (hit.description))
예제 #17
0
def xml2fasta(infile=None, outfile=None):
    """Convert the hits of a BLAST XML file into a de-duplicated FASTA file.

    For every HSP in ``infile`` the hit id and aligned hit sequence are
    taken; only the first occurrence of each hit id is kept, because
    duplicate ids make a later clustal alignment fail.

    :param infile: path of the BLAST XML input.
    :param outfile: path of the FASTA file to write.
    """
    print('\nConverting ' + infile +
          ' to fasta format, removing duplicates...')

    # Load the blast output file
    blast_qresult = SearchIO.read(infile, "blast-xml")

    # A set gives constant-time "already seen" checks; the list keeps the
    # original order of first appearance.
    seen_ids = set()
    unique_records = []
    for hsp in blast_qresult.hsps:
        hit_id = str(hsp.hit.id)
        if hit_id in seen_ids:
            continue
        seen_ids.add(hit_id)
        unique_records.append((hit_id, str(hsp.hit.seq)))

    # Write each hit ID (preceded by a ">" for fasta format), followed by
    # the hit sequence on the next line.
    with open(outfile, "w") as f:
        for hit_id, sequence in unique_records:
            f.write('> ' + hit_id + '\n')
            f.write(sequence + '\n')

    print('\tDone: writing to ' + outfile)
예제 #18
0
def find_similar_region_for_vntr(sema, reference_vntr, ref_file, result_list):
    """Search ``ref_file`` with BLAT for sequence similar to a VNTR.

    The query is the VNTR pattern plus 30 bases of flanking sequence on each
    side. If ``is_false_vntr_hit`` accepts the BLAT result, the VNTR id is
    appended to ``result_list``. A ``ValueError`` from parsing the PSL
    output is ignored.

    :param sema: semaphore-like object; ``release()`` is always called
        before returning, even if the search fails.
    :param reference_vntr: object with ``id``, ``pattern`` and the two
        flanking region attributes.
    :param ref_file: reference passed to ``blat``.
    :param result_list: list that receives the ids of VNTRs with a hit.
    """
    try:
        from Bio import SearchIO
        vntr_id = reference_vntr.id
        q = (reference_vntr.left_flanking_region[-30:] +
             reference_vntr.pattern +
             reference_vntr.right_flanking_region[:30])
        search_index = vntr_id
        qfile = settings.BLAST_TMP_DIR + str(vntr_id) + '_' + str(
            search_index) + '_query.fasta'
        with open(qfile, "w") as output_handle:
            my_rec = SeqRecord.SeqRecord(seq=Seq.Seq(q),
                                         id='query',
                                         description='')
            SeqIO.write([my_rec], output_handle, 'fasta')
        output = 'blat_out/output_%s_%s.psl' % (vntr_id, search_index)
        command = 'blat -q=dna -oneOff=1 -tileSize=8 -stepSize=3 -minIdentity=75 %s %s %s' % (
            ref_file, qfile, output)
        os.system(command)
        # Remove the temporary query without going through a shell.
        try:
            os.remove(qfile)
        except OSError:
            pass
        try:
            qresult = SearchIO.read(output, 'blat-psl')
            if is_false_vntr_hit(qresult, reference_vntr):
                print('there is similar sequence for %s' % vntr_id)
                result_list.append(vntr_id)
        except ValueError:
            pass
    finally:
        # Release unconditionally so an unexpected error in one worker
        # cannot leave the others waiting on the semaphore.
        sema.release()
예제 #19
0
	def __init__(self,Maxicircle,out_file):
		"""Annotate a maxicircle by blastx and write the exons as GFF3 rows.

		``Maxicircle`` is used as the blastx query against every ``*.faa``
		file in the hard-coded protein directory. For each database that
		gives at least one hit, the first HSP of the first hit becomes one
		tab-separated "exon" row in ``out_file``. The output file is closed
		before returning so that all rows are flushed to disk.
		"""
		rows=[]
		with open(out_file,'w') as file_out:
			writer = csv.writer(file_out,delimiter="\t")
			writer.writerow(["##gff-version","3"])

			for protein in glob.glob("/Users/Said/Github/Maxicircle/DB/AA/*.faa"):
				protein_file=protein.split("/")[-1]
				output_file=protein_file+".xml"
				blastx_cline = NcbiblastxCommandline(query=Maxicircle , db=protein,
													 outfmt=5, out=output_file)
				blastx_cline()
				blast_qresult = SearchIO.read(output_file, 'blast-xml')
				if len(blast_qresult)==0:
					continue
				# Only the top HSP of the top hit is reported.
				best=blast_qresult[0][0]
				query_strand="+" if best.query_strand>0 else "-"
				rows.append([best.query_id,".","exon",best.query_range[0],best.query_range[1],".",query_strand,".","ID="+protein_file.split(".faa")[0]])

			print(str(len(rows))+" exons found")
			writer.writerows(rows)
예제 #20
0
def blast(seq):
    """Run a remote blastx of ``seq`` against nr and return the top HSP.

    The XML response is saved to ``derde.xml``. The file is parsed once
    with ``NCBIXML`` to print a marker line for every HSP of the first
    record whose expect value is below 0.04, and once with ``SearchIO`` to
    obtain the first HSP of the first hit, which is printed and returned.
    """
    E_VALUE_THRESH = 0.04

    result_handle = NCBIWWW.qblast("blastx", "nr", seq)
    print(result_handle)
    with open("derde.xml", 'w') as xml_out:
        xml_out.write(result_handle.read())

    with open("derde.xml", 'r') as xml_in:
        first_record = next(NCBIXML.parse(xml_in))
        for alignment in first_record.alignments:
            for hsp in alignment.hsps:
                if hsp.expect < E_VALUE_THRESH:
                    print("****Alignment****")

    blast_qresult = SearchIO.read("derde.xml", "blast-xml")
    # Only the information of the first hit is used.
    top_hsp = blast_qresult[0][0]
    print(top_hsp)

    return top_hsp
예제 #21
0
def get_locus(sequences, kir=False, verbose=False, refdata=None, evalue=10):
    """
    Gets the locus of the sequence by running blastn

    The locus is the part of a hit id before the first ``*``. It is only
    reported when the top hits (at most three) all agree on it.

    :param sequences: sequences to blast
    :param kir: bool whether the sequences are KIR or not
    :param verbose: currently unused; kept for callers
    :param refdata: reference data providing ``blastdb``; a new
        ``ReferenceData()`` is created when omitted
    :param evalue: e-value cutoff passed to blastn
    :return: the locus (prefixed with ``HLA-`` when it is neither KIR nor
        already an HLA name), or ``''`` when there are no hits or the top
        hits disagree
    :rtype: ``str``

    Example usage:

        >>> from Bio.Seq import Seq
        >>> from seqann.blast_cmd import get_locus
        >>> sequence = Seq('AGAGACTCTCCCGAGGATTTCGTGTACCAGTTTAAGGCCATGTGCTACTTCACC')
        >>> locus = get_locus(sequence)

    """
    if not refdata:
        refdata = ReferenceData()

    file_id = str(randomid())
    input_fasta = file_id + ".fasta"
    output_xml = file_id + ".xml"
    SeqIO.write(sequences, input_fasta, "fasta")
    blastn_cline = NcbiblastnCommandline(query=input_fasta,
                                         db=refdata.blastdb,
                                         evalue=evalue,
                                         outfmt=5,
                                         reward=1,
                                         penalty=-3,
                                         gapopen=5,
                                         gapextend=2,
                                         dust='yes',
                                         out=output_xml)

    blastn_cline()

    blast_qresult = SearchIO.read(output_xml, 'blast-xml')

    #   Delete files
    cleanup(file_id)

    # Slicing avoids the IndexError the fixed range(0, 3) loop raised when
    # the search returned only one or two hits.
    top_hits = blast_qresult.hits[:3]
    if not top_hits:
        return ''

    loci = [hit.id.split("*")[0] for hit in top_hits]
    if len(set(loci)) != 1:
        return ''

    locus = loci[0]
    if has_hla(locus) or kir:
        return locus
    return "HLA-" + locus
예제 #22
0
def similarity_filter(inp_file, level):
    """Return the HSPs of a BLAST XML file above an identity percentage.

    An HSP is kept when ``ident_num * 100 / len(query)`` is greater than
    ``int(level)``.

    :param inp_file: BLAST XML file holding a single query result.
    :param level: identity cutoff in percent; converted with ``int``.
    :return: list of HSP objects, in file order.
    """
    blast_qresult = SearchIO.read(inp_file, "blast-xml")
    return [
        hsp
        for hit in blast_qresult
        for hsp in hit
        if float(hsp.ident_num * 100 / len(hsp.query)) > int(level)
    ]
예제 #23
0
def read_hmmer_file(f_path):
    """
    Parse a HMMER3 plain-text output file with Biopython's SearchIO.

    The path is first passed through ``_check_file``; the value it returns
    is what gets parsed. The object produced by ``SearchIO.read`` for the
    ``hmmer3-text`` format is returned unchanged.
    """
    checked_path = _check_file(f_path)
    query_result = SearchIO.read(checked_path, 'hmmer3-text')
    return query_result
예제 #24
0
def read_hmmer_file(f_path):
    """
    Load one HMMER3 text report through Biopython's SearchIO.

    ``_check_file`` is applied to ``f_path`` first and its return value is
    handed to ``SearchIO.read`` with the ``hmmer3-text`` format; the parsed
    result is returned as-is.
    """
    validated = _check_file(f_path)
    parsed = SearchIO.read(validated, "hmmer3-text")
    return parsed
예제 #25
0
def check_blast(blast_res):
    """Save a BLAST response to ``ncbi_result.xml`` and try to parse it.

    :param blast_res: readable handle with BLAST XML output.
    :return: ``(query_result, True)`` on success, ``("", False)`` when the
        response cannot be written or parsed.
    """
    try:
        with open("ncbi_result.xml", "w") as result_file:
            result_file.write(blast_res.read())
        result = SearchIO.read("ncbi_result.xml", "blast-xml")
    except Exception:
        # Best effort by design, but ``Exception`` rather than a bare
        # except so KeyboardInterrupt/SystemExit still propagate.
        return "", False
    return result, True
def _count_single_hsp_hits(blast_xml, min_ident=28):
    """Count hits in ``blast_xml`` that have exactly one HSP with more than ``min_ident`` identities."""
    res = SearchIO.read(blast_xml, 'blast-xml')
    return sum(1 for hit in res.hits
               if len(hit.hsps) == 1 and hit.hsps[0].ident_num > min_ident)


def get_efficiency(reads_file, genome, target):
    """Estimate integration efficiency from reads covering the intact site.

    The 30 bp window starting 35 bases after ``target`` is taken as the
    uninterrupted insertion site and BLASTed against ``reads_file``; a 30 bp
    control window 5000 bases away is BLASTed the same way. Efficiency is
    ``(control_reads - uninterrupted_reads) / control_reads``.

    This data was not used in the paper, only for our own checks (not the
    best method in any case for this data).

    :param reads_file: reads passed to ``do_blast`` as the search subject.
    :param genome: record whose ``seq`` is the genome sequence.
    :param target: target sequence; when falsy nothing is computed.
    :return: ``(uninterrupted_reads, other_site_reads, efficiency)``; all
        three are ``None`` without a target, and ``efficiency`` is ``None``
        when the control site has no reads.
    """
    if not target:
        return (None, None, None)

    site = genome.seq.upper().find(target)
    # find() returns -1 when absent; 0 is a valid match at the very start
    # of the genome and must not be mistaken for "not found".
    if site >= 0:
        strand_seq = genome.seq
    else:
        site = genome.seq.upper().reverse_complement().find(target)
        strand_seq = genome.seq.reverse_complement()
    # NOTE(review): if the target is on neither strand, site is still -1
    # here and the windows below are meaningless - confirm callers always
    # pass a target that is present.
    # Add length of target, and the span of +35 to +65.
    window_start = site + len(target) + 35
    uninterrupted_insertion_site = strand_seq[window_start:window_start + 30]

    uninterrupted_fasta = 'tmp/uninterrupted.fasta'
    SeqIO.write(SeqRecord(uninterrupted_insertion_site, id="target"),
                uninterrupted_fasta, 'fasta')
    blast_filename = 'tmp/uninterrupted_blastresults.xml'
    do_blast(uninterrupted_fasta, reads_file, blast_filename)
    # We are querying the 30bp sequence around the insertion site for WT
    # sequence.
    uninterrupted_reads = _count_single_hsp_hits(blast_filename)

    # Also do a blast for another random site in the genome to compare.
    other_site = site - 5000 if site > 20000 else site + 5000
    SeqIO.write(
        SeqRecord(genome.seq[other_site:other_site + 30], id='site'),
        'tmp/other_site.fasta', 'fasta')
    do_blast('tmp/other_site.fasta', reads_file,
             'tmp/other_site_blast.xml')
    other_site_reads = _count_single_hsp_hits('tmp/other_site_blast.xml')

    # Without control reads the ratio is undefined; report None rather than
    # raising ZeroDivisionError.
    efficiency = None
    if other_site_reads:
        efficiency = (other_site_reads -
                      uninterrupted_reads) / other_site_reads

    return (uninterrupted_reads, other_site_reads, efficiency)
예제 #27
0
def test():
    """Export the top-HSP hit sequence of every BLAST hit to a FASTA file.

    Reads the single query result stored in ``output/my_blast.xml`` and, for
    each hit in it, takes the hit-side record of the first HSP. The collected
    records are written to ``data/msa.fasta``.
    """
    E_VALUE_THRESH = 0.001
    #blast_search('data/test.fasta', 'blastp', 'swissprot')
    query_result = SearchIO.read('output/my_blast.xml', 'blast-xml')
    # hit[0] is the first HSP of the hit; .hit is its subject-side record.
    records = [hit[0].hit for hit in query_result]
    SeqIO.write(records, 'data/msa.fasta', 'fasta')
    '''
예제 #28
0
def find_frameshift(sbjct_dict, query_dict, pseudo_hits, temp_dir,
                    out_frameshift):
    """Separate pseudogene hits that contain a frameshift from those that do not.

    For every hit the query sequence and the flanked subject region are
    written to scratch FASTA files inside ``temp_dir`` and aligned with
    ``exonerate -m protein2dna -n 1``. A hit is reported as frameshifted when
    the first HSP of the first exonerate hit overlaps the un-flanked hit
    region and spans more than one hit frame.

    :param sbjct_dict: mapping from ``hit[0]`` (chromosome/contig id) to the
        subject record.
    :param query_dict: mapping from query id to the query record.
    :param pseudo_hits: iterable of indexable hits; ``hit[0]`` is the
        chromosome id and ``hit[8]`` is a ``;``-separated attribute string
        whose first field has the form ``key=<query id>``.
    :param temp_dir: existing directory used for the scratch files.
    :param out_frameshift: path of the FASTA file that receives the
        frameshifted regions.
    :return: list of the hits for which no frameshift was detected.
    """
    assert (os.path.isdir(temp_dir))
    sys.stderr.write("finding frameshift mutations...\n")

    frameshifts = []
    non_frameshift = []
    exn_query = temp_dir + "/" + "exn_query.fa"
    exn_target = temp_dir + "/" + "exn_target.fa"
    align_file = temp_dir + "/" + "fshift_exonerate.exn"
    flank = 1000

    for hit in pseudo_hits:
        chrom = hit[0]
        record = sbjct_dict[chrom]
        qseqid = hit[8].split(";")[0].split("=")[1]
        SeqIO.write(query_dict[qseqid], exn_query, "fasta")
        flank_record = _get_hit_record(record, hit, flank)
        SeqIO.write(flank_record, exn_target, "fasta")

        fshift = False
        hsp = None
        try:
            # Run exonerate with an argument list (no shell), so paths that
            # contain spaces or shell metacharacters are passed verbatim.
            with open(align_file, "w") as align_handle:
                subprocess.call(
                    ["exonerate", "-m", "protein2dna", "-n", "1",
                     "-q", exn_query, "-t", exn_target],
                    stdout=align_handle)

            qresult = SearchIO.read(align_file, "exonerate-text")
            hsp = qresult[0][0]  # first hit, first hsp
            # Coordinates of the original (un-flanked) hit inside flank_record.
            new_hit_start = flank + 1
            new_hit_end = len(flank_record.seq) - flank
            overlaps = (hsp.hit_start + 1 <= new_hit_end
                        and hsp.hit_end >= new_hit_start)
            # More than one hit frame means the alignment changes frame.
            fshift = overlaps and len(hsp.hit_frame_all) > 1
        except Exception:
            # Best effort, as before: a failed or empty alignment simply means
            # "no frameshift found". Unlike a bare ``except`` this lets
            # KeyboardInterrupt and SystemExit propagate.
            fshift = False

        if fshift:
            fshift_seq = flank_record.seq[hsp.hit_start:hsp.hit_end]
            fshift_id = chrom + ":" + \
                str(hsp.hit_start + 1) + "-" + str(hsp.hit_end)
            frameshift_record = SeqRecord(fshift_seq,
                                          id=fshift_id,
                                          name=fshift_id,
                                          description="qseqid=" + qseqid)
            frameshifts.append(frameshift_record)
        else:
            non_frameshift.append(hit)
    # end for

    SeqIO.write(frameshifts, out_frameshift, "fasta")
    sys.stderr.write("done.\n")
    return non_frameshift
예제 #29
0
파일: hmmer.py 프로젝트: shane806/cgb
def process_hmmscan():
    """Parse the tabular output file of an hmmscan search.

    Reads ``HMMOUT_FILENAME`` in ``hmmer3-tab`` format and returns the parsed
    result. When the parser raises ``ValueError`` an empty list is returned
    instead.
    """
    try:
        parsed = SearchIO.read(HMMOUT_FILENAME, 'hmmer3-tab')
    except ValueError:
        return []
    else:
        return parsed
예제 #30
0
def get_gene_homologous(gene: str, output_dir: str, limit: int = 100):
    """Collect one homologous protein sequence per organism via NCBI blastp.

    The first record of the FASTA file ``gene`` is searched with blastp
    against ``nr``. The raw XML is cached as ``<output_dir>/results.xml`` and
    reused on later calls. For every hit whose description contains an
    organism name in square brackets, the hit sequence of the first HSP is
    written to ``<output_dir>/fastas/<Organism_name>.fasta``; only the first
    hit of each organism is kept. The organism names are written to
    ``<output_dir>/species.txt`` in the order in which they were found.

    :param gene: path to a FASTA file; only its first record is used.
    :param output_dir: directory for the cache and the results (created if
        missing, including parent directories).
    :param limit: stop once this many distinct organisms were collected.
    :return: list of the FASTA file paths that were written.
    """
    from Bio import SearchIO
    from Bio import SeqIO
    from Bio.Blast import NCBIWWW

    Path(output_dir).mkdir(parents=True, exist_ok=True)

    blast_results_filename = f'{output_dir}/results.xml'
    if not Path(blast_results_filename).exists():
        rec = next(SeqIO.parse(gene, 'fasta'))
        # Report the record that is actually searched, not a fixed gene name.
        print(f'[*] Blastp-ing {rec.id}, please wait...')
        result_handle = NCBIWWW.qblast('blastp',
                                       'nr',
                                       rec.seq,
                                       hitlist_size=1000)
        # Read the whole response before creating the cache file, so that a
        # failed read never leaves an empty results.xml that a later call
        # would mistake for cached results.
        blast_results = result_handle.read()
        with open(blast_results_filename, 'w') as save_file:
            save_file.write(blast_results)
        print(f'[*] Blastp-ed, results at {blast_results_filename}')

    results = SearchIO.read(blast_results_filename, 'blast-xml')

    Path(f'{output_dir}/fastas').mkdir(exist_ok=True)
    seen_organisms = set()   # fast membership test
    organisms = []           # same names, in discovery order
    seqs_files = []

    def good_org(org_str: str):
        return (
            org_str and org_str not in seen_organisms
            and org_str != 'synthetic construct' and
            # 'bacter' not in org_str and  ### uncomment to exclude bacterias
            'unclassified' not in org_str)

    for hit in results.hits:
        # The organism is taken from the first "[...]" group of the description.
        orgs = re.findall(r"\[([A-Za-z ]+)\]", hit.description)
        org = orgs[0] if orgs else None
        if not good_org(org):
            continue
        seen_organisms.add(org)
        organisms.append(org)
        rec = hit.hsps[0].hit
        rec.description = ''
        rec.id = org.replace(' ', '_')
        print(f'{org} = {rec.id}')
        seq_file = f'{output_dir}/fastas/{rec.id}.fasta'
        SeqIO.write(rec, seq_file, 'fasta')
        seqs_files.append(seq_file)

        # limit species
        if len(organisms) == limit:
            break

    # Written in discovery order so the file lines up with ``seqs_files``.
    with open(f'{output_dir}/species.txt', 'w') as f:
        f.write('\n'.join(organisms))

    return seqs_files
예제 #31
0
def find_frameshift(sbjct_dict, query_dict, pseudo_hits,
                    temp_dir, out_frameshift):
    """Separate pseudogene hits that contain a frameshift from those that do not.

    For every hit the query sequence and the flanked subject region are
    written to scratch FASTA files inside ``temp_dir`` and aligned with
    ``exonerate -m protein2dna -n 1``. A hit is reported as frameshifted when
    the first HSP of the first exonerate hit overlaps the un-flanked hit
    region and spans more than one hit frame.

    :param sbjct_dict: mapping from ``hit[0]`` (chromosome/contig id) to the
        subject record.
    :param query_dict: mapping from query id to the query record.
    :param pseudo_hits: iterable of indexable hits; ``hit[0]`` is the
        chromosome id and ``hit[8]`` is a ``;``-separated attribute string
        whose first field has the form ``key=<query id>``.
    :param temp_dir: existing directory used for the scratch files.
    :param out_frameshift: path of the FASTA file that receives the
        frameshifted regions.
    :return: list of the hits for which no frameshift was detected.
    """
    assert(os.path.isdir(temp_dir))
    sys.stderr.write("finding frameshift mutations...\n")

    frameshifts = []
    non_frameshift = []
    exn_query = temp_dir + "/" + "exn_query.fa"
    exn_target = temp_dir + "/" + "exn_target.fa"
    align_file = temp_dir + "/" + "fshift_exonerate.exn"
    flank = 1000

    for hit in pseudo_hits:
        chrom = hit[0]
        record = sbjct_dict[chrom]
        qseqid = hit[8].split(";")[0].split("=")[1]
        SeqIO.write(query_dict[qseqid], exn_query, "fasta")
        flank_record = _get_hit_record(record, hit, flank)
        SeqIO.write(flank_record, exn_target, "fasta")

        fshift = False
        hsp = None
        try:
            # Run exonerate with an argument list (no shell), so paths that
            # contain spaces or shell metacharacters are passed verbatim.
            with open(align_file, "w") as align_handle:
                subprocess.call(
                    ["exonerate", "-m", "protein2dna", "-n", "1",
                     "-q", exn_query, "-t", exn_target],
                    stdout=align_handle)

            qresult = SearchIO.read(align_file, "exonerate-text")
            hsp = qresult[0][0]  # first hit, first hsp
            # Coordinates of the original (un-flanked) hit inside flank_record.
            new_hit_start = flank + 1
            new_hit_end = len(flank_record.seq) - flank
            overlaps = (hsp.hit_start + 1 <= new_hit_end
                        and hsp.hit_end >= new_hit_start)
            # More than one hit frame means the alignment changes frame.
            fshift = overlaps and len(hsp.hit_frame_all) > 1
        except Exception:
            # Best effort, as before: a failed or empty alignment simply means
            # "no frameshift found". Unlike a bare ``except`` this lets
            # KeyboardInterrupt and SystemExit propagate.
            fshift = False

        if fshift:
            fshift_seq = flank_record.seq[hsp.hit_start:hsp.hit_end]
            fshift_id = chrom + ":" + \
                str(hsp.hit_start + 1) + "-" + str(hsp.hit_end)
            frameshift_record = SeqRecord(
                fshift_seq, id=fshift_id, name=fshift_id,
                description="qseqid=" + qseqid)
            frameshifts.append(frameshift_record)
        else:
            non_frameshift.append(hit)
    # end for

    SeqIO.write(frameshifts, out_frameshift, "fasta")
    sys.stderr.write("done.\n")
    return non_frameshift
예제 #32
0
def getIndices(resultHandle):
    '''Return the hit coordinates of the first HSP of the first BLAST hit.

    ``resultHandle`` is parsed as tabular BLAST output (``blast-tab``). The
    selected HSP is printed, then its ``(hit_start, hit_end)`` pair is
    returned.
    '''
    top_hsp = SearchIO.read(resultHandle, 'blast-tab')[0][0]
    print(top_hsp)
    return top_hsp.hit_start, top_hsp.hit_end
예제 #33
0
def getIndices(resultHandle):
    """Return the hit coordinates of the first HSP of the first BLAST hit.

    ``resultHandle`` is parsed as tabular BLAST output (``blast-tab``). The
    selected HSP is printed, then its ``(hit_start, hit_end)`` pair is
    returned.
    """
    top_hsp = SearchIO.read(resultHandle, "blast-tab")[0][0]
    print(top_hsp)
    return top_hsp.hit_start, top_hsp.hit_end
    def sequence_search(self):
        '''Find chains whose sequence is similar to the reference chain.

        The SEQRES record of the reference chain is written to
        ``query.fasta`` in the output directory, the configured search tool
        is run on it against ``self.args.sequence_file`` and the tool's
        output (saved as ``output.fasta``) is parsed as ``fasta-m10``. The
        reference chain itself and hits outside the configured identity or
        similarity window are dropped.

        :returns: a list of dicts with the keys ``pdb_id``, ``chain_id``,
            ``identity`` and ``similarity`` (the last two formatted with two
            decimals), holding at most ``self.args.maximum_sequences`` items
        :raises IndexError: if the reference chain is not in the PDB file
        '''
        query_path = os.path.join(self.output_dir, 'query.fasta')
        result_path = os.path.join(self.output_dir, 'output.fasta')

        # Locate the SEQRES record of the reference chain. Record ids are
        # either "<pdb id>:<chain id>" or just "<chain id>".
        sequence = None
        for candidate in list(SeqIO.parse(self.reference_pdb_file, 'pdb-seqres')):
            if ':' in candidate.id:
                (pdb_part, chain_part) = candidate.id.split(':')
            else:
                (pdb_part, chain_part) = ('', candidate.id)
            same_entry = pdb_part.lower() == self.reference_id.lower()
            if same_entry and chain_part == self.reference_chain_id:
                sequence = candidate
                break

        if sequence is None:
            raise IndexError('Sequence not found for protein chain {}'.format(self.reference_chain_id))

        with open(query_path, 'w') as query_handle:
            SeqIO.write(sequence, query_handle, 'fasta')

        # Run the search tool and capture its stdout in the result file.
        tool_args = ['-q', '-m', '10', query_path, self.args.sequence_file]
        with open(result_path, 'w') as result_handle:
            call([self.args.sequence_search_tool] + tool_args, stdout=result_handle)

        target_chains = []
        for hit in SearchIO.read(result_path, "fasta-m10"):
            (target_pdb_id, target_chain_id) = self.extract_ids(hit.id)
            hsp = hit[0]

            # Skip the reference chain itself.
            if target_pdb_id.lower() == self.reference_id.lower() and target_chain_id == self.reference_chain_id:
                continue

            identity = hsp.ident_pct
            if identity < self.args.minimum_identity or identity > self.args.maximum_identity:
                continue

            similarity = hsp.pos_pct
            if similarity < self.args.minimum_similarity or similarity > self.args.maximum_similarity:
                continue

            target_chains.append({
                'pdb_id': target_pdb_id,
                'chain_id': target_chain_id,
                'identity': '{:.2f}'.format(hsp.ident_pct),
                'similarity': '{:.2f}'.format(hsp.pos_pct),
            })

            if len(target_chains) >= self.args.maximum_sequences:
                break

        return target_chains
예제 #35
0
def hmmer(query):
    """Run hmmsearch for one gene profile and collect the ids of its hits.

    The profile ``profiles/<query>.hmm`` below ``dfo`` is searched against
    ``infile``. hmmsearch's report is redirected by the shell to
    ``<args.output>/tmp/<sample_name>/<query>.hmmout`` and then parsed as
    ``hmmer3-text``.

    returns: tuple ``(query, hit_ids)`` where ``hit_ids`` is the list of hit ids
    """
    profile_path = str(Path(dfo, f'profiles/{query}.hmm'))
    command = f'hmmsearch -E 1e-10 {profile_path} {infile} > {args.output}/tmp/{sample_name}/{query}.hmmout'
    subprocess.run(command, shell=True, stdout=subprocess.DEVNULL)
    search_result = SearchIO.read(f'{args.output}/tmp/{sample_name}/{query}.hmmout', 'hmmer3-text')
    return query, [found.id for found in search_result]
예제 #36
0
def run_hmmsearch(in_file, hmm, threads):
    """Run ``hmmsearch`` on in-memory sequence text and parse its report.

    The sequence text is piped to hmmsearch's stdin (``-`` as the sequence
    file argument) and the report is read from its stdout.

    :param in_file: sequence data as a string (not a path).
    :param hmm: path to the HMM profile file.
    :param threads: value for ``--cpu``; converted with ``str()``, so an int
        is accepted as well.
    :return: the parsed ``hmmer3-text`` result, or ``None`` (after printing a
        message) when hmmsearch could not be run or exited with an error.
    """
    if which('hmmsearch') is None:
        exit('[!] hmmsearch not found')
    print("[>] Running hmmsearch... ", end="", flush=True)
    # Every argv element must be a string; threads is often passed as an int.
    cmd = ['hmmsearch', '--noali', '--cut_tc', '--cpu', str(threads), hmm, '-']
    child = Popen(cmd, stdout=PIPE, stderr=PIPE, stdin=PIPE)
    try:
        # communicate() feeds stdin, closes it and drains stdout/stderr
        # concurrently. Writing to stdin by hand first can deadlock once the
        # child blocks on a full output pipe.
        stdout_data, _ = child.communicate(in_file.encode())
    except IOError:
        print('\n[!] hmmer failed to run, is your input DNA?')
        return None
    # communicate() ignores a broken stdin pipe, so a failed run is detected
    # through the exit status instead.
    if child.returncode != 0:
        print('\n[!] hmmer failed to run, is your input DNA?')
        return None
    return SearchIO.read(StringIO(stdout_data.decode('utf-8')), 'hmmer3-text')
예제 #37
0
def runLocalBLAST(genome, genomeDict,
                  genomeFastaF, genomeDatabaseF, workspaceRootDir, sourceIndex,
                  neighbor_num, bitscore_cutoff):
    """BLAST every protein of a genome against its chromosomal neighbours.

    For each record of ``genomeFastaF`` the sequence is written to a scratch
    query file, the ids returned by ``getNeighbors`` are written to a GI
    restriction list, and blastp is run against ``genomeDatabaseF``
    restricted to that list. The parsed alignments are stored on
    ``genome[chromIndex][geneIndex].alignments``. The three scratch files are
    deleted afterwards, also when an error interrupts the loop.

    :param genome: nested sequence indexed as ``genome[chrom][gene]``;
        updated in place.
    :param genomeDict: mapping from gene GI to ``(chromIndex, geneIndex)``.
    :param genomeFastaF: FASTA file with the protein sequences.
    :param genomeDatabaseF: BLAST database to search.
    :param workspaceRootDir: prefix for the scratch files; it is concatenated
        as-is, so it must end with a path separator.
    :param sourceIndex: index that makes the scratch file names unique.
    :param neighbor_num: neighbour count; also passed to blastp as ``dbsize``
        and ``searchsp``.
    :param bitscore_cutoff: forwarded to ``parseBLASTOut``.
    """
    # auxiliary files
    queryF = workspaceRootDir + 'query_active' + str(sourceIndex) + '.fasta'
    giF = workspaceRootDir + 'girestrict_active' + str(sourceIndex) + '.txt'
    blastOutF = workspaceRootDir + 'active_blast' + str(sourceIndex) + '.xml'

    # load sequences
    reciter = SeqIO.parse(genomeFastaF, 'fasta')
    try:
        for index, aaRecord in enumerate(reciter):
            # load information, find in genome
            thisGene = parseNCBIDescription(aaRecord.description)
            chromIndex = genomeDict[thisGene.gi][0]
            geneIndex = genomeDict[thisGene.gi][1]

            # write sequence
            with open(queryF, 'w') as queryFHandle:
                SeqIO.write(aaRecord, queryFHandle, 'fasta')

            # write neighbors
            neighbors = getNeighbors(chromIndex, geneIndex, neighbor_num, genome)
            with open(giF, 'w') as giFHandle:
                giFHandle.write('\n'.join(neighbors) + '\n')

            # run BLAST (XML output, outfmt=5, goes to blastOutF)
            blastp_cline = NcbiblastpCommandline(query=queryF,
                    db=genomeDatabaseF, evalue=0.1, outfmt=5, out=blastOutF,
                    gilist=giF, dbsize=neighbor_num, searchsp=neighbor_num)
            blastp_cline()

            # parse BLAST output
            blastOutput = SearchIO.read(blastOutF, 'blast-xml')
            genome[chromIndex][geneIndex].alignments = parseBLASTOut(blastOutput,
                                                                     thisGene.gi,
                                                                     bitscore_cutoff)
    finally:
        reciter.close()
        # Remove the scratch files directly instead of shelling out to ``rm``:
        # this works for paths with spaces and on platforms without ``rm``.
        for scratch_path in (queryF, giF, blastOutF):
            try:
                os.remove(scratch_path)
            except OSError:
                # The file may not exist (e.g. an empty FASTA input); the old
                # ``rm`` call did not abort in that case either.
                pass
예제 #38
0
def parseBLAST(blastXML):
    """Summarise the top hit of a BLAST XML result given as a string.

    Returns the tuple ``(num_hits, num_hsps, evalue, qstart, qend, hit_seq)``.
    ``num_hsps`` is the HSP count of the first hit; the remaining values come
    from the first HSP of that hit, with ``qstart`` reported as
    ``query_start + 1`` and ``hit_seq`` taken from the first fragment of the
    HSP. Without any hit the tuple is ``(0, "NA", "NA", "NA", "NA", "NA")``.
    """
    blast_qresult = SearchIO.read(StringIO(blastXML), 'blast-xml')

    hit_count = len(blast_qresult)
    if not hit_count:
        return (0, "NA", "NA", "NA", "NA", "NA")

    first_hit = blast_qresult[0]
    top_hsp = first_hit[0]
    return (
        hit_count,
        len(first_hit),
        top_hsp.evalue,
        top_hsp.query_start + 1,   # stored start is the XML coordinate - 1
        top_hsp.query_end,
        str(top_hsp[0].hit.seq),   # sequence of the first fragment's hit record
    )
예제 #39
0
def read_blat(path):
    """Collect query ranges of non-self BLAT alignments from PSL files.

    Every file matching the glob pattern ``path`` is parsed as ``blat-psl``;
    files that cannot be parsed are skipped. Within each hit only HSPs with a
    query span of at least 200 and with query and hit spans within a factor
    of 1.2 of each other are kept. For the kept HSPs whose query id differs
    from the hit id, the text ``"<query_start>,<length> "`` is appended to
    the entry of that query id.

    :param path: glob pattern of the PSL files.
    :return: dict mapping query id to the concatenated ``"start,length "``
        entries.
    """
    def keep_hsp(hsp):
        # Long enough, and query/hit spans within 20 % of each other.
        return hsp.query_span >= 200 and (
            hsp.hit_span <= hsp.query_span * 1.2
            and hsp.query_span <= hsp.hit_span * 1.2)

    data = {}

    for name in glob.glob(path):
        try:
            blat_result = SearchIO.read(name, 'blat-psl')
        except Exception:
            # Best effort: unparsable or empty files are skipped. Unlike a
            # bare ``except`` this lets KeyboardInterrupt/SystemExit through.
            continue

        for hit in blat_result:
            hit = hit.filter(keep_hsp)
            if not hit:
                continue

            spans = []
            for hsp in hit:
                if hsp.query_id != hsp.hit_id:
                    spans.append((hsp.query_start, hsp.query_end))
                    print(hsp.query_id, hsp.hit_id, hsp.query_start, hsp.query_end, hsp.hit_start, hsp.hit_end)

            if spans:
                ranges = ["{},{} ".format(start, end - start) for start, end in spans]
                # Take the id from the hit itself rather than from whatever
                # HSP the loop above happened to end on.
                query_id = hit.query_id
                data[query_id] = data.get(query_id, "") + "".join(ranges)

                print(ranges)

    return data
def filter_forw_hmm_hits(modelname, forw_inc_bitscore_percentage, unique=True):
    ''' Reads hmmer hits from standart hmmer output.
    Returns hits with bitscore more than provided max bitscore percentage.
    By default returns only hits that from unique loci.
    '''
    gene_regex = re.compile(r'GN=([^=]+)')
    try:
        hits = SearchIO.read(modelname + ".hits", "hmmer3-tab")
    except:
        hits = []
    # select the longest isoform
    filtered_hits = []
    try:
        max_bitscore = hits[0].bitscore
    except:
        max_bitscore = 0
    unique_genes = []
    na_gene_count = 0
    for h in hits:
        # try to guess gene from hit description
        gene = None
        for m in gene_regex.finditer(h.description):
            gene = m.group(1)[:-3]
        if not gene:
            gene = "gene" + str(na_gene_count)
            na_gene_count += 1
        #print h.id, gene, na_gene_count

        if gene not in unique_genes:
            unique_genes.append(gene)
            if h.bitscore > max_bitscore * forw_inc_bitscore_percentage:
                filtered_hits.append(h)
        if manual_mode:
            print unique_genes
            # raw_input("...")
    return filtered_hits
예제 #41
0
def parse_queries(query_database, query_species, query_subspecies, query_type):

	print '\nparsing ...'

	for file in xml_results:

		print '\nopening file ...'
		result_handle = open(file)

		print '\nXML processing ...'
		blast_record = NCBIXML.read(result_handle)

		query_name = ''

		hit_num = 0

		blast_qresult = SearchIO.read(file, 'blast-xml')
		for hit in blast_qresult:

			for hsp in hit:

				if hit_num == 0:

						query_name = hit.description

				# calculate the % identity of the hit to the query sequence
				hit_identity = ((1.0 * hsp.ident_num / hsp.aln_span) * 100)

				# add accession numbers to the hit_accessions array
				hit_accessions.append( [hit.accession, hit_identity] )

				hit_num += 1
		
		# remove duplicate hits from hit_accessions
		print '\nConsolidating duplicate hits ...'
		
		remove_duplicate_hits(hit_accessions)

		print '\nDone'

		# for each accession number in the hit_accessions:
			# get the FASTA sequence
			# get the phylogeny

		print '\nQuerying GenBank ...'

		all_accessions, full_hit_seqs = query_genbank(query_name, query_database)

		print '\nOutputting Results ...'

		output_full_hits(query_name, full_hit_seqs, all_accessions)

		# remove hits that are not full sequences (sequence length < 16000) TODO
		print '\nRemoving incomplete sequences ...'

		# ----------------------------------------- SORTS HITS BY PHYLOGENY
		# -----------------------------------------------------------------
		# sorts the hits by their phylogenetic relation to query
		# outputs final hit and query data to file directory
		print '\nSorting hits by Phylogeny ...'
		sort_by_phylogeny(query_species, query_subspecies, query_database, query_type)
def blast_it (input):
    """BLAST every protein of one genome against its neighbours and find clusters.

    ``input`` is an indexable triple:

    * ``input[0]`` -- index into the list of genome paths,
    * ``input[1]`` -- path of a CSV file whose first column lists the genome
      FASTA paths (reading stops at the first empty row),
    * ``input[2]`` -- time stamp string used in the output file names.

    Three pickle files are written next to the genome FASTA file: the gene
    table, the per-gene BLAST summary and the detected clusters.

    Returns ``[name, 1, fasta_size, end_time, [name, genetable_f, hit_f,
    cluster_f]]`` on success and ``[name, error_string, 0, end_time, []]``
    when any exception was raised; ``name`` is the second-to-last component
    of the genome path.

    NOTE(review): the ``except`` handler at the bottom reads ``suf`` and
    ``t0``; if the failure happens before ``suf`` is assigned, the handler
    itself raises NameError -- confirm whether that can occur in practice.
    """
    try:
        t0 = datetime.datetime.now()
        # **** SETTINGS ****
        bitscore_cutoff = 100 # see calibration description in notes

        # find clusters using basic ordinal filter:
        # take max(min(max(left side neighbor genes), max(right side neighbor genes)), self)
        # then set hard cutoff (maybe >= 3 matches on edges, 6 on peak, 2 on length)
        # -> cuts off immediately once you're past the edge, but includes everything inside
        edge_thresh = 2.5 # must be greater than (e.g. 3 or more)
        peak_thresh = 5.5
        length_thresh = 1.5
        clust_extend = 0 # take N extra genes on either side just to show window
        # *********************
        # INPUT:
        sufi = input[0]
        sourcedir = input[1]
        timestr = input[2]

        # Obtain list of files
        # NOTE(review): the 'rU' open mode is not accepted by recent Python 3
        # releases -- confirm the interpreter version this runs on.
        suffix_f = open(sourcedir, 'rU')
        suffix_r = csv.reader(suffix_f)
        suffixpaths = []
        for row in suffix_r:
            if len(row) == 0:
                break
            suffixpaths.append(row[0])
            #print(row[0])
        suffix_f.close()

        # suf is the genome path without its '.fasta' extension; it is the
        # stem of every input and output file name below.
        suf = suffixpaths[sufi]
        if suf[-len('.fasta'):] == '.fasta':
            suf = suf[0:-len('.fasta')]

        # assemble list of genes on chromosome
        chromnames = []
        chromgenes = []
        reciter = SeqIO.parse(suf + '.fasta', 'fasta')
        for index, aarecord in enumerate(reciter):
            # Each description carries "[key=value]" fields; copy the known
            # keys onto a fresh gene object.
            dparse = bracketparse(aarecord.description, '[]')
            newgeneobj = gene_obj()
            newgeneobj.gi = aarecord.description.split(' ')[0].split('|')[1]
            for dp in dparse:
                if dp.split('=')[0] == 'chromosome':
                    newgeneobj.chromosome = dp.split('=')[1]
                    continue
                if dp.split('=')[0] == 'location':
                    newgeneobj.location = [int(bracketparse(dp.split('=')[1], '()')[0].split(',')[0]),
                                           int(bracketparse(dp.split('=')[1], '()')[0].split(',')[1])]
                    continue
                if dp.split('=')[0] == 'direction':
                    newgeneobj.direction = dp.split('=')[1]
                    continue
                if dp.split('=')[0] == 'description':
                    newgeneobj.description = '='.join(dp.split('=')[1:])
                    continue
                if dp.split('=')[0] == 'protein_id':
                    newgeneobj.ncbi_id = dp.split('=')[1]
                    continue

            if newgeneobj.chromosome not in chromnames:
                chromnames.append(newgeneobj.chromosome)
                chromgenes.append([])

            # NOTE(review): the gene is appended to the most recently added
            # chromosome list, not to the list of newgeneobj.chromosome; this
            # assumes records are grouped by chromosome -- confirm.
            chromgenes[-1].append(newgeneobj)
        reciter.close()

        # sort genes by location
        for chromi in range(len(chromnames)):
            chromgenes[chromi].sort(key=lambda thsgi: thsgi.location[0])

        # assemble nearby genes
        neighbor_num = 10 #** move
        neighbor_wind = int(round(neighbor_num/2))
        # window = 10000 could do this with nt #
        for chromi in range(len(chromgenes)):
            for ind, aaobj in enumerate(chromgenes[chromi]):
                # nearls: GIs of the genes within neighbor_wind positions on
                # either side (clipped at the chromosome ends), self included.
                #if suf.find('Streptomyces') == -1: # linear chromosome
                chromgenes[chromi][ind].nearls = [chromgenes[chromi][thsi].gi
                        for thsi in range(max([ind-neighbor_wind, 0]),
                        min([ind+neighbor_wind+1, len(chromgenes[chromi])]))]
                #else: #circular chromosome
                #    chromgenes[chromi][ind].nearls = [chromgenes[chromi][thsi].gi
                #            for thsi in np.mod(range(ind-neighbor_wind,
                #            ind+neighbor_wind+1), len(chromgenes[chromi]))]

        # run local BLAST
        db_f = suf + 'BLASTdb'
        db_root = '/'.join(suf.split('/')[0:-1]) + '/'

        # aadict maps a gene GI to its (chromosome index, gene index) pair.
        gis = []
        inds = []
        for chromi in range(len(chromgenes)):
            inds += zip([chromi]*len(chromgenes[chromi]), range(len(chromgenes[chromi])))
            gis += [thsgene.gi for thsgene in chromgenes[chromi]]
        aadict = dict(zip(gis, inds))

        # Scratch files for the per-gene BLAST runs; removed at the end.
        query_fname = db_root + 'query_active' + str(sufi) + '.fasta'
        gi_fname = db_root + 'girestrict_active' + str(sufi) + '.txt'
        out_fname = db_root + 'active_blast' + str(sufi) + '.xml'

        reciter = SeqIO.parse(suf + '.fasta', 'fasta')

        # One zero array per chromosome, with one slot per gene; used as the
        # template for the per-gene summary arrays below.
        chrom_size_zeros = [[]]*len(chromgenes)
        for chromi in range(len(chromgenes)):
            chrom_size_zeros[chromi] = np.zeros(len(chromgenes[chromi]))

        median_hit_len = copy.deepcopy(chrom_size_zeros)
        hit_num = copy.deepcopy(chrom_size_zeros)
        pks_nrps_yn = copy.deepcopy(chrom_size_zeros)
        gi_nums = copy.deepcopy(chrom_size_zeros)
        for index, aarecord in enumerate(reciter):
            thsgi = aarecord.description.split(' ')[0].split('|')[1]
            thsgene = chromgenes[aadict[thsgi][0]][aadict[thsgi][1]]

            # Restrict the search to the gene's neighbours.
            f_gi = open(gi_fname, 'w')
            f_gi.write('\n'.join(thsgene.nearls))
            f_gi.write('\n')
            f_gi.close()

            query_f = open(query_fname, 'w')
            SeqIO.write(aarecord, query_f, 'fasta')
            query_f.close()

            blastp_cline = NcbiblastpCommandline(query=query_fname,
                    db=db_f, evalue=0.1, outfmt=5, out=out_fname,
                    gilist=gi_fname, dbsize=neighbor_num, searchsp=neighbor_num)
            stdout, stderr = blastp_cline()

            # Keep every HSP above the bitscore cutoff, except the alignment
            # of the gene to itself over the identical range.
            blast_hsp = SearchIO.read(out_fname, 'blast-xml')
            thsgene.alignls = []
            if blast_hsp.hits != []:
                for hits_i in blast_hsp:
                    for algns_i in hits_i:
                        if algns_i.bitscore > bitscore_cutoff:
                            thsalign = alignobj()
                            thsalign.query_range = algns_i.query_range
                            thsalign.hit_range = algns_i.hit_range
                            thsalign.evalue = algns_i.evalue
                            thsalign.bitscore = algns_i.bitscore
                            # Drop the first three characters of the hit id
                            # and left-pad the rest with zeros to 9 characters.
                            thsalign.hit_gi = (9-len(algns_i.hit_id[3:]))*'0' + algns_i.hit_id[3:]
                            # reject full gene hit to itself
                            if thsalign.hit_gi == thsgene.gi and thsalign.hit_range == thsalign.query_range:
                                continue
                            thsgene.alignls.append(thsalign)

            # summary statistics
            if len(thsgene.alignls) > 0:
                median_hit_len[aadict[thsgi][0]][aadict[thsgi][1]] = (np.median(
                        np.array([thsa.hit_range[1] - thsa.hit_range[0]
                        for thsa in thsgene.alignls])))

            hit_num[aadict[thsgi][0]][aadict[thsgi][1]] = len(thsgene.alignls)
            gi_nums[aadict[thsgi][0]][aadict[thsgi][1]] = thsgene.gi
            # Flag genes whose description mentions a polyketide or
            # nonribosomal peptide "synth..." term.
            annotation_set = thsgene.description
            if (annotation_set.find('synth') >= 0 and
                    ((annotation_set.find('polyketide') >= 0 or
                    annotation_set.find('poly-ketide') >= 0)
                    or
                    ((annotation_set.find('nonribosomal') >= 0 or
                    annotation_set.find('non-ribosomal') >= 0) and
                    annotation_set.find('peptide') >= 0))):
                pks_nrps_yn[aadict[thsgi][0]][aadict[thsgi][1]] = 1.

            #if index % 200 == 0:
            #    print(suf.split('/')[-2] + ': ' + str(index))


        reciter.close()

        # ********* Identify Clusters *********
        clusters = []
        clusterft = {'ind': [], 'height': [], 'avg': [], 'medhitlen': [], 'len': [], 'pks_nrps': []}
        for chromi in range(len(chromgenes)):
            # 1. filter
            # filter_counts[g] = max(own hit count,
            #                        min(best count left of g, best count right of g))
            # where left/right range over the gene's neighbour window.
            filter_counts = np.zeros(len(chromgenes[chromi]))
            for genei in range(len(chromgenes[chromi])):
                neari = np.where(np.array(chromgenes[chromi][genei].nearls) == chromgenes[chromi][genei].gi)[0][0]
                filter_counts[genei] = max([min([max([hit_num[chromi][aadict[chromgenes[chromi][genei].nearls[ti]][1]]
                            for ti in range(neari)]+[0.0]),
                        max([hit_num[chromi][aadict[chromgenes[chromi][genei].nearls[ti]][1]]
                            for ti in range(neari+1, len(chromgenes[chromi][genei].nearls))]+[0.0])]),
                        hit_num[chromi][genei]])

            # 2. break apart
            # Walk along the chromosome; a proposed cluster opens when the
            # filtered count rises above edge_thresh and closes when it drops
            # below it again. It is kept only if it is long and high enough.
            on = False
            proposed_cluster = []
            for nhi in range(len(hit_num[chromi])-1):
                if on == False and filter_counts[nhi] > edge_thresh:
                    on = True
                    proposed_cluster.append(nhi)
                if on == True and filter_counts[nhi] < edge_thresh:
                    on = False
                    proposed_cluster.append(nhi)
                    if (proposed_cluster[1] - proposed_cluster[0] > length_thresh and
                            max(filter_counts[proposed_cluster[0]:proposed_cluster[1]]) > peak_thresh):


                        #** adjust for circular
                        clust_rng = [max([proposed_cluster[0]-clust_extend, 0]),
                                min([proposed_cluster[1]+clust_extend, len(chromgenes[chromi])])]
                        clusters.append([geneinfo for geneinfo in chromgenes[chromi][
                                clust_rng[0]:clust_rng[1]]])
                        clusterft['ind'].append([chromi, clust_rng])
                        clusterft['height'].append(max(hit_num[chromi][clust_rng[0]:clust_rng[1]]))
                        clusterft['avg'].append(np.mean(hit_num[chromi][clust_rng[0]:clust_rng[1]]))
                        clusterft['medhitlen'].append(np.median([item for sublist in
                                [[thsa.hit_range[1] - thsa.hit_range[0]
                                for thsa in chromgenes[chromi][ind].alignls]
                                for ind in range(clust_rng[0], clust_rng[1])]
                                for item in sublist]))
                        clusterft['len'].append(clust_rng[1] - clust_rng[0])
                        # pks_nrps: 0. if no gene of the cluster is flagged,
                        # 1. if all are, 0.5 for a mix.
                        if sum(pks_nrps_yn[chromi][clust_rng[0]:clust_rng[1]] < 0.5) == clust_rng[1] - clust_rng[0]:
                            clusterft['pks_nrps'].append(0.)
                        elif sum(pks_nrps_yn[chromi][clust_rng[0]:clust_rng[1]] > 0.5) == clust_rng[1] - clust_rng[0]:
                            clusterft['pks_nrps'].append(1.)
                        else:
                            clusterft['pks_nrps'].append(0.5)
                    proposed_cluster = []

        # Each output file holds several objects pickled one after another;
        # they have to be loaded back in the same order.
        genetable_f = suf + '_genetable_blasted_' + timestr + '.pkl'
        out_blasted = open(genetable_f, 'wb')
        pickle.dump(chromgenes, out_blasted)
        pickle.dump(aadict, out_blasted)
        pickle.dump(chromnames, out_blasted)
        out_blasted.close()

        hit_f = suf + '_blast_summary_' + timestr + '.pkl'
        out_summary = open(hit_f, 'wb')
        # alignment length, in units of a.a.
        pickle.dump(median_hit_len, out_summary)
        pickle.dump(hit_num, out_summary)
        pickle.dump(gi_nums, out_summary)
        pickle.dump(pks_nrps_yn, out_summary)
        out_summary.close()

        cluster_f = suf + '_clusters_' + timestr + '.pkl'
        out_clusters = open(cluster_f, 'wb')
        pickle.dump(clusters, out_clusters)
        pickle.dump(clusterft, out_clusters)
        out_clusters.close()

        # Remove the scratch files of the BLAST runs.
        os.system('rm ' + query_fname)
        os.system('rm ' + gi_fname)
        os.system('rm ' + out_fname)

        ## Save summary figure
        #plt.figure(1, figsize=(10, 6))
        #plt.hold(True)
        #for chromi in range(len(hit_num)):
        #    plt.scatter(hit_num[chromi], median_hit_len[chromi], 40)
        #plt.xlabel('Number of Local Hits', fontsize=18.0)
        #plt.ylabel('Median Hit Length (a.a.)', fontsize=18.0)
        #plt.title(suf.split('/')[-2], fontsize=20.0)
        #plt.rc('xtick', labelsize=14.)
        #plt.rc('ytick', labelsize=14.)
        #plt.savefig(suf + '_scatter_' + timestr + '.png')

        t1 = datetime.datetime.now()
        logging.info('finished: ' + suf.split('/')[-2] + '; time: ' + str(t0) + ' to ' + str(t1) + '(' + str(t1-t0) + ')')
        # output: name, error code, file size, time to run
        return [suf.split('/')[-2], 1, os.path.getsize(suf + '.fasta'), t1,
                [suf.split('/')[-2], genetable_f, hit_f, cluster_f]]
    except:
        # Any failure is logged and reported through the return value instead
        # of being raised to the caller.
        #raise
        thserror = sys.exc_info()
        errorstr = 'Error: ' + str(thserror[0]) + ', ' + str(thserror[1])
        t1 = datetime.datetime.now()
        logging.debug('failed on: ' + suf.split('/')[-2] + '; time: ' + str(t0) + ' to ' + str(t1) + '(' + str(t1-t0) + ')')
        logging.exception('')
        #raise
        return [suf.split('/')[-2], errorstr, 0, t1, []]
예제 #43
0
파일: bwa2blat.py 프로젝트: cauyrd/MSI
		continue
	for each in read.cigar:
		if each[0] == 4 and each[1]/float(read.rlen) >= cutoff:
			# generating fasta file
			fa = open('temp.fa','w')
			print >> fa,'>'+read.qname
			print >> fa, read.seq
			fa.close()

			# run blat
			code = os.system('gfClient localhost 50000 '+sys.argv[1]+' temp.fa temp.psl >/dev/null 2>&1 ')
			if code != 0:
				print 'Execute gfClient failed!'
				sys.exit(1)
			try:
				blat = SearchIO.read('temp.psl','blat-psl')
			except:
				break
			hsps = blat.hsps
			hsps.sort(key=lambda k:k.score, reverse=True)
			if hsps[0].hit_id == bwa_bam.getrname(read.tid):
				# matching genomic coordinate
				if hsps[0].hit_start == read.pos or hsps[0].hit_end==read.aend:
					cigarstring, soft_len = psl2sam(hsps[0],blat.seq_len)
					if soft_len == 0:
						read.cigarstring, read.pos= cigarstring, hsps[0].hit_start
			break
	blat_bam.write(read)
bwa_bam.close()
blat_bam.close()
from Bio.Alphabet import IUPAC
import re
import sys




#Get a contig sequence given contig name  ##############################
recs = SeqIO.index(genome, "fasta")
def seq_for_contig(contig_name,recs):
    """Return the ``seq`` attribute of the record stored under *contig_name*.

    *recs* is any mapping from contig name to a record object; a missing
    name propagates whatever lookup error the mapping raises.
    """
    record = recs[contig_name]
    return record.seq
########################################################################



YR_blast_qresult = SearchIO.read(YR_xml, 'blast-xml')
if len(YR_blast_qresult) == 0:
    raise Exception('No YR were found')

#make fragments
#print 'Preparing YR fragments'
fragment_stores = []
for hit in YR_blast_qresult:
    hit_sequence = seq_for_contig(hit.id, recs)
    for hsp in hit.hsps:
        fragstart = 0
	fragend = len(hit_sequence)
	if hsp.hit_start > extend:
	    fragstart = hsp.hit_start - extend
	if fragend - hsp.hit_end > extend:
	    fragend =  hsp.hit_end + extend
예제 #45
0
파일: hmmer.py 프로젝트: xapple/seqsearch
 def hits(self):
     """Parse the file at ``self.out_path`` as HMMER3 tabular output.

     Returns whatever ``SearchIO.read`` yields for the 'hmmer3-tab' format.
     Raises ``Exception`` when ``self.out_path`` is falsy, i.e. there is no
     output file to read yet.
     """
     if self.out_path:
         return SearchIO.read(self.out_path, 'hmmer3-tab')
     raise Exception("You can't access results from HMMER before running the algorithm.")
예제 #46
0
def blat_alignment(mapping, reference, scliplen_cutoff, lowqual_cutoff, min_percent_hq, mapq_cutoff, blat_ident_pct_cutoff, gfServer_port, hetero_factor, input, output):
	"""Re-align heavily soft-clipped reads with BLAT and write a sorted, indexed BAM.

	Every primary, mapped read of the BAM file `input` is copied to a
	temporary BAM. Reads whose soft-clipping passes the length, quality and
	mapping-quality cutoffs may first be sent to a gfServer (via `gfClient`);
	when the best BLAT hit agrees with the original placement and leaves no
	soft-clipping, the read's CIGAR string and position are replaced by the
	BLAT alignment. The temporary BAM is then sorted, moved to `output` and
	indexed with samtools.

	Parameters:
		mapping: when falsy, reads selected in the indel-cluster branches are
			written unchanged and are not sent to BLAT.
		reference: mapping whose 'blat' entry is passed to gfClient on the
			command line.
		scliplen_cutoff: minimum soft_len / read.rlen for a read to qualify.
		lowqual_cutoff: soft-clipped bases with quality >= this value count
			as high quality.
		min_percent_hq: minimum fraction of high-quality soft-clipped bases.
		mapq_cutoff: minimum read.mapq for a read to qualify.
		blat_ident_pct_cutoff: minimum value of the best HSP's
			ident_pct / 100.
		gfServer_port: gfServer port; concatenated into the command line, so
			it must be a string.
		hetero_factor: 'a' sends every qualifying read to BLAT and disables
			the de novo FASTA output; any other value is forwarded to
			prob_of_indel_with_error.
		input: path of the input BAM file.
		output: path of the final BAM file; also used as the prefix of the
			temporary files (.temp.bam, .temp.fasta, .temp.fa, .temp.psl).

	Returns:
		The `rlen` of the first read in `input`.

	Calls sys.exit(1) if gfClient returns a non-zero exit status.
	"""
	bwa_bam = pysam.Samfile(input, 'rb')
	blat_bam = pysam.Samfile(output + '.temp.bam', 'wb', template=bwa_bam)
	# The FASTA of reads collected for de novo use is only produced when
	# hetero_factor is not 'a'.
	if hetero_factor != 'a':
		denovo = open(output+'.temp.fasta', 'w')
	# (chromosome, soft-clip position) pairs already accepted as putative indel sites
	putative_indel_cluster = set()
	for read in bwa_bam.fetch(until_eof=True):
		if read.is_secondary:
			# secondary alignment
			continue
		if read.is_unmapped:
			# Unmapped reads go to the de novo FASTA (if enabled) and are
			# not copied to the output BAM.
			if hetero_factor != 'a':
				print >> denovo, '>' + read.qname
				print >> denovo, read.seq
			continue
		# get_softclip_length is defined elsewhere in the file.
		# NOTE(review): soft_pos == -1 appears to mean "no soft clip" and
		# soft_qual appears to be an array supporting boolean masks - confirm.
		soft_len, soft_qual, soft_pos = get_softclip_length(read)
		sclip_ratio = soft_len / float(read.rlen)
		if soft_pos != -1:
			sclip_hq_ratio = len(soft_qual[soft_qual >= lowqual_cutoff]) / float(len(soft_qual))
		else:
			sclip_hq_ratio = 0
		if sclip_ratio >= scliplen_cutoff and sclip_hq_ratio >= min_percent_hq and read.mapq >= mapq_cutoff:
			blat_aln = False
			soft_chr = bwa_bam.getrname(read.rname)
			if hetero_factor == 'a':
				blat_aln = True
			elif (soft_chr, soft_pos) in putative_indel_cluster:
				# Site already accepted earlier: no need to re-test it.
				blat_aln = True
				print >> denovo, '>' + read.qname
				print >> denovo, read.seq
				if not mapping:
					blat_bam.write(read)
					continue
			# estimate the probability of indels given the coverage and number of soft-clipped reads
			elif prob_of_indel_with_error(input, soft_chr, soft_pos, hetero_factor) < 0.05:
				putative_indel_cluster.add((soft_chr, soft_pos))
				blat_aln = True
				print >> denovo, '>' + read.qname
				print >> denovo, read.seq
				if not mapping:
					blat_bam.write(read)
					continue
			if blat_aln:
				# Write the single read to a scratch FASTA and query the gfServer with it.
				fa = open(output+'.temp.fa', 'w')
				print >> fa, '>' + read.qname
				print >> fa, read.seq
				fa.close()
				try:
					# NOTE(review): the command is a shell string built from
					# arguments; paths with spaces or shell metacharacters
					# would break it.
					subprocess.check_call('gfClient localhost ' + gfServer_port +' '+ reference['blat'] + ' ' + output + '.temp.fa ' + output + '.temp.psl >/dev/null 2>&1 ', shell=True)
				except subprocess.CalledProcessError as e:
					print >> sys.stderr, 'Execution failed for gfClient:', e
					# NOTE(review): exits without closing bwa_bam, blat_bam or denovo.
					sys.exit(1)
				try:
					blat = SearchIO.read(output+'.temp.psl', 'blat-psl')
					print >> sys.stderr, 'Blat aligned read:',read.qname
				except:
					# Any failure to parse the PSL is treated as "no hit": the
					# read is kept as aligned by the original mapper.
					# NOTE(review): the bare except also catches
					# KeyboardInterrupt/SystemExit.
					print >> sys.stderr, 'No blat hit for read:',read.qname
					blat_bam.write(read)
					continue
				# Only the highest-scoring HSP is considered.
				hsps = blat.hsps
				hsps.sort(key=lambda k: k.score, reverse=True)
				if hsps[0].ident_pct / 100 >= blat_ident_pct_cutoff and hsps[0].hit_id == bwa_bam.getrname(read.tid):
					# matching genomic coordinate
					if hsps[0].hit_start == read.pos or hsps[0].hit_end == read.aend:
						# psl2sam is defined elsewhere; soft_len is rebound here
						# to the value it returns for the BLAT alignment.
						cigarstring, soft_len = psl2sam(hsps[0], blat.seq_len)
						if soft_len == 0:
							read.cigarstring, read.pos = cigarstring, hsps[0].hit_start
		# Reached by every read that was not skipped or written above.
		blat_bam.write(read)
	bwa_bam.close()
	blat_bam.close()
	if hetero_factor != 'a':
		denovo.close()
	# Sort, move into place and index; the exit statuses of these commands are not checked.
	os.system('samtools sort ' + output + '.temp.bam ' + output + '.temp.sorted')
	os.system('mv ' + output + '.temp.sorted.bam ' + output)
	os.system('samtools index ' + output)
	# Re-open the input only to read the length of its first read.
	bwa_bam = pysam.Samfile(input, 'rb')
	readlen = bwa_bam.next().rlen
	bwa_bam.close()
	return readlen
예제 #47
0
파일: parse_hmmer_out.py 프로젝트: DRL/hox
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys, os, collections
from Bio import SearchIO

# Path of the HMMER output file, taken from the first command-line argument.
hmm_file = sys.argv[1]
# Parse the file with Biopython's 'hmmer3-text' parser into a single result object.
hmm_result = SearchIO.read(hmm_file, 'hmmer3-text')
# Print the parsed result to stdout (Python 2 print statement).
print hmm_result
예제 #48
0
#   - blast XML result
	
# Output:
# 	- STDOUT




#####
## MAIN
#####
# Command-line interface: a single required positional argument naming the
# BLAST XML file to analyse.
parser = argparse.ArgumentParser()
# The help text previously described a FASTA reference genome, but the file
# is parsed as BLAST XML below; the argument is required, so stdin is never read.
parser.add_argument('xml', help="BLAST result in XML format")
args = parser.parse_args()

# SearchIO.read is used, so the XML is handled as a single query result.
blast_qresult = SearchIO.read(args.xml, 'blast-xml')

# The INDEL description is encoded in the underscore-separated file name
# (directory and extension stripped).
XMLname    = splitext(basename(args.xml))[0]
nameParts  = XMLname.split("_")

# INDEL related data
# NOTE(review): relies on the file name having at least five fields, with the
# fifth carrying a three-character prefix before the length - confirm the
# naming convention against the producer of these files.
chrom      = nameParts[0]
start      = nameParts[1]
len_indel  = nameParts[4][3:]


if (len(blast_qresult)):

	num_hits   = len(blast_qresult)

	# HIT related data
예제 #49
0
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio import SearchIO


# Location (path prefix) of the BLAST database to search.
humdb = "/mithril/Data/Pacbio/Aligned/151019_proc/blast/humiso_blast"

# Describe the blastn run: query temp.fasta against humdb with custom gap and
# scoring parameters, writing output format 5 to try.xml.
blastn_cline = NcbiblastnCommandline(
    query="temp.fasta",
    db=humdb,
    gapopen=1,
    gapextend=2,
    word_size=9,
    reward=1,
    evalue=10,
    outfmt=5,
    out="try.xml",
)

# Calling the wrapper object executes the command and returns its output streams.
stdout, stderr = blastn_cline()

# Read the result back as BLAST XML and re-export it as a BLAST tabular file.
bres = SearchIO.read("try.xml", 'blast-xml')
SearchIO.write(bres, 'try.tsv', 'blast-tab')

##ok - this was nice, but can't output because blast is pairwise, and I think we actually want a MAF

예제 #50
0
This script takes quite a long time and is often interrupted because NCBI does
not respond in the allotted time. When 350 sequences were analyzed, it had to
be restarted several times, each time after removing the sequences for which
results had already been obtained.

Run as:
python blast_annotation.py proteins.faa > output.tab
"""

from Bio import SeqIO
from Bio import SearchIO
from Bio.Blast import NCBIWWW
from sys import argv

sequences = open(argv[1], 'r')

for sequence in SeqIO.parse(sequences, "fasta"):
    result_handle = NCBIWWW.qblast("blastp", "nr", str(sequence.seq),
                                   hitlist_size=10, expect=1e-03)
    save_file = open("my_blast.xml", "w")
    save_file.write(result_handle.read())
    save_file.close()
    result_handle.close()
    blast_qresult = SearchIO.read('my_blast.xml', 'blast-xml')
    for i in range(0, len(blast_qresult)):
        blast_hsp = blast_qresult[i][0]
        evalue = blast_hsp.evalue
        desc = blast_hsp.hit_description
        hit_id = blast_hsp.hit_id
        print sequence.id, '\t', evalue, '\t', hit_id, '\t', desc