Ejemplo n.º 1
0
def getting_blastn_cline2_rRNA_vs_geno_getting_positions(pathMain,operon_list,fasta_files,uniqueGenoNames):
    """Run blastn for each operon/genome pair and collect the raw hit lines.

    The three lists are walked in lockstep; iteration stops at the shortest.

    Args:
        pathMain: path prefix that is string-concatenated in front of every
            file name (so it must already end with a path separator).
        operon_list: query file names, one per genome.
        fasta_files: subject FASTA file names, one per genome.
        uniqueGenoNames: keys used for the returned dictionary.

    Returns:
        dict mapping each genome name to the list of lines blastn printed on
        stdout (requested with ``outfmt=6`` and ``max_hsps=2``).
    """
    dict_blast_split_lines = {}

    # Builtin zip() instead of itertools.izip(): izip was removed in Python 3,
    # while zip() behaves the same here on both major versions.
    for operonfile, fastafile, uniqueGenoName in zip(operon_list, fasta_files, uniqueGenoNames):
        finalPath = pathMain + operonfile
        fastaPath = pathMain + fastafile

        # Calling the command-line wrapper runs blastn and returns
        # (stdout, stderr); only stdout is of interest.
        blast_stdout = NcbiblastnCommandline(query=finalPath,
                                             subject=fastaPath,
                                             outfmt=6, max_hsps=2)()[0]

        dict_blast_split_lines[uniqueGenoName] = blast_stdout.splitlines()

    return dict_blast_split_lines
Ejemplo n.º 2
0
    def RunCommand(self):
        """Run the BLAST search.

        Writes the query sequence (``self.command_data[0]``) to a temporary
        FASTA file, builds the command-line wrapper matching the chosen
        program (``self.command_data[1]``) against the database
        (``self.command_data[2]``), applies any extra "key value" options
        given in ``self.command_data[3]``, then starts a ``BlastWorker`` and
        calls ``self.UpdateResults()``.

        Returns early (without running anything) when the program name is not
        a recognised BLAST flavour or when an extra option is rejected.
        """
        self.fh_in, self.infile = tempfile.mkstemp()
        self.fh_out, self.outfile = tempfile.mkstemp()

        with open(self.infile, 'w+') as f:
            f.write('>Name\n')
            f.write(self.command_data[0])

        blast_program = self.command_data[1]
        database = self.command_data[2]

        # Extra options arrive as "key value key value ..."; pair them up.
        # A trailing key without a value is ignored.
        options = {}
        if self.command_data[3]:
            tokens = self.command_data[3].split()
            options = dict(zip(tokens[::2], tokens[1::2]))

        kwargs = {'query': self.infile, 'db': database,
                  'out': self.outfile}

        # Order matters: "tblastn" also ends with "blastn" (and "tblastx"
        # with "blastx"), so the longer names must be tested first or the
        # translated searches would get the wrong wrapper.
        wrappers = (
            ('tblastn', NcbitblastnCommandline),
            ('tblastx', NcbitblastxCommandline),
            ('blastn', NcbiblastnCommandline),
            ('blastp', NcbiblastpCommandline),
            ('blastx', NcbiblastxCommandline),
        )
        for suffix, wrapper in wrappers:
            if blast_program.endswith(suffix):
                blast_cmd = wrapper(blast_program, **kwargs)
                break
        else:
            # Unknown program: nothing to run.
            return

        if options:
            try:
                for key in options:
                    blast_cmd.set_parameter(key, options[key])
            except ValueError as e:
                messagebox.showerror('xbb tools',
                                     'Commandline error:\n\n' + str(e))
                self.tid.destroy()
                return

        self.worker = BlastWorker(blast_cmd)
        self.worker.start()

        self.UpdateResults()
Ejemplo n.º 3
0
    def RunCommand(self):
        """Run the BLAST search described by ``self.command_data``.

        ``command_data`` is read as (query sequence, program, database,
        extra options). The query is written to a temporary FASTA file, the
        matching command-line wrapper is built, extra "key value" options are
        applied, and a ``BlastWorker`` is started before
        ``self.UpdateResults()`` is called.

        Returns early when the program is not a recognised BLAST flavour or
        when an extra option is rejected by the wrapper.
        """
        self.fh_in, self.infile = tempfile.mkstemp()
        self.fh_out, self.outfile = tempfile.mkstemp()

        with open(self.infile, "w+") as f:
            f.write(">Name\n")
            f.write(self.command_data[0])

        blast_program = self.command_data[1]
        database = self.command_data[2]

        # Extra options arrive as "key value key value ..."; pair them up.
        # A trailing key without a value is ignored.
        options = {}
        if self.command_data[3]:
            tokens = self.command_data[3].split()
            options = dict(zip(tokens[::2], tokens[1::2]))

        kwargs = {"query": self.infile, "db": database, "out": self.outfile}

        # Test the longer names first: "tblastn".endswith("blastn") and
        # "tblastx".endswith("blastx") are both true, so checking the short
        # names first would make the translated-search branches unreachable.
        wrappers = (
            ("tblastn", NcbitblastnCommandline),
            ("tblastx", NcbitblastxCommandline),
            ("blastn", NcbiblastnCommandline),
            ("blastp", NcbiblastpCommandline),
            ("blastx", NcbiblastxCommandline),
        )
        for suffix, wrapper in wrappers:
            if blast_program.endswith(suffix):
                blast_cmd = wrapper(blast_program, **kwargs)
                break
        else:
            # Unknown program: nothing to run.
            return

        if options:
            try:
                for key in options:
                    blast_cmd.set_parameter(key, options[key])
            except ValueError as e:
                messagebox.showerror("xbb tools", "Commandline error:\n\n" + str(e))
                self.tid.destroy()
                return

        self.worker = BlastWorker(blast_cmd)
        self.worker.start()

        self.UpdateResults()
def directed_local_alignment(proteome_file, ref_seq_file, proteome_name,
                             workdir, mode, source, strand,
                             per_proteome_sequences):
    """
    Align a reference sequence against one proteome/genome and return the hits.

    Runs either Supermatcher or BLAST (blastp/blastn), depending on ``mode``,
    writing the raw result to ``curr_alignment.aln`` inside ``workdir`` and
    deleting it again after a successful parse.

    Args:
        proteome_file: path of the proteome/genome sequence file. In BLAST
            mode a database index (``.nhr`` or ``.phr``) whose file name
            contains this file's base name is looked up in the same directory.
        ref_seq_file: path of the query (reference) sequence file.
        proteome_name: name given to the returned records; ``_<index>`` is
            appended when more than one hit was found and more than one may
            be returned.
        workdir: directory used for the temporary files.
        mode: "supermatcher" or "BLAST" (case-insensitive).
        source: "protein"/"proteome" or "nucleotide"/"genome"
            (case-insensitive); selects the scoring matrix (Supermatcher) or
            the BLAST program.
        strand: Supermatcher only. When falsy the records are
            reverse-complemented before being searched.
        per_proteome_sequences: maximum number of hits to keep; ``None``
            keeps all of them.

    Returns:
        A list of ``(score, align_seq, identity_percentage, evalue,
        length_percentage)`` tuples. In Supermatcher mode the last two items
        are the placeholders ``0`` and ``1``. An empty list is returned when
        ``open_proteome`` yields no records.

    Raises:
        NoMatchForSeqException: when the alignment output cannot be parsed.
        BaseException: when ``mode`` is neither Supermatcher nor BLAST.
    """
    # Result file shared by both modes
    alignment_file = os.path.join(workdir, "curr_alignment.aln")
    # If Supermatcher was chosen as the alignment algorithm
    if mode.lower() == "supermatcher":
        # Unlike BLAST, Supermatcher only searches one strand, so a temp file
        # is written to serve as the Supermatcher bsequence; it holds either
        # the forward or the reverse-complemented records depending on the
        # "strand" parameter.
        records = open_proteome(proteome_file)
        if not records: return []
        if not strand:
            records = [
                i.reverse_complement(id=True,
                                     name=True,
                                     description=True,
                                     features=True,
                                     annotations=True,
                                     letter_annotations=True,
                                     dbxrefs=True) for i in records
            ]
        # Write the sequences to search
        proteome_fasta_file = os.path.join(workdir, "curr_proteome.fasta")
        SeqIO.write(records, proteome_fasta_file, "fasta")
        # Pick the scoring matrix.
        # NOTE(review): any other `source` value leaves `matrix` unbound and
        # fails below with a NameError -- confirm callers never pass one.
        if source.lower() == "protein" or source.lower() == "proteome":
            matrix = "EBLOSUM62"  # amino-acid matrix
        elif source.lower() == "nucleotide" or source.lower() == "genome":
            matrix = "EDNAFULL"
        # Build and execute the Supermatcher command
        cmd = SuperMatcherCommandline(asequence=ref_seq_file,
                                      bsequence=proteome_fasta_file,
                                      gapopen=10,
                                      gapextend=0.5,
                                      datafile=matrix,
                                      outfile=alignment_file)
        stdout, stderr = cmd()
        # Parse the resulting alignments
        alignments = []
        try:
            # List of MultipleSeqAlignment objects representing the
            # Supermatcher results (may be empty)
            align_seq_list = list(AlignIO.parse(
                alignment_file,
                "amir_emboss"))
            # Only the first alignment is used if per_proteome_sequences==1;
            # all alignments are used if per_proteome_sequences is None
            for number, alignment in enumerate(
                    align_seq_list[0:per_proteome_sequences]):
                # SeqRecord objects: [0] is the query and [1] is the subject
                align_seq = alignment[1]
                # Remove gaps
                align_seq._set_seq(align_seq.seq.ungap("-"))
                # Number the records only when several hits can be returned
                if (not per_proteome_sequences or per_proteome_sequences > 1
                    ) and len(align_seq_list) > 1:
                    usable_name = proteome_name + "_" + str(number)
                else:
                    usable_name = proteome_name
                align_seq.name = usable_name
                align_seq.id = usable_name
                # Score and identity come from the text report itself
                score, identity_percentage = parse_supermatcher_result(
                    alignment_file, number)
                # 0 (evalue) and 1 (gene_percentage) are arbitrary
                # placeholders so the tuple has the same shape as in BLAST mode
                alignments.append(
                    (score, align_seq, identity_percentage, 0, 1))
            # Delete temp files
            os.remove(alignment_file)
            os.remove(proteome_fasta_file)
            return alignments
        # A tuple is required here: `except ValueError or IndexError` evaluates
        # the `or` first and therefore only ever caught ValueError.
        except (ValueError, IndexError):
            raise NoMatchForSeqException(proteome_fasta_file, ref_seq_file)
    # If BLAST was chosen as the alignment algorithm
    elif mode.upper() == "BLAST":
        # If the file is called XXXX.file_type.gz, the db name should be XXXX
        if proteome_file.endswith(".gz"):
            db_name_temp = ".".join(proteome_file.split(".")[:-2])
        # If the file is called XXXX.file_type, the db name should be XXXX
        else:
            db_name_temp = ".".join(proteome_file.split(".")[:-1])
        directory = os.path.dirname(proteome_file)
        directory_files = os.listdir(directory)
        db_base_name = os.path.basename(db_name_temp)
        # Look for the database index file (.nhr = nucleotide, .phr = protein).
        # NOTE(review): if none is found `db_name` stays unbound and the code
        # below fails with a NameError -- confirm the database always exists.
        for dir_file in directory_files:
            if db_base_name in dir_file and ".nhr" in dir_file:
                db_name = directory + "/" + dir_file.split(".nhr")[0]
            elif db_base_name in dir_file and ".phr" in dir_file:
                db_name = directory + "/" + dir_file.split(".phr")[0]
        # Make sure the proteome itself can be opened
        records = open_proteome(proteome_file)
        if not records: return []
        # Build the matching command (XML output, outfmt=5)
        if source.lower() == "protein" or source.lower() == "proteome":
            cmd = NcbiblastpCommandline(query=ref_seq_file,
                                        db=db_name,
                                        out=alignment_file,
                                        outfmt=5)
        elif source.lower() == "nucleotide" or source.lower() == "genome":
            cmd = NcbiblastnCommandline(query=ref_seq_file,
                                        db=db_name,
                                        out=alignment_file,
                                        outfmt=5,
                                        task="blastn")
        # Execute the command
        stdout, stderr = cmd()
        # Read the first BLAST record; the context manager closes the file
        # even when parsing fails
        try:
            with open(alignment_file) as result_handle:
                blast_record = list(NCBIXML.parse(result_handle))[0]
        except ValueError:
            raise NoMatchForSeqException(proteome_file, ref_seq_file)
        # Parse the resulting alignments
        alignments = []
        try:
            # Only the first alignment is used if per_proteome_sequences==1;
            # all alignments are used if per_proteome_sequences is None
            for number, alignment in enumerate(
                    blast_record.alignments[0:per_proteome_sequences]):
                # The first HSP holds all the details about the alignment
                hsp = alignment.hsps[0]
                sequence = hsp.sbjct
                score = hsp.score
                evalue = hsp.expect
                identities = hsp.identities
                query_length = blast_record.query_letters
                align_length = hsp.align_length
                # Fraction of identical positions within the alignment
                identity_percentage = float(identities) / align_length
                if (not per_proteome_sequences or per_proteome_sequences > 1
                    ) and len(blast_record.alignments) > 1:
                    name = proteome_name + "_" + str(number)
                else:
                    name = proteome_name
                # Fraction of the query covered by the alignment
                percentage = float(align_length) / query_length
                # Convert to a gap-free SeqRecord
                align_seq = SeqRecord(Seq(sequence, IUPAC.protein),
                                      id=name,
                                      name=name,
                                      description=name)
                align_seq._set_seq(align_seq.seq.ungap("-"))
                alignments.append((score, align_seq, identity_percentage,
                                   evalue, percentage))
        except IndexError:
            # An alignment without HSPs ends the loop; whatever was collected
            # so far is still returned.
            pass
        # Remove temp file
        os.remove(alignment_file)
        return alignments
    # Not BLAST or Supermatcher!
    else:
        raise BaseException(
            "Only Supermatcher and BLAST modes are currently supported.")
Ejemplo n.º 5
0
 def final_fasta(self, path_to_fasta):
     """Write the final FASTA for one input file, adding at most one contig.

     Every record whose id does not contain "Contig" is copied to
     ``self.path_to_final/<file name>``. If some contig id prefix (the text
     before the first "_") has no record id containing "_TR_1_x_" with the
     same prefix, the candidate contigs are read from
     ``self.path_to_prime/<file name>``, the final FASTA is blasted against
     them and the contig chosen by ``self.__get_best_contig`` is appended.

     Args:
         path_to_fasta: path object of the input FASTA (its ``name`` and
             ``stem`` attributes are used).
     """
     records_id = [
         record.id for record in SeqIO.parse(path_to_fasta, "fasta")
     ]
     rank_prefixes = {
         rank_id.split("_")[0] for rank_id in records_id
         if "_TR_1_x_" in rank_id
     }
     other_id = [other for other in records_id if "Contig" in other]
     # Set copy for the O(1) membership test in the loop below
     other_id_lookup = set(other_id)
     other_prefixes = {oth_id.split("_")[0] for oth_id in other_id}
     # Contig prefixes that have no "_TR_1_x_" counterpart
     require_pref = other_prefixes - rank_prefixes

     ffasta_path = self.path_to_final.joinpath(path_to_fasta.name)
     prime_tmp_fasta_path = self.path_to_prime.joinpath(
         f"{path_to_fasta.stem}_tmp.fasta")
     prime_fasta_path = self.path_to_prime.joinpath(path_to_fasta.name)

     with open(ffasta_path, "w") as ffasta:
         for record in SeqIO.parse(path_to_fasta, "fasta"):
             if record.id not in other_id_lookup:
                 SeqIO.write(record, ffasta, "fasta")
     if not require_pref:
         return

     # Candidate contigs become the BLAST subject
     tmp_id = self.__tmp_other_id(require_pref, other_id)
     with open(prime_tmp_fasta_path, "w") as tmp, \
             open(prime_fasta_path) as src:
         for record in SeqIO.parse(src, "fasta"):
             if record.id in tmp_id:
                 SeqIO.write(record, tmp, "fasta")

     outfmt = "6 qseqid sseqid slen qcovhsp"
     cline = NcbiblastnCommandline(query=ffasta_path,
                                   subject=prime_tmp_fasta_path,
                                   out="-",
                                   outfmt=outfmt,
                                   task=self.task)
     # out="-" sends the table to stdout, the first item of the result
     output = cline()[0].strip()
     rows = [line.split() for line in output.splitlines()]
     cols = ["qseqid", "sseqid", "slen", "qcovhsp"]
     data_types = {
         "qseqid": str,
         "sseqid": str,
         "slen": int,
         "qcovhsp": float
     }
     # The columns are named at construction, so no later rename is needed
     b_tab = pd.DataFrame(rows, columns=cols).astype(data_types)
     if b_tab.empty:
         return

     best_contig_id = self.__get_best_contig(b_tab)
     with open(ffasta_path, "a") as ffasta, \
             open(prime_tmp_fasta_path) as tmp:
         for record in SeqIO.parse(tmp, "fasta"):
             if record.id == best_contig_id:
                 SeqIO.write(record, ffasta, "fasta")
Ejemplo n.º 6
0
def identify_DNA_chain(test_seq, evalue=100):
    '''
    Identify a DNA chain by a local blastn search against the known
    nucleosome positioning sequences (NPS).

    input:
    test_seq - either Bio.Seq object, or string
    evalue - blast evalue threshold (default 100)
    returns:
    (nps_identified, direction)
    nps_identified - second whitespace-separated word of the best hit's
                     title, 'telomeric_human' for the telomeric fallback,
                     or None when nothing matched
    direction - 'DNAtop' or 'DNAbot' (None when nothing matched or the hit
                reports an unexpected strand)
    raises:
    TypeError - if test_seq is neither a string nor a Bio.Seq
    # TODO: the telomeric fallback is a plain substring check; replace it
    '''
    # Accept both input kinds. The previous check raised TypeError for Seq
    # objects even though they are documented as valid input.
    if isinstance(test_seq, str):
        test_seq = Seq(test_seq)
    elif not isinstance(test_seq, Seq):
        raise TypeError("Test sequence must be either Bio.Seq or string")

    # Merge all reference sequences into one list
    nps_seeds = {
        'all': os.path.join(DATA_PATH, 'positioning_sequences',
                            'sequences.fasta')
    }
    nps_seq_records = []
    for nps_seed in nps_seeds.values():
        nps_seq_records.extend(SeqIO.parse(nps_seed, "fasta"))

    # Remove gaps from MSA
    for record in nps_seq_records:
        record.seq = record.seq.ungap("-")

    # Random file names for the database (n1) and the query (n2)
    n1 = str(uuid.uuid4())
    n2 = str(uuid.uuid4())
    with tempfile.TemporaryDirectory() as TEMP:
        query_path = os.path.join(TEMP, n2 + '.fasta')
        xml_path = os.path.join(TEMP, n1 + '.xml')

        SeqIO.write([SeqRecord(test_seq, id='Query', name='Query')],
                    query_path, 'fasta')

        SeqIO.write(nps_seq_records, os.path.join(TEMP, n1 + '.faa'), "fasta")
        os.system(
            'makeblastdb -dbtype nucl -in %s.faa -out %s.db > /dev/null' %
            (os.path.join(TEMP, n1), os.path.join(TEMP, n1)))

        blastn_cline = NcbiblastnCommandline(
            query=query_path,
            db=os.path.join(TEMP, n1 + '.db'),
            evalue=evalue,
            outfmt=5,
            strand='both',
            word_size=20,
            perc_identity=90,
            out=xml_path)
        stdout, stderr = blastn_cline(cwd=TEMP)

        with open(xml_path, 'r') as xml_handle:
            blast_record = NCBIXML.read(xml_handle)

        # One (e-value, title, hsp) entry per HSP of every alignment
        hits = [(hsp.expect, alignment.title, hsp)
                for alignment in blast_record.alignments
                for hsp in alignment.hsps]

        if hits:
            # min() keeps the first entry among equal e-values
            _, best_title, best_hsp = min(hits, key=lambda hit: hit[0])
            nps_identified = best_title.split()[1]
            # strand is a (query, subject) pair; the subject strand decides
            direction = {'Plus': 'DNAtop',
                         'Minus': 'DNAbot'}.get(best_hsp.strand[1])
        elif 'TTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGG' in str(test_seq):
            direction = 'DNAtop'
            nps_identified = 'telomeric_human'
        elif 'CCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA' in str(test_seq):
            direction = 'DNAbot'
            nps_identified = 'telomeric_human'
        else:
            direction = nps_identified = None

    return nps_identified, direction
Ejemplo n.º 7
0
def blast(dbname, blast_program, query, evalue_threshold=0.001):
    """Run blastn (or tblastn) for one query sequence and return its hits.

    The query is written to a temporary FASTA file, the BLAST binary located
    under ``settings.NCBI_BIN_DIR`` is executed with XML output, and every
    HSP is converted into a ``Blast_Result``. Both temporary files are
    removed before returning, whether or not the search succeeded.

    Args:
        dbname: BLAST database to search.
        blast_program: 'tblastn' selects tblastn; any other value runs blastn.
        query: raw query sequence (without FASTA header).
        evalue_threshold: e-value cut-off passed to BLAST.

    Returns:
        list of ``Blast_Result`` objects; empty when BLAST exits non-zero.
    """
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        infile = f.name
        f.write(">Query\n%s\n" % query)

    outfile = "%s.out.xml" % infile
    if blast_program == 'tblastn':
        wrapper = NcbitblastnCommandline
    else:
        wrapper = NcbiblastnCommandline
    blast_cl = wrapper(query=infile, db=dbname,
                       evalue=evalue_threshold,
                       word_size=6, outfmt=5, out=outfile)

    # NOTE(review): the command is split on single spaces, so this presumably
    # breaks for paths or database names containing spaces -- confirm.
    cl = "%s/%s" % (settings.NCBI_BIN_DIR, str(blast_cl))

    try:
        r = subprocess.call(cl.split(" "))
        if r != 0:
            # Parenthesised single argument: valid on Python 2 and Python 3
            print("Blast failed: %s" % cl)
            return []

        results = []
        with open(outfile, "r") as handle:
            blast_record = NCBIXML.read(handle)
            for alignment in blast_record.alignments:
                accession = Blast_Accession(alignment.accession)
                for hsp in alignment.hsps:
                    if accession.fragment_length is not None:
                        # Skip hits that start and end beyond the fragment
                        # length.
                        if hsp.sbjct_start > accession.fragment_length and \
                           hsp.sbjct_end > accession.fragment_length:
                            continue
                        # don't apply '% accession.fragment_length' to
                        # sbjct_start/end. Blast_Result#strand compares
                        # sbjct_start and sbjct_end to determine which strand
                        # the hit is on. Caller should just handle when
                        # sbjct_start/end is greater than fragment length.
                        # alternatively, we can store strand explicit, but
                        # that also creates complexity when using
                        # sbjct_start/end coordinates.

                    result = Blast_Result(
                        fragment_id=accession.fragment_id,
                        fragment_length=accession.fragment_length,
                        hit_def=alignment.hit_def,
                        query_start=hsp.query_start,
                        query_end=hsp.query_end,
                        subject_start=hsp.sbjct_start,
                        subject_end=hsp.sbjct_end,
                        evalue=hsp.expect,
                        alignment=dict(query=hsp.query,
                                       match=hsp.match,
                                       matchi=inverse_match(hsp.match),
                                       subject=hsp.sbjct))
                    results.append(result)
        return results
    finally:
        # Clean up on every exit path; the output file may not exist when
        # BLAST failed before writing it.
        for path in (infile, outfile):
            if os.path.exists(path):
                os.unlink(path)
Ejemplo n.º 8
0
def test_primers(args):
    """Blast primers against a reference and optionally add thermodynamics.

    Steps:
      1. build a nucleotide BLAST database from the reference,
      2. run ``blastn-short`` with the primers as query,
      3. write the hits, preceded by a header line, to
         ``<primers>.blast.tsv`` in the output directory,
      4. unless ``args.skip_tm`` is set, run the thermodynamic check on the
         hits and write ``<primers>.blast.TM.tsv``.

    Args:
        args: parsed command-line namespace. ``Reference``, ``Primers``,
            ``Output``, ``processes``, ``tm_offset`` and ``tm_size`` are read
            as one-element sequences; ``skip_tm`` is read as a flag.
    """
    ref = args.Reference[0]
    primers = args.Primers[0]
    out = args.Output[0]
    nproc = args.processes[0]
    tm_offset = args.tm_offset[0]
    tm_size = args.tm_size[0]
    skip_tm = args.skip_tm

    # File Handler
    iTFH = TFH(ref, primers, out)
    primers_base = os.path.basename(iTFH.primers)

    # 2. Run makeblastdb
    db = os.path.join(iTFH.outdir, os.path.basename(iTFH.ref) + ".db")
    cline = NcbimakeblastdbCommandline(dbtype="nucl",
                                       input_file=iTFH.ref,
                                       out=db)
    print("Building BLAST Database...")
    print(cline)
    run(str(cline))

    # 3. Run short-blast
    result = os.path.join(iTFH.outdir, primers_base + ".blast.tsv")
    result_tmp = os.path.join(iTFH.outdir, primers_base + ".tmp")
    cline = NcbiblastnCommandline(
        query=iTFH.primers,
        db=db,
        task="blastn-short",
        num_threads=nproc,
        outfmt=
        "6 qseqid sseqid sstart send mismatch qlen length pident qseq sseq",
        out=result_tmp)
    print("Running short-BLAST...")
    print(cline)
    run(str(cline))

    # Prepend the header line to the raw BLAST table; both handles are closed
    # even if the copy fails.
    with open(result, "w") as dst, open(result_tmp, "r") as src:
        dst.write(
            "#PrimerName\tTargetName\tTargetStart\tTargetEnd\t#Mismatches\tPrimerLength\tAlignedLength\t%Identity\tPrimerSeq\tContigSeq\n"
        )
        dst.writelines(src)
    os.remove(result_tmp)

    if skip_tm:
        return

    # 4. Thermodynamics of BLAST results
    print("Running thermodynamic check on blast results...")
    tm_result_file = os.path.join(iTFH.outdir, primers_base + ".blast.TM.tsv")
    chunks = to_chunks(result, nproc)
    tm_result = run_thermodynamics(chunks, nproc, iTFH.ref, tm_offset, tm_size)

    # Same columns as the BLAST table plus the five thermodynamic columns
    with open(tm_result_file, "w") as dst:
        dst.write(
            "#PrimerName\tTargetName\tTargetStart\tTargetEnd\t#Mismatches\tPrimerLength\tAlignedLength\t%Identity\tPrimerSeq\tContigSeq\tPrimerTM\tHeteroDimerTM\tHeteroDimerDG\t3EndStabilityTM\t3EndStabilityDG\n"
        )
        for line in tm_result:
            dst.write(line + "\n")
Ejemplo n.º 9
0
def blast_product(product, tmp_dir, db, string_min,
                  string_max, subunit_length):
    """BLAST a product against ``db`` and classify its hits.

    The product is written to a fresh temporary directory below ``tmp_dir``,
    searched with blastn (e-value 0.01, XML output) and every alignment with
    exactly one HSP is classified as a self alignment, a conflicting hit, a
    match exceeding the subunit length, or good. The temporary directory is
    removed on every exit path.

    Args:
        product: sequence record(s) accepted by ``SeqIO.write``.
        tmp_dir: parent directory for the temporary working directory.
        db: BLAST database to search.
        string_min: lower bound (percent, exclusive) of the identity range
            that marks a hit as conflicting.
        string_max: upper bound (percent, exclusive) of that range.
        subunit_length: longest tolerated run of consecutive identical bases.

    Returns:
        ``(blast_data, None)`` on success, where ``blast_data`` holds the
        parsed record, the overall ``primer_status`` ('Bad' or 'Suitable')
        and the classified alignments; ``('', stderr)`` when blastn fails.
    """
    blast_dir = mkdtemp(dir=tmp_dir)
    try:
        queryFileName = blast_dir + '/query'
        outFileName = blast_dir + '/output.xml'

        SeqIO.write(product, queryFileName, 'fasta')
        cline = NcbiblastnCommandline(
            cmd='blastn',
            query=queryFileName,
            out=outFileName,
            outfmt=5,
            db=db,
            evalue=0.01)
        try:
            cline()
        except ApplicationError as err:
            return('', err.stderr)

        # The record is fully parsed here, so the files can go afterwards
        with open(outFileName) as result_handle:
            blast_record = NCBIXML.read(result_handle)
    finally:
        shutil.rmtree(blast_dir)

    # Runs of at least 20 '|' characters in the HSP midline
    midline_regex = re.compile(r"\|{20,}")
    alignment_status = ''
    self_alignments = []
    conflicting_alignments = []
    matching_alignments = []
    # NOTE(review): this single list is shared by every alignment's
    # 'reasons' entry, so reasons accumulate across alignments -- confirm
    # that this is intended.
    reasons = []

    # counter for tracking number of self hits
    selfhits = 0
    # counter for tracking conflicting hits identified
    conflicting = 0
    # counter for tracking hits with match exceeding subunit length
    matching = 0

    for alignment in blast_record.alignments:

        alignment_data = {
            'accession': alignment.hit_id,
            'description': alignment.hit_def,
            'subj_length': alignment.length,
        }
        hsp_count = 0
        hsp_idents = []
        # lengths of consecutive bases...
        hsp_match_lengths = []
        hsp_alignments = []
        hsp_hit_lengths = []

        # Original RNAit implementation reports single value for identity,
        # which is tricky without tiling HSPs. We'll use some slightly
        # different criteria here.

        # >1 hsp suggests the alignment is to a repetitive sequence which is
        # unlikely to amplify cleanly so mark these as bad

        for hsp in alignment.hsps:
            hsp_count += 1
            # Length of the first stretch of >=20 identical bases; an HSP
            # without such a stretch yields no regex match and counts as 0.
            match = midline_regex.search(hsp.match)
            match_len = match.end() - match.start() if match else 0
            ident = (hsp.identities / hsp.align_length)
            hsp_idents.append(ident)
            hsp_match_lengths.append(match_len)
            hsp_hit_lengths.append(hsp.align_length)

            # pretty format alignment
            text_alignment = format_alignment(hsp)
            hsp_alignments.append(text_alignment)

        if (hsp_count == 1):
            length_cov = hsp_match_lengths[0] / blast_record.query_letters
            if (hsp_idents[0] > 0.99 and length_cov == 1):
                alignment_status = 'Self alignment'
                self_alignments.append(alignment_data)
                selfhits += 1
                if selfhits > 1:
                    reasons.append('Multiple self hits')
            elif (hsp_idents[0] * 100 > string_min and hsp_idents[0] * 100 < string_max):
                alignment_status = 'Conflicting hits'
                conflicting_alignments.append(alignment_data)
                conflicting += 1
                reasons.append("Identity is %s" % (hsp_idents[0]))
            elif (hsp_match_lengths[0] > subunit_length):
                alignment_status = 'Match exceeding subunit length'
                matching_alignments.append(alignment_data)
                matching += 1
                reasons.append(
                    "%s bp identical sequence" %
                    (hsp_match_lengths[0]))
            else:
                alignment_status = 'Good'
        else:
            alignment_status = 'Multiple HPSs'

        hsp_idents = list(map(format_ident, hsp_idents))
        alignment_data['status'] = alignment_status
        alignment_data['reasons'] = reasons
        alignment_data['hsps'] = hsp_count
        alignment_data['ident'] = ";".join(map(str, hsp_idents))
        alignment_data['hsp_alignments'] = hsp_alignments
        alignment_data['hsp_hit_lengths'] = ";".join(map(str, hsp_hit_lengths))

    # Any problem category makes the primer unsuitable
    if selfhits > 1 or conflicting or matching:
        primer_status = 'Bad'
    else:
        primer_status = 'Suitable'

    blast_data = {
        'record': blast_record,
        'primer_status': primer_status,
        'self_hits': selfhits,
        'self_alignments': self_alignments,
        'conflicting_alignments': conflicting_alignments,
        'matching_alignments': matching_alignments,
    }

    return(blast_data, None)
Ejemplo n.º 10
0
# Locate the BLAST database
leader_filename = '../Genome/' + ECgenome

blast_database_file = leader_filename + ".fasta"

# The database itself is not built here (the call is disabled); the command
# that produces it is:
#os.system("makeblastdb -in %s -dbtype nucl -title %s_BLAST_DB -out %s_BLAST_DB" % (blast_database_file, leader_filename, leader_filename))

#
# Do BLAST via BioPython, reading the result in XML format (the file is
# larger than necessary in this case, but it is more portable).
#

blast_output_file = blast_leader + '_' + ECgenome + ".blast_output"

blastn_cline = NcbiblastnCommandline(query=blast_input_file,
                                     db=leader_filename + "_BLAST_DB",
                                     task='blastn-short',
                                     evalue='0.1',
                                     outfmt=5,
                                     out=blast_output_file)

print(blastn_cline)
# Should not really produce standard output or error.
stdout, stderr = blastn_cline()

# Index the records by the query text before the first ':'. The parser is a
# lazy generator, so it must be consumed while the file is still open; the
# context manager then closes the handle.
with open(blast_output_file) as result_handle:
    blast_records = NCBIXML.parse(result_handle)
    recordmap = {
        record.query.split(':')[0]: record
        for record in blast_records
    }
Ejemplo n.º 11
0
# For every "<name> <sequence>" entry: blast the sequence against the
# converted ancestor file and collect the bit scores per hit node.
for item in testseqs:
    fields = item.split()
    nam = fields[0]
    if nam not in identdict:
        identdict[nam] = {}
    seqnams.add(nam)
    # The query file holds only the bare sequence
    with open("query.txt", 'w') as fi:
        fi.write(fields[1])
    for i in range(1, 2):
        # Convert the nexus file to the FASTA file used as BLAST subject
        call([
            "python", "convertformat.py", "-i", "nexus", "fasta",
            "ancestorT{:d}.nex".format(i)
        ])
        blastn_run = NcbiblastnCommandline(
            query="query.txt",
            subject="ancestorT{}.fasta".format(i),
            evalue=0.001,
            outfmt=5,
            out="test.xml")
        stdout, stderr = blastn_run()
        # The context manager closes test.xml before the next iteration
        # overwrites it.
        with open("test.xml") as result_handle:
            blast_records = NCBIXML.parse(result_handle)
            for rec in blast_records:
                for alignment in rec.alignments:
                    # The node name is the last word of the hit title
                    node = alignment.title.split()[-1]
                    if node not in identdict[nam]:
                        identdict[nam][node] = []
                    identdict[nam][node].extend(
                        hsp.bits for hsp in alignment.hsps)

for nam in identdict:
    sumdict = []
Ejemplo n.º 12
0
def split_to_exons():
    """Split the best-hit loci into exons and extract the matching sequences.

    Takes no arguments and returns nothing. It relies on names that are not
    defined in this function (``separat_exons``, ``best_separate_exons``,
    ``best_hits``, ``probes``, ``blast_task``, ``result_file`` and
    ``result_file2``), which must be available from the enclosing scope.

    Steps performed:

    1. Write the exons belonging to every best hit to ``best_separate_exons``.
    2. Build a nucleotide BLAST database from ``probes`` and BLAST the
       selected exons against it (tabular output).
    3. Keep only hits whose query prefix equals the subject, rank them, keep
       one hit per exon and overwrite the BLAST result file with those hits.
    4. Write the matched probe regions to ``result_file`` and the matched
       exon regions to ``result_file2``.
    """
    print('Splitting best hits to exons...')
    # Load every exon record into a dict produced by SeqIO.to_dict.
    with open(separat_exons) as all_exons:
        all_exons_parsed = SeqIO.to_dict(
            SeqIO.parse(all_exons, 'fasta', generic_dna))
    with open(best_separate_exons, 'w') as best_exons:
        for besthit in best_hits:
            # Second whitespace field, up to its first '-', names the locus;
            # the first field names the probe.
            locus = besthit.split()[1].split('-')[0]
            probe = besthit.split()[0]
            # Membership test against the dict key: every exon whose key
            # contains the locus is selected.
            exons = [
                val for key, val in all_exons_parsed.items() if locus in key
            ]
            for exon in exons:
                name = str(exon.id)
                sequence = str(exon.seq)
                # Prefix the exon name with the probe so both can be
                # recovered from the BLAST query id later on.
                best_exons.write(f'>{probe}_{name}\n{sequence}\n')
    # The probes file doubles as the name of the BLAST database.
    NcbimakeblastdbCommandline(dbtype='nucl',
                               input_file=probes,
                               out=probes,
                               parse_seqids=True)()
    # Tabular columns, by index after split():
    # 0 qaccver, 1 saccver, 2 pident, 3 qcovhsp, 4 evalue, 5 bitscore,
    # 6 sstart, 7 send, 8 qstart, 9 qend
    NcbiblastnCommandline(
        task=blast_task,
        query=best_separate_exons,
        db=probes,
        out=f'{best_separate_exons}_against_{probes}.txt',
        num_threads=4,
        outfmt=
        '6 qaccver saccver pident qcovhsp evalue bitscore sstart send qstart qend'
    )()
    with open(f'{best_separate_exons}_against_{probes}.txt'
              ) as new_blast_results:
        hits = new_blast_results.readlines()
    cleaned_hits = []
    for hit in hits:
        # Keep a hit only when the query id up to its first '_' is exactly
        # the subject id.
        if hit.split()[0].split('_')[0] == hit.split()[1]:
            cleaned_hits.append(hit)
    # list.sort is stable, so the LAST sort below is the primary key and each
    # earlier sort only breaks ties of the later ones. Effective priority:
    # query '-' field 2, then the exon number, then qcovhsp (descending),
    # pident (descending), evalue (ascending), bitscore (descending).
    cleaned_hits.sort(key=lambda x: float(x.split()[5]), reverse=True)
    cleaned_hits.sort(key=lambda x: float(x.split()[4]))
    cleaned_hits.sort(key=lambda x: float(x.split()[2]), reverse=True)
    cleaned_hits.sort(key=lambda x: float(x.split()[3]), reverse=True)
    cleaned_hits.sort(
        key=lambda x: int(x.split()[0].split('-')[3].split('_')[1]))
    cleaned_hits.sort(key=lambda x: x.split()[0].split('-')[2])
    # De-duplicate: the first (best-ranked) hit of every query id wins.
    hits_exons = set()
    cleaned_dedup_hits = []
    for cleaned_hit in cleaned_hits:
        if cleaned_hit.split()[0] not in hits_exons:
            cleaned_dedup_hits.append(cleaned_hit)
            hits_exons.add(cleaned_hit.split()[0])
    # Final order: by the subject's '-' field 1, then by exon number.
    cleaned_dedup_hits.sort(
        key=lambda x: int(x.split()[0].split('-')[3].split('_')[1]))
    cleaned_dedup_hits.sort(key=lambda x: x.split()[1].split('-')[1])
    # Overwrite the raw BLAST table with the filtered, ordered hits.
    with open(f'{best_separate_exons}_against_{probes}.txt',
              'w') as new_blast_results:
        for cleaned_hit in cleaned_dedup_hits:
            new_blast_results.write(cleaned_hit)
    with open(probes) as probes_to_parse:
        probes_as_dict = SeqIO.to_dict(
            SeqIO.parse(probes_to_parse, 'fasta', generic_dna))
    with open(best_separate_exons) as best_exons:
        best_exons_as_dict = SeqIO.to_dict(
            SeqIO.parse(best_exons, 'fasta', generic_dna))
    with open(result_file, 'w') as resultfile, open(result_file2,
                                                    'w') as resultfile2:
        for cleaned_dedup_hit in cleaned_dedup_hits:
            name_of_locus = cleaned_dedup_hit.split()[1]
            name_of_exon = cleaned_dedup_hit.split()[0]
            num_exon = cleaned_dedup_hit.split()[0].split('-')[3].split('_')[1]
            # Subject (probe) coordinates: start > end means the hit lies on
            # the reverse strand, so swap the bounds and reverse-complement.
            # BLAST positions are 1-based inclusive, hence "start - 1".
            if int(cleaned_dedup_hit.split()[6]) > int(
                    cleaned_dedup_hit.split()[7]):
                start = int(cleaned_dedup_hit.split()[7])
                end = int(cleaned_dedup_hit.split()[6])
                sequence = str(probes_as_dict[name_of_locus]
                               [start - 1:end].seq.reverse_complement())
            else:
                start = int(cleaned_dedup_hit.split()[6])
                end = int(cleaned_dedup_hit.split()[7])
                sequence = str(probes_as_dict[name_of_locus][start -
                                                             1:end].seq)

            resultfile.write(f'>{name_of_locus}_exon_{num_exon}\n{sequence}\n')
            # Same extraction for the query (exon) coordinates.
            if int(cleaned_dedup_hit.split()[8]) > int(
                    cleaned_dedup_hit.split()[9]):
                start_opt = int(cleaned_dedup_hit.split()[9])
                end_opt = int(cleaned_dedup_hit.split()[8])
                sequence_opt = str(best_exons_as_dict[name_of_exon]
                                   [start_opt -
                                    1:end_opt].seq.reverse_complement())
            else:
                start_opt = int(cleaned_dedup_hit.split()[8])
                end_opt = int(cleaned_dedup_hit.split()[9])
                sequence_opt = str(
                    best_exons_as_dict[name_of_exon][start_opt -
                                                     1:end_opt].seq)
            resultfile2.write(
                f'>{name_of_locus}_exon_{num_exon}\n{sequence_opt}\n')
    print('Done')
Ejemplo n.º 13
0
def _build_option_parser():
    """Build the optparse parser describing every option main() accepts."""
    usage = '''
    usage: %prog [options] arg \nProgram parses blast XML file, translates exons boundaries (annotated on reference sequences) to query sequences (e.g. transcript)"
            '''

    parser = OptionParser(usage, version='%prog version 1.0')

    parser.add_option("-r",
                      "--reference_fasta",
                      dest="REFERENCE_FASTA",
                      help="reference in fasta format")
    parser.add_option("-g",
                      "--gff_reference_fasta",
                      dest="GFF_REFERENCE_FASTA",
                      help="annotation for reference in gff3 format")
    parser.add_option("-q",
                      "--query_fasta",
                      dest="QUERY_FASTA",
                      help="query in fasta format")

    # The help text used to be followed by a triple-quoted string holding a
    # disabled default; adjacent literals concatenate, so the help wrongly
    # advertised default="blast_out.xml". This option has no default.
    parser.add_option("-b",
                      "--blast_db_path_and_name",
                      dest="BLAST_DB_PATH_AND_NAME",
                      help="blast+ database")
    parser.add_option("-v",
                      "--blast_xml_file",
                      dest="BLAST_XML_FILE",
                      help="blast results in xml file format",
                      default="blast_out.xml")
    parser.add_option("-f",
                      "--output_file",
                      dest="OUTPUT_FILE",
                      help="output file",
                      action="store",
                      type="string",
                      default=str(__name__) + ".txt")
    parser.add_option("-o",
                      "--output_folder",
                      dest="OUTPUT_FOLDER",
                      help="output folder",
                      default="./")
    parser.add_option("-e",
                      "--e_value_thresh",
                      dest="E_VALUE_THRESH",
                      help="threshold e-value",
                      default=1e-8)
    # NOTE(review): without action="store_true"/"store_false" a value given
    # on the command line is stored as a string, so any non-empty value
    # (even "False") is truthy. Confirm the intended usage.
    parser.add_option("-a",
                      "--only_best_Alignment",
                      dest="ONLY_BEST_ALIGNMENT",
                      help="take only 1, best q-s pair",
                      default=True)

    parser.add_option("-w",
                      "--blast_word_size",
                      dest="BLAST_WORD_SIZE",
                      help="blast word_size",
                      default=11)
    parser.add_option("-n",
                      "--blast_num_threads",
                      dest="BLAST_NUM_THREADS",
                      help="number of threads",
                      default=2)

    parser.add_option("-m",
                      "--blast_match_score",
                      dest="BLAST_MATCH_SCORE",
                      help="reward for nt match",
                      default=1)
    parser.add_option("-s",
                      "--blast_mismatch_score",
                      dest="BLAST_MISMATCH_SCORE",
                      help="penalty for nt mismatch",
                      default=-3)
    parser.add_option("-y",
                      "--blast_gap_open",
                      dest="BLAST_GAP_OPEN",
                      help="cost of opening a gap",
                      default=5)
    parser.add_option("-x",
                      "--blast_gap_extend",
                      dest="BLAST_GAP_EXTEND",
                      help="cost of gap extension",
                      default=2)
    return parser


def _log(log_handle, message):
    """Write *message* both to stdout and to the open log file."""
    sys.stdout.write(message)
    log_handle.write(message)


def _parse_gff_exons(gff_path, log_handle):
    """Map every annotated reference position to the exons that cover it.

    Returns ``{gene_name: {coordinate: [[exon_name, strand, last_exon], ...]}}``
    built from the tab-separated annotation file at *gff_path*. Blank lines
    and lines starting with '#' are skipped. A progress line is logged each
    time the gene name in column 1 changes.
    """
    gff_dic = {}
    s_prev_gen = ""
    with open(gff_path, "r") as gff_ref_hlr:
        for line_gff in gff_ref_hlr:
            # Directive/comment lines and blank lines carry no feature and
            # would otherwise crash the column parsing below.
            if not line_gff.strip() or line_gff.startswith("#"):
                continue

            line_gff_list = line_gff.split("\t")

            gene_name = line_gff_list[0]
            exon_start = int(line_gff_list[3])
            exon_end = int(line_gff_list[4])
            exon_strand = line_gff_list[6]
            # Attribute column: the first entry is "<key>=<exon name>", the
            # second entry is kept verbatim as the last-exon marker.
            attributes = line_gff_list[8].split(";")
            exon_name = attributes[0].split("=")[1]
            last_exon = attributes[1].strip()

            info_pack = [exon_name, exon_strand, last_exon]

            # Register the exon on every coordinate it spans (inclusive).
            coord_exons_dic = gff_dic.setdefault(gene_name, {})
            for x_coord in range(exon_start, exon_end + 1):
                coord_exons_dic.setdefault(x_coord, []).append(info_pack)

            if gene_name != s_prev_gen:
                s_prev_gen = gene_name
                _log(log_handle,
                     "Parsing gff for gene: {}\n".format(s_prev_gen))
    return gff_dic


def _report_hsp(hsp, alignment, blast_record, gff_dic, out_fasta_hlr,
                out_global_gff3_hlr, out_local_gff3_hlr):
    """Print one HSP, write its query sequence and its exon annotation."""
    alignment_geneName = str(alignment.hit_def)

    print('sequence:', alignment.title)

    print("len hsp.query", len(hsp.query))
    print("len hsp.sbjct", len(hsp.sbjct))
    print("len hsp.match", len(hsp.match))

    print("hsp.sbjct", str(hsp.sbjct))
    print("hsp.match", str(hsp.match))
    print("hsp.query", str(hsp.query))

    # coordinates: subject
    print("hsp.sbjct_start", hsp.sbjct_start)
    print("hsp.sbjct_end", hsp.sbjct_end)

    # coordinates: query
    print("hsp.query_start", hsp.query_start)
    print("hsp.query_end", hsp.query_end)

    # Raises KeyError when the hit has no entry in the annotation.
    coord_exons_dic = gff_dic[alignment_geneName]

    # generate alignment objects list
    query_geneName = str(blast_record.query)
    query_geneName_local = (
        query_geneName
        + "__q[" + str(hsp.query_start) + ":" + str(hsp.query_end) + "]"
        + "_s[" + str(hsp.sbjct_start) + ":" + str(hsp.sbjct_end) + "]"
        + "_" + alignment_geneName)
    alignment_object_list = get_hsp_alignment_object_list(
        hsp, alignment_geneName, query_geneName, query_geneName_local)

    query_seq = "".join([xx.q for xx in alignment_object_list])
    out_fasta_hlr.write(">" + query_geneName_local + "\n")
    out_fasta_hlr.write(query_seq + "\n")

    # set exons info into alignment objects
    for al_obj_in_hsp in alignment_object_list:
        if al_obj_in_hsp.position_subject in coord_exons_dic:
            al_obj_in_hsp.set_exons(
                coord_exons_dic[al_obj_in_hsp.position_subject])

    # global & local gff output
    extract_and_write_gff(alignment_object_list, out_global_gff3_hlr,
                          out_local_gff3_hlr)


def _translate_alignments(options, gff_dic):
    """Parse the BLAST XML file and write the FASTA / GFF3 result files."""
    e_value_thresh = float(options.E_VALUE_THRESH)

    # NOTE(review): by now the working directory is this script's directory,
    # so a relative OUTPUT_FOLDER resolves against it for these three files,
    # while the log file was opened before that directory switch.
    out_prefix = options.OUTPUT_FOLDER + os.sep + "exons_alignment_by_blast_out"

    with open(out_prefix + "_local.gff3", "w") as out_local_gff3_hlr, \
            open(out_prefix + "_global.gff3", "w") as out_global_gff3_hlr, \
            open(out_prefix + ".fasta", "w") as out_fasta_hlr, \
            open(options.BLAST_XML_FILE) as result_handle:
        for blast_record in NCBIXML.parse(result_handle):
            for alignment in blast_record.alignments:
                for hsp in alignment.hsps:
                    if hsp.expect < e_value_thresh:
                        _report_hsp(hsp, alignment, blast_record, gff_dic,
                                    out_fasta_hlr, out_global_gff3_hlr,
                                    out_local_gff3_hlr)

                # Only the first alignment of each record is used when the
                # "best alignment" switch is on.
                if options.ONLY_BEST_ALIGNMENT:
                    break


def main(args=[]):
    """Translate exon boundaries from a reference annotation onto queries.

    Workflow: parse the options, load the exon annotation, run blastn of the
    query FASTA against the given BLAST database (XML output), then for every
    HSP below the e-value threshold write the aligned query sequence and the
    exon annotation through ``get_hsp_alignment_object_list`` and
    ``extract_and_write_gff``.

    :param args: list of command-line arguments (without the program name).
                 The list default is never mutated here; it is kept so that
                 calling ``main()`` parses no arguments at all.
    :returns: None. Returns early, after printing a message, when the output
              folder does not exist.
    """
    parser = _build_option_parser()
    options, _unused_positional = parser.parse_args(args)

    # --- Entering program

    t_st = time.time()

    if not os.path.isdir(options.OUTPUT_FOLDER):
        sys.stdout.write('\nWrong output directory!')
        return

    os.chdir(options.OUTPUT_FOLDER)

    logging_file = "log_output"
    if options.OUTPUT_FILE != "":
        logging_file = options.OUTPUT_FILE

    # The context manager guarantees the log is flushed and closed even if a
    # later step raises.
    with open(options.OUTPUT_FOLDER + os.sep + logging_file + ".log",
              "w") as log_info_hlr:
        _log(log_info_hlr,
             "Entering program: {}\n".format(os.path.basename(__file__)))
        _log(log_info_hlr,
             "\nUsed options: {}\n".format("\n".join(
                 str(options).split(","))))

        # --- workspace: continue from the directory holding this script

        s = os.path.join(os.path.dirname(__file__), '.')
        os.chdir(s)
        print(os.getcwd())

        # --- parsing gff3 file

        _log(log_info_hlr,
             "Parsing {}\n".format(options.GFF_REFERENCE_FASTA))
        gff_dic = _parse_gff_exons(options.GFF_REFERENCE_FASTA, log_info_hlr)

        # --- blast analysis for each sequence

        from Bio.Blast.Applications import NcbiblastnCommandline

        blastn_cline = NcbiblastnCommandline(
            query=options.QUERY_FASTA,
            db=options.BLAST_DB_PATH_AND_NAME,
            evalue=float(options.E_VALUE_THRESH),
            outfmt=5,
            out=options.BLAST_XML_FILE,
            word_size=options.BLAST_WORD_SIZE,
            num_threads=options.BLAST_NUM_THREADS,
            reward=options.BLAST_MATCH_SCORE,
            penalty=options.BLAST_MISMATCH_SCORE,
            gapopen=options.BLAST_GAP_OPEN,
            gapextend=options.BLAST_GAP_EXTEND)
        blastn_cline()

        # --- analysis of blast alignment

        _translate_alignments(options, gff_dic)

    # --- closing program

    t_end = time.time()

    sys.stdout.write("\n\nWork done...")
    sys.stdout.write("\nProcess time [s]: " + str(t_end - t_st))
Ejemplo n.º 14
0
        db_fna += ">" + gene.name_id + '\n' + str(gene.seq) + '\n'
        db_trna += ">" + gene.name_id + '\n' + str(gene.seq) + '\n'

with open("blast_db/db.fna", 'w') as outfile_fna:
    outfile_fna.write(db_fna)
with open("blast_db/db.faa", 'w') as outfile_faa:
    outfile_faa.write(db_faa)
with open("blast_db/db_tRNA.fna", 'w') as outfile_trna:
    outfile_trna.write(db_trna)

# Build one BLAST database per FASTA file. The command is passed as an
# argument list (no shell involved) and check_call raises CalledProcessError
# when makeblastdb fails, instead of carrying on without a usable database.
for fasta_path, db_type, db_prefix in (
        ("blast_db/db.fna", "nucl", "blast_db/nt_db"),
        ("blast_db/db.faa", "prot", "blast_db/aa_db"),
        ("blast_db/db_tRNA.fna", "nucl", "blast_db/trna_db"),
):
    subprocess.check_call(["makeblastdb.exe", "-in", fasta_path,
                           "-dbtype", db_type, "-out", db_prefix])

#Names of BLAST dbs
#blast_db/nt_db
#blast_db/aa_db
#blast_db/trna_db

#########################################

# Tabular output layout shared by the three searches.
# NOTE(review): "gapope" is not a documented BLAST+ format specifier
# ("gapopen" is); it is left untouched because changing it would alter the
# column layout that downstream code reads. Confirm and fix together.
BLAST_TABULAR_FORMAT = '"6 qseqid sseqid pident qlen length mismatch gapope evalue bitscore qcovhsp"'

#BLAST the databases against themselves
nt_blast = NcbiblastnCommandline(cmd="blastn.exe", task='dc-megablast',
                                 out="blast_output/nt_blast.txt",
                                 outfmt=BLAST_TABULAR_FORMAT,
                                 query="blast_db/db.fna", db="blast_db/nt_db")
# NOTE(review): the blastn wrapper is used to launch blastp.exe here; this
# only works because cmd= overrides the executable. NcbiblastpCommandline
# would be the matching wrapper if it is imported in this file.
aa_blast = NcbiblastnCommandline(cmd="blastp.exe",
                                 out="blast_output/aa_blast.txt",
                                 outfmt=BLAST_TABULAR_FORMAT,
                                 query="blast_db/db.faa", db="blast_db/aa_db")
trna_blast = NcbiblastnCommandline(cmd="blastn.exe", task='dc-megablast',
                                   out="blast_output/trna_blast.txt",
                                   outfmt=BLAST_TABULAR_FORMAT,
                                   query="blast_db/db_tRNA.fna",
                                   db="blast_db/trna_db")
nt_blast()
aa_blast()
trna_blast()

################################################################
Ejemplo n.º 15
0
def blastSingle(item, query_virus_dir, output_dir, seqid, numThreads):
    """BLAST one query file against the host database and summarise the hits.

    The query is ``item`` inside ``query_virus_dir``; the tabular BLAST output
    is written to ``<output_dir>/<name>.blast`` where ``<name>`` is ``item``
    up to its first '.'. When ``seqid`` is not None it is passed to blastn as
    ``seqidlist``.

    Returns ``(False, None)`` when BLAST produced an empty output file,
    otherwise ``(True, frame)`` where ``frame`` is a one-row DataFrame indexed
    by the query name.
    """
    query_name = item.split('.')[0]
    query_file = os.path.join(query_virus_dir, item)
    output_file = os.path.join(output_dir, query_name) + '.blast'

    # Both call variants share every setting except the optional seqidlist.
    blast_settings = dict(
        query=query_file,
        db=db_host_prefix,
        out=output_file,
        outfmt="6 qacc sacc qstart qend qlen",
        evalue=0.01,
        gapopen=10,
        penalty=-1,
        reward=1,
        gapextend=2,
        word_size=11,
        perc_identity=90,
        num_threads=numThreads)
    if seqid is not None:  # restrict the search to the given sequence ids
        blast_settings['seqidlist'] = seqid
    NcbiblastnCommandline(**blast_settings)()

    # Parse blast results for a single file
    if os.stat(output_file).st_size == 0:
        return False, None

    # NOTE(review): this counts every character of the query file, not only
    # sequence letters; confirm this is the intended length.
    with open(query_file) as handle:
        query_len = len(handle.read())  # bp

    hits = pd.read_table(output_file, header=None)
    # need to make sure a same value for the last column
    # map headers to genome names
    hits[1] = [dict_genome[accession] for accession in list(hits[1])]

    # Per (query, genome): tuples of 0-based starts and ends, and the
    # smallest query length reported.
    aggregation = {
        2: lambda x: tuple(x - 1),
        3: lambda x: tuple(x - 1),
        4: min,
    }
    positions = hits.groupby([0, 1]).agg(aggregation)
    positions.index = positions.index.droplevel()

    fractions = positions.apply(lambda x: cal_perc(x), axis=1) / query_len
    per_genome = fractions.groupby(level=0, sort=False).apply(sum)
    return True, pd.DataFrame({query_name: per_genome}).T
Ejemplo n.º 16
0
def run_BLAST(query, database, args, cons_run):
    """
    Search a database for the query sequences stored in a multi-fasta file.

    Points worth knowing:
        * Low-complexity filtering is switched off (seg/dust = 'no'),
        * Only a single target sequence (top hit) is requested,
        * Results are written in XML format under BLAST_results/.

    # TODO: Add  evalue filtering ?
    # TODO: add task='blastn' to use blastn scoring ?

    .. warning:: the default blastn task is megablast

    .. warning:: tblastx functionality has not been checked

    :param query: the full path to the query multi-fasta file
    :param database: the full path of the database to search in
    :param args: the parsed command-line arguments
    :param cons_run: whether this is part of a mapping consensus run

    :type query: string
    :type database: string
    :type args: argparse args
    :type cons_run: boolean

    :returns: the path of the blast.xml file
    """
    query_stem = os.path.splitext(query.split('/')[-1])[0]
    database_stem = os.path.splitext(database.split('/')[-1])[0]
    # Consensus runs get a distinguishing prefix in the file name.
    prefix = "" if not cons_run else "cons_"
    outfile = os.path.join(
        "BLAST_results/",
        prefix + "DB=" + query_stem + "ID=" + database_stem + "_blast.xml")

    # Decide whether the query holds protein sequences; when the reference
    # type is not given, fall back to SeqFindr.util.is_protein().
    protein = False
    if args.reftype is None:
        if SeqFindr.util.is_protein(query) != -1:
            protein = True
            sys.stderr.write('%s is protein' % (query))
    elif args.reftype == 'prot':
        protein = True
        sys.stderr.write('%s is protein\n' % (query))

    # Pick the BLAST flavour and the settings specific to it.
    if protein:
        sys.stderr.write('Using tblastn\n')
        command_class = NcbitblastnCommandline
        specific = {'seg': 'no'}
    elif args.tblastx:
        sys.stderr.write('Using tblastx\n')
        command_class = NcbitblastxCommandline
        specific = {'seg': 'no'}
    else:
        sys.stderr.write('Using blastn\n')
        command_class = NcbiblastnCommandline
        specific = {'dust': 'no'}
        if not (args.short == False):
            sys.stderr.write('Optimising for short query sequences\n')
            specific['word_size'] = 7
            specific['evalue'] = 1000

    # Settings common to every flavour.
    settings = dict(query=query, db=database, outfmt=5,
                    num_threads=args.BLAST_THREADS,
                    max_target_seqs=1, out=outfile)
    if 'evalue' not in specific:
        settings['evalue'] = args.evalue
    settings.update(specific)
    run_command = command_class(**settings)

    sys.stderr.write(str(run_command)+"\n")
    run_command()
    return os.path.join(os.getcwd(), outfile)
Ejemplo n.º 17
0
    sys.exit()

mst_type = args.mst_type
tmp_path = "tmp"
os.mkdir(tmp_path)
if "".join(mst_type) == "all":
    for i in range(1, 31):
        db_file = "mst" + str(i) + ".fasta"
        tmp_out = "mst" + str(i) + ".tab"
        if not os.path.isfile(args.db_directory / db_file):
            print("Missing database file {}".format(db_file))
            sys.exit()
        else:
            query = NcbiblastnCommandline(query=args.query_sequence,
                                          db=args.db_directory / db_file,
                                          evalue=0.001,
                                          outfmt=6,
                                          out=tmp_path / tmp_out,
                                          ungapped=True)
else:
    for i in mst_type:
        if int(i) not in list(range(1, 31)):
            print("Invalid argument MST type {}".format(i))
        else:
            db_file = "mst" + str(i) + ".fasta"
            tmp_out = "mst" + str(i) + ".tab"
            if not os.path.isfile(args.db_directory / db_file):
                print("Missing database file {}".format(db_file))
                sys.exit()
            else:
                query = NcbiblastnCommandline(query=args.query_sequence,
                                              db=args.db_directory / db_file,
Ejemplo n.º 18
0
import csv
import os.path

import pandas as pd
from Bio.Blast import NCBIXML
from Bio.Blast.Applications import NcbiblastnCommandline

thermal = "16S_thermophile.fasta"
meso = "16S_mesophile.fasta"
output = "16S_blast.xml"
# Set to True to re-run the BLAST search instead of reusing the XML file.
_new = False

if _new:
    # 16S gene blast
    NcbiblastnCommandline(query=thermal, subject=meso, outfmt=5,
                          out=output)()[0]
    print("blast finished!")

# Both files are managed by the with-statement so they are closed (and the
# CSV flushed) even if parsing fails half-way.
with open(output, "r") as xml_handle, \
        open("16S_blast_org.csv", 'w', newline='') as f:
    # csv.writer quotes fields that contain commas; hit definitions are
    # free text, so writing them unquoted can shift the columns.
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(["query_seq", "hit_seq", "hit_len", "identity", "score",
                     "evalue"])
    for blast_record in NCBIXML.parse(xml_handle):
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                # Keep HSPs that contain gaps and are not self hits.
                if hsp.gaps != 0 and blast_record.query != alignment.hit_def:
                    writer.writerow([
                        blast_record.query, alignment.hit_def,
                        hsp.align_length, hsp.identities, hsp.score,
                        hsp.expect
                    ])
Ejemplo n.º 19
0
#help(NcbiblastnCommandline)
import os
import sys

# The LAST command-line argument is used as the data directory.
directory = sys.argv[-1]

#change this directory as needed. Note that local blast may experience issues if you're not working in the blast directory (here NCBI/blast-2.2.30+)
#directory = "C:/Users/madeleine/Documents/NCBI/phageParser/data"

for fn in os.listdir("%s/spacers" % directory):

    query1 = "%s/spacers/%s" % (directory, fn)
    # Output name: everything before the first '.' plus ".txt". Building it
    # from the stem avoids str.replace rewriting other occurrences of the
    # extension text inside the name, and copes with names without a dot.
    outfile1 = fn.partition('.')[0] + ".txt"
    outfile = "%s/phages/%s" % (directory, outfile1)

    # These parameters are more or less the same as the ones on PhagesDB.org
    blastn_obj = NcbiblastnCommandline(query=query1,
                                       db="phagedb",
                                       evalue=10,
                                       num_descriptions=100,
                                       num_alignments=100,
                                       dust="no",
                                       task="blastn",
                                       reward=1,
                                       penalty=-3,
                                       out=outfile)

    stdout, stderr = blastn_obj()
Ejemplo n.º 20
0
def blast_pacbio(input_data_filename, blast_database_name, output_data_filename, max_hsps, max_target_seqs):
    """Run blastn for the input file against a database, writing tabular hits.

    The output uses format 6 with the columns listed below and is written to
    ``output_data_filename``. Nothing is returned.
    """
    columns = (
        'qseqid', 'sseqid', 'pident', 'qcovhsp', 'length', 'mismatch',
        'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore',
        'staxids',
    )
    # The format specification is handed over wrapped in double quotes.
    quoted_format = '"' + '6 ' + ' '.join(columns) + '"'
    run_blastn = NcbiblastnCommandline(
        cmd='blastn',
        query=input_data_filename,
        db=blast_database_name,
        max_hsps=max_hsps,
        max_target_seqs=max_target_seqs,
        outfmt=quoted_format,
        out=output_data_filename,
    )
    run_blastn()
Ejemplo n.º 21
0
    def blast2circos_file(self,
                          blast,
                          reference,
                          blastn=False,
                          identity_cutoff=80):
        '''
        BLAST query sequences against a reference and write circos tracks.

        A nucleotide BLAST database is built from ``reference``; the
        sequences in ``blast`` are searched with tblastn by default, or with
        blastn when ``blastn`` is true. Hits whose identity value is at
        least ``identity_cutoff`` are written to ``circos_blast.txt``
        (contig, start, end) and ``circos_blast_labels.txt`` (contig, start,
        end, gene), with positions shifted by the contig offset stored in
        ``self.contigs_add``.

        :param blast: query file handed to BLAST
        :param reference: fasta file used to build the BLAST database
        :param blastn: use blastn instead of tblastn
        :param identity_cutoff: minimum identity for a hit to be written
        :return: None
        '''

        from TPutils import shell_command
        from TPutils import blast_utils
        from Bio.Blast.Applications import NcbitblastnCommandline
        from Bio.Blast.Applications import NcbiblastnCommandline

        # todo catch IO errors, other potential errors
        # NOTE(review): the reference path is interpolated into the command
        # string without quoting; paths with spaces may break. Confirm how
        # shell_command executes its argument.
        a, b, c = shell_command.shell_command(
            'makeblastdb -in %s -dbtype nucl' % (reference))
        print(c)
        if not blastn:
            blast_cline = NcbitblastnCommandline(
                query=blast,
                db=reference,
                evalue=0.00000001,  # 0.001
                outfmt=6,
                out="blast.tmp",
                max_target_seqs=1)
            print(blast_cline)
        else:
            blast_cline = NcbiblastnCommandline(query=blast,
                                                db=reference,
                                                evalue=0.001,
                                                outfmt=6,
                                                out="blast.tmp")
        stdout, stderr = blast_cline()

        print('############## BLAST ###################')

        blast2data, queries = blast_utils.remove_blast_redundancy(
            ["blast.tmp"], check_overlap=False)

        # Both track files are closed by the with-statement, including when
        # an error occurs while writing.
        with open('circos_blast.txt', "w") as o, \
                open('circos_blast_labels.txt', "w") as l:
            for contig in blast2data:
                # self.contigs_add is keyed by the contig name without '|'.
                cname = re.sub(r"\|", "", contig)
                for gene in blast2data[contig]:
                    hit = blast2data[contig][gene]
                    if float(hit[0]) >= identity_cutoff:  # 80,20
                        location = sorted(hit[1:3])
                        # Shift the hit by this contig's stored offset.
                        offset = self.contigs_add[cname][0]
                        o.write("%s\t%s\t%s\n" %
                                (contig, location[0] + offset,
                                 location[1] + offset))
                        l.write("%s\t%s\t%s\t%s\n" %
                                (contig, location[0] + offset,
                                 location[1] + offset, gene))
Ejemplo n.º 22
0
                    else:  # Unsuccessful. Stdout will be '1'. Entry was not found or some other error
                        blastn_log.error("There was an error with : %s" % str(Accession[2]))  # Log it.
                        break

                    # Create/Open a XML file that stores BLAST data for a particular Organism.
                    # By opening for writing, we can overwrite already existing xml files.
                    save_file = open("%s_%s.xml" % (Accession[1], Org), "w")

                    # Create a copy of the gi list file per taxonomy id to be used in blast
                    os.system("cp " + h + "/data/gi-lists/" + TAX + "gi " + TAX + "gi")

                    # Use Biopython's NCBIBlastnCommandline tool
                    result_handle1 = NcbiblastnCommandline(query="temp.fasta", db="refseq_rna",
                                                           strand="plus", evalue=0.001,  # DONT GO LOWER
                                                           out="%s_%s.xml" % (Accession[1], Org),
                                                           outfmt=5, gilist=TAX + "gi",
                                                           max_target_seqs=10, task="blastn")
                    stdout_str, stderr_str = result_handle1()
                    #blastn_log.info(result_handle1)  # log the result handle as a check.

                    # Remove the gi list obinary file from the current directory
                    os.remove(TAX + "gi")
                    blastn_log.info(TAX + "gi file has been deleted." + "\n")

                    # Remove the temp.fasta file in the directory
                    os.remove("temp.fasta")
                    blastn_log.info("The temp.fasta file has been deleted." + "\n")
                    blastn_log.info("%s_%s.xml" % (Accession[1],Org) + " is being parsed." + "\n")

# ------------------------------------------------------------------------------
Ejemplo n.º 23
0
    # if gene_name not in genefiles:

    seq1 = SeqRecord(gene_seq,
                     id=gene_name,
                     name=read.name,
                     description=read.description.translate(
                         string.maketrans(
                             "",
                             "",
                         ), bad_chard))
    gene = gene_dir + gene_name + ".fasta"
    SeqIO.write(seq1, gene, "fasta")
    # Run BLAST and parse the output as XML
    output = NcbiblastnCommandline(
        query=gene,
        subject="LacO_T0.fasta",
        dust="no",
        soft_masking="false",
        outfmt=5)()[0]  # -dust no -soft_masking false -outmft 6
    blast_result_record = NCBIXML.read(StringIO(output))

    # Print some information on the result
    if len(blast_result_record.alignments) == 0:  # no significant alignments
        lost_genes.update({gene_name: {'LacO': 'Lost'}})
    else:
        tot_len = 0
        for alignment in blast_result_record.alignments:
            for hsp in alignment.hsps:
                tot_len += hsp.align_length
                if hsp.align_length - hsp.identities > nmis:
                    if gene_name not in lost_genes:
                        lost_genes.update({
Ejemplo n.º 24
0
    def map_primers_to_genome(self,blast_db,outfile=None,search_set=None,default_to_PCR=False,temp_dir = None, keep_temp=False, tolerance=1):
        """Locate PCR and sequencing primers on a genome using blastn-short.

        For every locus in ``search_set`` the PCR primers are BLASTed first and
        their best hits are grouped into candidate amplification ranges. The
        sequencing primers are then BLASTed and, for each valid range, the
        outermost forward and reverse sequencing sites inside it are recorded.

        Args:
            blast_db: BLAST database passed to ``NcbiblastnCommandline(db=...)``.
            outfile: Optional path; when given (and not ``''``) the combined
                table of reported hits is written there as CSV.
            search_set: Iterable of locus names to process. Defaults to every
                key of ``self.primers_dict``. Names missing from
                ``self.primers_dict`` are reported and skipped.
            default_to_PCR: When true, a locus without an ``'All'`` sequencing
                entry borrows the ``'All'`` PCR primers for sequencing.
            temp_dir: Directory for the temporary query/result files. Defaults
                to ``self.tempDirObj.name``.
            keep_temp: When true, the BLAST table of every PCR primer is also
                saved in the working directory.
            tolerance: Fraction of the best bit score a PCR hit must reach to
                be kept among the best hits.

        Returns:
            dict: ``{locus: {subregion: {name: {'contig', 'start', 'stop'}},
            'OuterRange': [ranges]}}``.
        """
        workingDir = temp_dir if temp_dir is not None else self.tempDirObj.name
        if outfile == '':
            outfile = None
        if search_set is None:
            search_set = set(self.primers_dict.keys())
        temp_infile = os.path.join(workingDir,'tmp_primer.fasta')
        temp_outfile = os.path.join(workingDir,'tmp_primer_blast.fasta')
        blast_combined = blankBLASTtable()
        ql_head = 'query_length' #new column to add
        fh_head = 'forward hit'
        # name for region -> coordinates of innermost nucleotide on outermost primers
        export_regions = dict()

        def blast_primer(query_record):
            """Write one primer to the temp query file, BLAST it, return the hit table."""
            with open(temp_infile,"w") as fout:
                SeqIO.write(query_record,fout,'fasta')
            blast_cline = NcbiblastnCommandline(query=temp_infile,db=blast_db,outfmt=6,out=temp_outfile,task='blastn-short',evalue=1,reward=1,penalty=-1,gapopen=3,gapextend=2)
            blast_cline() ##Should only print for errors
            return loadBLASTtableToDataFrame(temp_outfile)

        for locus in search_set:
            if locus not in self.primers_dict:
                print("Error: {} is not in the set of primer loci".format(locus))
                # Nothing can be mapped for an unknown locus; previously this
                # fell through and raised KeyError on the lookup below.
                continue
            locus_dict = self.primers_dict[locus].copy() #shallow copy so that it can be modified
            if default_to_PCR: #Make sure there are primers for sequencing the entire region
                seq_dict = locus_dict['Seq']
                if 'All' not in seq_dict:
                    # Copy the nested dict too: the shallow copy above still
                    # shares it with self.primers_dict.
                    seq_dict = dict(seq_dict)
                    seq_dict['All'] = locus_dict['PCR']['All']
                    locus_dict['Seq'] = seq_dict
            export_regions[locus] = dict()
            ##Evaluate PCR dict first to find general range in which sequencing primers can bind
            PCR_dict = locus_dict['PCR']
            range_list = []
            ## Create a master range limit if specified
            has_range = all(key in locus_dict for key in ('range_contig','range_from','range_to'))
            if has_range:
                master_range = region_record(locus_dict['range_contig'],locus_dict['range_from'],locus_dict['range_to'])
                range_list.append(master_range)
            ## Place BLAST hits into ranges
            for (subregion, subregion_dict) in PCR_dict.items(): ##Only one region: "all"
                for (primer,sequence) in subregion_dict.items():
                    query_id = "-".join([locus,'PCR',subregion,primer])
                    my_seq = SeqRecord(Seq(sequence,IUPAC.ambiguous_dna),id=query_id)
                    blast_table = blast_primer(my_seq)
                    if keep_temp:
                        named_file = '{}_{}.tab'.format(query_id,os.path.basename(blast_db))
                        utilities.safeOverwriteTable(os.path.join(workingDir,named_file), blast_table, 'tab')
                    if len(blast_table) == 0:
                        print("Warning: zero hits for {}".format(my_seq.id))
                        continue
                    ##Add some extra info to table
                    blast_table[ql_head] = len(my_seq)
                    blast_table[fh_head] = blast_table['s. start'] < blast_table['s. end']
                    ## Limit table to best hits
                    best = blast_table.sort_values(by=['bit score'],ascending=False).iloc[0]
                    best_table = blast_table[blast_table['bit score'] >= tolerance*best['bit score']] #This may be too stringent; may need to revisit
                    ## Add best hits to ranges
                    for _,this_hit in best_table.iterrows():
                        finished = False #if we found a range for it
                        for this_range in range_list:
                            if not finished: #stop upon success or if range is exclusive
                                finished = this_range.try_add_primer(this_hit['subject id'],this_hit['s. start'],this_hit[fh_head],True)
                                if this_range.exclusive and not finished:
                                    finished = True
                                    if len(best_table) == 1:
                                        print("Warning: an exclusive hit failed to map to the prespecified region. Please report to developer(s)")
                        if not finished:
                            new_range = region_record()
                            new_range.try_add_primer(this_hit['subject id'],this_hit['s. start'],this_hit[fh_head],True)
                            range_list.append(new_range)
                    ## Record best hits for reporting
                    ##Note: sort=True requires pandas 0.23+.
                    blast_combined = pd.concat([blast_combined,best_table],sort=True)
            ##Merge any ranges that are close/overlapping; test if ranges are valid (primer pairs)
            valid_ranges = [] # a list keeps the ranges in their discovery order
            i = 0
            while i < len(range_list):
                this_range = range_list[i]
                # Walk backwards so deleting index j never shifts an unvisited index.
                j = len(range_list)-1
                while j > i:
                    merger = this_range.try_merge_regions(range_list[j])
                    if merger:
                        print("Warning: this is an exceptional situation and has not been tested, please report to developer(s). Range merger")
                        del(range_list[j])
                    j-=1
                #Test validity of this_range: needs primers in both directions and a positive extent
                if (len(this_range.For_list) > 0 and len(this_range.Rev_list) > 0):
                    if this_range.get_min() < this_range.get_max():
                        valid_ranges.append(this_range)
                i+=1
            #Remove invalid ranges
            range_list = valid_ranges
            #Report oddities
            if len(range_list) == 0:
                print("Warning: Unable to find an amplification region for {}".format(locus))
            elif len(range_list) > 1: # previously "== 2", which missed three or more regions
                print("Warning: Detected multiple amplification regions for {}".format(locus))
            for this_range in range_list:
                vprint('\n'+locus + ": Potential amplicon region")
                vprint(this_range)

            ## Find the sequencing sites within the defined ranges
            Seq_dict = locus_dict['Seq']
            for (subregion, subregion_dict) in Seq_dict.items():
                export_regions[locus][subregion] = dict()
                seq_borders = dict() ##Use range as key to track where sequencing of subregion starts. Values outside of range indicate no matches
                seq_primers = dict() ##primer names corresponding to border positions
                for (primer,sequence) in subregion_dict.items():
                    my_seq = SeqRecord(Seq(sequence,IUPAC.ambiguous_dna),id="-".join([locus,'Seq',subregion,primer]))
                    blast_table = blast_primer(my_seq)
                    if len(blast_table) == 0:
                        continue
                    ##Add some extra info to table
                    blast_table[ql_head] = len(my_seq)
                    blast_table[fh_head] = blast_table['s. start'] < blast_table['s. end']
                    for my_range in range_list:
                        ## Limit table to hits in range
                        r_min = my_range.get_min()
                        r_max = my_range.get_max()
                        if my_range not in seq_borders: ##TODO: this should probably be initialized immediately after declaration. Need to check that it doesn't break the downstream features
                            # Sentinels just outside the range mean "no site found yet".
                            seq_borders[my_range] = [r_min -1, r_max+1]
                            seq_primers[my_range] = ['None','None']
                        range_table = blast_table[blast_table['subject id'] == my_range.contig]
                        range_table = range_table[range_table['s. end'] >= r_min]
                        range_table = range_table[range_table['s. end'] <= r_max]
                        if len(range_table) == 0:
                            print("Warning: sequencing primer does not map to within PCR product. Exporting all matches for {}".format(my_seq.id))
                            blast_combined = pd.concat([blast_combined,blast_table],sort=True) ##Note: sort=True requires pandas 0.23+.
                            continue
                        ## Limit table to best hits
                        best_in_range = range_table.sort_values(by=['bit score'],ascending=False).iloc[0]
                        range_table = range_table[range_table['bit score'] >= best_in_range['bit score']] #This may be too stringent; may need to revisit
                        if len(range_table) > 0:
                            if len(range_table) > 1:
                                export_line = "Warning: sequencing primer maps to multiple locations within PCR primers. Using outermost site: {}".format(my_seq.id)
                                print(export_line)
                            for _, hit in range_table.iterrows():
                                q_end = hit['q. end']
                                gap = len(my_seq) - q_end # unaligned bases at the primer's 3' end
                                s_end = hit['s. end']
                                is_for = hit[fh_head]
                                if is_for:
                                    if seq_borders[my_range][0] < my_range.get_min():
                                        # Still at the sentinel: first forward site for this range.
                                        seq_borders[my_range][0] = s_end
                                        seq_primers[my_range][0] = primer
                                        if gap > 0:
                                            vprint("Warning: sequencing primer does not match template at 3' end. Sequence probably needs trimming on the low end: {}".format(my_seq.id))
                                    else:
                                        if seq_borders[my_range][0] > s_end:
                                            seq_borders[my_range][0] = s_end
                                            seq_primers[my_range][0] = primer
                                            if gap > 0:
                                                vprint("Warning: sequencing primer does not match template at 3' end. Sequence probably needs trimming on the low end: {}".format(my_seq.id))
                                        vprint("Warning: multiple sequencing primers map in forward direction on template. Using outermost site: {}".format("-".join([locus,'Seq',subregion,seq_primers[my_range][0]])))
                                else:
                                    if seq_borders[my_range][1] > my_range.get_max():
                                        # Still at the sentinel: first reverse site for this range.
                                        seq_borders[my_range][1] = s_end
                                        seq_primers[my_range][1] = primer
                                        if gap > 0:
                                            vprint("Warning: sequencing primer does not match template at 3' end. Sequence probably needs trimming on the high end: {}".format(my_seq.id))
                                    else:
                                        if seq_borders[my_range][1] < s_end:
                                            seq_borders[my_range][1] = s_end
                                            seq_primers[my_range][1] = primer
                                            if gap > 0:
                                                vprint("Warning: sequencing primer does not match template at 3' end. Sequence probably needs trimming on the high end: {}".format(my_seq.id))
                                        vprint("Warning: multiple sequencing primers map in reverse direction on template. Using outermost site: {}".format("-".join([locus,'Seq',subregion,seq_primers[my_range][1]])))
                        else:
                            print("Warning: sequencing primer failed to map within PCR primers: {}".format(my_seq.id))
                        ## Record best hits for reporting
                        best_table = blast_table[blast_table['bit score'] >= best_in_range['bit score']] #This may be too stringent; may need to revisit
                        blast_combined = pd.concat([blast_combined,best_table],sort=True) ##Note: sort=True requires pandas 0.23+.
                ##Export sequencing start sites
                basename = locus
                if subregion != 'All':
                    basename += '_' + subregion
                for my_range in range_list:
                    if my_range in seq_primers:
                        name = basename
                        name += '_{}_{}_{}'.format(seq_primers[my_range][0],seq_primers[my_range][1],os.path.basename(os.path.splitext(blast_db)[0])) ##Convoluted way to get the genome name
                        export_regions[locus][subregion][name] = {'contig':my_range.contig,'start':seq_borders[my_range][0]+1,'stop':seq_borders[my_range][1]-1}
                    else: ##seq_primers never got initialized because there is no match.
                        print("Notice: No sequencing primers for {} mapped with in the defined range for {}.".format(subregion,locus))
                #I could add a way to orient the sequences (identify a reference primer)
            export_regions[locus]['OuterRange'] = range_list
        # The temp files only exist if at least one primer was BLASTed.
        for temp_file in (temp_infile, temp_outfile):
            if os.path.exists(temp_file):
                os.remove(temp_file)
        if outfile is not None:
            blast_combined.to_csv(outfile,index=False) ##columns=blast_default_headers+[ql_head,fh_head]
            export_line = 'Exported primer locations to '+outfile
            print(export_line)
        return export_regions
Ejemplo n.º 25
0
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio import SearchIO

humdb = "/mithril/Data/Pacbio/Aligned/151019_proc/blast/humiso_blast"

# Collect the blastn options in one place; outfmt=5 is the XML report that is
# re-read with the 'blast-xml' parser below.
blast_settings = dict(
    query="temp.fasta",
    db=humdb,
    gapopen=1,
    gapextend=2,
    word_size=9,
    reward=1,
    evalue=10,
    outfmt=5,
    out="try.xml",
)
blastn_cline = NcbiblastnCommandline(**blast_settings)

# Calling the command-line object runs blastn and returns its captured output.
stdout, stderr = blastn_cline()

# Convert the XML report into a tabular file.
bres = SearchIO.read("try.xml", 'blast-xml')
SearchIO.write(bres, 'try.tsv', 'blast-tab')

##ok - this was nice, but can't output because blast is pairwise, and I think we actually want a MAF
Ejemplo n.º 26
0
def _hits_within_distance(positions, min_distance):
    """Return True if any two positions lie closer together than min_distance."""
    ordered = sorted(positions)
    # The closest pair of a sorted list is always adjacent, so one pass suffices.
    return any(later - earlier < min_distance
               for earlier, later in zip(ordered, ordered[1:]))


def ensure_amplicons_not_in_exclusion(exclusion_blastdb, potential_amplicons, confirmed_amplicons, max_potential_amplicons=200):
    """
    Given a blast database of sequences we do not want amplicons to match to and a fasta file containing our
    potential amplicons, will blastn potential amplicons to make sure that they don't match too closely to the
    exclusion blastdb. Criteria for this: Top hit length can't be more than 40 base pairs (anything more than
    that might start getting amplified if we're really unlucky) and if more than one hit, can't have any two hits
    within 5000bp of each other, as those could also potentially amplify if we're really unlucky.
    Amplicons confirmed to meet these criteria will get written to confirmed_amplicons, which will overwrite any
    file that was already there.
    :param exclusion_blastdb: Path to exclusion blast database. In this pipeline, should have been created by
    make_all_exclusion_blast_db
    :param potential_amplicons: Path to potential amplicon fasta file. In this pipeline, should have been created by
    split_sequences_into_amplicons
    :param confirmed_amplicons: Path to your desired output confirmed amplicon file. Overwrites file if something
    was already there.
    :param max_potential_amplicons: Stop screening once this many amplicons have been confirmed.
    """
    min_hit_spacing = 5000  # two hits on one contig closer than this could amplify
    max_top_hit_length = 40  # roughly the length of two PCR primers

    confirmed_sequences = []
    # The command does not depend on the query sequence, so build it once.
    blastn = NcbiblastnCommandline(db=exclusion_blastdb,
                                   task='blastn',
                                   outfmt=5)
    for potential_sequence in SeqIO.parse(potential_amplicons, 'fasta'):
        sequence = str(potential_sequence.seq)
        stdout, stderr = blastn(stdin=sequence)
        top_hit_length = 999999  # Stays ridiculously high if no record is parsed at all

        # Maps each contig (alignment title) to the subject start of every hit on it.
        hit_location_dict = dict()
        for record in NCBIXML.parse(StringIO(stdout)):
            try:
                top_hit_length = record.alignments[0].hsps[0].align_length
            except IndexError:  # Should happen if we don't have any hits at all.
                top_hit_length = 0
            for alignment in record.alignments:
                for hsp in alignment.hsps:
                    hit_location_dict.setdefault(alignment.title, []).append(hsp.sbjct_start)

        matches_too_close = any(_hits_within_distance(locations, min_hit_spacing)
                                for locations in hit_location_dict.values())
        # Accept only if the top hit is shorter than about two primers and no
        # two hits on the same contig are within min_hit_spacing of each other.
        if top_hit_length < max_top_hit_length and not matches_too_close:
            confirmed_sequences.append(sequence)
        if len(confirmed_sequences) >= max_potential_amplicons:
            break

    fasta_entries = ['>sequence' + str(sequence_id) + '\n' + sequence + '\n'
                     for sequence_id, sequence in enumerate(confirmed_sequences, start=1)]
    with open(confirmed_amplicons, 'w') as f:
        f.write(''.join(fasta_entries))
Ejemplo n.º 27
0
e_val = 10
name = sys.argv[1] #input_file
db_name = sys.argv[2]
haveXML = False  # set to True to reuse an existing XML report instead of running BLAST
out_file_name =  sys.argv[3]
xml_file_name = out_file_name + ".xml"
novel_file_name = out_file_name + "_novel"
mature_file_name = out_file_name + "_mature"
db_fasta = sys.argv[4] #mirBASE (mature.fa)

#time_file_name = out_file_name + "_execution_time.txt"
#time_handle = open(time_file_name, "w")
#start = timeit.default_timer()

if not haveXML:
    blastn_cline = NcbiblastnCommandline(penalty=-5, reward=4, max_target_seqs=100, word_size=11, query=name, db=db_name, evalue=e_val,outfmt=5, out=xml_file_name)
    stdout, stderr = blastn_cline()

# Record explicitly whether the report could be opened. Previously `success`
# was left undefined on IOError, so the check below raised NameError.
try:
    handle = open (xml_file_name)
except IOError:
    success = False
else:
    success = True

if success:
    novel = []
    mature = []
    records  = NCBIXML.parse(handle)
    record_index = SeqIO.index(name, "fasta")
    mirbase_index = SeqIO.index(db_fasta, "fasta")
    mir_dict = dict()
    mature_summary = open (mature_file_name+".csv",'w')
Ejemplo n.º 28
0
def blast (species_path_name,busco_result,query_file,species_out_path_name,pav_excel_name):
    '''
    BLAST the pan-gene queries against every selected strain and record a
    presence/absence score per gene in an Excel workbook.

    species_path_name: directory holding the per-strain contig fasta files
    busco_result: passed to extract_strain_id to obtain the strain list
    query_file: blastn query file (pan genes)
    species_out_path_name: directory for the intermediate per-strain XML reports
    pav_excel_name: path the workbook is saved to
    '''
    global pan_sh
    contig_dir = Path(species_path_name)
    strain_ids = extract_strain_id(busco_result)
    strain_ids.append("70-15")
    strain_ids.append("ina168")
    strain_ids.remove("magnaporthe_oryzae_70-15_8_proteins_T0")
    report_dir = Path(species_out_path_name)
    excel_book = Workbook()
    pan_sh = excel_book.active
    column = 1

    for strain_id in strain_ids:
        fasta_pattern = strain_id.strip('\n') + ".fasta"
        for contig_file in contig_dir.glob(fasta_pattern):
            column += 1
            strain_name = contig_file.stem

            excel_species_name(column + 1, strain_name)

            report_path = report_dir / (strain_name + '.xml')
            run_blast = NcbiblastnCommandline(
                cmd='/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.9.0+/bin/blastn',
                query=query_file,
                db=str(contig_file),
                outfmt=5,
                out=report_path
            )
            # Run BLAST when the report is missing, and again if it is empty.
            if not report_path.exists():
                run_blast()
            if os.path.getsize(str(report_path)) == 0:
                run_blast()

            with open(report_path) as report:
                for record in NCBIXML.parse(report):
                    gene_name = record.query.split()[0]
                    if not record.alignments:
                        out_to_excel(column, gene_name, 0)
                        continue

                    # Only the very first HSP of the record decides the score.
                    top_hsp = next(
                        (hsp for alignment in record.alignments for hsp in alignment.hsps),
                        None)
                    if top_hsp is None:
                        continue

                    length_ratio = top_hsp.align_length/record.query_length
                    identity_ratio = top_hsp.identities/top_hsp.align_length
                    if top_hsp.align_length < 100 and length_ratio < 0.5:
                        score = 0
                    elif length_ratio == 1 and identity_ratio == 1:
                        score = 4
                    elif length_ratio == 1 and top_hsp.gaps == 0:
                        score = 3
                    elif (top_hsp.align_length > record.query_length
                          or record.query_length - top_hsp.align_length <= 50):
                        score = 2
                    else:
                        score = 1
                    out_to_excel(column, gene_name, score)
    excel_book.save(pav_excel_name)
Ejemplo n.º 29
0
    return (min_, max_)


def _srange(begin, end):
    """ Build the set of integers from begin (inclusive) up to end (exclusive) """
    members = set()
    members.update(range(begin, end))
    return members


def _hit_overlap(hsp1, hsp2):
    """ Determine whether the hits of two hsps overlap

    Each hit is treated as the half-open interval [begin, end) of subject
    coordinates, where (begin, end) is what _minmax returns for the hsp's
    sbjct_start and sbjct_end (presumably smaller first; confirm in _minmax).
    Returns True when the two intervals share at least one position.
    """
    hit1_begin, hit1_end = _minmax(hsp1.sbjct_start, hsp1.sbjct_end)
    hit2_begin, hit2_end = _minmax(hsp2.sbjct_start, hsp2.sbjct_end)

    # Two half-open intervals intersect exactly when the later start lies
    # before the earlier end; no need to materialise every coordinate in a set.
    return max(hit1_begin, hit2_begin) < min(hit1_end, hit2_end)


if __name__ == '__main__':
    # Demo: run one blastn search through pyBlastFlat and print every record as fasta.
    from Bio import SeqIO
    from Bio.Blast.Applications import NcbiblastnCommandline

    demo_settings = dict(query='test/sul2_1_AF542061.fasta',
                         db='test/102637-001-018_k64-contigs.fa',
                         evalue=0.001)
    cmd = NcbiblastnCommandline(**demo_settings)
    with pyBlastFlat(cmd, rm_tmp=False, min_cov=0.5, verbose=True) as pb:
        for record in pb:
            fasta = pyBlastFlat.fasta(record)
            print(fasta)
            # SeqIO.write emits the record on stdout; its return value is printed too.
            write_result = SeqIO.write(fasta, sys.stdout, 'fasta')
            print(write_result)
Ejemplo n.º 30
0
#!/usr/bin/env python3
# author : Andrew Smith
# date : 111820 @ 12:57
# file : blast.py
# description : provide initial statistics of a query against a blast searchable database
# NOTE: this script is specifically configured for one query and one subject.
import sys  # command line arguments
from time import time  # execution time
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML

# Exactly two arguments are required: the database name and the query file.
if len(sys.argv) != 3:
    print("Usage : ./main.py <DBname> <QueryFile>")
    sys.exit()

start_time = time()

dbname = sys.argv[1]
queryfile = sys.argv[2]

# outfmt=5 makes blastn write an XML report to results.xml.
blastnCommandLine = NcbiblastnCommandline(query=queryfile,
                                          db=dbname,
                                          outfmt=5,
                                          out="results.xml")
stdout, stderr = blastnCommandLine()

# Parse inside a context manager so the report handle is always closed
# (it was previously opened inline and never closed).
with open("results.xml") as results_handle:
    for query in NCBIXML.parse(results_handle):
        for alignment in query.alignments:
            for hsp in alignment.hsps:
                print(hsp)
Ejemplo n.º 31
0
    def getBlastHits(self):
        """ Function for blasting the handle sequence against the NCBI nt database to identify homologies

        Runs blastn (locally by default, otherwise through NCBI qblast) on
        self.sequence, then counts the HSPs that have at least 90% identity
        and cover at least 90% of the query. The count is stored in
        self.blastHits; nothing is returned.
        """
        from Bio.Blast import NCBIWWW
        from Bio.Blast import NCBIXML
        import sys
        try:
            from cStringIO import StringIO
        except ImportError:  # cStringIO only exists on Python 2
            from io import StringIO
        sys.stdout = Unbuffered(sys.stdout)

        local=True
        if local:
            from Bio.Blast.Applications import NcbiblastnCommandline

            #setting up blast
            database='/Users/erikborgstrom/localBioInfo/BLASTnt/nt'
            query_path = 'tmp.fa'
            # The context manager guarantees the query is flushed and closed before blastn reads it.
            with open(query_path,'w') as infile:
                infile.write('>tmp\n'+self.sequence+'\n')

            # The earlier 'strict'/'sloppy' command lines were always overwritten
            # by this one, so only the command that actually ran is kept.
            cline = NcbiblastnCommandline(cmd='blastn', outfmt=5, query=query_path, db=database, gapopen=5, gapextend=2, culling_limit=2)
            print(str(cline))

            # Calling the command returns (stdout, stderr); the XML report is on stdout.
            blast_stdout, _blast_stderr = cline()
            blast_handle = StringIO(blast_stdout)
        else:
            sys.stdout.write('getting blast hits for handle#'+str(self.id)+'\n')
            result_handle = NCBIWWW.qblast("blastn", "nr", '>tmp\n'+self.sequence,format_type='XML')
            sys.stdout.write('start parsing blast for handle#'+str(self.id)+'\n')
            blast_handle = StringIO(result_handle.read())

        hits=0
        for blast_record in NCBIXML.parse(blast_handle):
            for alignment in blast_record.alignments:
                for hsp in alignment.hsps:
                    perc_identity = float(hsp.identities) / float(hsp.align_length) * 100
                    perc_coverage = float(hsp.align_length) / float(blast_record.query_letters) * 100
                    if perc_identity >= 90 and perc_coverage >= 90:
                        hits += 1
        self.blastHits = hits
Ejemplo n.º 32
0
def search(sequence_filename, output_filename):
    """Search for matches to known organisms from included 16S database.

    Runs ``blastn`` against the packaged ``16Sdb`` database, writes the
    tabular (output format 6) results to ``output_filename`` and then parses
    that file. Only the first match reported for each OTU is recorded in the
    returned table; a later match of the same OTU to a different genome
    triggers a warning instead.

    Parameters
    ----------
    sequence_filename : str
        Path to file with 16S rDNA sequences for unique OTUs
    output_filename: str
        Path to file where blast output file is saved

    Returns
    -------
    list of str
        List of PATRIC genome IDs for known organisms
    pandas.DataFrame
        Similarity information with OTU ID, genome ID, and percent similarity of match

    Raises
    ------
    Bio.Application.ApplicationError
        When there is an error running the blast command
    """

    # Run blast to search for matches to known organisms.
    # @todo Should it make me nervous to not use a fully-qualified path here?
    cmdline = NcbiblastnCommandline(
        cmd='blastn',
        query=sequence_filename,
        db=join(pkg_resources.resource_filename(__name__, 'data/db'), '16Sdb'),
        out=output_filename,
        outfmt=6,
        max_target_seqs=1,
        num_threads=4)
    cmdline()  # Raises ApplicationError when there is a problem

    # Parse the blast output file with the results. In output format 6, the first
    # field is the OTU ID from the query. The second field is the ID of match in
    # target database. In our case that is the PATRIC genome ID of the organism
    # with the matching 16S sequence. The third field is the percent similarity.
    genome_ids = set()
    first_match = {}  # OTU ID -> genome ID of the first match seen for that OTU
    rows = []         # one [OTU ID, genome ID, percent similarity] entry per OTU
    with open(output_filename, 'r') as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                # Tolerate blank lines instead of failing on fields[1].
                continue
            otu_id, genome_id = fields[0], fields[1]
            genome_ids.add(genome_id)
            if otu_id not in first_match:
                first_match[otu_id] = genome_id
                rows.append([otu_id, genome_id, float(fields[2])])
            elif first_match[otu_id] != genome_id:
                warn(
                    'OTU {0} matches already matched genome {1} and also matches genome {2}'
                    .format(otu_id, first_match[otu_id], genome_id))

    # Build the table once: DataFrame.append() copied the whole frame on every
    # call and was removed in pandas 2.0.
    similarity = pd.DataFrame(rows, columns=similarity_columns)
    return list(genome_ids), similarity
## Process every genome file given on the command line (args.genomes)
for genome in args.genomes:

## 1. Derive the organism name from the first line of the genome file.
##    NOTE(review): assumes the first line looks like "<id> <genus> <species> ..."
##    separated by single spaces, so tokens 1 and 2 are genus and species -- confirm.
    genome_file=open(genome, 'r')                 ## open genome file
    first_line=genome_file.readline()             ## read the first line only
    first_line=first_line.rstrip('\n')            ## drop the trailing newline
    first_line=first_line.split(' ')              ## tokenize on single spaces
    genus=first_line[1]                           ## second token
    species=first_line[2]                         ## third token
    organism=("{}_{}").format(genus, species)     ## "<genus>_<species>", used as output file prefix
    genome_file.close()
    print (organism)

## 2. Run blastn with the query sequence against the genome.
##    The genome path itself is passed as the BLAST database name (db=genome).
##    NOTE(review): presumably a BLAST database with that name was built beforehand -- confirm.
    blastn_cline=NcbiblastnCommandline(query=args.query_sequence, db=genome, outfmt=" '7 std sseq' ", out=organism + "_blast_results.tsv", soft_masking=True) 
    ## '7 std sseq': tabular output with comment lines, the standard columns, plus the
    ## aligned part of the subject sequence as an extra column
    ## NOTE(review): the inner single quotes presumably keep the multi-word format as one
    ## argument when the command string goes through the shell below -- confirm.
    ## max_target_seqs=10 would keep only the 10 best results, but it is NOT passed above
    ## soft_masking=True as suggested by literature
    os.system(str(blastn_cline))                  ## run the command line through the shell; results go to the `out` file

## 3. Read the BLAST results; the goal is to save, in FASTA format, only the result
##    with the highest bit-score or lowest e-value
    blast_results=open(organism + "_blast_results.tsv", 'r')      ## open the BLAST output file for reading
    fasta_sequence=open(organism + "_ortholog_sequence.fa", 'w')  ## open (create/truncate) the FASTA output file
    for line in blast_results:
        line=line.rstrip('\n')
        if not line.startswith('#'):                              ## skip the '#' comment lines of format 7
            first_line=line.split('\t')                           ## tab-separated fields of this hit (reuses the name first_line from step 1)
            first_e_value=float(first_line[10])                   ## e-value of this hit (11th standard column)
            first_bit_score=float(first_line[11])                 ## bit-score of this hit (12th standard column)
            subject_id=organism + "," + first_line[1]             ## "<organism>,<subject id>" label for this hit
Ejemplo n.º 34
0
def _blast_primer(primer_fasta_path,
                  db_path,
                  evalue_cutoff=1000,
                  min_total_mismatch_portion=0.2,
                  min_total_mismatch=6,
                  min_prime_3_mismatch=2,
                  prime_3_length=5,
                  alt_pos_cutoff=2000,
                  max_product_size=5000,
                  word_size=7):
    """
    Take a fasta file as input, query genome db and count qualified hits

    Runs ``blastn`` for all primers in the fasta file, keeps every HSP that is
    similar enough to the primer to count as an alternate binding site, and
    then pairs plus-strand with minus-strand binding sites of the same primer
    name to count potential products.

    Parameters
    ----------
    primer_fasta_path : str or pathlib.Path
        Fasta file with the primer sequences. Record IDs are split at the last
        ``_``: the part before it is the primer name, and a suffix of ``l``
        marks the left primer (any other suffix is treated as right).
        The BLAST XML output is written next to this file as
        ``<stem>_blast_result.xml``.
    db_path
        BLAST database passed to ``blastn -db``.
    evalue_cutoff
        Value passed to ``blastn -evalue``.
    min_total_mismatch_portion
        Fraction of the primer length used for the total-mismatch threshold.
    min_total_mismatch
        Absolute total-mismatch threshold. An HSP is discarded when its total
        mismatch count exceeds
        ``max(min_total_mismatch, min_total_mismatch_portion * primer_length)``.
    min_prime_3_mismatch
        An HSP is discarded when the mismatch count within the 3' window
        exceeds this value.
    prime_3_length
        Size of the 3' window, in bases counted from the primer's 3' end.
    alt_pos_cutoff
        Primers whose left or right hit count exceeds this value are omitted
        from the result.
    max_product_size
        Largest distance between a plus-strand and a minus-strand hit that is
        still counted as a potential product.
    word_size
        Value passed to ``blastn -word_size``.

    Returns
    -------
    pandas.DataFrame
        One row per retained primer name with the columns
        ``LEFT_GENOME_HITS``, ``RIGHT_GENOME_HITS``, ``POTENTIAL_PRODUCTS`` and
        ``POTENTIAL_PRODUCT_LENGTHS`` (product sizes joined with ``|``).
    """
    # Normalise once so both str and Path inputs work; previously `.stem` was
    # read from the raw argument and failed for plain strings.
    primer_fasta_path = pathlib.Path(primer_fasta_path)
    blast_result_path = primer_fasta_path.parent / (primer_fasta_path.stem +
                                                    "_blast_result.xml")

    # run blastn for all primers
    blast_cline = NcbiblastnCommandline(query=str(primer_fasta_path),
                                        db=db_path,
                                        evalue=evalue_cutoff,
                                        outfmt=5,
                                        word_size=word_size,
                                        out=str(blast_result_path),
                                        task='blastn')
    blast_cline()

    # parse blast result: primer name -> [left primer HSPs, right primer HSPs]
    primer_hit_dict = {}
    for blast_result in SearchIO.parse(str(blast_result_path), "blast-xml"):
        primer_length = blast_result.seq_len
        primer_total_mismatch = max(min_total_mismatch,
                                    min_total_mismatch_portion * primer_length)
        alternate_hsps = []
        for hit in blast_result:
            for hsp in hit.hsps:
                # Pad the similarity line with blanks for the primer bases
                # outside the aligned range, so that unaligned bases count as
                # mismatches and the 3' window is taken from the primer end.
                align_anno = (' ' * hsp.query_range[0]
                              + ''.join(hsp.aln_annotation['similarity'])
                              + ' ' * (primer_length - hsp.query_range[1]))

                total_mismatch = primer_length - align_anno.count('|')
                if total_mismatch > primer_total_mismatch:
                    continue

                prime_3_mismatch = prime_3_length - align_anno[-prime_3_length:].count('|')
                if prime_3_mismatch > min_prime_3_mismatch:
                    continue
                alternate_hsps.append(hsp)

        # Everything before the last '_' is the primer name, the rest is the
        # direction flag.
        primer_name, _, direction = blast_result.id.rpartition('_')
        append_pos = 0 if direction == 'l' else 1
        primer_hit_dict.setdefault(primer_name, [[], []])[append_pos].extend(alternate_hsps)

    primer_hit_records = {}
    for primer, (left_hits, right_hits) in primer_hit_dict.items():
        if len(left_hits) > alt_pos_cutoff or len(right_hits) > alt_pos_cutoff:
            continue

        # Left and right HSPs are pooled: any plus-strand HSP may pair with
        # any minus-strand HSP of this primer name.
        all_hsps = left_hits + right_hits
        positive_strand_hit = [hsp for hsp in all_hsps if hsp.hit_strand == 1]
        negative_strand_hit = [hsp for hsp in all_hsps if hsp.hit_strand == -1]

        valid_product_lengths = []
        for positive_hit in positive_strand_hit:
            for negative_hit in negative_strand_hit:
                # hit not in same chrom
                if positive_hit.hit_id != negative_hit.hit_id:
                    continue
                product_size = abs(positive_hit.hit_range[0] - negative_hit.hit_range[1])
                # left right too far away
                if product_size > max_product_size:
                    continue
                valid_product_lengths.append(str(product_size))

        primer_hit_records[primer] = {
            'LEFT_GENOME_HITS': len(left_hits),
            'RIGHT_GENOME_HITS': len(right_hits),
            'POTENTIAL_PRODUCTS': len(valid_product_lengths),
            'POTENTIAL_PRODUCT_LENGTHS': '|'.join(valid_product_lengths),
        }
    # Records are keyed by primer, so transpose to get one row per primer.
    primer_hit_df = pd.DataFrame(primer_hit_records).T
    return primer_hit_df