Beispiel #1
0
def wrapper_bedtools_intersect2(bedfile1, bedfile2, outfile=None):
    """
    Sort two bed files with bedtools and write their intersection to a file.

    Each input is first sorted into a sibling file named "<input>_s". The
    sorted files are then intersected with ``bedtools intersect -wa -wb``
    and the shell redirects the result into outfile.

    NOTE: after the run both input files and both sorted intermediates are
    handed to del_files(), i.e. the inputs are treated as disposable.

    :param bedfile1: path of the first bed file (the -a side)
    :param bedfile2: path of the second bed file (the -b side)
    :param outfile: path for the result; when None it is derived as
        "<location of bedfile1>/<prefix of bedfile1>_<prefix of bedfile2>.bed"
    :return: the path of the intersection file
    """
    if outfile is None:
        joined_prefix = "_".join(
            [get_file_prefix(bedfile1), get_file_prefix(bedfile2)])
        outfile = get_file_location(bedfile1) + "/" + joined_prefix + ".bed"

    # sort both inputs, first file first
    sort_template = "bedtools sort -i {bed} > {bed}_s"
    for unsorted_bed in (bedfile1, bedfile2):
        myexe(sort_template.format(bed=unsorted_bed))

    # intersect the two sorted files into the output file
    intersect_cmd = "bedtools intersect -wa -wb -a {first}_s -b {second}_s>{out}".format(
        first=bedfile1, second=bedfile2, out=outfile)
    myexe(intersect_cmd)

    # cleanup: the inputs as well as the sorted intermediates
    leftovers = [bedfile1, bedfile2, bedfile1 + "_s", bedfile2 + "_s"]
    del_files(leftovers)

    return outfile
Beispiel #2
0
def bwa_index_wrapper(ref_file):
    """
    Run ``bwa index`` on a reference file.

    :param ref_file: path of the reference sequence file to index
    :return: ref_file, unchanged
    """
    index_command = "bwa index {0}".format(ref_file)
    myexe(index_command)
    return ref_file
Beispiel #3
0
def adaptor_blast(query, dbpatch="adaptor.fasta"):
    """
    Build a nucleotide blast database from dbpatch and blast query against it.

    :param query: text passed to blastn on stdin
    :param dbpatch: fasta file used to build the database; the database name
        is this path with the file name cut at its first dot
    :return: the object returned by NCBIXML.read() for the blastn XML output
    """
    # TODO: the database is rebuilt on every call; asserting that it already
    # exists would be better.
    # Cut only the file name at its first dot: splitting the whole path would
    # truncate inside a dotted directory ("./adaptor.fasta" -> "").
    folder, sep, filename = dbpatch.rpartition("/")
    db = folder + sep + filename.split(".")[0]
    print(myexe("makeblastdb -in %s -dbtype nucl -input_type fasta -out %s" % (dbpatch, db)))

    blastn_cline = NcbiblastnCommandline(db=db, outfmt=5)
    out, err = blastn_cline(stdin=query)
    # NOTE(review): NCBIXML.read() is documented to return one record for a
    # single-query result (NCBIXML.parse() is the iterating variant) - confirm
    # that callers do not pass multi-query input.
    blast_records = NCBIXML.read(StringIO(out))
    return blast_records
Beispiel #4
0
def wrapper_samtools_merge(work_dir, ref_file, bam_list, out=None):
    """
    Merge bam files with ``samtools merge`` and pass the result to sort_index_wrapper().

    :param work_dir: directory that receives the merged bam when out is None
    :param ref_file: reference path; its file name up to the first "." names
        the default output ("<name>.bam")
    :param bam_list: bam file paths to merge, joined with spaces on the command line
    :param out: path of the merged bam; derived from work_dir and ref_file when None
    :return: the value returned by sort_index_wrapper() for the merged bam
    """
    if out is None:
        ref_name = ref_file.split("/")[-1]
        out = os.path.join(work_dir, ref_name.split(".")[0] + ".bam")

    inputs = " ".join(bam_list)
    merge_cmd = "samtools merge {0} {1}".format(out, inputs)
    print(merge_cmd)
    myexe(merge_cmd)

    return sort_index_wrapper(out)
Beispiel #5
0
def sort_index_wrapper(bamfile, core=1, bam_sorted=None):
    """
    Sort a bam file with ``samtools sort`` and index the sorted file.

    :param bamfile: path of the bam file to sort
    :param core: value passed to ``samtools sort -@``
    :param bam_sorted: path of the sorted bam; when None it is the input path
        with its file name cut at the first dot, plus "_s.bam"
    :return: the path of the sorted bam file
    """
    if bam_sorted is None:
        # Cut only the file name at its first dot: splitting the whole path
        # would truncate inside a dotted directory ("./run/a.bam" -> "").
        folder, sep, filename = bamfile.rpartition("/")
        bam_sorted = folder + sep + filename.split(".")[0] + "_s.bam"

    sort_cmd = "samtools sort {bamfile} -@ {core} -o {bam_sorted}".format(
        bamfile=bamfile, core=core, bam_sorted=bam_sorted)
    index_cmd = "samtools index {bam_sorted}".format(bam_sorted=bam_sorted)

    myexe(sort_cmd)
    myexe(index_cmd)
    return bam_sorted
Beispiel #6
0
def fq_subseq(fqin, namefile, fqout=None):
    """
    Extract reads from a fastq file with ``seqtk subseq``.

    Used in fq_subset_main().

    :param fqin: path of the input fastq file
    :param namefile: file passed to seqtk subseq as the list of reads to keep
    :param fqout: path of the output fastq; when None it is the input path
        with its file name cut at the first dot, plus "_s.fq"
    :return: the path of the output fastq file
    """
    if fqout is None:
        # Cut only the file name at its first dot: splitting the whole path
        # would truncate inside a dotted directory ("./reads/a.fq" -> "").
        folder, sep, filename = fqin.rpartition("/")
        fqout = folder + sep + filename.split(".")[0] + "_s.fq"

    cmd_subseq = "seqtk subseq {fqin} {namefile} > {fq_out}".format(
        fqin=fqin, namefile=namefile, fq_out=fqout)
    print(cmd_subseq)
    myexe(cmd_subseq)
    return fqout
Beispiel #7
0
def exonerate_wrapper(query, target, outfile=False, geneticcode=5, score=100, bestn=None):
    """
    Run exonerate for query against target and return what myexe() captured.

    :param query: path of the query file
    :param target: path of the target file
    :param outfile: when truthy, the captured output is also written to
        "<query file name up to its first dot>.exonerate" (relative path)
    :param geneticcode: value for --geneticcode
    :param score: value for --score
    :param bestn: value for --bestn; when None, the number of entries
        fasta2dic() returns for target is used
    :return: the value returned by myexe() for the exonerate command
    todo: using stringIO to hinder the file IO
    """
    if bestn is None:
        bestn = len(fasta2dic(target))  # default, output one region for one query

    exonerate_cmd = "exonerate {query} {target} \
                   --geneticcode {geneticcode} \
                   --score {score} \
                   --bestn {bestn} \
                   ".format(
                        query=query, target=target,
                        geneticcode=geneticcode,
                        score=score,
                        bestn=bestn,
                        )
    out = myexe(exonerate_cmd)

    # trigger to write the output to disk
    if outfile:
        outname = query.split("/")[-1].split(".")[0] + ".exonerate"
        with open(outname, "w") as fw:
            # write the exonerate output, not the name of the file itself
            fw.write(out)

    return out
Beispiel #8
0
def sra2fq_wrapper(sra_file, outdir):
    """
    Run ``fastq-dump --split-files`` on an sra file.

    The command line and the value returned by myexe() are both printed.

    :param sra_file: the sra file (or accession) passed to fastq-dump
    :param outdir: directory passed to --outdir
    :return: outdir, unchanged
    """
    sra_cmd = "fastq-dump --split-files {sra} --outdir {out}".format(
        sra=sra_file, out=outdir)
    print(sra_cmd)
    dump_log = myexe(sra_cmd)
    print(dump_log)

    return outdir
Beispiel #9
0
def bwa_mem_wrapper(ref_file, fq_str, core=25, min_seed_length=20, band_width=2000, out="mapped.bam"):
    """
    Map reads with ``bwa mem`` and keep only the mapped reads as a bam file.

    The bwa output is piped into ``samtools view -F 4 -b``, which writes out.

    :param ref_file: reference passed to bwa mem
    :param fq_str: fastq file name(s), inserted verbatim into the command line
    :param core: bwa mem -t
    :param min_seed_length: bwa mem -k
    :param band_width: bwa mem -w
    :param out: path of the bam file written by samtools view
    :return: out, the bam file name
    """
    # The pieces are concatenated into one command line; the runs of spaces
    # are harmless to the shell.
    template = ("bwa mem -k {seed} -w {width} -t {core} {ref} "
                "        {fq_str} "
                "        | samtools view -F 4  -b -o {out}")
    cmd_bwa = template.format(
        seed=min_seed_length,
        width=band_width,
        core=core,
        ref=ref_file,
        fq_str=fq_str,
        out=out)
    print(cmd_bwa)
    myexe(cmd_bwa)
    return out
Beispiel #10
0
def spades_wrapper(fq_name_dict,
                   outdir="spades_out",
                   core=12,
                   rna_model=False):
    """
    Assemble reads with spades.py (or rnaspades.py) in --only-assembler mode.

    Every dict entry becomes one numbered library. An entry with two or more
    file names is treated as paired-end ("--pe<N>-1/-2/-s"), otherwise as
    single reads ("--s<N>"). The mate of a paired file is guessed from its
    name: "_1"/"_F" -> -1, "_2"/"_R" -> -2, anything else -> -s.

    :param fq_name_dict: mapping of library label -> fastq file names
        (presumably a list per label, since each value is iterated and its
        length decides paired vs. single - verify against callers)
    :param outdir: spades output directory (-o)
    :param core: number of threads (-t)
    :param rna_model: use rnaspades.py instead of spades.py when true
    :return: paths of "scaffolds.fasta" and "assembly_graph.fastg" in outdir
    :raises ValueError: if fq_name_dict is empty
    """
    if not fq_name_dict:
        # previously this ended in a NameError on the undefined read pool
        raise ValueError("fq_name_dict is empty, no reads to assemble")

    # generate the readpool string used for spades
    fq_str_list = []
    # .values() instead of .iteritems(): the keys are unused and this form
    # runs on both Python 2 and Python 3
    for lib_num, fq_names in enumerate(fq_name_dict.values(), 1):
        read_type = "pe" if len(fq_names) >= 2 else "s"
        for fq_one_name in fq_names:
            if read_type == "s":
                fq_pos_ind = ""
            elif "_1" in fq_one_name or "_F" in fq_one_name:
                fq_pos_ind = "-1"
            elif "_2" in fq_one_name or "_R" in fq_one_name:
                fq_pos_ind = "-2"
            elif "_3" in fq_one_name:
                fq_pos_ind = "-s"
            else:
                fq_pos_ind = "-s"
                print(
                    "Treat the PE reads as single reads!"
                    "Please re-check the origin fastq file, make sure _1, _2 or _F, _R in the paired file."
                )

            fq_str_list.append(
                "--{read_type}{lib_num}{fq_pos_ind} {fq_one_name}".format(
                    read_type=read_type,
                    lib_num=lib_num,
                    fq_pos_ind=fq_pos_ind,
                    fq_one_name=fq_one_name))

    # join once, after all libraries have been collected
    fq_str = " ".join(fq_str_list)

    spades_bin = "rnaspades.py" if rna_model else "spades.py"
    spades_cmd = "{spades_bin} --only-assembler -t {core} {readpool} -o {outdir}".format(
        spades_bin=spades_bin, core=core, readpool=fq_str, outdir=outdir)
    print(spades_cmd)
    print(myexe(spades_cmd))

    scaf_fasta = os.path.join(outdir, "scaffolds.fasta")
    scaf_fastg = os.path.join(outdir, "assembly_graph.fastg")
    return scaf_fasta, scaf_fastg
Beispiel #11
0
def wrapper_bedtools_intersect2_v2(beddf1, beddf2, outfile=None):
    """
    Intersect two sorted bed dataframes with bedtools and return a dataframe.

    Both dataframes are written to temporary tab-separated files (no header,
    no index) and intersected strand-aware with
    ``bedtools intersect -wa -wb -s -sorted``. The -sorted option saves memory
    but requires the inputs to be sorted already. The result is read back
    with pandas from a third temporary file.

    :param beddf1: sorted bed dataframe used as -a
    :param beddf2: sorted bed dataframe used as -b
    :param outfile: unused, kept only for backward compatibility
    :return: dataframe of the bedtools output (12 typed columns), or an empty
        dataframe when the output cannot be read (e.g. no intersection)
    """
    column_types = {
        0: str, 1: int, 2: int, 3: object, 4: str, 5: str,
        6: str, 7: int, 8: int, 9: str, 10: str, 11: str,
    }

    # The context managers remove all three temporary files even when one of
    # the steps below raises.
    with NamedTemporaryFile('w+t') as bedfile1, \
            NamedTemporaryFile('w+t') as bedfile2, \
            NamedTemporaryFile('w+t') as result:
        beddf1.to_csv(bedfile1, sep="\t", header=False, index=False)
        beddf2.to_csv(bedfile2, sep="\t", header=False, index=False)
        # bedtools opens the files by name, so buffered rows must reach the
        # disk first; otherwise it may see truncated or empty inputs.
        bedfile1.flush()
        bedfile2.flush()

        cmd = "bedtools intersect -wa -wb -s -sorted -a {bedfile1} -b {bedfile2} > {out}".format(
            bedfile1=bedfile1.name, bedfile2=bedfile2.name, out=result.name)
        myexe(cmd)

        try:
            intersectDf = pandas.read_csv(result.name,
                                          sep="\t",
                                          header=None,
                                          dtype=column_types)
        except Exception:
            # best effort: an unreadable (typically empty) result means
            # "no intersection"; unlike a bare except this does not swallow
            # KeyboardInterrupt/SystemExit
            intersectDf = pandas.DataFrame()

    return intersectDf
Beispiel #12
0
def mitfi_wrapper_trna(fastafile, MITFIPATH=None, prefix=None):
    """
    Run mitfi.jar on a fasta file and save its output as "<prefix>_trna.txt".

    :param fastafile: path of the fasta file passed to mitfi
    :param MITFIPATH: directory containing mitfi.jar; defaults to
        "bins/mitfi" next to this module
    :param prefix: prefix of the output file; defaults to the fasta file name
        without its last extension
    :return: the file name of the mitfi run, "<prefix>_trna.txt"
    """
    if MITFIPATH is None:
        MITFIPATH = os.path.join(os.path.dirname(__file__), "bins", "mitfi")

    mitfi_cmd = "java -jar {0} {1}".format(
        os.path.join(MITFIPATH, "mitfi.jar"), fastafile)
    trna_out = myexe(mitfi_cmd)
    print(trna_out)

    if prefix is None:
        fasta_name = fastafile.split("/")[-1]
        prefix = ".".join(fasta_name.split(".")[:-1])

    result_name = prefix + "_trna.txt"
    with open(result_name, "w") as fw:
        fw.write(trna_out)

    return result_name
Beispiel #13
0
def _cmsearch_wrapper_rrna(fastafile, MITFIPATH=None):
    """
    Run mitfi.jar with the r_rna.cm model on a fasta file and save the output.

    todo: too slow to be practical, maybe change to INFERNAL 1.1 and try

    :param fastafile: path of the fasta file passed to mitfi
    :param MITFIPATH: directory containing mitfi.jar; defaults to
        "bins/mitfi" next to this module
    :return: the file name of the mitfi run,
        "<fasta file name up to its first dot>_rrna.txt"
    """
    if MITFIPATH is None:
        path = os.path.dirname(__file__)
        MITFIPATH = os.path.join(path, "bins", "mitfi")
    jarfile = os.path.join(MITFIPATH, "mitfi.jar")
    # NOTE(review): the model is always taken from the module's own
    # bins/mitfi directory, even when MITFIPATH is given - confirm intended.
    rrna_cm = os.path.join(os.path.dirname(__file__), "bins", "mitfi", "r_rna.cm")

    mitfi_cmd = "java -jar {jarfile} -cm {rrna_cm} -top {fastafile}".format(
        jarfile=jarfile, fastafile=fastafile, rrna_cm=rrna_cm)
    rrna_out = myexe(mitfi_cmd)
    # print() call instead of the Python 2 print statement: same output on
    # Python 2, and valid syntax on Python 3
    print(rrna_out)

    prefix = fastafile.split("/")[-1].split(".")[0]
    result_name = prefix + "_rrna.txt"
    with open(result_name, "w") as fw:
        fw.write(rrna_out)

    return result_name