Ejemplo n.º 1
0
def _sam_to_bam(bam_fn):
    if bam_fn.endswith("bam"):
        bam_out = "%s.bam" % os.path.splitext(bam_fn)[0]
        cmd = "samtools view -Sbh {bam_fn} -o {bam_out}"
        do.run(cmd)
        return bam_out
    return bam_fn
Ejemplo n.º 2
0
def _create_clusters(seqL, bam_file, args):
    """
    Build (or reload) the cluster object for the aligned sequences.

    If ``list_obj.pk`` exists in ``args.out`` it is unpickled and returned.
    Otherwise ``cluster.bed`` is produced from ``bam_file`` with bedtools
    (unless ``file_exists`` already finds it), ``detect_clusters`` is run on
    it and the result is pickled to ``list_obj.pk``.
    """
    clus_obj = []
    cluster_file = op.join(args.out, "cluster.bed")
    pickle_path = op.join(args.out, 'list_obj.pk')
    if os.path.exists(pickle_path):
        logger.info("Loading previous clusters")
        with open(pickle_path, 'rb') as handle:
            clus_obj = pickle.load(handle)
    else:
        if not file_exists(cluster_file):
            logger.info("Parsing aligned file")
            logger.info("Merging sequences")
            # Prefer a bedtools binary sitting next to the Python executable.
            bedtools_bin = os.path.join(os.path.dirname(sys.executable),
                                        "bedtools")
            if not os.path.exists(bedtools_bin):
                bedtools_bin = "bedtools"
            awk_renumber = "awk '{i=i+1;print $1\"\\t\"$2\"\\t\"$3\"\\t\"$4\"\\t\"i\"\\t\"$6}'"
            template = "{bedtools} bamtobed -i {bam_file} | {parse_cmd} | {bedtools} cluster -s -d 20 -i - > {cluster_file}"
            do.run(template.format(bedtools=bedtools_bin,
                                   bam_file=bam_file,
                                   parse_cmd=awk_renumber,
                                   cluster_file=cluster_file))
        bed = pybedtools.BedTool(cluster_file)
        logger.info("Creating clusters")
        clus_obj = detect_clusters(bed, seqL, args.min_seqs, args.non_un_gl)
        with open(pickle_path, 'wb') as handle:
            pickle.dump(clus_obj, handle, pickle.HIGHEST_PROTOCOL)
    logger.info("%s clusters found" % (len(clus_obj.clusid)))
    return clus_obj
Ejemplo n.º 3
0
def _create_clusters(seqL, bam_file, args):
    """
    Return the clusters detected for the aligned sequences.

    The result is cached in ``args.out``: when ``list_obj.pk`` is present it
    is unpickled and returned as is. Otherwise ``cluster.bed`` is generated
    with bedtools (only if ``file_exists`` does not find it), clusters are
    detected from it and the object is pickled for later runs.
    """
    clus_obj = []
    out_dir = args.out
    cluster_file = op.join(out_dir, "cluster.bed")
    cached = os.path.exists(op.join(out_dir, 'list_obj.pk'))
    if not cached:
        if not file_exists(cluster_file):
            logger.info("Parsing aligned file")
            logger.info("Merging sequences")
            # Use the bedtools shipped beside the interpreter when there is one.
            sibling_bedtools = os.path.join(os.path.dirname(sys.executable), "bedtools")
            bedtools = sibling_bedtools if os.path.exists(sibling_bedtools) else "bedtools"
            parse_cmd = "awk '{i=i+1;print $1\"\\t\"$2\"\\t\"$3\"\\t\"$4\"\\t\"i\"\\t\"$6}'"
            pipeline = " | ".join([
                "{bedtools} bamtobed -i {bam_file}",
                "{parse_cmd}",
                "{bedtools} cluster -s -d 20 -i - > {cluster_file}",
            ])
            do.run(pipeline.format(bedtools=bedtools, bam_file=bam_file,
                                   parse_cmd=parse_cmd,
                                   cluster_file=cluster_file))
        clustered_bed = pybedtools.BedTool(cluster_file)
        logger.info("Creating clusters")
        clus_obj = detect_clusters(clustered_bed, seqL, args.min_seqs, args.non_un_gl)
        with open(op.join(out_dir, 'list_obj.pk'), 'wb') as pk_out:
            pickle.dump(clus_obj, pk_out, pickle.HIGHEST_PROTOCOL)
    else:
        logger.info("Loading previous clusters")
        with open(op.join(out_dir, 'list_obj.pk'), 'rb') as pk_in:
            clus_obj = pickle.load(pk_in)
    n_clusters = len(clus_obj.clusid)
    logger.info("%s clusters found" % (n_clusters))
    return clus_obj
Ejemplo n.º 4
0
def clean_bam_file(bam_in, mask=None):
    """
    Copy the reads of a BAM file that pass length and clipping filters.

    When ``mask`` is given, alignments overlapping it are removed first
    (bedtools ``intersect`` with ``v=True``) and the rest of the function
    works on the resulting ``*_mask.bam`` file. The input is indexed with
    ``samtools index`` and each read is written to ``*_rmlw.bam`` unless:

    * it has no match block (CIGAR op 0) or its first one is shorter than 17;
    * its first soft-clip block (CIGAR op 4) is longer than 3.

    :param bam_in: path to the BAM file to filter.
    :param mask: optional file handed to ``intersect`` as ``b``.
    :returns: ``(out_file, seq_obj)``; ``seq_obj`` maps the integer parsed
        from each kept read name (``seq_<n>``) to a ``sequence`` object whose
        ``align`` attribute is the read's ``NH`` tag (1 when the tag is absent).
    :raises IOError: if ``<bam_in>.bai`` is not found after indexing.
    """
    seq_obj = defaultdict(int)
    if mask:
        mask_file = op.splitext(bam_in)[0] + "_mask.bam"
        if not file_exists(mask_file):
            # Mask the input BAM itself (``bam_in``); no other BAM name is
            # defined in this function.
            pybedtools.BedTool(bam_in).intersect(b=mask, v=True).saveas(mask_file)
        bam_in = mask_file
    out_file = op.splitext(bam_in)[0] + "_rmlw.bam"
    run("samtools index %s" % bam_in)
    if not file_exists(bam_in + ".bai"):
        raise IOError("Failed to created bam index of %s. Try to do it manually" % bam_in)
    # Both handles are context-managed so the input is closed as well.
    with pysam.AlignmentFile(bam_in, "rb") as bam_handle:
        with pysam.AlignmentFile(out_file, "wb", template=bam_handle) as out_handle:
            for read in bam_handle.fetch():
                seq_name = int(read.query_name.replace('seq_', ''))
                # cigartuples is None when a read carries no CIGAR.
                cigar = read.cigartuples or []
                match_size = [nts for oper, nts in cigar if oper == 0]
                subs_size = [nts for oper, nts in cigar if oper == 4]
                # A read without any match block cannot satisfy the minimum
                # length, so it is dropped instead of raising IndexError.
                if not match_size or match_size[0] < 17:
                    continue
                if subs_size and subs_size[0] > 3:
                    continue
                try:
                    nh = read.get_tag('NH')
                except KeyError:
                    nh = 1
                seq_obj[seq_name] = sequence(seq_name)
                seq_obj[seq_name].align = nh
                out_handle.write(read)
    return out_file, seq_obj
Ejemplo n.º 5
0
def _sam_to_bam(bam_fn):
    if bam_fn.endswith("bam"):
        bam_out = "%s.bam" % os.path.splitext(bam_fn)[0]
        cmd = "samtools view -Sbh {bam_fn} -o {bam_out}"
        do.run(cmd)
        return bam_out
    return bam_fn
Ejemplo n.º 6
0
def _run_tRNA_scan(fasta_file):
    """
    Predict tRNAs in ``fasta_file`` by calling tRNAscan-SE.

    Returns the two paths given to the tool through ``-o`` and ``-f``, in
    that order; both are ``fasta_file`` plus a fixed suffix.
    """
    predictions = fasta_file + "_trnascan"
    structures = fasta_file + "_second_str"
    template = "tRNAscan-SE -q -o {out_file} -f {se_file} {fasta_file}"
    run(template.format(out_file=predictions,
                        se_file=structures,
                        fasta_file=fasta_file))
    return predictions, structures
Ejemplo n.º 7
0
def _run_tRNA_scan(fasta_file):
    """
    Call tRNAscan-SE on ``fasta_file``.

    The ``-o`` output goes to ``<fasta_file>_trnascan`` and the ``-f`` output
    to ``<fasta_file>_second_str``; both paths are returned as a tuple.
    """
    outputs = (fasta_file + "_trnascan", fasta_file + "_second_str")
    scan_out, struct_out = outputs
    pieces = ["tRNAscan-SE", "-q", "-o", scan_out, "-f", struct_out, fasta_file]
    run(" ".join(pieces))
    return outputs
Ejemplo n.º 8
0
def _cmd_miraligner(fn, out_file, species, hairpin, out):
    """
    Annotate miRNAs in ``fn`` with miraligner, writing to ``out_file``.

    The database directory is the folder that contains ``hairpin``. Nothing
    is run when ``file_exists`` already finds ``out_file``. miraligner's
    ``<out_file>.mirna`` result is moved onto ``out_file``, which is returned.
    ``out`` is accepted but not used.
    """
    # Resolved up front, before the existence check, exactly as the check
    # itself does not depend on them.
    launcher = _get_miraligner()
    db_dir = op.dirname(op.abspath(hairpin))
    if file_exists(out_file):
        return out_file
    logger.info("Running miraligner with %s" % fn)
    template = "{tool} -freq -i {fn} -o {out_file} -s {species} -db {path_db} -sub 1 -trim 3 -add 3"
    command = template.format(tool=launcher, fn=fn, out_file=out_file,
                              species=species, path_db=db_dir)
    do.run(command, "miraligner with %s" % fn)
    shutil.move(out_file + ".mirna", out_file)
    return out_file
Ejemplo n.º 9
0
def _cmd_miraligner(fn, out_file, species, hairpin, out):
    """
    Run miraligner on ``fn`` unless ``out_file`` is already there.

    ``-db`` points to the directory holding ``hairpin``. After the run the
    ``<out_file>.mirna`` file is renamed to ``out_file``. Returns
    ``out_file`` in every case; the ``out`` argument is not used.
    """
    fields = {
        "tool": _get_miraligner(),
        "fn": fn,
        "out_file": out_file,
        "species": species,
        "path_db": op.dirname(op.abspath(hairpin)),
    }
    needs_run = not file_exists(out_file)
    if needs_run:
        description = "miraligner with %s" % fn
        logger.info("Running miraligner with %s" % fn)
        do.run("{tool} -freq -i {fn} -o {out_file} -s {species} -db {path_db} -sub 1 -trim 3 -add 3".format(**fields),
               description)
        shutil.move(out_file + ".mirna", out_file)
    return out_file
Ejemplo n.º 10
0
def detect_regions(bam_in, bed_file, out_dir, prefix):
    """
    Count reads per locus (first CoRaL module).

    ``bed_file`` goes through ``_reorder_columns``; ``coverageBed -s -counts``
    is then run from inside ``out_dir`` and its sorted output is written to
    ``<out_dir>/loci.cov``. Returns whatever ``_fix_score_column`` gives back
    for that file. ``prefix`` is accepted but not used.
    """
    bed_file = _reorder_columns(bed_file)
    template = ("coverageBed -s -counts -b {bam_in} -a {bed_file} "
                "| sort -k4,4 > {out_dir}/loci.cov")
    with utils.chdir(out_dir):
        # The two read-length names are not placeholders of the template;
        # they are still passed so they keep being looked up at this point.
        command = template.format(
            bam_in=bam_in,
            bed_file=bed_file,
            out_dir=out_dir,
            min_trimmed_read_len=min_trimmed_read_len,
            max_trimmed_read_len=max_trimmed_read_len,
        )
        run(command, "Run counts_reads")
        loci_file = _fix_score_column(op.join(out_dir, "loci.cov"))
    return loci_file
Ejemplo n.º 11
0
def _download_mirbase(args, version="CURRENT"):
    """
    Download files from mirbase
    """
    if not args.hairpin or not args.mirna:
        logger.info("Working with version %s" % version)
        hairpin_fn = op.join(op.abspath(args.out), "hairpin.fa.gz")
        mirna_fn = op.join(op.abspath(args.out), "miRNA.str.gz")
        if not file_exists(hairpin_fn):
            cmd_h = "wget ftp://mirbase.org/pub/mirbase/%s/hairpin.fa.gz -O %s &&  gunzip -f !$" % (version, hairpin_fn)
            do.run(cmd_h, "download hairpin")
        if not file_exists(mirna_fn):
            cmd_m = "wget ftp://mirbase.org/pub/mirbase/%s/miRNA.str.gz -O %s && gunzip -f !$" % (version, mirna_fn)
            do.run(cmd_m, "download mirna")
    else:
        return args.hairpin, args.mirna
Ejemplo n.º 12
0
def deprecated_map_to_precursors(seqs, names, loci, out_file, args):
    """
    Align sequences against the precursor loci with bowtie2.

    Works inside a temporary directory. When ``bowtie2-build`` is found, an
    index is built from the loci fasta, the sequences are aligned, the SAM is
    parsed by ``read_alignment`` (whose return value becomes the result) and
    the loci fasta is copied next to ``out_file`` with "tsv" replaced by
    "fa". Without ``bowtie2-build`` the given ``out_file`` is returned as is.
    """
    with make_temp_directory() as temp:
        sam_path = os.path.join(temp, "out.sam")
        loci_fasta = get_loci_fasta(loci, os.path.join(temp, "pre.fa"), args.ref)
        precursor_copy = out_file.replace("tsv", "fa")
        reads_fasta = get_seqs_fasta(seqs, names, os.path.join(temp, "seqs.fa"))
        if find_cmd("bowtie2-build"):
            build_cmd = "bowtie2-build -f {pre_fasta} {temp}/pre"
            run(build_cmd.format(pre_fasta=loci_fasta, temp=temp))
            align_cmd = "bowtie2 -a --rdg 7,3 --mp 4 --end-to-end -D 20 -R 3 -N 0 -i S,1,0.8 -L 3 -f -x  {temp}/pre -U {seqs_fasta} -S {out_sam}"
            run(align_cmd.format(temp=temp, seqs_fasta=reads_fasta,
                                 out_sam=sam_path))
            out_file = read_alignment(sam_path, loci, seqs, out_file)
            shutil.copy(loci_fasta, precursor_copy)
    return out_file
Ejemplo n.º 13
0
def map_to_precursors(seqs, names, loci, out_file, args):
    """
    Align sequences against the precursor loci with razers3.

    Works inside a temporary directory. When ``razers3`` is found, the
    sequences are aligned to ``<temp>/pre.fa``, the SAM is parsed by
    ``read_alignment`` (whose return value becomes the result) and the loci
    fasta is copied next to ``out_file`` with "tsv" replaced by "fa".
    Without ``razers3`` the given ``out_file`` is returned as is.
    """
    with make_temp_directory() as temp:
        sam_path = os.path.join(temp, "out.sam")
        loci_fasta = get_loci_fasta(loci, os.path.join(temp, "pre.fa"), args.ref)
        precursor_copy = out_file.replace("tsv", "fa")
        reads_fasta = get_seqs_fasta(seqs, names, os.path.join(temp, "seqs.fa"))
        if find_cmd("razers3"):
            template = "razers3 -dr 2 -i 80 -rr 90 -f -o {out_sam} {temp}/pre.fa  {seqs_fasta}"
            run(template.format(out_sam=sam_path, temp=temp,
                                seqs_fasta=reads_fasta))
            out_file = read_alignment(sam_path, loci, seqs, out_file)
            shutil.copy(loci_fasta, precursor_copy)
    return out_file
Ejemplo n.º 14
0
def deprecated_map_to_precursors(seqs, names, loci, out_file, args):
    """
    Map sequences to precursors using bowtie2 (older implementation).

    Temporary fasta and SAM files live in a scratch directory. If
    ``bowtie2-build`` is not found, ``out_file`` is returned untouched;
    otherwise the index is built, reads are aligned, ``read_alignment``
    produces the returned value and the loci fasta is copied to the
    ``out_file`` name with "tsv" replaced by "fa".
    """
    with make_temp_directory() as temp:
        paths = {
            "temp": temp,
            "out_sam": os.path.join(temp, "out.sam"),
        }
        paths["pre_fasta"] = get_loci_fasta(loci, os.path.join(temp, "pre.fa"), args.ref)
        fasta_copy_target = out_file.replace("tsv", "fa")
        paths["seqs_fasta"] = get_seqs_fasta(seqs, names, os.path.join(temp, "seqs.fa"))
        if not find_cmd("bowtie2-build"):
            return out_file
        run("bowtie2-build -f {pre_fasta} {temp}/pre".format(**paths))
        run("bowtie2 -a --rdg 7,3 --mp 4 --end-to-end -D 20 -R 3 -N 0 -i S,1,0.8 -L 3 -f -x  {temp}/pre -U {seqs_fasta} -S {out_sam}".format(**paths))
        result = read_alignment(paths["out_sam"], loci, seqs, out_file)
        shutil.copy(paths["pre_fasta"], fasta_copy_target)
        return result
Ejemplo n.º 15
0
def map_to_precursors(seqs, names, loci, out_file, args):
    """
    Map sequences to precursors using razers3.

    Temporary fasta and SAM files live in a scratch directory. If
    ``razers3`` is not found, ``out_file`` is returned untouched; otherwise
    reads are aligned to ``<temp>/pre.fa``, ``read_alignment`` produces the
    returned value and the loci fasta is copied to the ``out_file`` name
    with "tsv" replaced by "fa".
    """
    with make_temp_directory() as temp:
        paths = {
            "temp": temp,
            "out_sam": os.path.join(temp, "out.sam"),
        }
        loci_fasta = get_loci_fasta(loci, os.path.join(temp, "pre.fa"), args.ref)
        fasta_copy_target = out_file.replace("tsv", "fa")
        paths["seqs_fasta"] = get_seqs_fasta(seqs, names, os.path.join(temp, "seqs.fa"))
        if not find_cmd("razers3"):
            return out_file
        run("razers3 -dr 2 -i 80 -rr 90 -f -o {out_sam} {temp}/pre.fa  {seqs_fasta}".format(**paths))
        result = read_alignment(paths["out_sam"], loci, seqs, out_file)
        shutil.copy(loci_fasta, fasta_copy_target)
        return result
Ejemplo n.º 16
0
def _download_mirbase(args, version="CURRENT"):
    """
    Download files from mirbase
    """
    if not args.hairpin or not args.mirna:
        logger.info("Working with version %s" % version)
        hairpin_fn = op.join(op.abspath(args.out), "hairpin.fa.gz")
        mirna_fn = op.join(op.abspath(args.out), "miRNA.str.gz")
        if not file_exists(hairpin_fn):
            cmd_h = "wget ftp://mirbase.org/pub/mirbase/%s/hairpin.fa.gz -O %s &&  gunzip -f !$" % (
                version, hairpin_fn)
            do.run(cmd_h, "download hairpin")
        if not file_exists(mirna_fn):
            cmd_m = "wget ftp://mirbase.org/pub/mirbase/%s/miRNA.str.gz -O %s && gunzip -f !$" % (
                version, mirna_fn)
            do.run(cmd_m, "download mirna")
    else:
        return args.hairpin, args.mirna
Ejemplo n.º 17
0
def _get_miraligner():
    """
    Return the command prefix used to launch miraligner.

    ``miraligner`` is first run through ``os.system``; a zero exit status
    selects ``miraligner <java opts>``. Otherwise ``miraligner.jar`` in the
    current directory is used (fetched with wget when ``utils.file_exists``
    does not find it) and the prefix is ``java -jar <java opts> <jar path>``.
    """
    java_opts = "-Xms750m -Xmx4g"
    executable = "miraligner"
    if os.system(executable) == 0:
        template = "%s {opts}" % executable
    else:
        if not utils.file_exists(op.abspath("miraligner.jar")):
            url = "https://raw.githubusercontent.com/lpantano/seqbuster/miraligner/modules/miraligner/miraligner.jar"
            wget_args = ["wget", "-O miraligner.jar", "--no-check-certificate", url]
            do.run(" ".join(wget_args), "Download miraligner.")
        template = "java -jar {opts} %s" % op.abspath("miraligner.jar")
    return template.format(opts=java_opts)
Ejemplo n.º 18
0
def get_fasta(bed_file, ref, out_fa):
    """
    Write to ``out_fa`` the sequences of ``bed_file`` intervals taken from
    ``ref``, using ``bedtools getfasta -s``.
    """
    template = "bedtools getfasta -s -fi {ref} -bed {bed_file} -fo {out_fa}"
    command = template.format(ref=ref, bed_file=bed_file, out_fa=out_fa)
    run(command)
Ejemplo n.º 19
0
def get_fasta(bed_file, ref, out_fa):
    """
    Extract fasta records for the intervals in ``bed_file``.

    Calls ``bedtools getfasta -s`` with ``ref`` as ``-fi`` and ``out_fa`` as
    ``-fo``. Nothing is returned.
    """
    placeholders = {"ref": ref, "bed_file": bed_file, "out_fa": out_fa}
    run("bedtools getfasta -s -fi {ref} -bed {bed_file} -fo {out_fa}".format(**placeholders))
Ejemplo n.º 20
0
def create_features(bam_in, loci_file, reference, out_dir):
    """
    Use feature extraction module from CoRaL
    """
    lenvec_plus = op.join(out_dir, 'genomic_lenvec.plus')
    lenvec_minus = op.join(out_dir, 'genomic_lenvec.minus')
    compute_genomic_cmd = ("compute_genomic_lenvectors "
                           "{bam_in} {lenvec_plus} "
                           "{lenvec_minus} "
                           "{min_len} "
                           "{max_len} ")
    index_genomic_cmd = ("index_genomic_lenvectors "
                         "{lenvec} ")
    genomic_lenvec = op.join(out_dir, 'genomic_lenvec')
    feat_len_file = op.join(out_dir, 'feat_lengths.txt')
    compute_locus_cmd = ("compute_locus_lenvectors "
                         "{loci_file} "
                         "{genomic_lenvec} "
                         "{min_len} "
                         "{max_len} "
                         "> {feat_len_file}")
    cov_S_file = op.join(out_dir, 'loci.cov_anti')
    coverage_anti_cmd = ("coverageBed -S -counts -b "
                         "{bam_in} -a {loci_file} "
                         "> {cov_S_file}")
    feat_posentropy = op.join(out_dir, 'feat_posentropy.txt')
    entropy_cmd = ("compute_locus_entropy.rb "
                   "{counts_reads} "
                   "> {feat_posentropy}")
    with utils.chdir(out_dir):
        run(compute_genomic_cmd.format(min_len=min_trimmed_read_len, max_len=max_trimmed_read_len, **locals()), "Run compute_genomic")
        run(index_genomic_cmd.format(lenvec=lenvec_plus), "Run index in plus")
        run(index_genomic_cmd.format(lenvec=lenvec_minus), "Run index in minus")
        run(compute_locus_cmd.format(min_len=min_trimmed_read_len, max_len=max_trimmed_read_len, **locals()), "Run compute locus")
        run(coverage_anti_cmd.format(**locals()), "Run coverage antisense")
        feat_antisense = _order_antisense_column(cov_S_file, min_trimmed_read_len)

        counts_reads = _reads_per_position(bam_in, loci_file, out_dir)
        run(entropy_cmd.format(**locals()), "Run entropy")

        rnafold = calculate_structure(loci_file, reference)