Example #1
def clean_bam_file(bam_in, mask=None):
    """
    Remove reads with low counts and a high number of hits from the alignment.
    """
    seq_obj = defaultdict(int)
    if mask:
        mask_file = op.splitext(bam_in)[0] + "_mask.bam"
        if not file_exists(mask_file):
            pybedtools.BedTool(bam_in).intersect(b=mask, v=True).saveas(mask_file)
        bam_in = mask_file
    out_file = op.splitext(bam_in)[0] + "_rmlw.bam"
    # bam.index(bam_in, {'algorithm':{}})
    run("samtools index %s" % bam_in)
    if not file_exists(bam_in + ".bai"):
        raise IOError("Failed to created bam index of %s. Try to do it manually" % bam_in)
    bam_handle = pysam.AlignmentFile(bam_in, "rb")
    with pysam.AlignmentFile(out_file, "wb", template=bam_handle) as out_handle:
        for read in bam_handle.fetch():
            seq_name = int(read.query_name.replace('seq_', ''))
            match_size = [nts for oper, nts in read.cigartuples if oper == 0]  # CIGAR op 0: aligned match
            subs_size = [nts for oper, nts in read.cigartuples if oper == 4]   # CIGAR op 4: soft clip
            if match_size[0] < 17:
                continue
            if subs_size and subs_size[0] > 3:
                continue
            try:
                nh = read.get_tag('NH')
            except KeyError:
                nh = 1
            seq_obj[seq_name] = sequence(seq_name)
            seq_obj[seq_name].align = nh
            out_handle.write(read)
    return out_file, seq_obj
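
A minimal usage sketch for the filter above (the paths and the mask BED are hypothetical; file_exists, run, and the sequence class come from the surrounding package):

# Drop reads with < 17 matched nt or > 3 soft-clipped nt, masking repeats first.
out_bam, seq_obj = clean_bam_file("sample_aligned.bam", mask="repeats.bed")
print("%s sequences kept in %s" % (len(seq_obj), out_bam))
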
Example #2
def miraligner(args):
    """
    Realign BAM hits to miRBase for better accuracy and annotation.
    """
    hairpin, mirna = _download_mirbase(args)
    precursors = _read_precursor(args.hairpin, args.sps)
    matures = _read_mature(args.mirna, args.sps)
    gtf = _read_gtf(args.gtf)
    out_dts = []
    for bam_fn in args.files:
        sample = op.splitext(op.basename(bam_fn))[0]
        if bam_fn.endswith("bam") or bam_fn.endswith("sam"):
            logger.info("Reading %s" % bam_fn)
            bam_fn = _sam_to_bam(bam_fn)
            bam_sort_by_n = op.splitext(bam_fn)[0] + "_sort"
            pysam.sort("-n", bam_fn, bam_sort_by_n)
            reads = _read_bam(bam_sort_by_n + ".bam", precursors)
        elif bam_fn.endswith("fasta") or bam_fn.endswith("fa") or bam_fn.endswith("fastq"):
            out_file = op.join(args.out, sample + ".premirna")
            bam_fn = _filter_seqs(bam_fn)
            if args.miraligner:
                _cmd_miraligner(bam_fn, out_file, args.sps, args.hairpin)
                reads = _read_miraligner(out_file)
            else:
                if bam_fn.endswith("fastq"):
                    bam_fn = _convert_to_fasta(bam_fn)
                logger.info("Aligning %s" % bam_fn)
                if not file_exists(out_file):
                    pyMatch.Miraligner(hairpin, bam_fn, out_file, 1, 4)
                reads = _read_pyMatch(out_file, precursors)
        else:
            raise ValueError("Format not recognized.")

        if not args.miraligner:
            reads = _annotate(reads, matures, precursors)
        out_file = op.join(args.out, sample + ".mirna")
        out_file, dt, dt_pre = _tab_output(reads, out_file, sample)
        try:
            vcf_file = op.join(args.out, sample + ".vcf")
            if not file_exists(vcf_file):
            # if True:
                create_vcf(dt_pre, matures, gtf, vcf_file)
            try:
                import vcf
                vcf.Reader(filename=vcf_file)
            except Exception as e:
                logger.warning(e.__doc__)
                logger.warning(str(e))
        except Exception as e:
            # traceback.print_exc()
            logger.warning(e.__doc__)
            logger.warning(str(e))
        if isinstance(dt, pd.DataFrame):
            out_dts.append(dt)

    if out_dts:
        _create_counts(out_dts, args.out)
        # _summarize(out_dts)
    else:
        print "No files analyzed!"
Example #3
def _single_cluster(c, data, out_file, args):
    """
    Map sequences on precursors and create
    expression profile
    """
    valid, ann = 0, 0
    raw_file = None
    freq = defaultdict()
    for s in data[0][c]['freq']:
        freq[list(s.keys())[0]] = list(s.values())[0]
    names = [list(s.keys())[0] for s in data[0][c]['seqs']]
    seqs = [list(s.values())[0] for s in data[0][c]['seqs']]
    loci = data[0][c]['loci']

    if loci[0][3] - loci[0][2] > 500:
        logger.info("locus bigger > 500 nt, skipping: %s" % loci)
        return valid, ann, {}
    if not file_exists(out_file):
        if args.razer:
            logger.debug("map with razer all sequences to all loci %s " % loci)
            map_to_precursors(seqs, names, {loci[0][0]: [loci[0][0:5]]},
                              out_file, args)
        else:
            logger.debug("map with C fn all sequences to all loci %s " % loci)
            if args.debug:
                raw_file = out_file
            out_file = map_to_precursors_on_fly(seqs, names, loci[0][0:5],
                                                args)

    logger.debug("plot sequences on loci")
    df = _convert_to_df(out_file, freq, raw_file)
    if df:
        valid, ann = _make(data[0][c])

    return valid, ann, df
Example #4
def _download_mirbase(args, version="CURRENT"):
    """
    Download files from mirbase
    """
    if not args.hairpin or not args.mirna:
        logger.info("Working with version %s" % version)
        hairpin_fn = op.join(op.abspath(args.out), "hairpin.fa.gz")
        mirna_fn = op.join(op.abspath(args.out), "miRNA.str.gz")
        if not file_exists(hairpin_fn):
            cmd_h = "wget ftp://mirbase.org/pub/mirbase/%s/hairpin.fa.gz -O %s &&  gunzip -f !$" % (version, hairpin_fn)
            do.run(cmd_h, "download hairpin")
        if not file_exists(mirna_fn):
            cmd_m = "wget ftp://mirbase.org/pub/mirbase/%s/miRNA.str.gz -O %s && gunzip -f !$" % (version, mirna_fn)
            do.run(cmd_m, "download mirna")
    else:
        return args.hairpin, args.mirna
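
When both files are already provided the function is a pass-through; a usage sketch (paths and the version string are illustrative):

from argparse import Namespace

args = Namespace(out="out_dir", hairpin="db/hairpin.fa", mirna="db/miRNA.str")
hairpin, mirna = _download_mirbase(args, version="22.1")
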
Example #5
def _single_cluster(c, data, out_file, args):
    """
    Map sequences on precursors and create
    expression profile
    """
    valid, ann = 0, 0
    raw_file = None
    freq = defaultdict()
    for s in data[0][c]['freq']:
        freq[list(s.keys())[0]] = list(s.values())[0]
    names = [list(s.keys())[0] for s in data[0][c]['seqs']]
    seqs = [list(s.values())[0] for s in data[0][c]['seqs']]
    loci = data[0][c]['loci']

    if loci[0][3] - loci[0][2] > 500:
        logger.info("locus bigger > 500 nt, skipping: %s" % loci)
        return valid, ann, {}
    if not file_exists(out_file):
        if args.razer:
            logger.debug("map with razer all sequences to all loci %s " % loci)
            map_to_precursors(seqs, names, {loci[0][0]: [loci[0][0:5]]}, out_file, args)
        else:
            logger.debug("map with biopython fn all sequences to all loci %s " % loci)
            if args.debug:
                raw_file = out_file
            out_file = map_to_precursor_biopython(seqs, names, loci[0][0:5], args)

    logger.debug("plot sequences on loci")
    df = _convert_to_df(out_file, freq, raw_file)
    
    if df:
        logger.debug("create html")
        valid, ann = _make(data[0][c])
    logger.debug("done single cluster")
    return valid, ann, df
Example #6
def _create_clusters(seqL, bam_file, args):
    """
    Cluster sequences and
    create metaclusters with multi-mappers.
    """
    clus_obj = []
    cluster_file = op.join(args.out, "cluster.bed")
    if not os.path.exists(op.join(args.out, 'list_obj.pk')):
        if not file_exists(cluster_file):
            logger.info("Parsing aligned file")
            logger.info("Merging sequences")
            bedtools = os.path.join(os.path.dirname(sys.executable), "bedtools")
            bedtools = bedtools if os.path.exists(bedtools) else "bedtools"
            parse_cmd = "awk '{i=i+1;print $1\"\\t\"$2\"\\t\"$3\"\\t\"$4\"\\t\"i\"\\t\"$6}'"
            cmd = "{bedtools} bamtobed -i {bam_file} | {parse_cmd} | {bedtools} cluster -s -d 20 -i - > {cluster_file}"
            do.run(cmd.format(**locals()))
        c = pybedtools.BedTool(cluster_file)
        logger.info("Creating clusters")
        clus_obj = detect_clusters(c, seqL, args.min_seqs, args.non_un_gl)
        with open(op.join(args.out, 'list_obj.pk'), 'wb') as output:
            pickle.dump(clus_obj, output, pickle.HIGHEST_PROTOCOL)
    else:
        logger.info("Loading previous clusters")
        with open(op.join(args.out, 'list_obj.pk'), 'rb') as in_handle:
            clus_obj = pickle.load(in_handle)
    # bedfile = pybedtools.BedTool(generate_position_bed(clus_obj), from_string=True)
    # seqs_2_loci = bedfile.intersect(pybedtools.BedTool(aligned_bed, from_string=True), wo=True, s=True)
    # seqs_2_position = add_seqs_position_to_loci(seqs_2_loci, seqL)
    logger.info("%s clusters found" % (len(clus_obj.clusid)))
    return clus_obj
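
The expensive clustering step is memoized with pickle; the same cache-or-compute pattern, reduced to a standalone sketch:

import pickle

def cached(path, compute):
    # Load a pickled object from path, or compute and store it.
    try:
        with open(path, 'rb') as handle:
            return pickle.load(handle)
    except FileNotFoundError:
        obj = compute()
        with open(path, 'wb') as handle:
            pickle.dump(obj, handle, pickle.HIGHEST_PROTOCOL)
        return obj
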
Example #7
def _create_clusters(seqL, bam_file, args):
    """
    Cluster sequences and
    create metaclusters with multi-mappers.
    """
    clus_obj = []
    logger.info("Parsing aligned file")
    cluster_file = op.join(args.out, "cluster.bed")
    if not os.path.exists(args.out + '/list_obj.pk'):
        logger.info("Merging position")
        if not file_exists(cluster_file):
            aligned_bed = parse_align_file(bam_file)
            a = pybedtools.BedTool(aligned_bed, from_string=True)
            c = a.cluster(s=True, d=20)
            c.saveas(cluster_file)
        else:
            c = pybedtools.BedTool(cluster_file)
        logger.info("Creating clusters")
        clus_obj = detect_clusters(c, seqL, args.min_seqs, args.non_un_gl)
        with open(args.out + '/list_obj.pk', 'wb') as output:
            pickle.dump(clus_obj, output, pickle.HIGHEST_PROTOCOL)
    else:
        logger.info("Loading previous clusters")
        with open(args.out + '/list_obj.pk', 'rb') as in_handle:
            clus_obj = pickle.load(in_handle)
    # bedfile = pybedtools.BedTool(generate_position_bed(clus_obj), from_string=True)
    # seqs_2_loci = bedfile.intersect(pybedtools.BedTool(aligned_bed, from_string=True), wo=True, s=True)
    # seqs_2_position = add_seqs_position_to_loci(seqs_2_loci, seqL)
    logger.info("%s clusters found" % (len(clus_obj.clusid)))
    return clus_obj
Example #8
def _filter_seqs(fn):
    """Convert names of sequences to unique ids"""
    out_file = op.splitext(fn)[0] + "_unique.fa"
    idx = 0
    if not file_exists(out_file):
        with open(out_file, 'w') as out_handle:
            with open(fn) as in_handle:
                for line in in_handle:
                    if line.startswith("@") or line.startswith(">"):
                        fixed_name = _make_unique(line.strip(), idx)
                        seq = next(in_handle).strip()
                        counts = _get_freq(fixed_name)
                        if len(seq) < 26 and (counts > 1 or counts == 0):
                            idx += 1
                            print(fixed_name, file=out_handle)
                            print(seq, file=out_handle)
                        try:
                            if line.startswith("@"):
                                next(in_handle)  # skip FASTQ '+' separator
                                next(in_handle)  # skip FASTQ quality line
                        except StopIteration:
                            pass

    return out_file
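
A hypothetical round trip for the filter: _make_unique() and _get_freq() are package helpers not shown here, assumed to rewrite the header and to recover the read abundance encoded in it. Reads of 26 nt or longer, or with a count of exactly 1, are dropped.

with open("reads.fa", "w") as handle:
    handle.write(">read1_x5\nTGAGGTAGTAGGTTGTATAGTT\n")
unique_fa = _filter_seqs("reads.fa")   # writes reads_unique.fa
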
Example #9
def _download_mirbase(args, version="CURRENT"):
    """
    Download files from mirbase
    """
    if not args.hairpin or not args.mirna:
        logger.info("Working with version %s" % version)
        hairpin_fn = op.join(op.abspath(args.out), "hairpin.fa.gz")
        mirna_fn = op.join(op.abspath(args.out), "miRNA.str.gz")
        if not file_exists(hairpin_fn):
            cmd_h = "wget ftp://mirbase.org/pub/mirbase/%s/hairpin.fa.gz -O %s &&  gunzip -f !$" % (
                version, hairpin_fn)
            do.run(cmd_h, "download hairpin")
        if not file_exists(mirna_fn):
            cmd_m = "wget ftp://mirbase.org/pub/mirbase/%s/miRNA.str.gz -O %s && gunzip -f !$" % (
                version, mirna_fn)
            do.run(cmd_m, "download mirna")
    else:
        return args.hairpin, args.mirna
Example #10
def _single_cluster(c, data, out_file, args):
    """
    Map sequences on precursors and create
    expression profile
    """
    valid, ann = 0, 0
    raw_file = None
    figure_file = out_file.replace(".tsv", ".png")
    html_file = out_file.replace(".tsv", ".html")
    prefix = os.path.dirname(out_file)
    freq = defaultdict()
    for s in data[0][c]['freq']:
        freq[list(s.keys())[0]] = list(s.values())[0]
    names = [list(s.keys())[0] for s in data[0][c]['seqs']]
    seqs = [list(s.values())[0] for s in data[0][c]['seqs']]
    loci = data[0][c]['loci']

    if loci[0][3] - loci[0][2] > 500:
        logger.info("locus bigger > 500 nt, skipping: %s" % loci)
        return valid, ann, {}
    if not file_exists(out_file):
        if args.razer:
            logger.debug("map with razer all sequences to all loci %s " % loci)
            map_to_precursors(seqs, names, {loci[0][0]: [loci[0][0:5]]}, out_file, args)
        else:
            logger.debug("map with C fn all sequences to all loci %s " % loci)
            if args.debug:
                raw_file = out_file
            out_file = map_to_precursors_on_fly(seqs, names, loci[0][0:5], args)

    logger.debug("plot sequences on loci")
    df = _convert_to_df(out_file, freq, raw_file)
    if df:
        if not file_exists(figure_file):
            fig = plt.figure()
            for s in df:
                plt.plot(list(df[s].keys()), list(df[s].values()))
            plt.ylabel('Normalized expression', fontsize=15)
            plt.xlabel('Position', fontsize=15)
            plt.savefig(figure_file)
            plt.close(fig)
        valid, ann = _make_html(data[0][c], html_file, figure_file, prefix)

    return valid, ann, df
Example #11
def _cmd_miraligner(fn, out_file, species, hairpin, out):
    """
    Run miraligner for miRNA annotation
    """
    tool = _get_miraligner()
    path_db = op.dirname(op.abspath(hairpin))
    cmd = "{tool} -freq -i {fn} -o {out_file} -s {species} -db {path_db} -sub 1 -trim 3 -add 3"
    if not file_exists(out_file):
        logger.info("Running miraligner with %s" % fn)
        do.run(cmd.format(**locals()), "miraligner with %s" % fn)
        shutil.move(out_file + ".mirna", out_file)  # miraligner appends .mirna to the output prefix
    return out_file
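
For reference, an illustration of how cmd.format(**locals()) expands (all values are placeholders):

tool = "java -jar -Xms750m -Xmx4g /opt/miraligner.jar"
fn, out_file = "sample_unique.fa", "out_dir/sample.premirna"
species, path_db = "hsa", "/opt/db"
cmd = "{tool} -freq -i {fn} -o {out_file} -s {species} -db {path_db} -sub 1 -trim 3 -add 3"
print(cmd.format(**locals()))
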
Example #12
def _check_args(args):
    """
    check arguments before starting analysis.
    """
    logger.info("Checking parameters and files")
    args.dir_out = args.out
    args.samplename = "pro"
    global decision_cluster
    global similar
    if not os.path.isdir(args.out):
        logger.warning("the output folder doens't exists")
        os.mkdirs(args.out)
    if args.bed and args.gtf:
        logger.error("cannot provide -b and -g at the same time")
        raise ValueError("cannot provide -b and -g at the same time")
    if args.debug:
        logger.info("DEBUG messages will be showed in file.")
    if args.bed:
        args.list_files = args.bed
        args.type_ann = "bed"
    if args.gtf:
        args.list_files = args.gtf
        args.type_ann = "gtf"
    logger.info("Output dir will be: %s" % args.dir_out)
    if not all([file_exists(args.ffile), file_exists(args.afile)]):
        logger.error("I/O error: Seqs.ma or Seqs.bam. ")
        raise IOError("Seqs.ma or/and Seqs.bam doesn't exists.")
    if hasattr(args, 'list_files'):
        beds = args.list_files.split(",")
        for filebed in beds:
            if not file_exists(filebed):
                logger.error("I/O error: {0}".format(filebed))
                raise IOError("%s  annotation files doesn't exist" % filebed)
    param.decision_cluster = args.method
    if args.similar:
        param.similar = float(args.similar)
    if args.min_seqs:
        param.min_seqs = int(args.min_seqs)
    return args
Example #13
def detect_complexity(bam_in, genome):
    """
    Estimate the fraction of the genome covered by small RNA sequences.
    """
    if not genome:
        logger.info("No genome given. skipping.")
        return None
    out_file = bam_in + "_cov.tsv"
    if file_exists(out_file):
        return None
    fai = genome + ".fai"
    cov = pybedtools.BedTool(bam_in).genome_coverage(g=fai, max=1)
    cov.saveas(out_file)
    total = 0
    for region in cov:
        # histogram rows: (chrom, depth, bases_at_depth, chrom_size, fraction)
        if region[0] == "genome" and int(region[1]) != 0:
            total += float(region[4])
    logger.info("Total genome with sequences: %s " % total)
Example #14
def _get_miraligner():
    opts = "-Xms750m -Xmx4g"
    try:
        tool = "miraligner"
        ret = os.system(tool)  # runs the bare command; non-zero exit is treated as "not installed"
        if ret != 0:
            raise SystemExit("%s not installed." % tool)
    except SystemExit:
        tool = None
        pass
    if not tool:
        if not utils.file_exists(op.abspath("miraligner.jar")):
            url = "https://raw.githubusercontent.com/lpantano/seqbuster/miraligner/modules/miraligner/miraligner.jar"
            cmd = ["wget", "-O miraligner.jar", "--no-check-certificate", url]
            do.run(" ".join(cmd), "Download miraligner.")
        tool = "java -jar {opts} %s" % op.abspath("miraligner.jar")
    else:
        tool = "%s {opts}" % tool
    return tool.format(**locals())
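
A lighter availability probe than os.system(tool), which actually executes the tool, is shutil.which; shown here as an alternative sketch, not the package's code:

import shutil

# shutil.which returns the full path when miraligner is on PATH, else None.
tool = "miraligner" if shutil.which("miraligner") else None
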
Example #15
def _make_html(c, html_file, figure_file, prefix):
    """
    create html from template, adding figure,
    annotation and sequences counts
    """
    ann = defaultdict(list)
    seqs_table = []
    src_img = "<img src=\"%s\" width=\"800\" height=\"350\" />" % os.path.basename(figure_file)
    coor_list = [" ".join(map(str, l)) for l in c['loci']]

    for pos in c['ann']:
        for db in pos:
            ann[db] += list(pos[db])
    logger.debug(ann)

    valid = [l for l in c['valid']]
    ann_list = [", ".join(list(set(ann[feature]))) for feature in ann if feature in valid]

    seqs = [list(s.values())[0] for s in c['seqs']]
    freq = [list(map(float, list(s.values())[0].values())) for s in c['freq']]
    header = ['seq'] + list(list(c['freq'][0].values())[0].keys())
    for s, f in zip(seqs, freq):
        f = [round(x) for x in f]
        seqs_table.append([s] + [str(x) for x in f])
    # seqs_html = seqs_html.replace("TABLE", "TABLE id=\"keywords\"")
    if not file_exists(html_file):
        coor_html = HTML.list(coor_list)
        ann_html = HTML.list(ann_list)
        seqs_html = HTML.table(seqs_table,
                               header_row=header, attribs={'id': 'keywords'})
        html_template = os.path.normpath(os.path.join(os.path.dirname(os.path.realpath(templates.__file__)), "cluster"))
        content = open(html_template).read()
        data = {'profile': src_img,
                'loci': coor_html,
                'annotation': ann_html,
                'table': seqs_html}
        out_content = string.Template(content).safe_substitute(data)
        with open(html_file, 'w') as out_handle:
            print(out_content, file=out_handle)

    return valid, ann_list
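
A minimal demonstration of the string.Template.safe_substitute call that fills the cluster template (template text and values are illustrative):

import string

content = "Profile: $profile\nLoci: $loci"
print(string.Template(content).safe_substitute(
    {"profile": "<img src=\"cluster.png\" />", "loci": "chr1 100 200"}))
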
Example #16
def _filter_seqs(fn):
    """Convert names of sequences to unique ids"""
    out_file = op.splitext(fn)[0] + "_unique.fa"
    idx = 0
    if not file_exists(out_file):
        with open(out_file, 'w') as out_handle:
            with open(fn) as in_handle:
                line = in_handle.readline()
                while line:
                    if line.startswith("@") or line.startswith(">"):
                        fixed_name = _make_unique(line.strip(), idx)
                        seq = in_handle.readline().strip()
                        counts = _get_freq(fixed_name)
                        if len(seq) < 26 and (counts > 1 or counts == 0):
                            idx += 1
                            print(fixed_name, file=out_handle)
                            print(seq, file=out_handle)
                        if line.startswith("@"):
                            in_handle.readline()
                            in_handle.readline()
                    line = in_handle.readline()
    return out_file
Example #17
def _filter_seqs(fn):
    """Convert names of sequences to unique ids"""
    out_file = op.splitext(fn)[0] + "_unique.fa"
    idx = 0
    if not file_exists(out_file):
        with open(out_file, 'w') as out_handle:
            with open(fn) as in_handle:
                for line in in_handle:
                    if line.startswith("@") or line.startswith(">"):
                        fixed_name = _make_unique(line.strip(), idx)
                        seq = next(in_handle).strip()
                        counts = _get_freq(fixed_name)
                        if len(seq) < 26 and (counts > 1 or counts == 0):
                            idx += 1
                            print(fixed_name, file=out_handle)
                            print(seq, file=out_handle)
                        try:
                            if line.startswith("@"):
                                next(in_handle)  # skip FASTQ '+' separator
                                next(in_handle)  # skip FASTQ quality line
                        except StopIteration:
                            pass

    return out_file
Example #18
def cluster(args):
    args = _check_args(args)
    read_stats_file = op.join(args.dir_out, "read_stats.tsv")
    if file_exists(read_stats_file):
        os.remove(read_stats_file)

    bam_file, seq_obj = _clean_alignment(args)

    logger.info("Parsing matrix file")
    seqL, y, l = parse_ma_file(seq_obj, args.ffile)
    # y, l = _total_counts(seqL.keys(), seqL)
    logger.info("counts after: %s" % sum(y.values()))
    logger.info("# sequences after: %s" % l)
    dt = pd.DataFrame({'sample': y.keys(), 'counts': y.values()})
    dt['step'] = 'aligned'
    dt.to_csv(read_stats_file, sep="\t", index=False, header=False, mode='a')

    if len(seqL.keys()) < 10:
        logger.error("It seems you have low coverage. Please check your fastq files have enough sequences.")
        raise ValueError("So few sequences.")

    logger.info("Cleaning bam file")
    y, l = _total_counts(seqL.keys(), seqL)
    logger.info("counts after: %s" % sum(y.values()))
    logger.info("# sequences after: %s" % l)
    dt = pd.DataFrame({'sample': y.keys(), 'counts': y.values()})
    dt['step'] = 'cleaned'
    dt.to_csv(read_stats_file, sep="\t", index=False, header=False, mode='a')

    clusL = _create_clusters(seqL, bam_file, args)
    y, l = _total_counts(clusL.seq.keys(), clusL.seq, aligned=True)
    logger.info("counts after: %s" % sum(y.values()))
    logger.info("# sequences after: %s" % l)
    dt = pd.DataFrame({'sample': y.keys(), 'counts': y.values()})
    dt['step'] = 'clusters'
    dt.to_csv(read_stats_file, sep="\t", index=False, header=False, mode='a')

    logger.info("Solving multi-mapping events in the network of clusters")
    clusLred = _cleaning(clusL, args.dir_out)
    y, l = _total_counts(clusLred.clus, seqL)
    logger.info("counts after: %s" % sum(y.values()))
    logger.info("# sequences after: %s" % l)
    dt = pd.DataFrame({'sample': y.keys(), 'counts': y.values()})
    dt['step'] = 'meta-cluster'
    dt.to_csv(read_stats_file, sep="\t", index=False, header=False, mode='a')
    logger.info("Clusters up to %s" % (len(clusLred.clus.keys())))

    if args.show:
        logger.info("Creating sequences alignment to precursor")
        clusLred = show_seq(clusLred, args.index)

    clusLred = peak_calling(clusLred)

    clusLred = _annotate(args, clusLred)
    logger.info("Creating json and count matrix")

    json_file = _create_json(clusLred, args)
    logger.info("Output file in: %s" % args.dir_out)

    if args.db:
        name = args.db + ".db"
        logger.info("Create database: database/" + name)
        data = load_data(json_file)
        out_dir = op.join(args.dir_out, "database")
        make_database(data, name, out_dir)
    logger.info("Finished")
Example #19
def miraligner(args):
    """
    Realign BAM hits to miRBase for better accuracy and annotation.
    """
    hairpin, mirna = _download_mirbase(args)
    precursors = _read_precursor(args.hairpin, args.sps)
    matures = _read_mature(args.mirna, args.sps)
    gtf = _read_gtf(args.gtf)
    out_dts = []
    out_files = []
    for bam_fn in args.files:
        sample = op.splitext(op.basename(bam_fn))[0]
        logger.info("Reading %s" % bam_fn)
        if bam_fn.endswith("bam") or bam_fn.endswith("sam"):
            bam_fn = _sam_to_bam(bam_fn)
            bam_sort_by_n = op.splitext(bam_fn)[0] + "_sort"
            pysam.sort("-n", bam_fn, bam_sort_by_n)
            reads = _read_bam(bam_sort_by_n + ".bam", precursors)
        elif bam_fn.endswith("fasta") or bam_fn.endswith("fa") or \
                bam_fn.endswith("fastq"):
            if args.collapse:
                bam_fn = _collapse_fastq(bam_fn)
            out_file = op.join(args.out, sample + ".premirna")
            bam_fn = _filter_seqs(bam_fn)
            if args.miraligner:
                _cmd_miraligner(bam_fn, out_file, args.sps, args.hairpin,
                                args.out)
                reads = _read_miraligner(out_file)
                out_files.append(out_file)
        else:
            raise ValueError("Format not recognized.")

        if args.miraligner:
            _mirtop(out_files, args.hairpin, args.gtf, args.sps, args.out)

        if not args.miraligner:
            reads = _annotate(reads, matures, precursors)

        out_file = op.join(args.out, sample + ".mirna")
        out_file, dt, dt_pre = _tab_output(reads, out_file, sample)
        try:
            vcf_file = op.join(args.out, sample + ".vcf")
            if not file_exists(vcf_file):
                # if True:
                create_vcf(dt_pre, matures, gtf, vcf_file)
            try:
                import vcf
                vcf.Reader(filename=vcf_file)
            except Exception as e:
                logger.warning(e.__doc__)
                logger.warning(str(e))
        except Exception as e:
            # traceback.print_exc()
            logger.warning(e.__doc__)
            logger.warning(str(e))
        if isinstance(dt, pd.DataFrame):
            out_dts.append(dt)

    if out_dts:
        _create_counts(out_dts, args.out)
    else:
        print("No files analyzed!")