コード例 #1
0
 def test_cigar(self):
     """Check ``cigar_correction`` and ``make_cigar`` on fixed inputs.

     ``cigar_correction`` is called with a list of ``[operation, length]``
     pairs and two sequences; the second returned sequence is expected to
     carry a ``-`` gap at the position of the length-1 operation.

     Raises:
         ValueError: if a corrected sequence or the generated cigar string
             differs from the expected value.
     """
     from mirtop.mirna.realign import cigar_correction, make_cigar
     cigar = [[0, 14], [1, 1], [0, 5]]
     fixed = cigar_correction(cigar, "AAAAGCTGGGTTGAGGAGGA",
                              "AAAAGCTGGGTTGAGAGGA")
     if not fixed[0] == "AAAAGCTGGGTTGAGGAGGA":
         raise ValueError("Sequence 1 is not right.")
     if not fixed[1] == "AAAAGCTGGGTTGA-GAGGA":
         raise ValueError("Sequence 2 is not right.")
     expected = "MGMD3MI3M"
     observed = make_cigar("AAA-AAATAAA", "AGACAAA-AAA")
     if not observed == expected:
         # The message is built from the asserted value so that it cannot
         # drift away from what is actually being compared.
         raise ValueError("Cigar not eq to %s: %s" % (expected, observed))
コード例 #2
0
def tune(seq, precursor, start, cigar):
    """
    Realign a read on its precursor and sort the mismatches into
    substitutions and an addition at the end of the read.

    Args:
        *seq (str)*: read sequence.

        *precursor (str)*: precursor sequence.

        *start (int)*: offset of the read on the precursor.

        *cigar*: alignment handed to ``cigar_correction``; when falsy the
            read is aligned with ``align`` and the cigar is built with
            ``make_cigar``.

    Returns:
        *tuple* with:

            subs (list): ``[position, read nt, reference nt]`` entries.

            add: read tail starting at the first mismatch found in the
                last three positions, gaps removed; an empty list when
                no addition is reported.

            cigar: result of ``make_cigar`` on the final alignment.
    """
    if cigar:
        seq, mature = cigar_correction(cigar, seq, precursor[start:])
    else:
        reference = precursor[start:start + len(seq)]
        seq, mature, _score, _pos, _size = align(seq, reference)
        cigar = make_cigar(seq, mature)

    # Trim one gap character from either end of the aligned read.
    if seq[:1] == "-":
        seq = seq[1:]
    if seq[-1:] == "-":
        seq = seq[:-1]
    logger.debug("TUNE:: %s %s %s" % (cigar, seq, mature))

    n_nt = len(seq)
    tail_start = n_nt - 3
    # Mismatch layouts over the last three positions that are reported
    # as an addition; [1, 0, 0] is the only non-empty layout left out.
    addition_patterns = [[1, 1, 0], [1, 0, 1], [0, 1, 0], [0, 1, 1],
                         [0, 0, 1], [1, 1, 1]]

    mismatches = {i for i in range(n_nt) if seq[i] != mature[i]}

    # Mismatches before the last three positions are always substitutions.
    subs = [[i, seq[i], mature[i]] for i in mismatches if i < tail_start]

    tail_errors = [i for i in range(tail_start, n_nt) if i in mismatches]
    tail_pattern = [1 if i in mismatches else 0
                    for i in range(tail_start, n_nt)]

    add = []
    if tail_pattern in addition_patterns:
        add = seq[tail_errors[0]:].replace("-", "")
    # Tail mismatches that did not produce an addition fall back to
    # substitutions.
    if tail_errors and not add:
        subs.extend([i, seq[i], mature[i]] for i in tail_errors)

    return subs, add, make_cigar(seq, mature)
コード例 #3
0
ファイル: test_functions.py プロジェクト: miRTop/mirtop
 def test_cigar(self):
     """Check the cigar helpers of ``mirtop.mirna.realign`` on fixed inputs.

     Covers ``cigar_correction``, ``make_cigar``, ``expand_cigar`` and
     ``cigar2snp``. Each helper is evaluated once and the observed value
     is reused in the failure message.

     Raises:
         ValueError: if any helper returns something other than the
             expected value.
     """
     from mirtop.mirna.realign import cigar_correction, make_cigar, \
         cigar2snp, expand_cigar
     cigar = [[0, 14], [1, 1], [0, 5]]
     fixed = cigar_correction(cigar, "AAAAGCTGGGTTGAGGAGGA",
                              "AAAAGCTGGGTTGAGAGGA")
     if not fixed[0] == "AAAAGCTGGGTTGAGGAGGA":
         raise ValueError("Sequence 1 is not right.")
     if not fixed[1] == "AAAAGCTGGGTTGA-GAGGA":
         raise ValueError("Sequence 2 is not right.")
     observed = make_cigar("AAA-AAATAAA", "AGACAAA-AAA")
     if not observed == "MAMD3MI3M":
         raise ValueError("Cigar not eq to MAMD3MI3M: %s" % (observed,))
     # test expand cigar
     expanded = expand_cigar("3MA3M")
     if not expanded == "MMMAMMM":
         raise ValueError("Cigar 3MA3M not equal to MMMAMMM but to %s" %
                          (expanded,))
     # test cigar to snp
     snps = cigar2snp("3MA3M", "AAATCCC")
     if not snps[0] == [3, "A", "T"]:
         # Wrapped in a 1-tuple so the message also formats correctly if
         # a tuple is ever returned.
         raise ValueError("First SNP of 3MA3M on AAATCCC not equal to "
                          "[3, 'A', 'T'] but got %s" % (snps,))
コード例 #4
0
ファイル: test_functions.py プロジェクト: chapmanb/mirtop
 def test_cigar(self):
     """Exercise the cigar helpers from ``mirtop.mirna.realign``.

     A ``ValueError`` is raised as soon as one helper returns a value
     other than the expected one.
     """
     from mirtop.mirna.realign import cigar_correction, make_cigar, \
         cigar2snp, expand_cigar

     # cigar correction: the second sequence gains a gap
     read, reference = "AAAAGCTGGGTTGAGGAGGA", "AAAAGCTGGGTTGAGAGGA"
     operations = [[0, 14], [1, 1], [0, 5]]
     fixed = cigar_correction(operations, read, reference)
     if fixed[0] != "AAAAGCTGGGTTGAGGAGGA":
         raise ValueError("Sequence 1 is not right.")
     if fixed[1] != "AAAAGCTGGGTTGA-GAGGA":
         raise ValueError("Sequence 2 is not right.")

     # cigar built from two aligned sequences
     built = make_cigar("AAA-AAATAAA", "AGACAAA-AAA")
     if built != "MAMD3MI3M":
         raise ValueError("Cigar not eq to MAMD3MI3M: %s" % built)

     # compact cigar expanded to one symbol per position
     expanded = expand_cigar("3MA3M")
     if expanded != "MMMAMMM":
         raise ValueError("Cigar 3MA3M not eqaul to MMMAMMM but to %s" %
                          expanded)

     # cigar converted to a list of nt changes
     changes = cigar2snp("3MA3M", "AAATCCC")
     if changes[0] != [3, "A", "T"]:
         raise ValueError("3MA3M not equal AAATCCC but %s" % changes)
コード例 #5
0
def create_iso(name, mir, seq, numsim, exp):
    """
    Simulate isomiR reads for every miRNA of one precursor.

    For each miRNA listed under ``mir[name]``, ``numsim`` variants are
    drawn with ``variation``; variants whose sequence was already
    produced are skipped. Reads are written to the module-level
    ``full_fq`` and ``clean_fq`` targets.

    Args:
        name: precursor key used to index ``mir``.
        mir: nested mapping ``precursor -> miRNA -> info``.
        seq: precursor sequence.
        numsim: number of draws per miRNA (converted with ``int``).
        exp: when truthy, counts are drawn from a negative binomial
            distribution; otherwise every read gets a count of 1.

    Returns:
        The value of ``body.create`` for the simulated reads.
    """
    data = {}
    reads = {}
    full_read = []
    clean_read = []
    seen = set()
    for mirna, info in mir[name].items():
        # Not used further down; kept so the same lookups are performed.
        mirSeq = seq[info[0]:info[1] + 1]
        for _ in range(int(numsim)):
            # expression
            if exp:
                trial = random.randint(1, 100)
                p = random.randint(1, 50) / 50.0
                e = numpy.random.negative_binomial(trial, p, 1)[0]
            else:
                e = 1

            iso = realign.isomir()
            (randSeq, iso.start, iso.t5,
             iso.t3, iso.subs, iso.add) = variation(info, seq)
            if randSeq in seen:
                continue
            seen.add(randSeq)

            iso.end = iso.start + len(randSeq)
            aln = realign.align(randSeq, seq[iso.start:iso.end])
            iso.cigar = realign.make_cigar(aln[0], aln[1])
            iso.mirna = mirna

            query_name = "%s.%s.%s" % (mirna, iso.format_id("."), randSeq)
            hit = realign.hits()
            reads[query_name] = hit
            hit.set_sequence(randSeq)
            hit.counts = e
            hit.set_precursor(name, iso)

            full_read.extend(create_read(randSeq, e))
            clean_read.append([randSeq, e])

    write_fastq(full_read, full_fq)
    write_collapse_fastq(clean_read, clean_fq)
    return body.create(reads, "miRBase21", "sim1")
コード例 #6
0
ファイル: miRNA.simulator.py プロジェクト: miRTop/mirtop
def create_iso(name, mir, seq, numsim, exp):
    """
    Simulate isomiR reads for every miRNA annotated on one precursor.

    ``numsim`` variants are drawn per miRNA with ``variation``; a variant
    whose sequence has already been generated is discarded. All reads are
    written with ``write_fastq`` / ``write_collapse_fastq`` to the
    module-level ``full_fq`` and ``clean_fq`` targets.

    Args:
        name: precursor key used to index ``mir``.
        mir: nested mapping ``precursor -> miRNA -> info``; ``info`` is
            passed unchanged to ``variation``.
        seq: precursor sequence.
        numsim: number of draws per miRNA (converted with ``int``).
        exp: when truthy, the count of each read is drawn from a negative
            binomial distribution with random parameters; otherwise the
            count is 1.

    Returns:
        The value of ``body.create(reads, "miRBase21", "sim1")``.
    """
    reads = {}
    full_read = []
    clean_read = []
    seen = set()
    for mirna, info in mir[name].items():
        for _ in range(int(numsim)):
            # expression
            e = 1
            if exp:
                trial = random.randint(1, 100)
                p = random.randint(1, 50) / 50.0
                e = numpy.random.negative_binomial(trial, p, 1)[0]
            iso = realign.isomir()
            randSeq, iso.start, iso.t5, iso.t3, iso.subs, iso.add = variation(
                info, seq)
            # Every simulated sequence is reported once.
            if randSeq in seen:
                continue
            seen.add(randSeq)
            iso.end = iso.start + len(randSeq)
            aln = realign.align(randSeq, seq[iso.start:iso.end])
            iso.cigar = realign.make_cigar(aln[0], aln[1])
            iso.mirna = mirna
            query_name = "%s.%s.%s" % (mirna, iso.format_id("."), randSeq)
            reads[query_name] = realign.hits()
            reads[query_name].set_sequence(randSeq)
            reads[query_name].counts = e
            reads[query_name].set_precursor(name, iso)
            full_read.extend(create_read(randSeq, e))
            clean_read.append([randSeq, e])
    write_fastq(full_read, full_fq)
    write_collapse_fastq(clean_read, clean_fq)
    return body.create(reads, "miRBase21", "sim1")
コード例 #7
0
ファイル: srnabench.py プロジェクト: srinivas32/mirtop
def read_file(folder, args):
    """
    Read sRNAbench output and convert it to mirtop GFF format.

    Args:
        *folder(str)*: directory with the sRNAbench output. The files
            ``reads.annotation`` and ``microRNAannotation.txt`` are read
            from it and its base name is used as sample name.

        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*. The attributes
            read here are ``out_format``, ``database``, ``precursors``,
            ``matures`` and ``add_extra``.

    Returns:
        *reads (nested dicts)*:gff_list has the format as
            defined in *mirtop.gff.body.read()*.

    """
    reads_anno = os.path.join(folder, "reads.annotation")
    reads_iso = os.path.join(folder, "microRNAannotation.txt")
    sep = " " if args.out_format == "gtf" else "="
    sample = os.path.basename(folder)
    database = args.database
    precursors = args.precursors
    matures = args.matures

    n_out = 0
    n_in = 0
    n_ns = 0
    n_notassign = 0
    n_notindb = 0
    reads = defaultdict(dict)
    seen = set()

    source_iso = _read_iso(reads_iso)
    logger.info("Reads with isomiR information %s" % len(source_iso))
    with open(reads_anno) as handle:
        for sequence in handle:
            cols = sequence.strip().split("\t")
            query_name = cols[0]
            query_sequence = cols[0]
            if query_name not in reads and not query_sequence:
                continue
            if "N" in query_sequence:
                n_ns += 1
                continue
            if "mature" not in cols[3]:
                # NOTE(review): n_in is also used below to count accepted
                # hits, so skipped lines inflate the logged total --
                # confirm whether a separate counter was intended.
                n_in += 1
                continue

            counts = int(cols[1])
            hit_list = cols[4].split("$")
            # Number of distinct miRNA names this read maps to.
            hits = len({mirna.split("#")[1] for mirna in hit_list})

            for nhit in hit_list:
                logger.debug("SRNABENCH::line hit: %s", nhit)
                hit_info = nhit.split("#")
                pos_info = hit_info[3].split(",")
                start = int(pos_info[1]) - 1
                end = start + len(query_sequence)  # int(pos_info[2]) - 1
                chrom = pos_info[0]
                mirName = hit_info[1]
                if chrom not in precursors or chrom not in matures:
                    # Skip the hit: the lookups below would raise KeyError
                    # for a precursor that is not in the database.
                    n_notindb += 1
                    continue
                if mirName not in matures[chrom]:
                    # NOTE(review): counted but still processed, as in the
                    # original code -- confirm whether it should be skipped.
                    n_notindb += 1
                if (query_sequence, mirName) in seen:
                    continue

                seen.add((query_sequence, mirName))

                if (query_sequence, mirName) not in source_iso:
                    continue

                isoformat = source_iso[(query_sequence, mirName)]

                if isoformat == "mv":
                    n_notassign += 1
                    continue

                source = "isomiR" if isoformat != "NA" else "ref_miRNA"

                logger.debug("SRNABENCH::query: {query_sequence}\n"
                             "  precursor {chrom}\n"
                             "  name:  {query_name}\n"
                             "  start: {start}\n"
                             "  external: {isoformat}\n"
                             "  hit: {hits}".format(
                                 query_sequence=query_sequence,
                                 chrom=chrom,
                                 query_name=query_name,
                                 start=start,
                                 isoformat=isoformat,
                                 hits=hits))
                logger.debug("SRNABENCH:: start %s end %s", start, end)
                # The read would run past the end of the precursor.
                if len(precursors[chrom]) < end:
                    n_out += 1
                    continue

                Filter = "Pass"
                cigar = make_cigar(query_sequence,
                                   precursors[chrom][start:end])
                preName = chrom
                score = "."
                strand = "+"
                idu = make_id(query_sequence)
                fields = {
                    'seq_name': query_sequence,
                    'idseq': idu,
                    'name': mirName,
                    'parent': preName,
                    'variant': isoformat,
                    'cigar': cigar,
                    'counts': counts,
                    'filter': Filter,
                    'hits': hits,
                    'chrom': chrom,
                    'start': start,
                    'end': end,
                    'database': database,
                    'source': source,
                    'score': score,
                    'strand': strand
                }
                # TODO: convert to genomic if args.out_genomic
                line = feature(fields).line
                if args.add_extra:
                    extra = variant_with_nt(line, args.precursors,
                                            args.matures)
                    line = "%s Changes %s;" % (line, extra)

                line = paste_columns(feature(line), sep=sep)
                if start not in reads[chrom]:
                    reads[chrom][start] = []
                if Filter == "Pass":
                    n_in += 1
                    reads[chrom][start].append(
                        [idu, chrom, counts, sample, line])

    logger.info("Loaded %s reads with %s hits" % (len(reads), n_in))
    logger.info("Reads without precursor information: %s" % n_notindb)
    logger.info("Reads with MV as variant definition,"
                " not supported by GFF: %s" % n_notassign)
    logger.info("Hit Filtered by having > 3 changes: %s" % n_out)

    return reads
コード例 #8
0
ファイル: filter.py プロジェクト: srinivas32/mirtop
def tune(seq, precursor, start, cigar):
    """
    The actual fn that will realign the sequence to find the nt changes
    at 5', 3' sequence and nt variations.

    Args:
        *seq (str)*: sequence of the read.

        *precursor (str)*: sequence of the precursor.

        *start (int)*: start position of sequence on the precursor, +1.
            A negative value is clamped to 0 and the aligned window is
            shortened accordingly.

        *cigar*: similar to SAM CIGAR attribute. When falsy, the read is
            aligned with ``align`` and the cigar is built with
            ``make_cigar``; otherwise it is passed to
            ``cigar_correction``.

    Returns:

        *tuple* with:

            subs (list): substitutions, as ``[position, read nt,
                reference nt]`` entries.

            add (str): nts collected from the end of the read, joined in
                the order they were collected (last position first).

            cigar: result of ``make_cigar`` on the final alignment.
    """
    end = len(seq)
    # A negative start shortens the window by the same amount and is then
    # reset to the beginning of the precursor.
    if start < 0:
        end = end + start
        start = 0
    if cigar:
        seq, mature = cigar_correction(cigar, seq, precursor[start:])
    else:
        seq, mature, score, p, size = align(seq, precursor[start:start + end])
        cigar = make_cigar(seq, mature)
    # Drop one gap character from either end of the aligned read.
    # NOTE(review): mature is not trimmed together with seq, so a leading
    # gap shifts the positions compared below -- confirm this is intended.
    if seq.startswith("-"):
        seq = seq[1:]
    if seq.endswith("-"):
        seq = seq[:-1]
    logger.debug("TUNE:: %s %s %s" % (cigar, seq, mature))

    # Positions where the read differs from the reference.
    error = set()
    for pos in range(0, len(seq)):
        if seq[pos] != mature[pos]:
            error.add(pos)

    subs, add = [], []

    # prob becomes 1 once a mismatching position has been seen.
    prob = 0
    add_position = []
    # Scan up to the last five positions, from the last one backwards.
    # NOTE(review): for reads shorter than five nt the indices turn
    # negative and wrap around in seq[e] -- confirm callers never pass
    # such reads.
    for e in range(len(seq) - 1, len(seq) - 6, -1):
        if e in error:
            prob = 1
        # After the first mismatch every scanned position is collected.
        if prob == 1:
            add.append(seq[e])
            add_position.append(e)
        # Before any mismatch, matching A/T positions are collected
        # tentatively and the scan moves on.
        if e not in error and prob == 0 and seq[e] in ["A", "T"]:
            add.append(seq[e])
            add_position.append(e)
            continue
        # A matching position ends the scan: the last collected entry is
        # removed, and everything collected is discarded when no mismatch
        # was seen at all.
        if e not in error:
            if add:
                add.pop()
                add_position.pop()
            if prob == 0:
                add = []
                add_position = []
            break

    # Mismatches that are not part of the collected tail are substitutions.
    for e in error:
        if e not in add_position:
            subs.append([e, seq[e], mature[e]])

    logger.debug("TUNE:: %s %s" % (subs, add))

    return subs, "".join(add), make_cigar(seq, mature)
コード例 #9
0
ファイル: srnabench.py プロジェクト: miRTop/mirtop
def read_file(folder, args):
    """
    Read sRNAbench output and convert it to mirtop GFF format.

    Args:
        *folder(str)*: directory with the sRNAbench output. The files
            ``reads.annotation`` and ``microRNAannotation.txt`` are read
            from it and its base name is used as sample name.

        *args(namedtuple)*: arguments from command line.
            See *mirtop.libs.parse.add_subparser_gff()*. The attributes
            read here are ``out_format``, ``database``, ``precursors``,
            ``matures`` and ``add_extra``.

    Returns:
        *reads (nested dicts)*:gff_list has the format as
            defined in *mirtop.gff.body.read()*.

    """
    reads_anno = os.path.join(folder, "reads.annotation")
    reads_iso = os.path.join(folder, "microRNAannotation.txt")
    sep = " " if args.out_format == "gtf" else "="
    sample = os.path.basename(folder)
    database = args.database
    precursors = args.precursors
    matures = args.matures

    n_out = 0
    n_in = 0
    n_ns = 0
    n_notassign = 0
    n_notindb = 0
    reads = defaultdict(dict)
    seen = set()

    source_iso = _read_iso(reads_iso)
    logger.info("Reads with isomiR information %s" % len(source_iso))
    with open(reads_anno) as handle:
        for sequence in handle:
            cols = sequence.strip().split("\t")
            query_name = cols[0]
            query_sequence = cols[0]
            if query_name not in reads and not query_sequence:
                continue
            if "N" in query_sequence:
                n_ns += 1
                continue
            if "mature" not in cols[3]:
                # NOTE(review): n_in is also used below to count accepted
                # hits, so skipped lines inflate the logged total --
                # confirm whether a separate counter was intended.
                n_in += 1
                continue

            counts = int(cols[1])
            hit_list = cols[4].split("$")
            # Number of distinct miRNA names this read maps to.
            hit = len({mirna.split("#")[1] for mirna in hit_list})

            for nhit in hit_list:
                logger.debug("SRNABENCH::line hit: %s", nhit)
                hit_info = nhit.split("#")
                pos_info = hit_info[3].split(",")
                start = int(pos_info[1]) - 1
                end = start + len(query_sequence)  # int(pos_info[2]) - 1
                chrom = pos_info[0]
                mirName = hit_info[1]
                if chrom not in precursors or chrom not in matures:
                    # Skip the hit: the lookups below would raise KeyError
                    # for a precursor that is not in the database.
                    n_notindb += 1
                    continue
                if mirName not in matures[chrom]:
                    # NOTE(review): counted but still processed, as in the
                    # original code -- confirm whether it should be skipped.
                    n_notindb += 1
                if (query_sequence, mirName) in seen:
                    continue

                seen.add((query_sequence, mirName))

                if (query_sequence, mirName) not in source_iso:
                    continue

                isoformat = source_iso[(query_sequence, mirName)]

                if isoformat == "mv":
                    n_notassign += 1
                    continue

                source = "isomiR" if isoformat != "NA" else "ref_miRNA"

                logger.debug("SRNABENCH::query: {query_sequence}\n"
                             "  precursor {chrom}\n"
                             "  name:  {query_name}\n"
                             "  start: {start}\n"
                             "  external: {isoformat}\n"
                             "  hit: {hit}".format(
                                 query_sequence=query_sequence,
                                 chrom=chrom,
                                 query_name=query_name,
                                 start=start,
                                 isoformat=isoformat,
                                 hit=hit))
                logger.debug("SRNABENCH:: start %s end %s", start, end)
                # The read would run past the end of the precursor.
                if len(precursors[chrom]) < end:
                    n_out += 1
                    continue

                Filter = "Pass"
                cigar = make_cigar(query_sequence,
                                   precursors[chrom][start:end])
                preName = chrom
                score = "."
                strand = "+"
                idu = make_id(query_sequence)
                # Attribute column of the GFF line.
                attrb = ("Read {query_sequence}; UID {idu}; Name {mirName};"
                         " Parent {preName}; Variant {isoformat};"
                         " Cigar {cigar}; Expression {counts};"
                         " Filter {Filter}; Hits {hit};").format(
                             query_sequence=query_sequence,
                             idu=idu,
                             mirName=mirName,
                             preName=preName,
                             isoformat=isoformat,
                             cigar=cigar,
                             counts=counts,
                             Filter=Filter,
                             hit=hit)
                line = ("{chrom}\t{database}\t{source}\t{start}\t{end}\t"
                        "{score}\t{strand}\t.\t{attrb}").format(
                            chrom=chrom,
                            database=database,
                            source=source,
                            start=start,
                            end=end,
                            score=score,
                            strand=strand,
                            attrb=attrb)
                if args.add_extra:
                    extra = variant_with_nt(line, args.precursors,
                                            args.matures)
                    line = "%s Changes %s;" % (line, extra)

                line = paste_columns(read_gff_line(line), sep=sep)
                if start not in reads[chrom]:
                    reads[chrom][start] = []
                if Filter == "Pass":
                    n_in += 1
                    reads[chrom][start].append([idu, chrom, counts,
                                                sample, line])

    logger.info("Loaded %s reads with %s hits" % (len(reads), n_in))
    logger.info("Reads without precursor information: %s" % n_notindb)
    logger.info("Reads with MV as variant definition,"
                " not supported by GFF: %s" % n_notassign)
    logger.info("Hit Filtered by having > 3 changes: %s" % n_out)

    return reads