Example no. 1
def split(args):
    parser = optparse.OptionParser("""\
   split a fasta file into separated files.
        pyfasta split -n 6 [-k 5000 ] some.fasta
    the output will be some.0.fasta, some.1.fasta ... some.6.fasta
    the sizes will be as even as reasonable.
   """)
    parser.add_option("--header", dest="header", metavar="FILENAME_FMT",
       help="""this overrides all other options. if specified, it will
               split the file into a separate file for each header. it
               will be a template specifying the file name for each new file.
               e.g.:    "%(fasta)s.%(seqid)s.fasta"
               where 'fasta' is the basename of the input fasta file and seqid
               is the header of each entry in the fasta file.""" ,default=None)

    parser.add_option("-n", "--n", type="int", dest="nsplits", 
                            help="number of new files to create")
    parser.add_option("-o", "--overlap", type="int", dest="overlap", 
                            help="overlap in basepairs", default=0)
    parser.add_option("-k", "--kmers", type="int", dest="kmers", default=-1,
                     help="""\
    split big files into pieces of this size in basepairs. the default
    of -1 means do not split the sequence up into k-mers, just split
    based on the headers. a reasonable value would be 10Kbp""")
    options, fasta = parser.parse_args(args)
    if not (fasta and (options.nsplits or options.header)):
        sys.exit(parser.print_help())

    if isinstance(fasta, (tuple, list)):
        assert len(fasta) == 1, fasta
        fasta = fasta[0]

    kmer = options.kmers if options.kmers != -1 else None
    overlap = options.overlap if options.overlap != 0 else None
    f = Fasta(fasta)
    if options.header:
        names = dict([(seqid, options.header % \
                      dict(fasta=f.fasta_name, seqid=seqid)) \
                                       for seqid in f.iterkeys()])
        """
        if len(names) > 0:
            assert names[0][1] != names[1][1], ("problem with header format", options.header)
        fhs = dict([(seqid, open(fn, 'wb')) for seqid, fn in names[:200]])
        fhs.extend([(seqid, StringIO(), fn) for seqid, fn in names[200:]])
        """
        return with_header_names(f, names)
    else:
        names = newnames(fasta, options.nsplits, kmers=kmer, overlap=overlap, 
                     header=options.header)

        #fhs = [open(n, 'wb') for n in names]
    if options.kmers == -1:
        return without_kmers(f, names)
    else: 
        return with_kmers(f, names, options.kmers, options.overlap)
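
A minimal usage sketch for the split() entry point above, assuming the function lives in pyfasta's split_fasta module and that some.fasta exists; both the import path and the file name are placeholders.

# hypothetical driver for split(); import path and input file are assumptions
from pyfasta.split_fasta import split

# equivalent to the documented command line: pyfasta split -n 6 -k 5000 some.fasta
split(["-n", "6", "-k", "5000", "some.fasta"])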
Example no. 2
def write_c2t(fasta_name, unconverted, colorspace=False):
    """
    given a fasta file, write a new file:
        `some.fr.c2t.fasta` which contains:
          + the same headers prefixed with 'f' with all C's converted to T
          + headers prefixed with 'r' reverse complemented with
                                 all C's converted to T.

    if unconverted is false, then also save a file with the forward and reverse
    without conversion.
    """
    d = op.join(op.dirname(fasta_name), "bowtie_index")
    if colorspace: d += "_colorspace"
    if not op.exists(d): os.mkdir(d)

    p, ext = op.splitext(op.basename(fasta_name)) # some.fasta -> some, fasta
    fname = "%s/%s.fr.c2t%s" % (d, p, ext)
        # no conversion, just copy the file into the index dir.
    unconverted_fname = "%s/%s.fr%s" % (d, p, ext)
    if op.exists(fname):
        if not unconverted: return fname, unconverted_fname
        elif op.exists(unconverted_fname): return fname, unconverted_fname

    fasta = Fasta(fasta_name)

    c2t_fh = open(fname, 'w')
    unc_fh = open(unconverted_fname, 'w') if unconverted else None

    print >>sys.stderr, "writing forward and reverse c2t to: %s" % (fname,)

    try:
        for header in fasta.iterkeys():
            seq = str(fasta[header]).upper()
            assert not ">" in seq
            # c2t, prefix header with f and write
            print >>c2t_fh, ">f%s" % header
            print >>c2t_fh, seq.replace('C', 'T')
            # then r-c, c2t, prefix header with r and write
            print >>c2t_fh, ">r%s" % header
            rseq = revcomp(seq)
            print >>c2t_fh, rseq.replace('C', 'T')
            if unc_fh is not None:
                print >>unc_fh, ">f%s\n%s" % (header, seq)
                print >>unc_fh, ">r%s\n%s" % (header, rseq)

        c2t_fh.close()
    except:
        os.unlink(fname)
        os.unlink(unconverted_fname)
        raise

    return fname, unconverted_fname
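
To make the forward and reverse C-to-T conversion performed by write_c2t() concrete, here is a small self-contained sketch; the local revcomp() below is a stand-in for the helper the real module imports.

# illustrative only: the per-record transformation done by write_c2t()
def revcomp(seq):
    # reverse complement of an uppercase DNA string (stand-in for the real helper)
    comp = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    return "".join(comp[b] for b in reversed(seq))

seq = "ACCGT"
print ">fchr1"
print seq.replace('C', 'T')           # forward strand, C->T: ATTGT
print ">rchr1"
print revcomp(seq).replace('C', 'T')  # reverse complement, then C->T: ATGGT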
Example no. 3
def write_c2t(fasta_name, unconverted, colorspace=False):
    """
    given a fasta file, write a new file:
        `some.fr.c2t.fasta` which contains:
          + the same headers prefixed with 'f' with all C's converted to T
          + headers prefixed with 'r' reverse complemented with
                                 all C's converted to T.

    if unconverted is false, then also save a file with the forward and reverse
    without conversion.
    """
    d = op.join(op.dirname(fasta_name), "bowtie_index")
    if colorspace: d += "_colorspace"
    if not op.exists(d): os.mkdir(d)

    p, ext = op.splitext(op.basename(fasta_name))  # some.fasta -> some, fasta
    fname = "%s/%s.fr.c2t%s" % (d, p, ext)
    # no conversion, just copy the file into the index dir.
    unconverted_fname = "%s/%s.fr%s" % (d, p, ext)
    if op.exists(fname):
        if not unconverted: return fname, unconverted_fname
        elif op.exists(unconverted_fname): return fname, unconverted_fname

    fasta = Fasta(fasta_name)

    c2t_fh = open(fname, 'w')
    unc_fh = open(unconverted_fname, 'w') if unconverted else None

    print >> sys.stderr, "writing forward and reverse c2t to: %s" % (fname, )

    try:
        for header in fasta.iterkeys():
            seq = str(fasta[header]).upper()
            assert not ">" in seq
            # c2t, prefix header with f and write
            print >> c2t_fh, ">f%s" % header
            print >> c2t_fh, seq.replace('C', 'T')
            # then r-c, c2t, prefix header with r and write
            print >> c2t_fh, ">r%s" % header
            rseq = revcomp(seq)
            print >> c2t_fh, rseq.replace('C', 'T')
            if unc_fh is not None:
                print >> unc_fh, ">f%s\n%s" % (header, seq)
                print >> unc_fh, ">r%s\n%s" % (header, rseq)

        c2t_fh.close()
    except:
        os.unlink(fname)
        os.unlink(unconverted_fname)
        raise

    return fname, unconverted_fname
Example no. 4
def mask(fasta_file, org, cutoff, mask_value='X'):
    h5, node = get_node(org, 'r')

    outfile = fasta_file[:fasta_file.rfind(".")] + (".masked.%i" % cutoff) \
                         + fasta_file[fasta_file.rfind("."):]

    print "> masking sequence to file:", outfile
    out = open(outfile ,'w')

    fasta = Fasta(fasta_file)

    soft_mask = mask_value.lower() == 'soft'
    for seqid in sorted(fasta.iterkeys()): 
        masked = 0
        if soft_mask:
            seq = str(fasta[seqid])
            # mask is the lowercase sequence.
            mask_value = np.array(seq.lower(), dtype='c')
            seq = np.array(seq.upper(), dtype='c')
        else:
            fasta[seqid].tostring = False
            seq = fasta[seqid][:] # a


        if not 'c' + seqid in node:
            print >>sys.stderr, seqid,\
                '! not found in masked, writing unchanged\n' \
                '  this means that no section of this sequence appeared\n' \
                '  more than %i times' % cutoff
            out.write('>' + seqid + '\n')
            out.write(seq.tostring() + '\n')
            continue
        
        hit_counts = getattr(node, 'c' + seqid)[:]
        masked_seq = np.where(numexpr.evaluate("hit_counts > %i" % cutoff)
                              , mask_value, seq).tostring() 

        l = len(masked_seq)
        print >>sys.stderr, "! seq:%s len:%i %%masked:%.3f" % (seqid, l, 
                                   100.0 * masked_seq.count(mask_value) / l)
        assert len(seq) == l
        out.write('>' + seqid + '\n')
        out.write(masked_seq + '\n')

    out.close()
    # write out a file .fasta.version containing
    # the svnversion (if available of this script
    # that was used to create the file.
    path = os.path.dirname(__file__)
    os.system('svnversion %s > %s.version' % (path, outfile))
    h5.close()
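
The heart of mask() is the np.where() call above; the following standalone sketch repeats that masking step with made-up hit counts, using a plain NumPy comparison in place of numexpr.

import numpy as np

seq = np.array("ACGTACGT", dtype='c')            # sequence as an array of 1-char strings
hit_counts = np.array([0, 1, 9, 9, 0, 0, 9, 0])  # invented per-base hit counts
cutoff = 5
# bases seen more than `cutoff` times are replaced with the mask value 'X'
masked_seq = np.where(hit_counts > cutoff, 'X', seq).tostring()
print masked_seq  # -> ACXXACXT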
Example no. 5
def mask(fasta_file, org, cutoff, mask_value='X'):
    h5, node = get_node(org, 'r')

    outfile = fasta_file[:fasta_file.rfind(".")] + (".masked.%i" % cutoff) \
                         + fasta_file[fasta_file.rfind("."):]

    print "> masking sequence to file:", outfile
    out = open(outfile, 'w')

    fasta = Fasta(fasta_file)

    soft_mask = mask_value.lower() == 'soft'
    for seqid in sorted(fasta.iterkeys()):
        masked = 0
        if soft_mask:
            seq = str(fasta[seqid])
            # mask is the lowercase sequence.
            mask_value = np.array(seq.lower(), dtype='c')
            seq = np.array(seq.upper(), dtype='c')
        else:
            fasta[seqid].tostring = False
            seq = fasta[seqid][:]  # a

        if not 'c' + seqid in node:
            print >>sys.stderr, seqid,\
                '! not found in masked, writing unchanged\n' \
                '  this means that no section of this sequence appeared\n' \
                '  more than %i times' % cutoff
            out.write('>' + seqid + '\n')
            out.write(seq.tostring() + '\n')
            continue

        hit_counts = getattr(node, 'c' + seqid)[:]
        masked_seq = np.where(numexpr.evaluate("hit_counts > %i" % cutoff),
                              mask_value, seq).tostring()

        l = len(masked_seq)
        print >> sys.stderr, "! seq:%s len:%i %%masked:%.3f" % (
            seqid, l, 100.0 * masked_seq.count(mask_value) / l)
        assert len(seq) == l
        out.write('>' + seqid + '\n')
        out.write(masked_seq + '\n')

    out.close()
    # write out a file .fasta.version containing
    # the svnversion (if available of this script
    # that was used to create the file.
    path = os.path.dirname(__file__)
    os.system('svnversion %s > %s.version' % (path, outfile))
    h5.close()
Example no. 6
    chh_idxs, = np.where(~(first_g | second_g))
    chh = c_idxs[chh_idxs]
    methyl[chh] = CHH + adder

    return methyl

def write_file(seq, file_path):
    methyl = calc_methylation(str(seq))
    methyl.tofile(file_path)


if __name__ == "__main__":
    import optparse
    import sys
    import doctest
    import os
    doctest.testmod()

    p = optparse.OptionParser(__doc__)
    opts, args = p.parse_args()
    if not (len(args) and args[0]): sys.exit(p.print_help())

    from pyfasta import Fasta
    path = args[0]
    f = Fasta(path)
    for k in f.iterkeys():
        print "calculating methylation data for seqid: %s" % k
        binpath = "%s.%s.methyl.type.bin" % (path, k)
        write_file(f[k], binpath)
        print "saved to %s" % binpath
Example no. 7
    methyl[chh] = CHH + adder

    return methyl

def write_file(seq, file_path):
    methyl = calc_methylation(str(seq))
    methyl.tofile(file_path)


if __name__ == "__main__":
    import optparse
    import sys
    import doctest
    import os
    doctest.testmod()
    import sys
    sys.exit()

    p = optparse.OptionParser(__doc__)
    opts, args = p.parse_args()
    if not (len(args) and args[0]): sys.exit(p.print_help())

    from pyfasta import Fasta
    path = args[0]
    f = Fasta(path)
    for k in f.iterkeys():
        print "calculating methylation data for seqid: %s" % k
        binpath = "%s.%s.methyl.type.bin" % (path, k)
        write_file(f[k], binpath)
        print "saved to %s" % binpath
Example no. 8
subject_fasta_file = sys.argv[2]

out_fasta = "%s.features%s" % op.splitext(subject_fasta_file)

by_subject = collections.defaultdict(list)
fa = Fasta(subject_fasta_file)

def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = tee(iterable)
    next(b, None)
    return izip(a, b)

by_subject = {}
seqids = []
for seqid in fa.iterkeys():
    by_subject[seqid] = np.zeros((len(fa[seqid]) + 1,), dtype=np.uint8)
    seqids.append((len(fa[seqid]),seqid))


by_query_subject = collections.defaultdict(dict)
for line in open(blast_file):
    args = line.split("\t")
    query, subject = args[0], args[1]
    sstart, sstop = sorted(map(int, args[8:10]))
    by_subject[subject][sstart: sstop + 1] |= 1
    if not subject in by_query_subject[query]:
        by_query_subject[query][subject] = [(sstart, sstop)]
    else:
        by_query_subject[query][subject].append((sstart, sstop))
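
A small sketch of the coverage-mask idea used above: each subject sequence gets a zeroed uint8 array, and every BLAST hit ORs bit 1 over its 1-based, inclusive span, so overlapping hits are only counted once. The length and hit coordinates here are invented.

import numpy as np

mask = np.zeros((100 + 1,), dtype=np.uint8)    # 1-based coordinates, hence length + 1
for sstart, sstop in [(10, 40), (35, 60)]:     # made-up hit spans
    mask[sstart: sstop + 1] |= 1
print "covered bases:", int((mask & 1).sum())  # union of 10..40 and 35..60 -> 51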
Example no. 9
def parse_gsnap_sam(gsnap_f, ref_path, out_dir, paired_end):
    fa = Fasta(ref_path)

    fc, ft, fmethyltype = \
            bin_paths_from_fasta(fa.fasta_name, out_dir)
    counts = get_counts(fc, ft, fa)
    chr_lengths = dict((k, len(fa[k])) for k in fa.iterkeys())


    print >>sys.stderr, "tabulating methylation"
    gsnap_subset = open(gsnap_f.replace(".gsnap.sam", ".sam"), "w")

    for sline in open(gsnap_f):
        if sline.startswith("@SQ"):
            print >>gsnap_subset, sline.strip()
            continue

        # the ends didn't map to same spot.
        line = sline.split("\t")
        sam_flag = int(line[1])
        if paired_end:
            if line[6] != "=": continue
            #print >>gsnap_subset, sline.strip()
        else:
            # no reported alignments.
            if sam_flag == 4: continue

        print >>gsnap_subset, sline.rstrip("\n")

        seqid = line[2]
        aln_seq = line[9]
        read_length = len(aln_seq)
        bp0 = int(line[3]) - 1
        ga = ((sam_flag & 16) != 0) ^ (sam_flag & 128 != 0)
        insert_length = int(line[8])
            #line[9] = aln_seq
            #line[10] = line[10][:len(aln_seq)]

        # both ends start at exactly the same place.
        if insert_length == 0: continue
        # handle overlapping reads. one side has + insert, the other is -
        if -read_length < insert_length < 0:
            insert_length = abs(insert_length)
            aln_seq = aln_seq[:-(read_length - insert_length)]
            read_length = len(aln_seq)
        if line[7] == '0': continue

        bp1 = bp0 + read_length
        ref_seq = (fa[seqid][bp0:bp1]).upper()


        letters = 'GA' if ga else 'CT'
        read_length = len(ref_seq)
        assert read_length > 0, (bp0, bp1)
        _update_conversions(ref_seq, aln_seq, bp0, letters,
                            counts[seqid]['c'], counts[seqid]['t'],
                            50, read_length, line[5])

    write_files(fa.fasta_name, out_dir, counts)

    cmd = open(out_dir +"/cmd.ran", "w")
    import datetime
    print >>cmd, "#date:", str(datetime.date.today())
    print >>cmd, "#path:", op.abspath(".")
    print >>cmd, " ".join(sys.argv)
    write_sam_commands(out_dir, fa, "methylcoder.gsnap")
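
The strand decision in parse_gsnap_sam() hinges on XOR-ing two SAM flag bits (0x10, read reverse-complemented, and 0x80, second mate); a tiny isolated sketch of that logic, mirroring the `ga = ...` line above.

def conversion_letters(sam_flag):
    # reverse-strand first mates and forward-strand second mates are read as G->A;
    # everything else is read as C->T
    ga = ((sam_flag & 16) != 0) ^ ((sam_flag & 128) != 0)
    return 'GA' if ga else 'CT'

assert conversion_letters(0) == 'CT'         # forward strand, first/only mate
assert conversion_letters(16) == 'GA'        # reverse strand
assert conversion_letters(16 | 128) == 'CT'  # reverse strand, second mate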
Example no. 10
def count_conversions(original_fasta, sam_iter, raw_reads_list, c2t_reads_list, index_class, out_dir,
                      allowed_mismatches, mode='w', counts=None, is_colorspace=False):
    print >>sys.stderr, "tabulating methylation for %s" % (original_fasta,)

    fh_raw_reads_list = [open(raw_reads) for raw_reads in raw_reads_list]
    fh_c2t_reads_list = [open(c2t_reads) for c2t_reads in c2t_reads_list]
    fqidx_list = [index_class(c2t_reads) for c2t_reads in c2t_reads_list]
    fa = Fasta(original_fasta)
    #fh_raw_reads = open(raw_reads, 'r')
    #fh_c2t_reads = open(c2t_reads, 'r')

    def get_records(header, idx):
        return get_raw_and_c2t(header, fqidx_list[idx], fh_raw_reads_list[idx], fh_c2t_reads_list[idx])

    chr_lengths = dict((k, len(fa[k])) for k in fa.iterkeys())

    out_sam = open(out_dir + "/methylcoded.sam", mode)


    # get the 3 possible binary files for each chr
    fc, ft, fmethyltype = \
            bin_paths_from_fasta(original_fasta, out_dir)

    # if we're re-running without conversion, it was passed via kwargs.
    # otherwise, we get a new set.
    retry = counts is not None
    unmapped_name = op.join(out_dir, "unmapped.reads")
    if retry:
        unmapped_name += ".try2"
    else:
        counts = get_counts(fc, ft, fa)

    skipped = 0
    align_count = 0
    for p, sam_line, read_len, direction in parse_sam(sam_iter, chr_lengths,
                                                      get_records,
                                                      unmapped_name,
                                                      is_colorspace, out_sam):

        pairs = "CT" if direction == "f" else "GA" #
        read_id = p['read_id']
        pos0 = p['pos0']
        align_count += 1
        raw = p['raw_read']
        sam_flag = int(sam_line[1])


        # paired and first of read and ends have overlap.
        # this section makes sure that the overlap is only counted 1x.
        if sam_line[7] != '0' and 0 <= int(sam_line[8]) < read_len:
            offset = int(sam_line[8])
            pos0 = pos0 + read_len - offset
            read_len = offset
            raw = raw[-offset:]
            # chop the read info to the non-overlapping bases.
            p['read_sequence'] = p['read_sequence'][-offset:]

        # the position is the line num * the read_id + read_id (where the +
        # is to account for the newlines.
        genomic_ref = str(fa[p['seqid']][pos0:pos0 + read_len]).upper()
        DEBUG = False
        if DEBUG:
            print >>sys.stderr, "=" * 80
            print >>sys.stderr, sam_line
            print >>sys.stderr, "=" * 80
            print >>sys.stderr, ">>", sam_line[1], sam_flag
            idx = 0 if (sam_flag & 128) == 0 else 1
            araw, ac2t = get_records(read_id, idx)
            #araw.seq = araw.seq[-offset:]
            #c2t.seq = ac2t.seq[-offset:]
            print >>sys.stderr, "f['%s'][%i:%i + %i]" % (p['seqid'], pos0,
                                                         pos0, read_len)
            print >>sys.stderr, "mismatches:", p['nmiss']
            print >>sys.stderr, "ref        :", genomic_ref
            if direction == 'r':
                print >>sys.stderr, "raw_read(r):", raw
                c2t = ac2t.seq
                if not is_colorspace:
                    if (sam_flag & 128) == 0:
                        c2t = revcomp(c2t)[-offset:]
                    assert c2t == p['read_sequence'], (c2t, p['read_sequence'])

            else:
                print >>sys.stderr, "raw_read(f):", raw
                c2t = ac2t.seq
                if not is_colorspace:
                    if (sam_flag & 128) != 0:
                        c2t = revcomp(c2t)
                        p['read_sequence'] = revcomp(p['read_sequence'])
                    assert c2t == p['read_sequence'], (c2t, p['read_sequence'], sam_flag)
            print >>sys.stderr, "c2t        :",  c2t, "\n"

        # have to keep the ref in forward here to get the correct bp
        # positions. look for CT when forward and GA when back.
        current_mismatches = p['nmiss']
        # we send in the current mismatches and allowed mismatches so that in
        # the case where a C in the ref seq has become a T in the aligned seq
        # (but wasn't counted as a mismatch because we had converted C=>T), if
        # these errors cause the number of mismatches to exceed the number of
        # allowed mismatches, we don't include the stats for this read.
        # if we're retrying, the reads are treated but the genome is not
        # ct-converted, so don't check the mismatches since bowtie already
        # handled that correctly.
        remaining_mismatches = -1 if retry else (allowed_mismatches - current_mismatches)

        this_skipped = _update_conversions(genomic_ref, raw, pos0, pairs,
                                       counts[p['seqid']]['c'],
                                       counts[p['seqid']]['t'],
                                      remaining_mismatches, read_len,
                                           sam_line[5])
        if this_skipped == 0:
            # only print the line to the sam file if we use it in our calcs.
            print >>out_sam, "\t".join(sam_line)
        skipped += this_skipped
        if DEBUG:
            raw_input("press any key...\n")
    print >>sys.stderr, "total alignments: %i" % align_count
    print >>sys.stderr, \
            "skipped %i alignments (%.3f%%) where read T mapped to genomic C" % \
                  (skipped, 100.0 * skipped / (align_count or 1))
    return counts, unmapped_name
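
The paired-end overlap handling in count_conversions() is easiest to follow with concrete numbers; the sketch below repeats the same trimming arithmetic with invented values (a 50 bp read whose reported insert length is 30 bp).

# hypothetical values standing in for the sam_line fields
read_len, offset, pos0 = 50, 30, 1000    # offset plays the role of int(sam_line[8])
raw = "A" * read_len

if 0 <= offset < read_len:
    # keep only the last `offset` bases so the overlapping stretch is counted once
    pos0 = pos0 + read_len - offset      # -> 1020
    raw = raw[-offset:]                  # 30 bases remain
    read_len = offset                    # -> 30
assert (pos0, read_len, len(raw)) == (1020, 30, 30)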
Example no. 11
def count_conversions(original_fasta,
                      sam_iter,
                      raw_reads_list,
                      c2t_reads_list,
                      index_class,
                      out_dir,
                      allowed_mismatches,
                      mode='w',
                      counts=None,
                      is_colorspace=False):
    print >> sys.stderr, "tabulating methylation for %s" % (original_fasta, )

    fh_raw_reads_list = [open(raw_reads) for raw_reads in raw_reads_list]
    fh_c2t_reads_list = [open(c2t_reads) for c2t_reads in c2t_reads_list]
    fqidx_list = [index_class(c2t_reads) for c2t_reads in c2t_reads_list]
    fa = Fasta(original_fasta)

    #fh_raw_reads = open(raw_reads, 'r')
    #fh_c2t_reads = open(c2t_reads, 'r')

    def get_records(header, idx):
        return get_raw_and_c2t(header, fqidx_list[idx], fh_raw_reads_list[idx],
                               fh_c2t_reads_list[idx])

    chr_lengths = dict((k, len(fa[k])) for k in fa.iterkeys())

    out_sam = open(out_dir + "/methylcoded.sam", mode)

    # get the 3 possible binary files for each chr
    fc, ft, fmethyltype = \
            bin_paths_from_fasta(original_fasta, out_dir)

    # if we're re-running without conversion, it was passed via kwargs.
    # otherwise, we get a new set.
    retry = counts is not None
    unmapped_name = op.join(out_dir, "unmapped.reads")
    if retry:
        unmapped_name += ".try2"
    else:
        counts = get_counts(fc, ft, fa)

    skipped = 0
    align_count = 0
    for p, sam_line, read_len, direction in parse_sam(sam_iter, chr_lengths,
                                                      get_records,
                                                      unmapped_name,
                                                      is_colorspace, out_sam):

        pairs = "CT" if direction == "f" else "GA"  #
        read_id = p['read_id']
        pos0 = p['pos0']
        align_count += 1
        raw = p['raw_read']
        sam_flag = int(sam_line[1])

        # paired and first of read and ends have overlap.
        # this section makes sure that the overlap is only counted 1x.
        if sam_line[7] != '0' and 0 <= int(sam_line[8]) < read_len:
            offset = int(sam_line[8])
            pos0 = pos0 + read_len - offset
            read_len = offset
            raw = raw[-offset:]
            # chop the read info to the non-overlapping bases.
            p['read_sequence'] = p['read_sequence'][-offset:]

        # the position is the line num * the read_id + read_id (where the +
        # is to account for the newlines.
        genomic_ref = str(fa[p['seqid']][pos0:pos0 + read_len]).upper()
        DEBUG = False
        if DEBUG:
            print >> sys.stderr, "=" * 80
            print >> sys.stderr, sam_line
            print >> sys.stderr, "=" * 80
            print >> sys.stderr, ">>", sam_line[1], sam_flag
            idx = 0 if (sam_flag & 128) == 0 else 1
            araw, ac2t = get_records(read_id, idx)
            #araw.seq = araw.seq[-offset:]
            #c2t.seq = ac2t.seq[-offset:]
            print >> sys.stderr, "f['%s'][%i:%i + %i]" % (p['seqid'], pos0,
                                                          pos0, read_len)
            print >> sys.stderr, "mismatches:", p['nmiss']
            print >> sys.stderr, "ref        :", genomic_ref
            if direction == 'r':
                print >> sys.stderr, "raw_read(r):", raw
                c2t = ac2t.seq
                if not is_colorspace:
                    if (sam_flag & 128) == 0:
                        c2t = revcomp(c2t)[-offset:]
                    assert c2t == p['read_sequence'], (c2t, p['read_sequence'])

            else:
                print >> sys.stderr, "raw_read(f):", raw
                c2t = ac2t.seq
                if not is_colorspace:
                    if (sam_flag & 128) != 0:
                        c2t = revcomp(c2t)
                        p['read_sequence'] = revcomp(p['read_sequence'])
                    assert c2t == p['read_sequence'], (c2t, p['read_sequence'],
                                                       sam_flag)
            print >> sys.stderr, "c2t        :", c2t, "\n"

        # have to keep the ref in forward here to get the correct bp
        # positions. look for CT when forward and GA when back.
        current_mismatches = p['nmiss']
        # we send in the current mismatches and allowed mismatches so that in
        # the case where a C in the ref seq has become a T in the aligned seq
        # (but wasn't counted as a mismatch because we had converted C=>T), if
        # these errors cause the number of mismatches to exceed the number of
        # allowed mismatches, we don't include the stats for this read.
        # if we're retrying, the reads are treated but the genome is not
        # ct-converted, so don't check the mismatches since bowtie already
        # handled that correctly.
        remaining_mismatches = -1 if retry else (allowed_mismatches -
                                                 current_mismatches)

        this_skipped = _update_conversions(genomic_ref, raw, pos0, pairs,
                                           counts[p['seqid']]['c'],
                                           counts[p['seqid']]['t'],
                                           remaining_mismatches, read_len,
                                           sam_line[5])
        if this_skipped == 0:
            # only print the line to the sam file if we use it in our calcs.
            print >> out_sam, "\t".join(sam_line)
        skipped += this_skipped
        if DEBUG:
            raw_input("press any key...\n")
    print >> sys.stderr, "total alignments: %i" % align_count
    print >>sys.stderr, \
            "skipped %i alignments (%.3f%%) where read T mapped to genomic C" % \
                  (skipped, 100.0 * skipped / (align_count or 1))
    return counts, unmapped_name
Example no. 12
def split(args):
    parser = optparse.OptionParser("""\
   split a fasta file into separated files.
        pyfasta split -n 6 [-k 5000 ] some.fasta
    the output will be some.0.fasta, some.1.fasta ... some.6.fasta
    the sizes will be as even as reasonable.
   """)
    parser.add_option(
        "--header",
        dest="header",
        metavar="FILENAME_FMT",
        help="""this overrides all other options. if specified, it will
               split the file into a separate file for each header. it
               will be a template specifying the file name for each new file.
               e.g.:    "%(fasta)s.%(seqid)s.fasta"
               where 'fasta' is the basename of the input fasta file and seqid
               is the header of each entry in the fasta file.""",
        default=None)

    parser.add_option("-n",
                      "--n",
                      type="int",
                      dest="nsplits",
                      help="number of new files to create")
    parser.add_option("-o",
                      "--overlap",
                      type="int",
                      dest="overlap",
                      help="overlap in basepairs",
                      default=0)
    parser.add_option("-k",
                      "--kmers",
                      type="int",
                      dest="kmers",
                      default=-1,
                      help="""\
    split big files into pieces of this size in basepairs. the default
    of -1 means do not split the sequence up into k-mers, just split
    based on the headers. a reasonable value would be 10Kbp""")
    options, fasta = parser.parse_args(args)
    if not (fasta and (options.nsplits or options.header)):
        sys.exit(parser.print_help())

    if isinstance(fasta, (tuple, list)):
        assert len(fasta) == 1, fasta
        fasta = fasta[0]

    kmer = options.kmers if options.kmers != -1 else None
    overlap = options.overlap if options.overlap != 0 else None
    f = Fasta(fasta)
    if options.header:
        names = dict([(seqid, options.header % \
                      dict(fasta=f.fasta_name, seqid=seqid)) \
                                       for seqid in f.iterkeys()])
        """
        if len(names) > 0:
            assert names[0][1] != names[1][1], ("problem with header format", options.header)
        fhs = dict([(seqid, open(fn, 'wb')) for seqid, fn in names[:200]])
        fhs.extend([(seqid, StringIO(), fn) for seqid, fn in names[200:]])
        """
        return with_header_names(f, names)
    else:
        names = newnames(fasta,
                         options.nsplits,
                         kmers=kmer,
                         overlap=overlap,
                         header=options.header)

        #fhs = [open(n, 'wb') for n in names]
    if options.kmers == -1:
        return without_kmers(f, names)
    else:
        return with_kmers(f, names, options.kmers, options.overlap)