Esempio n. 1
0
    def test_pattern(self):
        """Check the pattern string returned when ``pattern_only=True``.

        Covers both the default call and the call with an explicit
        ``out_dir``; in the latter case the expected pattern is prefixed
        with that directory.
        """
        # NOTE(review): self.fasta is presumably set in setUp to a fasta
        # whose base name is "a" -- confirm against the fixture.
        pattern = bin_paths_from_fasta(self.fasta, pattern_only=True)
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(pattern, 'a.%s.*.bin')

        pattern = bin_paths_from_fasta(self.fasta,
                                       pattern_only=True,
                                       out_dir="/a")
        self.assertEqual(pattern, '/a/a.%s.*.bin')
Esempio n. 2
0
 def test_paths(self):
     """Check the list of bin paths returned for the default call.

     The expected result is three path templates, ending in ``c.bin``,
     ``t.bin`` and ``methyltype.bin``.
     """
     # NOTE(review): self.fasta is presumably set in setUp to a fasta
     # whose base name is "a" -- confirm against the fixture.
     expected = ['a.%s.c.bin', 'a.%s.t.bin', 'a.%s.methyltype.bin']
     paths = bin_paths_from_fasta(self.fasta)
     # assertEqual replaces the deprecated assertEquals alias.
     self.assertEqual(paths, expected)
Esempio n. 3
0
    def test_pattern(self):
        """Verify the ``pattern_only=True`` result of bin_paths_from_fasta.

        The first call uses the default output directory; the second
        passes ``out_dir="/a"`` and expects that directory as a prefix.
        """
        # NOTE(review): self.fasta presumably comes from setUp and has the
        # base name "a" -- confirm against the fixture.
        without_dir = bin_paths_from_fasta(self.fasta, pattern_only=True)
        # assertEqual is the supported spelling; assertEquals is a
        # deprecated alias.
        self.assertEqual(without_dir, 'a.%s.*.bin')

        with_dir = bin_paths_from_fasta(self.fasta, pattern_only=True,
                                        out_dir="/a")
        self.assertEqual(with_dir, '/a/a.%s.*.bin')
Esempio n. 4
0
 def test_paths(self):
     """Verify the three bin path templates returned by default.

     With only the fasta argument, bin_paths_from_fasta is expected to
     return a list with the ``c``, ``t`` and ``methyltype`` templates.
     """
     # NOTE(review): self.fasta presumably comes from setUp and has the
     # base name "a" -- confirm against the fixture.
     paths = bin_paths_from_fasta(self.fasta)
     # assertEqual is the supported spelling; assertEquals is a
     # deprecated alias.
     self.assertEqual(
         paths,
         ['a.%s.c.bin', 'a.%s.t.bin', 'a.%s.methyltype.bin'])
Esempio n. 5
0
def parse_gsnap_sam(gsnap_f, ref_path, out_dir, paired_end):
    """Tabulate conversion counts from a GSNAP SAM file.

    Reads `gsnap_f` line by line. ``@SQ`` header lines and every alignment
    that passes the pairing/flag filter are copied to a second SAM file
    whose name is `gsnap_f` with ``.gsnap.sam`` replaced by ``.sam``.
    Alignments that also pass the insert-length and mate-position filters
    are compared against the reference via ``_update_conversions``, which
    receives the per-sequence ``'c'`` and ``'t'`` count containers.
    Afterwards the counts are written with ``write_files``, a ``cmd.ran``
    file recording the date, working directory and command line is written
    into `out_dir`, and ``write_sam_commands`` is called.

    Args:
        gsnap_f: path to the GSNAP SAM output; must contain ``.gsnap.sam``.
        ref_path: path to the reference fasta, passed to ``Fasta``.
        out_dir: directory that receives the count files and ``cmd.ran``.
        paired_end: if true, keep only lines whose 7th column is ``"="``;
            otherwise drop lines whose flag is exactly 4.

    Raises:
        ValueError: if `gsnap_f` does not contain ``.gsnap.sam``. In that
            case the derived output path would equal the input path and
            opening it for writing would truncate the input.
    """
    subset_path = gsnap_f.replace(".gsnap.sam", ".sam")
    if subset_path == gsnap_f:
        raise ValueError(
            "expected a path containing '.gsnap.sam', got %r" % (gsnap_f,))

    fa = Fasta(ref_path)

    fc, ft, fmethyltype = bin_paths_from_fasta(fa.fasta_name, out_dir)
    counts = get_counts(fc, ft, fa)

    sys.stderr.write("tabulating methylation\n")

    # Both handles are closed (and the subset flushed) before the counts
    # and follow-up commands are written.
    with open(gsnap_f) as sam_in, open(subset_path, "w") as gsnap_subset:
        for sline in sam_in:
            # NOTE(review): only @SQ headers are recognised; any other
            # header line falls through to int(line[1]) below -- confirm
            # that GSNAP output contains no other header types.
            if sline.startswith("@SQ"):
                gsnap_subset.write(sline.strip() + "\n")
                continue

            line = sline.split("\t")
            sam_flag = int(line[1])
            if paired_end:
                # the ends didn't map to same spot.
                if line[6] != "=":
                    continue
            elif sam_flag == 4:
                # no reported alignments.
                continue

            gsnap_subset.write(sline.rstrip("\n") + "\n")

            seqid = line[2]
            aln_seq = line[9]
            read_length = len(aln_seq)
            bp0 = int(line[3]) - 1  # SAM positions are 1-based
            # Flag bits 16 and 128 (reverse strand / second in pair in the
            # SAM spec); exactly one of them set selects the G->A letters.
            ga = ((sam_flag & 16) != 0) ^ ((sam_flag & 128) != 0)
            insert_length = int(line[8])

            # both ends start at exactly the same place.
            # NOTE(review): this and the line[7] check below also apply
            # when paired_end is false; single-end records with 0 in these
            # columns would all be skipped -- confirm this is intended.
            if insert_length == 0:
                continue
            # handle overlapping reads. one side has + insert, the other
            # is -; trim the read with the negative insert so the
            # overlapping bases are not counted twice.
            if -read_length < insert_length < 0:
                insert_length = abs(insert_length)
                aln_seq = aln_seq[:-(read_length - insert_length)]
                read_length = len(aln_seq)
            if line[7] == '0':
                continue

            bp1 = bp0 + read_length
            ref_seq = (fa[seqid][bp0:bp1]).upper()

            letters = 'GA' if ga else 'CT'
            # The reference slice can be shorter than the read (e.g. at
            # the end of a sequence), so use its length from here on.
            read_length = len(ref_seq)
            assert read_length > 0, (bp0, bp1)
            # NOTE(review): the meaning of the literal 50 is defined by
            # _update_conversions, which is not visible here.
            _update_conversions(ref_seq, aln_seq, bp0, letters,
                                counts[seqid]['c'], counts[seqid]['t'],
                                50, read_length, line[5])

    write_files(fa.fasta_name, out_dir, counts)

    import datetime
    with open(out_dir + "/cmd.ran", "w") as cmd:
        cmd.write("#date: %s\n" % datetime.date.today())
        cmd.write("#path: %s\n" % op.abspath("."))
        cmd.write(" ".join(sys.argv) + "\n")
    write_sam_commands(out_dir, fa, "methylcoder.gsnap")