def parse_gsnap_sam(gsnap_f, ref_path, out_dir, paired_end, write_bin):
    """Tally conversions from a GSNAP SAM file and write the results.

    Every non-header line of ``gsnap_f`` is split on tabs and filtered
    (see the inline comments).  For each read that is kept, the matching
    slice of the reference is fetched and handed, together with the read,
    to ``_update_conversions`` along with the per-sequence ``'c'`` and
    ``'t'`` count containers.  Afterwards the counts are written with
    ``write_files``, the invocation is recorded in ``<out_dir>/cmd.ran``
    and ``write_sam_commands`` is called.

    Args:
        gsnap_f: path of the tab-separated SAM file to read.
        ref_path: path handed to ``Fasta`` to open the reference.
        out_dir: directory that receives ``cmd.ran`` and is forwarded to
            the helper functions.
        paired_end: when true, apply the paired-end filters; otherwise
            only skip reads whose flag is exactly 4.
        write_bin: forwarded unchanged to ``write_files``.

    Raises:
        AssertionError: if the reference slice for a read is empty.
    """
    fa = Fasta(ref_path)
    # fmethyltype is returned by the helper but is not used in this function.
    fc, ft, fmethyltype = bin_paths_from_fasta(fa.fasta_name, out_dir)
    counts = get_counts(fc, ft, fa)

    sys.stderr.write("tabulating methylation for %s\n" % gsnap_f)

    # The context manager closes the input even if a line fails to parse.
    with open(gsnap_f) as sam_fh:
        for sline in sam_fh:
            # header lines
            if sline.startswith("@"):
                continue

            line = sline.split("\t")
            sam_flag = int(line[1])
            if paired_end:
                # the ends didn't map to same spot.
                if line[6] != "=":
                    continue
            elif sam_flag == 4:
                # no reported alignments.
                continue

            seqid = line[2]
            aln_seq = line[9]
            read_length = len(aln_seq)
            bp0 = int(line[3]) - 1  # column 4 is 1-based; slices are 0-based
            # SAM flag bit 16 (reverse strand) XOR bit 128 (second read of
            # a pair) selects the 'GA' letter pair instead of 'CT'.
            ga = ((sam_flag & 16) != 0) ^ ((sam_flag & 128) != 0)
            insert_length = int(line[8])

            # both ends start at exactly the same place.
            if paired_end and insert_length == 0:
                continue

            # handle overlapping reads. one side has + insert, the other is -
            if -read_length < insert_length < 0:
                insert_length = abs(insert_length)
                # keeps the first insert_length bases of the read
                aln_seq = aln_seq[:-(read_length - insert_length)]
                read_length = len(aln_seq)

            if paired_end and line[7] == '0':
                continue

            bp1 = bp0 + read_length
            ref_seq = (fa[seqid][bp0:bp1]).upper()

            letters = 'GA' if ga else 'CT'
            # The slice may be shorter than requested, so use its real length.
            read_length = len(ref_seq)
            assert read_length > 0, (bp0, bp1)
            # NOTE(review): the meaning of the literal 50 is defined by
            # _update_conversions, which is not visible here.
            _update_conversions(ref_seq, aln_seq, bp0, letters,
                                counts[seqid]['c'], counts[seqid]['t'],
                                50, read_length, line[5])

    write_files(fa.fasta_name, out_dir, counts, write_bin)

    import datetime
    # Record when, where and how the program was invoked; the file is
    # closed (and therefore flushed) before the SAM commands are written.
    with open(out_dir + "/cmd.ran", "w") as cmd:
        cmd.write("#date: %s\n" % str(datetime.date.today()))
        cmd.write("#path: %s\n" % op.abspath("."))
        cmd.write(" ".join(sys.argv) + "\n")

    write_sam_commands(out_dir, fa, "methylcoded.gsnap")
def main(argv):
    """Parse command-line options and write the generated files.

    Recognised options:
        -h / --help            show help and exit
        -v / --verbose         set the module-level ``_verbose`` flag
        -f / --force           forwarded to ``write_files`` as ``force``
        -o / --out NAME        output base name (a trailing ".yaml" is removed)
        -g / --generate MODE   must be a key of ``generation_modes``
        -s / --search MODE     must be a key of ``search_modes``

    The remaining positional arguments are handed to ``read_args``; when
    no ``--out`` is given, the first positional argument (minus a
    trailing ".yaml") is used as the output base name.

    Args:
        argv: list of command-line arguments without the program name.
    """
    global _verbose

    save = ""
    force = False
    generate = "log-uniform"
    search_mode = "fix-grid-search"

    try:
        opts, args = getopt.getopt(
            argv, "vhfo:s:g:",
            ["verbose", "help", "force", "out=", "search=", "generate="])
    except getopt.GetoptError as getopt_error:
        print("%s %s" % (getopt_error.msg, getopt_error.opt))
        error()
        # NOTE(review): error() presumably terminates the program; the
        # return guarantees `args` is never used while unbound if it does not.
        return

    for opt, arg in opts:
        if opt in ("-h", "--help"):
            show_help()
            sys.exit()
        elif opt in ("-v", "--verbose"):
            _verbose = True
        elif opt in ("-f", "--force"):
            force = True
        elif opt in ("-o", "--out"):
            # Escape the dot so only a literal ".yaml" suffix is stripped.
            save = re.sub(r'\.yaml$', '', arg)
        elif opt in ("-g", "--generate"):
            if arg not in generation_modes:
                print("generate MODE is invalid: " + arg)
                error()
            generate = arg
        elif opt in ("-s", "--search"):
            if arg not in search_modes:
                print("search MODE is invalid: " + arg)
                error()
            search_mode = arg

    template, hparams = read_args(args)
    if not save:
        save = re.sub(r'\.yaml$', '', args[0])
    hpnames, hpvalues = generate_params(hparams, generate, search_mode)

    # fill template
    template = ''.join(template)
    # Read the template through a context manager so the handle is closed.
    with open(template, 'r') as template_fh:
        template_text = template_fh.read()
    write_files(template_text, hpnames, hpvalues, save, force=force)

    if _verbose:
        # NOTE(review): `files` is not defined in this function; presumably
        # a module-level collection filled elsewhere -- confirm.
        print('\n'.join(files) + '\n')
def parse_gsnap_sam(gsnap_f, ref_path, out_dir, paired_end, write_bin):
    """Tally conversions from a GSNAP SAM file and write the results.

    Every line of ``gsnap_f`` that does not start with "@" is split on
    tabs and filtered (see the inline comments).  For each read that is
    kept, the matching slice of the reference is fetched and handed,
    together with the read, to ``_update_conversions`` along with the
    per-sequence ``'c'`` and ``'t'`` count containers.  Afterwards the
    counts are written with ``write_files``, the invocation is recorded
    in ``<out_dir>/cmd.ran`` and ``write_sam_commands`` is called.

    Args:
        gsnap_f: path of the tab-separated SAM file to read.
        ref_path: path handed to ``Fasta`` to open the reference.
        out_dir: directory that receives ``cmd.ran`` and is forwarded to
            the helper functions.
        paired_end: when true, apply the paired-end filters; otherwise
            only skip reads whose flag is exactly 4.
        write_bin: forwarded unchanged to ``write_files``.

    Raises:
        AssertionError: if the reference slice for a read is empty.
    """
    fa = Fasta(ref_path)
    # fmethyltype is returned by the helper but is not used in this function.
    fc, ft, fmethyltype = bin_paths_from_fasta(fa.fasta_name, out_dir)
    counts = get_counts(fc, ft, fa)

    sys.stderr.write("tabulating methylation for %s\n" % gsnap_f)

    # The context manager closes the input even if a line fails to parse.
    with open(gsnap_f) as sam_fh:
        for sline in sam_fh:
            # Skip every header line, not only "@SQ": any other "@..."
            # record would otherwise fail in int(line[1]) below.
            if sline.startswith("@"):
                continue

            line = sline.split("\t")
            sam_flag = int(line[1])
            if paired_end:
                # the ends didn't map to same spot.
                if line[6] != "=":
                    continue
            elif sam_flag == 4:
                # no reported alignments.
                continue

            seqid = line[2]
            aln_seq = line[9]
            read_length = len(aln_seq)
            bp0 = int(line[3]) - 1  # column 4 is 1-based; slices are 0-based
            # SAM flag bit 16 (reverse strand) XOR bit 128 (second read of
            # a pair) selects the 'GA' letter pair instead of 'CT'.
            ga = ((sam_flag & 16) != 0) ^ ((sam_flag & 128) != 0)
            insert_length = int(line[8])

            # both ends start at exactly the same place.
            if paired_end and insert_length == 0:
                continue

            # handle overlapping reads. one side has + insert, the other is -
            if -read_length < insert_length < 0:
                insert_length = abs(insert_length)
                # keeps the first insert_length bases of the read
                aln_seq = aln_seq[:-(read_length - insert_length)]
                read_length = len(aln_seq)

            if paired_end and line[7] == '0':
                continue

            bp1 = bp0 + read_length
            ref_seq = (fa[seqid][bp0:bp1]).upper()

            letters = 'GA' if ga else 'CT'
            # The slice may be shorter than requested, so use its real length.
            read_length = len(ref_seq)
            assert read_length > 0, (bp0, bp1)
            # NOTE(review): the meaning of the literal 50 is defined by
            # _update_conversions, which is not visible here.
            _update_conversions(ref_seq, aln_seq, bp0, letters,
                                counts[seqid]['c'], counts[seqid]['t'],
                                50, read_length, line[5])

    write_files(fa.fasta_name, out_dir, counts, write_bin)

    import datetime
    # Record when, where and how the program was invoked; the file is
    # closed (and therefore flushed) before the SAM commands are written.
    with open(out_dir + "/cmd.ran", "w") as cmd:
        cmd.write("#date: %s\n" % str(datetime.date.today()))
        cmd.write("#path: %s\n" % op.abspath("."))
        cmd.write(" ".join(sys.argv) + "\n")

    write_sam_commands(out_dir, fa, "methylcoded.gsnap")