def main(chrs, scale, step, meth_dir, cpg_dir, out_dir, sname):
    """Prepare CpG sites and locate the methylation-score file per chromosome.

    For every name in ``chrs`` the visible code:

    1. asks ``check_file`` for ``<chr>_cpgs`` with ``PATH=cpg_dir``, parses the
       returned file with ``SiteParser(chr).parse_sites(..., 'CpGSimple')`` and
       sets ``meth_score`` of every parsed site to the placeholder string
       ``'-10000'``;
    2. asks ``check_file`` for ``methstatus_<chr>`` with ``PATH=meth_dir``.

    Progress messages are written to ``create_microarray_tracks.LOG`` in the
    current working directory.

    A ``MethError`` from the CpG lookup is reported on stderr and the
    chromosome is skipped; a ``MethError`` from the methstatus lookup is
    reported on stderr and the process exits with status 2.

    NOTE(review): the body shown here appears to be truncated -- ``scale``,
    ``step``, ``out_dir``, ``sname``, ``cpg_sites`` and ``mfile`` are never
    used and ``LOGFH`` is never closed in the visible code.
    """
    LOGFH = open('create_microarray_tracks.LOG', 'w')
    LOGFH.write('Start the program ...  %s\n' % datetime.datetime.now())
    # Create methylation tracks for each chromosome
    # (the loop variable shadows the builtin ``chr`` inside this function)
    for chr in chrs:
        # 1. Build CpGs
        LOGFH.write('... Build CpG sites for [ %s ] ...  %s\n' % (chr, datetime.datetime.now()))
        try:
            cpgfile = check_file(chr+'_cpgs', PATH=cpg_dir)
        except MethError, e:
            # No usable CpG file: report the problem and move on to the next chromosome
            print >> sys.stderr, e.value
            continue
        siteparser = SiteParser(chr)
        siteparser.parse_sites(cpgfile, 'CpGSimple')
        cpg_sites = siteparser.get_sites()
        for cpg in cpg_sites:  # define empty meth score
            cpg.meth_score = '-10000'
        # 2. Get methylation scores
        # note files are saved in 'methstatus_chrN'
        LOGFH.write('... Obtain methylation scores ...\n')
        try:
            mfile = check_file('methstatus_' + chr, PATH=meth_dir)
        except MethError, e:
            # Unlike a missing CpG file, a missing score file aborts the whole run
            print >> sys.stderr, e.value
            sys.exit(2)
Exemple #2
0
def main(chrs, scale, step, meth_dir, cpg_dir, out_dir, sname):
    """Prepare CpG sites and locate the methylation-score file per chromosome.

    For every name in ``chrs`` the visible code parses ``<chr>_cpgs`` (looked
    up through ``check_file`` with ``PATH=cpg_dir``) using ``SiteParser`` in
    ``'CpGSimple'`` mode, assigns the placeholder ``meth_score`` ``'-10000'``
    to each site, and then looks up ``methstatus_<chr>`` with
    ``PATH=meth_dir``.  Progress is logged to
    ``create_microarray_tracks.LOG`` in the current working directory.

    Error handling: a ``MethError`` for the CpG file skips the chromosome;
    a ``MethError`` for the methstatus file terminates the process with
    exit status 2.  Both print ``e.value`` to stderr.

    NOTE(review): the body shown here appears to be truncated -- ``scale``,
    ``step``, ``out_dir``, ``sname``, ``cpg_sites`` and ``mfile`` are never
    used and ``LOGFH`` is never closed in the visible code.
    """
    LOGFH = open('create_microarray_tracks.LOG', 'w')
    LOGFH.write('Start the program ...  %s\n' % datetime.datetime.now())
    # Create methylation tracks for each chromosome
    # (the loop variable shadows the builtin ``chr`` inside this function)
    for chr in chrs:
        # 1. Build CpGs
        LOGFH.write('... Build CpG sites for [ %s ] ...  %s\n' %
                    (chr, datetime.datetime.now()))
        try:
            cpgfile = check_file(chr + '_cpgs', PATH=cpg_dir)
        except MethError, e:
            # No usable CpG file: report the problem and try the next chromosome
            print >> sys.stderr, e.value
            continue
        siteparser = SiteParser(chr)
        siteparser.parse_sites(cpgfile, 'CpGSimple')
        cpg_sites = siteparser.get_sites()
        for cpg in cpg_sites:  # define empty meth score
            cpg.meth_score = '-10000'
        # 2. Get methylation scores
        # note files are saved in 'methstatus_chrN'
        LOGFH.write('... Obtain methylation scores ...\n')
        try:
            mfile = check_file('methstatus_' + chr, PATH=meth_dir)
        except MethError, e:
            # A missing score file is fatal for the whole run
            print >> sys.stderr, e.value
            sys.exit(2)
Exemple #3
0
def main(parafile, chr, anno_dir, re_fragfile, mcrbc_fragfile, out_dir):
    """Look up the three annotation files for one chromosome.

    ``check_file`` is called, in order, for ``<chr>_cpgs``, ``<chr>_re`` and
    ``<chr>_mcrbc`` with ``PATH=anno_dir``.  If any call raises ``MethError``
    its ``value`` is printed to stderr and the process exits with status 2.

    NOTE(review): ``parafile``, ``re_fragfile``, ``mcrbc_fragfile`` and
    ``out_dir`` are not used in the visible part of this function.
    """
    # Annotation files are named <chr> + suffix, e.g. chr1_cpgs, chr1_re, chr1_mcrbc
    try:
        cpg_file, re_sitefile, mcrbc_sitefile = [
            check_file(chr + suffix, PATH=anno_dir)
            for suffix in ('_cpgs', '_re', '_mcrbc')
        ]
    except MethError as e:
        print >> sys.stderr, e.value
        sys.exit(2)
def main(chrs, meth_dir, out_dir, sname):
    """Write one variableStep wiggle track file per chromosome.

    For each name in ``chrs`` the file returned by
    ``check_file('methstatus_<chr>', PATH=meth_dir)`` is read as
    tab-separated text: the second column is parsed as the integer
    coordinate and the ninth column is kept verbatim as the score.  The
    pairs are written in ascending coordinate order to
    ``<out_dir>/meth_<chr>.wig`` below a ``track`` line and a
    ``variableStep`` line.

    Args:
        chrs: iterable of chromosome names (e.g. ``'chr1'``).
        meth_dir: directory handed to ``check_file`` as ``PATH``.
        out_dir: directory that receives the ``.wig`` files.
        sname: sample name embedded in the track description.

    Progress is logged to ``create_wiggle_tracks.LOG`` in the current
    working directory.  A ``MethError`` from ``check_file`` is printed to
    stderr and the process exits with status 2.
    """
    # Local import so the function does not rely on a module-level ``import os``.
    import os

    LOGFH = open('create_wiggle_tracks.LOG', 'w')
    try:
        LOGFH.write('Start the program ...  %s\n' % datetime.datetime.now())
        # Create methylation tracks for each chromosome
        for chr in chrs:
            LOGFH.write('... Obtain methylation scores for [ %s ] ...\n' % chr)
            # 1. Get methylation scores
            # note files are saved in 'methstatus_chrN'
            try:
                mfile = check_file('methstatus_' + chr, PATH=meth_dir)
            except MethError as e:
                print >> sys.stderr, e.value
                sys.exit(2)
            cpg_sites = {}  # coordinate: score
            with open(mfile) as mfh:
                for line in mfh:
                    line_list = line.rstrip().split('\t')
                    cpg_sites[int(line_list[1])] = line_list[8]
            # 2. Write to the output file
            # The description contains spaces, so it is quoted; an unquoted
            # multi-word value is not a valid track-line attribute.
            header1 = ('track type=wiggle_0 name=' + chr
                       + ' description="Wiggle custom track for '
                       + sname + '_' + chr + '" color=128,0,0 visibility=full')
            header2 = 'variableStep chrom=' + chr
            # Honour out_dir instead of silently writing to the working directory
            outfile = os.path.join(out_dir, 'meth_' + chr + '.wig')
            with open(outfile, 'w') as fout:
                fout.write(header1 + '\n' + header2 + '\n')
                for coord in sorted(cpg_sites):
                    fout.write('\t'.join([str(coord), cpg_sites[coord]]) + '\n')
    finally:
        # Close the log on every exit path, including sys.exit() above
        LOGFH.close()
Exemple #5
0
def main(chrs, meth_dir, out_dir, sname):
    """Write one variableStep wiggle track file per chromosome.

    For each name in ``chrs`` the file returned by
    ``check_file('methstatus_<chr>', PATH=meth_dir)`` is read as
    tab-separated text: the second column is parsed as the integer
    coordinate and the ninth column is kept verbatim as the score.  The
    pairs are written in ascending coordinate order to
    ``<out_dir>/meth_<chr>.wig`` below a ``track`` line and a
    ``variableStep`` line.

    Args:
        chrs: iterable of chromosome names (e.g. ``'chr1'``).
        meth_dir: directory handed to ``check_file`` as ``PATH``.
        out_dir: directory that receives the ``.wig`` files.
        sname: sample name embedded in the track description.

    Progress is logged to ``create_wiggle_tracks.LOG`` in the current
    working directory.  A ``MethError`` from ``check_file`` is printed to
    stderr and the process exits with status 2.
    """
    # Local import so the function does not rely on a module-level ``import os``.
    import os

    LOGFH = open('create_wiggle_tracks.LOG', 'w')
    try:
        LOGFH.write('Start the program ...  %s\n' % datetime.datetime.now())
        # Create methylation tracks for each chromosome
        for chr in chrs:
            LOGFH.write('... Obtain methylation scores for [ %s ] ...\n' % chr)
            # 1. Get methylation scores
            # note files are saved in 'methstatus_chrN'
            try:
                mfile = check_file('methstatus_' + chr, PATH=meth_dir)
            except MethError as e:
                print >> sys.stderr, e.value
                sys.exit(2)
            cpg_sites = {}  # coordinate: score
            with open(mfile) as mfh:
                for line in mfh:
                    fields = line.rstrip().split('\t')
                    cpg_sites[int(fields[1])] = fields[8]
            # 2. Write to the output file
            # The description contains spaces, so it is quoted; an unquoted
            # multi-word value is not a valid track-line attribute.
            header1 = ('track type=wiggle_0 name=' + chr
                       + ' description="Wiggle custom track for '
                       + sname + '_' + chr + '" color=128,0,0 visibility=full')
            header2 = 'variableStep chrom=' + chr
            # Honour out_dir instead of silently writing to the working directory
            outfile = os.path.join(out_dir, 'meth_' + chr + '.wig')
            with open(outfile, 'w') as fout:
                fout.write(header1 + '\n' + header2 + '\n')
                for coord in sorted(cpg_sites):
                    fout.write('\t'.join([str(coord), cpg_sites[coord]]) + '\n')
    finally:
        # Close the log on every exit path, including sys.exit() above
        LOGFH.close()
def main(chrs, cpg_dir, out_dir):
    """Write CpG, RE and McrBC BED tracks for each chromosome.

    For each name in ``chrs`` the file returned by
    ``check_file('<chr>_cpgs', cpg_dir)`` is read as tab-separated text.
    Column 2 is the integer position; columns 3 and 4 are integer flags
    tested for truth (named ``isre`` / ``ismcrbc``).  Every row yields a
    ``chr  position  position+2`` record in ``<chr>_cpgs.bed``; rows with a
    non-zero flag are additionally written to ``<chr>_re.bed`` and/or
    ``<chr>_mcrbc.bed``.  All three files are created in ``out_dir`` and
    start with a ``browser position`` line and a ``track`` line.

    Progress is logged to ``create_cpg_tracks.LOG`` in the current working
    directory.  A ``MethError`` from ``check_file`` is printed to stderr and
    the process exits with status 2.
    """
    LOGFH = open('create_cpg_tracks.LOG', 'w')
    try:
        LOGFH.write('Start the program ...  %s\n' % datetime.datetime.now())
        # Get track values for each chromosome
        for chr in chrs:
            LOGFH.write('... Read CpGs on [ %s ] ...  %s\n' % (chr, datetime.datetime.now()))
            # Some fixed info in the output file
            header1 = 'browser position ' + chr + ':1-10000'
            header_cpg = 'track name="'+chr+' CpG" description="CpGs on '+chr+'" color=0,0,0'
            header_re = 'track name="'+chr+' RE" description="RE sites on '+chr+'" color=255,0,0'
            header_mcrbc = 'track name="'+chr+' McrBC" description="McrBC sites on '+chr+'" color=0,0,255'
            # Read CpG file
            try:
                cpgfile = check_file(chr+'_cpgs', cpg_dir)
            except MethError as e:
                # sys.stderr, matching the other scripts; a bare ``stderr``
                # is only defined if it was imported separately.
                print >> sys.stderr, e.value
                sys.exit(2)
            # ``with`` closes all four handles even if a malformed row raises
            with open(cpgfile) as infh, \
                    open(os.path.join(out_dir, chr+'_cpgs.bed'), 'w') as cpgout, \
                    open(os.path.join(out_dir, chr+'_re.bed'), 'w') as reout, \
                    open(os.path.join(out_dir, chr+'_mcrbc.bed'), 'w') as mcrbcout:
                cpgout.write(header1 + '\n')
                cpgout.write(header_cpg + '\n')
                reout.write(header1 + '\n')
                reout.write(header_re + '\n')
                mcrbcout.write(header1 + '\n')
                mcrbcout.write(header_mcrbc + '\n')
                for line in infh:
                    line_list = line.rstrip().split('\t')
                    position = int(line_list[1])
                    isre = int(line_list[2])
                    ismcrbc = int(line_list[3])
                    # The same record goes to every track the site belongs to
                    record = '%s\t%d\t%d\n' % (chr, position, position+2)
                    cpgout.write(record)
                    if isre:
                        reout.write(record)
                    if ismcrbc:
                        mcrbcout.write(record)
    finally:
        # Close the log on every exit path, including sys.exit() above
        LOGFH.close()
    return lib_dict[arg]


if __name__ == '__main__':
    # Command-line entry point: validate the arguments, then hand the read
    # library, chromosome map, mate-pair file and output directory to main().
    parser = argparse.ArgumentParser(
        description='Parse mate-pair reads to get methylation compartments: \
    methylated fragments and unmethylated fragments.',
        epilog='Save coordinates of parsed methylated/unmethylated fragments \
                                     for each chromosome in BED format, generating files like chr*_re.bed.')
    parser.add_argument(
        'lib',
        choices=['re', 'mcrbc'],
        help='sequences generated by the RE or McrBC library')
    parser.add_argument('cmap', help='chromosome map provided by SOLiD')
    parser.add_argument(
        'mates',
        help='mate-pair reads from either the RE or McrBC library')
    parser.add_argument(
        '--out_dir',
        default=os.getcwd(),
        help='directory for parsed fragments, default=current dir')

    args = parser.parse_args()
    readlib = get_readlib(args.lib)

    # Both input files must pass check_file before anything else happens
    try:
        cmapfile = check_file(args.cmap)
        matesfile = check_file(args.mates)
    except MethError as e:
        parser.print_usage()
        print >> sys.stderr, 'MethError: ', e.value
        sys.exit(2)

    # Reject a non-existent output directory up front
    if not os.path.isdir(args.out_dir):
        parser.print_usage()
        print >> sys.stderr, 'Invalid output directory'
        sys.exit(2)
    out_dir = os.path.abspath(args.out_dir)

    main(readlib, cmapfile, matesfile, out_dir)
    
    # Command-line handling for parsing mate-pair reads of one chromosome
    # from a SAM/BAM file: build the parser, validate the input file and the
    # output directory, then call main().
    parser = argparse.ArgumentParser(description='Parse mate-pair reads for a specific chromosome. Reads are saved in SAM/BAM format.',
                                     epilog='Save coordinates of fragments that are formed by properly paired reads \
                                     in BED formate, generating files like chr1_re.bed.')
    # NOTE(review): with type=bool argparse calls bool() on the supplied
    # string, so any non-empty value -- including "False" -- yields True.
    # A bare "--flag" gives const=True; omitting it gives default=False.
    parser.add_argument('--flag', type=bool, nargs='?', const=True, default=False, \
                        help='use flag value to extract paired reads, default=False')
    parser.add_argument('--min_ins', type=int, default=0, help='the minimum insert size of mate-pair reads, default=0')
    parser.add_argument('--max_ins', type=int, default=15000, help='the maximum insert size of mate-pair reads, default=15000')
    parser.add_argument('--out_dir', default=os.getcwd(), help='directory for parsed fragments, default=current dir')
    parser.add_argument('format', choices=['sam', 'bam'], help='input file format')
    parser.add_argument('lib', choices=['re', 'mcrbc'], help='sequences generated by the RE or McrBC library')
    parser.add_argument('chr', help='chromosome name, e.g., chr1')
    parser.add_argument('input', help='input file name')
    # Parse arguments
    args = parser.parse_args()
    # Translate the validated choice strings through the project helpers
    readlib = get_readlib(args.lib)
    fformat = get_format(args.format)
    try:
        infile = check_file(args.input)
    except MethError, e:
        # Input file rejected by check_file: show usage and exit with status 2
        parser.print_usage()
        print >> sys.stderr, 'MethError: ', e.value
        sys.exit(2)
    if os.path.isdir(args.out_dir):
        out_dir = os.path.abspath(args.out_dir)
    else:
        # The output directory must already exist; it is not created here
        parser.print_usage()
        print >> sys.stderr, 'Invalid output directory'
        sys.exit(2)
    main(args.flag, args.min_ins, args.max_ins, fformat, readlib, args.chr, infile, out_dir)
        
    for line in fragfh:  # BED format
        line_list = line.rstrip().split("\t")
        length = str(int(line_list[2]) - int(line_list[1]))
        id = line_list[3] + '-' + length
        id_insert_point = bisect.bisect_right(idlist, id)
        if id_insert_point != 0 and id_insert_point <= len(idlist) and idlist[id_insert_point-1] == id:
            newfh.write(line)
    fragfh.close()
    newfh.close()

if __name__ == "__main__":
    # Command-line entry point: validate the ID list and fragment files,
    # then hand them to main() together with the header-line count and the
    # output file name.
    parser = argparse.ArgumentParser(description='Create custom track files for RE/McrBC fragments (BED format) to \
    display in the UCSC Genome Browser.')
    parser.add_argument('--header', help='number of header lines, default=2', type=int, default=2)
    parser.add_argument('id_list', help='list of [fragment IDs, start coordinate, and end coordinate], \
    generated by filter.py')
    parser.add_argument('frags', help='the origianl fragments (BED format) generated by parse_mates.py')
    parser.add_argument('newfrags', help='the new fragments based on id_list')
    # Parse arguments
    args = parser.parse_args()
    header = args.header
    outfile = args.newfrags
    try:
        listfile = check_file(args.id_list)
        fragfile = check_file(args.frags)
    except MethError as e:
        parser.print_usage()
        # sys.stderr, matching the other scripts; a bare ``stderr`` is only
        # defined if it was imported separately and would otherwise turn the
        # error report into a NameError.
        print >> sys.stderr, e.value
        sys.exit(2)
    main(header, listfile, fragfile, outfile)
if __name__ == '__main__':
    # Command-line entry point of the master script: validate the parameter
    # file, both read files and the output directory, then call main().
    parser = argparse.ArgumentParser(
        description=
        'The master script to generate all sub-scripts for the data analysis pipeline'
    )
    # NOTE(review): with type=bool argparse calls bool() on the supplied
    # string, so "--run False" still yields True; a bare "--run" gives True.
    parser.add_argument('--run', type=bool, nargs='?', const=True, default=False, \
                        help='run the analysis pipeline or just save scripts, default=False')
    parser.add_argument('--format',
                        choices=['mates', 'sam', 'bam'],
                        default='mates',
                        help='paired file format, default=mates')
    parser.add_argument('para', help='parameters required in the pipeline')
    parser.add_argument('re_reads', help='paired read file for RE fragments')
    parser.add_argument('mcrbc_reads',
                        help='paired read file for McrBC fragments')
    parser.add_argument('out_dir',
                        help='directory for all output files of the pipeline')
    # Parse arguments
    args = parser.parse_args()
    try:
        parafile = check_file(args.para)
        refile = check_file(args.re_reads)
        mcrbcfile = check_file(args.mcrbc_reads)
    except MethError as e:
        parser.print_usage()
        print >> sys.stderr, e.value
        sys.exit(2)
    if os.path.isdir(args.out_dir):
        out_dir = os.path.abspath(args.out_dir)
    else:
        # Without this branch ``out_dir`` is unbound and the main() call
        # below dies with a NameError instead of a usable message.
        parser.print_usage()
        print >> sys.stderr, 'Invalid output directory'
        sys.exit(2)
    main(args.run, args.format, parafile, refile, mcrbcfile, out_dir)
Exemple #11
0
                                      1. failed fragments in four classes: failed_refrags_ends,
                                         failed_refrags_mid, failed_mcrbcfrags_ends, failed_mcrbcfrags_mid
                                      2. passed fragments in four classes: passed_refrags_1end,
                                         passed_refrags_2ends, passed_mcrbcfrags_2ends,
                                         passed_mcrbcfrags_1end
                                      3. save filtering results to files'''))
 parser.add_argument('--para', help='filtering parameters', default=os.path.join(DIR, 'data/filter_para'))
 parser.add_argument('--out_dir', help='directory for output files, default=currect dir', default=os.getcwd())
 parser.add_argument('chr', help='chromosome')
 parser.add_argument('anno_dir', help='directory storing annotation files for CpG, RE, and McrBC sites')
 parser.add_argument('re_frags', help='RE fragments')
 parser.add_argument('mcrbc_frags', help='McrBC fragments')
 # Parse arguments
 args = parser.parse_args()
 try:
     parafile = check_file(args.para)  # parafile
 except MethError, e:
     parser.print_usage()
     print >> sys.stderr, e.value
     sys.exit(2)
 if os.path.isdir(args.anno_dir) is True and os.path.isdir(args.out_dir) is True:
     anno_dir = os.path.abspath(args.anno_dir)  # anno_dir
     out_dir = os.path.abspath(args.out_dir)  # out_dir
 else:
     parser.print_usage()
     print >> sys.stderr, 'Invalid directories'
     sys.exit(2)
 try:
     chr = check_chr(args.chr)  # chr
     re_fragfile = check_file(args.re_frags)  # re_fragfile
     mcrbc_fragfile = check_file(args.mcrbc_frags)  # mcrbc_fragfile
Exemple #12
0
    outfh.close()
    LOGFH.write('Finish the program ...  %s\n\n' % str(datetime.datetime.now()))
    LOGFH.close()
    

if __name__ == '__main__':
    # Command-line entry point: collect the global methylation level, the
    # chromosome and its length, the methylation data file and the output
    # directory, validate them, then call main().
    parser = argparse.ArgumentParser(
        description='Estimate DNA methylation states for CpGs in a sample')
    parser.add_argument(
        '--out_dir',
        help='directory for output files, default=currect dir',
        default=os.getcwd())
    parser.add_argument(
        'meth_ave',
        type=float,
        help='global methylation level estimated by LUMA')
    parser.add_argument('chr', help='chromosome')
    parser.add_argument('chr_len', type=int, help='chromosome length')
    parser.add_argument(
        'methdata',
        help='methylation data generated by filter.py')

    args = parser.parse_args()
    p_bar = args.meth_ave
    chr_len = args.chr_len

    # Chromosome name and data file go through the project validators
    try:
        chr = check_chr(args.chr)
        meth_data = check_file(args.methdata)
    except MethError as e:
        parser.print_usage()
        print >> sys.stderr, e.value
        sys.exit(2)

    # The output directory has to exist already
    if not os.path.isdir(args.out_dir):
        parser.print_usage()
        print >> sys.stderr, 'Invalid directories'
        sys.exit(2)
    out_dir = os.path.abspath(args.out_dir)

    main(p_bar, chr, chr_len, meth_data, out_dir)
Exemple #13
0
    LOGFH.close()

def write2output(filename, sites, chr):
    """Write one record per site to ``filename``, one record per line.

    Args:
        filename: path of the file to create (truncated if it exists).
        sites: object whose ``sorted_iter()`` yields the items to write.
        chr: value passed to each item's ``get_record(chr)``; the returned
            string is written followed by a newline.
    """
    # ``with`` closes the file even if sorted_iter() or get_record() raises
    with open(filename, 'w') as fh:
        for pos in sites.sorted_iter():
            fh.write(pos.get_record(chr) + '\n')


if __name__ == '__main__':
    # Command-line entry point: validate the chromosome name and the FASTA
    # file, choose the output base name, then call main().
    parser = argparse.ArgumentParser(
        description='Parse genomic sequences for CpG, RE, and McrBC sites')
    parser.add_argument('chr', help='chromosome')
    parser.add_argument('fasta', help='Fasta sequence for the chromosome')
    parser.add_argument(
        '--outname',
        help='name for the output files, default=chrN')

    args = parser.parse_args()
    try:
        chr = check_chr(args.chr)
        seqfile = check_file(args.fasta)
    except MethError as e:
        parser.print_usage()
        print >> sys.stderr, e.value
        sys.exit(2)

    # Fall back to the validated chromosome name when --outname is omitted
    outname = args.outname if args.outname is not None else chr
    main(chr, seqfile, outname)
                                     
Exemple #14
0
    parser.add_argument(
        '--out_dir',
        default=os.getcwd(),
        help='directory for parsed fragments, default=current dir')
    parser.add_argument('format',
                        choices=['sam', 'bam'],
                        help='input file format')
    parser.add_argument('lib',
                        choices=['re', 'mcrbc'],
                        help='sequences generated by the RE or McrBC library')
    parser.add_argument('chr', help='chromosome name, e.g., chr1')
    parser.add_argument('input', help='input file name')
    # Parse arguments
    args = parser.parse_args()
    readlib = get_readlib(args.lib)
    fformat = get_format(args.format)
    try:
        infile = check_file(args.input)
    except MethError, e:
        parser.print_usage()
        print >> sys.stderr, 'MethError: ', e.value
        sys.exit(2)
    if os.path.isdir(args.out_dir):
        out_dir = os.path.abspath(args.out_dir)
    else:
        parser.print_usage()
        print >> sys.stderr, 'Invalid output directory'
        sys.exit(2)
    main(args.flag, args.min_ins, args.max_ins, fformat, readlib, args.chr,
         infile, out_dir)
Exemple #15
0
if __name__ == "__main__":
    # Command-line entry point: validate the ID list and fragment files,
    # then hand them to main() together with the header-line count and the
    # output file name.
    parser = argparse.ArgumentParser(
        description=
        'Create custom track files for RE/McrBC fragments (BED format) to \
    display in the UCSC Genome Browser.')
    parser.add_argument('--header',
                        help='number of header lines, default=2',
                        type=int,
                        default=2)
    parser.add_argument(
        'id_list',
        help='list of [fragment IDs, start coordinate, and end coordinate], \
    generated by filter.py')
    parser.add_argument(
        'frags',
        help='the origianl fragments (BED format) generated by parse_mates.py')
    parser.add_argument('newfrags', help='the new fragments based on id_list')
    # Parse arguments
    args = parser.parse_args()
    header = args.header
    outfile = args.newfrags
    try:
        listfile = check_file(args.id_list)
        fragfile = check_file(args.frags)
    except MethError as e:
        parser.print_usage()
        # sys.stderr, matching the other scripts; a bare ``stderr`` is only
        # defined if it was imported separately and would otherwise turn the
        # error report into a NameError.
        print >> sys.stderr, e.value
        sys.exit(2)
    main(header, listfile, fragfile, outfile)
Exemple #16
0
        epilog='Save coordinates of parsed methylated/unmethylated fragments \
                                     for each chromosome in BED format, generating files like chr*_re.bed.'
    )
    parser.add_argument('lib',
                        choices=['re', 'mcrbc'],
                        help='sequences generated by the RE or McrBC library')
    parser.add_argument('cmap', help='chromosome map provided by SOLiD')
    parser.add_argument(
        'mates', help='mate-pair reads from either the RE or McrBC library')
    parser.add_argument(
        '--out_dir',
        default=os.getcwd(),
        help='directory for parsed fragments, default=current dir')
    # Parse arguments
    args = parser.parse_args()
    readlib = get_readlib(args.lib)
    try:
        cmapfile = check_file(args.cmap)
        matesfile = check_file(args.mates)
    except MethError, e:
        parser.print_usage()
        print >> sys.stderr, 'MethError: ', e.value
        sys.exit(2)
    if os.path.isdir(args.out_dir):
        out_dir = os.path.abspath(args.out_dir)
    else:
        parser.print_usage()
        print >> sys.stderr, 'Invalid output directory'
        sys.exit(2)
    main(readlib, cmapfile, matesfile, out_dir)
    for line in lines:
        if not re.search('^#', line):  # skip comments
            k, v = line.rstrip().split('\t')
            para_dict[k] = v
    return para_dict


if __name__ == '__main__':
    # Command-line entry point of the master script: validate the parameter
    # file, both read files and the output directory, then call main().
    parser = argparse.ArgumentParser(description='The master script to generate all sub-scripts for the data analysis pipeline')
    # NOTE(review): with type=bool argparse calls bool() on the supplied
    # string, so "--run False" still yields True; a bare "--run" gives True.
    parser.add_argument('--run', type=bool, nargs='?', const=True, default=False, \
                        help='run the analysis pipeline or just save scripts, default=False')
    parser.add_argument('--format', choices=['mates', 'sam', 'bam'], default='mates', help='paired file format, default=mates')
    parser.add_argument('para', help='parameters required in the pipeline')
    parser.add_argument('re_reads', help='paired read file for RE fragments')
    parser.add_argument('mcrbc_reads', help='paired read file for McrBC fragments')
    parser.add_argument('out_dir', help='directory for all output files of the pipeline')
    # Parse arguments
    args = parser.parse_args()
    try:
        parafile = check_file(args.para)
        refile = check_file(args.re_reads)
        mcrbcfile = check_file(args.mcrbc_reads)
    except MethError as e:
        parser.print_usage()
        print >> sys.stderr, e.value
        sys.exit(2)
    if os.path.isdir(args.out_dir):
        out_dir = os.path.abspath(args.out_dir)
    else:
        # Without this branch ``out_dir`` is unbound and the main() call
        # below dies with a NameError instead of a usable message.
        parser.print_usage()
        print >> sys.stderr, 'Invalid output directory'
        sys.exit(2)
    main(args.run, args.format, parafile, refile, mcrbcfile, out_dir)