def calculate_mods_frequency(mods_files, prob_cf):
    """Aggregate per-site statistics from modification-call files.

    Each line of every file is split on tabs and wrapped in a ``ModRecord``.
    Records accepted by ``ModRecord.is_record_callable(prob_cf)`` are
    accumulated into one ``SiteStats`` per site key: the ``_prob_0`` and
    ``_prob_1`` values are summed, ``_coverage`` is incremented, and either
    ``_met`` (when ``_called_label == 1``) or ``_unmet`` is incremented.

    A usage summary is printed to stdout once all files have been read.

    Args:
        mods_files: iterable of file paths; paths ending in ``.gz`` are read
            through ``gzip`` in text mode, all others as plain text.
        prob_cf: value forwarded to ``ModRecord.is_record_callable``
            (presumably a probability cutoff -- confirm against ModRecord).

    Returns:
        dict mapping each site key to its accumulated ``SiteStats``.
    """
    sitekey2stats = dict()

    count, used = 0, 0
    for mods_file in mods_files:
        opener = gzip.open if mods_file.endswith(".gz") else open
        # `with` closes the handle even if a malformed line makes parsing raise.
        with opener(mods_file, 'rt') as infile:
            for line in infile:
                count += 1
                mod_record = ModRecord(line.strip().split("\t"))
                if not mod_record.is_record_callable(prob_cf):
                    continue

                site_key = mod_record._site_key
                # The dict itself tracks which sites were already seen.
                if site_key not in sitekey2stats:
                    sitekey2stats[site_key] = SiteStats(mod_record._strand,
                                                        mod_record._pos_in_strand,
                                                        mod_record._kmer)
                site_stats = sitekey2stats[site_key]
                site_stats._prob_0 += mod_record._prob_0
                site_stats._prob_1 += mod_record._prob_1
                site_stats._coverage += 1
                if mod_record._called_label == 1:
                    site_stats._met += 1
                else:
                    site_stats._unmet += 1
                used += 1

    # Guard against division by zero when no lines were read at all.
    used_pct = used / float(count) * 100 if count else 0.0
    print("{:.2f}% ({} of {}) calls used..".format(used_pct, used, count))
    return sitekey2stats
Example #2
0
def calculate_mods_frequency(mods_files, prob_cf):
    """Build per-site statistics from plain-text modification-call files.

    Every line is split on tabs into a ``ModRecord``. Records for which
    ``is_record_callable(prob_cf)`` is true contribute to the ``SiteStats``
    of their site key: probabilities are summed, coverage is incremented and
    the call is tallied as ``_met`` (label 1) or ``_unmet`` (anything else).

    Prints how many of the records read were used, then returns the dict
    mapping site key to ``SiteStats``.
    """
    sitekey2stats = {}
    n_total = 0
    n_used = 0

    for path in mods_files:
        with open(path, 'r') as handle:
            for raw_line in handle:
                n_total += 1
                record = ModRecord(raw_line.strip().split("\t"))
                if not record.is_record_callable(prob_cf):
                    continue

                key = record._site_key
                stats = sitekey2stats.get(key)
                if stats is None:
                    # First callable record for this site: create its entry.
                    stats = SiteStats(record._strand,
                                      record._pos_in_strand,
                                      record._kmer)
                    sitekey2stats[key] = stats

                stats._prob_0 += record._prob_0
                stats._prob_1 += record._prob_1
                stats._coverage += 1
                if record._called_label == 1:
                    stats._met += 1
                else:
                    stats._unmet += 1
                n_used += 1

    print("{} of {} samples used..".format(n_used, n_total))
    return sitekey2stats
Example #3
0
def sample_sites(filename, is_methylated):
    """Load every call in *filename* as a ``CallRecord`` and shuffle them.

    Each line is split on whitespace into a ``ModRecord``; its site key,
    called label and two probabilities are copied into a ``CallRecord``
    together with the caller-supplied ``is_methylated`` value.

    Args:
        filename: path of the text file to read.
        is_methylated: value stored on every ``CallRecord`` created
            (presumably the ground-truth label of the file -- confirm with caller).

    Returns:
        list of ``CallRecord`` objects in random order (shuffled in place with
        ``random.shuffle``).
    """
    all_crs = list()
    # `with` ensures the file is closed even if a line fails to parse.
    with open(filename) as rf:
        for line in rf:
            mt_record = ModRecord(line.rstrip().split())
            all_crs.append(
                CallRecord(mt_record._site_key, mt_record._called_label,
                           is_methylated, mt_record._prob_0, mt_record._prob_1))
    print('there are {} basemod candidates totally'.format(len(all_crs)))

    random.shuffle(all_crs)
    return all_crs