Esempio n. 1
0
def generate_plots_from_pickle(limit_to, report_dir, target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path):
    """Write one XLS report per within-order merged archaeal kplet sublist.

    Loads ``pentaplets_merged_within.p.bz2`` from
    ``<gv.project_data_path>/Archea/pickle/<limit_to>/`` and writes the reports
    to ``<gv.project_data_path>/Archea/reports/merged_within_orders/<report_dir>/<i>/``
    as ``<j>_<i>.xls``, where ``i`` is the kplet size label and ``j`` counts
    the reports actually written for that label (starting at 1).

    Args:
        limit_to: Converted with ``str()`` and used as the pickle sub-directory.
        report_dir: Name of the sub-directory that receives the reports.
        target_profiles: Forwarded to ``dist.get_flank_distributions`` and to
            the XLS writer.
        profile2def: Forwarded unchanged to ``r.write_to_xls``.
        gid2arcog_cdd: Forwarded unchanged to ``r.write_to_xls``.
        neighborhood_files_path: Forwarded to ``dist.get_flank_distributions``
            and ``merging.merge_into_file_summaries``.

    Returns:
        None. The results are the files written by ``r.write_to_xls``.

    Note:
        ``file2src_src2org_map`` is not a parameter; it is read from the
        enclosing (module) scope.
    """
    data_path = os.path.join(gv.project_data_path, 'Archea/pickle/')

    fname = os.path.join(data_path, str(limit_to), 'pentaplets_merged_within.p.bz2')
    pentaplets = t.load_compressed_pickle(fname)
    # NOTE(review): only the pentaplets pickle is loaded. The pool list below
    # repeats it for the labels 4, 3 and 2, so the same data is reported under
    # every label. This looks like a temporary shortcut (the loads of the
    # quadruplets/triplets/duplets pickles were commented out) - confirm.

    print('Generationg reports for within orders merged lists')

    report_files_dir = os.path.join(gv.project_data_path, 'Archea/reports/merged_within_orders/', report_dir)

    # makedirs instead of mkdir: also creates missing parent directories
    # instead of raising OSError when 'merged_within_orders' is absent.
    if not os.path.exists(report_files_dir):
        os.makedirs(report_files_dir)

    for i, kplet_pool in zip([5, 4, 3, 2], [pentaplets, pentaplets, pentaplets, pentaplets]):
        print('Starting for %s' % i)
        j = 0
        profile2counts_pool = dist.get_flank_distributions(kplet_pool, neighborhood_files_path, target_profiles)

        # The target folder depends only on ``i``; resolve and create it once
        # per pool (and, as before, only when the pool has any sublist).
        cur_reports_folder = os.path.join(report_files_dir, str(i))
        if len(kplet_pool) > 0 and not os.path.exists(cur_reports_folder):
            os.makedirs(cur_reports_folder)

        for k, kplet_sublist in enumerate(kplet_pool):

            src2org, file_summaries, community, community_count, community_count_with_flanks = \
                merging.merge_into_file_summaries(kplet_sublist,
                                                  neighborhood_files_path,
                                                  file2src_src2org_map,
                                                  'archaea')
            # Sublists without any source mapping produce no report and do not
            # consume a report number.
            if not src2org:
                continue

            class2counts, class2profiles = merging.arcog_profile_count_into_class_count(community_count)
            # Only the counts of the flank-inclusive classification are reported.
            class2counts_flank, _ = merging.arcog_profile_count_into_class_count(community_count_with_flanks)

            j += 1
            r.write_to_xls({
                'xls_file_name':      os.path.join(cur_reports_folder, "%d_%d.xls" % (j, i)),
                'src2org':            src2org,
                'file_summaries':     file_summaries,
                'community':          community,
                'target_profiles':    target_profiles,
                'profile2def':        profile2def,
                'gid2arcog_cdd':      gid2arcog_cdd,
                'class2counts':       class2counts,
                'class2profiles':     class2profiles,
                'class2counts_flank': class2counts_flank,
                # The flank distributions are indexed like the pool itself.
                'profile2counts':     profile2counts_pool[k],
            })
Esempio n. 2
0
def generate_plots_from_pickle(limit_to, report_dir, target_profiles,
                               profile2def, gid2arcog_cdd,
                               neighborhood_files_path):
    """Write flank-count XLS reports for within-order merged bacterial kplets.

    Loads the pentaplet, quadruplet, triplet and duplet pickles from
    ``<gv.project_data_path>/Bacteria/pickle/<limit_to>/`` and writes one
    flank-count workbook per kplet size into
    ``<gv.project_data_path>/Bacteria/reports/merged_within_orders/<report_dir>/``.

    Args:
        limit_to: Converted with ``str()`` and used as the pickle sub-directory.
        report_dir: Name of the sub-directory that receives the reports.
        target_profiles: Forwarded to ``dist.get_flank_distributions`` and to
            the XLS writers.
        profile2def: Forwarded unchanged to the XLS writers.
        gid2arcog_cdd: Only used by the per-sublist section after
            ``sys.exit()``, which is currently unreachable.
        neighborhood_files_path: Forwarded to ``dist.get_flank_distributions``
            (and to ``merging.merge_into_file_summaries`` in the unreachable
            section).

    Raises:
        SystemExit: Always, through ``sys.exit()``, once the flank-count
            reports have been written.
    """
    pickle_dir = os.path.join(gv.project_data_path, 'Bacteria/pickle/',
                              str(limit_to))

    pentaplets = t.load_compressed_pickle(
        os.path.join(pickle_dir, 'pentaplets_merged_within.p.bz2'))
    quadruplets = t.load_compressed_pickle(
        os.path.join(pickle_dir, 'quadruplets_merged_within.p.bz2'))
    triplets = t.load_compressed_pickle(
        os.path.join(pickle_dir, 'triplets_merged_within.p.bz2'))
    duplets = t.load_compressed_pickle(
        os.path.join(pickle_dir, 'duplets_merged_within.p.bz2'))

    # NOTE(review): the message says "across orders" although the
    # "merged_within" pickles are loaded - left untouched, confirm the wording.
    print('Generationg reports for across orders merged lists')

    report_files_dir = os.path.join(gv.project_data_path,
                                    'Bacteria/reports/merged_within_orders/',
                                    report_dir)

    # Create the output directory BEFORE the first report is written. It used
    # to be created only after sys.exit(), i.e. never. makedirs also creates
    # missing parent directories.
    # NOTE(review): presumably the writer does not create it itself - confirm.
    if not os.path.exists(report_files_dir):
        os.makedirs(report_files_dir)

    flank_reports = [
        ('pentaplets_flank_counts.xls', 'Pentaplets', pentaplets),
        ('quadruplets_flank_counts.xls', 'Quadruplets', quadruplets),
        ('triplets_flank_counts.xls', 'Triplets', triplets),
        ('duplets_flank_counts.xls', 'Duplets', duplets),
    ]

    for report_name, title, kplet_list in flank_reports:
        # The first element of the returned triple is not needed here.
        _, cog2gids, gid2weight = dist.get_flank_distributions(
            kplet_list, neighborhood_files_path, target_profiles)
        universal_flank_counts = t.merge_dict_set_list(cog2gids, gid2weight)

        r.write_flanking_count_xls({
            'xls_file_name': os.path.join(report_files_dir, report_name),
            'profile2def': profile2def,
            'flank_counts': universal_flank_counts,
            'title': title,
            'target_profiles': target_profiles,
        })

    # NOTE(review): this unconditional exit terminates the interpreter, so the
    # per-sublist report section below never runs. It is kept as found because
    # callers may rely on the process stopping here - confirm and remove one
    # of the two.
    sys.exit()

    for i, kplet_pool in zip([5, 4, 3, 2],
                             [pentaplets, quadruplets, triplets, duplets]):
        print(i)
        j = 0

        # The folder depends only on ``i``: resolve and create it once per
        # pool, and only when the pool has any sublist.
        cur_reports_folder = os.path.join(report_files_dir, str(i))
        if len(kplet_pool) > 0 and not os.path.exists(cur_reports_folder):
            os.makedirs(cur_reports_folder)

        for kplet_sublist in kplet_pool:

            src2org, file_summaries, community, community_count, community_count_with_flanks\
                = merging.merge_into_file_summaries(kplet_sublist,
                                                    neighborhood_files_path,
                                                    file2src_src2org_map)
            # Sublists without any source mapping produce no report and do not
            # consume a report number.
            if not src2org:
                continue

            class2counts, class2profiles = merging.cdd_profile_count_into_class_count(
                community_count)
            class2counts_flank, _ = merging.cdd_profile_count_into_class_count(
                community_count_with_flanks)

            j += 1
            r.write_to_xls({
                'xls_file_name': os.path.join(cur_reports_folder,
                                              "%d_%d.xls" % (j, i)),
                'src2org': src2org,
                'file_summaries': file_summaries,
                'community': community,
                'target_profiles': target_profiles,
                'profile2def': profile2def,
                'gid2arcog_cdd': gid2arcog_cdd,
                'class2counts': class2counts,
                'class2profiles': class2profiles,
                'class2counts_flank': class2counts_flank,
            })
Esempio n. 3
0
    # Fragment: the enclosing definition is not visible here, so `data_path`,
    # `limit_to`, `neighborhood_files_path`, `target_profiles` and
    # `file2src_src2org_map` come from code outside this excerpt.
    # Load the pentaplets that were merged across orders.
    fname = os.path.join(data_path, str(limit_to),
                         'pentaplets_merged_across.p.bz2')
    pentaplets = t.load_compressed_pickle(fname)

    # Reports go to <project_data_path>/Archea/reports/merged_across_orders/all
    report_dir = 'all'
    report_files_dir = os.path.join(gv.project_data_path,
                                    'Archea/reports/merged_across_orders/',
                                    report_dir)

    # j counts the sublists that yielded a non-empty src2org; it numbers the
    # report file names starting at 1.
    j = 0
    for kplet_sublist in pentaplets:
        # Fixed sub-folder "5" (the pentaplet size); it is not created here.
        cur_reports_folder = os.path.join(report_files_dir, str(5))

        src2org, file_summaries, community, community_count, community_count_with_flanks = \
            merging.merge_into_file_summaries(kplet_sublist,
                                              neighborhood_files_path,
                                              file2src_src2org_map,
                                              'archaea')
        # Skip sublists for which no source mapping was produced.
        if not src2org:
            continue
        community_classes = merging.arcog_profile_count_into_class_count(
            community_count)
        community_flank_classes = merging.arcog_profile_count_into_class_count(
            community_count_with_flanks)

        xls_file_name = os.path.join(cur_reports_folder,
                                     "%d_%d.xls" % (j + 1, 5))
        print xls_file_name
        j += 1
        # NOTE(review): params is filled but never handed to a writer in the
        # visible lines - the excerpt appears to be cut off here.
        params = dict()
        params['xls_file_name'] = xls_file_name
        params['src2org'] = src2org
        # NOTE(review): `pivot`, `_kplet`, `file2organism`, `pivot_orgs` and
        # `i` are not defined anywhere in the visible code; these three lines
        # look spliced in from a different routine - confirm before relying
        # on them.
        if len(pivot.codes.intersection(_kplet.codes)) == 4:
            _orgs = set([file2organism[f] for f in _kplet.files])
            print i+1, len(set(pivot.files).intersection(set(_kplet.files))), len(_kplet.files), len(pivot_orgs), len(_orgs), len(pivot_orgs.intersection(_orgs)), _kplet.codes

    # One flank-distribution entry per pentaplet sublist; indexed with k below.
    profile2counts_pool = dist.get_flank_distributions(pentaplets, neighborhood_files_path, target_profiles)

    # Second pass over the same sublists, this time by index so the matching
    # entry of profile2counts_pool can be picked up.
    j = 0
    for k in range(len(pentaplets)):

        kplet_sublist = pentaplets[k]

        cur_reports_folder = os.path.join(report_files_dir, str(5))

        src2org, file_summaries, community, community_count, community_count_with_flanks = \
            merging.merge_into_file_summaries(kplet_sublist,
                                              neighborhood_files_path,
                                              file2src_src2org_map,
                                              'archaea')
        # Skip sublists for which no source mapping was produced.
        if not src2org:
            continue

        class2counts, class2profiles = merging.arcog_profile_count_into_class_count(community_count)
        class2counts_flank, class2profiles_flank = merging.arcog_profile_count_into_class_count(community_count_with_flanks)

        profile2counts = profile2counts_pool[k]

        xls_file_name = os.path.join(cur_reports_folder,  "%d_%d.xls" % (j+1, 5))
        print xls_file_name
        j += 1
        # NOTE(review): as above, the excerpt ends before params is completed
        # or passed to a writer.
        params = dict()
        params['xls_file_name'] = xls_file_name
        params['src2org'] = src2org
Esempio n. 5
0
    # Tally how often each target profile code occurs among the kplets of the
    # first merged sublist.
    profiles = {}
    for kplet in merged_kplets[0]:
        for code in kplet.codes:
            if code not in target_profiles:
                continue
            profiles[code] = profiles.get(code, 0) + 1

    # fname is only assigned here; it is not read in the lines that follow.
    fname = '/Users/hudaiber/Projects/NewSystems/data/Bacteria/pickle/100000/merged_kplets.p.bz2'
    dest_dir = '/Users/hudaiber/Projects/NewSystems/data/Bacteria/reports/tmp/'

    # One workbook per merged sublist, named after its 1-based position.
    for cnt, kplet_sublist in enumerate(merged_kplets):
        report_no = cnt + 1
        print(report_no)
        xls_file_name = os.path.join(dest_dir, "%d.xls" % report_no)

        src2org, file_summaries, community, community_count = \
            merging.merge_into_file_summaries(kplet_sublist,
                                              neighborhood_files_path,
                                              file2src_src2org_map)

        params = {
            'xls_file_name': xls_file_name,
            'src2org': src2org,
            'file_summaries': file_summaries,
            'community': community,
            'target_profiles': target_profiles,
            'profile2def': profile2def,
            'gid2arcog_cdd': gid2arcog_cdd,
        }
        r.write_to_xls(params)

        # Stop after the sublist at index 200 has been written.
        if cnt == 200:
            break
    # Tally how often each target profile code occurs among the kplets of the
    # first merged sublist.
    profiles = {}
    for kplet in merged_kplets[0]:
        for code in kplet.codes:
            if code not in target_profiles:
                continue
            profiles[code] = profiles.get(code, 0) + 1

    # fname is only assigned here; it is not read in the lines that follow.
    fname = '/Users/hudaiber/Projects/NewSystems/data/Bacteria/pickle/100000/merged_kplets.p.bz2'
    dest_dir = '/Users/hudaiber/Projects/NewSystems/data/Bacteria/reports/tmp/'

    # One workbook per merged sublist, named after its 1-based position.
    for cnt, kplet_sublist in enumerate(merged_kplets):
        report_no = cnt + 1
        print(report_no)
        xls_file_name = os.path.join(dest_dir, "%d.xls" % report_no)

        src2org, file_summaries, community, community_count = \
            merging.merge_into_file_summaries(kplet_sublist,
                                              neighborhood_files_path,
                                              file2src_src2org_map)

        params = {
            'xls_file_name': xls_file_name,
            'src2org': src2org,
            'file_summaries': file_summaries,
            'community': community,
            'target_profiles': target_profiles,
            'profile2def': profile2def,
            'gid2arcog_cdd': gid2arcog_cdd,
        }
        r.write_to_xls(params)

        # Stop after the sublist at index 200 has been written.
        if cnt == 200:
            break
Esempio n. 7
0
def generate_plots_from_pickle(limit_to, report_dir, target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path):
    """Write flank-count XLS reports for within-order merged bacterial kplets.

    Loads the pentaplet, quadruplet, triplet and duplet pickles from
    ``<gv.project_data_path>/Bacteria/pickle/<limit_to>/`` and writes one
    flank-count workbook per kplet size into
    ``<gv.project_data_path>/Bacteria/reports/merged_within_orders/<report_dir>/``.

    Args:
        limit_to: Converted with ``str()`` and used as the pickle sub-directory.
        report_dir: Name of the sub-directory that receives the reports.
        target_profiles: Forwarded to ``dist.get_flank_distributions`` and to
            the XLS writers.
        profile2def: Forwarded unchanged to the XLS writers.
        gid2arcog_cdd: Only used by the per-sublist section after
            ``sys.exit()``, which is currently unreachable.
        neighborhood_files_path: Forwarded to ``dist.get_flank_distributions``
            (and to ``merging.merge_into_file_summaries`` in the unreachable
            section).

    Raises:
        SystemExit: Always, through ``sys.exit()``, once the flank-count
            reports have been written.
    """
    pickle_dir = os.path.join(gv.project_data_path, 'Bacteria/pickle/', str(limit_to))

    pentaplets = t.load_compressed_pickle(os.path.join(pickle_dir, 'pentaplets_merged_within.p.bz2'))
    quadruplets = t.load_compressed_pickle(os.path.join(pickle_dir, 'quadruplets_merged_within.p.bz2'))
    triplets = t.load_compressed_pickle(os.path.join(pickle_dir, 'triplets_merged_within.p.bz2'))
    duplets = t.load_compressed_pickle(os.path.join(pickle_dir, 'duplets_merged_within.p.bz2'))

    # NOTE(review): the message says "across orders" although the
    # "merged_within" pickles are loaded - left untouched, confirm the wording.
    print('Generationg reports for across orders merged lists')

    report_files_dir = os.path.join(gv.project_data_path, 'Bacteria/reports/merged_within_orders/', report_dir)

    # Create the output directory BEFORE the first report is written. It used
    # to be created only after sys.exit(), i.e. never. makedirs also creates
    # missing parent directories.
    # NOTE(review): presumably the writer does not create it itself - confirm.
    if not os.path.exists(report_files_dir):
        os.makedirs(report_files_dir)

    flank_reports = [
        ('pentaplets_flank_counts.xls', 'Pentaplets', pentaplets),
        ('quadruplets_flank_counts.xls', 'Quadruplets', quadruplets),
        ('triplets_flank_counts.xls', 'Triplets', triplets),
        ('duplets_flank_counts.xls', 'Duplets', duplets),
    ]

    for report_name, title, kplet_list in flank_reports:
        # The first element of the returned triple is not needed here.
        _, cog2gids, gid2weight = dist.get_flank_distributions(kplet_list, neighborhood_files_path, target_profiles)
        universal_flank_counts = t.merge_dict_set_list(cog2gids, gid2weight)

        r.write_flanking_count_xls({
            'xls_file_name': os.path.join(report_files_dir, report_name),
            'profile2def': profile2def,
            'flank_counts': universal_flank_counts,
            'title': title,
            'target_profiles': target_profiles,
        })

    # NOTE(review): this unconditional exit terminates the interpreter, so the
    # per-sublist report section below never runs. It is kept as found because
    # callers may rely on the process stopping here - confirm and remove one
    # of the two.
    sys.exit()

    for i, kplet_pool in zip([5, 4, 3, 2], [pentaplets, quadruplets, triplets, duplets]):
        print(i)
        j = 0

        # The folder depends only on ``i``: resolve and create it once per
        # pool, and only when the pool has any sublist.
        cur_reports_folder = os.path.join(report_files_dir, str(i))
        if len(kplet_pool) > 0 and not os.path.exists(cur_reports_folder):
            os.makedirs(cur_reports_folder)

        for kplet_sublist in kplet_pool:

            src2org, file_summaries, community, community_count, community_count_with_flanks\
                = merging.merge_into_file_summaries(kplet_sublist,
                                                    neighborhood_files_path,
                                                    file2src_src2org_map)
            # Sublists without any source mapping produce no report and do not
            # consume a report number.
            if not src2org:
                continue

            class2counts, class2profiles = merging.cdd_profile_count_into_class_count(community_count)
            class2counts_flank, _ = merging.cdd_profile_count_into_class_count(community_count_with_flanks)

            j += 1
            r.write_to_xls({
                'xls_file_name': os.path.join(cur_reports_folder, "%d_%d.xls" % (j, i)),
                'src2org': src2org,
                'file_summaries': file_summaries,
                'community': community,
                'target_profiles': target_profiles,
                'profile2def': profile2def,
                'gid2arcog_cdd': gid2arcog_cdd,
                'class2counts': class2counts,
                'class2profiles': class2profiles,
                'class2counts_flank': class2counts_flank,
            })
        # Make sure the output folder for the raw kplet reports exists.
        if not os.path.exists(reports_dir):
            os.mkdir(reports_dir)

        # ind counts the reports written so far and numbers the file names
        # from 1; only the first 1000 kplets are considered.
        ind = 0
        for kplet in kplet_list[:1000]:

            codes = list(kplet.codes)
            # One letter per code: 's' for a target profile, 'n' otherwise.
            suffix = ['s' if code in target_profiles else 'n' for code in codes]
            # Kplets without a single target profile get no report.
            if 's' not in suffix:
                continue

            ind += 1
            xls_file_name = os.path.join(reports_dir, '%d_%s.xls' % (ind, "".join(suffix)))

            src2org, file_summaries, community, community_count, community_count_with_flanks, weight = \
                    merging.merge_into_file_summaries([kplet],
                                                      neighborhood_files_path,
                                                      file2src_src2org_map)

            params = {
                'xls_file_name': xls_file_name,
                'src2org': src2org,
                'file_summaries': file_summaries,
                'target_profiles': target_profiles,
                'profile2def': profile2def,
                'gid2arcog_cdd': gid2arcog_cdd,
            }
            r.write_to_xls_raw_kplet(params)
Esempio n. 9
0
    # Fragment: the enclosing definition is not visible here, so
    # `neighborhood_files_path`, `file2src_src2org_map`, `target_profiles`,
    # `profile2def` and `gid2arcog_cdd` come from code outside this excerpt.

    # The id -> code mapping is read from a pickle cached on disk.
    # (It was originally produced with: pickle.dump(map_id2cdd(), ...).)
    # The file is opened in a with-block so the handle is closed again; the
    # original left it open.
    with open('/Users/hudaiber/Projects/NewSystems/code/Bacteria/profile_id2code.p', 'rb') as profile_file:
        profile_id2code = cPickle.load(profile_file)

    # Alternative input:
    # '/Users/hudaiber/Projects/NewSystems/data/Bacteria/pickle/100000/pentaplets_merged_across.p.bz2'
    fname = '/Users/hudaiber/Projects/NewSystems/data/Bacteria/pickle/100000/merged_kplets.p.bz2'
    # Same here: close the compressed file once the pickle has been read.
    with bz2.BZ2File(fname, 'rb') as f:
        merged_kplets = cPickle.load(f)

    dest_dir = '/Users/hudaiber/Projects/NewSystems/data/Bacteria/reports/tmp/'

    print('starting reports')
    for cnt, kplet_sublist in enumerate(merged_kplets):
        print(cnt + 1)
        xls_file_name = os.path.join(dest_dir, "%d.xls" % (cnt + 1))

        src2org, file_summaries, community = merging.merge_into_file_summaries(kplet_sublist, neighborhood_files_path, file2src_src2org_map, 'bacteria')
        params = {
            'xls_file_name': xls_file_name,
            'src2org': src2org,
            'file_summaries': file_summaries,
            'community': community,
            'target_profiles': target_profiles,
            'profile2def': profile2def,
            'gid2arcog_cdd': gid2arcog_cdd,
        }

        r.write_to_xls(params)
        # NOTE(review): this unconditional break stops after the first report,
        # which makes the cnt == 200 cap below unreachable. Kept as found -
        # it looks like a debugging shortcut; confirm and remove one of them.
        break
        if cnt == 200:
            break