Example #1
0
def generate_plots_from_pickle(limit_to, report_dir, target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path):
    """Write per-kplet XLS reports for the pickled, within-order merged pentaplets.

    The pentaplet lists are loaded from
    ``<project_data_path>/Archea/pickle/<limit_to>/pentaplets_merged_within.p.bz2``
    and one workbook per non-empty merged list is written to
    ``<project_data_path>/Archea/reports/merged_within_orders/<report_dir>/<order>/``
    as ``<rank>_<order>.xls``.

    Args:
        limit_to: Name (after ``str()``) of the pickle sub-directory to read.
        report_dir: Name of the sub-directory that receives the reports.
        target_profiles: Passed to ``dist.get_flank_distributions`` and to the
            report writer.
        profile2def: Passed through to the report writer.
        gid2arcog_cdd: Passed through to the report writer.
        neighborhood_files_path: Passed to ``dist.get_flank_distributions`` and
            ``merging.merge_into_file_summaries``.

    The module-level ``file2src_src2org_map`` is read as well; it is not
    defined inside this function.
    """
    data_path = os.path.join(gv.project_data_path, 'Archea/pickle/')

    fname = os.path.join(data_path, str(limit_to), 'pentaplets_merged_within.p.bz2')
    pentaplets = t.load_compressed_pickle(fname)
    # NOTE(review): only the pentaplets are loaded; the pool list below reuses
    # them for orders 4, 3 and 2 as well, so those folders receive pentaplet
    # reports.  Presumably the quadruplet/triplet/duplet pickles
    # ('<name>_merged_within.p.bz2') were meant to be loaded too - confirm.

    # Parenthesised single-argument form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('Generationg reports for within orders merged lists')

    report_files_dir = os.path.join(gv.project_data_path, 'Archea/reports/merged_within_orders/', report_dir)

    # makedirs also creates missing parent directories, where mkdir would fail.
    if not os.path.exists(report_files_dir):
        os.makedirs(report_files_dir)

    for i, kplet_pool in zip([5, 4, 3, 2], [pentaplets, pentaplets, pentaplets, pentaplets]):
        print('Starting for %d' % i)
        profile2counts_pool = dist.get_flank_distributions(kplet_pool, neighborhood_files_path, target_profiles)

        # The folder only depends on the order, so prepare it once per pool.
        cur_reports_folder = os.path.join(report_files_dir, str(i))
        if not os.path.exists(cur_reports_folder):
            os.makedirs(cur_reports_folder)

        j = 0  # reports written so far for this order; drives the file rank
        for k, kplet_sublist in enumerate(kplet_pool):

            src2org, file_summaries, community, community_count, community_count_with_flanks = \
                merging.merge_into_file_summaries(kplet_sublist,
                                                  neighborhood_files_path,
                                                  file2src_src2org_map,
                                                  'archaea')
            # Lists that produced no source->organism mapping get no report
            # and do not consume a rank number.
            if not src2org:
                continue

            j += 1
            xls_file_name = os.path.join(cur_reports_folder, "%d_%d.xls" % (j, i))
            class2counts, class2profiles = merging.arcog_profile_count_into_class_count(community_count)
            class2counts_flank, _ = merging.arcog_profile_count_into_class_count(community_count_with_flanks)

            params = {
                'xls_file_name': xls_file_name,
                'src2org': src2org,
                'file_summaries': file_summaries,
                'community': community,
                'target_profiles': target_profiles,
                'profile2def': profile2def,
                'gid2arcog_cdd': gid2arcog_cdd,
                'class2counts': class2counts,
                'class2profiles': class2profiles,
                'class2counts_flank': class2counts_flank,
                # Indexed by the position of the list in the pool, not by rank.
                'profile2counts': profile2counts_pool[k],
            }
            r.write_to_xls(params)
Example #2
0
def generate_plots(limit_to, report_dir, target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path):
    """Load kplets from the database, merge them and write XLS reports.

    Two report sets are produced under ``<project_data_path>/Archea/reports/``:
    one after merging within each order (``merged_within_orders/<report_dir>``)
    and one after additionally merging across orders
    (``merged_across_orders/<report_dir>``).  Inside each, reports are written
    to ``<order>/<rank>_<order>.xls`` for orders 5, 4, 3 and 2.

    Args:
        limit_to: Forwarded as ``limit_to`` to every ``get_report_kplets`` call.
        report_dir: Name of the sub-directory that receives the reports.
        target_profiles: Passed to ``merging.merge_kplets_within_orders`` and to
            the report writer.
        profile2def: Passed through to the report writer.
        gid2arcog_cdd: Passed through to the report writer.
        neighborhood_files_path: Passed through to the report writer.

    The module-level ``file2src_src2org_map`` is read as well; it is not
    defined inside this function.
    """

    def _write_reports(merge_kind_dir, kplet_pools):
        # Write one workbook per kplet list; `kplet_pools` is ordered 5, 4, 3, 2.
        report_files_dir = os.path.join(gv.project_data_path, merge_kind_dir, report_dir)
        # makedirs also creates missing parent directories, where mkdir would fail.
        if not os.path.exists(report_files_dir):
            os.makedirs(report_files_dir)

        for i, kplet_pool in zip([5, 4, 3, 2], kplet_pools):
            cur_reports_folder = os.path.join(report_files_dir, str(i))
            if not os.path.exists(cur_reports_folder):
                os.makedirs(cur_reports_folder)
            for j, kplet_sublist in enumerate(kplet_pool):
                xls_file_name = os.path.join(cur_reports_folder, "%d_%d.xls" % (j + 1, i))
                r.write_to_xls(xls_file_name, kplet_sublist, target_profiles, profile2def, gid2arcog_cdd,
                               neighborhood_files_path, file2src_src2org_map)

    # Parenthesised single-argument form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('Reading kplets from database')
    pentaplets = p.get_report_kplets(limit_to=limit_to, load_locations=True)
    quadruplets = q.get_report_kplets(limit_to=limit_to, load_locations=True)
    triplets = tr.get_report_kplets(limit_to=limit_to, load_locations=True)
    duplets = d.get_report_kplets(limit_to=limit_to, load_locations=True)

    print('Merging within order')
    pentaplets = merging.merge_kplets_within_orders(pentaplets, target_profiles)
    quadruplets = merging.merge_kplets_within_orders(quadruplets, target_profiles)
    triplets = merging.merge_kplets_within_orders(triplets, target_profiles)
    duplets = merging.merge_kplets_within_orders(duplets, target_profiles)

    print('Generationg reports for within orders merged lists')
    _write_reports('Archea/reports/merged_within_orders/', [pentaplets, quadruplets, triplets, duplets])

    print('Merging across order')
    # Merge from the lowest order upwards; each call returns both updated lists.
    triplets, duplets = merging.merge_kplets_across_order(triplets, duplets)
    quadruplets, triplets = merging.merge_kplets_across_order(quadruplets, triplets)
    pentaplets, quadruplets = merging.merge_kplets_across_order(pentaplets, quadruplets)

    print('Generationg reports for across orders merged lists')
    _write_reports('Archea/reports/merged_across_orders/', [pentaplets, quadruplets, triplets, duplets])
Example #3
0
def generate_plots_from_pickle(limit_to, report_dir, target_profiles,
                               profile2def, gid2arcog_cdd,
                               neighborhood_files_path):
    """Write flank-count and per-kplet XLS reports from pickled archaeal kplets.

    The within-order merged pentaplets, quadruplets, triplets and duplets are
    loaded from ``<project_data_path>/Archea/pickle/<limit_to>/``.  All output
    goes to
    ``<project_data_path>/Archea/reports/merged_within_orders/<report_dir>/``:

    * one ``<name>_flank_counts.xls`` workbook per kplet order, and
    * ``<order>/<rank>_<order>.xls`` for every merged list with a non-empty
      ``src2org``, ranked by descending value of the sixth element returned
      by ``merging.merge_into_file_summaries`` (unpacked as ``weight``).

    Args:
        limit_to: Name (after ``str()``) of the pickle sub-directory to read.
        report_dir: Name of the sub-directory that receives the reports.
        target_profiles: Passed to ``dist.get_flank_distributions`` and to both
            report writers.
        profile2def: Passed through to both report writers.
        gid2arcog_cdd: Passed through to the per-kplet report writer.
        neighborhood_files_path: Passed to ``dist.get_flank_distributions`` and
            ``merging.merge_into_file_summaries``.

    The module-level ``file2src_src2org_map`` is read as well; it is not
    defined inside this function.
    """
    data_path = os.path.join(gv.project_data_path, 'Archea/pickle/')
    pickle_dir = os.path.join(data_path, str(limit_to))

    pentaplets = t.load_compressed_pickle(
        os.path.join(pickle_dir, 'pentaplets_merged_within.p.bz2'))
    quadruplets = t.load_compressed_pickle(
        os.path.join(pickle_dir, 'quadruplets_merged_within.p.bz2'))
    triplets = t.load_compressed_pickle(
        os.path.join(pickle_dir, 'triplets_merged_within.p.bz2'))
    duplets = t.load_compressed_pickle(
        os.path.join(pickle_dir, 'duplets_merged_within.p.bz2'))

    # Parenthesised single-argument form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('Generationg reports for within orders merged lists')

    flank_report_fnames = ['pentaplets_flank_counts.xls',
                           'quadruplets_flank_counts.xls',
                           'triplets_flank_counts.xls',
                           'duplets_flank_counts.xls']
    kplets = [pentaplets, quadruplets, triplets, duplets]
    titles = ['Pentaplets', 'Quadruplets', 'Triplets', 'Duplets']

    report_files_dir = os.path.join(gv.project_data_path,
                                    'Archea/reports/merged_within_orders/',
                                    report_dir)

    # The directory must exist before the flank-count workbooks are written
    # into it (it used to be created only after that loop).  makedirs also
    # creates missing parent directories.
    if not os.path.exists(report_files_dir):
        os.makedirs(report_files_dir)

    for _fname, _kplet_list, _title in zip(flank_report_fnames, kplets, titles):
        # Only the cog->gids map and the gid weights feed the report; the
        # first returned element is not used.
        _flank_counts, cog2gids, gid2weight = dist.get_flank_distributions(
            _kplet_list, neighborhood_files_path, target_profiles)
        universal_flank_counts = t.merge_dict_set_list(cog2gids, gid2weight)

        r.write_flanking_count_xls({
            'xls_file_name': os.path.join(report_files_dir, _fname),
            'profile2def': profile2def,
            'flank_counts': universal_flank_counts,
            'title': _title,
            'target_profiles': target_profiles,
        })

    for i, kplet_pool in zip([2, 3, 4, 5],
                             [duplets, triplets, quadruplets, pentaplets]):

        file_summaries_pool = [
            merging.merge_into_file_summaries(kplet_list,
                                              neighborhood_files_path,
                                              file2src_src2org_map,
                                              'archaea')
            for kplet_list in kplet_pool
        ]

        # The folder only depends on the order, so prepare it once per pool.
        cur_reports_folder = os.path.join(report_files_dir, str(i))
        if not os.path.exists(cur_reports_folder):
            os.makedirs(cur_reports_folder)

        j = 0  # reports written so far for this order; drives the file rank
        for summary_terms in sorted(file_summaries_pool,
                                    key=itemgetter(5),
                                    reverse=True):

            (src2org, file_summaries, community, community_count,
             community_count_with_flanks, _weight) = summary_terms

            # Lists that produced no source->organism mapping get no report
            # and do not consume a rank number.
            if not src2org:
                continue

            j += 1
            xls_file_name = os.path.join(cur_reports_folder,
                                         "%d_%d.xls" % (j, i))
            class2counts, class2profiles = merging.arcog_profile_count_into_class_count(
                community_count)
            class2counts_flank, _ = merging.arcog_profile_count_into_class_count(
                community_count_with_flanks)

            r.write_to_xls({
                'xls_file_name': xls_file_name,
                'src2org': src2org,
                'file_summaries': file_summaries,
                'community': community,
                'target_profiles': target_profiles,
                'profile2def': profile2def,
                'gid2arcog_cdd': gid2arcog_cdd,
                'class2counts': class2counts,
                'class2profiles': class2profiles,
                'class2counts_flank': class2counts_flank,
            })
Example #4
0
def generate_plots_from_pickle(limit_to, report_dir, target_profiles,
                               profile2def, gid2arcog_cdd,
                               neighborhood_files_path):
    """Write the bacterial flank-count XLS reports, then exit the process.

    The within-order merged pentaplets, quadruplets, triplets and duplets are
    loaded from ``<project_data_path>/Bacteria/pickle/<limit_to>/`` and one
    ``<name>_flank_counts.xls`` workbook per order is written to
    ``<project_data_path>/Bacteria/reports/merged_within_orders/<report_dir>/``.

    Args:
        limit_to: Name (after ``str()``) of the pickle sub-directory to read.
        report_dir: Name of the sub-directory that receives the reports.
        target_profiles: Passed to ``dist.get_flank_distributions`` and to the
            report writers.
        profile2def: Passed through to the report writers.
        gid2arcog_cdd: Only used by the currently unreachable per-kplet part.
        neighborhood_files_path: Passed to ``dist.get_flank_distributions``.

    Raises:
        SystemExit: Always, via ``sys.exit()`` once the flank-count reports
            have been written.
    """
    data_path = os.path.join(gv.project_data_path, 'Bacteria/pickle/')
    pickle_dir = os.path.join(data_path, str(limit_to))

    pentaplets = t.load_compressed_pickle(
        os.path.join(pickle_dir, 'pentaplets_merged_within.p.bz2'))
    quadruplets = t.load_compressed_pickle(
        os.path.join(pickle_dir, 'quadruplets_merged_within.p.bz2'))
    triplets = t.load_compressed_pickle(
        os.path.join(pickle_dir, 'triplets_merged_within.p.bz2'))
    duplets = t.load_compressed_pickle(
        os.path.join(pickle_dir, 'duplets_merged_within.p.bz2'))

    # The message used to say "across orders", but both the pickles read and
    # the directory written are the within-order ones.
    print('Generationg reports for within orders merged lists')

    flank_report_fnames = ['pentaplets_flank_counts.xls',
                           'quadruplets_flank_counts.xls',
                           'triplets_flank_counts.xls',
                           'duplets_flank_counts.xls']
    kplets = [pentaplets, quadruplets, triplets, duplets]
    titles = ['Pentaplets', 'Quadruplets', 'Triplets', 'Duplets']

    report_files_dir = os.path.join(gv.project_data_path,
                                    'Bacteria/reports/merged_within_orders/',
                                    report_dir)

    # The directory must exist before the flank-count workbooks are written
    # into it; the creation used to sit after sys.exit() and never ran.
    # makedirs also creates missing parent directories.
    if not os.path.exists(report_files_dir):
        os.makedirs(report_files_dir)

    for _fname, _kplet_list, _title in zip(flank_report_fnames, kplets, titles):
        # Only the cog->gids map and the gid weights feed the report; the
        # first returned element is not used.
        _flank_counts, cog2gids, gid2weight = dist.get_flank_distributions(
            _kplet_list, neighborhood_files_path, target_profiles)
        universal_flank_counts = t.merge_dict_set_list(cog2gids, gid2weight)

        r.write_flanking_count_xls({
            'xls_file_name': os.path.join(report_files_dir, _fname),
            'profile2def': profile2def,
            'flank_counts': universal_flank_counts,
            'title': _title,
            'target_profiles': target_profiles,
        })

    # NOTE(review): this unconditional exit makes everything below
    # unreachable.  It looks like a temporary switch to run only the
    # flank-count reports - confirm before removing it.
    sys.exit()

    for i, kplet_pool in zip([5, 4, 3, 2],
                             [pentaplets, quadruplets, triplets, duplets]):
        print(i)

        cur_reports_folder = os.path.join(report_files_dir, str(i))
        if not os.path.exists(cur_reports_folder):
            os.makedirs(cur_reports_folder)

        j = 0  # reports written so far for this order; drives the file rank
        for kplet_sublist in kplet_pool:

            src2org, file_summaries, community, community_count, community_count_with_flanks\
                = merging.merge_into_file_summaries(kplet_sublist,
                                                    neighborhood_files_path,
                                                    file2src_src2org_map)
            # Lists that produced no source->organism mapping get no report.
            if not src2org:
                continue

            j += 1
            xls_file_name = os.path.join(cur_reports_folder,
                                         "%d_%d.xls" % (j, i))
            class2counts, class2profiles = merging.cdd_profile_count_into_class_count(
                community_count)
            class2counts_flank, _ = merging.cdd_profile_count_into_class_count(
                community_count_with_flanks)

            r.write_to_xls({
                'xls_file_name': xls_file_name,
                'src2org': src2org,
                'file_summaries': file_summaries,
                'community': community,
                'target_profiles': target_profiles,
                'profile2def': profile2def,
                'gid2arcog_cdd': gid2arcog_cdd,
                'class2counts': class2counts,
                'class2profiles': class2profiles,
                'class2counts_flank': class2counts_flank,
            })
Example #5
0
    # Write an XLS report for the pentaplet lists.  Because of the
    # unconditional `break` further down, the loop stops after the first list
    # whose `src2org` is non-empty, so at most one report is written.
    # NOTE(review): `pentaplets`, `report_files_dir`, `j`, `target_profiles`,
    # `profile2def`, `gid2arcog_cdd`, `neighborhood_files_path` and
    # `file2src_src2org_map` come from an enclosing scope that is not visible
    # in this fragment.
    for kplet_sublist in pentaplets:
        cur_reports_folder = os.path.join(report_files_dir, str(5))

        src2org, file_summaries, community, community_count, community_count_with_flanks = \
            merging.merge_into_file_summaries(kplet_sublist,
                                              neighborhood_files_path,
                                              file2src_src2org_map,
                                              'archaea')
        # Lists without a source->organism mapping are skipped.
        if not src2org:
            continue
        # The whole return value of the class-count helper is stored here; the
        # second half of this loop body unpacks the same call into two values.
        community_classes = merging.arcog_profile_count_into_class_count(
            community_count)
        community_flank_classes = merging.arcog_profile_count_into_class_count(
            community_count_with_flanks)

        # File name is "<rank>_5.xls", with rank = number of reports so far + 1.
        xls_file_name = os.path.join(cur_reports_folder,
                                     "%d_%d.xls" % (j + 1, 5))
        print xls_file_name
        j += 1
        params = dict()
        params['xls_file_name'] = xls_file_name
        params['src2org'] = src2org
        params['file_summaries'] = file_summaries
        params['community'] = community
        params['target_profiles'] = target_profiles
        params['profile2def'] = profile2def
        params['gid2arcog_cdd'] = gid2arcog_cdd
        params['class_counts'] = community_classes
        params['class_flank_counts'] = community_flank_classes
        r.write_to_xls(params)
        break
        # NOTE(review): everything from here to the end of the loop body is
        # unreachable - it follows the unconditional `break` above.  It looks
        # like a second variant of the same report code (different `params`
        # keys, plus `profile2counts_pool[k]`, neither of which is defined in
        # this fragment) - confirm which variant is intended and drop the other.
        src2org, file_summaries, community, community_count, community_count_with_flanks = \
            merging.merge_into_file_summaries(kplet_sublist,
                                              neighborhood_files_path,
                                              file2src_src2org_map,
                                              'archaea')
        if not src2org:
            continue

        class2counts, class2profiles = merging.arcog_profile_count_into_class_count(community_count)
        class2counts_flank, class2profiles_flank = merging.arcog_profile_count_into_class_count(community_count_with_flanks)

        profile2counts = profile2counts_pool[k]

        xls_file_name = os.path.join(cur_reports_folder,  "%d_%d.xls" % (j+1, 5))
        print xls_file_name
        j += 1
        params = dict()
        params['xls_file_name'] = xls_file_name
        params['src2org'] = src2org
        params['file_summaries'] = file_summaries
        params['community'] = community
        params['target_profiles'] = target_profiles
        params['profile2def'] = profile2def
        params['gid2arcog_cdd'] = gid2arcog_cdd
        params['class2counts'] = class2counts
        params['class2profiles'] = class2profiles
        params['class2counts_flank'] = class2counts_flank
        params['profile2counts'] = profile2counts
        r.write_to_xls(params)
        break
Example #7
0
def generate_plots_from_pickle(limit_to, report_dir, target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path):
    """Write the bacterial flank-count XLS reports, then exit the process.

    The within-order merged pentaplets, quadruplets, triplets and duplets are
    loaded from ``<project_data_path>/Bacteria/pickle/<limit_to>/`` and one
    ``<name>_flank_counts.xls`` workbook per order is written to
    ``<project_data_path>/Bacteria/reports/merged_within_orders/<report_dir>/``.

    Args:
        limit_to: Name (after ``str()``) of the pickle sub-directory to read.
        report_dir: Name of the sub-directory that receives the reports.
        target_profiles: Passed to ``dist.get_flank_distributions`` and to the
            report writers.
        profile2def: Passed through to the report writers.
        gid2arcog_cdd: Only used by the currently unreachable per-kplet part.
        neighborhood_files_path: Passed to ``dist.get_flank_distributions``.

    Raises:
        SystemExit: Always, via ``sys.exit()`` once the flank-count reports
            have been written.
    """
    data_path = os.path.join(gv.project_data_path, 'Bacteria/pickle/')
    pickle_dir = os.path.join(data_path, str(limit_to))

    pentaplets = t.load_compressed_pickle(os.path.join(pickle_dir, 'pentaplets_merged_within.p.bz2'))
    quadruplets = t.load_compressed_pickle(os.path.join(pickle_dir, 'quadruplets_merged_within.p.bz2'))
    triplets = t.load_compressed_pickle(os.path.join(pickle_dir, 'triplets_merged_within.p.bz2'))
    duplets = t.load_compressed_pickle(os.path.join(pickle_dir, 'duplets_merged_within.p.bz2'))

    # The message used to say "across orders", but both the pickles read and
    # the directory written are the within-order ones.
    print('Generationg reports for within orders merged lists')

    flank_report_fnames = ['pentaplets_flank_counts.xls', 'quadruplets_flank_counts.xls',
                           'triplets_flank_counts.xls', 'duplets_flank_counts.xls']
    kplets = [pentaplets, quadruplets, triplets, duplets]
    titles = ['Pentaplets', 'Quadruplets', 'Triplets', 'Duplets']

    report_files_dir = os.path.join(gv.project_data_path, 'Bacteria/reports/merged_within_orders/', report_dir)

    # The directory must exist before the flank-count workbooks are written
    # into it; the creation used to sit after sys.exit() and never ran.
    # makedirs also creates missing parent directories.
    if not os.path.exists(report_files_dir):
        os.makedirs(report_files_dir)

    for _fname, _kplet_list, _title in zip(flank_report_fnames, kplets, titles):
        # Only the cog->gids map and the gid weights feed the report; the
        # first returned element is not used.
        _flank_counts, cog2gids, gid2weight = dist.get_flank_distributions(_kplet_list, neighborhood_files_path, target_profiles)
        universal_flank_counts = t.merge_dict_set_list(cog2gids, gid2weight)

        r.write_flanking_count_xls({
            'xls_file_name': os.path.join(report_files_dir, _fname),
            'profile2def': profile2def,
            'flank_counts': universal_flank_counts,
            'title': _title,
            'target_profiles': target_profiles,
        })

    # NOTE(review): this unconditional exit makes everything below
    # unreachable.  It looks like a temporary switch to run only the
    # flank-count reports - confirm before removing it.
    sys.exit()

    for i, kplet_pool in zip([5, 4, 3, 2], [pentaplets, quadruplets, triplets, duplets]):
        print(i)

        cur_reports_folder = os.path.join(report_files_dir, str(i))
        if not os.path.exists(cur_reports_folder):
            os.makedirs(cur_reports_folder)

        j = 0  # reports written so far for this order; drives the file rank
        for kplet_sublist in kplet_pool:

            src2org, file_summaries, community, community_count, community_count_with_flanks\
                = merging.merge_into_file_summaries(kplet_sublist,
                                                    neighborhood_files_path,
                                                    file2src_src2org_map)
            # Lists that produced no source->organism mapping get no report.
            if not src2org:
                continue

            j += 1
            xls_file_name = os.path.join(cur_reports_folder, "%d_%d.xls" % (j, i))
            class2counts, class2profiles = merging.cdd_profile_count_into_class_count(community_count)
            class2counts_flank, _ = merging.cdd_profile_count_into_class_count(community_count_with_flanks)

            r.write_to_xls({
                'xls_file_name': xls_file_name,
                'src2org': src2org,
                'file_summaries': file_summaries,
                'community': community,
                'target_profiles': target_profiles,
                'profile2def': profile2def,
                'gid2arcog_cdd': gid2arcog_cdd,
                'class2counts': class2counts,
                'class2profiles': class2profiles,
                'class2counts_flank': class2counts_flank,
            })
Example #8
0
def generate_plots_from_pickle(limit_to, report_dir, target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path):
    """Write flank-count and per-kplet XLS reports from pickled archaeal kplets.

    The within-order merged pentaplets, quadruplets, triplets and duplets are
    loaded from ``<project_data_path>/Archea/pickle/<limit_to>/``.  All output
    goes to
    ``<project_data_path>/Archea/reports/merged_within_orders/<report_dir>/``:

    * one ``<name>_flank_counts.xls`` workbook per kplet order, and
    * ``<order>/<rank>_<order>.xls`` for every merged list with a non-empty
      ``src2org``, ranked by descending value of the sixth element returned
      by ``merging.merge_into_file_summaries`` (unpacked as ``weight``).

    Args:
        limit_to: Name (after ``str()``) of the pickle sub-directory to read.
        report_dir: Name of the sub-directory that receives the reports.
        target_profiles: Passed to ``dist.get_flank_distributions`` and to both
            report writers.
        profile2def: Passed through to both report writers.
        gid2arcog_cdd: Passed through to the per-kplet report writer.
        neighborhood_files_path: Passed to ``dist.get_flank_distributions`` and
            ``merging.merge_into_file_summaries``.

    The module-level ``file2src_src2org_map`` is read as well; it is not
    defined inside this function.
    """
    data_path = os.path.join(gv.project_data_path, 'Archea/pickle/')
    pickle_dir = os.path.join(data_path, str(limit_to))

    pentaplets = t.load_compressed_pickle(os.path.join(pickle_dir, 'pentaplets_merged_within.p.bz2'))
    quadruplets = t.load_compressed_pickle(os.path.join(pickle_dir, 'quadruplets_merged_within.p.bz2'))
    triplets = t.load_compressed_pickle(os.path.join(pickle_dir, 'triplets_merged_within.p.bz2'))
    duplets = t.load_compressed_pickle(os.path.join(pickle_dir, 'duplets_merged_within.p.bz2'))

    # Parenthesised single-argument form prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('Generationg reports for within orders merged lists')

    flank_report_fnames = ['pentaplets_flank_counts.xls', 'quadruplets_flank_counts.xls',
                           'triplets_flank_counts.xls', 'duplets_flank_counts.xls']
    kplets = [pentaplets, quadruplets, triplets, duplets]
    titles = ['Pentaplets', 'Quadruplets', 'Triplets', 'Duplets']

    report_files_dir = os.path.join(gv.project_data_path, 'Archea/reports/merged_within_orders/', report_dir)

    # The directory must exist before the flank-count workbooks are written
    # into it (it used to be created only after that loop).  makedirs also
    # creates missing parent directories.
    if not os.path.exists(report_files_dir):
        os.makedirs(report_files_dir)

    for _fname, _kplet_list, _title in zip(flank_report_fnames, kplets, titles):
        # Only the cog->gids map and the gid weights feed the report; the
        # first returned element is not used.
        _flank_counts, cog2gids, gid2weight = dist.get_flank_distributions(_kplet_list, neighborhood_files_path, target_profiles)
        universal_flank_counts = t.merge_dict_set_list(cog2gids, gid2weight)

        r.write_flanking_count_xls({
            'xls_file_name': os.path.join(report_files_dir, _fname),
            'profile2def': profile2def,
            'flank_counts': universal_flank_counts,
            'title': _title,
            'target_profiles': target_profiles,
        })

    for i, kplet_pool in zip([2, 3, 4, 5], [duplets, triplets, quadruplets, pentaplets]):

        file_summaries_pool = [
            merging.merge_into_file_summaries(kplet_list, neighborhood_files_path, file2src_src2org_map, 'archaea')
            for kplet_list in kplet_pool
        ]

        # The folder only depends on the order, so prepare it once per pool.
        cur_reports_folder = os.path.join(report_files_dir, str(i))
        if not os.path.exists(cur_reports_folder):
            os.makedirs(cur_reports_folder)

        j = 0  # reports written so far for this order; drives the file rank
        for summary_terms in sorted(file_summaries_pool, key=itemgetter(5), reverse=True):

            src2org, file_summaries, community, community_count, community_count_with_flanks, _weight = summary_terms

            # Lists that produced no source->organism mapping get no report
            # and do not consume a rank number.
            if not src2org:
                continue

            j += 1
            xls_file_name = os.path.join(cur_reports_folder, "%d_%d.xls" % (j, i))
            class2counts, class2profiles = merging.arcog_profile_count_into_class_count(community_count)
            class2counts_flank, _ = merging.arcog_profile_count_into_class_count(community_count_with_flanks)

            r.write_to_xls({
                'xls_file_name': xls_file_name,
                'src2org': src2org,
                'file_summaries': file_summaries,
                'community': community,
                'target_profiles': target_profiles,
                'profile2def': profile2def,
                'gid2arcog_cdd': gid2arcog_cdd,
                'class2counts': class2counts,
                'class2profiles': class2profiles,
                'class2counts_flank': class2counts_flank,
            })