Example #1
def merging_pipeline_for_order(order, load_from_db=False):
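    # Load the k-plets of the requested order (from the DB or from a compressed
    # pickle under data_path), run a basic within-order merge and dump it, then
    # refine the result with an iterative within-order merge and dump that too.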
    limit_to = 1000000000
    print "starting for ", order
    if load_from_db:
        print "Loading kplets from DB"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
            #kplets = t.load_compressed_pickle(os.path.join(data_path, kplet_file))
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
            #kplets = t.load_compressed_pickle(os.path.join(data_path, kplet_file))
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
            kplets = q.get_report_kplets(profile_id2code, limit_to=limit_to)
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'
            kplets = p.get_report_kplets(profile_id2code, limit_to=limit_to)
    else:
        print "Loading kplets from pickle file"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'
        kplet_file_full = os.path.join(data_path, kplet_file)
        print kplet_file_full
        kplets = t.load_compressed_pickle(kplet_file_full)

    # print "Starting for", kplet_file
    # print "Loading kplets"
    # kplets = t.load_compressed_pickle(os.path.join(data_path, kplet_file))
    print "No of kplets:", len(kplets)

    #print "Loading file2genes"

    #_file2genes = {}
    #for _f in os.listdir(neighborhood_files_path):
    #    _file2genes[_f] = dt.get_wgs_file(os.path.join(neighborhood_files_path, _f))

    # print 'Filtering'
    # kplets = filter_seed(kplets, _file2genes)
    # print "No of kplets:", len(kplets)
    # fname = os.path.join(data_path,  kplet_file.split('.')[0]+'_seed.p.bz2')
    # print 'Dumping', fname
    # t.dump_compressed_pickle(fname, kplets)

    print "Basic merging"
    merged_lists = merging.basic_merge_within_orders(kplets)
    fname = os.path.join(data_path, "basic_merged_" + kplet_file)
    print "Dumping basic merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)

    print "Iterative merging"
    merged_lists = merging.merge_kplets_within_orders_iterative(merged_lists)
    fname = os.path.join(data_path, "iterative_merged_" + kplet_file)
    print "Dumping Iterative merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)
Example #2
def generate_plots_from_pickle(limit_to, report_dir, target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path):
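    # Load the within-order merged pentaplets for this limit_to (the other
    # orders are commented out, so the same pentaplet pool is reused for every
    # order in the loop) and write one XLS report per merged k-plet community.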

    data_path = os.path.join(gv.project_data_path, 'Archea/pickle/')

    fname = os.path.join(data_path, str(limit_to), 'pentaplets_merged_within.p.bz2')
    pentaplets = t.load_compressed_pickle(fname)
    # fname = os.path.join(data_path, str(limit_to), 'quadruplets_merged_within.p.bz2')
    # quadruplets = t.load_compressed_pickle(fname)
    # fname = os.path.join(data_path, str(limit_to), 'triplets_merged_within.p.bz2')
    # triplets = t.load_compressed_pickle(fname)
    # fname = os.path.join(data_path, str(limit_to), 'duplets_merged_within.p.bz2')
    # duplets = t.load_compressed_pickle(fname)

    print 'Generating reports for within orders merged lists'

    report_files_dir = os.path.join(gv.project_data_path, 'Archea/reports/merged_within_orders/', report_dir)

    if not os.path.exists(report_files_dir):
        os.mkdir(report_files_dir)

    for i, kplet_pool in zip([5, 4, 3, 2], [pentaplets, pentaplets, pentaplets, pentaplets]):
        print 'Starting for', i
        j = 0
        profile2counts_pool = dist.get_flank_distributions(kplet_pool, neighborhood_files_path, target_profiles)

        for k in range(len(kplet_pool)):

            kplet_sublist = kplet_pool[k]
            cur_reports_folder = os.path.join(report_files_dir, str(i))
            if not os.path.exists(cur_reports_folder):
                os.mkdir(cur_reports_folder)

            src2org, file_summaries, community, community_count, community_count_with_flanks = \
                merging.merge_into_file_summaries(kplet_sublist,
                                                  neighborhood_files_path,
                                                  file2src_src2org_map,
                                                  'archaea')
            if not src2org:
                continue

            xls_file_name = os.path.join(cur_reports_folder,  "%d_%d.xls" % (j+1, i))
            class2counts, class2profiles = merging.arcog_profile_count_into_class_count(community_count)
            class2counts_flank, class2profiles_flank = merging.arcog_profile_count_into_class_count(community_count_with_flanks)

            profile2counts = profile2counts_pool[k]

            j += 1
            params = dict()
            params['xls_file_name']      = xls_file_name
            params['src2org']            = src2org
            params['file_summaries']     = file_summaries
            params['community']          = community
            params['target_profiles']    = target_profiles
            params['profile2def']        = profile2def
            params['gid2arcog_cdd']      = gid2arcog_cdd
            params['class2counts']       = class2counts
            params['class2profiles']     = class2profiles
            params['class2counts_flank'] = class2counts_flank
            params['profile2counts']     = profile2counts
            r.write_to_xls(params)
Example #3
def process_reporting_single_order(order, threshold):
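    # Map the order to its k-plet prefix, load the pickled k-plets, rebuild the
    # merged lists from the per-threshold merged-IDs text file, and generate
    # reports under the threshold- and order-specific directory.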

    if order == 2:
        prefix = 'duplets'
    elif order == 3:
        prefix = 'triplets'
    elif order == 4:
        prefix = 'quadruplets'
    elif order == 5:
        prefix = 'pentaplets'

    pickle_file = os.path.join(data_path, "%s.p.bz2" % prefix)
    print "Loading kplets:", pickle_file

    kplets = t.load_compressed_pickle(pickle_file)
    kplets = {kplet.id: kplet for kplet in kplets}

    merged_data_file = os.path.join(
        data_path, "%s.merged_%s.txt" % (prefix, threshold.split('.')[1]))
    print "Loading merged file:", merged_data_file
    merged_lists = [[kplets[int(id)] for id in cur_list.split(',')[0].split()]
                    for cur_list in open(merged_data_file).readlines()
                    if cur_list.split(',')[0]]

    reports_dir = os.path.join(gv.project_data_path,
                               'cas1402/reports/thr_%s/' % threshold)
    if not os.path.exists(reports_dir):
        os.mkdir(reports_dir)
    reports_dir = os.path.join(reports_dir, str(order))
    print "Generating reports to:", reports_dir
    generate_reports(merged_lists, reports_dir, neighborhood_files_path)
Example #4
def process_reporting_single_order(order, threshold):

    if order == 2:
        prefix = 'duplets'
    elif order == 3:
        prefix = 'triplets'
    elif order == 4:
        prefix = 'quadruplets'
    elif order == 5:
        prefix = 'pentaplets'

    pickle_file = os.path.join(data_path, "%s.p.bz2"%prefix)
    print "Loading kplets:", pickle_file

    kplets = t.load_compressed_pickle(pickle_file)
    kplets = {kplet.id:kplet for kplet in kplets}

    merged_data_file = os.path.join(data_path, "%s.merged_%s.txt"%(prefix, threshold.split('.')[1]))
    print "Loading merged file:", merged_data_file
    merged_lists = [[kplets[int(id)] for id in cur_list.split(',')[0].split()] for cur_list in open(merged_data_file).readlines() if cur_list.split(',')[0]]

    reports_dir = os.path.join(gv.project_data_path,'cas1402/reports/thr_%s/' % threshold)
    if not os.path.exists(reports_dir):
        os.mkdir(reports_dir)
    reports_dir = os.path.join(reports_dir, str(order))
    print "Generating reports to:", reports_dir
    generate_reports(merged_lists, reports_dir, neighborhood_files_path)
Example #5
def prok1603_architecture_frequencies():
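    # Build profile -> gene name / definition maps (defense profiles override
    # the CDD maps), group the prok1603 loci by the profiles they contain, and
    # print per-profile counts and summed locus weights, sorted by locus count.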

    work_dir = os.path.join(gv.project_data_path, 'UvrD/')

    map_file = os.path.join(work_dir, 'prok1603/prok1603_weights.txt')

    locus2weight = {l.split()[0]:float(l.split()[1]) for l in open(map_file)}

    def_file = os.path.join(gv.project_data_path, 'cas4/profiles/defenseProfiles.tab')
    profile2gene={}
    profile2def = {}

    for l in open(def_file):
        terms = l.strip().split('\t')
        profile = terms[0]
        gene_names = terms[3].split(',')
        if len(gene_names)>1:
            profile2gene[profile] = gene_names[1]
        else:
            profile2gene[profile] = gene_names[0]

        profile2def[profile] = terms[4]

    cdd_profile2gene = t.map_cdd_profile2gene_name()
    cdd_profile2gene.update(profile2gene)

    cdd_profile2def = t.map_cdd_profile2def()
    cdd_profile2def.update(profile2def)

    prok1603_loci_file = os.path.join(work_dir, 'prok1603/prok1603_loci.p.bz2')
    loci = t.load_compressed_pickle(prok1603_loci_file)

    profile2loci = {}

    for locus in loci:
        for _profile in locus.profiles:
            if _profile in profile2loci:
                profile2loci[_profile].append(locus)
            else:
                profile2loci[_profile] = [locus]

    for (profile, loci) in sorted(profile2loci.items(), key=lambda x: len(x[1]), reverse=True):
        _weight = sum([locus2weight[locus.base_file_name] for locus in loci])
        print "%s\t%s\t%d\t%f\t%s" % (profile,
                                      cdd_profile2gene[profile] if profile in cdd_profile2gene else "",
                                      len(loci),
                                      _weight,
                                      cdd_profile2def[profile] if profile in cdd_profile2def else "")
Example #6
def process_reporting_single_order_old(order):
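    # Load the iteratively merged k-plet lists for the given order and write
    # the corresponding reports under reports_path/merged/<order>.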

    if order == 2:
        data_file = 'iterative_merged_duplets.p.bz2'
    elif order == 3:
        data_file = 'iterative_merged_triplets.p.bz2'
    elif order == 4:
        data_file = 'iterative_merged_quadruplets.p.bz2'
    elif order == 5:
        data_file = 'iterative_merged_pentaplets.p.bz2'

    data_file = os.path.join(data_path, data_file)
    print "Loading file", data_file
    merged_lists = t.load_compressed_pickle(data_file)
    reports_dir = os.path.join(reports_path,'merged',str(order))
    print "Generating reports to:", reports_dir

    generate_reports(merged_lists, reports_dir, neighborhood_files_path)
Example #7
def process_reporting_single_order_old(order):

    if order == 2:
        data_file = 'iterative_merged_duplets.p.bz2'
    elif order == 3:
        data_file = 'iterative_merged_triplets.p.bz2'
    elif order == 4:
        data_file = 'iterative_merged_quadruplets.p.bz2'
    elif order == 5:
        data_file = 'iterative_merged_pentaplets.p.bz2'

    data_file = os.path.join(data_path, data_file)
    print "Loading file", data_file
    merged_lists = t.load_compressed_pickle(data_file)
    reports_dir = os.path.join(reports_path, 'merged', str(order))
    print "Generating reports to:", reports_dir

    generate_reports(merged_lists, reports_dir, neighborhood_files_path)
Example #8
def process_reporting_single_order(order):
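    # Load the iteratively merged k-plet lists for this order from the CRISPR
    # pickle directory of dataset_code and generate reports into the matching
    # CRISPR reports directory, using that dataset's wgs neighborhood files.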

    if order == 2:
        data_file = 'iterative_merged_duplets.p.bz2'
    elif order == 3:
        data_file = 'iterative_merged_triplets.p.bz2'
    elif order == 4:
        data_file = 'iterative_merged_quadruplets.p.bz2'
    elif order == 5:
        data_file = 'iterative_merged_pentaplets.p.bz2'

    data_file = os.path.join(gv.project_data_path, 'CRISPR/pickle/',dataset_code,data_file)
    print "Loading file", data_file
    merged_lists = t.load_compressed_pickle(data_file)
    reports_dir = os.path.join(gv.project_data_path,'CRISPR/reports',dataset_code,'merged',str(order))
    print "Generating reports to:", reports_dir
    neighborhood_files_path = os.path.join(gv.project_data_path,'CRISPR/datasets/',dataset_code,'wgs')
    
    generate_reports(merged_lists, reports_dir, neighborhood_files_path)
Example #9
def process_reporting_single_order(order):

    if order == 2:
        data_file = 'iterative_merged_duplets.p.bz2'
    elif order == 3:
        data_file = 'iterative_merged_triplets.p.bz2'
    elif order == 4:
        data_file = 'iterative_merged_quadruplets.p.bz2'
    elif order == 5:
        data_file = 'iterative_merged_pentaplets.p.bz2'

    data_file = os.path.join(gv.project_data_path, 'CRISPR/pickle/',
                             dataset_code, data_file)
    print "Loading file", data_file
    merged_lists = t.load_compressed_pickle(data_file)
    reports_dir = os.path.join(gv.project_data_path, 'CRISPR/reports',
                               dataset_code, 'merged', str(order))
    print "Generating reports to:", reports_dir
    neighborhood_files_path = os.path.join(gv.project_data_path,
                                           'CRISPR/datasets/', dataset_code,
                                           'wgs')

    generate_reports(merged_lists, reports_dir, neighborhood_files_path)
Example #10
def generate_pickles(save_path, limit_to):
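    # Load the raw k-plet pickles, run the basic within-order merge and dump
    # the results; the sys.exit() calls below stop the run there, so the
    # iterative and across-order merging stages are only reached once those
    # guards are removed.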

    if not os.path.exists(save_path):
        os.mkdir(save_path)

    # print "Loading kplets from DB"
    # pentaplets  =  p.get_report_kplets(limit_to=limit_to, load_locations=True)
    # quadruplets =  q.get_report_kplets(limit_to=limit_to, load_locations=True)
    # triplets    = tr.get_report_kplets(limit_to=limit_to, load_locations=True)
    # duplets     =  d.get_report_kplets(limit_to=limit_to, load_locations=True)

    # print "Dumping raw kplet data to files"
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'duplets_raw.p.bz2'), 'w')
    # pickle.dump(duplets, dump_file)
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'triplets_raw.p.bz2'), 'w')
    # pickle.dump(triplets, dump_file)
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'quadruplets_raw.p.bz2'), 'w')
    # pickle.dump(quadruplets, dump_file)
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'pentaplets_raw.p.bz2'), 'w')
    # pickle.dump(pentaplets, dump_file)

    print "Loading raw kplets from pickles"
    dump_file = os.path.join(save_path, 'duplets_raw.p.bz2')
    duplets = t.load_compressed_pickle(dump_file)
    dump_file = os.path.join(save_path, 'triplets_raw.p.bz2')
    triplets = t.load_compressed_pickle(dump_file)
    dump_file = os.path.join(save_path, 'quadruplets_raw.p.bz2')
    quadruplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'pentaplets_raw.p.bz2')
    # pentaplets = t.load_compressed_pickle(dump_file)

    print "Basic within merging"
    # pentaplets = merging.basic_merge_within_orders(pentaplets)
    quadruplets = merging.basic_merge_within_orders(quadruplets)
    triplets = merging.basic_merge_within_orders(triplets)
    duplets = merging.basic_merge_within_orders(duplets)

    print "Dumping basic merges"
    # dump_file = os.path.join(save_path, 'pentaplets_basic_merged.p.bz2')
    # t.dump_compressed_pickle(dump_file, pentaplets)
    dump_file = os.path.join(save_path, 'quadruplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, quadruplets)
    dump_file = os.path.join(save_path, 'triplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, triplets)
    dump_file = os.path.join(save_path, 'duplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, duplets)

    # print "Loading basic merges"
    # dump_file = os.path.join(save_path, 'pentaplets_basic_merged.p.bz2')
    # pentaplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'quadruplets_basic_merged.p.bz2')
    # quadruplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'triplets_basic_merged.p.bz2')
    # triplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'duplets_basic_merged.p.bz2')
    # duplets = t.load_compressed_pickle(dump_file)
    sys.exit()

    print 'Starting iterative within mergings'
    pentaplets = merging.merge_kplets_within_orders_iterative_2(pentaplets)
    quadruplets = merging.merge_kplets_within_orders_iterative_2(quadruplets)
    triplets = merging.merge_kplets_within_orders_iterative_2(triplets)
    duplets = merging.merge_kplets_within_orders_iterative_2(duplets)

    print "Dumping iterative merges"
    dump_file = os.path.join(save_path, 'pentaplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, pentaplets)
    dump_file = os.path.join(save_path, 'quadruplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, quadruplets)
    dump_file = os.path.join(save_path, 'triplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, triplets)
    dump_file = os.path.join(save_path, 'duplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, duplets)

    sys.exit()
    print 'Dumping merged kplet lists to files'
    dump_file = bz2.BZ2File(
        os.path.join(save_path, 'pentaplets_merged_within.p.bz2'), 'w')
    pickle.dump(pentaplets, dump_file)
    dump_file = bz2.BZ2File(
        os.path.join(save_path, 'quadruplets_merged_within.p.bz2'), 'w')
    pickle.dump(quadruplets, dump_file)
    dump_file = bz2.BZ2File(
        os.path.join(save_path, 'triplets_merged_within.p.bz2'), 'w')
    pickle.dump(triplets, dump_file)
    dump_file = bz2.BZ2File(
        os.path.join(save_path, 'duplets_merged_within.p.bz2'), 'w')
    pickle.dump(duplets, dump_file)

    print 'Starting across mergings'
    triplets, duplets = merging.merge_kplets_across_orders(triplets, duplets)
    quadruplets, triplets = merging.merge_kplets_across_orders(
        quadruplets, triplets)
    pentaplets, quadruplets = merging.merge_kplets_across_orders(
        pentaplets, quadruplets)

    print 'Dumping to files'
    dump_file = bz2.BZ2File(
        os.path.join(save_path, 'pentaplets_merged_across.p.bz2'), 'w')
    pickle.dump(pentaplets, dump_file)
    dump_file = bz2.BZ2File(
        os.path.join(save_path, 'quadruplets_merged_across.p.bz2'), 'w')
    pickle.dump(quadruplets, dump_file)
    dump_file = bz2.BZ2File(
        os.path.join(save_path, 'triplets_merged_across.p.bz2'), 'w')
    pickle.dump(triplets, dump_file)
    dump_file = bz2.BZ2File(
        os.path.join(save_path, 'duplets_merged_across.p.bz2'), 'w')
    pickle.dump(duplets, dump_file)

    print 'Done for limit_to:', limit_to
    print
    print
Example #11
def generate_plots_from_pickle(limit_to, report_dir, target_profiles,
                               profile2def, gid2arcog_cdd,
                               neighborhood_files_path):
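    # Load the within-order merged k-plets for all four orders, write one
    # flank-count XLS per order, then (the sys.exit() below skips the rest)
    # write per-community XLS reports for each merged k-plet sublist.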

    data_path = os.path.join(gv.project_data_path, 'Bacteria/pickle/')

    fname = os.path.join(data_path, str(limit_to),
                         'pentaplets_merged_within.p.bz2')
    pentaplets = t.load_compressed_pickle(fname)
    fname = os.path.join(data_path, str(limit_to),
                         'quadruplets_merged_within.p.bz2')
    quadruplets = t.load_compressed_pickle(fname)
    fname = os.path.join(data_path, str(limit_to),
                         'triplets_merged_within.p.bz2')
    triplets = t.load_compressed_pickle(fname)
    fname = os.path.join(data_path, str(limit_to),
                         'duplets_merged_within.p.bz2')
    duplets = t.load_compressed_pickle(fname)

    print 'Generating reports for across orders merged lists'

    flank_report_fnames = ['pentaplets_flank_counts.xls', 'quadruplets_flank_counts.xls', \
                           'triplets_flank_counts.xls', 'duplets_flank_counts.xls']

    kplets = [pentaplets, quadruplets, triplets, duplets]
    titles = ['Pentaplets', 'Quadruplets', 'Triplets', 'Duplets']

    report_files_dir = os.path.join(gv.project_data_path,
                                    'Bacteria/reports/merged_within_orders/',
                                    report_dir)

    for i in range(len(flank_report_fnames)):

        _fname = flank_report_fnames[i]
        _kplet_list = kplets[i]
        _title = titles[i]
        flank_counts, cog2gids, gid2weight = dist.get_flank_distributions(
            _kplet_list, neighborhood_files_path, target_profiles)
        universal_flank_counts = t.merge_dict_set_list(cog2gids, gid2weight)

        params = dict()
        params['xls_file_name'] = os.path.join(report_files_dir, _fname)
        params['profile2def'] = profile2def
        params['flank_counts'] = universal_flank_counts
        params['title'] = _title
        params['target_profiles'] = target_profiles
        r.write_flanking_count_xls(params)

    sys.exit()

    if not os.path.exists(report_files_dir):
        os.mkdir(report_files_dir)

    for i, kplet_pool in zip([5, 4, 3, 2],
                             [pentaplets, quadruplets, triplets, duplets]):
        print i
        j = 0
        # flank_counts_pool = dist.get_flank_distributions(kplet_pool, neighborhood_files_path, target_profiles)

        for k in range(len(kplet_pool)):

            kplet_sublist = kplet_pool[k]
            cur_reports_folder = os.path.join(report_files_dir, str(i))
            if not os.path.exists(cur_reports_folder):
                os.mkdir(cur_reports_folder)

            src2org, file_summaries, community, community_count, community_count_with_flanks\
                = merging.merge_into_file_summaries(kplet_sublist,
                                                    neighborhood_files_path,
                                                    file2src_src2org_map)
            if not src2org:
                continue

            xls_file_name = os.path.join(cur_reports_folder,
                                         "%d_%d.xls" % (j + 1, i))
            class2counts, class2profiles = merging.cdd_profile_count_into_class_count(
                community_count)
            class2counts_flank, class2profiles_flank = merging.cdd_profile_count_into_class_count(
                community_count_with_flanks)

            # profile2counts = flank_counts_pool[k]

            j += 1
            params = dict()
            params['xls_file_name'] = xls_file_name
            params['src2org'] = src2org
            params['file_summaries'] = file_summaries
            params['community'] = community
            params['target_profiles'] = target_profiles
            params['profile2def'] = profile2def
            params['gid2arcog_cdd'] = gid2arcog_cdd
            params['class2counts'] = class2counts
            params['class2profiles'] = class2profiles
            params['class2counts_flank'] = class2counts_flank
            # params['profile2counts'] = profile2counts
            r.write_to_xls(params)
Example #12
        params['xls_file_name'] = xls_file_name
        params['src2org'] = src2org
        params['file_summaries'] = file_summaries
        params['community'] = community
        params['target_profiles'] = target_profiles
        params['profile2def'] = profile2def
        params['gid2arcog_cdd'] = gid2arcog_cdd

        r.write_to_xls(params)
        if cnt == 200:
            break

    filtered_merged_kplets = []

    fname = '/Users/hudaiber/Projects/NewSystems/data/Bacteria/pickle/100000/pentaplets_community_count.p.bz2'
    pp_com_cnt = t.load_compressed_pickle(fname)

    pentaplets

    pivot_files = set()
    for kplet in pentaplets[1].kplets:
        pivot_files.update(set(kplet.files))

    for i in range(2, len(pentaplets)):
        _files = set()
        for kplet in pentaplets[i].kplets:
            _files.update(set(kplet.files))

        min_len = min(len(pivot_files), len(_files))
        if len(pivot_files.intersection(_files)) / float(min_len) > 0.3:
            print i, min_len, len(pivot_files.intersection(_files))
Example #13
    pan_data_path = '/panfs/pan1/patternquest/Projects/NewSystems/data/Archea/'

    fpath = '/panfs/pan1/patternquest/Projects/NewSystems/data/Archea/arCOG/arcogs_integrase.txt'
    integrases = [l.strip() for l in open(fpath).readlines()]
    fpath = '/panfs/pan1/patternquest/Projects/NewSystems/data/Archea/arCOG/arcogs_recombinase.txt'
    recombinases = [l.strip() for l in open(fpath).readlines()]
    fpath = '/panfs/pan1/patternquest/Projects/NewSystems/data/Archea/arCOG/arcogs_transposase.txt'
    transposases = [l.strip() for l in open(fpath).readlines()]

    pty_files_path = pan_data_path + 'genes_and_flanks/win_10/pty/'

    print "Loading loci"

    pickle_file = os.path.join(pan_data_path, 'pickle/10000/loci.p.bz2')

    loci = t.load_compressed_pickle(pickle_file)

    # all_bait_profiles = integrases + recombinases + resolvases + transposases

    # loci = []
    #
    # cnt = 0
    # for f in os.listdir(pty_files_path):
    #     loci.append(dt.get_pty_file(pty_files_path + f))
    #     if cnt % 1000 == 0:
    #         print cnt
    #     cnt += 1
    #
    # print "Dumping loci"
    # t.dump_compressed_pickle(pickle_file, loci)
    # print "Finished dumping"
Example #14
        _values /= np.sum(_values)
        entropy = -1 * np.sum(_values * np.log(_values))
        entropies.append(entropy)

        to_collapse_retval.append((cluster, type2cnt, entropy, gene2cnt))

    entropies = np.array(entropies)
    average_entropy = np.average(entropies)

    return singles, to_collapse_retval, average_entropy


if __name__=='__main__':

    fname = '/Volumes/hudaiber/Projects/NewSystems/data/cas4/dendrograms_10.p.bz2'
    data = t.load_compressed_pickle(fname)

    loci = data['loci']
    tree = data['root']

    leaf_names = [os.path.basename(l.file_name) for l in loci]
    newick = tree_to_newick(tree, "", tree.dist, leaf_names)

    fname = '/Volumes/hudaiber/Projects/NewSystems/data/cas4/tmp.nw'
    outf = open(fname, 'w')
    outf.write(newick)
    outf.close()



Example #15
    if root_node.count<size_limit:
        return [root_node]
    else:
        return break_down(root_node.left, size_limit) + break_down(root_node.right, size_limit)


if __name__=='__main__':

    files_path = os.path.join(gv.project_data_path,'CRISPR/Cas1/genes_and_flanks/wgs/')
    pickle_path = os.path.join(gv.project_data_path, 'CRISPR/pickle/')

    print "Loading data"
    Z = np.load('/Users/hudaiber/Projects/NewSystems/data/CRISPR/pickle/upgma_distance_array.npz' ).items()[0][1]
    jwd = np.load('/Users/hudaiber/Projects/NewSystems/data/CRISPR/pickle/jw_distances.npz').items()[0][1]
    fname = os.path.join(pickle_path, 'loci.p.bz2')
    loci = t.load_compressed_pickle(fname)

    print "Starting "
    root = to_tree(Z)
    root = clone_graph(root)
    nodes = get_nodes(root)
    id2node = {node.id: node for node in nodes}

    leaf_ids = leaves_list(Z)

    cnt = 0
    i = 0
    threshold = 0.5
    clusters = []
    total_count = 0
Example #16
def generate_plots_from_pickle(limit_to, report_dir, target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path):
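    # Archaea version of the report generator: load all four within-order
    # merged k-plet pools, write the flank-count XLS files, then write
    # per-community reports sorted by the community weight (summary_terms[5]).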

    data_path = os.path.join(gv.project_data_path, 'Archea/pickle/')

    fname = os.path.join(data_path, str(limit_to), 'pentaplets_merged_within.p.bz2')
    pentaplets = t.load_compressed_pickle(fname)
    fname = os.path.join(data_path, str(limit_to), 'quadruplets_merged_within.p.bz2')
    quadruplets = t.load_compressed_pickle(fname)
    fname = os.path.join(data_path, str(limit_to), 'triplets_merged_within.p.bz2')
    triplets = t.load_compressed_pickle(fname)
    fname = os.path.join(data_path, str(limit_to), 'duplets_merged_within.p.bz2')
    duplets = t.load_compressed_pickle(fname)

    # pentaplets, quadruplets, triplets = None, None, None

    print 'Generating reports for within orders merged lists'

    flank_report_fnames = ['pentaplets_flank_counts.xls', 'quadruplets_flank_counts.xls', \
                           'triplets_flank_counts.xls', 'duplets_flank_counts.xls']

    kplets = [pentaplets, quadruplets, triplets, duplets]
    titles = ['Pentaplets', 'Quadruplets', 'Triplets', 'Duplets']

    report_files_dir = os.path.join(gv.project_data_path, 'Archea/reports/merged_within_orders/', report_dir)

    for i in range(len(flank_report_fnames)):

        _fname = flank_report_fnames[i]
        _kplet_list = kplets[i]
        _title = titles[i]
        flank_counts, cog2gids, gid2weight = dist.get_flank_distributions(_kplet_list, neighborhood_files_path, target_profiles)

        # universal_flank_counts = t.merge_dict_list(flank_counts)
        # universal_cog2gids     = t.merge_dict_set_list(cog2gids, gid2weight)

        universal_flank_counts = t.merge_dict_set_list(cog2gids, gid2weight)

        params = dict()
        params[  'xls_file_name'] = os.path.join(report_files_dir, _fname)
        params[    'profile2def'] = profile2def
        params[   'flank_counts'] = universal_flank_counts
        params[          'title'] = _title
        params['target_profiles'] = target_profiles
        r.write_flanking_count_xls(params)

    if not os.path.exists(report_files_dir):
        os.mkdir(report_files_dir)

    for i, kplet_pool in zip([2, 3, 4, 5], [duplets, triplets, quadruplets, pentaplets]):

        j = 0
        # profile2counts_pool = dist.get_flank_distributions(kplet_pool, neighborhood_files_path, target_profiles)

        file_summaries_pool = []

        for kplet_list in kplet_pool:

            summary_terms = merging.merge_into_file_summaries(kplet_list,
                                                      neighborhood_files_path,
                                                      file2src_src2org_map,
                                                      'archaea')
            file_summaries_pool.append(summary_terms)

        for summary_terms in sorted(file_summaries_pool, key=itemgetter(5), reverse=True):

            src2org, file_summaries, community, community_count, community_count_with_flanks, weight = summary_terms

            if not src2org:
                continue

            cur_reports_folder = os.path.join(report_files_dir, str(i))
            if not os.path.exists(cur_reports_folder):
                os.mkdir(cur_reports_folder)

            xls_file_name = os.path.join(cur_reports_folder,  "%d_%d.xls" % (j+1, i))
            class2counts, class2profiles = merging.arcog_profile_count_into_class_count(community_count)
            class2counts_flank, class2profiles_flank = merging.arcog_profile_count_into_class_count(community_count_with_flanks)

            j += 1
            params = dict()
            params[     'xls_file_name'] = xls_file_name
            params[           'src2org'] = src2org
            params[    'file_summaries'] = file_summaries
            params[         'community'] = community
            params[   'target_profiles'] = target_profiles
            params[       'profile2def'] = profile2def
            params[     'gid2arcog_cdd'] = gid2arcog_cdd
            params[      'class2counts'] = class2counts
            params[    'class2profiles'] = class2profiles
            params['class2counts_flank'] = class2counts_flank
            # params['profile2counts'] = profile2counts
            r.write_to_xls(params)
Example #17
def generate_pickles(save_path, limit_to):

    if not os.path.exists(save_path):
        os.mkdir(save_path)

    # print "Loading kplets from DB"
    # pentaplets  =  p.get_report_kplets(limit_to=limit_to, load_locations=True)
    # quadruplets =  q.get_report_kplets(limit_to=limit_to, load_locations=True)
    # triplets    = tr.get_report_kplets(limit_to=limit_to, load_locations=True)
    # duplets     =  d.get_report_kplets(limit_to=limit_to, load_locations=True)

    # print "Dumping raw kplet data to files"
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'duplets_raw.p.bz2'), 'w')
    # pickle.dump(duplets, dump_file)
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'triplets_raw.p.bz2'), 'w')
    # pickle.dump(triplets, dump_file)
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'quadruplets_raw.p.bz2'), 'w')
    # pickle.dump(quadruplets, dump_file)
    # dump_file = bz2.BZ2File(os.path.join(save_path, 'pentaplets_raw.p.bz2'), 'w')
    # pickle.dump(pentaplets, dump_file)

    print "Loading raw kplets from pickles"
    dump_file = os.path.join(save_path, 'duplets_raw.p.bz2')
    duplets = t.load_compressed_pickle(dump_file)
    dump_file = os.path.join(save_path, 'triplets_raw.p.bz2')
    triplets= t.load_compressed_pickle(dump_file)
    dump_file = os.path.join(save_path, 'quadruplets_raw.p.bz2')
    quadruplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'pentaplets_raw.p.bz2')
    # pentaplets = t.load_compressed_pickle(dump_file)

    print "Basic within merging"
    # pentaplets = merging.basic_merge_within_orders(pentaplets)
    quadruplets= merging.basic_merge_within_orders(quadruplets)
    triplets = merging.basic_merge_within_orders(triplets)
    duplets = merging.basic_merge_within_orders(duplets)

    print "Dumping basic merges"
    # dump_file = os.path.join(save_path, 'pentaplets_basic_merged.p.bz2')
    # t.dump_compressed_pickle(dump_file, pentaplets)
    dump_file = os.path.join(save_path, 'quadruplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, quadruplets)
    dump_file = os.path.join(save_path, 'triplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, triplets)
    dump_file = os.path.join(save_path, 'duplets_basic_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, duplets)

    # print "Loading basic merges"
    # dump_file = os.path.join(save_path, 'pentaplets_basic_merged.p.bz2')
    # pentaplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'quadruplets_basic_merged.p.bz2')
    # quadruplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'triplets_basic_merged.p.bz2')
    # triplets = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(save_path, 'duplets_basic_merged.p.bz2')
    # duplets = t.load_compressed_pickle(dump_file)
    sys.exit()

    print 'Starting iterative within mergings'
    pentaplets  = merging.merge_kplets_within_orders_iterative_2(pentaplets)
    quadruplets = merging.merge_kplets_within_orders_iterative_2(quadruplets)
    triplets    = merging.merge_kplets_within_orders_iterative_2(triplets)
    duplets     = merging.merge_kplets_within_orders_iterative_2(duplets)

    print "Dumping iterative merges"
    dump_file = os.path.join(save_path, 'pentaplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, pentaplets)
    dump_file = os.path.join(save_path, 'quadruplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, quadruplets)
    dump_file = os.path.join(save_path, 'triplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, triplets)
    dump_file = os.path.join(save_path, 'duplets_iterative_merged.p.bz2')
    t.dump_compressed_pickle(dump_file, duplets)
    
    sys.exit()
    print 'Dumping merged kplet lists to files'
    dump_file = bz2.BZ2File(os.path.join(save_path, 'pentaplets_merged_within.p.bz2'), 'w')
    pickle.dump(pentaplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'quadruplets_merged_within.p.bz2'), 'w')
    pickle.dump(quadruplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'triplets_merged_within.p.bz2'), 'w')
    pickle.dump(triplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'duplets_merged_within.p.bz2'), 'w')
    pickle.dump(duplets, dump_file)

    print 'Starting across mergings'
    triplets, duplets = merging.merge_kplets_across_orders(triplets, duplets)
    quadruplets, triplets = merging.merge_kplets_across_orders(quadruplets, triplets)
    pentaplets, quadruplets = merging.merge_kplets_across_orders(pentaplets, quadruplets)

    print 'Dumping to files'
    dump_file = bz2.BZ2File(os.path.join(save_path, 'pentaplets_merged_across.p.bz2'), 'w')
    pickle.dump(pentaplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'quadruplets_merged_across.p.bz2'), 'w')
    pickle.dump(quadruplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'triplets_merged_across.p.bz2'), 'w')
    pickle.dump(triplets, dump_file)
    dump_file = bz2.BZ2File(os.path.join(save_path, 'duplets_merged_across.p.bz2'), 'w')
    pickle.dump(duplets, dump_file)

    print 'Done for limit_to:', limit_to
    print
    print
Example #18
                t.update_dictionary(gene2cnt, _gene_name, _weight)

        _values = type2cnt.values()
        _values /= np.sum(_values)
        entropy = -1 * np.sum(_values * np.log(_values))
        entropies.append(entropy)

        to_collapse_retval.append((cluster, type2cnt, entropy, gene2cnt))

    entropies = np.array(entropies)
    average_entropy = np.average(entropies)

    return singles, to_collapse_retval, average_entropy


if __name__ == '__main__':

    fname = '/Volumes/hudaiber/Projects/NewSystems/data/cas4/dendrograms_10.p.bz2'
    data = t.load_compressed_pickle(fname)

    loci = data['loci']
    tree = data['root']

    leaf_names = [os.path.basename(l.file_name) for l in loci]
    newick = tree_to_newick(tree, "", tree.dist, leaf_names)

    fname = '/Volumes/hudaiber/Projects/NewSystems/data/cas4/tmp.nw'
    outf = open(fname, 'w')
    outf.write(newick)
    outf.close()
Example #19
def merging_pipeline_for_order(order, data_path, load_from_db=False):
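    # Load the k-plets for this order, then repeat the basic + iterative
    # within-order merging for loci_threshold values 0.7, 0.8 and 0.9,
    # dumping the merged lists for each threshold.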
    limit_to = 1000000000
    print "starting for ", order
    if load_from_db:
        print "Loading kplets from DB"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
            kplets = q.get_report_kplets(profile_id2code, limit_to=limit_to)
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'
            kplets = p.get_report_kplets(profile_id2code, limit_to=limit_to)
    else:
        print "Loading kplets from pickle file"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'

        kplet_file_full = os.path.join(data_path, kplet_file)
        print "Loading :", kplet_file_full
        kplets = t.load_compressed_pickle(kplet_file_full)

    print "No of kplets:", len(kplets)

    loci_threshold = 0.7

    print "Starting to merge with loci_threshold:", loci_threshold
    #print "Loading file2genes"
    tic = time.time()
    print "Basic merging"
    merged_lists = merging.basic_merge_within_orders(kplets, loci_threshold)
    print "Basic merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(data_path,
                         "basic_merged_" + "%f_" % loci_threshold + kplet_file)
    print "Dumping basic merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)

    print "Iterative merging"
    merged_lists = merging.merge_kplets_within_orders_iterative(
        merged_lists, loci_threshold)
    print "Iterative merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(
        data_path, "iterative_merged_" + "%f_" % loci_threshold + kplet_file)
    print "Dumping Iterative merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)
    print "Completed in:", time.time() - tic, "(s)"

    loci_threshold = 0.8

    print "Starting to merge with loci_threshold:", loci_threshold
    #print "Loading file2genes"
    tic = time.time()
    print "Basic merging"
    merged_lists = merging.basic_merge_within_orders(kplets, loci_threshold)
    print "Basic merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(data_path,
                         "basic_merged_" + "%f_" % loci_threshold + kplet_file)
    print "Dumping basic merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)

    print "Iterative merging"
    merged_lists = merging.merge_kplets_within_orders_iterative(
        merged_lists, loci_threshold)
    print "Iterative merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(
        data_path, "iterative_merged_" + "%f_" % loci_threshold + kplet_file)
    print "Dumping Iterative merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)
    print "Completed in:", time.time() - tic, "(s)"

    loci_threshold = 0.9

    print "Starting to merge with loci_threshold:", loci_threshold
    #print "Loading file2genes"
    tic = time.time()
    print "Basic merging"
    merged_lists = merging.basic_merge_within_orders(kplets, loci_threshold)
    print "Basic merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(data_path,
                         "basic_merged_" + "%f_" % loci_threshold + kplet_file)
    print "Dumping basic merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)

    print "Iterative merging"
    merged_lists = merging.merge_kplets_within_orders_iterative(
        merged_lists, loci_threshold)
    print "Iterative merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(
        data_path, "iterative_merged_" + "%f_" % loci_threshold + kplet_file)
    print "Dumping Iterative merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)
    print "Completed in:", time.time() - tic, "(s)"
Example #20
    files_path = os.path.join(gv.project_data_path,
                              'CRISPR/datasets/crispr/wgs/')
    pickle_path = os.path.join(gv.project_data_path, 'CRISPR/pickle/crispr/')
    report_path = os.path.join(gv.project_data_path,
                               'CRISPR/reports/crispr/dendrograms/')
    if not os.path.exists(report_path):
        os.mkdir(report_path)

    # generate_data()

    score_file = os.path.join(pickle_path, 'jw_scores.npz')
    distance_file = os.path.join(pickle_path, 'jw_distances.npz')
    linkage_file = os.path.join(pickle_path, 'upgma_linkage.npz')
    locus_file = os.path.join(pickle_path, 'loci.p.bz2')

    loci = t.load_compressed_pickle(locus_file)
    # M = np.load(score_file).items()[0][1]

    print("Generating/Loading linkage file")
    Z = build_linkage(M, distance_file, linkage_file)
    Z = np.load(linkage_file).items()[0][1]
    print("Starting to form the clusters")

    root = to_tree(Z)
    root = clone_graph(root)

    nodes = get_nodes(root)
    id2node = {node.id: node for node in nodes}

    leaf_ids = leaves_list(Z)
Example #21
sys.path.append(gv.project_code_path)
import lib.utils.tools as t
import csv
import os
import gzip
import bz2


if __name__=='__main__':

    pickle_file = sys.argv[1]
    csv_file = sys.argv[2]


    print "Loading objects from file:", pickle_file
    kplets = t.load_compressed_pickle(pickle_file)
    print "No of kplets:", len(kplets)
    # print "Writing into CSV file:", csv_file
    # if compression=='True':
    #     # fout = bz2.BZ2File(csv_file, "w")
    #     fout = gzip.open(csv_file,"w")
    # else:
    #     fout = open(csv_file, "w")

    print "Writing into file:", csv_file
    fout = bz2.BZ2File(csv_file, "w")

    csv_writer = csv.writer(fout)
    csv_writer.writerow(('Id', 'K', 'Count', 'Codes', 'Files'))
    for kplet in kplets:
        row = []
Example #22
        params['src2org'] = src2org
        params['file_summaries'] = file_summaries
        params['community'] = community
        params['target_profiles'] = target_profiles
        params['profile2def'] = profile2def
        params['gid2arcog_cdd'] = gid2arcog_cdd

        r.write_to_xls(params)
        if cnt == 200:
            break


    filtered_merged_kplets = []

    fname = '/Users/hudaiber/Projects/NewSystems/data/Bacteria/pickle/100000/pentaplets_community_count.p.bz2'
    pp_com_cnt = t.load_compressed_pickle(fname)


    pentaplets

    pivot_files = set()
    for kplet in pentaplets[1].kplets:
        pivot_files.update(set(kplet.files))

    for i in range(2, len(pentaplets)):
        _files = set()
        for kplet in pentaplets[i].kplets:
            _files.update(set(kplet.files))

        min_len = min(len(pivot_files), len(_files))
        if len(pivot_files.intersection(_files))/float(min_len) > 0.3:
            print i, min_len, len(pivot_files.intersection(_files))
Example #23
if __name__ == '__main__':
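    # Fetch a single report k-plet by ID, cache it as a compressed pickle,
    # strip the bacterial target profiles from its codes, and search for the
    # remaining profile combination across genomes (max_dist=4).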

    work_path = os.path.join(gv.project_data_path, 'Bacteria/cases')
    pty_path = gv.pty_data_path

    kplet_id = 306123

    id2cdd = map_id2cdd_clusters()
    kplet = p.get_report_kplet(kplet_id, id2cdd, load_locations=True)

    target_profiles = set(t.bacteria_target_profiles())
    dump_file = os.path.join(work_path, 'kplet.p.bz2')
    t.dump_compressed_pickle(dump_file, kplet)

    kplet = t.load_compressed_pickle(dump_file)
    kplet_codes = kplet.codes.difference(target_profiles)

    org2src, src2blocks = sig.search_kplet_in_genomes(kplet_codes,
                                                      target_profiles,
                                                      max_dist=4)

    # dump_file = os.path.join(work_path, 'org2src_global.p.bz2')
    # t.dump_compressed_pickle(dump_file, org2src)
    # dump_file = os.path.join(work_path, 'src2blocks_global.p.bz2')
    # t.dump_compressed_pickle(dump_file, src2blocks)
    #
    # dump_file = os.path.join(work_path, 'org2src_global.p.bz2')
    # org2src = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(work_path, 'src2blocks_global.p.bz2')
    # src2blocks = t.load_compressed_pickle(dump_file)
Example #24
    files_path = os.path.join(gv.project_data_path, 'cas1402/files/')
    pickle_path = os.path.join(gv.project_data_path, 'cas1402/pickle/')
    report_path = os.path.join(gv.project_data_path, 'cas1402/reports/')

    # print "Loading CDD"
    #
    # gi2annotation = t.map_gid2cdd()
    #
    # print "Loading loci"
    # loci = [Locus(os.path.join(files_path, f), gi2annotation) for f in os.listdir(files_path)]
    # loci = [locus for locus in loci if len(locus.genes) > 2]
    #
    # t.dump_compressed_pickle('loci.p.bz2', loci)

    print "Loading loci"
    loci = t.load_compressed_pickle('loci.p.bz2')

    # reporting_dot_product_run(loci)

    # process_jackard_distances(loci)
    reporting_jackard_distance(loci)
    # method = 'union'
    # thresholds = [(0, 0.01), (1, 0.4), (1, 0.6), (5, 0.5), (10, 0.6), (10, 0.4)]
    # process_dot_product(thresholds, method, loci)

    # results_data_file_name = os.path.join(gv.project_data_path,'cas1402/results_%s_1.txt'%method)
    # results_image_file_name = os.path.join(gv.project_data_path,'cas1402/clustering_%s_1.png'%method)
    #
    # reporting.plot_results(results_data_file_name, results_image_file_name)
    # r.write_flanking_count_xls(params)
    #
    # data_path = os.path.join(gv.project_data_path,'Archea/pickle/tmp/')
    #
    # # for limit_to in [10000, 1000000]:
    # for limit_to in [1000000]:
    #     print "Limit_to:", limit_to
    #     print
    #     cur_path = os.path.join(data_path, str(limit_to))
    #     generate_pickles(cur_path, limit_to)
    #     print 'Done'
    #     print "------------------------"
    # sys.exit()

    fname = '/Users/hudaiber/Projects/NewSystems/data/Archea/pickle/1000000/pentaplets_merged_within.p.bz2'
    pentaplets = t.load_compressed_pickle(fname)
    #
    # fname = '/Users/hudaiber/Projects/NewSystems/data/Archea/pickle/1000000/5_community_count.p.bz2'
    # community = t.load_compressed_pickle(fname)
    #
    # fname = '/Users/hudaiber/Projects/NewSystems/data/Archea/pickle/1000000/5_community_count_with_flanks.p.bz2'
    # community_flank = t.load_compressed_pickle(fname)
    #
    # fname = '/Users/hudaiber/Projects/NewSystems/data/Archea/pickle/1000000/5_community_count_classes.p.bz2'
    # community_classes = t.load_compressed_pickle(fname)
    #
    # fname = '/Users/hudaiber/Projects/NewSystems/data/Archea/pickle/1000000/5_community_count_with_flanks_classes.p.bz2'
    # community_flank_classes = t.load_compressed_pickle(fname)
    #
    # fname = '/Users/hudaiber/Projects/NewSystems/data/Archea/pickle/1000000/duplets_merged_within.p.bz2'
    # community_flank_classes = t.load_compressed_pickle(fname)
Example #26
def merging_pipeline_for_order(order, data_path, load_from_db=False):
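    # Load the k-plets, export them to a CSV file, and delegate the merging to
    # the external "kpletmerger" command, which writes the iteratively merged
    # lists to a second CSV file (the in-Python merging code is commented out).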
    limit_to = 1000000000
    print "starting for ", order
    if load_from_db:
        print "Loading kplets from DB"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
            #kplets = t.load_compressed_pickle(os.path.join(data_path, kplet_file))
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
            #kplets = t.load_compressed_pickle(os.path.join(data_path, kplet_file))
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
            kplets = q.get_report_kplets(profile_id2code, limit_to=limit_to)
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'
            kplets = p.get_report_kplets(profile_id2code, limit_to=limit_to)
    else:
        print "Loading kplets from pickle file"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'

    kplet_file_full = os.path.join(data_path, kplet_file)
    print "Loading :", kplet_file_full
    kplets = t.load_compressed_pickle(kplet_file_full)

    print "No of kplets:", len(kplets)

    #print "Loading file2genes"
    # tic = time.time()
    # print "Basic merging"
    # merged_lists = merging.basic_merge_within_orders(kplets)
    # print "Basic merging done. Merged lists:", len(merged_lists)
    # # fname = os.path.join(data_path, "basic_merged_"+kplet_file)
    # # print "Dumping basic merging: ", fname
    # # t.dump_compressed_pickle(fname, merged_lists)
    #
    # print "Iterative merging"
    # merged_lists = merging.merge_kplets_within_orders_iterative(merged_lists)
    # print "Iterative merging done. Merged lists:", len(merged_lists)
    # # fname = os.path.join(data_path, "iterative_merged_"+kplet_file)
    # # print "Dumping Iterative merging: ",fname
    # # t.dump_compressed_pickle(fname, merged_lists)
    # print "Completed in:", time.time()-tic, "(s)"

    csv_kplet_file = os.path.join(data_path, kplet_file.split('.')[0] + ".csv")
    csv_merged_lists_file = os.path.join(
        data_path, "iterative_merged_" + kplet_file.split('.')[0] + ".csv")
    print "Writing kplets to csv file:"
    print csv_kplet_file

    t.write_kplets_to_csv(kplets, csv_kplet_file)
    tic = time.time()
    print "Starting kpletmerger with params:"
    print "kpletmerger", csv_kplet_file, csv_merged_lists_file
    print "\n\n"
    sp.call(["kpletmerger", csv_kplet_file, csv_merged_lists_file])
    print "\n\n"
    print "Completed in:", time.time() - tic, "(s)"
Example #27
def merging_pipeline_for_order(order, data_path, load_from_db=False):
    limit_to = 1000000000
    print "starting for ", order
    if load_from_db:
        print "Loading kplets from DB"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
            kplets = q.get_report_kplets(profile_id2code, limit_to=limit_to)
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'
            kplets = p.get_report_kplets(profile_id2code, limit_to=limit_to)
    else:
        print "Loading kplets from pickle file"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'

        kplet_file_full = os.path.join(data_path, kplet_file)
        print "Loading :",kplet_file_full
        kplets = t.load_compressed_pickle(kplet_file_full)

    print "No of kplets:", len(kplets)

    loci_threshold = 0.7

    print "Starting to merge with loci_threshold:", loci_threshold
    #print "Loading file2genes"
    tic = time.time()
    print "Basic merging"
    merged_lists = merging.basic_merge_within_orders(kplets, loci_threshold)
    print "Basic merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(data_path, "basic_merged_"+"%f_"%loci_threshold+kplet_file)
    print "Dumping basic merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)

    print "Iterative merging"
    merged_lists = merging.merge_kplets_within_orders_iterative(merged_lists, loci_threshold)
    print "Iterative merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(data_path, "iterative_merged_"+"%f_"%loci_threshold+kplet_file)
    print "Dumping Iterative merging: ",fname
    t.dump_compressed_pickle(fname, merged_lists)
    print "Completed in:", time.time()-tic, "(s)"

    loci_threshold = 0.8

    print "Starting to merge with loci_threshold:", loci_threshold
    #print "Loading file2genes"
    tic = time.time()
    print "Basic merging"
    merged_lists = merging.basic_merge_within_orders(kplets, loci_threshold)
    print "Basic merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(data_path, "basic_merged_"+"%f_"%loci_threshold+kplet_file)
    print "Dumping basic merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)

    print "Iterative merging"
    merged_lists = merging.merge_kplets_within_orders_iterative(merged_lists, loci_threshold)
    print "Iterative merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(data_path, "iterative_merged_"+"%f_"%loci_threshold+kplet_file)
    print "Dumping Iterative merging: ",fname
    t.dump_compressed_pickle(fname, merged_lists)
    print "Completed in:", time.time()-tic, "(s)"

    loci_threshold = 0.9

    print "Starting to merge with loci_threshold:", loci_threshold
    #print "Loading file2genes"
    tic = time.time()
    print "Basic merging"
    merged_lists = merging.basic_merge_within_orders(kplets, loci_threshold)
    print "Basic merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(data_path, "basic_merged_"+"%f_"%loci_threshold+kplet_file)
    print "Dumping basic merging: ", fname
    t.dump_compressed_pickle(fname, merged_lists)

    print "Iterative merging"
    merged_lists = merging.merge_kplets_within_orders_iterative(merged_lists, loci_threshold)
    print "Iterative merging done. Merged lists:", len(merged_lists)
    fname = os.path.join(data_path, "iterative_merged_"+"%f_"%loci_threshold+kplet_file)
    print "Dumping Iterative merging: ",fname
    t.dump_compressed_pickle(fname, merged_lists)
    print "Completed in:", time.time()-tic, "(s)"
Example #28
if __name__=='__main__':

    work_path = os.path.join(gv.project_data_path, 'Bacteria/cases')
    pty_path = gv.pty_data_path

    kplet_id = 306123

    id2cdd = map_id2cdd_clusters()
    kplet = p.get_report_kplet(kplet_id, id2cdd, load_locations=True)

    target_profiles = set(t.bacteria_target_profiles())
    dump_file = os.path.join(work_path, 'kplet.p.bz2')
    t.dump_compressed_pickle(dump_file, kplet)

    kplet = t.load_compressed_pickle(dump_file)
    kplet_codes = kplet.codes.difference(target_profiles)

    org2src, src2blocks = sig.search_kplet_in_genomes(kplet_codes, target_profiles, max_dist=4)

    # dump_file = os.path.join(work_path, 'org2src_global.p.bz2')
    # t.dump_compressed_pickle(dump_file, org2src)
    # dump_file = os.path.join(work_path, 'src2blocks_global.p.bz2')
    # t.dump_compressed_pickle(dump_file, src2blocks)
    #
    # dump_file = os.path.join(work_path, 'org2src_global.p.bz2')
    # org2src = t.load_compressed_pickle(dump_file)
    # dump_file = os.path.join(work_path, 'src2blocks_global.p.bz2')
    # src2blocks = t.load_compressed_pickle(dump_file)

    xls_file = os.path.join(work_path,'kplet_%d_tight.xlw'%kplet_id)
Example #29
import pickle

if __name__ == '__main__':
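    # Pre-load the lookup dictionaries, read the across-orders merged
    # pentaplets for limit_to = 1000000, and merge each k-plet sublist into
    # file summaries for the 'all' report directory.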

    print 'Pre-Loading dictionaries'
    target_profiles = t.target_profiles()
    profile2def = t.map_profile2def()
    gid2arcog_cdd = t.map_gid2arcog_cdd()
    neighborhood_files_path = neighborhoods_path()
    print "\n"

    limit_to = 1000000
    data_path = os.path.join(gv.project_data_path, 'Archea/pickle/')
    fname = os.path.join(data_path, str(limit_to),
                         'pentaplets_merged_across.p.bz2')
    pentaplets = t.load_compressed_pickle(fname)

    report_dir = 'all'
    report_files_dir = os.path.join(gv.project_data_path,
                                    'Archea/reports/merged_across_orders/',
                                    report_dir)

    j = 0
    for kplet_sublist in pentaplets:
        cur_reports_folder = os.path.join(report_files_dir, str(5))

        src2org, file_summaries, community, community_count, community_count_with_flanks = \
            merging.merge_into_file_summaries(kplet_sublist,
                                              neighborhood_files_path,
                                              file2src_src2org_map,
                                              'archaea')
Example #30
0
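# Rebuild the UPGMA tree from precomputed CRISPR distance matrices and set up
# node/leaf lookups for threshold-based cluster extraction.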
if __name__ == '__main__':

    files_path = os.path.join(gv.project_data_path,
                              'CRISPR/Cas1/genes_and_flanks/wgs/')
    pickle_path = os.path.join(gv.project_data_path, 'CRISPR/pickle/')

    print "Loading data"
    Z = np.load(
        '/Users/hudaiber/Projects/NewSystems/data/CRISPR/pickle/upgma_distance_array.npz'
    ).items()[0][1]
    jwd = np.load(
        '/Users/hudaiber/Projects/NewSystems/data/CRISPR/pickle/jw_distances.npz'
    ).items()[0][1]
    fname = os.path.join(pickle_path, 'loci.p.bz2')
    loci = t.load_compressed_pickle(fname)

    print "Starting "
    root = to_tree(Z)
    root = clone_graph(root)
    nodes = get_nodes(root)
    id2node = {node.id: node for node in nodes}

    leaf_ids = leaves_list(Z)

    cnt = 0
    i = 0
    threshold = 0.5
    clusters = []
    total_count = 0
Example #31
0
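    # Load the pre-built cas1402 loci pickle and generate Jaccard-distance reports
    # (the other clustering/dot-product runs are left commented out).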
    files_path  = os.path.join(gv.project_data_path, 'cas1402/files/')
    pickle_path = os.path.join(gv.project_data_path, 'cas1402/pickle/')
    report_path = os.path.join(gv.project_data_path, 'cas1402/reports/')

    # print "Loading CDD"
    #
    # gi2annotation = t.map_gid2cdd()
    #
    # print "Loading loci"
    # loci = [Locus(os.path.join(files_path, f), gi2annotation) for f in os.listdir(files_path)]
    # loci = [locus for locus in loci if len(locus.genes) > 2]
    #
    # t.dump_compressed_pickle('loci.p.bz2', loci)

    print "Loading loci"
    loci = t.load_compressed_pickle('loci.p.bz2')

    # reporting_dot_product_run(loci)

    # process_jackard_distances(loci)
    reporting_jackard_distance(loci)
    # method = 'union'
    # thresholds = [(0, 0.01), (1, 0.4), (1, 0.6), (5, 0.5), (10, 0.6), (10, 0.4)]
    # process_dot_product(thresholds, method, loci)

    # results_data_file_name = os.path.join(gv.project_data_path,'cas1402/results_%s_1.txt'%method)
    # results_image_file_name = os.path.join(gv.project_data_path,'cas1402/clustering_%s_1.png'%method)
    #
    # reporting.plot_results(results_data_file_name, results_image_file_name)

Example #32
0
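# Bacterial run: load within-order merged k-plets of every size, write flank-count
# spreadsheets, then (currently cut short by sys.exit) per-kplet XLS reports.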
def generate_plots_from_pickle(limit_to, report_dir, target_profiles, profile2def, gid2arcog_cdd, neighborhood_files_path):

    data_path = os.path.join(gv.project_data_path, 'Bacteria/pickle/')

    fname = os.path.join(data_path, str(limit_to), 'pentaplets_merged_within.p.bz2')
    pentaplets = t.load_compressed_pickle(fname)
    fname = os.path.join(data_path, str(limit_to), 'quadruplets_merged_within.p.bz2')
    quadruplets = t.load_compressed_pickle(fname)
    fname = os.path.join(data_path, str(limit_to), 'triplets_merged_within.p.bz2')
    triplets = t.load_compressed_pickle(fname)
    fname = os.path.join(data_path, str(limit_to), 'duplets_merged_within.p.bz2')
    duplets = t.load_compressed_pickle(fname)

    print 'Generating reports for within orders merged lists'

    flank_report_fnames = ['pentaplets_flank_counts.xls', 'quadruplets_flank_counts.xls', \
                           'triplets_flank_counts.xls', 'duplets_flank_counts.xls']

    kplets = [pentaplets, quadruplets, triplets, duplets]
    titles = ['Pentaplets', 'Quadruplets', 'Triplets', 'Duplets']

    report_files_dir = os.path.join(gv.project_data_path, 'Bacteria/reports/merged_within_orders/', report_dir)
    if not os.path.exists(report_files_dir):
        os.mkdir(report_files_dir)

    for i in range(len(flank_report_fnames)):

        _fname = flank_report_fnames[i]
        _kplet_list = kplets[i]
        _title = titles[i]
        flank_counts, cog2gids, gid2weight = dist.get_flank_distributions(_kplet_list, neighborhood_files_path, target_profiles)
        universal_flank_counts = t.merge_dict_set_list(cog2gids, gid2weight)

        params = dict()
        params['xls_file_name'] = os.path.join(report_files_dir, _fname)
        params['profile2def'] = profile2def
        params['flank_counts'] = universal_flank_counts
        params['title'] = _title
        params['target_profiles'] = target_profiles
        r.write_flanking_count_xls(params)

    sys.exit()

    for i, kplet_pool in zip([5, 4, 3, 2], [pentaplets, quadruplets, triplets, duplets]):
        print i
        j = 0
        # flank_counts_pool = dist.get_flank_distributions(kplet_pool, neighborhood_files_path, target_profiles)

        for k in range(len(kplet_pool)):

            kplet_sublist = kplet_pool[k]
            cur_reports_folder = os.path.join(report_files_dir, str(i))
            if not os.path.exists(cur_reports_folder):
                os.mkdir(cur_reports_folder)

            src2org, file_summaries, community, community_count, community_count_with_flanks\
                = merging.merge_into_file_summaries(kplet_sublist,
                                                    neighborhood_files_path,
                                                    file2src_src2org_map)
            if not src2org:
                continue

            xls_file_name = os.path.join(cur_reports_folder,  "%d_%d.xls" % (j+1, i))
            class2counts, class2profiles = merging.cdd_profile_count_into_class_count(community_count)
            class2counts_flank, class2profiles_flank = merging.cdd_profile_count_into_class_count(community_count_with_flanks)

            # profile2counts = flank_counts_pool[k]

            j += 1
            params = dict()
            params['xls_file_name'] = xls_file_name
            params['src2org'] = src2org
            params['file_summaries'] = file_summaries
            params['community'] = community
            params['target_profiles'] = target_profiles
            params['profile2def'] = profile2def
            params['gid2arcog_cdd'] = gid2arcog_cdd
            params['class2counts'] = class2counts
            params['class2profiles'] = class2profiles
            params['class2counts_flank'] = class2counts_flank
            # params['profile2counts'] = profile2counts
            r.write_to_xls(params)
Example #33
0
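    # Read the integrase/recombinase/resolvase/transposase CDD profile lists and
    # load the pre-built bacterial loci pickle.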
    fpath = '/panfs/pan1/patternquest/Projects/NewSystems/data/Bacteria/CDD/cdd_integrase.txt'
    integrases = [l.split()[1] for l in open(fpath).readlines()]
    fpath = '/panfs/pan1/patternquest/Projects/NewSystems/data/Bacteria/CDD/cdd_recombinase.txt'
    recombinases = [l.split()[1] for l in open(fpath).readlines()]
    fpath = '/panfs/pan1/patternquest/Projects/NewSystems/data/Bacteria/CDD/cdd_resolvase.txt'
    resolvases = [l.split()[1] for l in open(fpath).readlines()]
    fpath = '/panfs/pan1/patternquest/Projects/NewSystems/data/Bacteria/CDD/cdd_transpos.txt'
    transposases = [l.split()[1] for l in open(fpath).readlines()]

    pty_files_path = pan_data_path + 'genes_and_flanks/win_10/raw_nbr_files/'

    print "Loading loci"

    pickle_file = os.path.join(pan_data_path, 'pickle/10000/loci.p.bz2')

    bacteria_loci = t.load_compressed_pickle(pickle_file)
    # all_bait_profiles = integrases + recombinases + resolvases + transposases

    # bacteria_loci = []
    #
    # cnt = 0
    # for f in os.listdir(pty_files_path):
    #     bacteria_loci.append(dt.get_pty_file(pty_files_path + f))
    #     if cnt % 1000 == 0:
    #         print cnt
    #     cnt += 1
    #
    # print "Dumping loci"
    # t.dump_compressed_pickle(pickle_file, bacteria_loci)
    # print "Finished dumping"
Example #34
0
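# Report the number of k-plets stored in each CRISPR seed pickle
# (the Bacteria block is kept commented out).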
if sys.platform == 'darwin':
    sys.path.append('/Users/hudaiber/Projects/SystemFiles/')
elif sys.platform == 'linux2':
    sys.path.append('/home/hudaiber/Projects/SystemFiles/')

import global_variables as gv
sys.path.append(gv.project_code_path)

import lib.utils.tools as t


# print "Bacteria"
# files = ['duplets_merged_within.p.bz2', 'triplets_merged_within.p.bz2',
#          'quadruplets_merged_within.p.bz2', 'pentaplets_merged_within.p.bz2']
# for f in files:
#     fpath = os.path.join(gv.project_data_path,'Bacteria/pickle/100000/',f)
#     # print fpath
#     kplets = t.load_compressed_pickle(fpath)
#     print f, len(kplets)


print "CRISPR"
files = ['duplets_seed.p.bz2', 'triplets_seed.p.bz2', 'quadruplets_seed.p.bz2', 'pentaplets_seed.p.bz2']
for f in files:
    fpath = os.path.join(gv.project_data_path, 'CRISPR/pickle/cas1_2', f)
    kplets = t.load_compressed_pickle(fpath)
    print f, len(kplets)



    print "\n"

    data_path = os.path.join(gv.project_data_path, 'Archea/pickle/1000000/')
    # files = ['pentaplets_basic_merged.p.bz2', 'quadruplets_raw.p.bz2', 'triplets_raw.p.bz2', 'duplets_raw.p.bz2']
    files = ['duplets_basic_merged.p.bz2', 'triplets_basic_merged.p.bz2']

    reports_dir_base = os.path.join(
        gv.project_data_path, 'Archea/reports/merged_within_orders/raw_ranks/')
    if not os.path.exists(reports_dir_base):
        os.mkdir(reports_dir_base)

    # for k, kplet_file_name in zip([5, 4, 3, 2], files):
    for k, kplet_file_name in zip([2, 3], files):

        print "Starting for", k
        kplet_lists = t.load_compressed_pickle(
            os.path.join(data_path, kplet_file_name))
        reports_dir = os.path.join(reports_dir_base, str(k))

        if not os.path.exists(reports_dir):
            os.mkdir(reports_dir)

        for ind, kplet_list in enumerate(kplet_lists):
            src2org, file_summaries, community, community_count, community_count_with_flanks, weight = \
                    merging.merge_into_file_summaries(kplet_list.kplets,
                                                      neighborhood_files_path,
                                                      file2src_src2org_map,
                                                      'archaea')

            # codes = list(kplet.codes)
            # suffix = ['s' if code in target_profiles else 'n' for code in codes]
            #
if __name__ == '__main__':

    files_path = os.path.join(gv.project_data_path, 'CRISPR/datasets/crispr/wgs/')
    pickle_path = os.path.join(gv.project_data_path, 'CRISPR/pickle/crispr/')
    report_path = os.path.join(gv.project_data_path, 'CRISPR/reports/crispr/dendrograms/')
    if not os.path.exists(report_path):
        os.mkdir(report_path)

    # generate_data()

    score_file = os.path.join(pickle_path, 'jw_scores.npz')
    distance_file = os.path.join(pickle_path, 'jw_distances.npz')
    linkage_file = os.path.join(pickle_path, 'upgma_linkage.npz')
    locus_file = os.path.join(pickle_path, 'loci.p.bz2')

    loci = t.load_compressed_pickle(locus_file)
    # M = np.load(score_file).items()[0][1]

    print("Generating/Loading linkage file")
    # Z = build_linkage(M, distance_file, linkage_file)  # needs M from score_file above; Z is reloaded from linkage_file below
    Z = np.load(linkage_file).items()[0][1]
    print("Starting to form the clusters")

    root = to_tree(Z)
    root = clone_graph(root)

    nodes = get_nodes(root)
    id2node = {node.id: node for node in nodes}

    leaf_ids = leaves_list(Z)
        "pfam13592": 4.0,
        "pfam13700": 4.0,
        "pfam13817": 4.0,
        "pfam14261": 5.0,
        "pfam14294": 5.0
    }

    profile2def = t.map_cdd_profile2def()
    # gid2arcog_cdd = t.map_gid2arcog_cdd()
    neighborhood_files_path = merged_neighborhoods_path()
    org2weight = t.map_genome2weight()

    pan_data_path = '/panfs/pan1/patternquest/Projects/NewSystems/data/Bacteria/'

    pickle_file = os.path.join(pan_data_path, 'pickle/10000/profile2merged_files.p.bz2')
    profile2files = t.load_compressed_pickle(pickle_file)

    baiticity_file = os.path.join(gv.project_data_path, 'baiticity/bacteria/baiticity.tab')
    profile2baiticity = {l.split()[0]: l.split()[4] for l in open(baiticity_file).readlines()[1:] if l.strip()}

    i = 1

    for highlight_profile in additional_profiles:

        _profile_containing_files = profile2files[highlight_profile]
        file_summaries = merging.get_file_summaries(_profile_containing_files, neighborhood_files_path, org2weight)

        print i, highlight_profile, len(file_summaries)

        report_file = os.path.join(gv.project_data_path,'baiticity/bacteria/reports/%s.xlsx' % highlight_profile)
Example #38
0
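# Pipeline variant: load k-plets for the given order, write them to CSV, and
# delegate the merging itself to the external kpletmerger command via sp.call.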
def merging_pipeline_for_order(order, data_path, load_from_db=False):
    limit_to = 1000000000
    print "starting for ", order
    if load_from_db:
        print "Loading kplets from DB"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
            #kplets = t.load_compressed_pickle(os.path.join(data_path, kplet_file))
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
            #kplets = t.load_compressed_pickle(os.path.join(data_path, kplet_file))
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
            kplets = q.get_report_kplets(profile_id2code, limit_to=limit_to)
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'
            kplets = p.get_report_kplets(profile_id2code, limit_to=limit_to)
    else:
        print "Loading kplets from pickle file"
        if order == 2:
            kplet_file = 'duplets.p.bz2'
        elif order == 3:
            kplet_file = 'triplets.p.bz2'
        elif order == 4:
            kplet_file = 'quadruplets.p.bz2'
        elif order == 5:
            kplet_file = 'pentaplets.p.bz2'

    kplet_file_full = os.path.join(data_path, kplet_file)
    print "Loading :",kplet_file_full
    kplets = t.load_compressed_pickle(kplet_file_full)

    print "No of kplets:", len(kplets)
    
    #print "Loading file2genes"
    # tic = time.time()
    # print "Basic merging"
    # merged_lists = merging.basic_merge_within_orders(kplets)
    # print "Basic merging done. Merged lists:", len(merged_lists)
    # # fname = os.path.join(data_path, "basic_merged_"+kplet_file)
    # # print "Dumping basic merging: ", fname
    # # t.dump_compressed_pickle(fname, merged_lists)
    #
    # print "Iterative merging"
    # merged_lists = merging.merge_kplets_within_orders_iterative(merged_lists)
    # print "Iterative merging done. Merged lists:", len(merged_lists)
    # # fname = os.path.join(data_path, "iterative_merged_"+kplet_file)
    # # print "Dumping Iterative merging: ",fname
    # # t.dump_compressed_pickle(fname, merged_lists)
    # print "Completed in:", time.time()-tic, "(s)"

    csv_kplet_file = os.path.join(data_path, kplet_file.split('.')[0] + ".csv")
    csv_merged_lists_file = os.path.join(data_path, "iterative_merged_" + kplet_file.split('.')[0] + ".csv")
    print "Writing kplets to csv file:"
    print csv_kplet_file

    t.write_kplets_to_csv(kplets, csv_kplet_file)
    tic = time.time()
    print "Starting kpletmerger with params:"
    print "kpletmerger", csv_kplet_file, csv_merged_lists_file
    print "\n\n"
    sp.call(["kpletmerger",csv_kplet_file,csv_merged_lists_file])
    print "\n\n"
    print "Completed in:", time.time()-tic, "(s)"
Example #39
0
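    # For each highlighted profile, collect the archaeal neighborhood files that
    # contain it and build file summaries for a baiticity report.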
    baiticity_file = os.path.join(gv.project_data_path,
                                  'baiticity/archaea/baiticity.tab')
    profile2baiticity = {
        l.split()[0]: l.split()[4]
        for l in open(baiticity_file).readlines()[1:]
    }

    # gid2arcog_cdd = t.map_gid2arcog_cdd()
    neighborhood_files_path = merged_neighborhoods_path()
    org2weight = t.map_archaea_genome2weight()
    pan_data_path = '/panfs/pan1/patternquest/Projects/NewSystems/data/Archea/'

    pickle_file = os.path.join(
        pan_data_path, 'pickle/10000/profile2merged_files.p.bz2')
    profile2files = t.load_compressed_pickle(pickle_file)

    i = 1

    for highlight_profile in additional_profiles:

        _profile_containing_files = profile2files[highlight_profile]
        file_summaries = merging.get_file_summaries(_profile_containing_files,
                                                    neighborhood_files_path,
                                                    org2weight)

        print i, highlight_profile, len(file_summaries)

        if len(file_summaries) == 1:

            clusters = []
Example #40
0
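# Print the number of k-plets in each CRISPR seed pickle.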
import os
import sys
if sys.platform == 'darwin':
    sys.path.append('/Users/hudaiber/Projects/SystemFiles/')
elif sys.platform == 'linux2':
    sys.path.append('/home/hudaiber/Projects/SystemFiles/')

import global_variables as gv
sys.path.append(gv.project_code_path)

import lib.utils.tools as t

# print "Bacteria"
# files = ['duplets_merged_within.p.bz2', 'triplets_merged_within.p.bz2',
#          'quadruplets_merged_within.p.bz2', 'pentaplets_merged_within.p.bz2']
# for f in files:
#     fpath = os.path.join(gv.project_data_path,'Bacteria/pickle/100000/',f)
#     # print fpath
#     kplets = t.load_compressed_pickle(fpath)
#     print f, len(kplets)

print "CRISPR"
files = [
    'duplets_seed.p.bz2', 'triplets_seed.p.bz2', 'quadruplets_seed.p.bz2',
    'pentaplets_seed.p.bz2'
]
for f in files:
    fpath = os.path.join(gv.project_data_path, 'CRISPR/pickle/cas1_2', f)
    kplets = t.load_compressed_pickle(fpath)
    print f, len(kplets)
    neighborhood_files_path = neighborhoods_path()
    print "\n"

    data_path = os.path.join(gv.project_data_path, 'Archea/pickle/1000000/')
    # files = ['pentaplets_basic_merged.p.bz2', 'quadruplets_raw.p.bz2', 'triplets_raw.p.bz2', 'duplets_raw.p.bz2']
    files = ['duplets_basic_merged.p.bz2', 'triplets_basic_merged.p.bz2']

    reports_dir_base = os.path.join(gv.project_data_path, 'Archea/reports/merged_within_orders/raw_ranks/')
    if not os.path.exists(reports_dir_base):
        os.mkdir(reports_dir_base)

    # for k, kplet_file_name in zip([5, 4, 3, 2], files):
    for k, kplet_file_name in zip([2, 3], files):

        print "Starting for", k
        kplet_lists = t.load_compressed_pickle(os.path.join(data_path, kplet_file_name))
        reports_dir = os.path.join(reports_dir_base, str(k))

        if not os.path.exists(reports_dir):
            os.mkdir(reports_dir)

        for ind, kplet_list in enumerate(kplet_lists):
            src2org, file_summaries, community, community_count, community_count_with_flanks, weight = \
                    merging.merge_into_file_summaries(kplet_list.kplets,
                                                      neighborhood_files_path,
                                                      file2src_src2org_map,
                                                      'archaea')

            # codes = list(kplet.codes)
            # suffix = ['s' if code in target_profiles else 'n' for code in codes]
            #
Example #42
0
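# Archaeal run: load within-order merged k-plets of every size, write flank-count
# spreadsheets, then per-kplet XLS reports ordered by merged-summary weight.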
def generate_plots_from_pickle(limit_to, report_dir, target_profiles,
                               profile2def, gid2arcog_cdd,
                               neighborhood_files_path):

    data_path = os.path.join(gv.project_data_path, 'Archea/pickle/')

    fname = os.path.join(data_path, str(limit_to),
                         'pentaplets_merged_within.p.bz2')
    pentaplets = t.load_compressed_pickle(fname)
    fname = os.path.join(data_path, str(limit_to),
                         'quadruplets_merged_within.p.bz2')
    quadruplets = t.load_compressed_pickle(fname)
    fname = os.path.join(data_path, str(limit_to),
                         'triplets_merged_within.p.bz2')
    triplets = t.load_compressed_pickle(fname)
    fname = os.path.join(data_path, str(limit_to),
                         'duplets_merged_within.p.bz2')
    duplets = t.load_compressed_pickle(fname)

    # pentaplets, quadruplets, triplets = None, None, None

    print 'Generating reports for within orders merged lists'

    flank_report_fnames = ['pentaplets_flank_counts.xls', 'quadruplets_flank_counts.xls', \
                           'triplets_flank_counts.xls', 'duplets_flank_counts.xls']

    kplets = [pentaplets, quadruplets, triplets, duplets]
    titles = ['Pentaplets', 'Quadruplets', 'Triplets', 'Duplets']

    report_files_dir = os.path.join(gv.project_data_path,
                                    'Archea/reports/merged_within_orders/',
                                    report_dir)
    if not os.path.exists(report_files_dir):
        os.mkdir(report_files_dir)

    for i in range(len(flank_report_fnames)):

        _fname = flank_report_fnames[i]
        _kplet_list = kplets[i]
        _title = titles[i]
        flank_counts, cog2gids, gid2weight = dist.get_flank_distributions(
            _kplet_list, neighborhood_files_path, target_profiles)

        # universal_flank_counts = t.merge_dict_list(flank_counts)
        # universal_cog2gids     = t.merge_dict_set_list(cog2gids, gid2weight)

        universal_flank_counts = t.merge_dict_set_list(cog2gids, gid2weight)

        params = dict()
        params['xls_file_name'] = os.path.join(report_files_dir, _fname)
        params['profile2def'] = profile2def
        params['flank_counts'] = universal_flank_counts
        params['title'] = _title
        params['target_profiles'] = target_profiles
        r.write_flanking_count_xls(params)

    for i, kplet_pool in zip([2, 3, 4, 5],
                             [duplets, triplets, quadruplets, pentaplets]):

        j = 0
        # profile2counts_pool = dist.get_flank_distributions(kplet_pool, neighborhood_files_path, target_profiles)

        file_summaries_pool = []

        for kplet_list in kplet_pool:

            summary_terms = merging.merge_into_file_summaries(
                kplet_list, neighborhood_files_path, file2src_src2org_map,
                'archaea')
            file_summaries_pool.append(summary_terms)

        for summary_terms in sorted(file_summaries_pool,
                                    key=itemgetter(5),
                                    reverse=True):

            src2org, file_summaries, community, community_count, community_count_with_flanks, weight = summary_terms

            if not src2org:
                continue

            cur_reports_folder = os.path.join(report_files_dir, str(i))
            if not os.path.exists(cur_reports_folder):
                os.mkdir(cur_reports_folder)

            xls_file_name = os.path.join(cur_reports_folder,
                                         "%d_%d.xls" % (j + 1, i))
            class2counts, class2profiles = merging.arcog_profile_count_into_class_count(
                community_count)
            class2counts_flank, class2profiles_flank = merging.arcog_profile_count_into_class_count(
                community_count_with_flanks)

            j += 1
            params = dict()
            params['xls_file_name'] = xls_file_name
            params['src2org'] = src2org
            params['file_summaries'] = file_summaries
            params['community'] = community
            params['target_profiles'] = target_profiles
            params['profile2def'] = profile2def
            params['gid2arcog_cdd'] = gid2arcog_cdd
            params['class2counts'] = class2counts
            params['class2profiles'] = class2profiles
            params['class2counts_flank'] = class2counts_flank
            # params['profile2counts'] = profile2counts
            r.write_to_xls(params)