# Example #1 (0)
def cas4_dataset():

    print "Loading loci"
    def_file = os.path.join(gv.project_data_path, 'cas4/profiles/defenseProfiles.tab')
    profile2gene = {}

    for l in open(def_file):
        terms = l.split('\t')
        profile = terms[0]
        gene_names = terms[3].split(',')
        if len(gene_names) > 1:
            profile2gene[profile] = gene_names[1]
        else:
            profile2gene[profile] = gene_names[0]

    cdd_profile2gene = t.map_cdd_profile2gene_name()
    cdd_profile2gene.update(profile2gene)

    loci = [Locus(os.path.join(work_dir, 'files', f), file_format='generic', profile2gene=cdd_profile2gene)
            for f in os.listdir(os.path.join(work_dir, 'files'))]

    # dendrogram_file = os.path.join(work_dir, 'loci_dendrogram.pdf')

    singles, cluster_packs, _ = dnd.classify_loci_hierarchically(loci, threshold=2.6)

    print "Clusters: %d, singles %d" % (len(cluster_packs), len(singles))

    reports_dir = os.path.join(work_dir, 'reports/cas4')

    feature_profiles = [l.strip() for l in open(os.path.join(work_dir, 'feature_profiles.txt')).readlines()]

    r.generate_cluster_reports_cas4(cluster_packs, loci, reports_dir, feature_profiles)
# Example #2 (0)
def prok1603_architecture_frequencies():

    work_dir = os.path.join(gv.project_data_path, 'UvrD/')

    map_file = os.path.join(work_dir, 'prok1603/prok1603_weights.txt')

    locus2weight = {l.split()[0]:float(l.split()[1]) for l in open(map_file)}

    def_file = os.path.join(gv.project_data_path, 'cas4/profiles/defenseProfiles.tab')
    profile2gene={}
    profile2def = {}

    for l in open(def_file):
        terms = l.strip().split('\t')
        profile = terms[0]
        gene_names = terms[3].split(',')
        if len(gene_names)>1:
            profile2gene[profile] = gene_names[1]
        else:
            profile2gene[profile] = gene_names[0]

        profile2def[profile] = terms[4]

    cdd_profile2gene = t.map_cdd_profile2gene_name()
    cdd_profile2gene.update(profile2gene)

    cdd_profile2def = t.map_cdd_profile2def()
    cdd_profile2def.update(profile2def)

    prok1603_loci_file = os.path.join(work_dir, 'prok1603The CRISPR/prok1603_loci.p.bz2')
    loci = t.load_compressed_pickle(prok1603_loci_file)

    profile2loci = {}

    for locus in loci:
        for _profile in locus.profiles:
            if _profile in profile2loci:
                profile2loci[_profile].append(locus)
            else:
                profile2loci[_profile] = [locus]

    for (profile, loci) in sorted(profile2loci.items(), key=lambda x: len(x[1]), reverse=True):
        _weight = sum([locus2weight[locus.base_file_name] for locus in loci])
        print "%s\t%s\t%d\t%f\t%s" % (profile,
                                      cdd_profile2gene[profile] if profile in cdd_profile2gene else "",
                                      len(loci),
                                      _weight,
                                      cdd_profile2def[profile] if profile in cdd_profile2def else "")
# Example #3 (0)
def cas4_extract_dendrogram():

    work_dir = os.path.join(gv.project_data_path, 'cas4')

    print "Loading loci"

    def_file = os.path.join(gv.project_data_path, 'cas4/profiles/defenseProfiles.tab')
    profile2gene = {}

    for l in open(def_file):
        terms = l.split('\t')
        profile = terms[0]
        gene_names = terms[3].split(',')
        if len(gene_names) > 1:
            profile2gene[profile] = gene_names[1]
        else:
            profile2gene[profile] = gene_names[0]

    cdd_profile2gene = t.map_cdd_profile2gene_name()
    cdd_profile2gene.update(profile2gene)

    files_path = os.path.join(work_dir, 'files')

    loci = [Locus(os.path.join(files_path, f), file_format='generic', profile2gene=cdd_profile2gene) for f in
            os.listdir(files_path)]

    tic = time.time()
    print "Generating score matrix"
    M = scores.generate_jackard_score_matrix(loci)

    tac = time.time() - tic
    print "Elapsed time:", float(tac) / 60 / 60, float(tac) / 60, float(tac)

    tic = time.time()
    jw_file = os.path.join(work_dir, 'pickle/jw_scores.p.bz2')
    print "Dumping JW scores to:", jw_file
    t.dump_compressed_pickle(jw_file, M)
    tac = time.time() - tic
    print "Elapsed time:", float(tac) / 60 / 60, float(tac) / 60, float(tac)

    # print "Loading JW scores from:", prok1603_jw_file
    # M = t.load_compressed_pickle(prok1603_jw_file)

    tree_file = os.path.join(work_dir, 'jw_upgma.tre')
    print "Generating tree:", tree_file
    dnd.convert_score_to_newick(M, [os.path.basename(l.file_name) for l in loci], tree_file)
    print "Loading loci"

    def_file = os.path.join(gv.project_data_path, 'cas4/profiles/defenseProfiles.tab')
    profile2gene = {}

    for l in open(def_file):
        terms = l.split('\t')
        profile = terms[0]
        gene_names = terms[3].split(',')
        if len(gene_names) > 1:
            profile2gene[profile] = gene_names[1]
        else:
            profile2gene[profile] = gene_names[0]

    cdd_profile2gene = t.map_cdd_profile2gene_name()
    cdd_profile2gene.update(profile2gene)

    loci = [Locus(os.path.join(files_path, f), file_format='generic', profile2gene=cdd_profile2gene)
            for f in os.listdir(files_path)]

    locus2id = {}
    cluster2id = {}
    feature_clusters = get_feature_labels()

    print "No of feature clusters", len(feature_clusters)

    vertices, edges = [], []

    id = 1
    for locus in loci:
    print "Loading loci"

    def_file = os.path.join(gv.project_data_path, 'cas4/profiles/defenseProfiles.tab')
    profile2gene = {}

    for l in open(def_file):
        terms = l.split('\t')
        profile = terms[0]
        gene_names = terms[3].split(',')
        if len(gene_names) > 1:
            profile2gene[profile] = gene_names[1]
        else:
            profile2gene[profile] = gene_names[0]

    cdd_profile2gene = t.map_cdd_profile2gene_name()
    cdd_profile2gene.update(profile2gene)

    file2locus = {}

    for f in os.listdir(data_dir + 'files'):
        locus = Locus(os.path.join(data_dir, 'files', f),
                      file_format='generic',
                      profile2gene=cdd_profile2gene)
        file2locus[f] = locus

    locus2weight = { l.split()[0]: float(l.split()[1]) for l in open(weights_file)}
    community2nodes = parse_tree_file(infomap_file, locus2weight)

    sorted_clusters = sorted(community2nodes.values(),
                             key=lambda x: sum(locus.weight for locus in x if locus.type==2),
def merge_loci(source_path, dest_path):

    profile2gene = t.map_cdd_profile2gene_name()

    print "Loading loci"
    loci = [BasicLocus(os.path.join(source_path, f), profile2gene=profile2gene) for f in os.listdir(source_path)]

    print "Merging loci"

    cnt = 1
    while True:

        merged_out = [0]*len(loci)
        new_loci = []

        for i in range(len(loci)-1):

            if merged_out[i]:
                continue

            for j in range(i+1, len(loci)):

                if merged_out[i]:
                    continue

                if loci[i].overlaps(loci[j]):
                    loci[i].merge(loci[j])
                    merged_out[j] = 1

            new_loci.append(loci[i])

        if not merged_out[len(loci)-1]:
            new_loci.append(loci[-1])

        print "Iteration %d results. Old list: %d, New list: %d" % (cnt, len(loci), len(new_loci))
        cnt += 1

        if len(loci) == len(new_loci):
            loci = new_loci
            break

        loci = new_loci

    # with open(work_dir+'merged_loci_name.txt', 'wb') as outf:
    #
    #     [outf.write("%s\n"%_file) for locus in loci for _file in locus.merged_files]

    print "Writing merged loci to files"

    for locus in loci:

        fname = os.path.join(dest_path, locus.merged_base_files.pop())

        while os.path.exists(fname):
            print "Duplicate file name:", fname
            fname = os.path.join(dest_path, locus.merged_base_files.pop())

        header_lines=[]
        if locus.merged_base_files:
            header_line = "Merged files: %s" % ",".join(f for f in locus.merged_base_files)
            header_lines = [header_line]

        t.write_genes_to_pty(locus.genes, fname, header_lines=header_lines)
# Example #7 (0)
def prok1603_extract_dendrogram():
    """Load prok1603 merged-locus files and pickle them.

    Everything after the sys.exit() call below (score matrix, pickled/npz
    dumps, newick tree) is currently unreachable dead code.
    """

    work_dir = os.path.join(gv.project_data_path, 'UvrD/')

    files_path = os.path.join(work_dir, 'prok1603/merged_files/')

    print "Loading loci"

    # Profile ID (col 0) -> gene name (col 3) from the defense-profiles TSV.
    def_file = os.path.join(gv.project_data_path, 'cas4/profiles/defenseProfiles.tab')
    profile2gene = {}

    for l in open(def_file):
        terms = l.split('\t')
        profile = terms[0]
        gene_names = terms[3].split(',')
        # Prefer the second listed gene name when more than one is given.
        if len(gene_names) > 1:
            profile2gene[profile] = gene_names[1]
        else:
            profile2gene[profile] = gene_names[0]

    # CDD-derived names are the base mapping; the local TSV overrides them.
    cdd_profile2gene = t.map_cdd_profile2gene_name()
    cdd_profile2gene.update(profile2gene)

    loci = [BasicLocus(os.path.join(files_path, f), profile2gene=cdd_profile2gene) for f in
            os.listdir(files_path)]

    prok1603_loci_file = os.path.join(work_dir, 'prok1603/prok1603_loci.p.bz2')
    # loci = t.load_compressed_pickle(prok1603_loci_file)
    print "Loci:", len(loci)
    print "Dumping loci to:", prok1603_loci_file

    t.dump_compressed_pickle(prok1603_loci_file, loci)
    # NOTE(review): this exit makes the rest of the function dead code —
    # presumably left in while iterating on the pipeline; confirm before
    # removing.
    sys.exit()

    tic = time.time()
    print "Generating score matrix"
    M = scores.generate_jackard_score_matrix(loci)

    tac = time.time() - tic
    print "Elapsed time:", float(tac)/60/60, float(tac)/60, float(tac)

    # Dump the same matrix three ways: raw cPickle, bz2-compressed pickle,
    # and compressed npz — presumably to compare sizes/timings; confirm.
    tic = time.time()
    prok1603_jw_file = os.path.join(work_dir, 'prok1603_jw_scores.p')
    print "Dumping JW scores to:", prok1603_jw_file
    with open(prok1603_jw_file, 'wb') as outf:
        cPickle.dump(M, outf, protocol=cPickle.HIGHEST_PROTOCOL)
    tac = time.time() - tic
    print "Elapsed time:", float(tac) / 60 / 60, float(tac) / 60, float(tac)

    tic = time.time()
    prok1603_jw_file = os.path.join(work_dir, 'prok1603_jw_scores.p.bz2')
    print "Dumping JW scores to:", prok1603_jw_file
    t.dump_compressed_pickle(prok1603_jw_file, M)
    tac = time.time() - tic
    print "Elapsed time:", float(tac) / 60 / 60, float(tac) / 60, float(tac)

    # print "Loading JW scores from:", prok1603_jw_file
    # M = t.load_compressed_pickle(prok1603_jw_file)

    tic = time.time()
    prok1603_jw_file = os.path.join(work_dir, 'prok1603/prok1603_jw_scores.npz')
    # prok1603_jw_file = os.path.join('/Users/hudaiber/Projects/NewSystems/data/UvrD/prok1603/prok1603_jw_scores.npz')
    print "Dumping JW scores to:", prok1603_jw_file
    np.savez_compressed(prok1603_jw_file, M)
    tac = time.time() - tic
    print "Elapsed time:", float(tac) / 60 / 60, float(tac) / 60, float(tac)

    # print "Loading JW scores from:", prok1603_jw_file
    # M = t.load_compressed_pickle(prok1603_jw_file)

    prok1603_tree_file = os.path.join(work_dir, 'prok1603/prok1603_upgma.tre')
    print "Generating tree:", prok1603_tree_file
    dnd.convert_score_to_newick(M, [os.path.basename(l.file_name) for l in loci], prok1603_tree_file)
# Example #8 (0)
def tree_leaves():
    """Relabel leaves of the uvrd UPGMA tree with gene/weight annotations.

    For each leaf (one *.sr file per leaf in clust_tree/), tallies weighted
    system-gene combinations across the leaf's GIs, builds a new label of the
    form "<total>|Gene=weight|...|<representative org>|<leaf id>", substitutes
    it into the newick string, and writes the relabeled tree.
    """

    work_dir = os.path.join(gv.project_data_path, 'UvrD/prok1603')
    tree_dir = os.path.join(work_dir, 'clust_tree/')
    files_dir = os.path.join(work_dir, 'merged_files')

    profile2gene = t.map_cdd_profile2gene_name()

    # GI -> organism name (two whitespace-separated columns).
    gi2org = {l.split()[0]: l.rstrip().split()[1] for l in open(work_dir + '/gi_org.txt')}

    # GI (version suffix dropped) -> weight.
    gi2weight = {l.split()[0].split('.')[0]: float(l.split()[1]) for l in open(work_dir + '/prok1603_weights.txt')}

    cl2size, cl2gis, cl2weight = {}, {}, {}

    # uvrd.cls rows: <size> <cluster id> <gi> <gi> ...
    for l in open(tree_dir + 'uvrd.cls'):
        terms = l.rstrip().split()
        cl2size[terms[1]] = terms[0]
        cl2gis[terms[1]] = terms[2:]
        cl2weight[terms[1]] = sum([ gi2weight[gi] if gi in gi2weight else 0 for gi in terms[2:]])

    # Whole newick tree on a single line; labels are replaced in this string.
    tree_string = open(tree_dir + 'uvrd.up.tre').readline()

    leave_file_names = [os.path.basename(l) for l in glob.glob(tree_dir + '*.sr')]

    for leave_file_name in leave_file_names:

        leave_file_gis = [ l.split()[0] for l in open(os.path.join(tree_dir, leave_file_name))]

        system_gene_pool = []

        # system-gene combination -> summed weight across this leaf's GIs.
        sgp_count = {}

        for gi in leave_file_gis:
            system_genes = get_system_genes(gi, files_dir, profile2gene)

            if not system_genes:
                continue
            system_gene_pool.append(system_genes)

            # NOTE(review): unlike the cl2weight sum above, this indexes
            # gi2weight directly — a GI missing from the weights file would
            # raise KeyError here; confirm that cannot happen.
            t.update_dictionary(sgp_count, system_genes, gi2weight[gi])

        sorted_sgp_count = sorted(sgp_count.items(), key=lambda x: x[1], reverse=True)

        leaf_name = os.path.splitext(leave_file_name)[0]
        gene_names = sorted_sgp_count[0][0] if sorted_sgp_count else ""

        # Arbitrary organism from the leaf serves as its representative.
        representative = gi2org[random.choice(leave_file_gis)]

        total_weight = sum([v for k,v in sgp_count.items()])
        # NOTE(review): the conditional binds to the whole "%s|" % ...
        # expression, so a zero/empty total yields "-" with no trailing "|" —
        # confirm this precedence is intended.
        leaf_prefix = "%s|" % int(total_weight) if total_weight else "-"

        has_genes = False

        for _gene_name in ["Cas4", "UvrA", "UvrB", "UvrC", "SbcS", "SbcD"]:
        # for _gene_name in ["Cas4"]:
            # Substring match, case-insensitive, against the combination keys.
            _weight = sum([v for k, v in sgp_count.items() if _gene_name.lower() in k.lower()])
            if _weight:
                leaf_prefix += "%s=%d|" % (_gene_name, _weight)
                has_genes = True

        # Leaf id is the part after the first '.' of the file stem.
        if has_genes:
            new_leaf_name = leaf_prefix + representative + "|" + leaf_name.split('.')[1]
        else:
            new_leaf_name = leaf_prefix + leaf_name.split('.')[1]

        # new_leaf_name = "cas4=%s/%s|%s|%s" % (int(cas4_weight) if total_weight else "-",
        #                                  int(total_weight) if total_weight else "-",
        #                                  representative,
        #                                  leaf_name.split('.')[1])

        print leaf_name, new_leaf_name

        # ':' anchors the match to a leaf label position in the newick string.
        tree_string = tree_string.replace(leaf_name + ":", new_leaf_name + ":")

        # new_file_name = os.path.join(tree_dir, os.path.splitext(leave_file_name)[0] + '.def')
        # with open(new_file_name, 'w') as new_file:
        #
        #     for k, v in sorted_sgp_count:
        #         new_file.write("#%s\t%f\n" % (k, v))
        #
        #     new_file.write("\n")
        #
        #     [new_file.write("%s\t%s\n" % (gi, gi2org[gi])) for gi in leave_file_gis]

    with open(tree_dir + 'uvrd.up_all_genes.tree', 'w') as outf:
        outf.write(tree_string)
# Example #9 (0)
def merge_loci(source_path, dest_path):

    profile2gene = t.map_cdd_profile2gene_name()

    print "Loading loci"
    loci = [
        BasicLocus(os.path.join(source_path, f), profile2gene=profile2gene)
        for f in os.listdir(source_path)
    ]

    print "Merging loci"

    cnt = 1
    while True:

        merged_out = [0] * len(loci)
        new_loci = []

        for i in range(len(loci) - 1):

            if merged_out[i]:
                continue

            for j in range(i + 1, len(loci)):

                if merged_out[i]:
                    continue

                if loci[i].overlaps(loci[j]):
                    loci[i].merge(loci[j])
                    merged_out[j] = 1

            new_loci.append(loci[i])

        if not merged_out[len(loci) - 1]:
            new_loci.append(loci[-1])

        print "Iteration %d results. Old list: %d, New list: %d" % (
            cnt, len(loci), len(new_loci))
        cnt += 1

        if len(loci) == len(new_loci):
            loci = new_loci
            break

        loci = new_loci

    # with open(work_dir+'merged_loci_name.txt', 'wb') as outf:
    #
    #     [outf.write("%s\n"%_file) for locus in loci for _file in locus.merged_files]

    print "Writing merged loci to files"

    for locus in loci:

        fname = os.path.join(dest_path, locus.merged_base_files.pop())

        while os.path.exists(fname):
            print "Duplicate file name:", fname
            fname = os.path.join(dest_path, locus.merged_base_files.pop())

        header_lines = []
        if locus.merged_base_files:
            header_line = "Merged files: %s" % ",".join(
                f for f in locus.merged_base_files)
            header_lines = [header_line]

        t.write_genes_to_pty(locus.genes, fname, header_lines=header_lines)