Example #1
0
def get_flank_distributions(kplets_2d_list, neighborhood_path, target_profiles):

    org2weights = t.map_genome2weight()
    flanking_genes_count = []

    cog2gids = []

    gid2weight = dict()

    for kplets_list in kplets_2d_list:
        cur_flanking_genes_count = dict()

        cur_cog2gids = dict()

        for kplet in kplets_list:
            neighborhoods = [Neighborhood(os.path.join(neighborhood_path, f)) for f in kplet.files]

            for neighborhood in neighborhoods:
                for gene in neighborhood.genes:

                    gid2weight[int(gene.gid)] = org2weights[gene.organism]

                    for cogid in gene.cogid.split():
                        # if cogid in target_profiles:
                        #     continue
                        t.update_dictionary(cur_flanking_genes_count,cogid,org2weights[gene.organism])
                        t.update_dictionary_set(cur_cog2gids, cogid, set([int(gene.gid)]))

        flanking_genes_count.append(cur_flanking_genes_count)
        cog2gids.append(cur_cog2gids)

    return flanking_genes_count, cog2gids, gid2weight
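
These snippets lean on two small dictionary helpers from lib.utils.tools. A minimal sketch of what update_dictionary and update_dictionary_set appear to do, inferred purely from how they are called in these examples (the real library helpers may differ):

def update_dictionary(d, key, value):
    # Inferred from usage: numeric values are summed, list values are extended.
    if key in d:
        d[key] += value
    else:
        d[key] = value

def update_dictionary_set(d, key, values):
    # Inferred from usage: union the given values into the set stored at d[key].
    if key in d:
        d[key].update(values)
    else:
        d[key] = set(values)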
Example #2
0
def arcog_profiles_pool_into_classes_pool(profile_community):

    _arcog2class = t.map_arcog2class()
    class_community = list()

    for profiles in profile_community:
        classes = dict()
        for k in profiles:
            if k in _arcog2class:
                t.update_dictionary(classes, _arcog2class[k], profiles[k])
        class_community.append(classes)
    return class_community
Example #3
0
def arcog_profiles_pool_into_classes_pool(profile_community):

    _arcog2class = t.map_arcog2class()
    class_community = list()

    for profiles in profile_community:
        classes = dict()
        for k in profiles:
            if k in _arcog2class:
                t.update_dictionary(classes, _arcog2class[k], profiles[k])
        class_community.append(classes)
    return class_community
Example #4
0
def merge_into_file_summaries(kplets, neighborhood_files_path, file2src_src2org_map, data_type='bacteria'):

    _org2weight = t.map_genome2weight()

    _file2kplets = dict()
    for kplet in kplets:
        for f in kplet.files:
            if f in _file2kplets:
                _file2kplets[f].append(kplet)
            else:
                _file2kplets[f] = [kplet]

    kplet_files = _file2kplets.keys()
    _file2src, _src2org = file2src_src2org_map(kplet_files)

    file_summaries = list()
    for f in kplet_files:
        _neighborhood = Neighborhood(os.path.join(neighborhood_files_path, f))
        _src = _file2src[f]
        _org = _src2org[_src]
        _weight = _org2weight[_org]
        kplets = _file2kplets[f]
        _neighborhood.extend_flanks(10, os.path.join(gv.pty_data_path, _org, "%s.pty" % _src), _gid2arcog_cdd)
        file_summaries.append(NeighborhoodFileSummary(f, kplets, _neighborhood, _org, _src, _weight))

    # file_summaries = trim_file_summary_list(file_summaries, data_type)
    # file_summaries = [fs for fs in file_summaries if fs]

    # Updating the map _file2src after trimming.
    # new_file_list = [ fs.file_name for fs in file_summaries]
    # for _file_name in _file2src.keys():
    #     if _file_name not in new_file_list:
    #         del _file2src[_file_name]

    # if len(file_summaries) < 2:
    #     return None, None, None, None, None, None

    file_summaries.sort(key= lambda x: x.weight, reverse=True)

    community_count_with_flanks = {}
    community_count = {}
    _org2weight = t.map_genome2weight()

    total_weight = 0

    for i in range(len(file_summaries)):
        cur_file_summary = file_summaries[i]
        _weight = _org2weight[cur_file_summary.org]
        total_weight += _weight
        for gene in cur_file_summary.neighborhood.genes:
            if gene.tag == 'flank':
                for k in gene.cogid.split():
                    t.update_dictionary(community_count_with_flanks, k, _weight)
            else:
                for k in gene.cogid.split():
                    t.update_dictionary(community_count_with_flanks, k, _weight)
                    t.update_dictionary(community_count, k, _weight)
    community = []
    return _src2org, file_summaries, community, community_count, community_count_with_flanks, total_weight
Example #5
0
def cdd_profile_count_into_class_count(community_count):

    _cdd2class = t.map_cdd2class()
    class_count = dict()
    class2profiles = dict()

    for k in community_count:
        if k in _cdd2class:
            _classes = _cdd2class[k]
            for _class in _classes:
                t.update_dictionary(class_count, _class, community_count[k])
                t.update_dictionary(class2profiles, _class, [k])
        else:
            _class = 'Unclassified'
            t.update_dictionary(class_count, _class, community_count[k])
            t.update_dictionary(class2profiles, _class, [k])

    return class_count, class2profiles
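
A hypothetical input/output pair (profile ids and class labels invented) to show the shapes involved: community_count maps CDD profile ids to accumulated weights, and the function folds them into per-class totals plus the list of profiles behind each class.

# Assuming _cdd2class maps 'pfam00001' to ['L'] and has no entry for 'xyz999':
community_count = {'pfam00001': 2.0, 'xyz999': 0.3}
class_count, class2profiles = cdd_profile_count_into_class_count(community_count)
# class_count    -> {'L': 2.0, 'Unclassified': 0.3}
# class2profiles -> {'L': ['pfam00001'], 'Unclassified': ['xyz999']}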
Example #6
0
def cdd_profile_count_into_class_count(community_count):

    _cdd2class = t.map_cdd2class()
    class_count = dict()
    class2profiles = dict()

    for k in community_count:
        if k in _cdd2class:
            _classes = _cdd2class[k]
            for _class in _classes:
                t.update_dictionary(class_count,    _class, community_count[k])
                t.update_dictionary(class2profiles, _class, [k])
        else:
            _class = 'Unclassified'
            t.update_dictionary(class_count,    _class, community_count[k])
            t.update_dictionary(class2profiles, _class, [k])

    return class_count, class2profiles
Example #7
0
def arcog_profile_count_into_class_count(community_count):

    _arcog2class = t.map_arcog2class()
    class2count = dict()
    class2profiles = dict()

    for k in community_count:
        if k in _arcog2class:
            _classes = _arcog2class[k]
            for _class in _classes:
                t.update_dictionary(class2count, _class, community_count[k])
                # t.update_dictionary_list_value(class2profiles, _class, k)
                t.update_dictionary(class2profiles, _class, [k])
        else:
            _class = 'Unclassified'
            t.update_dictionary(class2count, _class , community_count[k])
            # t.update_dictionary_list_value(class2profiles, _class, k)
            t.update_dictionary(class2profiles, _class, [k])

    return class2count, class2profiles
Example #8
0
def arcog_profile_count_into_class_count(community_count):

    _arcog2class = t.map_arcog2class()
    class2count = dict()
    class2profiles = dict()

    for k in community_count:
        if k in _arcog2class:
            _classes = _arcog2class[k]
            for _class in _classes:
                t.update_dictionary(class2count, _class, community_count[k])
                # t.update_dictionary_list_value(class2profiles, _class, k)
                t.update_dictionary(class2profiles, _class, [k])
        else:
            _class = 'Unclassified'
            t.update_dictionary(class2count, _class, community_count[k])
            # t.update_dictionary_list_value(class2profiles, _class, k)
            t.update_dictionary(class2profiles, _class, [k])

    return class2count, class2profiles
Example #9
0
from lib.utils import tools as t

cdd_file = '/Users/hudaiber/data/CDD/all_Prok1402.ccp.csv'

gnm2weight = t.map_genome2weight()

profile2count = {}
profile2weight = {}

missing = []

for l in open(cdd_file):
    terms = l.split(',')
    org = terms[1]
    profile = terms[6]
    if org in gnm2weight:
        t.update_dictionary(profile2count, profile, 1)
        t.update_dictionary(profile2weight, profile, gnm2weight[org])
    else:
        missing.append(org)

print "Missing from weights:"
for gnm in set(missing):
    print gnm

print "Finished scanning"
with open('/Users/hudaiber/data/CDD/profile2weight.tab', 'w') as outf:
    outf.write("#Profile\tweight\tcount\n")
    for k, v in profile2weight.items():
        outf.write("%s\t%f\t%s\n" % (k, v, profile2count[k]))
print "Done"
Example #10
0
def generate_community_reports(nodes_pool,
                               reports_dir,
                               locus2weight,
                               file2locus,
                               profile2def,
                               feature_profiles_file=None):

    # if not feature_labels:
    #     local_features = True
    # else:
    #     local_features = False

    # thr_occ, thr_crisp, cluster_threshold = thresholds_pack

    summary_file = os.path.join(reports_dir, 'summary.xlsx' )

    workbook = x.Workbook(summary_file)
    worksheet = workbook.add_worksheet()

    header_format = workbook.add_format()
    header_format.set_font_size(12)
    header_format.set_bold()
    header_format.set_align('center')
    # worksheet.set_column(4,5,50)
    worksheet.write_row(0, 0, ["File name", "Size", "Effective size", "Genes"], header_format)

    print "Generating report files"
    ind = 1

    for nodes in nodes_pool:

        loci_size = len([node for node in nodes if node.type == 2])
        loci_esize = sum(node.weight for node in nodes if node.type == 2)

        # if loci_esize < 5:
        #     continue

        loci = [file2locus[node.file_name] for node in nodes if node.type == 2]

        xls_file_name = os.path.join(reports_dir, '%d.xlsx' % ind)
        loci_file_name = os.path.join(reports_dir, '%d.tab' % ind)

        with open(loci_file_name, 'w') as outf:
            loci_files = ",".join(os.path.basename(locus.file_name) for locus in loci)
            outf.write(loci_files + "\n")

        gene2cnt = {}
        profile2cnt = {}
        for locus in loci:
            weight = locus2weight[os.path.basename(locus.file_name)]
            for gene_name in locus.gene_names:
                t.update_dictionary(gene2cnt, gene_name, weight)
            for cl in locus.clusters:
                t.update_dictionary(profile2cnt, cl, weight)

        sorted_gene2count = sorted(gene2cnt.items(), key=lambda x: x[1], reverse=True)
        gene_counts = ";".join(["%s:%.2f" % (gene_name, count) for (gene_name, count) in sorted_gene2count[:10]])

        worksheet.write_row(ind + 1, 0, ['%d.xlsx' % ind,
                                         loci_size,
                                         loci_esize,
                                         gene_counts])
        args = {}

        args['xls_file_name'] = xls_file_name
        args['loci'] = loci
        args['profile_code2def'] = profile2def
        if not feature_profiles_file:
            args['feature_labels'] = [ k for k,v in profile2cnt.items() if v >= loci_esize/2 ]
        else:
            args['feature_labels'] = [l.strip() for l in open(feature_profiles_file)]

        try:
            r.write_to_xls_loci_plain(args)
        except:
            sys.exit()
        ind += 1
Example #11
0
def dull_gene_name():

    cas_gene_names = [
        l.strip() for l in open(
            os.path.join(gv.project_data_path, 'cas1402/all_gene_names.txt'))
    ]

    gene_name2gids = {gene: set() for gene in cas_gene_names}

    cnt = 0
    with open(os.path.join(gv.project_data_path,
                           'cas1402/cas1402.arrisl.lst')) as inf:

        for in_line in inf:

            if in_line.startswith("==="):
                continue

            parts = in_line.strip().split('\t')
            if len(parts) < 9:
                continue

            _gene = parts[8]

            if _gene in cas_gene_names:

                gene_name2gids[_gene].update([parts[0]])

    cdd_gid2profiles = t.map_gid2cdd()

    cas_gene2profile = {gene: {} for gene in cas_gene_names}

    for _cas_gene in cas_gene_names:
        for _gid in gene_name2gids[_cas_gene]:
            if not _gid in cdd_gid2profiles:
                # t.update_dictionary(cas_gene2profile[_cas_gene], "NA", 1)
                continue
            for _profile in cdd_gid2profiles[_gid].split():

                t.update_dictionary(cas_gene2profile[_cas_gene], _profile, 1)

    work_dir = os.path.join(gv.project_data_path, 'cas1402/crispricity/')

    with open(os.path.join(work_dir, 'gene_name2profiles.txt'), 'w') as outf:
        for _gene_name in cas_gene_names:
            for _profile in cas_gene2profile[_gene_name]:
                outf.write("%s\t%s\t%d\n" %
                           (_gene_name, _profile,
                            cas_gene2profile[_gene_name][_profile]))

    cas_related_profiles = set([
        _profile for _gene in cas_gene_names
        for _profile in cas_gene2profile[_gene].keys()
    ])

    cr_occurrence = []
    cr_crispricity = []
    ncr_occurrence = []
    ncr_crispricity = []

    for l in open(os.path.join(work_dir, 'crispricity.tab')).readlines()[1:]:
        if not l:
            continue
        parts = l.split('\t')

        if parts[0] in cas_related_profiles:
            cr_occurrence.append(parts[1])
            cr_crispricity.append(parts[2])
        else:
            ncr_occurrence.append(parts[1])
            ncr_crispricity.append(parts[2])

    cr_occurrence = np.asarray(cr_occurrence, dtype=np.float)
    cr_occurrence = np.log(cr_occurrence)
    cr_crispricity = np.asarray(cr_crispricity)

    ncr_occurrence = np.asarray(ncr_occurrence, dtype=np.float)
    ncr_occurrence = np.log(ncr_occurrence)
    ncr_crispricity = np.asarray(ncr_crispricity)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(cr_occurrence,
               cr_crispricity,
               color='r',
               s=1,
               label="Cas related")
    ax.scatter(ncr_occurrence,
               ncr_crispricity,
               color='b',
               s=1,
               label="Not Cas related")

    ax.axvline(1.6, color='g', linewidth=0.5)
    ax.axhline(0.5, color='g', linewidth=0.5)

    plt.xlabel("Effective orcurrence in CRISPR loci (log)")
    plt.ylabel("Crispricity")

    plt.legend(loc="upper left", fontsize=7)

    plt.savefig(os.path.join(work_dir, 'crispricity_log.png'))
Example #12
0
def sub_classify_by_scores_cas4(M, threshold, loci, inf_default=50):

    if not (M == np.transpose(M)).all():
        M += np.transpose(M)
        M = np.negative(np.log(M))
        np.fill_diagonal(M, 0)
        inf_idx = np.isinf(M)
        M[inf_idx] = inf_default

    M_array = ssd.squareform(M)

    Z = linkage(M_array, method='average')

    root = to_tree(Z)

    leaf_names = [os.path.basename(l.file_name) for l in loci]
    newick = tree_to_newick(root, "", root.dist, leaf_names)

    fname = gv.project_data_path+'/cas4/tmp.nw'
    outf = open(fname, 'w')
    outf.write(newick)
    outf.close()

    proc = sp.Popen(['tree_listnodes', '-o=4', fname], stdout=sp.PIPE, stderr=open(os.devnull, 'wb'))

    locus2weight = {}

    for line in proc.stdout:

        terms = line.strip().split()
        if len(terms) == 2:
            locus2weight[terms[0]] = float(terms[1])

    root = clone_graph(root)

    nodes = get_nodes(root)
    id2node = {node.id: node for node in nodes}

    leaf_ids = leaves_list(Z)

    cnt = 0
    i = 0
    total_count = 1

    pool = []

    while True:
        cur_node = id2node[leaf_ids[i]]
        parent_dist = cur_node.parent.dist

        while parent_dist < threshold:
            cur_node = cur_node.parent
            parent_dist = cur_node.parent.dist

        cur_leaf_ids = get_leaves(cur_node)

        pool.append([id for id in cur_leaf_ids])

        total_count += cur_node.count

        i += len(cur_leaf_ids)

        if i >= len(leaf_ids):
            break
        cnt += 1

    to_collapse = [l for l in pool if len(l) > 1]
    singles = [l[0] for l in pool if len(l) == 1]

    to_collapse = sorted(to_collapse, key=lambda x: len(x), reverse=True)

    gene2cnt = {}

    for locus in loci:
        try:
            weight = locus2weight[os.path.basename(locus.file_name)]
        except:
            print "Skipping:" , os.path.basename(locus.file_name)
            continue
        for gene_name in locus.gene_names:
            t.update_dictionary(gene2cnt, gene_name, weight)
            # t.update_dictionary(gene2cnt, gene_name, 1)

    return singles, to_collapse, gene2cnt
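
When M is not already symmetric, the block at the top of sub_classify_by_scores_cas4 turns the one-sided similarity matrix into a symmetric distance matrix before UPGMA clustering: scores are mirrored, converted to -log distances, the diagonal is zeroed, and infinities coming from zero scores are capped at inf_default. A toy sketch of that transform, assuming M holds similarity scores in [0, 1]:

import numpy as np

M = np.array([[0.0, 0.5, 0.0],
              [0.0, 0.0, 0.1],
              [0.0, 0.0, 0.0]])   # upper-triangular similarities (hypothetical)
M += np.transpose(M)              # mirror into a symmetric matrix
M = np.negative(np.log(M))        # similarity -> distance
np.fill_diagonal(M, 0)            # each locus is at distance 0 from itself
M[np.isinf(M)] = 50               # cap -log(0) at inf_default

ssd.squareform then packs this symmetric matrix into the condensed form expected by scipy's linkage.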
Example #13
0
def classify_by_scores_cas1402(M, threshold, loci):

    M_array = ssd.squareform(M)

    Z = linkage(M_array, method='average')

    root = to_tree(Z)
    root = clone_graph(root)

    nodes = get_nodes(root)
    id2node = {node.id: node for node in nodes}

    leaf_ids = leaves_list(Z)

    cnt = 0
    i = 0
    total_count = 1

    pool = []

    while True:
        cur_node = id2node[leaf_ids[i]]
        parent_dist = cur_node.parent.dist

        while parent_dist < threshold:
            cur_node = cur_node.parent
            parent_dist = cur_node.parent.dist

        cur_leaf_ids = get_leaves(cur_node)

        pool.append([id for id in cur_leaf_ids])

        total_count += cur_node.count

        i += len(cur_leaf_ids)

        if i >= len(leaf_ids)-1:
            break
        cnt += 1

    to_collapse = [l for l in pool if len(l) > 1]
    singles = [l[0] for l in pool if len(l) == 1]

    to_collapse = sorted(to_collapse, key=lambda x: len(x), reverse=True)

    sum_errors = []
    entropies = []
    weights = []
    to_collapse_retval = []

    cluster_ind = 0

    for cluster in to_collapse:

        cluster_ind += 1
        type2cnt = {}
        type2wgt = {}

        cluster_files = [loci[id].file_name.split('/')[-1] for id in cluster]

        cluster_weight = 0

        for _f in cluster_files:

            file_weight = gnm2weight[file2org[_f]]
            cluster_weight += file_weight

            if _f not in file2crispr_type:
                t.update_dictionary(type2cnt, "NA", 1)
                t.update_dictionary(type2wgt, "NA", file_weight)
                continue
            for _type in file2crispr_type[_f]:
                t.update_dictionary(type2cnt, _type, 1)
                t.update_dictionary(type2wgt, _type, file_weight)

        _weights = np.array(type2wgt.values(), dtype=np.float)

        sum_errors.append(np.sum(np.square(_weights - np.mean(_weights))))

        _weights /= np.sum(_weights)
        entropy = -1 * np.sum(_weights * np.log(_weights))
        entropies.append(entropy)
        weights.append(cluster_weight)

        to_collapse_retval.append((cluster, type2cnt, type2wgt, entropy))

    sum_errors = np.average(sum_errors)

    entropies = np.array(entropies)
    weights = np.array(weights)

    average_entropy = np.sum(entropies * weights) / np.sum(weights)
    sum_errors = np.sum(sum_errors * weights) / np.sum(weights)

    return singles, to_collapse_retval, sum_errors, average_entropy
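
In classify_by_scores_cas1402, each cluster's CRISPR type weights are normalized into a probability distribution and scored with Shannon entropy, and the reported average entropy is weighted by cluster weight. A small worked sketch with invented numbers:

import numpy as np

type2wgt = {'I-E': 3.0, 'I-F': 1.0}                  # hypothetical cluster
_weights = np.array(type2wgt.values(), dtype=np.float)
_weights /= np.sum(_weights)                         # probabilities 0.75 and 0.25
entropy = -1 * np.sum(_weights * np.log(_weights))   # ~0.56 nats

entropies = np.array([0.56, 0.0])                    # per-cluster entropies
weights = np.array([4.0, 2.0])                       # per-cluster total weights
average_entropy = np.sum(entropies * weights) / np.sum(weights)   # ~0.37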
Example #14
0
def sub_classify_by_scores_cas4(M, threshold, loci, inf_default=50):

    if not (M == np.transpose(M)).all():
        M += np.transpose(M)
        M = np.negative(np.log(M))
        np.fill_diagonal(M, 0)
        inf_idx = np.isinf(M)
        M[inf_idx] = inf_default

    M_array = ssd.squareform(M)

    Z = linkage(M_array, method='average')

    root = to_tree(Z)

    leaf_names = [os.path.basename(l.file_name) for l in loci]
    newick = tree_to_newick(root, "", root.dist, leaf_names)

    fname = gv.project_data_path + '/cas4/tmp.nw'
    outf = open(fname, 'w')
    outf.write(newick)
    outf.close()

    proc = sp.Popen(['tree_listnodes', '-o=4', fname],
                    stdout=sp.PIPE,
                    stderr=open(os.devnull, 'wb'))

    locus2weight = {}

    for line in proc.stdout:

        terms = line.strip().split()
        if len(terms) == 2:
            locus2weight[terms[0]] = float(terms[1])

    root = clone_graph(root)

    nodes = get_nodes(root)
    id2node = {node.id: node for node in nodes}

    leaf_ids = leaves_list(Z)

    cnt = 0
    i = 0
    total_count = 1

    pool = []

    while True:
        cur_node = id2node[leaf_ids[i]]
        parent_dist = cur_node.parent.dist

        while parent_dist < threshold:
            cur_node = cur_node.parent
            parent_dist = cur_node.parent.dist

        cur_leaf_ids = get_leaves(cur_node)

        pool.append([id for id in cur_leaf_ids])

        total_count += cur_node.count

        i += len(cur_leaf_ids)

        if i >= len(leaf_ids):
            break
        cnt += 1

    to_collapse = [l for l in pool if len(l) > 1]
    singles = [l[0] for l in pool if len(l) == 1]

    to_collapse = sorted(to_collapse, key=lambda x: len(x), reverse=True)

    gene2cnt = {}

    for locus in loci:
        try:
            weight = locus2weight[os.path.basename(locus.file_name)]
        except:
            print "Skipping:", os.path.basename(locus.file_name)
            continue
        for gene_name in locus.gene_names:
            t.update_dictionary(gene2cnt, gene_name, weight)
            # t.update_dictionary(gene2cnt, gene_name, 1)

    return singles, to_collapse, gene2cnt
Example #15
0
def classify_by_scores_cas4(M,
                            threshold,
                            loci,
                            inf_default=50,
                            locus2weight=None):

    if not (M == np.transpose(M)).all():

        M += np.transpose(M)
        M = np.negative(np.log(M))
        np.fill_diagonal(M, 0)
        inf_idx = np.isinf(M)
        M[inf_idx] = inf_default

    M_array = ssd.squareform(M)

    Z = linkage(M_array, method='average')

    root = to_tree(Z)

    leaf_names = [os.path.basename(l.file_name) for l in loci]
    newick = tree_to_newick(root, "", root.dist, leaf_names)

    fname = gv.project_data_path + '/cas4/tmp.nw'
    outf = open(fname, 'w')
    outf.write(newick)
    outf.close()

    proc = sp.Popen(['tree_listnodes', '-o=4', fname],
                    stdout=sp.PIPE,
                    stderr=open(os.devnull, 'wb'))
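    # Note: locus2weight is rebuilt below from the tree_listnodes output, replacing any value passed in.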

    locus2weight = {}

    for line in proc.stdout:

        terms = line.strip().split()
        if len(terms) == 2:
            locus2weight[terms[0]] = float(terms[1])

    root = clone_graph(root)

    nodes = get_nodes(root)
    id2node = {node.id: node for node in nodes}

    leaf_ids = leaves_list(Z)

    cnt = 0
    i = 0
    total_count = 1

    pool = []

    while True:
        cur_node = id2node[leaf_ids[i]]
        parent_dist = cur_node.parent.dist

        while parent_dist < threshold:
            cur_node = cur_node.parent
            parent_dist = cur_node.parent.dist

        cur_leaf_ids = get_leaves(cur_node)

        pool.append([id for id in cur_leaf_ids])

        total_count += cur_node.count

        i += len(cur_leaf_ids)

        # if i >= len(leaf_ids)-1:
        if i >= len(leaf_ids):
            break
        cnt += 1

    to_collapse = [l for l in pool if len(l) > 1]
    singles = [l[0] for l in pool if len(l) == 1]

    to_collapse = sorted(to_collapse, key=lambda x: len(x), reverse=True)

    entropies = []
    to_collapse_retval = []

    cluster_ind = 0

    for cluster in to_collapse:

        cluster_ind += 1
        type2cnt = {}
        gene2cnt = {}

        for pos in cluster:
            t.update_dictionary(type2cnt, loci[pos].crispr_type, 1.0)

            _fname = os.path.basename(loci[pos].file_name)
            _weight = locus2weight[_fname] if _fname in locus2weight else 1

            for _gene_name in loci[pos].gene_names:
                t.update_dictionary(gene2cnt, _gene_name, _weight)

        _values = type2cnt.values()
        _values /= np.sum(_values)
        entropy = -1 * np.sum(_values * np.log(_values))
        entropies.append(entropy)

        to_collapse_retval.append((cluster, type2cnt, entropy, gene2cnt))

    entropies = np.array(entropies)

    average_entropy = np.average(entropies)

    return singles, to_collapse_retval, average_entropy
Example #16
0
def classify_by_scores(M, threshold, loci):

    M_array = ssd.squareform(M)
    # main linkage structure for upgma
    # print "Building linkage"
    Z = linkage(M_array, method='average')

    # Z = np.load(linkage_file).items()[0][1]
    # print "plotting dendogram"
    # plot_dendrogram(Z, report_path)

    root = to_tree(Z)
    root = clone_graph(root)

    nodes = get_nodes(root)
    id2node = {node.id: node for node in nodes}

    leaf_ids = leaves_list(Z)

    cnt = 0
    i = 0
    total_count = 1

    pool = []
    # print "Starting merging"

    while True:
        cur_node = id2node[leaf_ids[i]]
        parent_dist = cur_node.parent.dist

        while parent_dist < threshold:
            cur_node = cur_node.parent
            parent_dist = cur_node.parent.dist

        cur_leaf_ids = get_leaves(cur_node)

        pool.append([id for id in cur_leaf_ids])

        total_count += cur_node.count

        i += len(cur_leaf_ids)

        if i >= len(leaf_ids)-1:
            break
        cnt += 1

    to_collapse = [l for l in pool if len(l) > 1]
    singles = [l[0] for l in pool if len(l) == 1]

    to_collapse = sorted(to_collapse, key=lambda x: sum(gnm2weight[loci[i].organism] for i in x), reverse=True)

    sum_errors = []
    entropies = []
    weights = []
    to_collapse_retval = []

    cluster_ind = 0

    for cluster in to_collapse:

        cluster_ind += 1
        type2cnt = {}
        type2wgt = {}

        cluster_files = [loci[id].file_name.split('/')[-1] for id in cluster]

        cluster_weight = 0

        for _f in cluster_files:

            file_weight = gnm2weight[file2org[_f]]
            cluster_weight += file_weight

            if _f not in file2crispr_type:
                t.update_dictionary(type2cnt, "NA", 1)
                t.update_dictionary(type2wgt, "NA", file_weight)
                continue
            for _type in file2crispr_type[_f]:
                t.update_dictionary(type2cnt, _type, 1)
                t.update_dictionary(type2wgt, _type, file_weight)

        _weights = np.array(type2wgt.values(), dtype=np.float)

        sum_errors.append(np.sum(np.square(_weights - np.mean(_weights))))

        _weights /= np.sum(_weights)
        entropy = -1 * np.sum(_weights * np.log(_weights))
        entropies.append(entropy)
        weights.append(cluster_weight)

        to_collapse_retval.append((cluster, type2cnt, type2wgt, entropy))

    sum_errors = np.average(sum_errors)

    entropies = np.array(entropies)
    weights = np.array(weights)

    average_entropy = np.sum(entropies * weights) / np.sum(weights)
    sum_errors = np.sum(sum_errors * weights) / np.sum(weights)

    return singles, to_collapse_retval, sum_errors, average_entropy
Example #17
0
def generate_cluster_reports(cluster_packs, loci, reports_dir, feature_labels, method, thresholds_pack):

    if not feature_labels:
        local_features = True
    else:
        local_features = False

    thr_occ, thr_crisp, cluster_threshold = thresholds_pack

    summary_file = os.path.join(reports_dir,
                                'summary_%s_%d_%.2f_%.2f.xls' % (method, thr_occ, thr_crisp, cluster_threshold))

    workbook = x.Workbook(summary_file)
    worksheet = workbook.add_worksheet()

    header_format = workbook.add_format()
    header_format.set_font_size(12)
    header_format.set_bold()
    header_format.set_align('center')
    worksheet.set_column(4,5,50)
    worksheet.write_row(0, 0, ["File name", "Weight", "Loci", "Entropy", "systems weight", "systems count"], header_format)

    print "Generating report files"
    ind = 0

    weights = np.zeros(len(cluster_packs))
    entropies = np.zeros(len(cluster_packs))

    for outer_i in range(len(cluster_packs)):

        (cluster, type2count, type2weight, entropy) = cluster_packs[outer_i]

        ind += 1
        cl_files = [os.path.basename(loci[i].file_name) for i in cluster]

        weight = sum([gnm2weight[file2org[file]] for file in cl_files])

        weights[outer_i] = weight
        entropies[outer_i] = entropy

        crispr_cas_types_count = " ; ".join([k+":"+str(v) for (k,v) in sorted(type2count.items(), key=itemgetter(1), reverse=True)])
        crispr_cas_types_weight = " ; ".join([k+":"+str(v) for (k,v) in sorted(type2weight.items(), key=itemgetter(1), reverse=True)])

        xls_file_name = os.path.join(reports_dir, '%d.xls' % ind)

        worksheet.write_row(ind+1, 0, ['%d.xls'%ind,
                                       weight,
                                       len(cl_files),
                                       entropy,
                                       crispr_cas_types_weight,
                                       crispr_cas_types_count,
                                       " "])

        cl_loci = sorted([loci[_i] for _i in cluster], key = lambda x: gnm2weight[x.organism], reverse=True)

        local_profile2weight = {}
        for locus in cl_loci:
            for gene in locus.genes:
                for profile in gene.cogid.split(','):
                    t.update_dictionary(local_profile2weight, profile, gnm2weight[locus.organism])

        global_profile2weight = t.map_global_cdd_profile_count()

        if local_features:
            feature_labels = [ k for k,v in local_profile2weight.items() if v/weight >= 0.5 ]

        params = {}

        params['xls_file_name']         = xls_file_name
        params['loci']                  = cl_loci
        params['weight']                = weight
        params['profile_code2def']      = profile_code2def
        params['gnm2weight']            = gnm2weight
        params['feature_labels']        = feature_labels
        params['file2crispr_type']      = file2crispr_type
        params['local_profile2weight']  = local_profile2weight
        params['global_profile2weight'] = global_profile2weight

        r.write_to_xls_generic_loci(params)

    worksheet.write_row(ind+3, 0, ['Average entropy'], header_format)
    worksheet.write_row(ind+3, 1, [np.sum(weights*entropies)/np.sum(weights)])

    worksheet.write_row(ind + 4, 0, ['Exp(Average entropy)'], header_format)
    worksheet.write_row(ind + 4, 1, [np.exp(np.sum(weights * entropies) / np.sum(weights))])
Example #18
0
def classify_loci_hierarchically(loci,
                                 threshold=5,
                                 inf_default=50,
                                 dendrogram_file=None):

    M = scores.jackard_weighted_scores(loci)

    if not (M == np.transpose(M)).all():
        M += np.transpose(M)
        M = np.negative(np.log(M))
        np.fill_diagonal(M, 0)
        inf_idx = np.isinf(M)
        M[inf_idx] = inf_default

    M_array = ssd.squareform(M)
    Z = linkage(M_array, method='average')

    if dendrogram_file:
        plot_dendrogram(Z, dendrogram_file)
        return

    root = to_tree(Z)
    locus2weight = tree_to_weights(root, loci)

    root = clone_graph(root)
    nodes = get_nodes(root)
    id2node = {node.id: node for node in nodes}
    leaf_ids = leaves_list(Z)

    cnt = 0
    i = 0
    total_count = 1

    pool = []

    while True:
        cur_node = id2node[leaf_ids[i]]
        parent_dist = cur_node.parent.dist

        while parent_dist < threshold:
            cur_node = cur_node.parent
            parent_dist = cur_node.parent.dist

        cur_leaf_ids = get_leaves(cur_node)
        pool.append([id for id in cur_leaf_ids])
        total_count += cur_node.count
        i += len(cur_leaf_ids)
        if i >= len(leaf_ids):
            break
        cnt += 1

    to_collapse = [l for l in pool if len(l) > 1]
    to_collapse = sorted(to_collapse, key=lambda x: len(x), reverse=True)
    singles = [l[0] for l in pool if len(l) == 1]

    entropies = []
    to_collapse_retval = []

    cluster_ind = 0

    for cluster in to_collapse:

        cluster_ind += 1
        type2cnt = {}
        gene2cnt = {}

        for pos in cluster:
            t.update_dictionary(type2cnt, loci[pos].crispr_type, 1.0)

            _fname = os.path.basename(loci[pos].file_name)
            _weight = locus2weight[_fname] if _fname in locus2weight else 1

            for _gene_name in loci[pos].gene_names:
                t.update_dictionary(gene2cnt, _gene_name, _weight)

        _values = type2cnt.values()
        _values /= np.sum(_values)
        entropy = -1 * np.sum(_values * np.log(_values))
        entropies.append(entropy)

        to_collapse_retval.append((cluster, type2cnt, entropy, gene2cnt))

    entropies = np.array(entropies)
    average_entropy = np.average(entropies)

    return singles, to_collapse_retval, average_entropy
Example #19
0
def classify_by_scores_cas4(M, threshold, loci, inf_default=50, locus2weight=None):

    if not (M == np.transpose(M)).all():

        M += np.transpose(M)
        M = np.negative(np.log(M))
        np.fill_diagonal(M, 0)
        inf_idx = np.isinf(M)
        M[inf_idx] = inf_default

    M_array = ssd.squareform(M)

    Z = linkage(M_array, method='average')

    root = to_tree(Z)

    leaf_names = [os.path.basename(l.file_name) for l in loci]
    newick = tree_to_newick(root, "", root.dist, leaf_names)

    fname = gv.project_data_path + '/cas4/tmp.nw'
    outf = open(fname, 'w')
    outf.write(newick)
    outf.close()

    proc = sp.Popen(['tree_listnodes', '-o=4', fname], stdout=sp.PIPE, stderr=open(os.devnull, 'wb'))
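    # Note: locus2weight is rebuilt below from the tree_listnodes output, replacing any value passed in.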

    locus2weight = {}

    for line in proc.stdout:

        terms = line.strip().split()
        if len(terms) == 2:
            locus2weight[terms[0]] = float(terms[1])

    root = clone_graph(root)

    nodes = get_nodes(root)
    id2node = {node.id: node for node in nodes}

    leaf_ids = leaves_list(Z)

    cnt = 0
    i = 0
    total_count = 1

    pool = []

    while True:
        cur_node = id2node[leaf_ids[i]]
        parent_dist = cur_node.parent.dist

        while parent_dist < threshold:
            cur_node = cur_node.parent
            parent_dist = cur_node.parent.dist

        cur_leaf_ids = get_leaves(cur_node)

        pool.append([id for id in cur_leaf_ids])

        total_count += cur_node.count

        i += len(cur_leaf_ids)

        # if i >= len(leaf_ids)-1:
        if i >= len(leaf_ids):
            break
        cnt += 1
        
    to_collapse = [l for l in pool if len(l) > 1]
    singles = [l[0] for l in pool if len(l) == 1]

    to_collapse = sorted(to_collapse, key=lambda x: len(x), reverse=True)

    entropies = []
    to_collapse_retval = []

    cluster_ind = 0

    for cluster in to_collapse:

        cluster_ind += 1
        type2cnt = {}
        gene2cnt = {}

        for pos in cluster:
            t.update_dictionary(type2cnt, loci[pos].crispr_type, 1.0)

            _fname = os.path.basename(loci[pos].file_name)
            _weight = locus2weight[_fname] if _fname in locus2weight else 1

            for _gene_name in loci[pos].gene_names:
                t.update_dictionary(gene2cnt, _gene_name, _weight)

        _values = type2cnt.values()
        _values /= np.sum(_values)
        entropy = -1 * np.sum(_values * np.log(_values))
        entropies.append(entropy)

        to_collapse_retval.append((cluster, type2cnt, entropy, gene2cnt))

    entropies = np.array(entropies)

    average_entropy = np.average(entropies)

    return singles, to_collapse_retval, average_entropy
Example #20
0
def kplet_list_to_file_summaries(kplets,
                                 neighborhood_files_path,
                                 filter_weak_hits=True):

    file_summaries = list()
    organisms = set()

    _file2kplets = dict()
    _kplet2count_af = dict()  # kplet2count after filtration
    _kplet2count_bf = dict()  # kplet2count before filtration

    _profile2count_bf = dict()
    _profile2count_af = dict()
    _cas_type2count = dict()
    # filter_size = 5

    for kplet in kplets:
        for f in kplet.files:
            t.update_dictionary(_file2kplets, f, [kplet])

    initial_length = len(_file2kplets)

    for f in _file2kplets.keys():
        [
            t.update_dictionary(_kplet2count_bf, kplet.id, 1)
            for kplet in _file2kplets[f]
        ]
    del f

    # if filter_weak_hits:
    #     _file2kplets = {k: v for (k,v) in _file2kplets.items() if len(v) > filter_size}

    if len(_file2kplets) < 2: return None

    _file2genes = {
        f: dt.get_pty_file_generic(os.path.join(neighborhood_files_path, f))
        for f in _file2kplets.keys()
    }
    _files = set(_file2kplets.keys())

    for _f in _files:

        _genes = _file2genes[_f]
        _src = _genes[0].src
        _org = _genes[0].organism

        organisms.update([_org])
        _nfs = NeighborhoodFileSummary(_f, _file2kplets[_f], _genes, _org,
                                       _src, _org2weight[_org])

        for _gene in _genes:
            if _gene.gid in _gi2castype:
                _nfs.cas_type = ";".join(_gi2castype[_gene.gid])
                for _cas_type in _gi2castype[_gene.gid]:
                    t.update_dictionary(_cas_type2count, _cas_type, 1)
                break

        [
            t.update_dictionary(_kplet2count_af, kplet.id, 1)
            for kplet in _file2kplets[_f]
        ]

        for _gene in _genes:
            for _c in _gene.cogid.split(','):
                t.update_dictionary(_profile2count_af, _c, 1)

        file_summaries.append(_nfs)
    # file_summaries = [fs for fs in file_summaries if len(fs.kplets)>1]
    # _files = [fs.file_name for fs in file_summaries]

    # for _f in _files:
    # [t.update_dictionary(_kplet2count_af, kplet.id, 1) for kplet in _file2kplets[_f]]
    #
    # _gene_list = _file2genes[_f]
    # for _gene in _gene_list:
    #     for _c in _gene.cogid.split(','):
    #         t.update_dictionary(_profile2count_af, _c, 1)

    file_summaries.sort(key=lambda x: x.org)
    retval = GenericMergingKplets2FsOutput()
    retval.file_summaries = file_summaries
    retval.organisms = organisms
    retval.profile2count_bf = _profile2count_bf
    retval.profile2count_af = _profile2count_af
    retval.kplet2count_af = _kplet2count_af
    retval.kplet2count_bf = _kplet2count_bf
    retval.weight = sum(fs.weight for fs in file_summaries)
    retval.cas_type2count = _cas_type2count

    return retval
Example #21
0
def classify_loci_hierarchically(loci, threshold=5, inf_default=50, dendrogram_file=None):

    M = scores.jackard_weighted_scores(loci)

    if not (M == np.transpose(M)).all():
        M += np.transpose(M)
        M = np.negative(np.log(M))
        np.fill_diagonal(M, 0)
        inf_idx = np.isinf(M)
        M[inf_idx] = inf_default

    M_array = ssd.squareform(M)
    Z = linkage(M_array, method='average')

    if dendrogram_file:
        plot_dendrogram(Z, dendrogram_file)
        return

    root = to_tree(Z)
    locus2weight = tree_to_weights(root, loci)

    root = clone_graph(root)
    nodes = get_nodes(root)
    id2node = {node.id: node for node in nodes}
    leaf_ids = leaves_list(Z)

    cnt = 0
    i = 0
    total_count = 1

    pool = []

    while True:
        cur_node = id2node[leaf_ids[i]]
        parent_dist = cur_node.parent.dist

        while parent_dist < threshold:
            cur_node = cur_node.parent
            parent_dist = cur_node.parent.dist

        cur_leaf_ids = get_leaves(cur_node)
        pool.append([id for id in cur_leaf_ids])
        total_count += cur_node.count
        i += len(cur_leaf_ids)
        if i >= len(leaf_ids):
            break
        cnt += 1

    to_collapse = [l for l in pool if len(l) > 1]
    to_collapse = sorted(to_collapse, key=lambda x: len(x), reverse=True)
    singles = [l[0] for l in pool if len(l) == 1]

    entropies = []
    to_collapse_retval = []

    cluster_ind = 0

    for cluster in to_collapse:

        cluster_ind += 1
        type2cnt = {}
        gene2cnt = {}

        for pos in cluster:
            t.update_dictionary(type2cnt, loci[pos].crispr_type, 1.0)

            _fname = os.path.basename(loci[pos].file_name)
            _weight = locus2weight[_fname] if _fname in locus2weight else 1

            for _gene_name in loci[pos].gene_names:
                t.update_dictionary(gene2cnt, _gene_name, _weight)

        _values = type2cnt.values()
        _values /= np.sum(_values)
        entropy = -1 * np.sum(_values * np.log(_values))
        entropies.append(entropy)

        to_collapse_retval.append((cluster, type2cnt, entropy, gene2cnt))

    entropies = np.array(entropies)
    average_entropy = np.average(entropies)

    return singles, to_collapse_retval, average_entropy
Example #22
0
from lib.utils import tools as t

cdd_file = '/Users/hudaiber/data/CDD/all_Prok1402.ccp.csv'

gnm2weight = t.map_genome2weight()

profile2count = {}
profile2weight = {}

missing = []

for l in open(cdd_file):
    terms = l.split(',')
    org = terms[1]
    profile = terms[6]
    if org in gnm2weight:
        t.update_dictionary(profile2count, profile, 1)
        t.update_dictionary(profile2weight, profile, gnm2weight[org])
    else:
        missing.append(org)

print "Missing from weights:"
for gnm in set(missing):
    print gnm

print "Finished scanning"
with open('/Users/hudaiber/data/CDD/profile2weight.tab','w') as outf:
    outf.write("#Profile\tweight\tcount\n")
    for k,v in profile2weight.items():
        outf.write("%s\t%f\t%s\n"%(k,v, profile2count[k]))
print "Done"
Example #23
0
def generate_jw_cluster_reports(cluster_packs, loci, reports_dir, threshold):

    # if not feature_labels:
    #     local_features = True
    # else:
    #     local_features = False

    # thr_occ, thr_crisp, cluster_threshold = thresholds_pack

    summary_file = os.path.join(reports_dir,
                                'summary_jw_%.2f.xls' % threshold)

    workbook = x.Workbook(summary_file)
    worksheet = workbook.add_worksheet()

    header_format = workbook.add_format()
    header_format.set_font_size(12)
    header_format.set_bold()
    header_format.set_align('center')
    worksheet.set_column(4,5,50)
    worksheet.write_row(0, 0, ["File name", "Weight", "Loci", "Entropy", "systems weight", "systems count"], header_format)

    print "Generating report files"
    ind = 0

    weights = np.zeros(len(cluster_packs))
    entropies = np.zeros(len(cluster_packs))

    for outer_i in range(len(cluster_packs)):

        (cluster, type2count, type2weight, entropy) = cluster_packs[outer_i]

        ind += 1
        cl_files = [os.path.basename(loci[i].file_name) for i in cluster]

        weight = sum([gnm2weight[file2org[file]] for file in cl_files])

        weights[outer_i] = weight
        entropies[outer_i] = entropy

        crispr_cas_types_count = " ; ".join([k+":"+str(v) for (k,v) in sorted(type2count.items(), key=itemgetter(1), reverse=True)])
        crispr_cas_types_weight = " ; ".join([k+":"+str(v) for (k,v) in sorted(type2weight.items(), key=itemgetter(1), reverse=True)])

        xls_file_name = os.path.join(reports_dir, '%d.xls' % ind)

        worksheet.write_row(ind+1, 0, ['%d.xls'%ind,
                                       weight,
                                       len(cl_files),
                                       entropy,
                                       crispr_cas_types_weight,
                                       crispr_cas_types_count,
                                       " "])

        cl_loci = sorted([loci[_i] for _i in cluster], key = lambda x: gnm2weight[x.organism], reverse=True)

        local_profile2weight = {}
        for locus in cl_loci:
            for gene in locus.genes:
                for profile in gene.cogid.split(','):
                    t.update_dictionary(local_profile2weight, profile, gnm2weight[locus.organism])

        global_profile2weight = t.map_global_cdd_profile_count()

        # if local_features:
        #     feature_labels = [ k for k,v in local_profile2weight.items() if v/weight >= 0.5 ]

        params = {}

        params['xls_file_name']         = xls_file_name
        params['loci']                  = cl_loci
        params['weight']                = weight
        params['profile_code2def']      = profile_code2def
        params['gnm2weight']            = gnm2weight
        # params['feature_labels']        = feature_labels
        params['feature_labels']        = []
        params['file2crispr_type']      = file2crispr_type
        params['local_profile2weight']  = local_profile2weight
        params['global_profile2weight'] = global_profile2weight

        r.write_to_xls_generic_loci(params)

    worksheet.write_row(ind+3, 0, ['Average entropy'], header_format)
    worksheet.write_row(ind+3, 1, [np.sum(weights*entropies)/np.sum(weights)])

    worksheet.write_row(ind + 4, 0, ['Exp(Average entropy)'], header_format)
    worksheet.write_row(ind + 4, 1, [np.exp(np.sum(weights * entropies) / np.sum(weights))])
Example #24
0
def kplet_list_to_file_summaries(kplets, neighborhood_files_path, filter_weak_hits=True, dataset=None):

    file_summaries = list()
    organisms = set()
    _crispr_type2files = dict()
    _file2kplets = dict()
    _kplet2count_af = dict() # kplet2count after filtration
    _kplet2count_bf = dict() # kplet2count before filtration

    _profile2count_bf = dict()
    _profile2count_af = dict()

    filter_size = 5

    singletons = get_singleton_loci(dataset)
    clusters = get_clustered_loci(dataset)

    for kplet in kplets:
        for f in kplet.files:
            t.update_dictionary(_file2kplets, f, [kplet])

    initial_length = len(_file2kplets)

    for f in _file2kplets.keys():
        [t.update_dictionary(_kplet2count_bf, kplet.id, 1) for kplet in _file2kplets[f]]
    del f

    kplet_ids = [k.id for k in kplets]

    if filter_weak_hits:
        _file2kplets = {k: v for (k,v) in _file2kplets.items() if len(v) > filter_size}

    if len(_file2kplets) < 2: return None

    _file2genes = {f: dt.get_wgs_file(os.path.join(neighborhood_files_path, f)) for f in _file2kplets.keys()}
    _files = set(_file2kplets.keys())

    for _gene_list in _file2genes.values():
        for _gene in _gene_list:
            for _c in _gene.cogid.split(','):
                t.update_dictionary(_profile2count_bf, _c, 1)
    del _gene_list, _gene, _c

    while _files:
        _f = _files.pop()
        if _f in singletons:
            _genes = _file2genes[_f]
            _src = _genes[0].src
            _org = _genes[0].organism
            _crispr_type = _genes[0].crispr_type
            t.update_dictionary_set(_crispr_type2files, _crispr_type, _f)

            file_summaries.append(WGSNeighborhoodFileSummary(_f, _file2kplets[_f], _genes, _org, _src, 'singleton'))
            organisms.update(set([_org]))

        else:
            _cluster = None
            for cl in clusters:
                if _f in cl.files:
                    _cluster = cl
                    break
            if not _cluster:
                continue
            del cl

            _cl_files = _cluster.files.intersection(_files)
            _representative = _f
            del _f

            for _cl_file in _cl_files:
                if len(_file2genes[_cl_file]) > len(_file2genes[_representative]):
                    _representative = _cl_file

            _genes = _file2genes[_representative]
            _src = _genes[0].src
            _org = _genes[0].organism
            _crispr_type = _genes[0].crispr_type
            t.update_dictionary_set(_crispr_type2files, _crispr_type, _representative)

            _file_summary = WGSNeighborhoodFileSummary(_representative, _file2kplets[_representative], _genes, _org,
                                                       _src, _cluster)
            _file_summary.cluster_local_count = len(_cl_files)+1

            file_summaries.append(_file_summary)
            organisms.update(set([_org]))

            _files = _files.difference(_cl_files)

    file_summaries = [fs for fs in file_summaries if len(fs.kplets)>1]

    _files = [fs.file_name for fs in file_summaries]
    for _f in _files:
        [t.update_dictionary(_kplet2count_af, kplet.id, 1) for kplet in _file2kplets[_f]]

        _gene_list = _file2genes[_f]
        for _gene in _gene_list:
            for _c in _gene.cogid.split(','):
                t.update_dictionary(_profile2count_af, _c, 1)

    file_summaries.sort(key=lambda x: x.org)
    retval = CrisprMergingKplets2FsOutput()
    retval.file_summaries = file_summaries
    retval.organisms = organisms
    retval.crispr_type2files = _crispr_type2files
    retval.kplet2count_af = _kplet2count_af
    retval.kplet2count_bf = _kplet2count_bf
    retval.initial_length = initial_length
    retval.kplets = kplets
    retval.profile2count_bf = _profile2count_bf
    retval.profile2count_af = _profile2count_af

    return retval
Example #25
0
def merge_into_file_summaries(kplets,
                              neighborhood_files_path,
                              file2src_src2org_map,
                              data_type='bacteria'):

    _org2weight = t.map_genome2weight()

    _file2kplets = dict()
    for kplet in kplets:
        for f in kplet.files:
            if f in _file2kplets:
                _file2kplets[f].append(kplet)
            else:
                _file2kplets[f] = [kplet]

    kplet_files = _file2kplets.keys()
    _file2src, _src2org = file2src_src2org_map(kplet_files)

    file_summaries = list()
    for f in kplet_files:
        _neighborhood = Neighborhood(os.path.join(neighborhood_files_path, f))
        _src = _file2src[f]
        _org = _src2org[_src]
        _weight = _org2weight[_org]
        kplets = _file2kplets[f]
        _neighborhood.extend_flanks(
            10, os.path.join(gv.pty_data_path, _org, "%s.pty" % _src),
            _gid2arcog_cdd)
        file_summaries.append(
            NeighborhoodFileSummary(f, kplets, _neighborhood, _org, _src,
                                    _weight))

    # file_summaries = trim_file_summary_list(file_summaries, data_type)
    # file_summaries = [fs for fs in file_summaries if fs]

    # Updating the map _file2src after trimming.
    # new_file_list = [ fs.file_name for fs in file_summaries]
    # for _file_name in _file2src.keys():
    #     if _file_name not in new_file_list:
    #         del _file2src[_file_name]

    # if len(file_summaries) < 2:
    #     return None, None, None, None, None, None

    file_summaries.sort(key=lambda x: x.weight, reverse=True)

    community_count_with_flanks = {}
    community_count = {}
    _org2weight = t.map_genome2weight()

    total_weight = 0

    for i in range(len(file_summaries)):
        cur_file_summary = file_summaries[i]
        _weight = _org2weight[cur_file_summary.org]
        total_weight += _weight
        for gene in cur_file_summary.neighborhood.genes:
            if gene.tag == 'flank':
                for k in gene.cogid.split():
                    t.update_dictionary(community_count_with_flanks, k,
                                        _weight)
            else:
                for k in gene.cogid.split():
                    t.update_dictionary(community_count_with_flanks, k,
                                        _weight)
                    t.update_dictionary(community_count, k, _weight)
    community = []
    return _src2org, file_summaries, community, community_count, community_count_with_flanks, total_weight
Example #26
0
def tree_leaves():

    work_dir = os.path.join(gv.project_data_path, 'UvrD/prok1603')
    tree_dir = os.path.join(work_dir, 'clust_tree/')
    files_dir = os.path.join(work_dir, 'merged_files')

    profile2gene = t.map_cdd_profile2gene_name()

    gi2org = {l.split()[0]: l.rstrip().split()[1] for l in open(work_dir + '/gi_org.txt')}

    gi2weight = {l.split()[0].split('.')[0]: float(l.split()[1]) for l in open(work_dir + '/prok1603_weights.txt')}

    cl2size, cl2gis, cl2weight = {}, {}, {}

    for l in open(tree_dir + 'uvrd.cls'):
        terms = l.rstrip().split()
        cl2size[terms[1]] = terms[0]
        cl2gis[terms[1]] = terms[2:]
        cl2weight[terms[1]] = sum([ gi2weight[gi] if gi in gi2weight else 0 for gi in terms[2:]])

    tree_string = open(tree_dir + 'uvrd.up.tre').readline()

    leave_file_names = [os.path.basename(l) for l in glob.glob(tree_dir + '*.sr')]

    for leave_file_name in leave_file_names:

        leave_file_gis = [ l.split()[0] for l in open(os.path.join(tree_dir, leave_file_name))]

        system_gene_pool = []

        sgp_count = {}

        for gi in leave_file_gis:
            system_genes = get_system_genes(gi, files_dir, profile2gene)

            if not system_genes:
                continue
            system_gene_pool.append(system_genes)

            t.update_dictionary(sgp_count, system_genes, gi2weight[gi])

        sorted_sgp_count = sorted(sgp_count.items(), key=lambda x: x[1], reverse=True)

        leaf_name = os.path.splitext(leave_file_name)[0]
        gene_names = sorted_sgp_count[0][0] if sorted_sgp_count else ""

        representative = gi2org[random.choice(leave_file_gis)]

        total_weight = sum([v for k,v in sgp_count.items()])
        leaf_prefix = "%s|" % int(total_weight) if total_weight else "-"

        has_genes = False

        for _gene_name in ["Cas4", "UvrA", "UvrB", "UvrC", "SbcS", "SbcD"]:
        # for _gene_name in ["Cas4"]:
            _weight = sum([v for k, v in sgp_count.items() if _gene_name.lower() in k.lower()])
            if _weight:
                leaf_prefix += "%s=%d|" % (_gene_name, _weight)
                has_genes = True

        if has_genes:
            new_leaf_name = leaf_prefix + representative + "|" + leaf_name.split('.')[1]
        else:
            new_leaf_name = leaf_prefix + leaf_name.split('.')[1]

        # new_leaf_name = "cas4=%s/%s|%s|%s" % (int(cas4_weight) if total_weight else "-",
        #                                  int(total_weight) if total_weight else "-",
        #                                  representative,
        #                                  leaf_name.split('.')[1])

        print leaf_name, new_leaf_name

        tree_string = tree_string.replace(leaf_name + ":", new_leaf_name + ":")

        # new_file_name = os.path.join(tree_dir, os.path.splitext(leave_file_name)[0] + '.def')
        # with open(new_file_name, 'w') as new_file:
        #
        #     for k, v in sorted_sgp_count:
        #         new_file.write("#%s\t%f\n" % (k, v))
        #
        #     new_file.write("\n")
        #
        #     [new_file.write("%s\t%s\n" % (gi, gi2org[gi])) for gi in leave_file_gis]

    with open(tree_dir + 'uvrd.up_all_genes.tree', 'w') as outf:
        outf.write(tree_string)
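
For illustration, the leaf prefix is assembled from the total effective weight and the per-gene weights that were found; a hypothetical case (numbers and system-gene pools invented):

total_weight = 42.0
sgp_count = {'Cas4, UvrA': 10.0, 'RecB': 32.0}
leaf_prefix = "%s|" % int(total_weight) if total_weight else "-"
for _gene_name in ["Cas4", "UvrA", "UvrB", "UvrC", "SbcS", "SbcD"]:
    _weight = sum([v for k, v in sgp_count.items() if _gene_name.lower() in k.lower()])
    if _weight:
        leaf_prefix += "%s=%d|" % (_gene_name, _weight)
print leaf_prefix    # -> "42|Cas4=10|UvrA=10|", which is then prepended to the leaf name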
Example #27
0
def generate_community_reports(nodes_pool,
                               reports_dir,
                               locus2weight,
                               file2locus,
                               profile2def,
                               feature_profiles_file=None):

    # if not feature_labels:
    #     local_features = True
    # else:
    #     local_features = False

    # thr_occ, thr_crisp, cluster_threshold = thresholds_pack

    summary_file = os.path.join(reports_dir, 'summary.xlsx')

    workbook = x.Workbook(summary_file)
    worksheet = workbook.add_worksheet()

    header_format = workbook.add_format()
    header_format.set_font_size(12)
    header_format.set_bold()
    header_format.set_align('center')
    # worksheet.set_column(4,5,50)
    worksheet.write_row(0, 0, ["File name", "Size", "Effective size", "Genes"],
                        header_format)

    print "Generating report files"
    ind = 1

    for nodes in nodes_pool:

        loci_size = len([node for node in nodes if node.type == 2])
        loci_esize = sum(node.weight for node in nodes if node.type == 2)

        # if loci_esize < 5:
        #     continue

        loci = [file2locus[node.file_name] for node in nodes if node.type == 2]

        xls_file_name = os.path.join(reports_dir, '%d.xlsx' % ind)
        loci_file_name = os.path.join(reports_dir, '%d.tab' % ind)

        with open(loci_file_name, 'w') as outf:
            loci_files = ",".join(
                os.path.basename(locus.file_name) for locus in loci)
            outf.write(loci_files + "\n")

        gene2cnt = {}
        profile2cnt = {}
        for locus in loci:
            weight = locus2weight[os.path.basename(locus.file_name)]
            for gene_name in locus.gene_names:
                t.update_dictionary(gene2cnt, gene_name, weight)
            for cl in locus.clusters:
                t.update_dictionary(profile2cnt, cl, weight)

        sorted_gene2count = sorted(gene2cnt.items(),
                                   key=lambda x: x[1],
                                   reverse=True)
        gene_counts = ";".join([
            "%s:%.2f" % (gene_name, count)
            for (gene_name, count) in sorted_gene2count[:10]
        ])

        worksheet.write_row(
            ind + 1, 0, ['%d.xlsx' % ind, loci_size, loci_esize, gene_counts])
        args = {}

        args['xls_file_name'] = xls_file_name
        args['loci'] = loci
        args['profile_code2def'] = profile2def
        if not feature_profiles_file:
            args['feature_labels'] = [
                k for k, v in profile2cnt.items() if v >= loci_esize / 2
            ]
        else:
            args['feature_labels'] = [
                l.strip() for l in open(feature_profiles_file)
            ]

        try:
            r.write_to_xls_loci_plain(args)
        except Exception as e:
            # Abort the run if a per-community report cannot be written
            print("Failed to write %s: %s" % (xls_file_name, e))
            sys.exit(1)
        ind += 1
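A minimal sketch of the accumulator helper assumed throughout these examples; the real implementation lives in module t, and the signature below is inferred from usage rather than taken from the source:

def update_dictionary(d, key, value):
    """Accumulate a numeric value under key, starting from zero for new keys."""
    if key in d:
        d[key] += value
    else:
        d[key] = value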
Example #28
0
def dull_gene_name():

    cas_gene_names = [l.strip() for l in open(os.path.join(gv.project_data_path,'cas1402/all_gene_names.txt'))]

    gene_name2gids = { gene:set() for gene in cas_gene_names }

    cnt = 0
    with open(os.path.join(gv.project_data_path,'cas1402/cas1402.arrisl.lst')) as inf:

        for in_line in inf:

            if in_line.startswith("==="):
                continue

            parts = in_line.strip().split('\t')
            if len(parts) < 9:
                continue

            _gene = parts[8]

            if _gene in cas_gene_names:

                gene_name2gids[_gene].update([parts[0]])

    cdd_gid2profiles = t.map_gid2cdd()

    cas_gene2profile = { gene:{} for gene in cas_gene_names }

    for _cas_gene in cas_gene_names:
        for _gid in gene_name2gids[_cas_gene]:
            if _gid not in cdd_gid2profiles:
                # t.update_dictionary(cas_gene2profile[_cas_gene], "NA", 1)
                continue
            for _profile in cdd_gid2profiles[_gid].split():

                t.update_dictionary(cas_gene2profile[_cas_gene], _profile, 1)

    work_dir = os.path.join(gv.project_data_path,'cas1402/crispricity/')

    with open(os.path.join(work_dir, 'gene_name2profiles.txt'), 'w') as outf:
        for _gene_name in cas_gene_names:
            for _profile in cas_gene2profile[_gene_name]:
                outf.write("%s\t%s\t%d\n" % (_gene_name, _profile, cas_gene2profile[_gene_name][_profile]))

    cas_related_profiles = set([_profile for _gene in cas_gene_names for _profile in cas_gene2profile[_gene].keys()])

    cr_occurrence = []
    cr_crispricity = []
    ncr_occurrence = []
    ncr_crispricity = []

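    # crispricity.tab: tab-separated with a header row (skipped below); the
    # columns are assumed to be profile, effective occurrence, crispricity.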
    for l in open(os.path.join(work_dir, 'crispricity.tab')).readlines()[1:]:
        if not l.strip():
            continue
        parts = l.strip().split('\t')

        if parts[0] in cas_related_profiles:
            cr_occurrence.append(parts[1])
            cr_crispricity.append(parts[2])
        else:
            ncr_occurrence.append(parts[1])
            ncr_crispricity.append(parts[2])

    cr_occurrence = np.asarray(cr_occurrence, dtype=float)
    cr_occurrence = np.log(cr_occurrence)
    cr_crispricity = np.asarray(cr_crispricity, dtype=float)

    ncr_occurrence = np.asarray(ncr_occurrence, dtype=float)
    ncr_occurrence = np.log(ncr_occurrence)
    ncr_crispricity = np.asarray(ncr_crispricity, dtype=float)

    plt.ioff()
    fig, ax = plt.subplots()
    ax.scatter(cr_occurrence, cr_crispricity, color='r', s=1, label="Cas related")
    ax.scatter(ncr_occurrence, ncr_crispricity, color='b', s=1, label="Not Cas related")

    ax.axvline(1.6, color='g', linewidth=0.5)
    ax.axhline(0.5, color='g', linewidth=0.5)

    plt.xlabel("Effective orcurrence in CRISPR loci (log)")
    plt.ylabel("Crispricity")

    plt.legend(loc="upper left", fontsize=7)

    plt.savefig(os.path.join(work_dir, 'crispricity_log.png'))
Example #29
0
def extract_all_duplets_from_prok1402():
    """
    Extraction adjacent duplets is done by means of recording them in the dictionary pair2weight

    The overall abundance of profiles is also needed. It's recorded in profile2weight
    """
    pty_path = "/panfs/pan1/patternquest/data/Pty/genomes/"
    work_dir = os.path.join(data_path, 'prok1402/graph/graph_files/')

    print("Loading dictionaries")
    gi2profiles = t.map_gi2profiles()
    genome2weight = t.map_genome2weight()
    pair2weight = defaultdict(float)
    pair2count = defaultdict(int)
    profile2weight = defaultdict(float)

    print("Reading Prok1402")
    for root, dirs, files in os.walk(pty_path):
        for f in files:

            if not f.endswith(".pty"):
                continue

            file_name = os.path.join(root, f)
            genome = os.path.basename(root)

            genes = t.parse_pty_file(file_name)
            for gene in genes:
                gene.profiles = gi2profiles[gene.gid]

                for profile in gene.profiles:
                    t.update_dictionary(profile2weight, profile,
                                        genome2weight[genome])

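            # Seed with the first gene: record within-gene (multi-domain) profile
            # pairs before walking adjacent gene pairs in the loop below.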
            previous_profiles = genes[0].profiles

            if len(previous_profiles) > 1:
                domain_duplets = list(combinations(previous_profiles, 2))

                for duplet in domain_duplets:
                    [kplet_1, kplet_2] = sorted(duplet)
                    key = "%s-%s" % (kplet_1, kplet_2)
                    t.update_dictionary(pair2weight, key,
                                        genome2weight[genome])
                    t.update_dictionary(pair2count, key, 1)

            for gene in genes[1:]:
                cur_profiles = gene.profiles

                if not previous_profiles:
                    previous_profiles = cur_profiles
                    continue

                if len(cur_profiles) > 1:
                    domain_duplets = list(combinations(previous_profiles, 2))

                    for duplet in domain_duplets:
                        [kplet_1, kplet_2] = sorted(duplet)
                        key = "%s-%s" % (kplet_1, kplet_2)
                        t.update_dictionary(pair2weight, key,
                                            genome2weight[genome])
                        t.update_dictionary(pair2count, key, 1)

                adjacent_duplets = list(
                    product(previous_profiles, cur_profiles))

                for duplet in adjacent_duplets:
                    [kplet_1, kplet_2] = sorted(duplet)
                    key = "%s-%s" % (kplet_1, kplet_2)
                    t.update_dictionary(pair2weight, key,
                                        genome2weight[genome])
                    t.update_dictionary(pair2count, key, 1)

                previous_profiles = cur_profiles

    print("Writing to files")
    with open(os.path.join(work_dir, "prok1402_adj_duplets_weights.txt"),
              "w") as outf:

        for (key, weight) in sorted(pair2weight.items(),
                                    key=lambda x: x[1],
                                    reverse=True):
            [kplet_1, kplet_2] = key.split("-")
            outf.write("%s\t%s\t%f\n" % (kplet_1, kplet_2, weight))

    with open(os.path.join(work_dir, "prok1402_profile_abundance.txt"),
              "w") as outf:
        for (profile, weight) in sorted(profile2weight.items(),
                                        key=lambda x: x[1],
                                        reverse=True):
            outf.write("%s\t%f\n" % (profile, weight))