Example #1
0
def get_flank_distributions(kplets_2d_list, neighborhood_path, target_profiles):
    """Tally weighted profile counts and gene ids for each group of kplets.

    Every inner list of ``kplets_2d_list`` is processed independently: the
    neighborhood files of each kplet are loaded from ``neighborhood_path``
    and each gene's whitespace-separated ``cogid`` entries are accumulated
    with the weight that ``t.map_genome2weight()`` assigns to the gene's
    organism.

    ``target_profiles`` is accepted but not used; the filter that would
    consult it is disabled.

    Returns:
        A tuple ``(flanking_genes_count, cog2gids, gid2weight)`` where the
        first two are lists holding one dict per inner kplet list (filled
        through ``t.update_dictionary`` / ``t.update_dictionary_set``) and
        ``gid2weight`` maps ``int(gene.gid)`` to the organism weight across
        all groups.
    """
    genome_weights = t.map_genome2weight()

    counts_per_group = []
    gids_per_group = []
    gid2weight = {}

    for kplet_group in kplets_2d_list:
        profile_counts = {}
        profile_gids = {}

        for kplet in kplet_group:
            # Load every neighborhood of the kplet up front, then walk them.
            loaded = [
                Neighborhood(os.path.join(neighborhood_path, file_name))
                for file_name in kplet.files
            ]

            for loaded_neighborhood in loaded:
                for gene in loaded_neighborhood.genes:
                    weight = genome_weights[gene.organism]
                    gid = int(gene.gid)
                    gid2weight[gid] = weight

                    for profile in gene.cogid.split():
                        t.update_dictionary(profile_counts, profile, weight)
                        # A fresh one-element set is handed over on every call.
                        t.update_dictionary_set(profile_gids, profile, {gid})

        counts_per_group.append(profile_counts)
        gids_per_group.append(profile_gids)

    return counts_per_group, gids_per_group, gid2weight
Example #2
0
def search_kplet_in_genomes(kplet_codes, target_profiles, max_dist=20, block_size=4):
    """Scan every genome source file for gene blocks matching a kplet.

    For each organism directory under ``gv.pty_data_path`` and each source
    file in it, genes are read with ``dt.get_pty`` and annotated in place:
    ``gene.cogid`` becomes the set of profiles listed for ``gene.gid`` in the
    module-level ``gid2cdd`` mapping (empty set when the gid is absent).

    Genes carrying at least one of ``kplet_codes`` are "hits". A hit joins
    the current block, together with all genes since the previous hit, when
    ``index - (previous_hit_index + 1) < max_dist``; otherwise the current
    block is closed and a new one starts at the hit.

    A block is kept only if it contains at least ``block_size`` distinct
    kplet codes and none of its genes carries a profile from
    ``target_profiles``.

    Returns:
        ``(org2src, src2blocks)``: ``org2src`` is filled through
        ``t.update_dictionary_set(org2src, organism, source)`` and
        ``src2blocks`` maps a source file name to its list of kept blocks.
    """
    org2src = {}
    src2blocks = {}

    for organism in os.listdir(gv.pty_data_path):
        organism_dir = os.path.join(gv.pty_data_path, organism)

        for source in os.listdir(organism_dir):
            genes = dt.get_pty(os.path.join(organism_dir, source))

            candidate_blocks = []
            open_block = []
            # Index just past the most recent hit; None until the first hit.
            next_start = None

            for position, gene in enumerate(genes):
                if gene.gid in gid2cdd:
                    profiles = set(gid2cdd[gene.gid].split())
                else:
                    profiles = set()
                gene.cogid = profiles

                if not profiles.intersection(kplet_codes):
                    continue

                if not next_start:
                    # First hit in this source: it seeds the open block.
                    open_block.append(gene)
                elif position - next_start < max_dist:
                    # Close enough: absorb the gap genes and the hit itself.
                    open_block += genes[next_start:position + 1]
                else:
                    # Too far away: close the block and start a new one.
                    candidate_blocks.append(open_block)
                    open_block = [gene]
                next_start = position + 1

            # The trailing block is always recorded (it may be empty).
            candidate_blocks.append(open_block)

            kept_blocks = []
            for block in candidate_blocks:
                hit_codes = set()
                all_codes = set()
                for gene in block:
                    hit_codes.update(gene.cogid.intersection(kplet_codes))
                    all_codes.update(gene.cogid)

                if len(hit_codes) < block_size or all_codes.intersection(target_profiles):
                    continue
                kept_blocks.append(block)

            if kept_blocks:
                t.update_dictionary_set(org2src, organism, source)
                src2blocks[source] = kept_blocks

    return org2src, src2blocks
Example #3
0
def kplet_to_file_summaries(kplet, neighborhood_files_path):
    """Build one file summary per neighborhood file of a single kplet.

    Args:
        kplet: object exposing ``files``, an iterable of neighborhood file
            names.
        neighborhood_files_path: directory that each file name is joined to
            before being read with ``dt.get_wgs_file``.

    Returns:
        A tuple ``(file_summaries, organisms, crispr_type2files)``:
        ``file_summaries`` is a list of ``WGSNeighborhoodFileSummary``
        objects sorted by their ``org`` attribute, ``organisms`` is the set
        of organisms taken from the first gene of every file, and
        ``crispr_type2files`` is filled through ``t.update_dictionary_set``
        keyed by the first gene's ``crispr_type``.

    Raises:
        IndexError: if a file yields no genes (the first gene is indexed
            unconditionally).
    """
    file_summaries = []
    organisms = set()
    _crispr_type2files = {}

    for f in kplet.files:
        _genes = dt.get_wgs_file(os.path.join(neighborhood_files_path, f))

        # The first gene's metadata stands in for the whole file.
        _first_gene = _genes[0]
        _src = _first_gene.src
        _org = _first_gene.organism
        _crispr_type = _first_gene.crispr_type

        t.update_dictionary_set(_crispr_type2files, _crispr_type, f)

        file_summaries.append(WGSNeighborhoodFileSummary(f, [kplet], _genes, _org, _src))
        # add(), not update(): update() on a string would insert its
        # individual characters instead of the organism name.
        organisms.add(_org)

    file_summaries.sort(key=lambda x: x.org)
    return file_summaries, organisms, _crispr_type2files
Example #4
0
def census_table():
    """Write the Cas4 census table to ``cas4_census.txt`` under ``data_path``.

    Loci are parsed from ``Islands_ID.ann_clust`` (also under ``data_path``)
    with ``t.parse_crispr_loci``. One tab-separated row is written per Cas4
    gene, ordered by gene ``gid``, after a ``#``-prefixed header line.

    The "Total Cas4 in genome" column is the number of Cas4 gids collected
    for the locus organism; the "Kingdom" column is looked up in the
    module-level ``genome2domain`` mapping. No finer taxonomy columns are
    emitted.

    Returns:
        None. The only effect is the written file.
    """
    cas42locus = {}
    genome2cas4 = {}
    loci_file = os.path.join(data_path, 'Islands_ID.ann_clust')
    loci = t.parse_crispr_loci(loci_file)

    columns = [
        "GI", "profiles", "Gene name(s)", "Status", "Complete", "Type",
        "Organism", "Total Cas4 in genome", "Kingdom"
    ]

    # Index every Cas4 gene by its locus and collect Cas4 gids per organism.
    for locus in loci:
        for _cas4 in locus.cas4_genes:
            cas42locus[_cas4] = locus
            t.update_dictionary_set(genome2cas4, locus.organism, _cas4.gid)

    # The context manager guarantees the file is flushed and closed, even
    # if a lookup below raises.
    with open(os.path.join(data_path, 'cas4_census.txt'), "w") as census_outf:
        census_outf.write("#" + "\t".join(columns) + "\n")

        for cas4_gene, locus in sorted(cas42locus.items(),
                                       key=lambda x: x[0].gid):
            _org = locus.organism
            terms = [
                cas4_gene.gid,
                cas4_gene.cogid,
                # De-duplicate the comma-separated gene names.
                ",".join(set(cas4_gene.gene_name.split(','))),
                locus.status,
                "complete" if locus.complete else "partial",
                # Only the first of several ';'-separated types is reported.
                locus.type.split(";")[0],
                _org,
                str(len(genome2cas4[_org])),
                genome2domain[_org],
            ]
            census_outf.write("\t".join(terms) + "\n")
Example #5
0
def census_table():
    """Write a per-gene Cas4 census as a tab-separated text file.

    Reads CRISPR loci from ``Islands_ID.ann_clust`` in ``data_path`` via
    ``t.parse_crispr_loci`` and writes ``cas4_census.txt`` to the same
    directory: a ``#``-prefixed header followed by one row per Cas4 gene,
    sorted by gene ``gid``.

    Each row holds the gene's gid, profiles and de-duplicated gene names,
    the locus status, completeness and first ``;``-separated type, the
    organism, the number of Cas4 gids recorded for that organism, and the
    organism's entry in the module-level ``genome2domain`` mapping.

    Returns:
        None. The only effect is the written file.
    """
    cas42locus = {}
    genome2cas4 = {}
    loci_file = os.path.join(data_path, 'Islands_ID.ann_clust')
    loci = t.parse_crispr_loci(loci_file)

    columns = ["GI", "profiles", "Gene name(s)", "Status", "Complete", "Type", "Organism", "Total Cas4 in genome", "Kingdom"]

    # Map each Cas4 gene to its locus and gather the Cas4 gids per organism.
    for locus in loci:
        for _cas4 in locus.cas4_genes:
            cas42locus[_cas4] = locus
            t.update_dictionary_set(genome2cas4, locus.organism, _cas4.gid)

    census_path = os.path.join(data_path, 'cas4_census.txt')
    # Use a context manager so the handle is always closed; it was
    # previously left open for the interpreter to reclaim.
    with open(census_path, "w") as census_outf:
        census_outf.write("#"+"\t".join(columns)+"\n")

        for cas4_gene, locus in sorted(cas42locus.items(), key=lambda x: x[0].gid):
            _gi = cas4_gene.gid
            _profiles = cas4_gene.cogid
            _gene_names = ",".join(set(cas4_gene.gene_name.split(',')))
            _status = locus.status
            _complete = "complete" if locus.complete else "partial"
            # split() returns the whole string when no ';' is present, so
            # no separate membership check is needed.
            _type = locus.type.split(";")[0]
            _org = locus.organism
            _domain = genome2domain[_org]

            terms = [_gi, _profiles, _gene_names, _status, _complete, _type, _org, str(len(genome2cas4[_org])), _domain]
            census_outf.write("\t".join(terms)+"\n")
Example #6
0
def kplet_list_to_file_summaries(kplets, neighborhood_files_path, filter_weak_hits=True, dataset=None):
    """Merge a list of kplets into per-file summaries with cluster collapsing.

    Steps:
      1. Group the kplets by the neighborhood files they occur in.
      2. If ``filter_weak_hits`` is set, keep only files hit by more than
         ``filter_size`` (5) kplets.
      3. Load the genes of every remaining file with ``dt.get_wgs_file``.
      4. Emit one summary per singleton file. For files belonging to a
         cluster from ``get_clustered_loci(dataset)``, emit one summary for
         the cluster member with the most genes among the files still
         pending, and drop the other pending members. Files that are
         neither singletons nor in any cluster are skipped.
      5. Drop summaries that carry one kplet or fewer, then count kplets
         and profiles again over the surviving files.

    Args:
        kplets: iterable of kplet objects exposing ``id`` and ``files``.
        neighborhood_files_path: directory the file names are joined to.
        filter_weak_hits: whether to apply the step-2 filter.
        dataset: forwarded to ``get_singleton_loci`` and
            ``get_clustered_loci``.

    Returns:
        A ``CrisprMergingKplets2FsOutput`` with ``file_summaries`` (sorted by
        ``org``), ``organisms``, ``crispr_type2files``, the kplet/profile
        counts before (``*_bf``) and after (``*_af``) filtration,
        ``initial_length`` (number of distinct files before filtering) and
        ``kplets``; or ``None`` when fewer than two files remain after
        step 2.
    """
    file_summaries = []
    organisms = set()
    _crispr_type2files = {}
    _file2kplets = {}
    _kplet2count_af = {}  # kplet2count after filtration
    _kplet2count_bf = {}  # kplet2count before filtration

    _profile2count_bf = {}
    _profile2count_af = {}

    filter_size = 5

    singletons = get_singleton_loci(dataset)
    clusters = get_clustered_loci(dataset)

    for kplet in kplets:
        for f in kplet.files:
            t.update_dictionary(_file2kplets, f, [kplet])

    initial_length = len(_file2kplets)

    # No `del` of loop variables here: with no files at all the names are
    # never bound and deleting them would raise UnboundLocalError.
    for _file_kplets in _file2kplets.values():
        for kplet in _file_kplets:
            t.update_dictionary(_kplet2count_bf, kplet.id, 1)

    if filter_weak_hits:
        _file2kplets = {k: v for (k, v) in _file2kplets.items() if len(v) > filter_size}

    if len(_file2kplets) < 2:
        return None

    _file2genes = {f: dt.get_wgs_file(os.path.join(neighborhood_files_path, f)) for f in _file2kplets.keys()}
    _files = set(_file2kplets.keys())

    for _gene_list in _file2genes.values():
        for _gene in _gene_list:
            for _c in _gene.cogid.split(','):
                t.update_dictionary(_profile2count_bf, _c, 1)

    while _files:
        _f = _files.pop()

        if _f in singletons:
            _genes = _file2genes[_f]
            _src = _genes[0].src
            _org = _genes[0].organism
            _crispr_type = _genes[0].crispr_type
            t.update_dictionary_set(_crispr_type2files, _crispr_type, _f)

            file_summaries.append(WGSNeighborhoodFileSummary(_f, _file2kplets[_f], _genes, _org, _src, 'singleton'))
            organisms.add(_org)
            continue

        # Find the first cluster that contains this file.
        _cluster = None
        for cl in clusters:
            if _f in cl.files:
                _cluster = cl
                break
        if not _cluster:
            continue

        # Pending files of the same cluster; _f itself was already popped.
        _cl_files = _cluster.files.intersection(_files)

        # Represent the cluster by its member with the most genes.
        _representative = _f
        for _cl_file in _cl_files:
            if len(_file2genes[_cl_file]) > len(_file2genes[_representative]):
                _representative = _cl_file

        _genes = _file2genes[_representative]
        _src = _genes[0].src
        _org = _genes[0].organism
        _crispr_type = _genes[0].crispr_type
        t.update_dictionary_set(_crispr_type2files, _crispr_type, _representative)

        _file_summary = WGSNeighborhoodFileSummary(_representative, _file2kplets[_representative], _genes, _org,
                                                   _src, _cluster)
        # +1 accounts for the popped file that is no longer in _files.
        _file_summary.cluster_local_count = len(_cl_files) + 1

        file_summaries.append(_file_summary)
        organisms.add(_org)

        _files = _files.difference(_cl_files)

    file_summaries = [fs for fs in file_summaries if len(fs.kplets) > 1]

    for _f in [fs.file_name for fs in file_summaries]:
        for kplet in _file2kplets[_f]:
            t.update_dictionary(_kplet2count_af, kplet.id, 1)

        for _gene in _file2genes[_f]:
            for _c in _gene.cogid.split(','):
                t.update_dictionary(_profile2count_af, _c, 1)

    file_summaries.sort(key=lambda x: x.org)

    retval = CrisprMergingKplets2FsOutput()
    retval.file_summaries = file_summaries
    retval.organisms = organisms
    retval.crispr_type2files = _crispr_type2files
    retval.kplet2count_af = _kplet2count_af
    retval.kplet2count_bf = _kplet2count_bf
    retval.initial_length = initial_length
    retval.kplets = kplets
    retval.profile2count_bf = _profile2count_bf
    retval.profile2count_af = _profile2count_af

    return retval
Example #7
0
# 			if 'COG0210' in profiles:
# 				print len(profiles), f, profiles

if __name__ == "__main__":

    work_dir = '/home/hudaiber/Projects/NewSystems/data/UvrD/paralogs/'
    # prok1603_ccp        = '/home/hudaiber/Projects/NewSystems/data/UvrD/paralogs/Prok1603.ccp.csv'
    prok1603_ccp = '/dev/shm/Prok1603.ccp.csv'
    prok1603_except_dir = '/panfs/pan1/patternquest/Projects/NewSystems/data/UvrD/except_prok1603/COG0210/hhpred/'

    pk_gi2pr = {}
    for l in open(prok1603_ccp):
        terms = l.strip().split(',')
        gi = terms[0]
        profile = terms[6]
        t.update_dictionary_set(pk_gi2pr, gi, profile)

    outfmt = "%s\t%s\t%s"
    print outfmt % ("#GI", "Genome", "Cas4 fusion")
    print "#--------------------------------------------"

    all_lines = open(os.path.join(work_dir, 'from_not_prok1603/uvrd_filtered.pty')).readlines() + \
                open(os.path.join(work_dir, 'from_prok1603/uvrd.pty')).readlines()

    for pty_line in open(os.path.join(work_dir,
                                      'from_prok1603/uvrd.pty')).readlines():

        parts = pty_line.strip().split()
        genome = parts[3]
        gi = parts[-1]
        acc = parts[-2]