Example #1
0
    def __init__(self, file_name, annotation_map=None, file_format='pty', profile2gene=None):
        """Load the genes of one file and derive its profile/adjacency sets.

        file_name      -- path of the file to load.
        annotation_map -- forwarded to dt.get_pty_file (used for 'pty' only).
        file_format    -- 'pty' selects dt.get_pty_file; any other value
                          selects dt.get_pty_file_generic.
        profile2gene   -- forwarded to whichever loader is selected.

        Side effect: self.genes is left sorted in descending order.
        """

        def _adjacency_terms(ordered_genes):
            # Every cogid of every gene, plus an "a-b" term for each pair made
            # of a gene's cogid and a cogid of the gene that follows it.
            terms = set()
            last_index = len(ordered_genes) - 1
            for index, gene in enumerate(ordered_genes):
                for cogid in gene.cogid.split(','):
                    terms.add(cogid)
                    if index < last_index:
                        for following in ordered_genes[index + 1].cogid.split(','):
                            terms.add("%s-%s" % (cogid, following))
            return terms

        self.file_name = file_name
        self.base_file_name = os.path.basename(file_name)

        # A fresh instance is "merged" only with itself.
        self.merged_files = {self.file_name}
        self.merged_base_files = {self.base_file_name}

        if file_format != 'pty':
            loaded = dt.get_pty_file_generic(file_name, profile2gene=profile2gene)
        else:
            loaded = dt.get_pty_file(file_name, annotation_map=annotation_map, profile2gene=profile2gene)

        self.genes = loaded

        # Comma-separated fields are flattened; empty entries are dropped.
        self.profiles = {name for gene in loaded
                         for name in gene.cogid.split(',') if name != ''}
        self.gene_names = {name for gene in loaded
                           for name in gene.gene_name.split(',') if name != ''}

        # Organism and source are taken from the first gene of the file.
        first_gene = loaded[0]
        self.organism = first_gene.organism
        self.source = first_gene.src

        # Terms in file order first, then again after reversing the order
        # using the gene objects' own comparison.
        forward_terms = _adjacency_terms(self.genes)
        self.genes.sort(reverse=True)
        reverse_terms = _adjacency_terms(self.genes)

        self.forward_set = forward_terms
        self.reverse_set = reverse_terms

        self.feature_weights = None
        self.feature_labels = None
Example #2
0
    def __init__(self, file_name, annotation_map=None):
        """Load the genes of one file and derive its profile/adjacency sets.

        file_name      -- path handed to dt.get_pty_file_generic.
        annotation_map -- forwarded unchanged to dt.get_pty_file_generic.

        Attributes set: file_name, genes, profiles, organism, source,
        forward_set, reverse_set, feature_weights, feature_labels.

        Side effect: the loaded gene list (self.genes) is left sorted in
        descending order.
        """

        def _adjacency_terms(ordered_genes):
            # Every cogid of every gene, plus an "a-b" term for each pair made
            # of a gene's cogid and a cogid of the gene that follows it.
            terms = set()
            last_index = len(ordered_genes) - 1
            for index, gene in enumerate(ordered_genes):
                for _cogid in gene.cogid.split(','):
                    terms.add(_cogid)
                    if index == last_index:
                        continue
                    for _next_cogid in ordered_genes[index + 1].cogid.split(','):
                        terms.add("%s-%s" % (_cogid, _next_cogid))
            return terms

        self.file_name = file_name

        _genes = dt.get_pty_file_generic(file_name,
                                         annotation_map=annotation_map)
        self.genes = _genes

        # cogid is comma-separated (see the adjacency terms above), so the
        # profile set must split on ',' too. The previous whitespace split
        # kept "a,b" as one fused profile for multi-profile genes.
        self.profiles = set(profile for g in _genes
                            for profile in g.cogid.split(',') if profile != '')

        # Organism and source are taken from the first gene of the file.
        self.organism = _genes[0].organism
        self.source = _genes[0].src

        # Terms in file order first, then again after reversing the order
        # using the gene objects' own comparison.
        _forward = _adjacency_terms(_genes)
        _genes.sort(reverse=True)
        _reverse = _adjacency_terms(_genes)

        self.forward_set = _forward
        self.reverse_set = _reverse

        self.feature_weights = None
        self.feature_labels = None
Example #3
0

if __name__ == '__main__':
    # Script entry point: load every locus file under cas1402/files/ and hand
    # the loci, their gene ids and their organisms to
    # calculate_profile_based_crispricity.

    # prok1402_path_file = '/mnt/storage/data/CDD/all_Prok1402.ccp.csv'
    prok1402_path_file = os.path.join(os.path.expanduser('~'),
                                      'data/CDD/all_Prok1402.ccp.csv')

    # A parenthesised single-argument print produces the same output whether
    # print is a statement or a function.
    print("Loading map_gid2cdd")
    gi2annotation = t.map_gid2cdd()

    print("Loading CRISPR loci")
    # cas1402_loci_path = os.path.join(gv.project_data_path,'cas1402/files/')
    cas1402_loci_path = os.path.join(gv.project_data_path, 'cas1402/files/')

    # One entry per directory entry, in os.listdir order.
    cas1402_loci = []
    for f in os.listdir(cas1402_loci_path):
        cas1402_loci.append(
            dt.get_pty_file_generic(os.path.join(cas1402_loci_path, f),
                                    annotation_map=gi2annotation))

    # Gene ids of every gene of every locus.
    cas1402_gis = set()
    for locus in cas1402_loci:
        for gene in locus:
            cas1402_gis.add(gene.gid)

    # Organism of each locus, read from its first gene.
    cas1402_organisms = set()
    for locus in cas1402_loci:
        cas1402_organisms.add(locus[0].organism)

    calculate_profile_based_crispricity(cas1402_loci, cas1402_gis,
                                        cas1402_organisms, prok1402_path_file)

    # Disabled debugging aid, kept for reference:
    # all_orgs = set()
    # for profile, org2gis in global_profile2orgs2gis.items():
    #     if 'Yersinia_pseudotuberculosis_PB1__uid59153' in org2gis:
    #         print "Yersinia_pseudotuberculosis_PB1__uid59153"
    #     for _org in org2gis.keys():
    #         all_orgs.update([_org])
Example #4
0
    plt.ylabel("Crispricity")

    plt.legend(loc="upper left", fontsize=7)

    plt.savefig(os.path.join(work_dir, 'crispricity_log.png'))


if __name__ == '__main__':
    # Script entry point: gather the cas1402 loci, the gene ids they contain
    # and the organisms they come from, then run
    # calculate_profile_based_crispricity on them.

    # prok1402_path_file = '/mnt/storage/data/CDD/all_Prok1402.ccp.csv'
    prok1402_path_file = os.path.join(
        os.path.expanduser('~'),
        'data/CDD/all_Prok1402.ccp.csv',
    )

    # Single-argument print in parentheses prints the same text under both
    # the statement and the function form of print.
    print("Loading map_gid2cdd")
    gi2annotation = t.map_gid2cdd()

    print("Loading CRISPR loci")
    # cas1402_loci_path = os.path.join(gv.project_data_path,'cas1402/files/')
    cas1402_loci_path = os.path.join(gv.project_data_path, 'cas1402/files/')

    # Load each directory entry as one locus (a sequence of genes).
    cas1402_loci = list()
    for f in os.listdir(cas1402_loci_path):
        locus_genes = dt.get_pty_file_generic(
            os.path.join(cas1402_loci_path, f),
            annotation_map=gi2annotation,
        )
        cas1402_loci.append(locus_genes)

    # All gene ids across the loci, then the organism of each locus's first
    # gene.
    cas1402_gis = set(gene.gid for locus in cas1402_loci for gene in locus)
    cas1402_organisms = set(locus[0].organism for locus in cas1402_loci)

    calculate_profile_based_crispricity(
        cas1402_loci,
        cas1402_gis,
        cas1402_organisms,
        prok1402_path_file,
    )

    # Disabled debugging aid, kept for reference:
    # all_orgs = set()
    # for profile, org2gis in global_profile2orgs2gis.items():
    #     if 'Yersinia_pseudotuberculosis_PB1__uid59153' in org2gis:
    #         print "Yersinia_pseudotuberculosis_PB1__uid59153"
    #     for _org in org2gis.keys():
    #         all_orgs.update([_org])
Example #5
0
def kplet_list_to_file_summaries(kplets,
                                 neighborhood_files_path,
                                 filter_weak_hits=True):
    """Build per-file summaries for the neighborhood files the kplets occur in.

    Args:
        kplets: iterable of kplet objects; each must expose ``files`` (an
            iterable of file names) and ``id``.
        neighborhood_files_path: directory that is joined with each file name
            before the file is loaded with ``dt.get_pty_file_generic``.
        filter_weak_hits: accepted for interface compatibility only; the
            weak-hit filtering step is disabled, so the value has no effect.

    Returns:
        ``None`` when fewer than two distinct files are referenced by
        ``kplets`` (including when ``kplets`` is empty). Otherwise a
        ``GenericMergingKplets2FsOutput`` carrying the file summaries (sorted
        by ``org``), the organisms seen, the kplet/profile/cas-type counters
        and the summed summary weight.

    Relies on the module-level ``_org2weight`` and ``_gi2castype`` mappings.
    """

    file_summaries = list()
    organisms = set()

    _file2kplets = dict()
    _kplet2count_af = dict()  # kplet2count after filtration
    _kplet2count_bf = dict()  # kplet2count before filtration

    _profile2count_bf = dict()  # never populated here; returned empty
    _profile2count_af = dict()
    _cas_type2count = dict()

    # Group the kplets by the files they were found in.
    for kplet in kplets:
        for _file_name in kplet.files:
            t.update_dictionary(_file2kplets, _file_name, [kplet])

    # Count kplet occurrences over all files. The loop variable is not
    # deleted afterwards: deleting it raised UnboundLocalError whenever no
    # kplet referenced any file, instead of falling through to `return None`.
    for _file_name in _file2kplets.keys():
        for kplet in _file2kplets[_file_name]:
            t.update_dictionary(_kplet2count_bf, kplet.id, 1)

    # Weak-hit filtering (dropping files with too few kplets) is disabled, so
    # the "before" and "after" kplet counts cover the same files.

    if len(_file2kplets) < 2:
        return None

    # Load every file up front so a loading error surfaces before any summary
    # object is constructed.
    _file2genes = {
        _file_name: dt.get_pty_file_generic(
            os.path.join(neighborhood_files_path, _file_name))
        for _file_name in _file2kplets.keys()
    }
    _files = set(_file2kplets.keys())

    for _f in _files:

        _genes = _file2genes[_f]
        # Source and organism are taken from the first gene of the file.
        _src = _genes[0].src
        _org = _genes[0].organism

        organisms.update([_org])
        _nfs = NeighborhoodFileSummary(_f, _file2kplets[_f], _genes, _org,
                                       _src, _org2weight[_org])

        # Only the first gene with a known cas type decides the file's type.
        for _gene in _genes:
            if _gene.gid in _gi2castype:
                _nfs.cas_type = ";".join(_gi2castype[_gene.gid])
                for _cas_type in _gi2castype[_gene.gid]:
                    t.update_dictionary(_cas_type2count, _cas_type, 1)
                break

        for kplet in _file2kplets[_f]:
            t.update_dictionary(_kplet2count_af, kplet.id, 1)

        # cogid is a comma-separated list of profiles.
        for _gene in _genes:
            for _c in _gene.cogid.split(','):
                t.update_dictionary(_profile2count_af, _c, 1)

        file_summaries.append(_nfs)

    file_summaries.sort(key=lambda x: x.org)
    retval = GenericMergingKplets2FsOutput()
    retval.file_summaries = file_summaries
    retval.organisms = organisms
    retval.profile2count_bf = _profile2count_bf
    retval.profile2count_af = _profile2count_af
    retval.kplet2count_af = _kplet2count_af
    retval.kplet2count_bf = _kplet2count_bf
    retval.weight = sum(fs.weight for fs in file_summaries)
    retval.cas_type2count = _cas_type2count

    return retval
Example #6
0
    def __init__(self,
                 file_name,
                 annotation_map=None,
                 file_format='pty',
                 profile2gene=None):
        """Load a locus file: its genes, header lines and adjacency sets.

        file_name      -- path of the locus file; it is parsed by the dt
                          loader and also opened directly to read its first
                          two lines.
        annotation_map -- forwarded to dt.get_pty_file ('pty' format only).
        file_format    -- 'pty' selects dt.get_pty_file; any other value
                          selects dt.get_pty_file_generic.
        profile2gene   -- forwarded to dt.get_pty_file_generic only.

        Raises AssertionError when the second line of the file does not start
        with "===".

        Side effect: self.genes is left sorted in descending order.
        """

        def _adjacency_terms(ordered_genes):
            # Every cogid of every gene, plus an "a-b" term for each pair made
            # of a gene's cogid and a cogid of the gene that follows it.
            terms = set()
            last_index = len(ordered_genes) - 1
            for index, gene in enumerate(ordered_genes):
                for _cogid in gene.cogid.split(','):
                    terms.add(_cogid)
                    if index == last_index:
                        continue
                    for _next_cogid in ordered_genes[index + 1].cogid.split(','):
                        terms.add("%s-%s" % (_cogid, _next_cogid))
            return terms

        self.file_name = file_name
        self.base_file_name = os.path.basename(file_name)
        if file_format == 'pty':
            _genes = dt.get_pty_file(file_name, annotation_map=annotation_map)
        else:
            _genes = dt.get_pty_file_generic(file_name,
                                             profile2gene=profile2gene)

        # Read the two header lines inside a context manager so the handle is
        # released even when the format assertion below fails.
        # NOTE(review): the first line is consumed whether or not it is a
        # '#type:' header, so the "===" summary must be the second line in
        # both cases - confirm against the file format.
        with open(file_name) as locus_file:
            type_line = locus_file.readline()
            summary_line = locus_file.readline()

        if type_line.startswith('#type:'):
            self.crispr_type = type_line.split(':')[1].strip()
        else:
            self.crispr_type = None

        self.summary_line = summary_line

        assert self.summary_line.startswith("===")

        self.genes = _genes
        # cogid is comma-separated (see the adjacency terms above), so the
        # profile set must split on ',' too. The previous whitespace split
        # kept "a,b" as one fused profile for multi-profile genes.
        self.profiles = set(profile for g in _genes
                            for profile in g.cogid.split(',') if profile != '')
        self.gene_names = set(gene_name for g in _genes
                              for gene_name in g.gene_name.split(',')
                              if gene_name != '')
        self.clusters = set(gene.cluster_id for gene in _genes)
        # Organism and source are taken from the first gene of the file.
        self.organism = _genes[0].organism
        self.source = _genes[0].src

        # Terms in file order first, then again after reversing the order
        # using the gene objects' own comparison.
        _forward = _adjacency_terms(self.genes)
        self.genes.sort(reverse=True)
        _reverse = _adjacency_terms(self.genes)

        self.forward_set = _forward
        self.reverse_set = _reverse

        self.feature_weights = None
        self.feature_labels = None
Example #7
0
    def __init__(self,
                 file_name,
                 annotation_map=None,
                 file_format='pty',
                 profile2gene=None):
        """Read one neighborhood file and precompute its lookup sets.

        file_name      -- path of the file to read.
        annotation_map -- passed to dt.get_pty_file when file_format is 'pty'.
        file_format    -- 'pty' uses dt.get_pty_file, everything else uses
                          dt.get_pty_file_generic.
        profile2gene   -- passed to the loader that is used.

        Side effect: self.genes ends up sorted in descending order.
        """

        def _neighbour_terms(gene_list):
            # Split each gene's cogid once, then emit all single ids together
            # with every "left-right" combination of consecutive genes.
            split_ids = [entry.cogid.split(',') for entry in gene_list]
            found = set()
            for position, current_ids in enumerate(split_ids):
                found.update(current_ids)
                if position + 1 < len(split_ids):
                    upcoming_ids = split_ids[position + 1]
                    found.update("%s-%s" % (left, right)
                                 for left in current_ids
                                 for right in upcoming_ids)
            return found

        base_name = os.path.basename(file_name)
        self.file_name = file_name
        self.base_file_name = base_name

        # Initially the only merged file is this file itself.
        self.merged_files = {file_name}
        self.merged_base_files = {base_name}

        if file_format == 'pty':
            gene_records = dt.get_pty_file(file_name,
                                           annotation_map=annotation_map,
                                           profile2gene=profile2gene)
        else:
            gene_records = dt.get_pty_file_generic(file_name,
                                                   profile2gene=profile2gene)

        self.genes = gene_records

        # Flatten the comma-separated fields, skipping empty entries.
        profile_names = set()
        for record in gene_records:
            profile_names.update(
                item for item in record.cogid.split(',') if item != '')
        self.profiles = profile_names

        names = set()
        for record in gene_records:
            names.update(
                item for item in record.gene_name.split(',') if item != '')
        self.gene_names = names

        # The first gene supplies organism and source for the whole file.
        self.organism = gene_records[0].organism
        self.source = gene_records[0].src

        # Terms for the file order, then for the descending order obtained
        # from the gene objects' own comparison.
        in_file_order = _neighbour_terms(self.genes)
        self.genes.sort(reverse=True)
        in_reverse_order = _neighbour_terms(self.genes)

        self.forward_set = in_file_order
        self.reverse_set = in_reverse_order

        self.feature_weights = None
        self.feature_labels = None
Example #8
0
    sys.path.append('/Users/hudaiber/Projects/SystemFiles/')
elif sys.platform=='linux2':
    sys.path.append('/home/hudaiber/Projects/SystemFiles/')
import global_variables as gv
# sys.path.append(gv.project_code_path)
import dm_tools as dt

cas_type_file = os.path.join(gv.project_data_path, 'cas1402/cas1402.type.tab')

# First tab-separated column -> list of the ';'-separated values of the second
# column. The file is read inside a context manager so the handle is closed.
_gi2castype = {}
with open(cas_type_file) as _type_fh:
    for _line in _type_fh:
        _fields = _line.strip().split('\t')
        _gi2castype[_fields[0]] = _fields[1].split(';')

files_path = os.path.join(gv.project_data_path, 'cas1402/files')

# File name -> cas types of every gene in that file whose gid is in
# _gi2castype, in gene order (duplicates are kept).
file2type = {}

for f in os.listdir(files_path):
    _genes = dt.get_pty_file_generic(os.path.join(files_path, f))
    for _gene in _genes:
        if _gene.gid in _gi2castype:
            # Extend a list owned by file2type. Storing the _gi2castype list
            # itself and then using += on it extended the lookup table's list
            # in place, corrupting the types of that gid for later files.
            file2type.setdefault(f, []).extend(_gi2castype[_gene.gid])

# Parenthesised single-argument prints give the same output whether print is
# a statement or a function; the empty string yields the blank line.
print(len(file2type))
print("")

outf = os.path.join(gv.project_data_path, 'cas1402/file2type.tab')

with open(outf, 'w') as of:
    # One "<file>\t<type;type;...>" line per file that had a typed gene.
    for _file_name, cas_type in file2type.items():
        of.write("%s\t%s\n" % (_file_name, ";".join(cas_type)))
Example #9
0
    def __init__(self, file_name, annotation_map=None, file_format='pty', profile2gene=None):
        """Load a locus file: its genes, header lines and adjacency sets.

        file_name      -- path of the locus file; it is parsed by the dt
                          loader and also opened directly to read its first
                          two lines.
        annotation_map -- forwarded to dt.get_pty_file ('pty' format only).
        file_format    -- 'pty' selects dt.get_pty_file; any other value
                          selects dt.get_pty_file_generic.
        profile2gene   -- forwarded to dt.get_pty_file_generic only.

        Raises AssertionError when the second line of the file does not start
        with "===".

        Side effect: self.genes is left sorted in descending order.
        """

        def _adjacency_terms(ordered_genes):
            # Every cogid of every gene, plus an "a-b" term for each pair made
            # of a gene's cogid and a cogid of the gene that follows it.
            terms = set()
            last_index = len(ordered_genes) - 1
            for index, gene in enumerate(ordered_genes):
                for _cogid in gene.cogid.split(','):
                    terms.add(_cogid)
                    if index == last_index:
                        continue
                    for _next_cogid in ordered_genes[index + 1].cogid.split(','):
                        terms.add("%s-%s" % (_cogid, _next_cogid))
            return terms

        self.file_name = file_name
        self.base_file_name = os.path.basename(file_name)
        if file_format == 'pty':
            _genes = dt.get_pty_file(file_name, annotation_map=annotation_map)
        else:
            _genes = dt.get_pty_file_generic(file_name, profile2gene=profile2gene)

        # Read the two header lines inside a context manager so the handle is
        # released even when the format assertion below fails.
        # NOTE(review): the first line is consumed whether or not it is a
        # '#type:' header, so the "===" summary must be the second line in
        # both cases - confirm against the file format.
        with open(file_name) as locus_file:
            type_line = locus_file.readline()
            summary_line = locus_file.readline()

        if type_line.startswith('#type:'):
            self.crispr_type = type_line.split(':')[1].strip()
        else:
            self.crispr_type = None

        self.summary_line = summary_line

        assert self.summary_line.startswith("===")

        self.genes = _genes
        # cogid is comma-separated (see the adjacency terms above), so the
        # profile set must split on ',' too. The previous whitespace split
        # kept "a,b" as one fused profile for multi-profile genes.
        self.profiles = set(profile for g in _genes for profile in g.cogid.split(',') if profile != '')
        self.gene_names = set(gene_name for g in _genes for gene_name in g.gene_name.split(',') if gene_name != '')
        self.clusters = set(gene.cluster_id for gene in _genes)
        # Organism and source are taken from the first gene of the file.
        self.organism = _genes[0].organism
        self.source = _genes[0].src

        # Terms in file order first, then again after reversing the order
        # using the gene objects' own comparison.
        _forward = _adjacency_terms(self.genes)
        self.genes.sort(reverse=True)
        _reverse = _adjacency_terms(self.genes)

        self.forward_set = _forward
        self.reverse_set = _reverse

        self.feature_weights = None
        self.feature_labels = None