コード例 #1
0
        def get_clusters_to_merge():
            smallest_min_distance, clusters_to_merge = None, None
            n_skipped = 0
            for clust_a, clust_b in itertools.combinations(
                    clusters, 2
            ):  # find the two clusters which contain the pair of sequences which are closest in hamming fraction (skipping cluster pairs that would make a cluster that's too big)
                if len(clust_a) + len(
                        clust_b
                ) > max_per_cluster and not glomerate.merge_whatever_you_got:  # merged cluster would be too big, so look for smaller (albeit further-apart) things to merge
                    n_skipped += 1
                    continue
                min_distance = None  # find the smallest hamming distance between any two sequences in the two clusters
                for query_a in clust_a:
                    for query_b in clust_b:
                        joint_key = query_a + ';' + query_b  #';'.join([query_a, query_b])
                        if joint_key not in distances:
                            distances[joint_key] = utils.hamming_fraction(
                                naive_seqs[query_a], naive_seqs[query_b])
                            distances[query_b + ';' + query_a] = distances[
                                joint_key]  # also add with key in reverse order, in case we run into the pair that way later on
                        if min_distance is None or distances[
                                joint_key] < min_distance:
                            min_distance = distances[joint_key]
                if smallest_min_distance is None or min_distance < smallest_min_distance:
                    smallest_min_distance = min_distance
                    clusters_to_merge = (clust_a, clust_b)

            if debug and n_skipped > 0:
                print '      skipped: %d ' % n_skipped

            return clusters_to_merge
コード例 #2
0
def run_sklearn_mds(n_components, n_clusters, seqfos, seed, reco_info=None, region=None, aligned=False, n_init=4, max_iter=300, eps=1e-3, n_jobs=-1, plotdir=None, debug=False):
    # NOTE set <n_components> to None to run plain kmeans, without mds TODO clean this up

    start = time.time()
    assert n_clusters is not None
    if 'sklearn' not in sys.modules:
        from sklearn import manifold  # these are both slow af to import, even on local ssd
        from sklearn.cluster import KMeans

    if len(set(sfo['name'] for sfo in seqfos)) != len(seqfos):
        raise Exception('duplicate sequence ids in <seqfos>')

    if not aligned:  # NOTE unlike the bios2mds version above, this modifies <seqfos>
        if debug:
            print 'align'
        seqfos = utils.align_many_seqs(seqfos)

    if debug:
        print '  distances'
    # translations = string.maketrans('ACGT-', '01234')
    # def convert(seq):
    #     return [int(c) for c in seq.translate(translations)]
    # converted_seqs = [convert(x['seq']) for x in seqfos]
    # similarities = scipy.spatial.distance.pdist(converted_seqs, 'hamming')
    # similarities = scipy.spatial.distance.squareform(similarities)
    similarities = scipy.spatial.distance.squareform([utils.hamming_fraction(seqfos[i]['seq'], seqfos[j]['seq']) for i in range(len(seqfos)) for j in range(i + 1, len(seqfos))])
    random_state = numpy.random.RandomState(seed=seed)

    pos = None
    if n_components is not None:
        if debug:
            print '  mds'
        mds = sys.modules['sklearn'].manifold.MDS(n_components=n_components, n_init=n_init, max_iter=max_iter, eps=eps, random_state=random_state, dissimilarity="precomputed", n_jobs=n_jobs)
        pos = mds.fit_transform(similarities)
        # pos = mds.fit(similarities).embedding_

    if debug:
        print '    kmeans clustering with %d clusters' % n_clusters
    kmeans = sys.modules['sklearn'].cluster.KMeans(n_clusters=n_clusters, random_state=random_state).fit(pos if pos is not None else similarities)
    pcvals = {seqfos[iseq]['name'] : pos[iseq] if pos is not None else None for iseq in range(len(seqfos))}
    labels = {seqfos[iseq]['name'] : kmeans.labels_[iseq] for iseq in range(len(seqfos))}
    partition = utils.group_seqs_by_value(pcvals.keys(), lambda q: labels[q])

    if plotdir is not None:
        utils.prep_dir(plotdir, wildlings=['*.svg'])
        if debug:
            print '    plot'
        plot_mds(n_components, pcvals, plotdir, 'mds', partition=partition)

        if reco_info is not None:
            labels = {uid : reco_info[uid][region + '_gene'] for uid in pcvals}
            plot_mds(n_components, pcvals, plotdir, 'true-genes', labels=labels)

    if debug:
        print '    kmeans time %.1f' % (time.time() - start)

    return partition
コード例 #3
0
    def hamming_distance_to_true_naive(self, true_line, line, query_name, restrict_to_region='', normalize=False, padfo=None, debug=False):
        """
        Hamming distance between the inferred naive sequence and the tue naive sequence.
        <restrict_to_region> if set, restrict the comparison to the section of the *true* sequence assigned to the given region.
        NOTE this will not in general correspond to the similarly-assigned region in the inferred naive sequence.
        if <normalize> divide by sequence length
        """

        true_naive_seq = utils.get_full_naive_seq(self.germlines, true_line)
        inferred_naive_seq = utils.get_full_naive_seq(self.germlines, line)

        left_hack_add_on = ''
        right_hack_add_on = ''
        if len(true_line['seq']) > len(line['seq']):  # ihhhmmm doesn't report the bits of the sequence it erodes off the ends, so we have to add them back on
        # if len(true_naive_seq) > len(inferred_naive_seq):  # hm, now why did I use line['seq'] stuff before?
            start = true_line['seq'].find(line['seq'])
            assert start >= 0
            end = len(line['seq']) + start
            left_hack_add_on = true_line['seq'][: start]
            right_hack_add_on = true_line['seq'][ end :]
            # extra_penalty = len(left_hack_add_on) + len(right_hack_add_on)
            inferred_naive_seq = 'N'*len(left_hack_add_on) + inferred_naive_seq + 'N'*len(right_hack_add_on)
            if debug:
                print '  adding to inferred naive seq'

        # if restrict_to_region == '':
        #     print '  before', inferred_naive_seq
        if padfo is not None:  # remove N padding from the inferred sequence
            inferred_naive_seq = inferred_naive_seq[padfo['padleft'] : ]
            if padfo['padright'] > 0:
                inferred_naive_seq = inferred_naive_seq[ : -padfo['padright']]
        # if restrict_to_region == '':
        #     print '  after ', inferred_naive_seq

        bounds = None
        if restrict_to_region != '':
            bounds = utils.get_regional_naive_seq_bounds(restrict_to_region, self.germlines, true_line)  # get the bounds of this *true* region
            true_naive_seq = true_naive_seq[bounds[0] : bounds[1]]
            inferred_naive_seq = inferred_naive_seq[bounds[0] : bounds[1]]

        if debug:
            print restrict_to_region, 'region, bounds', bounds
            print '  true ', true_naive_seq
            print '  infer', inferred_naive_seq

        if len(true_naive_seq) != len(inferred_naive_seq):
            raise Exception('still not the same lengths for %s\n  %s\n  %s' % (query_name, true_naive_seq, inferred_naive_seq))
        fraction, len_excluding_ambig = utils.hamming_fraction(true_naive_seq, inferred_naive_seq, return_len_excluding_ambig=True)
        total_distance = int(fraction * len_excluding_ambig)
        if len(true_naive_seq) == 0:
            print 'WARNING zero length sequence in hamming_distance_to_true_naive'
            return 0
        if normalize:
            return int(100 * (float(total_distance) / len(true_naive_seq)))
        else:
            return total_distance
コード例 #4
0
def run_sklearn_mds(n_components, n_clusters, seqfos, seed, reco_info=None, region=None, aligned=False, n_init=4, max_iter=300, eps=1e-3, n_jobs=-1, plotdir=None):
    print '%s not testing this after moving these imports down here' % utils.color('red', 'hey')
    from sklearn import manifold  # these are both slow af to import, even on local ssd
    from sklearn.cluster import KMeans

    if len(set(sfo['name'] for sfo in seqfos)) != len(seqfos):
        raise Exception('duplicate sequence ids in <seqfos>')

    print 'align'
    if not aligned:  # NOTE unlike the bios2mds version above, this modifies <seqfos>
        seqfos = utils.align_many_seqs(seqfos)

    print '  distances'
    # translations = string.maketrans('ACGT-', '01234')
    # def convert(seq):
    #     return [int(c) for c in seq.translate(translations)]
    # converted_seqs = [convert(x['seq']) for x in seqfos]
    # similarities = scipy.spatial.distance.pdist(converted_seqs, 'hamming')
    # similarities = scipy.spatial.distance.squareform(similarities)
    similarities = scipy.spatial.distance.squareform([utils.hamming_fraction(seqfos[i]['seq'], seqfos[j]['seq']) for i in range(len(seqfos)) for j in range(i + 1, len(seqfos))])

    print '  mds'
    random_state = numpy.random.RandomState(seed=seed)
    mds = manifold.MDS(n_components=n_components, n_init=n_init, max_iter=max_iter, eps=eps, random_state=random_state, dissimilarity="precomputed", n_jobs=n_jobs)
    pos = mds.fit_transform(similarities)
    # pos = mds.fit(similarities).embedding_

    print '  kmeans'
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state).fit(pos)
    pcvals = {seqfos[iseq]['name'] : pos[iseq] for iseq in range(len(seqfos))}
    labels = {seqfos[iseq]['name'] : kmeans.labels_[iseq] for iseq in range(len(seqfos))}
    def keyfunc(q):  # should really integrate this with utils.collapse_naive_seqs()/utils.split_partition_with_criterion()
        return labels[q]
    partition = [list(group) for _, group in itertools.groupby(sorted(pcvals, key=keyfunc), key=keyfunc)]

    if plotdir is not None:
        utils.prep_dir(plotdir, wildlings=['*.svg'])
        print '  plot'
        plot_mds(n_components, pcvals, plotdir, 'mds', partition=partition)

        if reco_info is not None:
            labels = {uid : reco_info[uid][region + '_gene'] for uid in pcvals}
            plot_mds(n_components, pcvals, plotdir, 'true-genes', labels=labels)

    return partition
コード例 #5
0
    def plot_within_vs_between_hists(self, partition, annotations, base_plotdir):
        """
        Collect 'within' and 'between' cluster distances, overall and for each cdr3 length, and pass them to self.plot_each_within_vs_between_hist().

        'within' values are the 'mut_freqs' entries of the annotations. 'between' values are
        utils.hamming_fraction() between the 'naive_seq's of each pair of clusters that share a
        cdr3 length class (as given by self.get_cdr3_length_classes()).
        <annotations> is keyed by the ':'-joined uids of each cluster.
        """
        def naive_seq_of(cluster):
            return annotations[':'.join(cluster)]['naive_seq']

        length_classes = self.get_cdr3_length_classes(partition, annotations)

        all_within = []
        for annotation in annotations.values():
            all_within.extend(annotation['mut_freqs'])
        all_between = []

        per_length = {}
        for cdr3_length, clusters in length_classes.items():
            between = []
            for cl_a, cl_b in itertools.combinations(clusters, 2):
                seq_a, seq_b = naive_seq_of(cl_a), naive_seq_of(cl_b)
                # NOTE/TODO it's unfortunate that the naive seq length check has to go here: cdr3 length subclasses are padded to the same length during smith waterman, but in very rare cases the cdr3 length has changed by the time we get here
                if len(seq_a) != len(seq_b):
                    continue
                between.append(utils.hamming_fraction(seq_a, seq_b))

            within = []
            for cluster in clusters:
                within.extend(annotations[':'.join(cluster)]['mut_freqs'])

            per_length[cdr3_length] = {'within' : within, 'between' : between}
            all_between.extend(between)

        overall = {'within' : all_within, 'between' : all_between}
        self.plot_each_within_vs_between_hist(overall, base_plotdir + '/overall', 'within-vs-between', '')
        for cdr3_length, subd in per_length.items():
            self.plot_each_within_vs_between_hist(subd, base_plotdir + '/within-vs-between', 'cdr3-length-%d' % cdr3_length, 'CDR3 %d' % cdr3_length)
コード例 #6
0
    def from_same_lineage(cluster_id, uid):
        """
        Return True if <uid> matches at least one sequence already in cluster <cluster_id>.

        A match requires the same 'cdr3_length', 'v_gene' and 'j_gene' in <info>, equal-length
        results from get_d_plus_insertions(), and a utils.hamming_fraction() between those
        results that is no larger than 1 - <threshold>.
        """
        required_keys = ('cdr3_length', 'v_gene', 'j_gene')
        for clid in id_clusters[cluster_id]:  # it's enough to match any one existing member
            if any(info[clid][key] != info[uid][key] for key in required_keys):
                continue
            cl_seq = get_d_plus_insertions(clid)
            u_seq = get_d_plus_insertions(uid)
            if len(cl_seq) != len(u_seq):
                continue
            if not (utils.hamming_fraction(cl_seq, u_seq) > 1. - threshold):
                return True  # passed every criterion

        return False
コード例 #7
0
ファイル: plot-germlines.py プロジェクト: Irrationone/partis
def get_gene_pair_matrix(genelist, difftype):
    """
    Return a square matrix (list of rows) comparing all pairs of genes in <genelist>.

    Only entries above the diagonal are calculated; the diagonal and everything below it are 0.
    <difftype> chooses the comparison applied to the genes' aligned v sequences from <glfo>:
    'hamming', 'indels' or 'subs'. Any other value raises Exception (once a pair is actually compared).
    """
    n_genes = len(genelist)
    aligned_seq = lambda igene: glfo['aligned-genes']['v'][genelist[igene]]
    smatrix = []
    for iv in range(n_genes):
        row = [0.] * (iv + 1)  # lower triangle, including the diagonal
        for jv in range(iv + 1, n_genes):
            s1, s2 = aligned_seq(iv), aligned_seq(jv)
            if difftype == 'hamming':
                fraction, _ = utils.hamming_fraction(s1, s2, return_len_excluding_ambig=True, extra_bases='.')
            elif difftype == 'indels':
                fraction = indel_difference_fraction(s1, s2)
            elif difftype == 'subs':
                fraction = substitution_difference_fraction(s1, s2)
            else:
                raise Exception('unexpected difftype %s' % difftype)
            row.append(fraction)
        smatrix.append(row)
    return smatrix
コード例 #8
0
    def plot_within_vs_between_hists(self, partition, annotations, base_plotdir):
        """
        Gather within-cluster and between-cluster distances and hand them to self.plot_each_within_vs_between_hist().

        One call is made for all clusters together (under <base_plotdir>/overall), then one per
        cdr3 length class (under <base_plotdir>/within-vs-between). 'within' values come from each
        annotation's 'mut_freqs'; 'between' values are utils.hamming_fraction() between the
        'naive_seq's of pairs of clusters in the same cdr3 length class.
        """
        classes = self.get_cdr3_length_classes(partition, annotations)

        def annotation_for(cluster):  # annotations are keyed by the ':'-joined uids
            return annotations[':'.join(cluster)]

        def between_fractions(clusters):
            fractions = []
            for cl_a, cl_b in itertools.combinations(clusters, 2):
                seq_a = annotation_for(cl_a)['naive_seq']
                seq_b = annotation_for(cl_b)['naive_seq']
                # NOTE/TODO this length check is regrettable: cdr3 length subclasses get padded to the same length during smith waterman, but very rarely the cdr3 length has changed by the time we get here
                if len(seq_a) == len(seq_b):
                    fractions.append(utils.hamming_fraction(seq_a, seq_b))
            return fractions

        def within_fractions(clusters):
            return [freq for cluster in clusters for freq in annotation_for(cluster)['mut_freqs']]

        overall_distances = {
            'within': [freq for line in annotations.values() for freq in line['mut_freqs']],
            'between': [],
        }
        sub_distances = {}
        for cdr3_length, clusters in classes.items():
            hfracs = between_fractions(clusters)
            sub_distances[cdr3_length] = {'within': within_fractions(clusters), 'between': hfracs}
            overall_distances['between'].extend(hfracs)

        self.plot_each_within_vs_between_hist(overall_distances, base_plotdir + '/overall', 'within-vs-between', '')
        sub_plotdir = base_plotdir + '/within-vs-between'
        for cdr3_length, subd in sub_distances.items():
            self.plot_each_within_vs_between_hist(subd, sub_plotdir, 'cdr3-length-%d' % cdr3_length, 'CDR3 %d' % cdr3_length)
コード例 #9
0
ファイル: plot-germlines.py プロジェクト: Irrationone/partis
def get_gene_set_mean_matrix(genesets, difftype):
    """
    Return a square matrix (list of rows) comparing the sets of genes in <genesets>.

    <genesets> maps set names to lists of gene names. Each entry above the diagonal is the mean,
    over all pairs of one sequence from each set, of the chosen difference between the genes'
    aligned v sequences from <glfo>. The diagonal and everything below it are 0., as are entries
    for which there were no pairs. <difftype> is 'hamming', 'indels' or 'subs'; anything else
    raises Exception (once a pair is actually compared).
    """
    genenames = list(genesets.values())
    n_sets = len(genenames)
    smatrix = []
    for iv in range(n_sets):
        row = [0.] * (iv + 1)  # lower triangle, including the diagonal
        for jv in range(iv + 1, n_sets):
            seqs1 = [glfo['aligned-genes']['v'][g] for g in genenames[iv]]
            seqs2 = [glfo['aligned-genes']['v'][g] for g in genenames[jv]]

            total, nfractions = 0., 0
            for s1 in seqs1:
                for s2 in seqs2:
                    if difftype == 'hamming':
                        fraction, _ = utils.hamming_fraction(s1, s2, return_len_excluding_ambig=True, extra_bases='.')
                    elif difftype == 'indels':
                        fraction = indel_difference_fraction(s1, s2)
                    elif difftype == 'subs':
                        fraction = substitution_difference_fraction(s1, s2)
                    else:
                        raise Exception('unexpected difftype %s' % difftype)
                    total += fraction
                    nfractions += 1

            if nfractions == 0:  # at least one of the sets was empty
                row.append(0.)
            else:
                row.append(float(total) / nfractions)
        smatrix.append(row)

    return smatrix
コード例 #10
0
    def from_same_lineage(cluster_id, uid):
        """
        Decide whether <uid> belongs with the sequences in cluster <cluster_id>.

        Returns True as soon as one member of the cluster has the same 'cdr3_length', 'v_gene'
        and 'j_gene' (from <info>) as <uid>, has a get_d_plus_insertions() sequence of the same
        length, and the utils.hamming_fraction() between those sequences does not exceed
        1 - <threshold>. Returns False if no member qualifies.
        """
        def same_rearrangement_class(clid):
            for key in ('cdr3_length', 'v_gene', 'j_gene'):
                if info[clid][key] != info[uid][key]:
                    return False
            return True

        for clid in id_clusters[cluster_id]:  # a single matching member is sufficient
            if not same_rearrangement_class(clid):
                continue
            cl_seq, u_seq = get_d_plus_insertions(clid), get_d_plus_insertions(uid)
            if len(cl_seq) != len(u_seq):
                continue
            too_far = utils.hamming_fraction(cl_seq, u_seq) > 1. - threshold
            if not too_far:
                return True

        return False
コード例 #11
0
ファイル: glomerator.py プロジェクト: psathyrella/partis
        def get_clusters_to_merge():
            smallest_min_distance, clusters_to_merge = None, None
            n_skipped = 0
            for clust_a, clust_b in itertools.combinations(clusters, 2):  # find the two clusters which contain the pair of sequences which are closest in hamming fraction (skipping cluster pairs that would make a cluster that's too big)
                if len(clust_a) + len(clust_b) > max_per_cluster and not glomerate.merge_whatever_you_got:  # merged cluster would be too big, so look for smaller (albeit further-apart) things to merge
                    n_skipped += 1
                    continue
                min_distance = None  # find the smallest hamming distance between any two sequences in the two clusters
                for query_a in clust_a:
                    for query_b in clust_b:
                        joint_key = query_a + ';' + query_b  #';'.join([query_a, query_b])
                        if joint_key not in distances:
                            distances[joint_key] = utils.hamming_fraction(naive_seqs[query_a], naive_seqs[query_b])
                            distances[query_b + ';' + query_a] = distances[joint_key]  # also add with key in reverse order, in case we run into the pair that way later on
                        if min_distance is None or distances[joint_key] < min_distance:
                            min_distance = distances[joint_key]
                if smallest_min_distance is None or min_distance < smallest_min_distance:
                    smallest_min_distance = min_distance
                    clusters_to_merge = (clust_a, clust_b)

            if debug and n_skipped > 0:
                print '      skipped: %d ' % n_skipped

            return clusters_to_merge
コード例 #12
0
ファイル: test.py プロジェクト: Irrationone/partis
    def compare_partition_cachefiles(self, input_stype):
        """ NOTE only writing this for the ref input_stype a.t.m. """

        # ----------------------------------------------------------------------------------------
        def print_key_differences(vtype, refkeys, newkeys):
            print '    %s keys' % vtype
            if len(refkeys - newkeys) > 0 or len(newkeys - refkeys) > 0:
                if len(refkeys - newkeys) > 0:
                    print utils.color('red', '      %d only in ref version' % len(refkeys - newkeys))
                if len(newkeys - refkeys) > 0:
                    print utils.color('red', '      %d only in new version' % len(newkeys - refkeys))
                print '      %d in common' % len(refkeys & newkeys)
            else:
                print '        %d identical keys in new and ref cache' % len(refkeys)

        ptest = 'partition-' + input_stype + '-simu'
        if args.quick and ptest not in self.quick_tests:
            return

        # ----------------------------------------------------------------------------------------
        print '  %s input partition cache file' % input_stype
        def readcache(fname):
            cache = {'naive_seqs' : {}, 'logprobs' : {}}
            with open(fname) as cachefile:
                reader = csv.DictReader(cachefile)
                for line in reader:
                    if line['naive_seq'] != '':
                        cache['naive_seqs'][line['unique_ids']] = line['naive_seq']
                    if line['logprob'] != '':
                        cache['logprobs'][line['unique_ids']] = float(line['logprob'])
            return cache

        refcache = readcache(self.dirs['ref'] + '/' + self.cachefnames[input_stype])
        newcache = readcache(self.dirs['new'] + '/' + self.cachefnames[input_stype])

        # work out intersection and complement
        refkeys, newkeys = {}, {}
        for vtype in ['naive_seqs', 'logprobs']:
            refkeys[vtype] = set(refcache[vtype].keys())
            newkeys[vtype] = set(newcache[vtype].keys())
            print_key_differences(vtype, refkeys[vtype], newkeys[vtype])

        hammings = []
        n_hammings = 0
        n_different_length, n_big_hammings = 0, 0
        hamming_eps = 0.
        vtype = 'naive_seqs'
        for uids in refkeys[vtype] & newkeys[vtype]:
            refseq = refcache[vtype][uids]
            newseq = newcache[vtype][uids]
            n_hammings += 1
            if len(refseq) == len(newseq):
                hamming_fraction = utils.hamming_fraction(refseq, newseq)
                if hamming_fraction > hamming_eps:
                    n_big_hammings += 1
                    hammings.append(hamming_fraction)
            else:
                n_different_length += 1

        diff_hfracs_str = '%3d / %4d' % (n_big_hammings, n_hammings)
        mean_hfrac_str = '%.3f' % (numpy.average(hammings) if len(hammings) > 0 else 0.)
        if n_big_hammings > 0:
            diff_hfracs_str = utils.color('red', diff_hfracs_str)
            mean_hfrac_str = utils.color('red', mean_hfrac_str)

        abs_delta_logprobs = []
        n_delta_logprobs = 0
        n_big_delta_logprobs = 0
        logprob_eps = 1e-5
        vtype = 'logprobs'
        for uids in refkeys[vtype] & newkeys[vtype]:
            refval = refcache[vtype][uids]
            newval = newcache[vtype][uids]
            n_delta_logprobs += 1
            abs_delta_logprob = abs(refval - newval)
            if abs_delta_logprob > logprob_eps:
                n_big_delta_logprobs += 1
                abs_delta_logprobs.append(abs_delta_logprob)

        diff_logprob_str = '%3d / %4d' % (n_big_delta_logprobs, n_delta_logprobs)
        mean_logprob_str = '%.3f' % (numpy.average(abs_delta_logprobs) if len(abs_delta_logprobs) > 0 else 0.)
        if n_big_delta_logprobs > 0:
            diff_logprob_str = utils.color('red', diff_logprob_str)
            mean_logprob_str = utils.color('red', mean_logprob_str)
        print '                  fraction different     mean abs difference among differents'
        print '      naive seqs     %s                      %s      (hamming fraction)' % (diff_hfracs_str, mean_hfrac_str)
        print '      log probs      %s                      %s' % (diff_logprob_str, mean_logprob_str)
        if n_different_length > 0:
            print utils.color('red', '        %d different length' % n_different_length)
コード例 #13
0
ファイル: test.py プロジェクト: antibodyome/partis
    def compare_partition_cachefiles(self, input_stype):
        """ NOTE only writing this for the ref input_stype a.t.m. """
        ptest = 'partition-' + input_stype + '-simu'
        if args.quick and ptest not in self.quick_tests:
            return

        print '%s input partition cache file' % input_stype

        def readcache(fname):
            cache = {}
            with open(fname) as cachefile:
                reader = csv.DictReader(cachefile)
                for line in reader:
                    cache[line['unique_ids']] = {'naive_seq' : line['naive_seq'], 'logprob' : float(line['logprob'])}
            return cache

        refcache = readcache(self.dirs['ref'] + '/' + self.cachefnames[input_stype])
        newcache = readcache(self.dirs['new'] + '/' + self.cachefnames[input_stype])

        # work out intersection and complement
        refkeys = set(refcache.keys())
        newkeys = set(newcache.keys())
        if len(refkeys - newkeys) > 0 or len(newkeys - refkeys) > 0:
            if len(refkeys - newkeys) > 0:
                print utils.color('red', '  %d only in ref version' % len(refkeys - newkeys))
            if len(newkeys - refkeys) > 0:
                print utils.color('red', '  %d only in new version' % len(newkeys - refkeys))
            print '  %d in common' % len(refkeys & newkeys)
        else:
            print '    %d identical keys in new and ref cache' % len(refkeys)

        hammings, delta_logprobs = [], []
        n_hammings, n_delta_logprobs = 0, 0
        n_different_length, n_big_hammings, n_big_delta_logprobs = 0, 0, 0
        hamming_eps = 0.
        logprob_eps = 1e-5
        for uids in refkeys & newkeys:
            refline = refcache[uids]
            newline = newcache[uids]
            if refline['naive_seq'] != '':
                n_hammings += 1
                if len(refline['naive_seq']) == len(newline['naive_seq']):
                    hamming_fraction = utils.hamming_fraction(refline['naive_seq'], newline['naive_seq'])
                    if hamming_fraction > hamming_eps:
                        n_big_hammings += 1
                        hammings.append(hamming_fraction)
                else:
                    n_different_length += 1
            if refline['logprob'] != '':
                n_delta_logprobs += 1
                delta_logprob = abs(float(refline['logprob']) - float(newline['logprob']))
                if delta_logprob > logprob_eps:
                    n_big_delta_logprobs += 1
                    delta_logprobs.append(delta_logprob)

        diff_hfracs_str = '%3d / %4d' % (n_big_hammings, n_hammings)
        mean_hfrac_str = '%.3f' % (numpy.average(hammings) if len(hammings) > 0 else 0.)
        if n_big_hammings > 0:
            diff_hfracs_str = utils.color('red', diff_hfracs_str)
            mean_hfrac_str = utils.color('red', mean_hfrac_str)

        diff_logprob_str = '%3d / %4d' % (n_big_delta_logprobs, n_delta_logprobs)
        mean_logprob_str = '%.6f' % (numpy.average(delta_logprobs) if len(delta_logprobs) > 0 else 0.)
        if n_big_delta_logprobs > 0:
            diff_logprob_str = utils.color('red', diff_logprob_str)
            mean_logprob_str = utils.color('red', mean_logprob_str)
        print '                fraction different     mean difference among differents'
        print '    naive seqs     %s                      %s      (hamming fraction)' % (diff_hfracs_str, mean_hfrac_str)
        print '    log probs      %s                      %s' % (diff_logprob_str, mean_logprob_str)
        if n_different_length > 0:
            print utils.color('red', '      %d different length' % n_different_length)
コード例 #14
0
ファイル: test.py プロジェクト: stevenweaver/partis
    def compare_partition_cachefiles(self, input_stype):
        """ NOTE only writing this for the ref input_stype a.t.m. """
        ptest = "partition-" + input_stype + "-simu"
        if args.quick and ptest not in self.quick_tests:
            return

        print "%s partition cache file" % input_stype

        def readcache(fname):
            cache = {}
            with open(fname) as cachefile:
                reader = csv.DictReader(cachefile)
                for line in reader:
                    cache[line["unique_ids"]] = {"naive_seq": line["naive_seq"], "logprob": float(line["logprob"])}
            return cache

        refcache = readcache(self.dirs["ref"] + "/" + self.cachefnames[input_stype])
        newcache = readcache(self.dirs["new"] + "/" + self.cachefnames[input_stype])

        # work out intersection and complement
        refkeys = set(refcache.keys())
        newkeys = set(newcache.keys())
        if len(refkeys - newkeys) > 0 or len(newkeys - refkeys) > 0:
            if len(refkeys - newkeys) > 0:
                print utils.color("red", "  %d only in ref" % len(refkeys - newkeys))
            if len(newkeys - refkeys) > 0:
                print utils.color("red", "  %d only in new" % len(newkeys - refkeys))
            print "  %d in common" % len(refkeys & newkeys)
        else:
            print "    %d identical keys in new and ref cache" % len(refkeys)

        hammings, delta_logprobs = [], []
        n_hammings, n_delta_logprobs = 0, 0
        n_different_length, n_big_hammings, n_big_delta_logprobs = 0, 0, 0
        hamming_eps = 0.0
        logprob_eps = 1e-5
        for uids in refkeys & newkeys:
            refline = refcache[uids]
            newline = newcache[uids]
            if refline["naive_seq"] != "":
                n_hammings += 1
                if len(refline["naive_seq"]) == len(newline["naive_seq"]):
                    hamming_fraction = utils.hamming_fraction(refline["naive_seq"], newline["naive_seq"])
                    if hamming_fraction > hamming_eps:
                        n_big_hammings += 1
                        hammings.append(hamming_fraction)
                else:
                    n_different_length += 1
            if refline["logprob"] != "":
                n_delta_logprobs += 1
                delta_logprob = abs(float(refline["logprob"]) - float(newline["logprob"]))
                if delta_logprob > logprob_eps:
                    n_big_delta_logprobs += 1
                    delta_logprobs.append(delta_logprob)

        diff_hfracs_str = "%3d / %4d" % (n_big_hammings, n_hammings)
        mean_hfrac_str = "%.3f" % (numpy.average(hammings) if len(hammings) > 0 else 0.0)
        if n_big_hammings > 0:
            diff_hfracs_str = utils.color("red", diff_hfracs_str)
            mean_hfrac_str = utils.color("red", mean_hfrac_str)

        diff_logprob_str = "%3d / %4d" % (n_big_delta_logprobs, n_delta_logprobs)
        mean_logprob_str = "%.6f" % (numpy.average(delta_logprobs) if len(delta_logprobs) > 0 else 0.0)
        if n_big_delta_logprobs > 0:
            diff_logprob_str = utils.color("red", diff_logprob_str)
            mean_logprob_str = utils.color("red", mean_logprob_str)
        print "                fraction different     mean difference among differents"
        print "    naive seqs     %s                      %s      (hamming fraction)" % (
            diff_hfracs_str,
            mean_hfrac_str,
        )
        print "    log probs      %s                      %s" % (diff_logprob_str, mean_logprob_str)
        if n_different_length > 0:
            print utils.color("red", "      %d different length" % n_different_length)
コード例 #15
0
ファイル: test.py プロジェクト: atombaby/partis
    def compare_partition_cachefiles(self, input_stype):
        """
        Read the ref and new versions of the partition cache file for <input_stype>, then print a summary of how the
        naive sequences (hamming fraction) and log probs (absolute difference) differ among the keys they share.
        Returns without printing anything if args.quick is set and the corresponding test isn't in self.quick_tests.
        NOTE only writing this for the ref input_stype a.t.m.
        """
        hamming_eps = 0.  # naive seqs count as different if their hamming fraction is larger than this
        logprob_eps = 1e-5  # log probs count as different if they differ by more than this

        # ----------------------------------------------------------------------------------------
        def print_key_differences(vtype, refkeys, newkeys):
            # report how many keys are exclusive to each version (or that they're all shared)
            n_only_ref = len(refkeys - newkeys)
            n_only_new = len(newkeys - refkeys)
            print('    %s keys' % vtype)
            if n_only_ref == 0 and n_only_new == 0:
                print('        %d identical keys in new and ref cache' % len(refkeys))
                return
            if n_only_ref > 0:
                print(utils.color('red', '      %d only in ref version' % n_only_ref))
            if n_only_new > 0:
                print(utils.color('red', '      %d only in new version' % n_only_new))
            print('      %d in common' % len(refkeys & newkeys))

        # ----------------------------------------------------------------------------------------
        def readcache(fname):
            # collect the non-empty naive seqs and log probs from <fname>, each keyed by the 'unique_ids' column
            naive_seqs, logprobs = {}, {}
            with open(fname) as cachefile:
                for line in csv.DictReader(cachefile):
                    if line['naive_seq'] != '':
                        naive_seqs[line['unique_ids']] = line['naive_seq']
                    if line['logprob'] != '':
                        logprobs[line['unique_ids']] = float(line['logprob'])
            return {'naive_seqs' : naive_seqs, 'logprobs' : logprobs}

        # ----------------------------------------------------------------------------------------
        def summary_strs(big_diffs, n_total):
            # '<n different> / <n total>' and the mean of the differences, both colored red if anything differed
            frac_str = '%3d / %4d' % (len(big_diffs), n_total)
            mean_str = '%.3f' % (numpy.average(big_diffs) if len(big_diffs) > 0 else 0.)
            if len(big_diffs) > 0:
                frac_str = utils.color('red', frac_str)
                mean_str = utils.color('red', mean_str)
            return frac_str, mean_str

        ptest = 'partition-' + input_stype + '-simu'
        if args.quick and ptest not in self.quick_tests:
            return

        # ----------------------------------------------------------------------------------------
        print('  %s input partition cache file' % input_stype)
        caches = {}
        for version in ['ref', 'new']:
            caches[version] = readcache(self.dirs[version] + '/' + self.cachefnames[input_stype])

        # work out intersection and complement
        common_uids = {}
        for vtype in ['naive_seqs', 'logprobs']:
            refkeys, newkeys = set(caches['ref'][vtype]), set(caches['new'][vtype])
            print_key_differences(vtype, refkeys, newkeys)
            common_uids[vtype] = refkeys & newkeys

        # naive sequences: hamming fraction between the two versions (only defined if they're the same length)
        hammings = []
        n_different_length = 0
        for uids in common_uids['naive_seqs']:
            refseq = caches['ref']['naive_seqs'][uids]
            newseq = caches['new']['naive_seqs'][uids]
            if len(refseq) != len(newseq):
                n_different_length += 1
                continue
            hfrac = utils.hamming_fraction(refseq, newseq)
            if hfrac > hamming_eps:
                hammings.append(hfrac)
        diff_hfracs_str, mean_hfrac_str = summary_strs(hammings, len(common_uids['naive_seqs']))

        # log probs: absolute difference between the two versions
        all_abs_deltas = [abs(caches['ref']['logprobs'][uids] - caches['new']['logprobs'][uids]) for uids in common_uids['logprobs']]
        abs_delta_logprobs = [delta for delta in all_abs_deltas if delta > logprob_eps]
        diff_logprob_str, mean_logprob_str = summary_strs(abs_delta_logprobs, len(common_uids['logprobs']))

        print('                  fraction different     mean abs difference among differents')
        print('      naive seqs     %s                      %s      (hamming fraction)' % (diff_hfracs_str, mean_hfrac_str))
        print('      log probs      %s                      %s' % (diff_logprob_str, mean_logprob_str))
        if n_different_length > 0:
            print(utils.color('red', '        %d different length' % n_different_length))
コード例 #16
0
    def hamming_distance_to_true_naive(self, true_line, line, query_name, restrict_to_region='', normalize=False, padfo=None, debug=False):
        """
        Hamming distance between the inferred naive sequence (from <line>) and the true naive sequence (from <true_line>).
        <restrict_to_region> if set, restrict the comparison to the section of the *true* sequence assigned to the given region.
        NOTE this will not in general correspond to the similarly-assigned region in the inferred naive sequence.
        <normalize> if set, return the distance as an integer percent of the length of the (possibly restricted) true sequence, rather than as a raw count.
        <padfo> if set, a dict with 'padleft' and 'padright' entries giving the number of N pad characters to try to remove from the inferred sequence.
        <query_name> is only used in error messages.
        Returns 0 (after printing a warning) if the sequences being compared have zero length.
        Raises Exception if the true and inferred sequences are not the same length (either as passed in, or after unpadding/restricting).
        """

        true_naive_seq = true_line['naive_seq']
        inferred_naive_seq = line['naive_seq']
        if len(true_naive_seq) != len(inferred_naive_seq):
            # dump the two lines side by side before bailing out (single-argument parenthesized prints are equivalent to the print statement)
            print('%20s    true      inf' % '')
            for k in true_line:
                inf_str = '   %s' % (line[k], ) if k in line else '    NOPE'  # wrap in a tuple so tuple-valued entries format properly
                print('%20s   %s %s' % (k, true_line[k], inf_str))
            for k in line:
                if k not in true_line:
                    print('  not in true line   %20s    %s' % (k, line[k]))
            raise Exception('%s true and inferred sequences not the same length\n   %s\n   %s\n' % (line['unique_id'], true_naive_seq, inferred_naive_seq))

        if padfo is not None:  # remove N padding from the inferred sequence
            if debug:
                print('removing padfo')
                print(inferred_naive_seq)
            # NOTE(review): this counts the Ns in everything *after* the first <padleft> characters, rather than in the first <padleft> characters themselves,
            # so the slice looks backwards. Left as-is, since actually stripping the padding would change the length relative to the true sequence (which would trip the length check below) -- confirm intent.
            if inferred_naive_seq[padfo['padleft'] : ].count('N') == padfo['padleft']:  # this fails to happen if reset_effective_erosions_and_effective_insertions already removed the Ns
                inferred_naive_seq = inferred_naive_seq[padfo['padleft'] : ]
            elif debug:  # NOTE if no debug, we just fall through, which is ok
                print('tried to remove non Ns!\n   %s\n   padleft %d\n' % (inferred_naive_seq, padfo['padleft']))
            if padfo['padright'] > 0:
                # NOTE(review): same comment as for the left side: this looks at the first <padright> characters rather than the last ones -- confirm intent
                if inferred_naive_seq[ : padfo['padright']].count('N') == padfo['padright']:  # this fails to happen if reset_effective_erosions_and_effective_insertions already removed the Ns
                    inferred_naive_seq = inferred_naive_seq[ : -padfo['padright']]
                elif debug:  # NOTE if no debug, we just fall through, which is ok
                    print('tried to remove non Ns!\n   %s\n   padright %d\n' % (inferred_naive_seq, padfo['padright']))
            if debug:
                print(padfo['padleft'] * ' ' + inferred_naive_seq + padfo['padleft'] * ' ')

        if restrict_to_region != '':
            bounds = true_line['regional_bounds'][restrict_to_region]  # NOTE bounds are from the *true* line, and get applied to both sequences
            if debug:
                print('restrict to %s' % restrict_to_region)
                utils.color_mutants(true_naive_seq, inferred_naive_seq, print_result=True, extra_str='      ')
                utils.color_mutants(true_naive_seq[bounds[0] : bounds[1]], inferred_naive_seq[bounds[0] : bounds[1]], print_result=True, extra_str='      ' + bounds[0]*' ')
            true_naive_seq = true_naive_seq[bounds[0] : bounds[1]]
            inferred_naive_seq = inferred_naive_seq[bounds[0] : bounds[1]]

        if len(true_naive_seq) != len(inferred_naive_seq):
            raise Exception('still not the same lengths for %s\n  %s\n  %s' % (query_name, true_naive_seq, inferred_naive_seq))
        if len(true_naive_seq) == 0:  # check this *before* asking for the hamming fraction, so we never hand it empty sequences
            print('WARNING zero length sequence in hamming_distance_to_true_naive')
            return 0

        fraction, len_excluding_ambig = utils.hamming_fraction(true_naive_seq, inferred_naive_seq, return_len_excluding_ambig=True)
        # round rather than truncate: float round-off can leave the product just below the true integer count (e.g. 1/49. * 49 is 0.9999999999999999)
        total_distance = int(round(fraction * len_excluding_ambig))
        if normalize:
            return int(100 * (float(total_distance) / len(true_naive_seq)))
        else:
            return total_distance
コード例 #17
0
    def resolve_discordant_clusters(single_cluster,
                                    single_annotation,
                                    cluster_list,
                                    annotation_list,
                                    tdbg=False):
        """
        Work out which of the clusters in <cluster_list> should stay split from one another, and return the resulting list of clusters.
        Clusters are first grouped by the cdr3 length in their annotation, then within each group any two clusters whose naive sequences
        have hamming fraction larger than the upper naive hamming bound are kept apart, while the rest are greedily merged together.
        If <cluster_list> has only one cluster, returns [<single_cluster>].
        <annotation_list> annotations for the clusters in <cluster_list> (looked up with akey())
        <single_annotation> is not used in this function
        <tdbg> if set, print debug info as we go
        NOTE the returned clusters are new lists, i.e. the clusters in <cluster_list> are not modified
        """
        # NOTE single_cluster and cluster_list in general have quite different sets of uids, and that's fine. All that matters here is we're trying to find all the clusters that should be split from one another (without doing some all against all horror)
        if len(cluster_list) == 1:  # nothing to do
            return [single_cluster]  # NOTE <single_cluster> doesn't get used after here
        adict = utils.get_annotation_dict(annotation_list)
        cdr3_groups = utils.group_seqs_by_value(cluster_list, lambda c: adict[akey(c)]['cdr3_length'])  # group the together clusters in <cluster_list> that have the same cdr3 (there's already utils.split_clusters_by_cdr3(), but it uses different inputs (e.g. sw_info) so i think it makes sense to not use it here)
        if tdbg:
            print('   %s one cluster vs %d clusters' % (utils.color('blue', 'syncing'), len(cluster_list)))
            print('     split into %d cdr3 groups' % len(cdr3_groups))
        overall_mute_freq = numpy.mean([f for l in annotation_list for f in l['mut_freqs']])
        _, hi_hbound = utils.get_naive_hamming_bounds('likelihood', overall_mute_freq=overall_mute_freq)  # these are the wider bounds, so < lo is almost certainly clonal, > hi is almost certainly not (we only use the hi one here)
        return_clusts = []
        for icdr, cdrgroup in enumerate(cdr3_groups):  # within each cdr3 group, split (i.e. use the cluster boundaries from cluster_list rather than single_cluster) if naive hfrac is > hi_hbound (but then there's shenanigans to adjudicate between different possibilities)
            if tdbg:
                print('      %s hfrac bound %.2f' % (utils.color('purple', 'icdr %d' % icdr), hi_hbound))

            # first figure out who needs to be split from whom
            clusters_to_split = {akey(c) : [] for c in cdrgroup}  # map from each cluster ('s key) to a list of clusters from which it should be split
            for c1, c2 in itertools.combinations(cdrgroup, 2):  # we could take account of the hfrac of both chains at this point, but looking at only the "split" one rather than the "merged" one, as we do here, is i think equivalent to assuming the merged one has zero hfrac, which is probably fine, since we only split if the split chain is very strongly suggesting we split
                hfrac = utils.hamming_fraction(adict[akey(c1)]['naive_seq'], adict[akey(c2)]['naive_seq'])  # all clusters with the same cdr3 len have been padded in waterer so their naive seqs are the same length
                if hfrac > hi_hbound:
                    clusters_to_split[akey(c1)].append(c2)
                    clusters_to_split[akey(c2)].append(c1)

            # then do the splitting, which is accomplished by merging each cluster in <cdrgroup> with every other cluster in <cdrgroup> from which we aren't supposed to split it (i.e. that aren't in its <clusters_to_split>)
            if tdbg:
                print('                  N to     new')
                print('          size    split   cluster?')
            tmpclusts_for_return = []  # final (return) clusters for this cdr3 class
            for cclust in cdrgroup:
                split_clusts = clusters_to_split[akey(cclust)]
                dbgstr = '         %4d    %3d ' % (len(cclust), len(split_clusts))  # start of the debug line for this cluster (gets finished off below)
                found_one = False
                for rclust in tmpclusts_for_return:  # look for an existing return cluster to which we can merge cclust, i.e. that doesn't have any uids from which we want to split
                    if any_in_common([rclust], split_clusts):  # if any uid in rclust is in a cluster from which we want to be split, skip it, i.e. don't merge with that cluster (note that we have to do it by uid because the rclusts are already merged so don't necessarily correspond to any existing cluster)
                        continue
                    if tdbg:
                        print(dbgstr + '     merging with size %d' % len(rclust))
                    rclust += cclust  # in-place extension is fine, since everything in <tmpclusts_for_return> is our own copy
                    found_one = True
                    break  # i.e. we just merge with the first one we find and stop looking; if there's more than one, it means we could merge all three together if we wanted (triangle inequality-ish, see diagram linked at top of fcn), but i doubt it'll matter either way, and this is easier
                if not found_one:
                    if tdbg:
                        print(dbgstr + '      y')
                    tmpclusts_for_return.append(list(cclust))  # if we didn't find an existing cluster that we can add it to, add it as a new cluster (a *copy*, so the merging above doesn't modify the caller's clusters, which are also referenced from <clusters_to_split>)

            return_clusts += tmpclusts_for_return

        # NOTE(review): <debug> isn't an argument to this fcn (unlike <tdbg>), so presumably it comes from the enclosing scope -- confirm that's intended
        if debug:
            print('      returning: %s' % ' '.join([str(len(c)) for c in return_clusts]))
            # ptnprint(return_clusts)
        return return_clusts