Python hamming_distance 예제들, GCutils.hamming_distance Python 예제들

예제 #1

0

파일 보기

파일: selection_utils.py 프로젝트: dunleavy005/bcr-phylo-benchmark

def target_distance_fcn(args, this_seq, target_seqs):
    """Return the smallest Hamming distance from ``this_seq`` to any target.

    ``args.metric_for_target_distance`` selects which attribute of the
    sequence objects is compared: ``'aa'`` compares ``.aa`` and ``'nuc'``
    compares ``.nuc``.

    Raises:
        AssertionError: if the metric is neither ``'aa'`` nor ``'nuc'``.
        ValueError: if ``target_seqs`` is empty (``min`` of an empty list).
    """
    metric = args.metric_for_target_distance
    if metric == 'aa':
        return min([hamming_distance(this_seq.aa, t.aa) for t in target_seqs])
    if metric == 'nuc':
        return min(
            [hamming_distance(this_seq.nuc, t.nuc) for t in target_seqs])
    # Raise explicitly rather than `assert False`: assert statements are
    # stripped under `python -O`, which would make this silently return None.
    raise AssertionError(
        'unknown metric_for_target_distance: {!r}'.format(metric))

예제 #2

0

파일 보기

파일: selection_utils.py 프로젝트: m-vieira/bcr-phylo-benchmark

def calc_Kd(seqAA, targetAAseqs, hd2affy):
    '''Return the Kd of ``seqAA`` based on its distance to the closest target.

    The Hamming distance from ``seqAA`` to every sequence in ``targetAAseqs``
    is computed, and the smallest one is passed through ``hd2affy`` (the
    "hamming distance to affinity" transformation). A sequence containing a
    stop symbol ('*') is reported with an infinite Kd, i.e. zero affinity.
    '''
    # Nonsense sequences short-circuit: no target comparison is made.
    if '*' in seqAA:
        return float('inf')
    closest = min(hamming_distance(seqAA, target) for target in targetAAseqs)
    return hd2affy(closest)

예제 #3

0

파일 보기

파일: gctree_tools.py 프로젝트: m-vieira/bcr-phylo-benchmark

 def compare(self, tree2, method='identity'):
     '''Compare this tree to ``tree2`` using the chosen ``method``.

     Both objects are expected to expose the underlying tree as ``.tree``.

     method:
       'identity' -- return True if both trees produce the same sorted list
                     of (sequence, frequency, parent sequence) tuples, else
                     False.
       'MRCA'     -- for every pair of taxa (sequences of nodes in this tree
                     with non-zero frequency) take the common ancestor of the
                     pair in this tree and in ``tree2``; return the summed
                     Hamming distance between those ancestor sequences
                     divided by the summed ancestor sequence lengths.
       'RF'       -- return the first element of ``robinson_foulds`` computed
                     on deep copies of both trees, after hanging an extra
                     leaf carrying the sequence under every node with
                     frequency > 0.

     Raises ValueError for any other ``method``.
     '''
     if method == 'identity':
         # we compare lists of seq, parent, abundance
         # return true if these lists are identical, else false
         def node_records(tree):
             return sorted((node.sequence, node.frequency,
                            node.up.sequence if node.up is not None else None)
                           for node in tree.traverse())

         return node_records(self.tree) == node_records(tree2.tree)
     elif method == 'MRCA':
         # matrix of hamming distance of common ancestors of taxa
         # takes a true and inferred tree as CollapsedTree objects
         taxa = [
             node.sequence for node in self.tree.traverse()
             if node.frequency
         ]
         n_taxa = len(taxa)
         d = scipy.zeros(shape=(n_taxa, n_taxa))
         sum_sites = scipy.zeros(shape=(n_taxa, n_taxa))
         # Look every taxon up once per tree instead of once per pair.
         # next(it) is used because it.next() only exists on Python 2.
         nodes_true = [
             next(self.tree.iter_search_nodes(sequence=seq)) for seq in taxa
         ]
         nodes_other = [
             next(tree2.tree.iter_search_nodes(sequence=seq)) for seq in taxa
         ]
         for i in range(n_taxa):
             for j in range(i + 1, n_taxa):
                 MRCA_true = self.tree.get_common_ancestor(
                     (nodes_true[i], nodes_true[j])).sequence
                 MRCA = tree2.tree.get_common_ancestor(
                     (nodes_other[i], nodes_other[j])).sequence
                 d[i, j] = hamming_distance(MRCA_true, MRCA)
                 sum_sites[i, j] = len(MRCA_true)
         return d.sum() / sum_sites.sum()
     elif method == 'RF':
         tree1_copy = self.tree.copy(method='deepcopy')
         tree2_copy = tree2.tree.copy(method='deepcopy')
         for treex in (tree1_copy, tree2_copy):
             # Materialise the traversal first: children are added below.
             for node in list(treex.traverse()):
                 if node.frequency > 0:
                     child = TreeNode()
                     child.add_feature('sequence', node.sequence)
                     node.add_child(child)
         try:
             return tree1_copy.robinson_foulds(tree2_copy,
                                               attr_t1='sequence',
                                               attr_t2='sequence',
                                               unrooted_trees=True)[0]
         except Exception:
             # Retry allowing duplicated attribute values; a bare `except:`
             # would also swallow KeyboardInterrupt/SystemExit.
             return tree1_copy.robinson_foulds(tree2_copy,
                                               attr_t1='sequence',
                                               attr_t2='sequence',
                                               unrooted_trees=True,
                                               allow_dup=True)[0]
     else:
         raise ValueError('invalid distance method: ' + method)

예제 #4

0

파일 보기

파일: iqtree_tools.py 프로젝트: m-vieira/bcr-phylo-benchmark

def ASR_parser(args):
    '''Turn an IQ-TREE result into a rendered, pickled collapsed forest.

    Reads from ``args``:
      tree               -- tree file parsed with ``Tree(..., format=1)``
      counts             -- text file with one ``id,count`` pair per line
      asr_seq, leaf_seq  -- passed on to ``map_asr_to_tree``
      naive              -- passed to ``map_asr_to_tree`` and used as the
                            rerooting pattern
      name               -- name given to the collapsed tree and forest
      colormap, idmap    -- optional pickle files; ``idmap`` is only read
                            when ``colormap`` is not None
      outbase            -- prefix for the outputs ``.svg``, ``.tree``, ``.p``

    Raises TreeFileParsingError if the tree file cannot be read.
    '''
    try:
        import cPickle as pickle
    except ImportError:
        import pickle
    from GCutils import CollapsedForest, CollapsedTree, hamming_distance

    try:
        tree = Tree(args.tree, format=1)
    except Exception as e:
        print(e)
        raise TreeFileParsingError('Could not read the input tree. Is this really newick format?')

    # Parse "id,count" lines; the file is closed as soon as parsing is done.
    counts = {}
    with open(args.counts) as counts_fh:
        for line in counts_fh:
            fields = line.split(',')
            counts[fields[0]] = int(fields[1])
    tree.add_feature('frequency', 0)       # Placeholder will be deleted when rerooting
    tree.add_feature('sequence', 'DUMMY')  # Placeholder will be deleted when rerooting
    tree = map_asr_to_tree(args.asr_seq, args.leaf_seq, tree, args.naive, counts)

    # Reroot to make the naive sequence the real root instead of just an outgroup:
    tree = reroot_tree(tree, pattern=args.naive)

    # Recompute branch lengths as hamming distances:
    tree.dist = 0  # No branch above root
    for node in tree.iter_descendants():
        node.dist = hamming_distance(node.sequence, node.up.sequence)

    iqtree_tree = CollapsedTree(tree=tree, name=args.name)
    # Add colors:
    if args.colormap is not None:
        # NOTE: pickle.load executes arbitrary code from the file; only use
        # colormap/idmap files that come from a trusted source.
        with open(args.colormap, 'rb') as fh:
            colormap = pickle.load(fh)
        with open(args.idmap, 'rb') as fh:
            id_map = pickle.load(fh)
        # Reverse the id_map:
        id_map = {cs:seq_id for seq_id, cell_ids in id_map.items() for cs in cell_ids}
        # Expand the colormap and map to sequence ids:
        colormap_seqid = dict()
        for key, color in colormap.items():
            if isinstance(key, str) and key in id_map:
                colormap_seqid[id_map[key]] = color
            else:
                # Non-string keys are treated as collections of cell ids.
                for cell_id in key:
                    if cell_id in id_map:
                        colormap_seqid[id_map[cell_id]] = color
        colormap = colormap_seqid
    else:
        colormap = None
    iqtree_tree.render(args.outbase + '.svg', colormap=colormap)
    iqtree_forest = CollapsedForest(forest=[iqtree_tree], name=args.name)
    # Dump tree as newick:
    iqtree_forest.write_random_tree(args.outbase+'.tree')
    print('number of trees with integer branch lengths:', iqtree_forest.n_trees)

    with open(args.outbase + '.p', 'wb') as f:
        pickle.dump(iqtree_forest, f)

    print('Done parsing IQ-TREE tree')

예제 #5

0

파일 보기

def build_tree(sequences, parents, counts=None, naive='naive'):
    '''Assemble an ete tree from named sequences and a child -> parent map.

    sequences -- mapping of node name to nucleotide sequence
    parents   -- mapping of node name to parent name; a name missing from it
                 is taken as the root
    counts    -- optional mapping of node name to frequency (0 if absent)
    naive     -- substring identifying the naive node to reroot on, or None
                 to skip rerooting

    Every node gets the features 'nuc_seq', 'aa_seq' and 'frequency'. After
    the optional rerooting the tree is passed through ``disambiguate`` and
    all branch lengths are set to Hamming distances between nucleotide
    sequences.
    '''
    # Step 1: one detached node per sequence name.
    nodes = {}
    for seq_name in sequences:
        new_node = Tree()
        new_node.name = seq_name
        nuc = sequences[new_node.name]
        new_node.add_feature('nuc_seq', nuc)
        new_node.add_feature('aa_seq', local_translate(nuc))
        has_count = counts is not None and new_node.name in counts
        new_node.add_feature('frequency',
                             counts[new_node.name] if has_count else 0)
        nodes[seq_name] = new_node

    # Step 2: connect children to their parents; parentless nodes are roots.
    for seq_name in sequences:
        if seq_name not in parents:
            tree = nodes[seq_name]
            continue
        nodes[parents[seq_name]].add_child(nodes[seq_name])

    # Step 3: reroot on naive.
    if naive is not None:
        naive_id = [n for n in nodes if naive in n][0]
        naive_node = nodes[naive_id]
        assert len(naive_node.children) == 0
        former_parent = naive_node.up
        former_parent.remove_child(naive_node)
        naive_node.add_child(former_parent)
        # remove a possibly unnecessary unifurcation after rerooting
        if len(former_parent.children) == 1:
            former_parent.delete(prevent_nondicotomic=False)
            lone_child = former_parent.children[0]
            lone_child.dist = hamming_distance(lone_child.nuc_seq,
                                               naive_node.nuc_seq)
        tree = naive_node

    # Step 4: make random choices for ambiguous bases.
    tree = disambiguate(tree)

    # Step 5: branch lengths; the root has no branch above it.
    tree.dist = 0
    for descendant in tree.iter_descendants():
        descendant.dist = hamming_distance(descendant.nuc_seq,
                                           descendant.up.nuc_seq)

    return tree

예제 #6

0

파일 보기

파일: simulator.py 프로젝트: m-vieira/bcr-phylo-benchmark

 def one_mutant(self, sequence, Nmuts, lambda0=0.1):
     '''
     Return the translation of a mutant of ``sequence`` whose amino acid
     Hamming distance to the translation of ``sequence`` is exactly Nmuts.

     Each attempt starts from a copy of ``sequence`` and keeps calling
     ``self.mutate`` until the distance is at least Nmuts. An attempt is
     discarded when it overshoots Nmuts or its translation contains a stop
     symbol ('*'). RuntimeError is raised after 100 discarded attempts.
     '''
     for _attempt in range(100):  # Allow 100 trials before quitting
         candidate = sequence[:]
         start_aa = translate(sequence)
         candidate_aa = translate(candidate)
         distance = hamming_distance(start_aa, candidate_aa)
         while distance < Nmuts:
             candidate = self.mutate(candidate, lambda0=lambda0)
             candidate_aa = translate(candidate)
             distance = hamming_distance(start_aa, candidate_aa)
         # Stop codon cannot be part of the return
         if distance != Nmuts or '*' in candidate_aa:
             continue
         return candidate_aa
     raise RuntimeError(
         '100 consecutive attempts for creating a target sequence failed.')

예제 #7

0

파일 보기

파일: simulator.py 프로젝트: m-vieira/bcr-phylo-benchmark

def simulate(args):
    '''
    Simulation subprogram. Can simulate in two modes.
    a) Neutral mode. A Galton-Watson process, with mutation probabilities according to a user defined motif model e.g. S5F.
    b) Selection mode. Using the same mutation process as in a), but in selection mode the poisson progeny distribution's lambda parameter
    is dynamically adjusted according to the hamming distance to a list of target sequences. The closer a sequence gets to one of the targets
    the higher fitness and the closer lambda will approach 2, vice versa when the sequence is far away lambda approaches 0.

    All configuration is read from attributes of ``args``. The simulation is retried (up to 1000 times) whenever it raises RuntimeError
    or yields fewer than two sampled genotypes; RuntimeError is raised if every attempt fails.

    Files written, all prefixed with ``args.outbase``:
      .fasta (or _seq1.fasta and _seq2.fasta for a sequence pair), _stats.tsv,
      _lineage_tree.svg, _lineage_tree.p, _collapsed_tree.p, _collapsed_tree.svg,
      _collapsed_tree_colormap.tsv, _collapsed_tree_colormap.p,
      and in selection mode _collapsed_runstat_color_tree.p and _collapsed_runstat_color_tree.svg.
    '''
    if args.random_seed is not None:
        numpy.random.seed(args.random_seed)
        random.seed(args.random_seed)
    mutation_model = MutationModel(args.mutability, args.substitution)
    if args.lambda0 is None:
        # NOTE(review): this default is derived from args.sequence before any
        # replacement by args.random_seq below -- confirm that is intended.
        args.lambda0 = [max([1, int(.01 * len(args.sequence))])]
    if args.random_seq is not None:
        from Bio import SeqIO
        records = list(SeqIO.parse(args.random_seq, "fasta"))
        random.shuffle(records)
        args.sequence = str(records[0].seq).upper()
    else:
        args.sequence = args.sequence.upper()
    if args.sequence2 is not None:
        if len(args.lambda0) == 1:  # Use the same mutation rate on both sequences
            args.lambda0 = [args.lambda0[0], args.lambda0[0]]
        elif len(args.lambda0) != 2:
            raise Exception(
                'Only one or two lambda0 can be defined for a two sequence simulation.'
            )

        # Extract the bounds between sequence 1 and 2:
        pair_bounds = ((0, len(args.sequence)),
                       (len(args.sequence),
                        len(args.sequence) + len(args.sequence2)))
        # Merge the two sequences to simplify future dealing with the pair:
        args.sequence += args.sequence2.upper()
    else:
        pair_bounds = None
    if args.selection:
        assert (
            args.B_total >= args.f_full
        )  # the fully activating fraction on BA must be possible to reach within B_total
        # Find the total amount of A necessary for sustaining the inputted carrying capacity:
        print((args.carry_cap, args.B_total, args.f_full, args.mature_affy))
        A_total = selection_utils.find_A_total(args.carry_cap, args.B_total,
                                               args.f_full, args.mature_affy,
                                               args.U)
        # Calculate the parameters for the logistic function:
        Lp = selection_utils.find_Lp(args.f_full, args.U)
        selection_params = [
            args.stop_dist, args.mature_affy, args.naive_affy,
            args.target_dist, args.target_count, args.skip_update, A_total,
            args.B_total, Lp, args.k, args.outbase
        ]
    else:
        selection_params = None

    trials = 1000
    # This loop makes us resimulate if size too small, or backmutation:
    for _trial in range(trials):
        try:
            tree = mutation_model.simulate(args.sequence,
                                           pair_bounds=pair_bounds,
                                           lambda_=args.lambda_,
                                           lambda0=args.lambda0,
                                           n=args.n,
                                           N=args.N,
                                           T=args.T,
                                           verbose=args.verbose,
                                           selection_params=selection_params)
            if args.selection:
                collapsed_tree = CollapsedTree(tree=tree,
                                               name='GCsim selection',
                                               collapse_syn=False,
                                               allow_repeats=True)
            else:
                collapsed_tree = CollapsedTree(
                    tree=tree, name='GCsim neutral'
                )  # <-- This will fail if backmutations
            tree.ladderize()
            uniques = sum(node.frequency > 0
                          for node in collapsed_tree.tree.traverse())
            if uniques < 2:
                raise RuntimeError(
                    'collapsed tree contains {} sampled sequences'.format(
                        uniques))
            break
        except RuntimeError as e:
            print('{}, trying again'.format(e))
    else:
        # Only reached when no attempt hit `break`. Checking the loop index
        # after the loop would also fire when the very last attempt succeeded.
        raise RuntimeError('{} attempts exceeded'.format(trials))

    # In the case of a sequence pair print them to separate files:
    if args.sequence2 is not None:
        (start1, end1), (start2, end2) = pair_bounds
        with open(args.outbase + '_seq1.fasta', 'w') as fh1, \
                open(args.outbase + '_seq2.fasta', 'w') as fh2:
            fh1.write('>naive\n')
            fh1.write(args.sequence[start1:end1] + '\n')
            fh2.write('>naive\n')
            fh2.write(args.sequence[start2:end2] + '\n')
            for leaf in tree.iter_leaves():
                if leaf.frequency != 0:
                    fh1.write('>' + leaf.name + '\n')
                    fh1.write(leaf.sequence[start1:end1] + '\n')
                    fh2.write('>' + leaf.name + '\n')
                    fh2.write(leaf.sequence[start2:end2] + '\n')
    else:
        with open(args.outbase + '.fasta', 'w') as f:
            f.write('>naive\n')
            f.write(args.sequence + '\n')
            for leaf in tree.iter_leaves():
                if leaf.frequency != 0:
                    f.write('>' + leaf.name + '\n')
                    f.write(leaf.sequence + '\n')

    # Some observable simulation stats to write:
    frequency, distance_from_naive, degree = zip(
        *[(node.frequency, hamming_distance(node.sequence, args.sequence),
           sum(
               hamming_distance(node.sequence, node2.sequence) == 1
               for node2 in collapsed_tree.tree.traverse()
               if node2.frequency and node2 is not node))
          for node in collapsed_tree.tree.traverse() if node.frequency])
    stats = pd.DataFrame({
        'genotype abundance': frequency,
        'Hamming distance to root genotype': distance_from_naive,
        'Hamming neighbor genotypes': degree
    })
    stats.to_csv(args.outbase + '_stats.tsv', sep='\t', index=False)

    print('{} simulated observed sequences'.format(
        sum(leaf.frequency for leaf in collapsed_tree.tree.traverse())))

    # Render the full lineage tree:
    ts = TreeStyle()
    ts.rotation = 90
    ts.show_leaf_name = False
    ts.show_scale = False

    colors = {}
    # Build a new set: an in-place `-=` would modify the shared SVG_COLORS.
    palette = set(SVG_COLORS) - set(['black', 'white', 'gray'])
    palette = cycle(list(palette))  # <-- Circular iterator

    # Either plot by DNA sequence or amino acid sequence. One flag is used
    # everywhere below so the keys of `colors` always match the lookups.
    plot_aa = args.plotAA and args.selection
    if plot_aa:
        colors[tree.AAseq] = 'gray'
    else:
        colors[tree.sequence] = 'gray'

    for n in tree.traverse():
        nstyle = NodeStyle()
        nstyle["size"] = 10
        color_key = n.AAseq if plot_aa else n.sequence
        if color_key not in colors:
            colors[color_key] = next(palette)
        nstyle['fgcolor'] = colors[color_key]
        n.set_style(nstyle)

    # Render and pickle lineage tree:
    tree.render(args.outbase + '_lineage_tree.svg', tree_style=ts)
    with open(args.outbase + '_lineage_tree.p', 'wb') as f:
        pickle.dump(tree, f)

    # Render collapsed tree,
    # create an id-wise colormap
    # NOTE: node.name can be a set
    if plot_aa:
        colormap = {
            node.name: colors[node.AAseq]
            for node in collapsed_tree.tree.traverse()
        }
    else:
        colormap = {
            node.name: colors[node.sequence]
            for node in collapsed_tree.tree.traverse()
        }
    collapsed_tree.write(args.outbase + '_collapsed_tree.p')
    collapsed_tree.render(args.outbase + '_collapsed_tree.svg',
                          idlabel=args.idlabel,
                          colormap=colormap)
    # Print colormap to file:
    with open(args.outbase + '_collapsed_tree_colormap.tsv', 'w') as f:
        for name, color in colormap.items():
            f.write((name if isinstance(name, str) else ','.join(name)) +
                    '\t' + color + '\n')
    with open(args.outbase + '_collapsed_tree_colormap.p', 'wb') as f:
        pickle.dump(colormap, f)

    if args.selection:
        # Define a list a suitable colors that are easy to distinguish:
        palette = cycle([
            'crimson', 'purple', 'hotpink', 'limegreen', 'darkorange',
            'darkkhaki', 'brown', 'lightsalmon', 'darkgreen', 'darkseagreen',
            'darkslateblue', 'teal', 'olive', 'wheat', 'magenta',
            'lightsteelblue', 'plum', 'gold'
        ])  # <-- circular iterator
        colors = {
            i: next(palette)
            for i in range(int(len(args.sequence) // 3))
        }
        # The minimum distance to the target is colored:
        colormap = {
            node.name: colors[node.target_dist]
            for node in collapsed_tree.tree.traverse()
        }
        collapsed_tree.write(args.outbase + '_collapsed_runstat_color_tree.p')
        collapsed_tree.render(args.outbase +
                              '_collapsed_runstat_color_tree.svg',
                              idlabel=args.idlabel,
                              colormap=colormap)
        # Load the selection run stats file and plot its contents:
        with open(args.outbase + '_selection_runstats.p', 'rb') as fh:
            runstats = pickle.load(fh)
        selection_utils.plot_runstats(runstats, args.outbase, colors)

예제 #8

0

파일 보기

파일: simulator.py 프로젝트: m-vieira/bcr-phylo-benchmark

    def simulate(self,
                 sequence,
                 pair_bounds=None,
                 lambda_=0.9,
                 lambda0=[1],
                 N=None,
                 T=None,
                 n=None,
                 verbose=False,
                 selection_params=None):
        '''
        Simulate a poisson branching process with mutation introduced
        by the chosen mutation model e.g. motif or uniform.
        Can either simulate under a neutral model without selection,
        or using an affinity maturation inspired model for selection.

        sequence         -- root sequence of the tree
        pair_bounds      -- None, or two (start, end) pairs; when given the
                            two slices of each sequence are mutated separately
                            with lambda0[0] and lambda0[1]
        lambda_          -- parameter of the default poisson progeny
                            distribution
        lambda0          -- list of values passed to self.mutate as lambda0
                            (the default list is shared but never modified)
        N                -- simulate until this many unterminated leaves
                            (mutually exclusive with T)
        T                -- list of sampling times; the simulation runs until
                            max(T)
        n                -- list of sample sizes (one value, or one per
                            element of T)
        verbose          -- print progress information
        selection_params -- None for the neutral model, otherwise the 11
                            values (stop_dist, mature_affy, naive_affy,
                            target_dist, target_count, skip_update, A_total,
                            B_total, Lp, k, outbase)

        Returns the uncollapsed tree with unobserved lineages pruned and nodes
        named 'simcell_<i>'. In selection mode the per-generation histograms
        are also pickled to outbase + '_selection_runstats.p'.

        Raises ValueError for inconsistent N/T/n arguments and RuntimeError
        when the tree ends up with too few leaves or stops before max(T).
        '''
        progeny = poisson(lambda_)  # Default progeny distribution
        stop_dist = None  # Default stopping criterium for affinity simulation
        # Checking the validity of the input parameters:
        if N is not None and T is not None:
            raise ValueError(
                'Only one of N and T can be used. One must be None.')
        if selection_params is not None and T is None:
            raise ValueError(
                'Simulation with selection was chosen. A time, T, must be specified.'
            )
        elif N is None and T is None:
            raise ValueError('Either N or T must be specified.')
        if N is not None and n is not None and n[-1] > N:
            raise ValueError('n ({}) must not larger than N ({})'.format(
                n[-1], N))
        elif N is not None and n is not None and len(n) != 1:
            raise ValueError(
                'n ({}) must a single value when specifying N'.format(n))
        if T is not None and len(T) > 1 and (n is None or
                                             (len(n) != 1
                                              and len(n) != len(T))):
            raise ValueError(
                'n must be specified when using intermediate sampling:', n)
        elif T is not None and len(T) > 1 and len(n) == 1:
            # A single sample size is reused for every sampling time.
            n = [n[-1]] * len(T)

        # Planting the tree:
        tree = TreeNode()
        tree.dist = 0
        tree.add_feature('sequence', sequence)
        tree.add_feature('terminated', False)
        tree.add_feature('sampled', False)
        tree.add_feature('frequency', 0)
        tree.add_feature('time', 0)

        if selection_params is not None:
            # Collect an array of the counts of each hamming distance at each time step
            hd_generation = []
            stop_dist, mature_affy, naive_affy, target_dist, target_count, skip_update, A_total, B_total, Lp, k, outbase = selection_params
            # Make a list of target sequences:
            targetAAseqs = [
                self.one_mutant(sequence, target_dist)
                for i in range(target_count)
            ]
            # Assert that the target sequences are comparable to the naive sequence:
            aa = translate(tree.sequence)
            # All targets are same length
            assert all(len(t) == len(aa) for t in targetAAseqs)
            # All targets are "target_dist" away from the naive sequence.
            # The previous check only required that at least one target matched.
            assert targetAAseqs and all(
                hamming_distance(aa, t) == target_dist for t in targetAAseqs)
            # Affinity is an exponential function of hamming distance:
            assert (target_dist > 0)

            def hd2affy(hd):
                return (mature_affy + hd**k *
                        (naive_affy - mature_affy) / target_dist**k)

            # We store both the amino acid sequence and the affinity as tree features:
            tree.add_feature('AAseq', str(aa))
            tree.add_feature(
                'Kd', selection_utils.calc_Kd(tree.AAseq, targetAAseqs,
                                              hd2affy))
            tree.add_feature(
                'target_dist',
                min([
                    hamming_distance(tree.AAseq, taa) for taa in targetAAseqs
                ]))

        t = 0  # <-- Time at start
        leaves_unterminated = 1
        # Small lambdas are causing problems so make a minimum:
        lambda_min = 10e-10
        hd_distrib = []
        while leaves_unterminated > 0 and (
                leaves_unterminated < N if N is not None else
                True) and (t < max(T) if T is not None else True) and (
                    stop_dist >= min(hd_distrib)
                    if stop_dist is not None and t > 0 else True):
            if verbose:
                print('At time:', t)
            t += 1
            # Sample intermediate time point:
            if T is not None and len(T) > 1 and (t - 1) in T:
                si = T.index(t - 1)
                live_nostop_leaves = [
                    l for l in tree.iter_leaves()
                    if not l.terminated and not has_stop(l.sequence)
                ]
                random.shuffle(live_nostop_leaves)
                if len(live_nostop_leaves) < n[si]:
                    # Report the sample size for this time point, not the whole list.
                    raise RuntimeError(
                        'tree with {} leaves, less than what desired for intermediate sampling {}. Try later generation or increasing the carrying capacity.'
                        .format(leaves_unterminated, n[si]))
                # Make the sample and kill the cells sampled:
                for leaf in live_nostop_leaves[:n[si]]:
                    leaves_unterminated -= 1
                    leaf.sampled = True
                    leaf.terminated = True
                if verbose:
                    print('Made an intermediate sample at time:', t - 1)
            live_leaves = [l for l in tree.iter_leaves() if not l.terminated]
            random.shuffle(live_leaves)
            skip_lambda_n = 0  # At every new round reset the all the lambdas
            # Draw progeny for each leaf:
            for leaf in live_leaves:
                if selection_params is not None:
                    if skip_lambda_n == 0:
                        skip_lambda_n = skip_update + 1  # Add one so skip_update=0 is no skip
                        tree = selection_utils.lambda_selection(
                            tree, targetAAseqs, hd2affy, A_total, B_total, Lp)
                    if leaf.lambda_ > lambda_min:
                        progeny = poisson(leaf.lambda_)
                    else:
                        progeny = poisson(lambda_min)
                    skip_lambda_n -= 1
                n_children = progeny.rvs()
                leaves_unterminated += n_children - 1  # <-- Getting 1, is equal to staying alive
                if not n_children:
                    leaf.terminated = True
                for child_count in range(n_children):
                    # If sequence pair mutate them separately with their own mutation rate:
                    if pair_bounds is not None:
                        mutated_sequence1 = self.mutate(
                            leaf.sequence[pair_bounds[0][0]:pair_bounds[0][1]],
                            lambda0=lambda0[0])
                        mutated_sequence2 = self.mutate(
                            leaf.sequence[pair_bounds[1][0]:pair_bounds[1][1]],
                            lambda0=lambda0[1])
                        mutated_sequence = mutated_sequence1 + mutated_sequence2
                    else:
                        mutated_sequence = self.mutate(leaf.sequence,
                                                       lambda0=lambda0[0])
                    child = TreeNode()
                    child.dist = sum(
                        x != y
                        for x, y in zip(mutated_sequence, leaf.sequence))
                    child.add_feature('sequence', mutated_sequence)
                    if selection_params is not None:
                        aa = translate(child.sequence)
                        child.add_feature('AAseq', str(aa))
                        child.add_feature(
                            'Kd',
                            selection_utils.calc_Kd(child.AAseq, targetAAseqs,
                                                    hd2affy))
                        child.add_feature(
                            'target_dist',
                            min([
                                hamming_distance(child.AAseq, taa)
                                for taa in targetAAseqs
                            ]))
                    child.add_feature('frequency', 0)
                    child.add_feature('terminated', False)
                    child.add_feature('sampled', False)
                    child.add_feature('time', t)
                    leaf.add_child(child)
            if selection_params is not None:
                hd_distrib = [
                    min([
                        hamming_distance(tn.AAseq, ta) for ta in targetAAseqs
                    ]) for tn in tree.iter_leaves() if not tn.terminated
                ]
                if target_dist > 0:
                    hist = scipy.histogram(hd_distrib,
                                           bins=list(range(target_dist * 10)))
                else:  # Just make a minimum of 10 bins
                    hist = scipy.histogram(hd_distrib, bins=list(range(10)))
                hd_generation.append(hist)
                if verbose and hd_distrib:
                    print('Total cell population:', sum(hist[0]))
                    print('Majority hamming distance:', scipy.argmax(hist[0]))
                    print('Affinity of latest sampled leaf:', leaf.Kd)
                    print(
                        'Progeny distribution lambda for the latest sampled leaf:',
                        leaf.lambda_)

        # N is None when simulating to a time T; comparing an int with None
        # raises TypeError on Python 3, so only check when N was given.
        if N is not None and leaves_unterminated < N:
            raise RuntimeError(
                'Tree terminated with {} leaves, {} desired'.format(
                    leaves_unterminated, N))

        # Keep a histogram of the hamming distances at each generation:
        if selection_params is not None:
            with open(outbase + '_selection_runstats.p', 'wb') as f:
                pickle.dump(hd_generation, f)

        # Each leaf in final generation gets an observation frequency of 1, unless downsampled:
        if T is not None and len(T) > 1:
            # Iterate the intermediate time steps (excluding the last time):
            for Ti in sorted(T)[:-1]:
                si = T.index(Ti)
                # Only sample those that have been 'sampled' at intermediate sampling times:
                final_leaves = [
                    leaf for leaf in tree.iter_descendants()
                    if leaf.time == Ti and leaf.sampled
                ]
                if len(final_leaves) < n[si]:
                    raise RuntimeError(
                        'tree terminated with {} leaves, less than what desired after downsampling {}'
                        .format(leaves_unterminated, n[si]))
                for leaf in final_leaves:  # No need to down-sample, this was already done in the simulation loop
                    leaf.frequency = 1
        if selection_params and max(T) != t:
            raise RuntimeError(
                'tree terminated with before the requested sample time.')
        # Do the normal sampling of the last time step:
        final_leaves = [
            leaf for leaf in tree.iter_leaves()
            if leaf.time == t and not has_stop(leaf.sequence)
        ]
        # Report stop codon sequences:
        stop_leaves = [
            leaf for leaf in tree.iter_leaves()
            if leaf.time == t and has_stop(leaf.sequence)
        ]
        if stop_leaves:
            print(
                'Tree contains {} leaves with stop codons, out of {} total at last time point.'
                .format(len(stop_leaves), len(final_leaves)))

        if T is not None:
            si = T.index(sorted(T)[-1])
        else:
            si = 0
        # By default, downsample to the target simulation size:
        if n is not None and len(final_leaves) >= n[si]:
            for leaf in random.sample(final_leaves, n[si]):
                leaf.frequency = 1
        elif n is None and N is not None:
            if len(
                    final_leaves
            ) < N:  # Removed nonsense sequences might decrease the number of final leaves to less than N
                N = len(final_leaves)
            for leaf in random.sample(final_leaves, N):
                leaf.frequency = 1
        elif N is None and T is not None:
            for leaf in final_leaves:
                leaf.frequency = 1
        elif n is not None and len(final_leaves) < n[si]:
            raise RuntimeError(
                'tree terminated with {} leaves, less than what desired after downsampling {}'
                .format(leaves_unterminated, n[si]))
        else:
            raise RuntimeError('Unknown option.')

        # Prune away lineages that are unobserved:
        for node in tree.iter_descendants():
            if sum(node2.frequency for node2 in node.traverse()) == 0:
                node.detach()

        # Remove unobserved unifurcations:
        for node in tree.iter_descendants():
            parent = node.up
            if node.frequency == 0 and len(node.children) == 1:
                node.delete(prevent_nondicotomic=False)
                node.children[0].dist = hamming_distance(
                    node.children[0].sequence, parent.sequence)

        # Assign unique names to each node:
        for i, node in enumerate(tree.traverse(), 1):
            node.name = 'simcell_{}'.format(i)

        # Return the uncollapsed tree:
        return tree

예제 #9

0

파일 보기

파일: COAR.py 프로젝트: m-vieira/bcr-phylo-benchmark

def align_lineages(seq,
                   tree_t,
                   tree_i,
                   gap_penalty_pct=0,
                   known_root=True,
                   allow_double_gap=False):
    '''
    Align the lineage of `seq` in the true tree to its lineage in the inferred tree.

    Standard implementation of a Needleman-Wunsch algorithm as described here:
    http://telliott99.blogspot.com/2009/08/alignment-needleman-wunsch.html
    https://en.wikipedia.org/wiki/Needleman%E2%80%93Wunsch_algorithm
    And implemented here:
    https://github.com/alevchuk/pairwise-alignment-in-python/blob/master/alignment.py

    seq: sequence used to look up the terminal node in both trees.
    tree_t: true tree (searched with find_node_by_seq).
    tree_i: inferred tree (searched with find_node_by_seq).
    gap_penalty_pct: the gap penalty relative to the sequence length of the sequences on the tree.
    known_root: if True, leading gaps score -inf so the first entries of both lineages are always paired.
    allow_double_gap: if False, gaps are only allowed in the shorter lineage
        (and not at all when both lineages are equally long).

    Returns False when both lineages have at most two entries. Otherwise returns
    [align_t, align_i, alignment_score, max_penalty]. The two aligned lists are
    filled during traceback, i.e. from the last lineage entry towards the first,
    and use '-' to mark a gap.

    Raises RuntimeError if the traceback reaches a cell that none of its
    predecessors can explain (this used to loop forever).
    '''
    # NOTE(review): the lineages are presumably lists of equal-length sequence
    # strings starting at the root -- confirm against reconstruct_lineage.
    nt = find_node_by_seq(tree_t, seq)
    lt = reconstruct_lineage(tree_t, nt)
    ni = find_node_by_seq(tree_i, seq)
    li = reconstruct_lineage(tree_i, ni)
    # At least one lineage must be longer than just the root and the terminal node:
    if len(lt) <= 2 and len(li) <= 2:
        return False

    # Gap penalty chosen not too large:
    gap_penalty = -1 * int((len(seq) / 100.0) * gap_penalty_pct)
    assert (gap_penalty <= 0)  # Penalties must be negative
    if gap_penalty == 0:  # If gap penalty is zero only gaps in the shortest list will be allowed
        assert (allow_double_gap is False)

    kt = len(lt)
    ki = len(li)
    forbidden = -1 * float('inf')  # penalty used to rule a gap direction out
    # Disallow gaps in the longest list:
    if allow_double_gap is False and kt > ki:
        # If true is longer than inferred allow gap only in inferred:
        gap_penalty_i = gap_penalty
        gap_penalty_j = forbidden
    elif allow_double_gap is False and kt < ki:
        # If inferred is longer than true allow gap only in true:
        gap_penalty_i = forbidden
        gap_penalty_j = gap_penalty
    elif allow_double_gap is False and kt == ki:
        # If lists are equally long no gaps are allowed:
        gap_penalty_i = forbidden
        gap_penalty_j = forbidden
    else:
        gap_penalty_i = gap_penalty
        gap_penalty_j = gap_penalty

    # Generate the score matrix.
    # Notice the score is defined by number of mismatches (negated):
    sc_mat = np.zeros((kt, ki), dtype=np.float64)
    for i in range(kt):
        for j in range(ki):
            sc_mat[i, j] = -1 * hamming_distance(lt[i], li[j])

    # Calculate the alignment scores:
    aln_sc = np.zeros((kt + 1, ki + 1), dtype=np.float64)
    for i in range(0, kt + 1):
        if known_root is True:
            aln_sc[i][0] = forbidden
        else:
            aln_sc[i][0] = gap_penalty_i * i
    for j in range(0, ki + 1):
        if known_root is True:
            aln_sc[0][j] = forbidden
        else:
            aln_sc[0][j] = gap_penalty_j * j
    # The top left is fixed to zero (this also overwrites the nan that
    # -inf * 0 produces there when a gap direction is forbidden):
    aln_sc[0][0] = 0
    for i in range(1, kt + 1):
        for j in range(1, ki + 1):
            match = aln_sc[i - 1][j - 1] + sc_mat[i - 1, j - 1]
            gap_in_inferred = aln_sc[i - 1][j] + gap_penalty_i
            gap_in_true = aln_sc[i][j - 1] + gap_penalty_j
            aln_sc[i][j] = max(match, gap_in_inferred, gap_in_true)

    # Traceback to compute the alignment:
    align_t, align_i = list(), list()
    i, j = kt, ki
    alignment_score = aln_sc[i][j]
    while i > 0 and j > 0:
        sc_current = aln_sc[i][j]
        sc_diagonal = aln_sc[i - 1][j - 1]
        sc_prev_true = aln_sc[i - 1][j]  # predecessor when the gap is in inferred
        sc_prev_inferred = aln_sc[i][j - 1]  # predecessor when the gap is in true

        if sc_current == (sc_diagonal + sc_mat[i - 1, j - 1]):
            align_t.append(lt[i - 1])
            align_i.append(li[j - 1])
            i -= 1
            j -= 1
        elif sc_current == (sc_prev_true + gap_penalty_i):
            align_t.append(lt[i - 1])
            align_i.append('-')
            i -= 1
        elif sc_current == (sc_prev_inferred + gap_penalty_j):
            align_t.append('-')
            align_i.append(li[j - 1])
            j -= 1
        else:
            # Without this branch neither index moves and the loop never ends.
            raise RuntimeError(
                'alignment traceback failed: no predecessor explains the score at cell ({}, {})'
                .format(i, j))

    # If space left fill it with gaps:
    while i > 0:
        align_t.append(lt[i - 1])
        align_i.append('-')
        i -= 1
    while j > 0:
        align_t.append('-')
        align_i.append(li[j - 1])
        j -= 1

    # Worst possible score for this alignment: every paired position fully
    # mismatched, every gapped position charged the gap penalty.
    max_penalty = 0
    for a, b in zip(align_t, align_i):
        if a == '-' or b == '-':
            max_penalty += gap_penalty
        else:
            max_penalty += -len(a)
    # Notice that the root and the terminal node is excluded from this comparison.
    # by adding their length to the max_penalty:
    if known_root is True:
        max_penalty += 2 * len(lt[0])
    else:  # Or in the case of an unknown root, just add the terminal node
        max_penalty += len(lt[0])

    return [align_t, align_i, alignment_score, max_penalty]

예제 #10

0

파일 보기

파일: gctree_tools.py 프로젝트: m-vieira/bcr-phylo-benchmark

    def render(self,
               outfile,
               idlabel=False,
               colormap=None,
               show_support=False,
               chain_split=None):
        '''
        render to image file, filetype inferred from suffix, svg for color images

        outfile: path passed to the tree's render method; with idlabel set, a
            fasta file with the same stem and a '.fasta' suffix is also written
        idlabel: if True, add node names as labels and write all node sequences
            (internal nodes included) to the fasta file
        colormap: optional mapping from node name to either a color string
            (drawn as a circle) or a mapping of color -> count (drawn as a pie chart);
            nodes missing from it are drawn in light gray
        show_support: value assigned to the tree style's show_branch_support
        chain_split: optional index splitting each sequence in a left and a
            right part; edges are then colored red (left part differs from the
            parent), blue (right part differs) or purple (both differ).
            Raises NotImplementedError when combined with a frame.
        '''
        def my_layout(node):
            # Layout callback: draws the abundance face (and optional id label) of one node.
            if colormap is None or node.name not in colormap:
                circle_color = 'lightgray'
            else:
                circle_color = colormap[node.name]
            text_color = 'black'
            # NOTE(review): scipy.sqrt is a deprecated alias in newer SciPy
            # releases -- confirm against the pinned SciPy version.
            if isinstance(circle_color, str):
                C = CircleFace(radius=max(3, 10 * scipy.sqrt(node.frequency)),
                               color=circle_color,
                               label={
                                   'text': str(node.frequency),
                                   'color': text_color
                               } if node.frequency > 0 else None)
                C.rotation = -90
                C.hz_align = 1
                faces.add_face_to_node(C, node, 0)
            else:
                # circle_color maps color -> count; slices are percentages of the abundance
                P = PieChartFace(
                    [100 * x / node.frequency for x in circle_color.values()],
                    2 * 10 * scipy.sqrt(node.frequency),
                    2 * 10 * scipy.sqrt(node.frequency),
                    colors=[(color if color != 'None' else 'lightgray')
                            for color in list(circle_color.keys())],
                    line_color=None)
                T = TextFace(' '.join(
                    [str(x) for x in list(circle_color.values())]),
                             tight_text=True)
                T.hz_align = 1
                T.rotation = -90
                faces.add_face_to_node(P, node, 0, position='branch-right')
                faces.add_face_to_node(T, node, 1, position='branch-right')
            if idlabel:
                T = TextFace(node.name, tight_text=True, fsize=6)
                T.rotation = -90
                T.hz_align = 1
                # the id label goes in the first column not used by the faces above
                faces.add_face_to_node(
                    T,
                    node,
                    1 if isinstance(circle_color, str) else 2,
                    position='branch-right')

        for node in self.tree.traverse():
            nstyle = NodeStyle()
            nstyle['size'] = 0
            # Edge styling only for non-root nodes whose sequence uses exactly the letters A, C, G and T:
            if node.up is not None and set(node.sequence.upper()) == set('ACGT'):
                if chain_split is not None:
                    if self.frame is not None:
                        raise NotImplementedError(
                            'frame not implemented with chain_split')
                    leftseq_mutated = hamming_distance(
                        node.sequence[:chain_split],
                        node.up.sequence[:chain_split]) > 0
                    rightseq_mutated = hamming_distance(
                        node.sequence[chain_split:],
                        node.up.sequence[chain_split:]) > 0
                    if leftseq_mutated and rightseq_mutated:
                        nstyle['hz_line_color'] = 'purple'
                        nstyle['hz_line_width'] = 3
                    elif leftseq_mutated:
                        nstyle['hz_line_color'] = 'red'
                        nstyle['hz_line_width'] = 2
                    elif rightseq_mutated:
                        nstyle['hz_line_color'] = 'blue'
                        nstyle['hz_line_width'] = 2
                if self.frame is not None:
                    # Largest window of whole codons starting at the frame offset;
                    # its end is derived from the child's length for both sequences.
                    start = self.frame - 1
                    end = start + 3 * ((len(node.sequence) - start) // 3)
                    aa = Seq(node.sequence[start:end],
                             generic_dna).translate()
                    aa_parent = Seq(node.up.sequence[start:end],
                                    generic_dna).translate()
                    nonsyn = hamming_distance(aa, aa_parent)
                    if '*' in aa:
                        # stop codon in the translated sequence
                        nstyle['bgcolor'] = 'red'
                    if nonsyn > 0:
                        # line width encodes the number of amino acid differences
                        nstyle['hz_line_color'] = 'black'
                        nstyle['hz_line_width'] = nonsyn
                    else:
                        nstyle['hz_line_type'] = 1
            node.set_style(nstyle)

        ts = TreeStyle()
        ts.show_leaf_name = False
        ts.rotation = 90
        ts.draw_aligned_faces_as_table = False
        ts.allow_face_overlap = True
        ts.layout_fn = my_layout
        ts.show_scale = False
        ts.show_branch_support = show_support
        self.tree.render(outfile, tree_style=ts)
        # if we labelled seqs, let's also write the alignment out so we have the sequences (including of internal nodes)
        if idlabel:
            aln = MultipleSeqAlignment([])
            for node in self.tree.traverse():
                aln.append(
                    SeqRecord(Seq(str(node.sequence), generic_dna),
                              id=str(node.name),
                              description='abundance={}'.format(
                                  node.frequency)))
            fasta_path = os.path.splitext(outfile)[0] + '.fasta'
            # close the handle deterministically instead of leaking it
            with open(fasta_path, 'w') as fasta_handle:
                AlignIO.write(aln, fasta_handle, 'fasta')

예제 #11

0

파일 보기

파일: gctree_tools.py 프로젝트: m-vieira/bcr-phylo-benchmark

    def __init__(self,
                 params=None,
                 tree=None,
                 frame=None,
                 collapse_syn=False,
                 allow_repeats=False):
        '''
        For initialization, either params or tree (or both) must be provided
        params: offspring distribution parameters
        tree: ete tree with frequency node feature. If uncollapsed, it will be collapsed
        frame: translation frame, with default None, no translation attempted
        collapse_syn: if True, every branch length of the supplied tree is
            overwritten (in place, before the tree is copied) with the amino
            acid hamming distance to the parent, so edges without amino acid
            changes get length zero and are collapsed below; requires a tree
        allow_repeats: if True, repeated observed sequences in the collapsed
            tree are reported with print instead of raising RuntimeError

        Raises RuntimeError for an invalid frame, for collapse_syn without a
        tree, when the observed genotypes change during collapse, or when
        repeated observed sequences are found and allow_repeats is False.
        '''
        LeavesAndClades.__init__(self, params=params)
        if frame is not None and frame not in (1, 2, 3):
            raise RuntimeError('frame must be 1, 2, 3, or None')
        self.frame = frame

        if collapse_syn is True:
            if tree is None:
                # previously failed with an AttributeError on None
                raise RuntimeError('collapse_syn requires a tree')
            tree.dist = 0  # no branch above root
            for node in tree.iter_descendants():
                # Largest window of whole codons starting at the frame offset;
                # its end is derived from the child's length for both sequences.
                start = frame - 1
                end = start + 3 * ((len(node.sequence) - start) // 3)
                aa = Seq(node.sequence[start:end], generic_dna).translate()
                aa_parent = Seq(node.up.sequence[start:end],
                                generic_dna).translate()
                node.dist = hamming_distance(aa, aa_parent)

        if tree is not None:
            self.tree = tree.copy()
            # remove unobserved internal unifurcations
            for node in self.tree.iter_descendants():
                parent = node.up
                if node.frequency == 0 and len(node.children) == 1:
                    node.delete(prevent_nondicotomic=False)
                    # the surviving child now hangs off parent; recompute its branch length
                    node.children[0].dist = hamming_distance(
                        node.children[0].sequence, parent.sequence)

            # iterate over the tree below root and collapse edges of zero length
            # if the node is a leaf and its parent has nonzero frequency we combine taxa names to a set
            # this accommodates bootstrap samples that result in repeated genotypes
            observed_genotypes = {leaf.name for leaf in self.tree}
            observed_genotypes.add(self.tree.name)
            for node in self.tree.get_descendants(strategy='postorder'):
                if node.dist == 0:
                    node.up.frequency += node.frequency
                    # names are either a single string or a collection of strings
                    node_set = set([node.name]) if isinstance(
                        node.name, str) else set(node.name)
                    node_up_set = set([node.up.name]) if isinstance(
                        node.up.name, str) else set(node.up.name)
                    if node_up_set < observed_genotypes:
                        if node_set < observed_genotypes:
                            node.up.name = tuple(node_set | node_up_set)
                            if len(node.up.name) == 1:
                                node.up.name = node.up.name[0]
                    elif node_set < observed_genotypes:
                        node.up.name = tuple(node_set)
                        if len(node.up.name) == 1:
                            node.up.name = node.up.name[0]
                    node.delete(prevent_nondicotomic=False)

            # every observed name must survive the collapse
            final_observed_genotypes = {
                name
                for node in self.tree.traverse()
                if node.frequency > 0 or node == self.tree
                for name in ((node.name, ) if isinstance(node.name, str
                                                         ) else node.name)
            }
            if final_observed_genotypes != observed_genotypes:
                raise RuntimeError(
                    'observed genotypes don\'t match after collapse\n\tbefore: {}\n\tafter: {}\n\tsymmetric diff: {}'
                    .format(observed_genotypes, final_observed_genotypes,
                            observed_genotypes ^ final_observed_genotypes))
            # collapsing must conserve the total abundance
            assert sum(node.frequency for node in tree.traverse()) == sum(
                node.frequency for node in self.tree.traverse())

            # number of observed nodes minus number of distinct observed sequences
            observed_sequences = [
                node.sequence for node in self.tree.traverse()
                if node.frequency > 0
            ]
            rep_seq = len(observed_sequences) - len(set(observed_sequences))
            if rep_seq:
                repeat_message = 'Repeated observed sequences in collapsed tree. {} sequences were found repeated.'.format(
                    rep_seq)
                if not allow_repeats:
                    raise RuntimeError(repeat_message)
                print(repeat_message)
            # a custom ladderize accounting for abundance and sequence to break ties in abundance
            for node in self.tree.traverse(strategy='postorder'):
                # add a partition feature and compute it recursively up the tree
                node.add_feature(
                    'partition',
                    node.frequency + sum(node2.partition
                                         for node2 in node.children))
                # sort children of this node based on partition and sequence
                node.children.sort(
                    key=lambda child: (child.partition, child.sequence))
        else:
            self.tree = tree