Python get_populationsの例、Rtree_to_covariance_matrix.get_populations Pythonの例

コード例 #1

0

ファイルを表示

ファイル: generate_sadmix_trees.py プロジェクト: Tmacme/AdmixtureBayes

def admixes_are_sadmixes_popz(tree):
    pops=get_populations(tree)
    #print pops
    for key,node in tree.items():
        if node_is_admixture(node):
            sadmix_bool=admix_is_sadmix(tree, (key,1), pops)
            if not sadmix_bool:
                return False
    return True

コード例 #2

0

ファイルを表示

ファイル: downstream_analysis_tool.py プロジェクト: Tmacme/AdmixtureBayes

 def __call__(self, Rtree=None, **kwargs):
     if Rtree is None:
         tree = kwargs['full_tree']
     else:
         tree = Rtree
     pops = get_populations(tree,
                            self.min_w,
                            keys_to_include=self.keys_to_include)
     return {'pops': '-'.join(pops)}, False

コード例 #3

0

ファイルを表示

ファイル: generate_sadmix_trees.py プロジェクト: Tmacme/AdmixtureBayes

def admix_is_sadmix_popz(tree, branch, reference_pop):
    ntree,f,b=deladmix(tree,fixed_remove=branch, preserve_root_distance=False)
    pops_n=get_populations(ntree)
    #print pops_n
    return reference_pop!=pops_n

コード例 #4

0

ファイルを表示

def tree_to_node_combinations(tree):
    return get_populations(tree)

コード例 #5

0

ファイルを表示

ファイル: admixture_trees_to_consensus_tree.py プロジェクト: Tmacme/AdmixtureBayes

def main(args):
    parser = ArgumentParser(
        usage='pipeline for plotting posterior distribution summaries.',
        version='1.0.0')

    parser.add_argument(
        '--posterior_distribution_file',
        required=True,
        type=str,
        help=
        'The file containing posterior distributions from the "AdmixtureBayes posterior" command. It needs the two columns "pops" and topology.'
    )
    parser.add_argument(
        '--plot',
        choices=['consensus_trees', 'top_node_trees', 'top_trees'],
        required=True,
        help='The type of plot to make. Choose between: 1) consensus_trees. '
        'It plots an admixture graph based on all nodes that have a higher (marginal) posterior probability of X. '
        'Different X\'s can be supplied with the command --consensus_threshold \n'
        '2) top_node_trees. It plots the X highest posterior combinations of node types '
        'and creates the corresponding minimal topologies.  X can be supplied through the command --top_node_trees_to_plot'
        '3) top_trees. It plots the X highest posterior topologies. X can be supplied by the command --top_trees_to_plot'
    )
    parser.add_argument('--outgroup',
                        default='outgroup',
                        help='name of the outgroup to plot')
    parser.add_argument(
        '--consensus_threshold',
        default=[0.25, 0.5, 0.75, 0.9, 0.95, 0.99],
        type=float,
        nargs='+',
        help=
        'The posterior thresholds for which to draw different consensus trees.'
    )
    parser.add_argument(
        '--top_node_trees_to_plot',
        type=int,
        default=3,
        help='The number of node trees (or minimal topologies) to plot')
    parser.add_argument('--top_trees_to_plot',
                        type=int,
                        default=3,
                        help='The number of trees (or topologies) to plot ')
    parser.add_argument(
        '--write_ranking_to_file',
        type=str,
        default='',
        help=
        'if a file is supplied here, the natural rankings for each of the plots is written here.'
    )
    parser.add_argument(
        '--rankings_to_write_to_file',
        type=int,
        default=1000,
        help=
        'the number of rankings(nodes, min topology or topology depending on --plot) to write to the ranking file.'
    )
    parser.add_argument(
        '--dont_annotate_node_posterior',
        default=False,
        action='store_true',
        help=
        'This will not color the nodes according to their posterior probability.'
    )
    parser.add_argument('--nodes',
                        default='',
                        type=str,
                        help='file where the first line is the leaf nodes')
    parser.add_argument('--suppress_plot', default=False, action='store_true')
    parser.add_argument(
        '--no_sort',
        default=False,
        action='store_true',
        help=
        'often the tree is sorted according to the leaf names. no_sort willl assumed that they are not sorted according to this but sorted according to '
    )
    parser.add_argument('--sep',
                        default=',',
                        type=str,
                        help='the separator used in the input file')

    #parser.add_argument('--no_header', default=False, action='store_true',help='will assume that there is no header in the file')
    #parser.add_argument('--burn_in_rows', default=0, type=int, help='the number of rows that will be skipped in the input file as burn-in period')
    #parser.add_argument('--burn_in_fraction', default=0.0, type=float, help='the proportion of the rows that are discarded as burn in period')
    #parser.add_argument('--tree_column_name', default='tree', type=str, help='the name in the header of the column with all the trees.')
    parser.add_argument(
        '--consensus_method',
        choices=['descendant_frequencies'],
        default='descendant_frequencies',
        help='Which method should be used to calculate the consensus tree?')
    #parser.add_argument('--min_w', default=0.0, type=float, help='a lower threshold of which descendants matter when the consensus_method is descendant_frequencies.')

    #parser.add_argument('--plot_tops_file', action='store_true', default=False, help='this will assume that the file is a tops file from downstream_analysis_parser and plot each line numbered.')

    #parser.add_argument('--get_effective_number_of_admixtures', action='store_true', default=False, help='this will cancel all the other analysis and only print the effective number of admixes(tadmixes/sadmixes or admixes) to a a file.')
    #parser.add_argument('--effective_number_of_admixtures_file', type=str, default='no_tadmixes.txt', help='this is the file in which to write the effective number of admixes in the file')
    #parser.add_argument('--type_of_effective_admixtures', type=str, choices=['sadmix','tadmix','admix'], help='this is the type of admixes to write to the file.')

    #parser.add_argument('--node_count_file', default='', type=str, help='if plot_tops option is supplied')
    #parser.add_argument('--node_count_probs', default='', type=str, help='if supplied this will make a new ')
    #parser.add_argument('--test_run', default=False, action='store_true',
    #                    help='will overwrite everything and run a test function')

    options = parser.parse_args(args)

    def combine_nodes(node_structure, new_node, seen_sets):
        candidate = new_node.name
        seen = []
        for lists_of_fixed_size in seen_sets[::-1]:
            for attached_branch in lists_of_fixed_size:
                if (attached_branch.issubset(candidate) and
                    ((not attached_branch.issubset(seen)) or
                     (not node_structure[attached_branch].has_parent()))):
                    seen.extend(list(attached_branch))
                    new_node.add_child(node_structure[attached_branch])
                    node_structure[attached_branch].add_parent(new_node)
        return node_structure

    def get_number_of_tadmixtures(node_structure):
        total = 0
        for key in node_structure:
            total += max(0, node_structure[key].get_number_of_parents() - 1)
        return total

    def node_combinations_to_node_structure(node_combinations):
        length_sorted = {}
        for node_combination in node_combinations:
            leaves = frozenset(node_combination.split('.'))
            k = len(leaves)
            if k in length_sorted:
                length_sorted[k].append(leaves)
            else:
                length_sorted[k] = [leaves]
        length_sorted_list = [
            length_sorted.get(k, [])
            for k in range(1,
                           max(length_sorted.keys()) + 1)
        ]
        #length_sorted_list is of the form [[[A],[B],[C]],[[A,B],[B,C]],...,[[A,B,C]]]
        node_structure = {}
        for leaf_node in length_sorted_list[0]:
            node_structure[leaf_node] = Node(leaf_node)
        added_sets = [length_sorted_list[0]]
        for lists_of_fixed_size in length_sorted_list[1:]:
            for branch_set in lists_of_fixed_size:
                new_node = Node(branch_set)
                combine_nodes(node_structure, new_node, added_sets)
                node_structure[branch_set] = new_node
            added_sets.append(lists_of_fixed_size)
        return node_structure

    # if options.node_count_file:
    #     with open(options.node_count_file, 'r') as f:
    #         node_count_dic={}
    #         for lin in f.readlines():
    #             key,freq=lin.rstrip().split()
    #             node_count_dic[frozenset(key.split('.'))]=float(freq)
    # else:
    #     node_count_dic=None

    if options.plot == 'consensus_trees' or options.plot == 'top_node_trees':
        df = pd.read_csv(options.posterior_distribution_file,
                         sep=options.sep,
                         usecols=['pops'])
        nodes_list = df['pops'].tolist()
        #print(nodes_list)
        seen_combinations = {}
        for nodes in nodes_list:
            #print(nodes)
            for node in nodes.split('-'):
                #print(node)
                seen_combinations[node] = seen_combinations.get(node, 0) + 1
        N = len(nodes_list)
        #print(seen_combinations)
        if options.plot == 'consensus_trees':
            node_combinations = []
            for threshold in options.consensus_threshold:
                total_threshold = int(N * threshold)
                final_node_combinations = [
                    k for k, v in seen_combinations.items()
                    if v > total_threshold
                ]
                node_combinations.append(final_node_combinations)
            if not options.dont_annotate_node_posterior:
                node_count_dic = {
                    frozenset(k.split('.')): float(v) / N
                    for k, v in seen_combinations.items()
                }
            else:
                node_count_dic = None
            for i, final_node_combinations in enumerate(node_combinations):
                #print(final_node_combinations)
                final_node_structure = node_combinations_to_node_structure(
                    final_node_combinations)
                if not options.suppress_plot:
                    from tree_plotting import plot_node_structure_as_directed_graph
                    plot_node_structure_as_directed_graph(
                        final_node_structure,
                        drawing_name='consensus_' +
                        str(int(100 * options.consensus_threshold[i])) +
                        '.png',
                        node_dic=node_count_dic)
            if options.write_ranking_to_file:
                with open(options.write_ranking_to_file, 'w') as f:
                    c = Counter(seen_combinations)
                    to_write = c.most_common(options.rankings_to_write_to_file)
                    for node, frequency in to_write:
                        f.write(node + ',' + str(float(frequency) / N) + '\n')
        elif options.plot == 'top_node_trees':
            c = Counter(nodes_list)
            to_plots = c.most_common(options.top_node_trees_to_plot)
            if options.write_ranking_to_file:
                with open(options.write_ranking_to_file, 'w') as f:
                    for tree, frequency in c.most_common(
                            options.rankings_to_write_to_file):
                        f.write(tree + ',' + str(float(frequency) / N) + '\n')
            if not options.dont_annotate_node_posterior:
                c = Counter(seen_combinations)
                node_count_dic = {
                    frozenset(key.split('.')): float(count) / N
                    for key, count in c.most_common(1000)
                }
            else:
                node_count_dic = None
            if not options.suppress_plot:
                from tree_plotting import plot_node_structure_as_directed_graph
                for i, (to_plot, count) in enumerate(to_plots):
                    node_structure = node_combinations_to_node_structure(
                        to_plot.split('-'))
                    plot_node_structure_as_directed_graph(
                        node_structure,
                        drawing_name='minimal_topology_' + str(i + 1) + '.png',
                        node_dic=node_count_dic)
    elif options.plot == 'top_trees':
        df = pd.read_csv(options.posterior_distribution_file,
                         sep=options.sep,
                         usecols=['pops', 'topology'])
        trees_list = df['topology'].tolist()
        no_leaves = len(trees_list[0].split('-')[0].split('.'))
        N = len(trees_list)
        c = Counter(trees_list)
        to_plots = c.most_common(options.top_trees_to_plot)

        #obtaining nodes:
        if not options.nodes:
            nodes = df['pops'].tolist()[0].split('-')
            leaves = list(
                set([leaf for node in nodes for leaf in node.split('.')]))
            if len(leaves) == no_leaves:
                pass  #everything is good
            elif len(leaves) == no_leaves - 1:
                #adding outgroup
                leaves.append(options.outgroup)
            else:
                assert False, 'The number of leaves could not be obtained'
            assert not options.no_sort, 'When nodes are not specified, they will always be sorted'
            leaves = sorted(leaves)
        else:
            leaves = read_one_line(options.nodes)
            if not options.no_sort:
                leaves = sorted(leaves)

        if options.write_ranking_to_file:
            with open(options.write_ranking_to_file, 'w') as f:
                for tree, frequency in c.most_common(
                        options.rankings_to_write_to_file):
                    f.write(tree + ',' + str(float(frequency) / N) + '\n')

        if not options.suppress_plot:
            from tree_plotting import plot_as_directed_graph
            for i, (to_plot, count) in enumerate(to_plots):
                tree = topological_identifier_to_tree_clean(
                    to_plot,
                    leaves=generate_predefined_list_string(deepcopy(leaves)))
                plot_as_directed_graph(tree,
                                       drawing_name='topology_' + str(i + 1) +
                                       '.png')
    sys.exit()

    if options.plot_tops_file:
        with open(options.input_file, 'r') as f:
            for n, lin in enumerate(f.readlines()):
                rank, probability, combination = lin.rstrip().split(',')
                all_nodes = [c.split('.') for c in combination.split('_')]
                flattened = [item for sublist in all_nodes for item in sublist]
                a = list(set(flattened))
                code = rank + '_' + str(int(
                    100 * round(float(probability), 2))) + '_' + '_'.join(a)
                print 'code', code
                node_structure = node_combinations_to_node_structure(
                    combination.split('_'))

                print node_structure
                plot_node_structure_as_directed_graph(node_structure,
                                                      drawing_name=code +
                                                      '.png',
                                                      node_dic=node_count_dic)
        sys.exit()

    if options.test_run:
        from generate_prior_trees import generate_phylogeny
        from tree_statistics import unique_identifier_and_branch_lengths
        from tree_plotting import plot_node_structure_as_directed_graph, plot_as_directed_graph
        N = 5
        tree1 = generate_phylogeny(N, 1)
        plot_as_directed_graph(tree1, drawing_name='tree1.png')
        tree2 = generate_phylogeny(N, 1)
        plot_as_directed_graph(tree2, drawing_name='tree2.png')
        stree1 = unique_identifier_and_branch_lengths(tree1)
        stree2 = unique_identifier_and_branch_lengths(tree2)
        with open('tmp_tree.txt', 'w') as f:
            f.write(' '.join(['s' + str(i) for i in range(1, N + 1)]) + '\n')
            f.write(stree1)
        with open('trees.txt', 'w') as f:
            f.write(stree1 + '\n' + stree2 + '\n' + stree1)

        options.input_file = 'trees.txt'
        options.nodes = 'tmp_tree.txt'
        options.no_header = True
        options.posterior_threshold = [0.25, 0.5, 0.9]

    if options.input_file == options.node_count_file:
        node_combinations = []
        print 'using population sets from ', options.node_count_file
        for threshold in options.posterior_threshold:
            final_node_combinations = [
                '.'.join(sorted(list(k))) for k, v in node_count_dic.items()
                if v > threshold
            ]
            node_combinations.append(final_node_combinations)
    else:
        print 'Reading file...'
        #loading trees
        if options.no_header:
            strees = []
            with open(options.input_file, 'r') as f:
                for lin in f.readlines():
                    strees.append(lin.rstrip())
        else:
            df = pd.read_csv(options.input_file,
                             sep=options.sep,
                             usecols=[options.tree_column_name])
            strees = df[options.tree_column_name].tolist()
        n = len(strees)
        print 'trees read: ', n

        #thinning tree list

        rows_to_remove_from_fraction = int(options.burn_in_fraction * n)
        rows_to_remove = max(rows_to_remove_from_fraction,
                             options.burn_in_rows)
        strees = strees[rows_to_remove:]

        print 'removed burn-in:', rows_to_remove
        print 'In list are now', len(strees), 'trees'

        #thinning

        distance_between = max(1, len(strees) // options.max_number_of_trees)
        nstrees = []
        for a, stree in enumerate(strees):
            if a % distance_between == 0 and len(
                    nstrees) < options.max_number_of_trees:
                nstrees.append(stree)
        print 'thinned'
        print 'In list are now', len(nstrees), 'trees'

        N = len(nstrees)

        seen_node_combinations = {}

        nodes = read_one_line(options.nodes)
        if not options.no_sort:
            nodes = sorted(nodes)

        tenth = len(nstrees) // 10
        trees = []
        for i, stree in enumerate(nstrees):
            if tenth > 0 and i % tenth == 0:
                print i // tenth * 10, '%'
            if ';' in stree:
                tree = identifier_to_tree_clean(
                    stree,
                    leaves=generate_predefined_list_string(deepcopy(nodes)))
            else:
                tree = topological_identifier_to_tree_clean(
                    stree,
                    leaves=generate_predefined_list_string(deepcopy(nodes)))
            trees.append(tree)
            ad = get_populations(tree, min_w=options.min_w)
            for a in ad:
                seen_node_combinations[a] = seen_node_combinations.get(a,
                                                                       0) + 1
        node_combinations = []
        for threshold in options.posterior_threshold:
            total_threshold = int(N * threshold)
            final_node_combinations = [
                k for k, v in seen_node_combinations.items()
                if v > total_threshold
            ]
            node_combinations.append(final_node_combinations)

    for i, final_node_combinations in enumerate(node_combinations):
        print 'final_node_combinations', final_node_combinations
        final_node_structure = node_combinations_to_node_structure(
            final_node_combinations)
        if options.get_effective_number_of_admixtures:
            with open(options.effective_number_of_admixtures_file, 'w') as f:
                if options.type_of_effective_admixtures == 'tadmix':
                    effictive_admixtures = get_number_of_tadmixtures(
                        final_node_structure)
                    f.write(str(effictive_admixtures))
                elif options.type_of_effective_admixtures == 'sadmix':
                    val = 0
                    count = 0
                    for tree in trees:
                        val += effective_number_of_admixes(tree)
                        count += 1
                    if count == 1:
                        f.write(str(int(val)))
                    else:
                        f.write(str(float(val) / count))
                elif options.type_of_effective_admixtures == 'admix':
                    val = 0
                    count = 0
                    for tree in trees:
                        val += get_number_of_admixes(tree)
                        count += 1
                    if count == 1:
                        f.write(str(int(val)))
                    else:
                        f.write(str(float(val) / count))
        if not options.suppress_plot:
            from tree_plotting import plot_node_structure_as_directed_graph, plot_as_directed_graph
            plot_node_structure_as_directed_graph(final_node_structure,
                                                  drawing_name='tmp' +
                                                  str(i + 1) + '.png',
                                                  node_dic=node_count_dic)

コード例 #6

0

ファイルを表示

ファイル: downstream_analysis_tool.py プロジェクト: Tmacme/AdmixtureBayes

 def __init__(self, true_tree, min_w=0.0, keys_to_include=None):
     self.true_pops = set(get_populations(true_tree, min_w,
                                          keys_to_include))