Example #1
from ete3 import Tree

def makeUltra(treeFile, outgroup):
    """Read one newick tree per line and make each tree ultrametric."""
    print("loading trees...")
    treelist = []
    with open(treeFile, 'r') as newick:
        for line in newick:
            if not line.startswith("NA"):  # skip placeholder lines
                t = Tree(line)
                if outgroup:
                    t.set_outgroup(outgroup)  # root on the supplied outgroup taxon
                t.convert_to_ultrametric()
                treelist.append(t)
    return treelist
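A minimal usage sketch (the file name and outgroup label are hypothetical; makeUltra expects one newick string per line, with lines starting "NA" skipped):

# Hypothetical input: trees.nwk holds one newick tree per line.
trees = makeUltra("trees.nwk", outgroup="Outgroup1")
print("%d ultrametric trees loaded" % len(trees))
# After convert_to_ultrametric(), all leaves are equidistant from the root:
depths = {round(leaf.get_distance(trees[0]), 6) for leaf in trees[0].iter_leaves()}
assert len(depths) == 1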
Example #2
from ete3 import Tree

def get_phyparts_nodes(sptree_fn, phyparts_root):
    # Load the species tree and force it ultrametric for display
    sptree = Tree(sptree_fn)
    sptree.convert_to_ultrametric()

    # Each line of the phyparts .node.key file maps a node ID to a subtree newick
    with open(phyparts_root + ".node.key") as key_file:
        phyparts_node_key = key_file.readlines()
    subtrees_dict = {
        n.split()[0]: Tree(n.split()[1] + ";")
        for n in phyparts_node_key
    }
    subtrees_topids = {}
    for x in subtrees_dict:
        subtrees_topids[x] = subtrees_dict[x].get_topology_id()

    for node in sptree.traverse():
        node_topid = node.get_topology_id()
        for subtree in subtrees_dict:
            if node_topid == subtrees_topids[subtree]:
                node.name = subtree
    return sptree, subtrees_dict, subtrees_topids
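The matching step relies on ete3's get_topology_id(), which returns the same identifier for two (sub)trees with identical leaf sets and topology, regardless of branch lengths or child order. A quick check of that assumption:

from ete3 import Tree

a = Tree("((A:1,B:2):1,C:3);")
b = Tree("((B:9,A:8):7,C:6);")  # same topology, different lengths and child order
assert a.get_topology_id() == b.get_topology_id()
assert a.get_topology_id() != Tree("((A:1,C:1):1,B:1);").get_topology_id()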
Example #3
from ete3 import Tree

def get_phyparts_nodes(sptree_fn, phyparts_root):
    sptree = Tree(sptree_fn)
    sptree.convert_to_ultrametric()

    phyparts_node_key = [line for line in open(phyparts_root + ".node.key")]
    subtrees_dict = {n.split()[0]: Tree(n.split()[1] + ";") for n in phyparts_node_key}
    subtrees_topids = {}
    for x in subtrees_dict:
        subtrees_topids[x] = subtrees_dict[x].get_topology_id()

    for node in sptree.traverse():
        node_topid = node.get_topology_id()
        if "Takakia_4343a" in node.get_leaf_names():  # debug output for one taxon of interest
            print(node_topid)
            print(node)
        for subtree in subtrees_dict:
            if node_topid == subtrees_topids[subtree]:
                node.name = subtree
    return sptree, subtrees_dict, subtrees_topids
Example #4
from ete3 import Tree, TextFace

def cluster_hierarchical(output_path, matrix, species_names, cluster_alg, cluster_alg_label, config, phonemes_encoding_tree=False):
    print(f" - Creating tree using {cluster_alg_label}, saving to .nw and .pdf")
    # Turn off distances if this is a phonemes-encoding tree
    newick_string = cluster_alg(matrix, species_names, distances=(not phonemes_encoding_tree))
    # Load newick string into ete3 Tree object
    tree = Tree(newick_string)
    if phonemes_encoding_tree:
        tree.convert_to_ultrametric()
    for node in tree.traverse():
        node.set_style(config["ete_node_style"])
        if phonemes_encoding_tree:
            node.img_style["size"] = 0
        if node.is_leaf():
            # Add bit of extra space between leaf branch and leaf label
            name_face = TextFace(f" {node.name}", fgcolor="black", ftype="Charis SIL Compact", fsize=14)
            node.add_face(name_face, column=0, position='branch-right')
            
    # Output to pdf and nw
    filename_base = output_path + "".join([w[0] for w in cluster_alg_label.split()])
    tree.render(f"{filename_base}.pdf", tree_style=config["ete_tree_style"])
    tree.write(format=0, outfile=f"{filename_base}.nw")
    return tree
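cluster_hierarchical leaves all styling to its caller. A minimal sketch of the config dict it reads (only the two keys used above; the values are ordinary ete3 style objects):

from ete3 import NodeStyle, TreeStyle

# Hypothetical config for cluster_hierarchical(): only the keys the function reads.
node_style = NodeStyle()
node_style["hz_line_width"] = 1
tree_style = TreeStyle()
tree_style.show_leaf_name = False  # leaf labels are added manually as TextFaces above
config = {"ete_node_style": node_style, "ete_tree_style": tree_style}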
Example #5
def main(*args):
    start = time.time()
    hemiplasytool.print_banner()

    parser = argparse.ArgumentParser(
        description="Tool for characterising hemiplasy given traits mapped onto a species tree"
    )
    parser.add_argument(
        "-v",
        "--verbose",
        help="Enable debugging messages to be displayed",
        action="store_true",
    )
    parser.add_argument(
        "input",
        metavar="input",
        help="Input NEXUS file",
    )
    parser.add_argument(
        "-n",
        "--replicates",
        metavar="",
        help="Number of replicates per batch",
        default=1000000,
    )
    parser.add_argument(
        "-t", "--threads", metavar="", help="Number of threads for simulations", default=16
    )
    parser.add_argument(
        "-p", "--mspath", metavar="", help="Path to ms (if not in user path)", default="ms"
    )
    parser.add_argument(
        "-g", "--seqgenpath", metavar="", help="Path to seq-gen (if not in user path)", default="seq-gen"
    )
    parser.add_argument(
        "-s",
        "--mutationrate",
        metavar="",
        help="Seq-gen mutation rate (default 0.05)",
        default=0.05,
    )

    parser.add_argument(
        "-c", "--CI", metavar="", help="Optionally simulate at the upper ('upper') or lower ('lower') bounds of the 95 %% CI for the coalescent conversion regression.", default=None
    )
    parser.add_argument("-o", "--outputdir", metavar="", help="Output directory/prefix")

    args = parser.parse_args()

    # Setup ###################
    log.basicConfig(level=log.DEBUG)
    logger = log.getLogger()
    logger.disabled = not args.verbose
    mpl_logger = log.getLogger("matplotlib")
    mpl_logger.setLevel(log.WARNING)
    ##########################
    

    # Read input file
    log.debug("Reading input file...")
    treeSp, derived, admix, outgroup, type, tree2, conversion_type = hemiplasytool.readInput(args.input)
    t = Tree(treeSp, format=1)
    t.convert_to_ultrametric()



    if type != 'coal':
        # Convert ML tree to a coalescent tree based on GCFs
        treeSp, t, treeSp_low, t_low, treeSp_up, t_up, intercept, coef, newick_internals, coal_internals = hemiplasytool.subs2coal(treeSp)
        original_tree = [treeSp, t]
    else:
        original_tree = [treeSp, Tree(treeSp, format=1)]

    sim_type = args.CI

    if sim_type is not None:
        if sim_type == 'upper':
            treeSp = treeSp_up
            t = t_up
        elif sim_type == 'lower':
            treeSp = treeSp_low
            t = t_low

    # Tree pruning
    if outgroup is not None:
        log.debug("Pruning tree...")
        treeSp, t = hemiplasytool.prune_tree(treeSp, derived, outgroup)
        tree2, t2 = hemiplasytool.prune_tree(tree2, derived, outgroup)


    taxalist = [i.name for i in t.iter_leaves()]

    # Convert taxon names to the integer labels ms expects
    treeSp, conversions = hemiplasytool.names2ints(treeSp, conversion_type, type)
    original_tree[0], tmp = hemiplasytool.names2ints(original_tree[0], conversion_type, type)

    # Convert newick tree to ms splits
    splits, taxa = hemiplasytool.newick2ms(treeSp)
    traits = {}
    for i in taxalist:
        if i in derived:
            traits[conversions[i]] = 1
        else:
            traits[conversions[i]] = 0


    #Generate tree in ete3 with internal branches labeled based on user input
    #plus how ms interprets them. e.g., I4(3). This way I can easily specify the
    #events to ms.
    if len(admix) != 0:
        tree2_ete, tree2_newick, node_conversions = hemiplasytool.make_introgression_tree(tree2, conversions)

    
        #Update conversion dictionary to contain node conversions (e.g. I4 -> 2)
        conversions = {**conversions, **node_conversions}

        #Perform conversions on admix list
        events = []

        #Parse admix list, divide times by 2
        for e in admix:
            events.append([str(float(e[0])/2.0), str(conversions[e[1]]), str(conversions[e[2]]), e[3]])
        admix = events

        #Sort admix list earliest to latest (not sure if ms requires this or not)

        admix.sort(key = lambda x: float(x[0]), reverse=True)



    # Make program calls
    threads = int(args.threads)
    reps = int(args.replicates)

    breaks = []  # TODO: currently unused

    # Begin batches
    taxalist = [int(s) for s in traits.keys()]

    inherited = []

    results = {}
    n_mutations_d = []
    n_mutations_c = []

    all_focal_trees = []
    counts_by_tree = []

    
    events = []
    for e in admix:
        events.append([e[0], str(conversions[e[1]]), str(conversions[e[2]]), e[3]])
    admix = events
    total_reps_for_intro = 0
    if len(admix) != 0:
        for e in admix:
            total_reps_for_intro += int(reps * float(e[3]))
    remaining_reps = reps - total_reps_for_intro


    # Split replicates across threads; any remainder gets one extra thread
    per_thread = [remaining_reps // threads] * threads
    if remaining_reps % threads != 0:
        threads += 1
        per_thread.append(remaining_reps % (threads - 1))


    if len(admix) != 0:
        #Extra thread for introgression
        threads += 1

    prefix = args.outputdir

    processes_ms = []
    processes_sq = []
    intro_indices = []
    if len(admix) == 0:
        for y in range(0, threads):
            ms_call = hemiplasytool.splits_to_ms(splits, taxa, per_thread[y], args.mspath, y, prefix)
            m = hemiplasytool.call_programs(ms_call, "", "trees.tmp", taxalist)
            processes_ms.append(m)
    else:
        for y in range(0, threads):
            if y != threads - 1:
                ms_call = hemiplasytool.splits_to_ms(splits, taxa, per_thread[y], args.mspath, y, prefix)
                m = hemiplasytool.call_programs(ms_call, "", "trees.tmp", taxalist)
                processes_ms.append(m)
            else:
                # the final thread runs the introgression replicates, one ms call per event
                for m, event in enumerate(admix):
                    o = str(y) + "_" + str(m)
                    intro_indices.append(m)
                    ms_call = hemiplasytool.splits_to_ms(splits, taxa,
                        int(reps * float(event[3])), args.mspath, o, prefix, event)
                    m = hemiplasytool.call_programs(ms_call, "", "trees.tmp", taxalist)
                    processes_ms.append(m)

    # Busy-wait until every ms process has exited (poll() is None while running)
    done = False
    while not done:
        done = all(p.poll() is not None for p in processes_ms)



    string_cat_ms = "cat "
    for y in range(0, threads):
        if y != threads-1:
            string_cat_ms += prefix + ".trees" + str(y) + ".tmp "
        elif y == threads-1:
            if (len(admix) != 0):
                for intro in intro_indices:
                    string_cat_ms += prefix + ".trees" + str(y) + "_" + str(intro) + ".tmp "
            else:
                string_cat_ms += prefix + ".trees" + str(y) + ".tmp "
    string_cat_ms += "> " + prefix + ".trees.tmp"
    os.system(string_cat_ms)

    for y in range(0, threads):
        if y != threads-1:
            seqgencall = hemiplasytool.seq_gen_call(prefix + ".trees" + str(y) + ".tmp", args.seqgenpath, args.mutationrate, str(y), prefix)
            s = hemiplasytool.call_programs_sg(ms_call, seqgencall, "trees.tmp", taxalist)
            processes_sq.append(s)
        else:
            if (len(admix) != 0):
                for z, intro in enumerate(intro_indices):
                    seqgencall = hemiplasytool.seq_gen_call(prefix + ".trees" + str(y) + "_" + str(intro) + ".tmp", args.seqgenpath, args.mutationrate, str(y), prefix, z)
                    s = hemiplasytool.call_programs_sg(ms_call, seqgencall, "trees.tmp", taxalist)
                    processes_sq.append(s)
            else:
                seqgencall = hemiplasytool.seq_gen_call(prefix + ".trees" + str(y) + ".tmp", args.seqgenpath, args.mutationrate, str(y), prefix)
                s = hemiplasytool.call_programs_sg(ms_call, seqgencall, "trees.tmp", taxalist)
                processes_sq.append(s)


    intro_start = sum(per_thread)

    # Busy-wait until every seq-gen process has exited
    done = False
    while not done:
        done = all(p.poll() is not None for p in processes_sq)

    string_cat = "cat "
    for y in range(0, threads):
        if y != threads-1:
            string_cat += prefix + ".seqs" + str(y) + ".tmp "
        elif y == threads-1:
            if (len(admix) != 0):
                for z, intro in enumerate(intro_indices):
                    string_cat += prefix + ".seqs" + str(y) + "_" + str(z) + ".tmp "
            else:
                string_cat += prefix + ".seqs" + str(y) + ".tmp "

    string_cat += "> " + prefix + ".seqs.tmp"
    os.system(string_cat)


    # Get indices of trees with site patterns that match the species trait pattern
    # (NOTE: `i` below is left over from the traits loop above)
    log.debug("Finding trees that match species trait pattern...")
    match_species_pattern, counts_by_tree = seqtools.readSeqs(
        prefix + ".seqs.tmp", len(taxalist), traits, len(splits), i, prefix, intro_start
    )


    log.debug("Getting focal trees...")
    # Gets the trees at these indices 
    focal_trees, _ = seqtools.getTrees(prefix + ".trees.tmp", match_species_pattern)
    all_focal_trees = focal_trees
    assert len(match_species_pattern) == len(focal_trees)
    log.debug("Calculating discordance...")
    results[i], disc, conc = seqtools.propDiscordant(focal_trees, treeSp)

    ## The error we're getting w.r.t. introgression not being accurately reflected in mutation
    ## counting happens around here. Either count_mutations is not working on introgressed trees,
    ## or we are passing a list of trees that doesn't contain the introgression trees.


    focaltrees_d = seqtools.parse_seqgen(prefix + ".focaltrees.tmp", len(taxalist), disc)
    focaltrees_c = seqtools.parse_seqgen(prefix + ".focaltrees.tmp", len(taxalist), conc)
    
    for index, tree in enumerate(focaltrees_d):
        n_mutations_d.append(seqtools.count_mutations(tree, len(taxalist)))
    for index, tree in enumerate(focaltrees_c):
        n_mutations_c.append(seqtools.count_mutations(tree, len(taxalist)))
    nderived = sum(1 for trait in traits.values() if trait == 1)
    interesting = seqtools.get_interesting(
        focaltrees_d, nderived, len(traits.keys())
    )
    for item in interesting:
        test_summarize = seqtools.summarize_interesting(item, len(traits.keys()))
        inherited = inherited + test_summarize

    # Clean up temporary files
    #os.system("rm *.tmp")
    ###################################################################

    # Begin summary of all batches
    mutation_counts_d = [[x, n_mutations_d.count(x)] for x in set(n_mutations_d)]
    mutation_counts_c = [[x, n_mutations_c.count(x)] for x in set(n_mutations_c)]
    summary = hemiplasytool.summarize(results)
    #counts_by_tree = seqtools.sum_counts_by_tree(counts_by_tree)
    if len(inherited) > 0:
        mutation_pat = hemiplasytool.summarize_inherited(inherited)
    else:
        mutation_pat = None
        log.debug(
            "Not enough 'interesting' cases to provide mutation inheritance patterns"
        )
    min_mutations_required = hemiplasytool.fitchs_alg(str(treeSp), traits)

    if type == "coal":
        intercept, coef, newick_internals, coal_internals = [None]*4


    log.debug("Writing output file...")
    hemiplasytool.write_output(
        summary,
        mutation_counts_c,
        mutation_counts_d,
        mutation_pat,
        counts_by_tree,
        str(treeSp),
        admix,
        traits,
        min_mutations_required,
        args.outputdir,
        (reps),
        conversions,
        original_tree[0],
        intercept,
        coef,
        newick_internals,
        coal_internals,
        args.mutationrate
    )
    hemiplasytool.write_unique_trees(all_focal_trees, args.outputdir, traits)
    end = time.time()
    print("\nTime elapsed: " + str(end - start) + " seconds")
Example #6
def main(argv, wayout):
    if not len(argv):
        argv.append('-h')
    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    parser.add_argument(
        "-p",
        "--predictor",
        type=str,
        help=
        "predictor values: multiple sequence alignment or trait table filename",
        required=True)
    parser.add_argument(
        "-f",
        "--format",
        default="fasta",
        help="predictor file format [fasta]; pass 'table' for quantitative mode"
    )
    parser.add_argument('-r',
                        '--response',
                        type=str,
                        help="response values: trait table filename",
                        required=True)
    parser.add_argument("-t",
                        "--tree",
                        type=str,
                        help="tree filename (Newick format)",
                        required=True)
    parser.add_argument("-l",
                        "--lamb-pagel",
                        type=float,
                        default=1,
                        help="Pagel's Lambda [1]"
                        )  # dammit Pagel, "lambda" is a reserved word!
    parser.add_argument("-s",
                        "--sub_weight",
                        type=float,
                        default=1,
                        help="Substitution rate weight scalar [1]")
    parser.add_argument(
        "-sm",
        "--sub_matrix",
        type=str,
        default="BLOSUM62",
        help="Pass in a custom substitution rate matrix [BLOSUM62]")
    parser.add_argument(
        "-k",
        "--key_seq",
        type=str,
        help="Name of key sequence on which to index the output columns [None]"
    )
    parser.add_argument("--cpu",
                        type=int,
                        default=cpu_count(),
                        help="Thread count (max # CPUs to use) [{}]".format(
                            cpu_count()))
    parser.add_argument(
        "-b",
        "--bell_curves",
        type=float,
        default=0,
        help="p-value cutoff below which t-PDF bell curves will be drawn [0]")
    parser.add_argument(
        "-m",
        "--manhattan",
        action="store_true",
        help="Save Manhattan plots with default thresholds [0.05,0.01,0.001]")
    parser.add_argument("-mt",
                        "--manhattan_thresholds",
                        type=tuple,
                        default=(0.10, 0.05),
                        help="List of custom thresholds for Manhattan plots")

    global args
    args = parser.parse_args(argv)

    # set mode switch
    quanPredictor = (args.format == "table")

    # Load the continuous response values into a DataFrame in order
    responseTable = makeNiceTraitTable(args.response)

    # tell user what just happened
    print >> sys.stderr, "# MESSAGE: Loaded {} dependent variables for {} taxa".format(
        responseTable.shape[1], responseTable.shape[0])

    # Load the tree
    tree = Tree(args.tree)
    tree.convert_to_ultrametric(
        tree_length=1)  # IMPORTANT: normalize root-to-leaf distance to 1
    # TODO: look at ways to calculate C from an additive tree. Brownian model may be compromised.

    # Transform branch lengths into a variance-covariance matrix
    # Use the taxa and order provided in responseTable
    # IMPORTANT: orderedLeaves dictates the order of taxa in all matrices and vectors. In this case it should theoretically = responseTable.index
    # IFF they both contain all the same taxa
    phyCovMatrix, orderedLeaves = tree2covMatrix(
        tree, taxa=responseTable.index,
        pagel=args.lamb_pagel)  # This stays the same across all columns

    # remake the response table using only taxa that are in the tree
    responseTable = makeNiceTraitTable(args.response, orderedLeaves)
    # and then convert that dataframe to a list of tuples for faster iteration, omitting the index
    responseList = np.array(
        [row for row in responseTable.itertuples(index=False, name=None)])

    # tell user what just happened
    print >> sys.stderr, "# MESSAGE: Generated phylogenetic covariance matrix for {} taxa".format(
        len(orderedLeaves))

    # make sure there are dirs ready to receive the requested plots
    if args.bell_curves > 0:
        bellDir = "tdists/"
        try:
            os.mkdir(bellDir)
        except OSError:  # directory already exists
            pass

    if args.manhattan:
        manhattanDir = "manhattan/"
        try:
            os.mkdir(manhattanDir)
        except OSError:  # directory already exists
            pass

    if quanPredictor:  # if in quantitative-predictor mode

        # Load the continuous predictor values into a DataFrame and sort it to match the order of the cov matrix
        # this drops any taxa in the table that are missing from the tree
        predictorTable = makeNiceTraitTable(args.predictor, orderedLeaves)
        # and then convert the dataframe to a list of columns for faster iteration, omitting the index
        predictorList = np.array([
            col for col in predictorTable.T.itertuples(index=False, name=None)
        ])

        # Find predictors that are not in tree or response table, issue warnings
        missingTaxa = set()
        for pTaxon in predictorTable.index:
            if pTaxon not in orderedLeaves:
                #print >> sys.stderr, "# WARNING: {} is missing from the tree and will be dropped from analysis".format(pTaxon) # this is already handled in tree2covMatrix()
                missingTaxa.add(pTaxon)
            if pTaxon not in responseTable.index:
                print >> sys.stderr, "# WARNING: {} is missing from the response table and will be dropped from analysis".format(
                    pTaxon)
                missingTaxa.add(pTaxon)

        def PGLSUnpack(pargs):
            testsResults = list()
            # for each dependent variable
            for depIndex, dependent in enumerate(pargs[0].T):
                # recreate the argument list using only that dependent variable
                dargs = [dependent] + pargs[1:]
                # regress against that variable
                results = pgls(*dargs).fit()
                # in present form I only accept one predictor, so the F-pval and AIC are "all that matter"
                testsResults.append((results.f_pvalue, results.aic))

            pbar.update(1)  # update progress bar
            return testsResults

        # init progress bar
        print >> sys.stderr, "# MESSAGE: Analyzing columns..."
        pbar = tqdm(total=len(predictorList))

        #print responseList
        #print responseList.shape[0]
        #print [predictor for predictor in predictorList]
        #print len(predictorList[0])

        # Processes are split up by column: each process executes 1 GLS regression
        tpool = pool.ThreadPool(args.cpu)
        sumStats = list(
            tpool.map(PGLSUnpack, [(responseList, predictor, phyCovMatrix)
                                   for predictor in predictorList]))
        tpool.close()
        tpool.join()
        pbar.close()

        # prepare the table of p-values for output
        # rows are predictors (genes or traits), columns are response variables
        # put tuples of stats in a DataFrame(); index is OG/gene/something else
        outTable1 = pd.DataFrame(sumStats)
        # split the stats into their own columns
        outTable = pd.DataFrame()
        for col in outTable1.columns:
            outTable = pd.concat(
                [outTable,
                 pd.DataFrame(outTable1[col].values.tolist())],
                axis=1)

        # attach the index IDs and set them to index
        outTable = pd.concat(
            [outTable,
             pd.DataFrame({"Predictor": predictorTable.T.index})],
            axis=1).set_index("Predictor")

        # specify stats being reported for each dependent var
        statsNames = ("F-p_value", "F-AIC")
        # iterate over them to get the column names in order (to match the DataFrame() generated above)
        outTable.columns = [
            depName + '.' + statName for depName in responseTable.columns
            for statName in statsNames
        ]

    else:  # if in CPGLS mode

        # Load in AA alignment
        alignment = AlignIO.read(args.predictor, format=args.format)

        # Load the substitution matrix and normalize if necessary
        if args.sub_matrix.find("BLOSUM") > -1:  # if it's a BLOSUM matrix
            try:  # look for it in the blosum module
                subMatrix = blosum.submatrix(
                    int(''.join([i for i in args.sub_matrix if i.isdigit()])))
            except:
                print >> sys.stderr, "# ERROR: {} is not part of the blosum submatrix module!".format(
                    args.sub_matrix)
                sys.exit(1)
        else:  # load it from file
            subMatrix = np.array(pd.read_table(args.sub_matrix, sep='\t'))

        if not np.allclose(
                subMatrix, subMatrix.T
        ):  # NOTE that the 'is' operator won't work with np.array()!
            print >> sys.stderr, "# ERROR: Substitution rate matrix is asymmetric, exiting"
            sys.exit(1)
        subMatrixNorm = normalizeRowWise(subMatrix, 1)
        if not np.allclose(subMatrix, subMatrixNorm):
            print >> sys.stderr, "# WARNING: Substitution rate matrix was not properly normalized, but now it is"
            subMatrix = subMatrixNorm

        # Find seqs that are not in tree or response table, issue warnings
        missingTaxa = set()
        for record in alignment:
            if record.id not in orderedLeaves:
                print >> sys.stderr, "# WARNING: {} is missing from the tree and will be dropped from analysis".format(
                    record.id)
                missingTaxa.add(record.id)
            if record.id not in responseTable.index:
                print >> sys.stderr, "# WARNING: {} is missing from the response table and will be dropped from analysis".format(
                    record.id)
                missingTaxa.add(record.id)

        # Drop those missing taxa
        keepRecords = dict()
        for record in alignment:
            if record.id not in missingTaxa:
                keepRecords[record.id] = record
        keepRecordsList = [keepRecords[taxon] for taxon in orderedLeaves
                           ]  # get the seqs in order!

        # Get the root phenotypes, aka phylomeans, for fixing the intercept
        # unpack the response vars into taxon-keyed dicts
        responseDicts = [
            dependent[1].to_dict() for dependent in responseTable.T.iterrows()
        ]
        # get the phylomeans and store them to a list of intercepts
        # the below pulls out the _last_ value for a tree traversed in post-order
        intercepts = [ancR(tree, response)[-1] for response in responseDicts]

        # store the culled alignment as an array, sequences row-wise
        alignArray = np.array([list(record) for record in keepRecordsList],
                              np.character)
        # then split it into column vectors
        colVectors = np.hsplit(alignArray, alignArray.shape[1])

        if args.key_seq:  # map the PP scores to key sequence positions
            # get key columns of the alignment
            keyCols = key_alignment_columns(alignment, args.key_seq)
            colVectors = [colVectors[i - 1] for i in keyCols]

        ### Multiple-arg unpacker for the ThreadPool. Dependent variable names and bell curve directory are passed by default
        def CPGLSUnpack(pargs, depNames=responseTable.columns):
            # init master list for all significance test results across dependents
            testsResults = list()
            # for each dependent variable
            for depIndex, dependent in enumerate(pargs[0].T):
                # get the name of the dependent variable
                depName = depNames[depIndex]
                intercept = intercepts[depIndex]
                # recreate the argument list using only that dependent variable
                dargs = [dependent, intercept] + pargs[2:]
                #(responseList, intercepts, phyCovMatrix, subMatrix, column, args.sub_weight, colnum)
                #(responseList, intercept, phyCovMatrix, subMatrix, colVector, subWeight=1, colnum=0)
                # regress against that variable
                regDesign = cpgls(*dargs)

                # can get the whitened data for further manipulation
                #print "whitened endog"
                #print pd.DataFrame(regDesign.wendog).to_csv(sep='\t') #TEST, print whitened data
                #print "whitened exog"
                #print pd.DataFrame(regDesign[0].wexog).to_csv(sep='\t')  # TEST, print whitened data

                results = regDesign.fit()  # fit the regression, SILENTLY
                # if we want to see the summary of every regression
                #print results.summary()

                #print "FP " + str(results.f_pvalue) #TEST

                if ~np.isnan(results.f_pvalue):  # if not a conserved column

                    # do pairwise 2-sample t-tests and get the p-values
                    multicorr = 'hs'  # the multiple-test correction to be used
                    pairwiseT = results.t_test_pairwise(
                        term_name='aa', method=multicorr).result_frame
                    pairTDict = pairwiseT["pvalue-" + multicorr].T.to_dict()
                    min_pairT = np.nanmin(pairTDict.values())
                    #pairTDict = str(pairTDict) # safer for .to_csv()?

                    # Do a within/among comparison of variances
                    anovaF_pvalue = sm.stats.anova_lm(results,
                                                      typ=2)["PR(>F)"]["aa"]

                    # Do normality tests for diagnostic purposes
                    # omni:
                    omniNorm_pvalue = sms.omni_normtest(results.resid)[1]
                    # Jarque-Bera:
                    #jbNorm_pvalue = sms.jarque_bera(results.resid)[1]

                    # collate all the p-values, including the F-pVal for the linear model
                    pDictList = [
                        results.f_pvalue, anovaF_pvalue, min_pairT, pairTDict,
                        omniNorm_pvalue
                    ]

                else:
                    pDictList = [
                        None
                    ] * 5  # return a null list; must be same length as pDictList above!

                # append the set of p-values to the list of lists for different response variables
                testsResults = testsResults + pDictList

                # if the column makes the specified p-cutoff to draw bell curves
                '''if pDictList[2] and pDictList[2] <= args.bell_curves:
					# generate discrete series for the t-distributions
					xs, tPDFs, tCDFs = tDists(summary)
					# draw t-distributions identified by dependent and column
					drawDists(xs, tPDFs, depName, pargs[5], bellDir)'''

            pbar.update(1)  # update progress bar
            # return minimum p-val and p-val dict for pool map, all should be Bonferroni-corrected beforehand
            return testsResults

        # init progress bar
        print >> sys.stderr, "# MESSAGE: Analyzing columns..."
        pbar = tqdm(total=len(colVectors))

        #for i in range(len(colVectors)): #TEST for error handling in the ThreadPool
        #	print >> sys.stderr, CPGLSUnpack([responseList, intercepts, phyCovMatrix, subMatrix, colVectors[i], args.sub_weight, i])

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # Processes are split up by column: each process executes 1 GLS regression
            tpool = pool.ThreadPool(args.cpu)
            sumStats = list(
                tpool.map(CPGLSUnpack, [[
                    responseList, intercepts, phyCovMatrix, subMatrix, column,
                    args.sub_weight, colnum
                ] for colnum, column in enumerate(colVectors)])
            )  # split alignment array into columns and feed them to pgls()
            tpool.close()
            tpool.join()
            pbar.close()

        # prepare the table of p-values for output
        outTable = pd.DataFrame({
            "Sites": [i + 1 for i in range(len(colVectors))]
        }).set_index("Sites")  # init table with a site column
        # specify stats being reported for each dependent var
        statsNames = ("model_F-p", "anova_F-p", "min_t-p_HolmSidak",
                      "pairwise_t-p_HolmSidak", "omni_X2-p"
                      )  # to return the min p-val and all the p-vals
        # create data columns in the same order as the lists returned by the ThreadPool
        statsColumns = [
            depName + '.' + statName for depName in responseTable.columns
            for statName in statsNames
        ]
        for i, statCol in enumerate(statsColumns):
            outTable[statCol] = [el[i] for el in sumStats]

    if args.manhattan:
        # NOTE that this will generally crash any parent script calling this main() function,
        # because matplotlib fails to hand plotting resources back to the parent process.

        # get all numeric columns from outTable
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        outNumerics = outTable.select_dtypes(include=numerics)

        # draw plots in parallel
        print >> sys.stderr, "# MESSAGE: Drawing Manhattan plots..."

        def manhattan_unpack(pargs):
            manhattanPlot(*pargs)
            return 0

        #print >> sys.stderr, manhattan_unpack(*[(outTable.index, list(outNumerics.T.ix[0]), manhattanDir+str(outNumerics.T.index[0])+".pdf", args.manhattan_thresholds, args.key_seq)]) #TEST

        tpool = pool.ThreadPool(args.cpu)
        #tpool = pool.ThreadPool(1)
        # list comp plots each numeric column against the index and names the files for those numeric columns
        # with itertuples(), element 0 of the row is the index name and [1:] are the actual values
        tpool.map(manhattan_unpack,
                  [(outTable.index, list(y[1:]), manhattanDir + str(y[0]) +
                    ".pdf", args.manhattan_thresholds, args.key_seq)
                   for y in outNumerics.T.itertuples(name=None)])
        tpool.close()
        tpool.join()

    # return the results DataFrame to any calling process
    return outTable
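tree2covMatrix() is not shown here. Under the Brownian model it presumably builds the standard phylogenetic variance-covariance matrix: C[i, j] is the branch length shared by leaves i and j (root to their MRCA), with off-diagonals scaled by Pagel's lambda. A hypothetical stand-in, not the project's implementation:

import numpy as np
from ete3 import Tree

def brownian_cov(tree, pagel=1.0):
    # C[i, j] = root-to-MRCA distance of leaves i and j; diagonal = root-to-leaf distance.
    leaves = tree.get_leaves()
    n = len(leaves)
    C = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            if i == j:
                C[i, j] = tree.get_distance(leaves[i])
            else:
                mrca = leaves[i].get_common_ancestor(leaves[j])
                C[i, j] = pagel * tree.get_distance(mrca)
    return C, [leaf.name for leaf in leaves]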
Example #7
from ete3 import Tree, TreeStyle, TextFace

#t = Tree("(((((FelCat:1.0, (PriBen:1.0, PriViv:1.0)Anc2:1.0)Anc3:1.0, (AciJub:1.0, PumCon:1.0)Anc1:1.0)Anc4:1.0, LynPar:1.0)Anc5:1.0, CarCar:1.0)Anc6:1.0, (PanTig:1.0,(PanOnc:1.0, (PanLeo:1.0, PanPar:1.0)Anc7:1.0)Anc8:1.0)Anc9:1.0);", format=1)

#Topology from Evolution of Cats, 2007
#t = Tree("(((((FelCat:2.0, (PriBen:1.0, PriViv:1.0)Anc2:1.0), (AciJub:2.0, PumCon:2.0)Anc1:1.0), LynPar:4.0), CarCar:5.0), (PanTig:5.0,(PanOnc:4.0, (PanLeo:3.0, PanPar:3.0)Anc7:1.0)Anc8:1.0)Anc9:1.0);", format=1)

#Topology from Kliver
#t = Tree("(((((FelCat:2.0, (PriBen:1.0, PriViv:1.0)Anc2:1.0), LynPar:3.0), (AciJub:3.0, PumCon:3.0)Anc1:1.0), CarCar:5.0), (PanTig:5.0,(PanOnc:4.0, (PanLeo:3.0, PanPar:3.0)Anc7:1.0)Anc8:1.0)Anc9:1.0);", format=1)

#branch lengths = Kliver's tree branch lengths * 1000
#t = Tree("((((AciJub:2.91010233990525967,PumCon:2.18060580721677255)Anc1:1.05347870516064369,(((PriBen:0.98094949038301073,PriViv:1.00313958496688689)Anc2:1.80011684448748259,FelCat:2.86356101725523731)100:3.3638608143336113,LynPar:3.07709301059199899)96:0.14292306680570912)100:0.59099766777017380,CarCar:3.15189615063413749)100:0.78860645297528301,((PanOnc:1.05156689588516367,(PanLeo:1.24626526455903818,PanPar:0.96868719506379029)Anc7:0.28662050599669259)Anc8:0.38474422414646876,PanTig:1.69403903125740969)Anc9:2.67796389385434753);", format=1)
t = Tree(
    "((CroCro:0.03163103687492756222,((((AciJub:0.00291010233990525967,PumCon:0.00218060580721677255)Anc1:0.00105347870516064369,(((PriBen:0.00098094949038301073,PriViv:0.00100313958496688689)Anc2:0.00180011684448748259,FelCat:0.00286356101725523731)100:0.00033638608143336113,LynPar:0.00307709301059199899)96:0.00014292306680570912)100:0.00059099766777017380,CarCar:0.00315189615063413749)100:0.00078860645297528301,((PanOnc:0.00105156689588516367,(PanLeo:0.00124626526455903818,PanPar:0.00096868719506379029)Anc7:0.00028662050599669259)Anc8:0.00038474422414646876,PanTig:0.00169403903125740969)Anc9:0.00267796389385434753)100:0.01757780609900925356):0.03362414544383966059,CanFam:0.03362414544383966059);",
    format=1)
t.convert_to_ultrametric()

print(t.write(format=1))

(t & "AciJub").add_features(label="150")
(t & "AciJub").add_face(TextFace((t & "AciJub").label, ftype='Arial'),
                        column=0,
                        position="branch-top")
(t & "PumCon").add_features(label="293")
(t & "PumCon").add_face(TextFace((t & "PumCon").label, ftype='Arial'),
                        column=0,
                        position="branch-top")
(t & "CarCar").add_features(label="185")
(t & "CarCar").add_face(TextFace((t & "CarCar").label, ftype='Arial'),
                        column=0,
                        position="branch-top")
Example #8
            # Excerpt from a larger routine: `colors`, `current_color`, `g`,
            # `gene_tree`, and `ultrametric` are defined by the enclosing code.
            current_color %= len(colors)
            style = NodeStyle()
            style['vt_line_color'] = colors[current_color]
            style['hz_line_color'] = colors[current_color]
            style['size'] = 0
            style['fgcolor'] = '#000000'
            style["vt_line_width"] = 2
            style["hz_line_width"] = 2
            for gg in g.traverse():
                if not gg.coloured:
                    gg.set_style(style)
                    gg.coloured = True
            source_style = NodeStyle(style)
            source_style['size'] = 5
            source_style['fgcolor'] = '#C01000'
            g.set_style(source_style)

    tstyle = TreeStyle()
    #tstyle.show_leaf_name = False
    tstyle.scale = 28
    tstyle.branch_vertical_margin = 6
    tstyle.show_branch_length = False

    # tstyle.show_branch_support = True
    tstyle.show_scale = False

    if ultrametric == 1:
        gene_tree.convert_to_ultrametric()
    gene_tree.show(tree_style=tstyle)
    # species_tree.show()
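A self-contained sketch of the same clade-colouring pattern, for context (the random tree and colour list are stand-ins for the excerpt's gene tree and palette):

from ete3 import Tree, NodeStyle, TreeStyle

colors = ["#1f77b4", "#ff7f0e", "#2ca02c"]
tree = Tree()
tree.populate(12)  # random topology standing in for a real gene tree

for node in tree.traverse():
    node.add_feature("coloured", False)

for i, clade in enumerate(tree.get_children()):
    style = NodeStyle()
    style["vt_line_color"] = colors[i % len(colors)]
    style["hz_line_color"] = colors[i % len(colors)]
    style["size"] = 0
    for n in clade.traverse():
        if not n.coloured:
            n.set_style(style)
            n.coloured = True

ts = TreeStyle()
ts.show_scale = False
tree.convert_to_ultrametric()
tree.render("clades.png", tree_style=ts)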