def makeUltra(treeFile, outgroup):
    """Load Newick trees from a file and convert each one to ultrametric.

    Every non-blank line of ``treeFile`` that does not start with ``"NA"``
    is parsed with ``Tree``, optionally re-rooted, and converted with
    ``convert_to_ultrametric()``.

    Args:
        treeFile: Path to a text file holding one Newick tree per line.
        outgroup: Falsy to leave the rooting untouched.  A string is used
            as the name passed to ``set_outgroup``.  Any other truthy value
            (for example ``True``) keeps the historical behaviour of
            rooting on a leaf literally named ``"outgroup"``.

    Returns:
        list: The converted trees, in file order.
    """
    print("loading trees...")
    treelist = []
    with open(treeFile, 'r') as newick:
        for line in newick:
            # Blank lines carry no tree; "NA" lines mark missing trees.
            if not line.strip() or line.startswith("NA"):
                continue
            t = Tree(line)
            if outgroup:
                # The original always passed the literal "outgroup" and
                # ignored the argument's value; honour a real name when one
                # is supplied, and fall back to the literal otherwise.
                root_name = outgroup if isinstance(outgroup, str) else "outgroup"
                t.set_outgroup(root_name)
            t.convert_to_ultrametric()
            treelist.append(t)
    return treelist
def get_phyparts_nodes(sptree_fn, phyparts_root):
    """Name species-tree nodes after the matching entries of a node key file.

    The species tree is loaded and made ultrametric.  Each non-blank line of
    ``<phyparts_root>.node.key`` is split on whitespace: the first field is
    used as the node identifier and the second field, with ``";"`` appended,
    is parsed as a Newick subtree.  Every node of the species tree whose
    topology id equals the topology id of one of those subtrees is renamed
    to that subtree's identifier.

    Args:
        sptree_fn: Value handed to ``Tree`` to load the species tree.
        phyparts_root: Path prefix; ``".node.key"`` is appended to it.

    Returns:
        tuple: ``(sptree, subtrees_dict, subtrees_topids)`` where
        ``subtrees_dict`` maps identifier -> subtree ``Tree`` and
        ``subtrees_topids`` maps identifier -> topology id.
    """
    sptree = Tree(sptree_fn)
    sptree.convert_to_ultrametric()

    # Close the key file deterministically and ignore blank lines, which
    # would otherwise fail on ``split()[0]``.
    with open(phyparts_root + ".node.key") as key_file:
        phyparts_node_key = [line for line in key_file if line.strip()]

    subtrees_dict = {}
    for entry in phyparts_node_key:
        fields = entry.split()
        subtrees_dict[fields[0]] = Tree(fields[1] + ";")

    subtrees_topids = {
        name: subtree.get_topology_id()
        for name, subtree in subtrees_dict.items()
    }

    # Reverse lookup topology id -> identifier.  When several identifiers
    # share a topology id the last one wins, exactly as in a nested scan
    # that overwrites ``node.name`` on every match.
    name_by_topid = {topid: name for name, topid in subtrees_topids.items()}

    for node in sptree.traverse():
        match = name_by_topid.get(node.get_topology_id())
        if match is not None:
            node.name = match

    return sptree, subtrees_dict, subtrees_topids
def get_phyparts_nodes(sptree_fn,phyparts_root):
    """Name species-tree nodes after the matching entries of a node key file.

    The species tree is loaded and made ultrametric.  Each non-blank line of
    ``<phyparts_root>.node.key`` is split on whitespace: the first field is
    used as the node identifier and the second field, with ``";"`` appended,
    is parsed as a Newick subtree.  Every node of the species tree whose
    topology id equals the topology id of one of those subtrees is renamed
    to that subtree's identifier.

    While traversing, the topology id and the node itself are printed for
    every node whose leaves include ``"Takakia_4343a"``.

    Args:
        sptree_fn: Value handed to ``Tree`` to load the species tree.
        phyparts_root: Path prefix; ``".node.key"`` is appended to it.

    Returns:
        tuple: ``(sptree, subtrees_dict, subtrees_topids)`` where
        ``subtrees_dict`` maps identifier -> subtree ``Tree`` and
        ``subtrees_topids`` maps identifier -> topology id.
    """
    sptree = Tree(sptree_fn)
    sptree.convert_to_ultrametric()

    # Close the key file deterministically and ignore blank lines, which
    # would otherwise fail on ``split()[0]``.
    with open(phyparts_root+".node.key") as key_file:
        phyparts_node_key = [line for line in key_file if line.strip()]

    subtrees_dict = {}
    for entry in phyparts_node_key:
        fields = entry.split()
        subtrees_dict[fields[0]] = Tree(fields[1]+";")

    subtrees_topids = {}
    for x in subtrees_dict:
        subtrees_topids[x] = subtrees_dict[x].get_topology_id()

    # Reverse lookup topology id -> identifier; the last identifier wins
    # when several share a topology id, as in the original nested scan.
    name_by_topid = {topid: name for name, topid in subtrees_topids.items()}

    for node in sptree.traverse():
        node_topid = node.get_topology_id()
        # NOTE(review): hard-coded taxon name -- looks like leftover
        # debugging output; confirm whether it is still wanted.
        if "Takakia_4343a" in node.get_leaf_names():
            print(node_topid)
            print(node)
        match = name_by_topid.get(node_topid)
        if match is not None:
            node.name = match

    return sptree,subtrees_dict,subtrees_topids
def cluster_hierarchical(output_path, matrix, species_names, cluster_alg, cluster_alg_label, config, phonemes_encoding_tree=False):
    """Cluster a matrix into a tree, style it, and save it as .pdf and .nw.

    ``cluster_alg`` is called with the matrix, the species names and a
    ``distances`` flag, and must return a Newick string.  The resulting tree
    is styled from ``config``, rendered to a PDF and written as Newick.

    Args:
        output_path: String prefix for the output file names.
        matrix: First positional argument forwarded to ``cluster_alg``.
        species_names: Second positional argument forwarded to
            ``cluster_alg``.
        cluster_alg: Callable producing a Newick string.
        cluster_alg_label: Human-readable algorithm name; the first letter
            of each whitespace-separated word is appended to
            ``output_path`` to form the file name base.
        config: Mapping providing ``"ete_node_style"`` and
            ``"ete_tree_style"``.
        phonemes_encoding_tree: When true, distances are switched off, the
            tree is made ultrametric and node markers get size 0.

    Returns:
        The styled tree object.
    """
    print(f" - Creating tree using {cluster_alg_label}, saving to .nw and .pdf")

    # A phonemes encoding tree is built without distances.
    use_distances = not phonemes_encoding_tree
    newick = cluster_alg(matrix, species_names, distances=use_distances)

    tree = Tree(newick)
    if phonemes_encoding_tree:
        tree.convert_to_ultrametric()

    for current in tree.traverse():
        current.set_style(config["ete_node_style"])
        if phonemes_encoding_tree:
            current.img_style["size"] = 0
        if not current.is_leaf():
            continue
        # The leading space keeps the label off the end of the leaf branch.
        label = TextFace(f" {current.name}", fgcolor="black", ftype="Charis SIL Compact", fsize=14)
        current.add_face(label, column=0, position='branch-right')

    # File name base: the output prefix followed by the label's initials.
    filename_base = output_path + "".join(word[0] for word in cluster_alg_label.split())
    pdf_name = f"{filename_base}.pdf"
    newick_name = f"{filename_base}.nw"
    tree.render(pdf_name, tree_style=config["ete_tree_style"])
    tree.write(format=0, outfile=newick_name)
    return tree
def main(*args):
    """Run the hemiplasy simulation pipeline from the command line.

    Steps, in order: parse the command line, read the input file, optionally
    convert the species tree to coalescent units and prune it, translate the
    tree into ``ms`` splits, launch ``ms`` and ``seq-gen`` through the
    ``hemiplasytool`` helpers, concatenate their per-job temporary files,
    summarise the simulated trees with ``seqtools`` and write the output
    files.  Temporary ``*.tmp`` files are left on disk.

    Args:
        *args: Not used; the name is rebound to the namespace returned by
            ``parser.parse_args()``, which reads ``sys.argv``.

    Raises:
        SystemExit: From ``argparse`` on invalid options, or when ``--CI``
            is ``upper``/``lower`` but the input is already a coalescent
            tree (no regression bounds exist in that case).
    """
    start = time.time()
    hemiplasytool.print_banner()

    parser = argparse.ArgumentParser(
        description="Tool for characterising "
        "hemiplasy given traits mapped onto a species tree"
    )
    parser.add_argument(
        "-v",
        "--verbose",
        help="Enable debugging messages to be displayed",
        action="store_true",
    )
    parser.add_argument(
        "input",
        metavar="input",
        help="Input NEXUS file",
    )
    parser.add_argument(
        "-n",
        "--replicates",
        metavar="",
        help="Number of replicates per batch",
        default=1000000,
    )
    parser.add_argument(
        "-t",
        "--threads",
        metavar="",
        help="Number of threads for simulations",
        default=16
    )
    parser.add_argument(
        "-p",
        "--mspath",
        metavar="",
        help="Path to ms (if not in user path)",
        default="ms"
    )
    parser.add_argument(
        "-g",
        "--seqgenpath",
        metavar="",
        help="Path to seq-gen (if not in user path)",
        default="seq-gen"
    )
    parser.add_argument(
        "-s",
        "--mutationrate",
        metavar="",
        help="Seq-gen mutation rate (default 0.05)",
        default=0.05,
    )
    parser.add_argument(
        "-c",
        "--CI",
        metavar="",
        help="Optionally simulate at the upper ('upper') or lower ('lower') bounds of the 95 %% CI for the coalescent conversion regression.",
        default=None
    )
    parser.add_argument("-o", "--outputdir", metavar="", help="Output directory/prefix")
    args = parser.parse_args()

    # Setup ###################
    log.basicConfig(level=log.DEBUG)
    logger = log.getLogger()
    # Debug messages are only shown with --verbose.
    logger.disabled = not args.verbose
    mpl_logger = log.getLogger("matplotlib")
    mpl_logger.setLevel(log.WARNING)
    ##########################

    # Read input file
    log.debug("Reading input file...")
    treeSp, derived, admix, outgroup, input_type, tree2, conversion_type = hemiplasytool.readInput(args.input)

    t = Tree(treeSp, format=1)
    t.convert_to_ultrametric()

    if input_type != 'coal':
        # Convert ML tree to a coalescent tree based on GCFs
        (treeSp, t, treeSp_low, t_low, treeSp_up, t_up,
         intercept, coef, newick_internals, coal_internals) = hemiplasytool.subs2coal(treeSp)
        original_tree = [treeSp, t]
    else:
        original_tree = [treeSp, Tree(treeSp, format=1)]

    sim_type = args.CI
    if sim_type in ('upper', 'lower'):
        if input_type == 'coal':
            # The bound trees only exist after subs2coal(); without this
            # guard the lookup below fails with a NameError.
            parser.error(
                "--CI requires an input tree that is converted to coalescent units"
            )
        if sim_type == 'upper':
            treeSp = treeSp_up
            t = t_up
        else:
            treeSp = treeSp_low
            t = t_low

    # Tree pruning
    if outgroup is not None:
        log.debug("Pruning tree...")
        treeSp, t = hemiplasytool.prune_tree(treeSp, derived, outgroup)
        tree2, _ = hemiplasytool.prune_tree(tree2, derived, outgroup)

    taxalist = [leaf.name for leaf in t.iter_leaves()]

    # Convert coalescent tree to ms splits
    treeSp, conversions = hemiplasytool.names2ints(treeSp, conversion_type, input_type)
    original_tree[0], _ = hemiplasytool.names2ints(original_tree[0], conversion_type, input_type)

    # Convert newick tree to ms splits
    splits, taxa = hemiplasytool.newick2ms(treeSp)

    traits = {}
    # NOTE(review): the loop variable ``i`` is deliberately left as a plain
    # loop (not a comprehension) because its final value is reused further
    # down as the key of ``results`` and as an argument to readSeqs() --
    # presumably a leftover batch index; confirm the intended value.
    for i in taxalist:
        if i in derived:
            traits[conversions[i]] = 1
        else:
            traits[conversions[i]] = 0

    # Generate tree in ete3 with internal branches labeled based on user input
    # plus how ms interprets them, e.g. I4(3), so events can be specified to ms.
    if len(admix) != 0:
        tree2_ete, tree2_newick, node_conversions = hemiplasytool.make_introgression_tree(tree2, conversions)
        # Update conversion dictionary to contain node conversions (e.g. I4 -> 2)
        conversions = {**conversions, **node_conversions}
        # Convert the admix list: times are divided by 2, taxa are mapped
        # through ``conversions``.
        admix = [
            [str(float(e[0]) / 2.0), str(conversions[e[1]]), str(conversions[e[2]]), e[3]]
            for e in admix
        ]
        # Sort by time, largest first (not sure if ms requires this or not)
        admix.sort(key=lambda x: float(x[0]), reverse=True)

    # Make program calls
    threads = int(args.threads)
    reps = int(args.replicates)

    # Begin batches
    taxalist = [int(s) for s in traits.keys()]

    inherited = []
    results = {}

    # NOTE(review): when ``admix`` is non-empty its taxa were already mapped
    # through ``conversions`` above, so this second lookup uses the converted
    # strings as keys -- kept as in the original; confirm it is intended.
    admix = [
        [e[0], str(conversions[e[1]]), str(conversions[e[2]]), e[3]]
        for e in admix
    ]
    has_intro = len(admix) != 0

    # Replicates reserved for the introgression events.
    total_reps_for_intro = sum(int(reps * float(e[3])) for e in admix)

    # Spread the remaining replicates evenly; a non-zero remainder gets one
    # additional job of its own.
    remaining_reps = reps - total_reps_for_intro
    base_reps, leftover_reps = divmod(remaining_reps, threads)
    per_thread = [base_reps] * threads
    if leftover_reps:
        threads += 1
        per_thread.append(leftover_reps)

    if has_intro:
        # Extra job slot for introgression
        threads += 1

    prefix = args.outputdir
    processes_ms = []
    processes_sq = []
    intro_indices = []

    # Launch ms: the last slot (when introgression is requested) fans out
    # into one call per admixture event, every other slot is a plain call.
    for y in range(threads):
        if has_intro and y == threads - 1:
            for m, event in enumerate(admix):
                o = str(y) + "_" + str(m)
                intro_indices.append(m)
                ms_call = hemiplasytool.splits_to_ms(
                    splits, taxa, int(reps * float(event[3])), args.mspath, o, prefix, event
                )
                processes_ms.append(
                    hemiplasytool.call_programs(ms_call, "", "trees.tmp", taxalist)
                )
        else:
            ms_call = hemiplasytool.splits_to_ms(
                splits, taxa, per_thread[y], args.mspath, y, prefix
            )
            processes_ms.append(
                hemiplasytool.call_programs(ms_call, "", "trees.tmp", taxalist)
            )

    # Wait for every ms job; sleep between polls instead of spinning.
    while any(p.poll() is None for p in processes_ms):
        time.sleep(0.1)

    # Concatenate the per-job tree files into <prefix>.trees.tmp
    tree_files = []
    for y in range(threads):
        if has_intro and y == threads - 1:
            for intro in intro_indices:
                tree_files.append(prefix + ".trees" + str(y) + "_" + str(intro) + ".tmp")
        else:
            tree_files.append(prefix + ".trees" + str(y) + ".tmp")
    os.system("cat " + " ".join(tree_files) + " > " + prefix + ".trees.tmp")

    # Launch seq-gen on every per-job tree file.
    # NOTE(review): ``ms_call`` here is whatever the last ms launch left
    # behind; it is passed through unchanged as in the original.
    for y in range(threads):
        if has_intro and y == threads - 1:
            for z, intro in enumerate(intro_indices):
                seqgencall = hemiplasytool.seq_gen_call(
                    prefix + ".trees" + str(y) + "_" + str(intro) + ".tmp",
                    args.seqgenpath, args.mutationrate, str(y), prefix, z
                )
                processes_sq.append(
                    hemiplasytool.call_programs_sg(ms_call, seqgencall, "trees.tmp", taxalist)
                )
        else:
            seqgencall = hemiplasytool.seq_gen_call(
                prefix + ".trees" + str(y) + ".tmp",
                args.seqgenpath, args.mutationrate, str(y), prefix
            )
            processes_sq.append(
                hemiplasytool.call_programs_sg(ms_call, seqgencall, "trees.tmp", taxalist)
            )

    intro_start = sum(per_thread)

    # Wait for every seq-gen job.
    while any(p.poll() is None for p in processes_sq):
        time.sleep(0.1)

    # Concatenate the per-job sequence files into <prefix>.seqs.tmp
    seq_files = []
    for y in range(threads):
        if has_intro and y == threads - 1:
            for z, intro in enumerate(intro_indices):
                seq_files.append(prefix + ".seqs" + str(y) + "_" + str(z) + ".tmp")
        else:
            seq_files.append(prefix + ".seqs" + str(y) + ".tmp")
    os.system("cat " + " ".join(seq_files) + " > " + prefix + ".seqs.tmp")

    # Gets indices of trees with site patterns that match species pattern
    log.debug("Finding trees that match species trait pattern...")
    match_species_pattern, counts_by_tree = seqtools.readSeqs(
        prefix + ".seqs.tmp", len(taxalist), traits, len(splits), i, prefix, intro_start
    )

    log.debug("Getting focal trees...")
    # Gets the trees at these indices
    focal_trees, _ = seqtools.getTrees(prefix + ".trees.tmp", match_species_pattern)
    all_focal_trees = focal_trees
    assert len(match_species_pattern) == len(focal_trees)

    log.debug("Calculating discordance...")
    results[i], disc, conc = seqtools.propDiscordant(focal_trees, treeSp)

    # Known issue carried over from the original comments: introgression is
    # not accurately reflected in mutation counting around here.  Either
    # count_mutations does not work on introgressed trees, or the list of
    # trees passed in does not contain the introgression trees.
    focaltrees_d = seqtools.parse_seqgen(prefix + ".focaltrees.tmp", len(taxalist), disc)
    focaltrees_c = seqtools.parse_seqgen(prefix + ".focaltrees.tmp", len(taxalist), conc)
    n_mutations_d = [seqtools.count_mutations(tree, len(taxalist)) for tree in focaltrees_d]
    n_mutations_c = [seqtools.count_mutations(tree, len(taxalist)) for tree in focaltrees_c]

    nderived = sum(1 for trait in traits.values() if trait == 1)

    interesting = seqtools.get_interesting(
        focaltrees_d, nderived, len(traits.keys())
    )
    for item in interesting:
        test_summarize = seqtools.summarize_interesting(item, len(traits.keys()))
        inherited = inherited + test_summarize

    # Temporary files are intentionally kept (cleanup is disabled):
    # os.system("rm *.tmp")

    ###################################################################
    # Begin summary of all batches
    mutation_counts_d = [[x, n_mutations_d.count(x)] for x in set(n_mutations_d)]
    mutation_counts_c = [[x, n_mutations_c.count(x)] for x in set(n_mutations_c)]
    summary = hemiplasytool.summarize(results)

    if len(inherited) > 0:
        mutation_pat = hemiplasytool.summarize_inherited(inherited)
    else:
        mutation_pat = None
        log.debug(
            "Not enough 'interesting' cases to provide mutation inheritance patterns"
        )

    min_mutations_required = hemiplasytool.fitchs_alg(str(treeSp), traits)

    if input_type == "coal":
        # No regression was run, so there is nothing to report for it.
        intercept, coef, newick_internals, coal_internals = [None] * 4

    log.debug("Writing output file...")
    hemiplasytool.write_output(
        summary,
        mutation_counts_c,
        mutation_counts_d,
        mutation_pat,
        counts_by_tree,
        str(treeSp),
        admix,
        traits,
        min_mutations_required,
        args.outputdir,
        (reps),
        conversions,
        original_tree[0],
        intercept,
        coef,
        newick_internals,
        coal_internals,
        args.mutationrate
    )
    hemiplasytool.write_unique_trees(all_focal_trees, args.outputdir, traits)

    end = time.time()
    print("\nTime elapsed: " + str(end - start) + " seconds")
def main(argv, wayout):
    """Run phylogenetic GLS regressions of predictors against trait responses.

    The response table and the tree are loaded, the tree is made ultrametric
    with a root-to-leaf length of 1, and a phylogenetic covariance matrix is
    built.  Then, depending on ``--format``:

    * ``table`` (quantitative mode): every column of the predictor trait
      table is regressed against every response variable with ``pgls``.
    * anything else (CPGLS mode): the predictor file is read as an
      alignment and every alignment column is regressed against every
      response variable with ``cpgls``.

    Regressions are dispatched one predictor column per task on a
    ``ThreadPool``.  With ``--manhattan`` the numeric result columns are
    additionally plotted.

    Args:
        argv: List of command-line tokens (without the program name).  An
            empty list is treated as ``['-h']``.
        wayout: Not used inside this function.

    Returns:
        pandas.DataFrame: One row per predictor (quantitative mode) or per
        alignment column (CPGLS mode); columns are named
        ``<response>.<statistic>``.

    Side effects:
        Rebinds the module-level ``args``; writes progress messages to
        ``sys.stderr``; may create ``tdists/`` and ``manhattan/``; exits the
        process via ``sys.exit(1)`` on substitution-matrix errors.
    """

    def _say(message):
        # Same output as the former ``print >> sys.stderr`` statements, but
        # valid on both Python 2 and Python 3.
        sys.stderr.write(message + "\n")

    # Show the help text when called without arguments; work on a new list
    # so the caller's ``argv`` is not modified.
    if not len(argv):
        argv = ['-h']

    parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter,
        description=__doc__)
    parser.add_argument(
        "-p",
        "--predictor",
        type=str,
        help="predictor values: multiple sequence alignment or trait table filename",
        required=True)
    parser.add_argument(
        "-f",
        "--format",
        default="fasta",
        help="predictor file format [fasta]; pass 'table' for quantitative mode")
    parser.add_argument('-r',
                        '--response',
                        type=str,
                        help="response values: trait table filename",
                        required=True)
    parser.add_argument("-t",
                        "--tree",
                        type=str,
                        help="tree filename (Newick format)",
                        required=True)
    # "lambda" is a reserved word, hence the option name
    parser.add_argument("-l",
                        "--lamb-pagel",
                        type=float,
                        default=1,
                        help="Pagel's Lambda [1]")
    parser.add_argument("-s",
                        "--sub_weight",
                        type=float,
                        default=1,
                        help="Substitution rate weight scalar [1]")
    parser.add_argument(
        "-sm",
        "--sub_matrix",
        type=str,
        default="BLOSUM62",
        help="Pass in a custom substitution rate matrix [BLOSUM62]")
    parser.add_argument(
        "-k",
        "--key_seq",
        type=str,
        help="Name of key sequence on which to index the output columns [None]")
    parser.add_argument("--cpu",
                        type=int,
                        default=cpu_count(),
                        help="Thread count (max # CPUs to use) [{}]".format(
                            cpu_count()))
    parser.add_argument(
        "-b",
        "--bell_curves",
        type=float,
        default=0,
        help="p-value cutoff below which t-PDF bell curves will be drawn [0]")
    parser.add_argument(
        "-m",
        "--manhattan",
        action="store_true",
        help="Save Manhattan plots with default thresholds [0.05,0.01,0.001]")
    # ``type=tuple`` would split the argument string into single characters;
    # accept one or more floats instead.
    parser.add_argument("-mt",
                        "--manhattan_thresholds",
                        type=float,
                        nargs="+",
                        default=(0.10, 0.05),
                        help="List of custom thresholds for Manhattan plots")

    global args
    args = parser.parse_args(argv)

    # set mode switch
    quanPredictor = (args.format == "table")

    # Load the continuous response values into a DataFrame in order
    responseTable = makeNiceTraitTable(args.response)
    _say("# MESSAGE: Loaded {} dependent variables for {} taxa".format(
        responseTable.shape[1], responseTable.shape[0]))

    # Load the tree
    tree = Tree(args.tree)
    # IMPORTANT: normalize root-to-leaf distance to 1
    tree.convert_to_ultrametric(tree_length=1)
    # TODO: look at ways to calculate C from an additive tree. Brownian model may be compromised.

    # Transform branch lengths into a variance-covariance matrix, using the
    # taxa and order provided in responseTable.
    # IMPORTANT: orderedLeaves dictates the order of taxa in all matrices and
    # vectors. It should equal responseTable.index iff both contain all the
    # same taxa. The matrix stays the same across all columns.
    phyCovMatrix, orderedLeaves = tree2covMatrix(
        tree, taxa=responseTable.index, pagel=args.lamb_pagel)

    # remake the response table using only taxa that are in the tree
    responseTable = makeNiceTraitTable(args.response, orderedLeaves)
    # and convert it to an array of row tuples for faster iteration, omitting the index
    responseList = np.array(
        [row for row in responseTable.itertuples(index=False, name=None)])
    _say("# MESSAGE: Generated phylogenetic covariance matrix for {} taxa".format(
        len(orderedLeaves)))

    # make sure there are dirs ready to receive the requested plots
    if args.bell_curves > 0:
        bellDir = "tdists/"
        try:
            os.mkdir(bellDir)
        except OSError:
            # best effort: typically the directory already exists
            pass
    if args.manhattan:
        manhattanDir = "manhattan/"
        try:
            os.mkdir(manhattanDir)
        except OSError:
            # best effort: typically the directory already exists
            pass

    if quanPredictor:  # if in quantitative-predictor mode
        # Load the continuous predictor values into a DataFrame and sort it
        # to match the order of the cov matrix; this drops any taxa in the
        # table that are missing from the tree
        predictorTable = makeNiceTraitTable(args.predictor, orderedLeaves)
        # convert the dataframe to a list of columns for faster iteration, omitting the index
        predictorList = np.array([
            col for col in predictorTable.T.itertuples(index=False, name=None)
        ])

        # Find predictors that are not in tree or response table, issue warnings
        missingTaxa = set()
        for pTaxon in predictorTable.index:
            if pTaxon not in orderedLeaves:
                # no warning here: already handled in tree2covMatrix()
                missingTaxa.add(pTaxon)
            if pTaxon not in responseTable.index:
                _say("# WARNING: {} is missing from the response table and will be dropped from analysis".format(
                    pTaxon))
                missingTaxa.add(pTaxon)

        def PGLSUnpack(pargs):
            """Regress one predictor column against every response variable."""
            testsResults = list()
            # for each dependent variable
            for depIndex, dependent in enumerate(pargs[0].T):
                # recreate the argument list using only that dependent
                # variable; ``pargs`` arrives as a tuple, so its tail must be
                # converted before it can be appended to a list
                dargs = [dependent] + list(pargs[1:])
                # regress against that variable
                results = pgls(*dargs).fit()
                # in present form only one predictor is accepted, so the
                # F p-value and AIC are "all that matter"
                testsResults.append((results.f_pvalue, results.aic))
            # NOTE(review): one progress tick per predictor column, matching
            # ``total=len(predictorList)`` -- confirm placement.
            pbar.update(1)
            return testsResults

        # init progress bar
        _say("# MESSAGE: Analyzing columns...")
        pbar = tqdm(total=len(predictorList))

        # Work is split up by column: each task executes 1 GLS regression
        tpool = pool.ThreadPool(args.cpu)
        sumStats = list(
            tpool.map(PGLSUnpack, [(responseList, predictor, phyCovMatrix)
                                   for predictor in predictorList]))
        tpool.close()
        tpool.join()
        pbar.close()

        # prepare the table of p-values for output
        # rows are predictors (genes or traits), columns are response variables
        outTable1 = pd.DataFrame(sumStats)
        # split the stats tuples into their own columns
        outTable = pd.DataFrame()
        for col in outTable1.columns:
            outTable = pd.concat(
                [outTable,
                 pd.DataFrame(outTable1[col].values.tolist())], axis=1)
        # attach the index IDs and set them to index
        outTable = pd.concat(
            [outTable,
             pd.DataFrame({"Predictor": predictorTable.T.index})],
            axis=1).set_index("Predictor")
        # specify stats being reported for each dependent var
        statsNames = ("F-p_value", "F-AIC")
        # column names in the same order as the DataFrame generated above
        outTable.columns = [
            depName + '.' + statName for depName in responseTable.columns
            for statName in statsNames
        ]

    else:  # if in CPGLS mode
        # Load in AA alignment
        alignment = AlignIO.read(args.predictor, format=args.format)

        # Load the substitution matrix and normalize if necessary
        if args.sub_matrix.find("BLOSUM") > -1:  # if it's a BLOSUM matrix
            try:  # look for it in the blosum module
                subMatrix = blosum.submatrix(
                    int(''.join([i for i in args.sub_matrix if i.isdigit()])))
            except Exception:
                _say("# ERROR: {} is not part of the blosum submatrix module!".format(
                    args.sub_matrix))
                sys.exit(1)
        else:  # load it from file
            # NOTE(review): the symmetry and normalisation checks are placed
            # in the custom-matrix branch; confirm they were not meant to
            # apply to module-provided matrices as well.
            subMatrix = np.array(pd.read_table(args.sub_matrix, sep='\t'))
            # NOTE that the 'is' operator won't work with np.array()!
            if not np.allclose(subMatrix, subMatrix.T):
                _say("# ERROR: Substitution rate matrix is asymmetric, exiting")
                sys.exit(1)
            subMatrixNorm = normalizeRowWise(subMatrix, 1)
            if not np.allclose(subMatrix, subMatrixNorm):
                _say("# WARNING: Substitution rate matrix was not properly normalized, but now it is")
                subMatrix = subMatrixNorm

        # Find seqs that are not in tree or response table, issue warnings
        missingTaxa = set()
        for record in alignment:
            if record.id not in orderedLeaves:
                _say("# WARNING: {} is missing from the tree and will be dropped from analysis".format(
                    record.id))
                missingTaxa.add(record.id)
            if record.id not in responseTable.index:
                _say("# WARNING: {} is missing from the response table and will be dropped from analysis".format(
                    record.id))
                missingTaxa.add(record.id)

        # Drop those missing taxa
        keepRecords = dict()
        for record in alignment:
            if record.id not in missingTaxa:
                keepRecords[record.id] = record
        # get the seqs in order!
        keepRecordsList = [keepRecords[taxon] for taxon in orderedLeaves]

        # Get the root phenotypes, aka phylomeans, for fixing the intercept
        # unpack the response vars into taxon-keyed dicts
        responseDicts = [
            dependent[1].to_dict() for dependent in responseTable.T.iterrows()
        ]
        # the below pulls out the _last_ value for a tree traversed in post-order
        intercepts = [ancR(tree, response)[-1] for response in responseDicts]

        # store the culled alignment as an array, sequences row-wise
        alignArray = np.array([list(record) for record in keepRecordsList],
                              np.character)
        # then split it into column vectors
        colVectors = np.hsplit(alignArray, alignArray.shape[1])

        if args.key_seq:  # map the scores to key sequence positions
            # get key columns of the alignment (1-based, hence ``i - 1``)
            keyCols = key_alignment_columns(alignment, args.key_seq)
            colVectors = [colVectors[i - 1] for i in keyCols]

        # Multiple-arg unpacker for the ThreadPool.
        def CPGLSUnpack(pargs, depNames=responseTable.columns):
            """Regress one alignment column against every response variable.

            ``pargs`` is the list ``[responseList, intercepts, phyCovMatrix,
            subMatrix, column, sub_weight, colnum]``.  Returns a flat list
            with five entries per response variable.
            """
            # init master list for all significance test results across dependents
            testsResults = list()
            # for each dependent variable
            for depIndex, dependent in enumerate(pargs[0].T):
                intercept = intercepts[depIndex]
                # recreate the argument list using only that dependent variable:
                # (response, intercept, phyCovMatrix, subMatrix, colVector, subWeight, colnum)
                dargs = [dependent, intercept] + pargs[2:]
                # build the regression design, then fit it silently
                regDesign = cpgls(*dargs)
                results = regDesign.fit()

                if not np.isnan(results.f_pvalue):  # if not a conserved column
                    # do pairwise 2-sample t-tests and get the p-values
                    multicorr = 'hs'  # the multiple-test correction to be used
                    pairwiseT = results.t_test_pairwise(
                        term_name='aa', method=multicorr).result_frame
                    pairTDict = pairwiseT["pvalue-" + multicorr].T.to_dict()
                    # list() so a dict view is never handed to numpy
                    min_pairT = np.nanmin(list(pairTDict.values()))
                    # Do a within/among comparison of variances
                    anovaF_pvalue = sm.stats.anova_lm(results,
                                                      typ=2)["PR(>F)"]["aa"]
                    # Normality test (omnibus) for diagnostic purposes
                    omniNorm_pvalue = sms.omni_normtest(results.resid)[1]
                    # collate all the p-values, including the F p-value for the linear model
                    pDictList = [
                        results.f_pvalue, anovaF_pvalue, min_pairT, pairTDict,
                        omniNorm_pvalue
                    ]
                else:
                    # null list; must be the same length as pDictList above!
                    pDictList = [None] * 5

                # append the set of p-values to the flat list across response variables
                testsResults = testsResults + pDictList

                # Bell-curve drawing is currently disabled:
                # if pDictList[2] and pDictList[2] <= args.bell_curves:
                #     # generate discrete series for the t-distributions
                #     xs, tPDFs, tCDFs = tDists(summary)
                #     # draw t-distributions identified by dependent and column
                #     drawDists(xs, tPDFs, depNames[depIndex], pargs[5], bellDir)

            # NOTE(review): one progress tick per alignment column, matching
            # ``total=len(colVectors)`` -- confirm placement.
            pbar.update(1)
            return testsResults

        # init progress bar
        _say("# MESSAGE: Analyzing columns...")
        pbar = tqdm(total=len(colVectors))

        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # Work is split up by column: each task executes 1 GLS regression
            tpool = pool.ThreadPool(args.cpu)
            sumStats = list(
                tpool.map(CPGLSUnpack, [[
                    responseList, intercepts, phyCovMatrix, subMatrix, column,
                    args.sub_weight, colnum
                ] for colnum, column in enumerate(colVectors)]))
            tpool.close()
            tpool.join()
        pbar.close()

        # prepare the table of p-values for output, starting with a site column
        outTable = pd.DataFrame({
            "Sites": [i + 1 for i in range(len(colVectors))]
        }).set_index("Sites")
        # specify stats being reported for each dependent var
        statsNames = ("model_F-p", "anova_F-p", "min_t-p_HolmSidak",
                      "pairwise_t-p_HolmSidak", "omni_X2-p")
        # create data columns in the same order as the lists returned by the ThreadPool
        statsColumns = [
            depName + '.' + statName for depName in responseTable.columns
            for statName in statsNames
        ]
        for i, statCol in enumerate(statsColumns):
            outTable[statCol] = [el[i] for el in sumStats]

    if args.manhattan:
        # NOTE that this will generally crash any parent script calling this
        # main() function, because matplotlib fails to hand plotting
        # resources back to the parent process.
        # get all numeric columns from outTable
        numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
        outNumerics = outTable.select_dtypes(include=numerics)

        # draw plots in parallel
        _say("# MESSAGE: Drawing Manhattan plots...")

        def manhattan_unpack(pargs):
            manhattanPlot(*pargs)
            return 0

        tpool = pool.ThreadPool(args.cpu)
        # plot each numeric column against the index and name the file after
        # the column; with itertuples(), element 0 of the row is the index
        # name and [1:] are the actual values
        tpool.map(manhattan_unpack,
                  [(outTable.index, list(y[1:]),
                    manhattanDir + str(y[0]) + ".pdf",
                    args.manhattan_thresholds, args.key_seq)
                   for y in outNumerics.T.itertuples(name=None)])
        tpool.close()
        tpool.join()

    # return the results DataFrame to any calling process
    return outTable
from ete3 import Tree, TreeStyle, TextFace

# Alternative input trees kept for reference (all disabled):
#t = Tree("(((((FelCat:1.0, (PriBen:1.0, PriViv:1.0)Anc2:1.0)Anc3:1.0, (AciJub:1.0, PumCon:1.0)Anc1:1.0)Anc4:1.0, LynPar:1.0)Anc5:1.0, CarCar:1.0)Anc6:1.0, (PanTig:1.0,(PanOnc:1.0, (PanLeo:1.0, PanPar:1.0)Anc7:1.0)Anc8:1.0)Anc9:1.0);", format=1)
#Topology from Evolution of Cats, 2007
#t = Tree("(((((FelCat:2.0, (PriBen:1.0, PriViv:1.0)Anc2:1.0), (AciJub:2.0, PumCon:2.0)Anc1:1.0), LynPar:4.0), CarCar:5.0), (PanTig:5.0,(PanOnc:4.0, (PanLeo:3.0, PanPar:3.0)Anc7:1.0)Anc8:1.0)Anc9:1.0);", format=1)
#Topology from Kliver
#t = Tree("(((((FelCat:2.0, (PriBen:1.0, PriViv:1.0)Anc2:1.0), LynPar:3.0), (AciJub:3.0, PumCon:3.0)Anc1:1.0), CarCar:5.0), (PanTig:5.0,(PanOnc:4.0, (PanLeo:3.0, PanPar:3.0)Anc7:1.0)Anc8:1.0)Anc9:1.0);", format=1)
#branch length = Kliver's tree branches lengths*1000
#t = Tree("((((AciJub:2.91010233990525967,PumCon:2.18060580721677255)Anc1:1.05347870516064369,(((PriBen:0.98094949038301073,PriViv:1.00313958496688689)Anc2:1.80011684448748259,FelCat:2.86356101725523731)100:3.3638608143336113,LynPar:3.07709301059199899)96:0.14292306680570912)100:0.59099766777017380,CarCar:3.15189615063413749)100:0.78860645297528301,((PanOnc:1.05156689588516367,(PanLeo:1.24626526455903818,PanPar:0.96868719506379029)Anc7:0.28662050599669259)Anc8:0.38474422414646876,PanTig:1.69403903125740969)Anc9:2.67796389385434753);", format=1)

# Active tree; format=1 reads internal node names.
t = Tree(
    "((CroCro:0.03163103687492756222,((((AciJub:0.00291010233990525967,PumCon:0.00218060580721677255)Anc1:0.00105347870516064369,(((PriBen:0.00098094949038301073,PriViv:0.00100313958496688689)Anc2:0.00180011684448748259,FelCat:0.00286356101725523731)100:0.00033638608143336113,LynPar:0.00307709301059199899)96:0.00014292306680570912)100:0.00059099766777017380,CarCar:0.00315189615063413749)100:0.00078860645297528301,((PanOnc:0.00105156689588516367,(PanLeo:0.00124626526455903818,PanPar:0.00096868719506379029)Anc7:0.00028662050599669259)Anc8:0.00038474422414646876,PanTig:0.00169403903125740969)Anc9:0.00267796389385434753)100:0.01757780609900925356):0.03362414544383966059,CanFam:0.03362414544383966059);",
    format=1)
t.convert_to_ultrametric()
# Parenthesised form prints the same single value on Python 2.
print(t.write(format=1))

# Attach a ``label`` feature to selected leaves and draw it above the branch.
for leaf_name, label_text in (("AciJub", "150"), ("PumCon", "293"), ("CarCar", "185")):
    leaf = t & leaf_name
    leaf.add_features(label=label_text)
    leaf.add_face(TextFace(leaf.label, ftype='Arial'), column=0, position="branch-top")
# NOTE(review): this span has no visible enclosing definition. ``colors``,
# ``current_color``, ``g``, ``gene_tree`` and ``ultrametric`` are presumably
# bound earlier in the file, and the statements may belong inside a loop or
# function body -- confirm against the surrounding code.

# Wrap the palette index so it always addresses an existing entry of ``colors``.
current_color %= len(colors)

# Style applied to the nodes under ``g`` that are not coloured yet: both
# line directions use the current palette colour, width 2, with a
# zero-size black node marker.
style = NodeStyle()
style['vt_line_color'] = colors[current_color]
style['hz_line_color'] = colors[current_color]
style['size'] = 0
style['fgcolor'] = '#000000'
style["vt_line_width"] = 2
style["hz_line_width"] = 2
for gg in g.traverse():
    # ``coloured`` marks nodes that already received a style, so they are
    # styled only once.
    if not gg.coloured:
        gg.set_style(style)
        gg.coloured = True

# ``g`` itself gets a copy of the style with a visible (size 5) marker in a
# different colour, overriding the style set in the loop above.
source_style = NodeStyle(style)
source_style['size'] = 5
source_style['fgcolor'] = '#C01000'
g.set_style(source_style)

# Tree-level display settings: branch lengths and the scale bar are hidden.
tstyle = TreeStyle()
#tstyle.show_leaf_name = False
tstyle.scale = 28
tstyle.branch_vertical_margin = 6
tstyle.show_branch_length = False
# tstyle.show_branch_support = True
tstyle.show_scale = False

# Optionally make the gene tree ultrametric before showing it.
if ultrametric == 1:
    gene_tree.convert_to_ultrametric()
gene_tree.show(tree_style=tstyle)
# species_tree.show()