def trim(OGid):
    # 0 Load MSA
    try:
        msa1 = read_fasta(f'../align_fastas1/out/{OGid}.mfa')
    except FileNotFoundError:
        msa1 = read_fasta(f'../align_fastas2-2/out/{OGid}.mfa')

    # 1 Calculate shared variables
    gaps_array = np.full((len(msa1), len(msa1[0][1])), False)
    for i, (_, seq) in enumerate(msa1):
        for j, sym in enumerate(seq):
            if sym == '-':
                gaps_array[i, j] = True
    scores = gaps_array.sum(axis=0)
    msa1 = skbio.TabularMSA([skbio.Protein(seq, metadata={'description': header}) for header, seq in msa1])

    # 2 Get trims (segments and columns)
    syms_list1 = trim_conserved(msa1, scores, gaps_array,
                                tp['con_frac'], tp['con_window'], tp['con_minlen'],
                                tp['con_rate'], tp['con_minsig'])
    syms_list2, trims = trim_insertions(msa1, scores, gaps_array,
                                        tp['gap_num'], tp['gap_rate'], tp['gap_minsig'],
                                        tp['nongap_frac'], tp['nongap_minlen'],
                                        tp['gp_sigma'], tp['gd_window'],
                                        tp['indel1_rate'], tp['indel2_rate'],
                                        tp['weights'], tp['threshold'], matrix)

    # 3 Combine trims (segments and columns) to yield final alignment
    msa2 = []
    for seq, syms1, syms2 in zip(msa1, syms_list1, syms_list2):
        syms = ['-' if sym1 != sym2 else sym1 for sym1, sym2 in zip(syms1, syms2)]  # Will only differ if one is converted to gap
        msa2.append((seq.metadata['description'], syms))

    # 4 Restore gap-only columns
    gaps_array = np.full((len(msa2), len(msa2[0][1])), False)
    for i, (_, seq) in enumerate(msa2):
        for j, sym in enumerate(seq):
            if sym == '-':
                gaps_array[i, j] = True
    scores = gaps_array.sum(axis=0)

    rf = ['x' for _ in range(len(msa2[0][1]))]  # Metadata for marking consensus columns in profile HMM
    for region, in ndimage.find_objects(ndimage.label(scores == len(msa2))[0]):  # find_objects returns 1-tuples of slices for a 1D array
        rf[region] = (region.stop - region.start) * ['.']
        for i in range(len(msa2)):
            syms = msa2[i][1]
            syms[region] = list(str(msa1[i, region]))  # Restore original symbols from untrimmed alignment

    # 5 Write to file
    msa2 = skbio.TabularMSA([skbio.Protein(''.join(syms), metadata={'description': header}) for header, syms in msa2],
                            positional_metadata={'RF': rf})
    msa2.write(f'out/{OGid}.sto', 'stockholm')
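# These scripts assume a read_fasta helper that returns a list of
# (header, sequence) tuples with headers keeping their leading '>'.
# A minimal sketch consistent with that usage (the repository's own
# implementation may differ):
def read_fasta(path):
    fasta, header, seqlines = [], None, []
    with open(path) as file:
        for line in file:
            line = line.rstrip()
            if line.startswith('>'):
                if header is not None:
                    fasta.append((header, ''.join(seqlines)))
                header, seqlines = line, []
            else:
                seqlines.append(line)
    if header is not None:
        fasta.append((header, ''.join(seqlines)))
    return fasta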
def hmm_align(OGid):
    sqidnum, gnidnum = OGid2meta[OGid]
    if sqidnum == gnidnum:
        path = f'../make_fastas1/out/{OGid}.tfa'
    else:
        path = f'../make_fastas2-2/out/{OGid}.tfa'

    run(f'../../../bin/hmmbuild --hand --eset {1.5*gnidnum} --wnone out/{OGid}.hmm ../realign_trim/out/{OGid}.sto > out/{OGid}.txt',
        shell=True, check=True)
    run(f'../../../bin/hmmalign --outformat afa out/{OGid}.hmm {path} > out/{OGid}_temp.mfa',
        shell=True, check=True)

    # Remove excess gaps (columns where every sequence has a gap symbol)
    msa = read_fasta(f'out/{OGid}_temp.mfa')
    slices, idx = [], None
    for j in range(len(msa[0][1])):
        for i in range(len(msa)):
            sym = msa[i][1][j]
            if sym not in ['-', '.']:
                if idx is None:  # Open a new slice only if one is not already open
                    idx = j
                break
        else:  # Inner loop did not break, so column is all gaps
            if idx is not None:
                slices.append(slice(idx, j))
                idx = None
    if idx is not None:  # Add final slice to end
        slices.append(slice(idx, len(msa[0][1])))

    # Write to file and remove temp alignment
    with open(f'out/{OGid}.mfa', 'w') as file:
        for header, seq1 in msa:
            seq2 = ''.join([seq1[s] for s in slices])
            seqstring = '\n'.join([seq2[i:i+80] for i in range(0, len(seq2), 80)]) + '\n'
            file.write(header + '\n' + seqstring)
    os.remove(f'out/{OGid}_temp.mfa')
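# The column-removal logic above is easiest to verify in isolation; an
# equivalent helper with a toy check (hypothetical data, not from the repo):
def remove_gap_columns(msa):
    slices, idx = [], None
    for j in range(len(msa[0][1])):
        if any(seq[j] not in ['-', '.'] for _, seq in msa):
            if idx is None:  # Column has a residue: open a slice if needed
                idx = j
        elif idx is not None:  # All-gap column: close the open slice
            slices.append(slice(idx, j))
            idx = None
    if idx is not None:
        slices.append(slice(idx, len(msa[0][1])))
    return [(header, ''.join(seq[s] for s in slices)) for header, seq in msa]

print(remove_gap_columns([('seq1', 'A-C-E'), ('seq2', 'G-T-.')]))
# [('seq1', 'ACE'), ('seq2', 'GT.')] -- all-gap columns 1 and 3 are dropped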
def decode(OGid, params):
    # Load MSA and trim terminal insertions
    msa = read_fasta(f'../../ortho_MSA/realign_hmmer2/out/{OGid}.mfa')

    idx = 0
    for j in range(len(msa[0][1])):
        for i in range(len(msa)):
            sym = msa[i][1][j]
            if sym == '.' or sym.islower():
                break
        else:  # No break: column contains no insertion symbols
            idx = j
            break
    msa = [(header, seq[idx:]) for header, seq in msa]

    idx = len(msa[0][1])
    for j in range(len(msa[0][1]), 0, -1):
        for i in range(len(msa)):
            sym = msa[i][1][j - 1]
            if sym == '.' or sym.islower():
                break
        else:  # No break: column contains no insertion symbols
            idx = j
            break
    msa = [(header, seq[:idx]) for header, seq in msa]

    # Create emission sequence
    col0 = []
    emits = []
    for j in range(len(msa[0][1])):
        col = [1 if msa[i][1][j] in ['-', '.'] else 0 for i in range(len(msa))]
        emit0 = all([c0 == c for c0, c in zip(col0, col)])  # Gap pattern identical to previous column (trivially True for the first)
        emit1 = sum(col)  # Number of gaps in column
        emits.append((emit0, emit1))
        col0 = col

    # Instantiate model
    e_dists_rv = {state: bernoulli_betabinom_frozen(p, len(msa) - 1, a, b)
                  for state, (p, a, b) in params['e_dists'].items()}
    model = hmm.HMM(params['t_dists'], e_dists_rv, params['start_dist'])

    # Decode states and write
    fbs = model.forward_backward(emits)
    with open(f'out/{OGid}.tsv', 'w') as file:
        file.write('\t'.join(states) + '\n')
        for fb in zip(*[fbs[state] for state in states]):
            file.write('\t'.join([str(v) for v in fb]) + '\n')
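# bernoulli_betabinom_frozen is not defined in this fragment. A plausible
# sketch, assuming the two emission components are independent and that
# hmm.HMM only requires a pmf method (the repository's version may differ):
from scipy.stats import bernoulli, betabinom

class bernoulli_betabinom_frozen:
    def __init__(self, p, n, a, b):
        self._bernoulli = bernoulli(p)  # Is the gap pattern identical to the previous column?
        self._betabinom = betabinom(n, a, b)  # Number of gaps in the column

    def pmf(self, emit):
        emit0, emit1 = emit
        return self._bernoulli.pmf(emit0) * self._betabinom.pmf(emit1)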
def decode(OGid, params):
    # Load MSA
    msa = read_fasta(f'../insertion_trim/out/{OGid}.mfa')

    # Create Bernoulli sequence (fraction of gaps in each column)
    ps = []
    for j in range(len(msa[0][1])):
        col = [1 if msa[i][1][j] in ['-', '.'] else 0 for i in range(len(msa))]
        p = sum(col) / len(col)
        ps.append(p)

    # Instantiate model
    e_dists_rv = {'0': msaBernoulli(ps),
                  '1': msaBernoulli([params['e_param'] for _ in range(len(ps))])}
    model = hmm.HMM(params['t_dists'], e_dists_rv, params['start_dist'])

    # Decode states
    records = []
    for header, seq in msa:
        # Create emission sequence
        emits = []
        for j, sym in enumerate(seq):
            if sym in ['-', '.']:
                emits.append((j, 1))
            else:
                emits.append((j, 0))

        ppid = re.search(ppid_regex, header).group(1)
        fbs = model.forward_backward(emits)
        records.append((ppid, fbs))

    # Write decoded states
    with open(f'out/{OGid}.tsv', 'w') as file:
        file.write('\t'.join(['ppid'] + states) + '\n')
        for ppid, fbs in records:
            for fb in zip(*[fbs[state] for state in states]):
                file.write(ppid + '\t' + '\t'.join([str(v) for v in fb]) + '\n')
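# msaBernoulli is likewise not defined here. A minimal sketch: emissions are
# (column, indicator) pairs, so the pmf can be position-specific (again
# assuming hmm.HMM only requires a pmf method):
class msaBernoulli:
    def __init__(self, ps):
        self.ps = ps  # Per-column gap probabilities

    def pmf(self, emit):
        j, x = emit
        p = self.ps[j]
        return p if x == 1 else 1 - p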
def run_aucpred(OGid):
    msa = read_fasta(f'../insertion_trim/out/{OGid}.mfa')

    prefix = f'out/raw/{OGid}/'
    if not os.path.exists(prefix):
        os.mkdir(prefix)
    for header, seq in msa:
        ppid = re.search(r'ppid=([A-Za-z0-9_]+)', header).group(1)
        seq = seq.translate({ord('-'): None, ord('.'): None})  # Remove gap symbols
        if len(seq) < 10000:  # AUCpreD uses PSIPRED, which has a length limit of 10000
            with open(f'{prefix}{ppid}.fasta', 'w') as file:
                seqstring = '\n'.join([seq[i:i+80] for i in range(0, len(seq), 80)]) + '\n'
                file.write(header + '\n' + seqstring)
            subprocess.run(f'../../../bin/Predict_Property/AUCpreD.sh -i {prefix}{ppid}.fasta -o {prefix}',
                           check=True, shell=True)
            os.remove(f'{prefix}{ppid}.fasta')
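# These per-OGid functions are embarrassingly parallel; a typical driver
# (hypothetical; the repository's own scripts may structure this differently):
if __name__ == '__main__':
    from multiprocessing import Pool

    OGids = [path.split('.mfa')[0] for path in os.listdir('../insertion_trim/out/')
             if path.endswith('.mfa')]
    with Pool(processes=8) as pool:  # Adjust process count to available cores
        pool.map(run_aucpred, OGids)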
        spid, _, source, _, cds_path = line.split()
        genomes.append((spid, source, cds_path))

# Load translation table
ttable = {}
with open('ttable.txt') as file:
    lines = [line.rstrip().split(' = ')[1] for line in file]
    for i in range(len(lines[0])):
        aa = lines[0][i]
        codon = ''.join([lines[j][i] for j in range(2, 5)])
        ttable[codon] = aa

# Load CDSs
ppid2cds = {}
for spid, source, cds_path in genomes:
    fasta = read_fasta(cds_path)
    for header, seq in fasta:
        ppid = re.search(ppid_regex[source], header).group(1)
        ppid2cds[ppid] = seq

if not os.path.exists('out/'):
    os.mkdir('out/')
sys.stdout = open('out/out.txt', 'w')  # Redirect stdout to file

for file_id in filter(lambda x: x.endswith('.mfa'), os.listdir('../align_fastas/out/')):
    # Translate and write CDS
    nt_aligns = []
    for header, aa_align in read_fasta('../align_fastas/out/' + file_id):
        ppid = re.search(r'ppid=([NXYPFBp0-9_.]+)\|', header)[1]
        aa_seq = aa_align.replace('-', '')
        nt_seq = ppid2cds[ppid]
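        # Hypothetical continuation (the loop is cut off above): thread codons
        # from nt_seq onto the aligned protein, gap columns becoming codon
        # gaps; the repository's actual logic may include stricter frame and
        # stop-codon checks
        nt_align, k = [], 0
        for sym in aa_align:
            if sym == '-':
                nt_align.append('---')
            else:
                codon = nt_seq[3*k:3*k+3]
                if ttable.get(codon) != sym:  # Verify translation against the aligned residue
                    print(f'Translation mismatch in {ppid} at residue {k}')
                nt_align.append(codon)
                k += 1
        nt_aligns.append((header, ''.join(nt_align)))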
genes = pd.read_table('genes.tsv')

# Load tree
tree = skbio.read('../../ortho_tree/ctree_WAG/out/100red_ni.txt', 'newick', skbio.TreeNode)
tip_order = {tip.name: i for i, tip in enumerate(tree.tips())}

# Draw alignments
if not os.path.exists('out/'):
    os.mkdir('out/')

df = OGs[['gnid', 'OGid']].drop_duplicates().merge(OG_meta, on='OGid', how='right').merge(genes, on='gnid', how='right')
df.to_csv('out/OGs.tsv', sep='\t', index=False)
for row in df.dropna().itertuples():
    if row.sqidnum == row.gnidnum:
        msa = read_fasta(f'../align_fastas1/out/{row.OGid}.mfa')
    else:
        msa = read_fasta(f'../align_fastas2-2/out/{row.OGid}.mfa')
    msa = [(re.search(r'spid=([a-z]+)', header).group(1), seq) for header, seq in msa]
    msa = [seq for _, seq in sorted(msa, key=lambda x: tip_order[x[0]])]  # Re-order sequences and extract seq only
    im = draw_msa(msa)
    plt.imsave(f'out/{row.OGid}.png', im)

"""
DEPENDENCIES
../../ortho_cluster3/clique4+_pcommunity/clique4+_pcommunity2.py
../../ortho_cluster3/clique4+_pcommunity/out/pgraph2/4clique/pclusters.txt
../../ortho_search/seq_meta/seq_meta.py
../../ortho_search/seq_meta/out/seq_meta.tsv
../../ortho_tree/ctree_WAG/ctree_WAG.py
tree = skbio.read('../../ortho_tree/ctree_WAG/out/100red_ni.txt', 'newick', skbio.TreeNode)
tip_order = {tip.name: i for i, tip in enumerate(tree.tips())}
spids = {tip.name for tip in tree.tips() if tip.name != 'sleb'}

OG_filter = pd.read_table('../OG_filter/out/OG_filter.tsv')
df = pd.read_table('../gap_contrasts/out/total_sums.tsv').merge(OG_filter[['OGid', 'sqidnum']], on='OGid', how='left')  # total_sums.tsv has gnidnum already
df['norm1'] = df['total'] / df['gnidnum']
df['norm2'] = df['total'] / (df['gnidnum'] * df['len2'])

for label in ['norm1', 'norm2']:
    if not os.path.exists(f'out/{label}/'):
        os.makedirs(f'out/{label}/')

    head = df.sort_values(by=label, ascending=False).head(150)
    for i, row in enumerate(head.itertuples()):
        msa = read_fasta(f'../realign_hmmer2/out/{row.OGid}.mfa')
        msa = [(re.search(r'spid=([a-z]+)', header).group(1), seq) for header, seq in msa]
        msa = [seq.upper() for _, seq in sorted(msa, key=lambda x: tip_order[x[0]])]  # Re-order sequences and extract seq only
        im = draw_msa(msa)
        plt.imsave(f'out/{label}/{i}_{row.OGid}.png', im)

"""
DEPENDENCIES
../../ortho_tree/ctree_WAG/ctree_WAG.py
../../ortho_tree/ctree_WAG/out/100red_ni.txt
../gap_contrasts/gap_contrasts_calc.py
../gap_contrasts/out/total_sums.tsv
../OG_filter/OG_filter.py
../OG_filter/out/OG_filter.tsv
../realign_hmmer2/realign_hmmer2.py
seed(930715)  # Set seed to make results consistent

# Extract column pools
colpools = [('100red', lambda col: is_redundant(col, 1), []),
            ('100red_ni', lambda col: is_redundant(col, 1) and not is_invariant(col), []),
            ('50red', lambda col: is_redundant(col, 0.5), []),
            ('50red_ni', lambda col: is_redundant(col, 0.5) and not is_invariant(col), []),
            ('0red', lambda col: is_redundant(col, 0), []),
            ('0red_ni', lambda col: is_redundant(col, 0) and not is_invariant(col), [])]
for file_id in filter(lambda x: x.endswith('.mfa'), os.listdir('../align_fastas/out/')):  # Because inputs are not sorted, results are not guaranteed to be consistent
    msa = read_fasta(f'../align_fastas/out/{file_id}')
    for i in range(len(msa[0][1])):
        col = [Column(header[-4:], seq[i]) for header, seq in msa]
        for _, condition, colpool in colpools:
            if condition(col):
                colpool.append(col)

# Make meta alignments
for label, _, colpool in colpools:
    if not os.path.exists(f'out/{label}/'):
        os.makedirs(f'out/{label}/')
    print(f'{label}:', len(colpool))
    for samplenum in range(100):
        sample = [colpool[randrange(len(colpool))] for _ in range(10000)]
        seqs = {}
    file.readline()  # Skip header
    for line in file:
        fields = line.rstrip('\n').split('\t')
        OGid, start, stop, state = fields
        if OGid in OGid2labels:
            OGid2labels[OGid][state].append((int(start), int(stop)))
        else:
            labels = {'0': [], '1A': [], '1B': [], '2': [], '3': []}
            labels[state].append((int(start), int(stop)))
            OGid2labels[OGid] = labels

if not os.path.exists('out/'):
    os.mkdir('out/')

for OGid, labels in OGid2labels.items():
    msa = trim_terminals(read_fasta(f'../../ortho_MSA/realign_hmmer1/out/{OGid}.mfa'))

    # A terminal insertion at the head shifts all label coordinates
    if labels['0'] and labels['0'][0][0] == 0:
        offset = labels['0'][0][1]
    else:
        offset = 0

    lines = {}
    for state in ['1A', '1B', '2', '3']:
        line = np.zeros(len(msa[0][1]))
        for start, stop in labels[state]:
            line[start-offset:stop-offset] = 1
        lines[state] = line

    plot_msa_lines([seq[1].upper() for seq in msa],
                   [lines['1A'], lines['2'], lines['3'], lines['1B']],
                   figsize=(15, 6))
    plt.savefig(f'out/{OGid}.png', bbox_inches='tight')
        if state != '0':  # Skip terminal insertions as actual state
            states.add(state)
        try:
            OGid2regions[OGid].append((int(start), int(stop), state))
        except KeyError:
            OGid2regions[OGid] = [(int(start), int(stop), state)]

# Initialize counts with pseudocounts
t_counts = {state: {s: 1 for s in states} for state in states}
e_counts = {state: {} for state in states}
start_count = {state: 1 for state in states}

# Get observed counts
for OGid, regions in OGid2regions.items():
    # Load MSA and trim terminal insertions
    msa = read_fasta(f'../../ortho_MSA/realign_hmmer1/out/{OGid}.mfa')
    if regions[-1][2] == '0':  # Trim terminal insertion at tail
        start, _, _ = regions[-1]
        regions = regions[:-1]
        trim = []
        for header, seq in msa:
            trim.append((header, seq[:start]))
        msa = trim
    if regions[0][2] == '0':  # Trim terminal insertion at head
        _, stop, _ = regions[0]
        trim = []
        for header, seq in msa:
            trim.append((header, seq[stop:]))
        msa = trim
    offset = regions[0][1]
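# Hypothetical continuation, belonging after the (truncated) counting loop:
# maximum-likelihood distributions are presumably obtained by normalizing
# each set of counts (names follow the variables above):
t_dists = {s1: {s2: count / sum(counts.values()) for s2, count in counts.items()}
           for s1, counts in t_counts.items()}
start_dist = {state: count / sum(start_count.values()) for state, count in start_count.items()}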
regions = []
with open('../aucpred_filter/out/regions_30.tsv') as file:
    file.readline()  # Skip header
    for line in file:
        OGid, start, stop, disorder, ppids = line.split()
        regions.append((OGid, int(start), int(stop), set(ppids.split(','))))

# Calculate contrasts
if not os.path.exists('out/'):
    os.mkdir('out/')

totals = []
rows = []
for OGid, start, stop, ppids in regions:
    msa = {}
    for header, seq in read_fasta(f'../insertion_trim/out/{OGid}.mfa'):
        ppid = re.search(ppid_regex, header).group(1)
        spid = re.search(spid_regex, header).group(1)
        if ppid in ppids:
            msa[spid] = seq[start:stop]

    tree = tree_template.deepcopy().shear(msa.keys())
    for tip in tree.tips():
        gap_vector = np.asarray([1 if sym == '-' else 0 for sym in msa[tip.name]])
        tip.value = gap_vector
    tree.length = 0  # Set root length to 0 for convenience
    contrasts, _, _ = get_contrasts(tree)

    gap_matrix = np.asarray([[0 if sym == '-' else 1 for sym in seq] for seq in msa.values()])
    len1 = len(msa['dmel'])  # Total length of alignment
    len2 = (gap_matrix / len(msa)).sum()  # Adjusted length of alignment (non-gap symbols averaged over sequences)
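# get_contrasts is not defined in this fragment. A minimal sketch of
# Felsenstein's (1985) independent contrasts, assuming a strictly bifurcating
# skbio TreeNode whose tips carry .value arrays (set above); it returns the
# contrasts plus the inferred root value and corrected root branch length:
def get_contrasts(node):
    if node.is_tip():
        return [], node.value, node.length
    child1, child2 = node.children
    contrasts1, value1, bl1 = get_contrasts(child1)
    contrasts2, value2, bl2 = get_contrasts(child2)
    value = (value1 / bl1 + value2 / bl2) / (1 / bl1 + 1 / bl2)  # Branch-length-weighted average
    bl = node.length + bl1 * bl2 / (bl1 + bl2)  # Corrected branch length
    contrasts = contrasts1 + contrasts2 + [(value1 - value2) / (bl1 + bl2) ** 0.5]
    return contrasts, value, bl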
          'X': 0, '-': 0}
a = 1E-3  # Coefficient of outlier curve
spid_regex = r'spid=([a-z]+)'

tree = skbio.read('../../ortho_tree/ctree_WAG/out/100red_ni.txt', 'newick', skbio.TreeNode)
tip_order = {tip.name: i for i, tip in enumerate(tree.tips())}

records = []
for OGid in [path.split('.mfa')[0] for path in os.listdir('../realign_hmmer1/out/') if path.endswith('.mfa')]:
    msa = [(re.search(spid_regex, header).group(1), seq.upper())
           for header, seq in read_fasta(f'../realign_hmmer1/out/{OGid}.mfa')]

    # Trim terminal insertions
    idx = 0
    for j in range(len(msa[0][1])):
        for i in range(len(msa)):
            sym = msa[i][1][j]
            if sym == '.' or sym.islower():
                break
        else:  # No break: column contains no insertion symbols
            idx = j
            break
    msa = [(header, seq[idx:]) for header, seq in msa]

    idx = len(msa[0][1])
    for j in range(len(msa[0][1]), 0, -1):
        for i in range(len(msa)):
with open('segments.tsv') as file:
    file.readline()  # Skip header
    for line in file:
        OGid, ppid, start, stop, state = line.split()
        state_set.add(state)
        try:
            OGid2regions[(OGid, ppid)].append((int(start), int(stop), state))
        except KeyError:
            OGid2regions[(OGid, ppid)] = [(int(start), int(stop), state)]

# Convert MSAs to records containing state-emission sequences and other data
records = []
for (OGid, ppid), regions in OGid2regions.items():
    # Load MSA and extract seq
    msa = read_fasta(f'../insertion_trim/out/{OGid}.mfa')
    seq = [seq for header, seq in msa if re.search(ppid_regex, header).group(1) == ppid][0]

    # Create Bernoulli sequence (fraction of gaps in each column)
    p_seq = []
    for j in range(len(msa[0][1])):
        col = [1 if msa[i][1][j] in ['-', '.'] else 0 for i in range(len(msa))]
        p = sum(col) / len(col)
        p_seq.append(p)

    # Create emission sequence
        if 'charset' in line:
            groupdict = re.search(r'charset (?P<name>[a-zA-Z0-9]+) = (?P<regions>[0-9 -]+);', line)

            regions = []
            for region in groupdict['regions'].split():
                start, stop = region.split('-')
                regions.append((int(start)-1, int(stop)))  # Convert 1-based inclusive to 0-based half-open
            transform, start0 = {}, 0
            for start, stop in regions:
                transform[(start, stop)] = (start0, stop - start + start0)
                start0 += stop - start

            partition = partitions[partition_id]
            partition.update({'regions': regions, 'transform': transform})
            partition_id += 1

# Calculate likelihoods
msa = read_fasta(f'../asr_aa/out/{OGid}.mfa')
for partition in partitions.values():
    # Unpack partition parameters and partition MSA
    matrix, freqs = models[partition['model']]
    pinv, alpha, num_categories = partition['pinv'], partition['alpha'], partition['num_categories']
    partition_msa = []
    for header, seq in msa:
        partition_seq = ''.join([seq[start:stop] for start, stop in partition['regions']])
        partition_msa.append((header, partition_seq))

    # Convert to vectors at tips of tree
    tips = {tip.name: tip for tip in tree.tips()}
    for header, seq in partition_msa:
        tip = tips[header[1:5]]
        conditional = np.zeros((len(syms), len(seq)))
        for j, sym in enumerate(seq):
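# Worked example of the coordinate transform (hypothetical charset): the
# 1-based, inclusive charset "1-100 201-250" becomes 0-based half-open
# regions, which transform then packs contiguously into the partition MSA:
regions = [(0, 100), (200, 250)]
transform, start0 = {}, 0
for start, stop in regions:
    transform[(start, stop)] = (start0, stop - start + start0)
    start0 += stop - start
print(transform)  # {(0, 100): (0, 100), (200, 250): (100, 150)}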
with open('../../brownian2/aucpred_regions/out/regions.tsv') as file:
    file.readline()  # Skip header
    for line in file:
        OGid, start, stop, disorder = line.split()
        try:
            OGid2regions[OGid].append((int(start), int(stop), disorder == 'True'))
        except KeyError:
            OGid2regions[OGid] = [(int(start), int(stop), disorder == 'True')]

if not os.path.exists('out/'):
    os.mkdir('out/')

for OGid in OGids:
    msa = read_fasta(f'../../brownian2/insertion_trim/out/{OGid}.mfa')
    msa = [(re.search(ppid_regex, header).group(1), re.search(spid_regex, header).group(1), seq)
           for header, seq in msa]

    # Check regions and merge if necessary
    regions = OGid2regions[OGid]
    disorder_length = sum([stop - start for start, stop, disorder in regions if disorder])
    order_length = sum([stop - start for start, stop, disorder in regions if not disorder])
    if disorder_length >= 30 and order_length >= 30:
        disorder_regions = [f'{start+1}-{stop}' for start, stop, disorder in regions if disorder]
tip_order = {tip.name: i for i, tip in enumerate(tree.tips())}
spids = {tip.name for tip in tree.tips() if tip.name != 'sleb'}

OG_filter = pd.read_table('../../ortho_MSA/OG_filter/out/OG_filter.tsv')
df = pd.read_table('../../ortho_MSA/gap_contrasts/out/total_sums.tsv').merge(OG_filter[['OGid', 'sqidnum']], on='OGid', how='left')  # total_sums.tsv has gnidnum already
df['norm1'] = df['total'] / df['gnidnum']
df['norm2'] = df['total'] / (df['gnidnum'] * df['len2'])

for label in ['norm1', 'norm2']:
    if not os.path.exists(f'out/{label}/'):
        os.makedirs(f'out/{label}/')

    head = df.sort_values(by=label, ascending=False).head(150)
    for i, row in enumerate(head.itertuples()):
        # Load MSA and trim terminal insertions
        msa = [(re.search(r'spid=([a-z]+)', header).group(1), seq)
               for header, seq in read_fasta(f'../../ortho_MSA/realign_hmmer2/out/{row.OGid}.mfa')]
        msa = trim_terminals(msa)

        # Load decoded states
        posterior = []
        with open(f'../insertion_trim/out/{row.OGid}.tsv') as file:
            header = file.readline().rstrip('\n').split('\t')
            for line in file:
                fields = {key: float(value) for key, value in zip(header, line.rstrip('\n').split('\t'))}
                posterior.append(fields['2'] + fields['3'])
        posterior = np.array(posterior)
        gradient = np.gradient(posterior)

        # Make trim plot
        slices = get_slices(msa, posterior, gradient)
        trims = np.zeros(len(posterior))
# Parse genomes
genomes = []
with open('../config/genomes.tsv') as file:
    file.readline()  # Skip header
    for line in file:
        spid, _, source, prot_path, tcds_path = line.split()
        genomes.append((spid, source, prot_path, tcds_path))

# Extract and count polypeptide IDs
counts = {}  # Counts for each PPID to find duplicates
ppid2meta = {}  # PPID to gene and species
gnid2seqs = {}  # GNID to PPIDs with unique sequences
for spid, source, prot_path, tcds_path in genomes:
    # Find parent genes in tcds headers
    tcds_fasta = read_fasta(tcds_path)
    for header, _ in tcds_fasta:
        gn_match = re.search(gnid_regex[source], header)
        pp_match = re.search(ppid_regex[source], header)
        try:
            # Group 0 is the entire match; group 1 is the first parenthesized group
            gnid = gn_match.group(1)
            ppid = pp_match.group(1)
            ppid2meta[ppid] = (gnid, spid)
        except AttributeError:  # Raised if either regex did not match (re.search returned None)
            print(header)

    # Find representative sequences in prot files
    prot_fasta = read_fasta(prot_path)
    for header, seq in prot_fasta:
        ppid = re.search(ppid_regex[source], header).group(1)
            return False
    return True


Column = namedtuple('Column', ['spid', 'sym'])

seed(930715)  # Set seed to make results consistent

# Extract column pools
colpools = [('100red', lambda col: is_redundant(col, 1), []),
            ('100red_ni', lambda col: is_redundant(col, 1) and not is_invariant(col), []),
            ('50red', lambda col: is_redundant(col, 0.5), []),
            ('50red_ni', lambda col: is_redundant(col, 0.5) and not is_invariant(col), []),
            ('0red', lambda col: is_redundant(col, 0), []),
            ('0red_ni', lambda col: is_redundant(col, 0) and not is_invariant(col), [])]
for file_id in filter(lambda x: x.endswith('.mfa'), os.listdir('../align_aa2nt/out/')):  # Because inputs are not sorted, results are not guaranteed to be consistent
    msa = read_fasta(f'../align_aa2nt/out/{file_id}')
    for i in range(len(msa[0][1])):
        col = [Column(header[-4:], seq[i]) for header, seq in msa]
        for _, condition, colpool in colpools:
            if condition(col):
                colpool.append(col)

# Make meta alignments
for label, _, colpool in colpools:
    if not os.path.exists(f'out/{label}/'):
        os.makedirs(f'out/{label}/')
    print(f'{label}:', len(colpool))
    for samplenum in range(100):
        sample = [colpool[randrange(len(colpool))] for _ in range(10000)]
        seqs = {}
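        # Hypothetical continuation: stitch the sampled columns into
        # per-species sequences before writing the meta-alignment
        # (names follow the Column namedtuple above)
        for col in sample:
            for spid, sym in col:
                seqs.setdefault(spid, []).append(sym)
        seqs = {spid: ''.join(syms) for spid, syms in seqs.items()}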
    for line in file:
        ppid, gnid, _, sqid = line.split()
        ppid2meta[ppid] = (gnid, sqid)

# Parse genomes
genomes = {}
with open('../config/genomes.tsv') as file:
    file.readline()  # Skip header
    for line in file:
        spid, _, source, prot_path = line.split()
        genomes[spid] = (source, prot_path)

# Parse polypeptides
rows = []
for spid, (source, prot_path) in genomes.items():
    fasta = read_fasta(prot_path)
    for header, seq in fasta:
        ppid = re.search(ppid_regex[source], header).group(1)
        gnid, sqid = ppid2meta[ppid]
        rows.append({'ppid': ppid, 'gnid': gnid, 'spid': spid, 'sqid': sqid,
                     'seqlen': len(seq), 'Xnum': seq.upper().count('X'), 'Xmax': get_Xmax(seq)})

# Make plots output directory
if not os.path.exists('out/'):
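# get_Xmax is referenced above but not defined in this fragment; a minimal
# sketch consistent with the Xnum field (longest run of X in the sequence):
def get_Xmax(seq):
    runs = re.findall(r'X+', seq.upper())
    return max(map(len, runs)) if runs else 0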
    line = file.readline()
    while not line.startswith('Model of rate heterogeneity:'):
        line = file.readline()
    num_categories = int(line.rstrip().split(' Gamma with ')[1][0])
    alpha = float(file.readline().rstrip().split(': ')[1])

    # Discretize gamma distribution into rate categories (category means; Yang 1994)
    igfs = []  # Incomplete gamma function evaluations
    for i in range(num_categories + 1):
        x = gamma.ppf(i / num_categories, a=alpha, scale=1 / alpha)
        igfs.append(gammainc(alpha + 1, alpha * x))
    rates = []  # Normalized rates as (rate, prior) pairs
    for i in range(num_categories):
        rate = num_categories * (igfs[i + 1] - igfs[i])
        rates.append((rate, 1 / num_categories))

# Load sequence and convert to vectors at tips of tree
msa = read_fasta(f'../asr_indel/out/{OGid}.mfa')
tips = {tip.name: tip for tip in tree.tips()}
for header, seq in msa:
    tip = tips[header[1:5]]
    conditional = np.zeros((2, len(seq)))
    for j, sym in enumerate(seq):
        conditional[int(sym), j] = 1
    tip.conditional = conditional

# Get likelihoods for rate categories
likelihoods = []
for rate, prior in rates:
    s, conditional = get_conditional(tree, rate * matrix)
    l = np.expand_dims(freqs, -1) * conditional
    likelihoods.append(np.exp(s) * l * prior)
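# The discretized gamma is mean-one by construction, so the prior-weighted
# category rates should sum to 1; a quick sanity check on the rates above:
assert abs(sum(rate * prior for rate, prior in rates) - 1) < 1e-6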