def get_response_content(fs):
    """
    Analyze each tree given in the form data and report the results.
    @param fs: form data with trees, epsilon, and options attributes
    @return: the text of the response
    """
    # read one newick tree per nonempty line and validate its tips
    trees = []
    for tree_string in iterutils.stripped_lines(StringIO(fs.trees)):
        tree = NewickIO.parse(tree_string, FelTree.NewickTree)
        tip_names = [tip.get_name() for tip in tree.gen_tips()]
        ntips = len(tip_names)
        if ntips < 4:
            raise HandlingError(
                    'expected at least 4 tips but found ' + str(ntips))
        for name in tip_names:
            if name is None:
                raise HandlingError('each terminal node must be labeled')
        if len(set(tip_names)) != ntips:
            raise HandlingError('each terminal node label must be unique')
        trees.append(tree)
    # the negligibility threshold for an eigenvector loading
    # must be in the half-open interval [0, 1)
    epsilon = fs.epsilon
    if not (0 <= epsilon < 1):
        raise HandlingError('invalid threshold for negligibility')
    selected_options = fs.options
    # analyze every tree before writing anything
    results = [AnalysisResult(tree, epsilon) for tree in trees]
    # write one paragraph per result, each followed by a blank line
    out = StringIO()
    for result in results:
        for line in result.get_response_lines(selected_options):
            print >> out, line
        print >> out
    return out.getvalue()
def process(hud_lines, matpheno_lines):
    """
    Combine .hud names with MAT_pheno classifications into .ind rows.
    Each row is tab separated: name, gender 'U', and a status which
    is 'Case', 'Control', or 'Ignore'.
    @param hud_lines: lines of a .hud file
    @param matpheno_lines: lines of a MAT_pheno.txt file
    @return: contents of an .ind file
    """
    # only the ordered names are used from the decoded .hud data
    names, _ = hud.decode(hud_lines)
    # collect the names of the cases and of the controls
    cases = set()
    controls = set()
    code_to_group = {'1': cases, '2': controls}
    for line in iterutils.stripped_lines(matpheno_lines):
        name, classification = line.split(None, 1)
        if classification in ('12', 'null'):
            # individuals classified like this are skipped
            continue
        group = code_to_group.get(classification)
        if group is None:
            raise Exception(
                    'invalid MAT_pheno classification: ' + classification)
        group.add(name)
    # build the .ind rows in .hud order; case status takes precedence
    rows = []
    for name in names:
        if name in cases:
            status = 'Case'
        elif name in controls:
            status = 'Control'
        else:
            status = 'Ignore'
        rows.append('\t'.join([name, 'U', status]))
    return '\n'.join(rows).rstrip()
def get_response_content(fs):
    """
    Check sign splits from the distance matrix and from its square.
    For each tree the tips are split by the sign of the principal
    coordinate of the distance matrix D and of the elementwise
    square D * D, and each split is checked against the partitions
    implied by the tree.
    @param fs: form data whose trees attribute has one tree per line
    @return: the text of the response
    """

    def sign_partition(tip_names, loadings):
        # split the tip names into nonnegative and negative loadings
        nonneg = frozenset(
                name for name, v in zip(tip_names, loadings) if v >= 0)
        neg = frozenset(
                name for name, v in zip(tip_names, loadings) if v < 0)
        return frozenset([nonneg, neg])

    def report(headline, tree, part):
        # write a paragraph describing an invalid partition
        print >> out, headline
        print >> out, 'tree:', NewickIO.get_newick_string(tree)
        print >> out, 'invalid partition:', partition_to_string(part)
        print >> out

    # read and validate the newick trees
    trees = []
    for tree_string in iterutils.stripped_lines(StringIO(fs.trees)):
        tree = NewickIO.parse(tree_string, FelTree.NewickTree)
        tip_names = [tip.get_name() for tip in tree.gen_tips()]
        ntips = len(tip_names)
        if ntips < 4:
            raise HandlingError(
                    'expected at least four tips but found ' + str(ntips))
        if any(name is None for name in tip_names):
            raise HandlingError('each terminal node must be labeled')
        if len(set(tip_names)) != ntips:
            raise HandlingError('each terminal node label must be unique')
        trees.append(tree)
    out = StringIO()
    nerrors = 0
    ncounterexamples = 0
    for tree in trees:
        valid_parts = TreeComparison.get_partitions(tree)
        ordered_tip_names = [tip.get_name() for tip in tree.gen_tips()]
        # the split from the distance matrix itself is expected to be valid
        D = np.array(tree.get_distance_matrix(ordered_tip_names))
        part = sign_partition(
                ordered_tip_names, get_principal_coordinate(D))
        if part not in valid_parts:
            nerrors += 1
            report(
                    'error: a partition that was supposed to be valid '
                    'was found to be invalid', tree, part)
        # the split from the squared distances may or may not be valid
        Q = D * D
        part = sign_partition(
                ordered_tip_names, get_principal_coordinate(Q))
        if part not in valid_parts:
            ncounterexamples += 1
            report('found a counterexample!', tree, part)
    print >> out, 'errors found:', nerrors
    print >> out, 'counterexamples found:', ncounterexamples
    return out.getvalue()
def get_hyphy_namespace(lines):
    """
    Feed every stripped line of HyPhy output to a new namespace.
    @param lines: lines of HyPhy output
    @return: a HyphyNamespace object
    """
    namespace = HyphyNamespace()
    for stripped_line in iterutils.stripped_lines(lines):
        namespace.process_line(stripped_line)
    return namespace
def get_response_content(fs):
    """
    Draw a tree whose branches show a simulated nucleotide history.
    @param fs: form data with tree, column, and imageformat attributes
    @return: the tree image returned by DrawTreeImage.get_tree_image
    """
    # parse the newick tree and make sure it can be drawn
    tree = Newick.parse(fs.tree, SpatialTree.SpatialTree)
    tree.assert_valid()
    if tree.has_negative_branch_lengths():
        raise HandlingError(
                'drawing a tree with negative branch lengths '
                'is not implemented')
    tree.add_branch_lengths()
    # read the nucleotide observed at each named node
    valid_letters = frozenset('acgtACGT')
    name_to_nucleotide = {}
    for line in iterutils.stripped_lines(fs.column.splitlines()):
        name_string, nucleotide_string = SnippetUtil.get_state_value_pair(
                line)
        if nucleotide_string not in valid_letters:
            raise HandlingError(
                    '"%s" is not a valid nucleotide' % nucleotide_string)
        if name_string in name_to_nucleotide:
            raise HandlingError(
                    'the name "%s" was duplicated' % name_string)
        name_to_nucleotide[name_string] = nucleotide_string.upper()
    # attach each nucleotide to its tip; internal nodes are rejected
    for name, nucleotide in name_to_nucleotide.items():
        try:
            node = tree.get_unique_node(name)
        except Newick.NewickSearchError as e:
            raise HandlingError(e)
        if node.children:
            raise HandlingError(
                    'constraints on internal nodes are not implemented')
        node.state = nucleotide
    # build the Jukes-Cantor rate matrix object over the states ACGT
    ordered_states = list('ACGT')
    rate_matrix_object = RateMatrix.RateMatrix(
            MatrixUtil.dict_to_row_major(
                RateMatrix.get_jukes_cantor_rate_matrix(),
                ordered_states, ordered_states),
            ordered_states)
    # simulate the ancestral states and then a path along each branch;
    # the path simulation breaks each branch into a linear sequence
    # of nodes and adds color
    rate_matrix_object.simulate_ancestral_states(tree)
    for node in tree.gen_non_root_nodes():
        simulate_branch_path(tree, node)
    EqualArcLayout.do_layout(tree)
    # draw the image in the requested format
    ext = Form.g_imageformat_to_ext[fs.imageformat]
    try:
        return DrawTreeImage.get_tree_image(tree, (640, 480), ext)
    except CairoUtil.CairoUtilError as e:
        raise HandlingError(e)
def get_response_content(fs):
    """
    Simulate nucleotide paths on a tree and return the drawn image.
    The tips named in the column data are fixed to the given
    nucleotides, ancestral states and branch paths are simulated
    under a Jukes-Cantor rate matrix, and the tree is drawn.
    @param fs: form data with tree, column, and imageformat attributes
    @return: the tree image returned by DrawTreeImage.get_tree_image
    """
    tree = Newick.parse(fs.tree, SpatialTree.SpatialTree)
    tree.assert_valid()
    if tree.has_negative_branch_lengths():
        msg = 'drawing a tree with negative branch lengths is not implemented'
        raise HandlingError(msg)
    tree.add_branch_lengths()
    # map each node name in the column data to an upper case nucleotide
    allowed = tuple('acgtACGT')
    name_to_nucleotide = {}
    column_lines = fs.column.splitlines()
    for line in iterutils.stripped_lines(column_lines):
        pair = SnippetUtil.get_state_value_pair(line)
        name_string, nucleotide_string = pair
        if nucleotide_string not in allowed:
            msg = '"%s" is not a valid nucleotide' % nucleotide_string
            raise HandlingError(msg)
        nucleotide_string = nucleotide_string.upper()
        if name_string in name_to_nucleotide:
            msg = 'the name "%s" was duplicated' % name_string
            raise HandlingError(msg)
        name_to_nucleotide[name_string] = nucleotide_string
    # set the state of each named node, which must be a tip
    for name, nucleotide in name_to_nucleotide.items():
        try:
            node = tree.get_unique_node(name)
        except Newick.NewickSearchError as e:
            raise HandlingError(e)
        if node.children:
            msg = 'constraints on internal nodes are not implemented'
            raise HandlingError(msg)
        node.state = nucleotide
    # create the Jukes-Cantor rate matrix object
    states = list('ACGT')
    jc_dict = RateMatrix.get_jukes_cantor_rate_matrix()
    jc_rows = MatrixUtil.dict_to_row_major(jc_dict, states, states)
    jc_object = RateMatrix.RateMatrix(jc_rows, states)
    # simulate the states at the internal nodes
    jc_object.simulate_ancestral_states(tree)
    # Simulate a path on each branch.
    # This breaks up the branch into a linear sequence of nodes
    # and adds color.
    for node in tree.gen_non_root_nodes():
        simulate_branch_path(tree, node)
    # lay out and draw the tree
    EqualArcLayout.do_layout(tree)
    try:
        ext = Form.g_imageformat_to_ext[fs.imageformat]
        image = DrawTreeImage.get_tree_image(tree, (640, 480), ext)
    except CairoUtil.CairoUtilError as e:
        raise HandlingError(e)
    return image
def get_response_content(fs):
    """
    Count invalid sign splits for D and for its elementwise square.
    @param fs: form data whose trees attribute has one tree per line
    @return: the text of the response
    """

    def split_by_sign(names, values):
        # partition the names by the sign of the paired values
        nonneg_names = []
        neg_names = []
        for name, value in zip(names, values):
            if value >= 0:
                nonneg_names.append(name)
            elif value < 0:
                neg_names.append(name)
        return frozenset([frozenset(nonneg_names), frozenset(neg_names)])

    # get the newick trees, checking the tips of each one
    trees = []
    for tree_string in iterutils.stripped_lines(StringIO(fs.trees)):
        tree = NewickIO.parse(tree_string, FelTree.NewickTree)
        tip_names = [tip.get_name() for tip in tree.gen_tips()]
        if len(tip_names) < 4:
            msg = 'expected at least four tips but found '
            raise HandlingError(msg + str(len(tip_names)))
        if any(name is None for name in tip_names):
            raise HandlingError('each terminal node must be labeled')
        if len(set(tip_names)) != len(tip_names):
            raise HandlingError('each terminal node label must be unique')
        trees.append(tree)
    out = StringIO()
    nerrors = 0
    ncounterexamples = 0
    for tree in trees:
        # these are the partitions implied by the tree
        valid_parts = TreeComparison.get_partitions(tree)
        ordered_tip_names = [tip.get_name() for tip in tree.gen_tips()]
        D = np.array(tree.get_distance_matrix(ordered_tip_names))
        # the partition implied by the correct formula should be valid
        loadings = get_principal_coordinate(D)
        part = split_by_sign(ordered_tip_names, loadings)
        if part not in valid_parts:
            nerrors += 1
            print >> out, (
                    'error: a partition that was supposed to be valid '
                    'was found to be invalid')
            print >> out, 'tree:', NewickIO.get_newick_string(tree)
            print >> out, 'invalid partition:', partition_to_string(part)
            print >> out
        # check the partition implied by the incorrect formula
        loadings = get_principal_coordinate(D * D)
        part = split_by_sign(ordered_tip_names, loadings)
        if part not in valid_parts:
            ncounterexamples += 1
            print >> out, 'found a counterexample!'
            print >> out, 'tree:', NewickIO.get_newick_string(tree)
            print >> out, 'invalid partition:', partition_to_string(part)
            print >> out
    print >> out, 'errors found:', nerrors
    print >> out, 'counterexamples found:', ncounterexamples
    return out.getvalue()
def get_response_content(fs):
    """
    Draw a tree whose named branches are given user specified colors.
    @param fs: form data with tree, coloration, and imageformat attributes
    @return: the tree image returned by DrawTreeImage.get_tree_image
    """
    # parse the newick tree and make sure it can be drawn
    tree = Newick.parse(fs.tree, SpatialTree.SpatialTree)
    tree.assert_valid()
    if tree.has_negative_branch_lengths():
        raise HandlingError(
                'drawing a tree with negative branch lengths '
                'is not implemented')
    tree.add_branch_lengths()
    # read the rgb color of each named branch
    name_to_rgb = {}
    for line in iterutils.stripped_lines(fs.coloration.splitlines()):
        name_string, rgb_string = SnippetUtil.get_state_value_pair(line)
        rgb_string = rgb_string.upper()
        if len(rgb_string) != 6:
            raise HandlingError(
                    'expected each rgb string to be six characters long')
        bad_letters = set(rgb_string) - set('0123456789ABCDEFabcdef')
        if bad_letters:
            raise HandlingError(
                    'found invalid rgb characters: %s' % str(
                        tuple(bad_letters)))
        # a name that is repeated keeps its last color
        name_to_rgb[name_string] = rgb_string
    # color the branches
    for name, rgb in name_to_rgb.items():
        try:
            node = tree.get_unique_node(name)
        except Newick.NewickSearchError as e:
            raise HandlingError(e)
        node.branch_color = rgb
    # Do the layout.
    # A RuntimeError during the layout is ignored
    # and the tree is drawn anyway.
    try:
        FastDaylightLayout.StraightBranchLayout().do_layout(tree)
    except RuntimeError:
        pass
    # draw the image in the requested format
    try:
        ext = Form.g_imageformat_to_ext[fs.imageformat]
        image = DrawTreeImage.get_tree_image(tree, (640, 480), ext)
    except CairoUtil.CairoUtilError as e:
        raise HandlingError(e)
    return image
def process(raw_lines):
    """
    Convert the statuses of an .ind file into a string of digits.
    Control gives '0', Case gives '1', and Ignore gives '9'.
    @param raw_lines: lines of an .ind file
    @return: the single string of a .pheno file
    """
    status_to_digit = {'Control': '0', 'Case': '1', 'Ignore': '9'}
    digits = []
    for line in iterutils.stripped_lines(raw_lines):
        # each row must have exactly three whitespace separated fields
        name, gender, status = line.split()
        if status not in status_to_digit:
            raise Exception('Invalid status: ' + status)
        digits.append(status_to_digit[status])
    return ''.join(digits)
def get_alignment(data_string, tree_string):
    """
    Build an amino acid alignment and check its taxa against a tree.
    @param data_string: comma separated data, one row per line
    @param tree_string: a newick tree string
    @return: the alignment from get_amino_acid_alignment
    @raise HandlingError: if an alignment taxon is not a tip of the tree
    """
    # convert the comma separated data into a table
    table = []
    for line in iterutils.stripped_lines(StringIO(data_string)):
        row = list(csv.reader(StringIO(line), delimiter=',', quotechar='"'))[0]
        table.append(row)
    # create the amino acid fasta alignment
    alignment = get_amino_acid_alignment(table)
    # create the tree
    tree = NewickIO.parse(tree_string, FelTree.NewickTree)
    # Make sure that the newick tree has all of the taxa
    # required by the alignment.
    tree_taxa_set = set(node.get_name() for node in tree.gen_tips())
    alignment_taxa_set = set(alignment.headers)
    weird_alignment_taxa = alignment_taxa_set - tree_taxa_set
    if weird_alignment_taxa:
        # Report the offending taxa.
        # This used to reference an undefined name, so a NameError
        # was raised instead of the intended HandlingError.
        raise HandlingError(
                'the following taxa were not found '
                'in the tree: %s' % str(weird_alignment_taxa))
    # return the alignment
    return alignment
def get_alignment(data_string, tree_string):
    """
    Create an amino acid alignment whose taxa all appear in the tree.
    @param data_string: comma separated data, one row per line
    @param tree_string: a newick tree string
    @return: the alignment from get_amino_acid_alignment
    @raise HandlingError: if an alignment taxon is not a tip of the tree
    """
    # convert the comma separated data into a table,
    # parsing each nonempty line as a single csv row
    table = []
    for line in iterutils.stripped_lines(StringIO(data_string)):
        reader = csv.reader(StringIO(line), delimiter=',', quotechar='"')
        table.append(list(reader)[0])
    # create the amino acid fasta alignment
    alignment = get_amino_acid_alignment(table)
    # create the tree
    tree = NewickIO.parse(tree_string, FelTree.NewickTree)
    # Make sure that the newick tree has all of the taxa
    # required by the alignment.
    tree_taxa_set = set(node.get_name() for node in tree.gen_tips())
    weird_alignment_taxa = set(alignment.headers) - tree_taxa_set
    if weird_alignment_taxa:
        # The message must use weird_alignment_taxa;
        # the name weird_taxa was never defined and raised a NameError.
        raise HandlingError(
                'the following taxa were not found '
                'in the tree: %s' % str(weird_alignment_taxa))
    return alignment
def get_response_content(fs):
    """
    Encode each sequence as a concatenation of per-letter vectors.
    @param fs: form data with sequences and epsilon attributes
    @return: the matrix string with one row per sequence
    """
    # read one sequence per nonempty line
    sequences = [
            raw_string.strip() for raw_string in iterutils.stripped_lines(
                fs.sequences.splitlines())]
    # the alphabet is the sorted list of distinct letters
    alphabet = sorted(set(''.join(sequences)))
    # get one vector per letter and pass each entry through the filter
    # that uses the user-defined epsilon
    vectors = []
    for raw_vector in get_vectors(len(alphabet)):
        vectors.append([eps_filter(x, fs.epsilon) for x in raw_vector])
    letter_to_vector = dict(zip(alphabet, vectors))
    # each row is the vectors of the letters of a sequence laid end to end
    number_lists = [
            [x for letter in sequence for x in letter_to_vector[letter]]
            for sequence in sequences]
    return MatrixUtil.m_to_string(number_lists) + '\n'
def get_response_content(fs):
    """
    Compare splits from augmented and not-augmented distance matrices.
    A paragraph is written for each tree for which either split is
    not implied by the tree or for which the two splits differ.
    @param fs: form data whose trees attribute has one tree per line
    @return: the text of the response
    """

    def name_partition(nodes, values):
        # split the node names by whether the paired value is positive
        positive = frozenset(
                node.get_name()
                for node, elem in zip(nodes, values) if elem > 0)
        nonpositive = frozenset(
                node.get_name()
                for node, elem in zip(nodes, values) if elem <= 0)
        return frozenset((positive, nonpositive))

    # read and validate the newick trees
    trees = []
    for tree_string in iterutils.stripped_lines(fs.trees.splitlines()):
        tree = NewickIO.parse(tree_string, FelTree.NewickTree)
        tip_names = [tip.get_name() for tip in tree.gen_tips()]
        ntips = len(tip_names)
        if ntips < 4:
            raise HandlingError(
                    'expected at least four tips '
                    'but found ' + str(ntips))
        if any(name is None for name in tip_names):
            raise HandlingError('each terminal node must be labeled')
        if len(set(tip_names)) != ntips:
            raise HandlingError('each terminal node label must be unique')
        trees.append(tree)
    out = StringIO()
    same_count = 0
    diff_count = 0
    for tree in trees:
        # this paragraph is shown only if there is an event
        local_out = StringIO()
        has_event = False
        print >> local_out, NewickIO.get_newick_string(tree)
        # order the nodes so that the tips come first
        nodes = list(tree.preorder())
        tip_nodes = [node for node in nodes if node.is_tip()]
        internal_nodes = [node for node in nodes if not node.is_tip()]
        all_nodes = tip_nodes + internal_nodes
        valid_partitions = TreeComparison.get_partitions(tree)
        # split using the distance matrix that includes internal nodes,
        # keeping only the entries that correspond to tips
        D_full = tree.get_partial_distance_matrix(
                [id(node) for node in all_nodes])
        y = get_vector(D_full).tolist()[:len(tip_nodes)]
        name_partition_a = name_partition(tip_nodes, y)
        if name_partition_a not in valid_partitions:
            print >> local_out, (
                    'augmented distance matrix split fail:'), name_partition_a
            has_event = True
        # split using the distance matrix among the tips only
        D = tree.get_partial_distance_matrix(
                [id(node) for node in tip_nodes])
        y = get_vector(D).tolist()
        name_partition_b = name_partition(tip_nodes, y)
        if name_partition_b not in valid_partitions:
            print >> local_out, (
                    'not-augmented distance matrix split fail:'
                    ), name_partition_b
            has_event = True
        # compare the two splits
        if name_partition_a == name_partition_b:
            same_count += 1
        else:
            diff_count += 1
            print >> local_out, 'this tree was split differently '
            print >> local_out, 'by the different methods:'
            print >> local_out, (
                    'augmented distance matrix split:'), name_partition_a
            print >> local_out, (
                    'not-augmented distance matrix split:'), name_partition_b
            has_event = True
        # the print adds a newline that separates the paragraphs
        if has_event:
            print >> out, local_out.getvalue()
    # write the summary
    print >> out, (
            'for this many trees the same split was found:'), same_count
    print >> out, (
            'for this many trees different splits were found:'), diff_count
    return out.getvalue()
def get_response_content(fs):
    """
    Report trees that are split badly or inconsistently.
    Each tree is split once using the distance matrix over all nodes
    and once using the distance matrix over the tips only.
    @param fs: form data whose trees attribute has one tree per line
    @return: the text of the response
    """

    def get_split(nodes, values):
        # partition the node names by the sign of the paired values
        selection = []
        complement = []
        for node, elem in zip(nodes, values):
            if elem > 0:
                selection.append(node.get_name())
            elif elem <= 0:
                complement.append(node.get_name())
        return frozenset((frozenset(selection), frozenset(complement)))

    # get the newick trees, checking the tips of each one
    trees = []
    for tree_string in iterutils.stripped_lines(fs.trees.splitlines()):
        tree = NewickIO.parse(tree_string, FelTree.NewickTree)
        tip_names = [tip.get_name() for tip in tree.gen_tips()]
        if len(tip_names) < 4:
            msg = 'expected at least four tips but found '
            raise HandlingError(msg + str(len(tip_names)))
        if any(name is None for name in tip_names):
            raise HandlingError('each terminal node must be labeled')
        if len(set(tip_names)) != len(tip_names):
            raise HandlingError('each terminal node label must be unique')
        trees.append(tree)
    out = StringIO()
    same_count = 0
    diff_count = 0
    for tree in trees:
        # collect the lines of the paragraph for this tree
        local_out = StringIO()
        print >> local_out, NewickIO.get_newick_string(tree)
        # separate the tips from the internal nodes
        tip_nodes = []
        internal_nodes = []
        for node in tree.preorder():
            target = tip_nodes if node.is_tip() else internal_nodes
            target.append(node)
        ntips = len(tip_nodes)
        valid_partitions = TreeComparison.get_partitions(tree)
        # the augmented matrix has the tips first, then the internal nodes
        augmented_ids = [id(node) for node in tip_nodes + internal_nodes]
        D_full = tree.get_partial_distance_matrix(augmented_ids)
        y_full = get_vector(D_full).tolist()
        name_partition_a = get_split(tip_nodes, y_full[:ntips])
        a_is_valid = name_partition_a in valid_partitions
        if not a_is_valid:
            print >> local_out, 'augmented distance matrix split fail:',
            print >> local_out, name_partition_a
        # the not-augmented matrix has the tips only
        tip_ids = [id(node) for node in tip_nodes]
        D = tree.get_partial_distance_matrix(tip_ids)
        name_partition_b = get_split(tip_nodes, get_vector(D).tolist())
        b_is_valid = name_partition_b in valid_partitions
        if not b_is_valid:
            print >> local_out, 'not-augmented distance matrix split fail:',
            print >> local_out, name_partition_b
        # compare the name partitions
        splits_agree = (name_partition_a == name_partition_b)
        if splits_agree:
            same_count += 1
        else:
            diff_count += 1
            print >> local_out, 'this tree was split differently '
            print >> local_out, 'by the different methods:'
            print >> local_out, 'augmented distance matrix split:',
            print >> local_out, name_partition_a
            print >> local_out, 'not-augmented distance matrix split:',
            print >> local_out, name_partition_b
        # show the paragraph, followed by a newline, if anything happened
        if not (a_is_valid and b_is_valid and splits_agree):
            print >> out, local_out.getvalue()
    # write the summary
    print >> out, 'for this many trees the same split was found:',
    print >> out, same_count
    print >> out, 'for this many trees different splits were found:',
    print >> out, diff_count
    return out.getvalue()
def get_response_content(fs):
    """
    Count how each tree is split by the selected splitting criterion.
    The distance matrix of each tree, optionally perturbed, is given
    to the splitter, and the resulting split is counted as
    informative, degenerate, or invalid.
    @param fs: form data with the trees, the criterion flags
    (exact, sign, threshold, nj, random), and the strength
    @return: the text of the response
    @raise HandlingError: if a tree is invalid, no criterion is
    selected, or the computation would take too long
    """
    # get the newick trees.
    trees = []
    for tree_string in iterutils.stripped_lines(StringIO(fs.trees)):
        # Parse each tree and make sure
        # that it conforms to various requirements.
        tree = NewickIO.parse(tree_string, FelTree.NewickTree)
        tip_names = [tip.get_name() for tip in tree.gen_tips()]
        if len(tip_names) < 4:
            msg_a = 'expected at least four tips but found '
            msg_b = str(len(tip_names))
            raise HandlingError(msg_a + msg_b)
        if any(name is None for name in tip_names):
            raise HandlingError('each terminal node must be labeled')
        if len(set(tip_names)) != len(tip_names):
            raise HandlingError('each terminal node label must be unique')
        trees.append(tree)
    # read the criterion flags, creating the splitter object
    if fs.exact:
        splitter = Clustering.StoneExactDMS()
    elif fs.sign:
        splitter = Clustering.StoneSpectralSignDMS()
    elif fs.threshold:
        splitter = Clustering.StoneSpectralThresholdDMS()
    elif fs.nj:
        splitter = Clustering.NeighborJoiningDMS()
    elif fs.random:
        splitter = Clustering.RandomDMS()
    else:
        # Without this branch the splitter would be unbound
        # and its first use would raise an UnboundLocalError.
        raise HandlingError('no split criterion was selected')
    # assert that the computation is fast
    complexity = 0
    for tree in trees:
        n = len(list(tree.gen_tips()))
        complexity += n * splitter.get_complexity(n)
    if complexity > 1000000:
        raise HandlingError('this computation would take too long')
    # evaluate the bipartition of each tree based on its distance matrix
    informative_split_count = 0
    degenerate_split_count = 0
    invalid_split_count = 0
    for tree in trees:
        tips = list(tree.gen_tips())
        n = len(tips)
        D = tree.get_distance_matrix()
        if fs.strength:
            # Perturb each distance by a log-normal factor,
            # keeping the perturbed matrix symmetric.
            P = [row[:] for row in D]
            for i in range(n):
                for j in range(i):
                    x = random.normalvariate(0, fs.strength)
                    new_distance = D[i][j] * math.exp(x)
                    P[i][j] = new_distance
                    P[j][i] = new_distance
        else:
            P = D
        index_selection = splitter.get_selection(P)
        tip_selection = [tips[i] for i in index_selection]
        n_selection = len(tip_selection)
        n_complement = n - n_selection
        # a split with fewer than two tips on a side is degenerate
        if min(n_selection, n_complement) < 2:
            degenerate_split_count += 1
        elif tree.get_split_branch(tip_selection):
            informative_split_count += 1
        else:
            invalid_split_count += 1
    # define the response
    out = StringIO()
    print >> out, informative_split_count, 'informative splits'
    print >> out, degenerate_split_count, 'degenerate splits'
    print >> out, invalid_split_count, 'invalid splits'
    # return the response
    return out.getvalue()
def gen_typed_rows(fin):
    """
    Yield the result of line_to_row for each stripped line.
    @param fin: the lines to read, passed to iterutils.stripped_lines
    """
    for stripped_line in iterutils.stripped_lines(fin):
        row = line_to_row(stripped_line)
        yield row
def load(self, lines):
    """
    Read the comments, the tree, and the alignment from nexus data.
    This sets self.tree and self.alignment, and each single-line
    comment is passed to self.add_comment.
    @param lines: lines of nexus data
    """
    # sort the lines into the blocks in which they appear
    taxa_lines = []
    tree_lines = []
    character_lines = []
    marker_to_array = {
            ('BEGIN', 'TAXA;'): taxa_lines,
            ('BEGIN', 'TREES;'): tree_lines,
            ('BEGIN', 'CHARACTERS;'): character_lines,
            ('END;',): None,
            }
    current_array = None
    for line in iterutils.stripped_lines(lines):
        # Ignore an entire line that is a comment.
        # Nested comments and multi-line comments
        # are not correctly processed here.
        if line.startswith('[') and line.endswith(']'):
            self.add_comment(line[1:-1])
            continue
        marker = tuple(line.upper().split())
        if marker in marker_to_array:
            current_array = marker_to_array[marker]
        elif current_array is not None:
            current_array.append(line)
    # the tree lines and the character lines are both required
    if not tree_lines:
        raise NexusError('TREES was not found')
    if not character_lines:
        raise NexusError('CHARACTERS was not found')
    # the newick string is whatever follows the single equals sign
    nexus_tree_string = ''.join(tree_lines)
    if nexus_tree_string.count(';') != 1:
        raise NexusError(
                'expected exactly one semicolon in the nexus TREES block')
    if nexus_tree_string.count('=') != 1:
        raise NexusError(
                'expected exactly one equals sign in the nexus TREES block')
    newick_string = nexus_tree_string.partition('=')[2]
    self.tree = Newick.parse(newick_string, Newick.NewickTree)
    # collect the alignment lines that follow the MATRIX line
    arr = []
    found_matrix = False
    for line in character_lines:
        upper_line = line.upper()
        if upper_line.startswith(('DIMENSIONS', 'FORMAT')):
            continue
        if upper_line.startswith('MATRIX'):
            found_matrix = True
            continue
        if found_matrix:
            arr.append(line.replace(';', ' '))
    if not arr:
        raise NexusError('no alignment was found')
    tokens = ' '.join(arr).split()
    if len(tokens) % 2:
        raise NexusError(
                'expected the alignment to be '
                'a list of (taxon, sequence) pairs')
    # build the fasta text of the alignment
    fasta_lines = []
    for header, sequence in iterutils.chopped(tokens, 2):
        sequence = sequence.upper()
        unexpected_letters = set(sequence) - set('ACGT')
        if unexpected_letters:
            raise NexusError(
                    'unexpected sequence character(s): %s' % list(
                        unexpected_letters))
        fasta_lines.append('>%s' % header)
        fasta_lines.append(sequence)
    alignment_string = ''.join(
            fasta_line + '\n' for fasta_line in fasta_lines)
    self.alignment = Fasta.Alignment(StringIO(alignment_string))
def load(self, lines):
    """
    Parse nexus data into a tree and a nucleotide alignment.
    The parsed tree is stored in self.tree, the alignment is stored
    in self.alignment, and each single-line comment is passed
    to self.add_comment.
    @param lines: lines of nexus data
    """
    # get the taxa, tree, and character lines
    taxa_lines = []
    tree_lines = []
    character_lines = []
    active = None
    for line in iterutils.stripped_lines(lines):
        # Ignore an entire line that is a comment.
        # Nested comments and multi-line comments
        # are not correctly processed here.
        if line.startswith('[') and line.endswith(']'):
            self.add_comment(line[1:-1])
            continue
        # normalize the case and the whitespace of the line
        normalized = ' '.join(line.upper().split())
        if normalized == 'BEGIN TAXA;':
            active = taxa_lines
        elif normalized == 'BEGIN TREES;':
            active = tree_lines
        elif normalized == 'BEGIN CHARACTERS;':
            active = character_lines
        elif normalized == 'END;':
            active = None
        elif active is not None:
            active.append(line)
    # assert that tree lines and character lines are present
    if not tree_lines:
        raise NexusError('TREES was not found')
    if not character_lines:
        raise NexusError('CHARACTERS was not found')
    # read the newick tree string
    tree_text = ''.join(tree_lines)
    if tree_text.count(';') != 1:
        msg = 'expected exactly one semicolon in the nexus TREES block'
        raise NexusError(msg)
    if tree_text.count('=') != 1:
        msg = 'expected exactly one equals sign in the nexus TREES block'
        raise NexusError(msg)
    newick_string = tree_text[tree_text.index('=') + 1:]
    self.tree = Newick.parse(newick_string, Newick.NewickTree)
    # read the alignment matrix
    matrix_lines = []
    in_matrix = False
    for line in character_lines:
        shouted = line.upper()
        if shouted.startswith('DIMENSIONS') or shouted.startswith('FORMAT'):
            continue
        if shouted.startswith('MATRIX'):
            in_matrix = True
        elif in_matrix:
            matrix_lines.append(line.replace(';', ' '))
    if not matrix_lines:
        raise NexusError('no alignment was found')
    tokens = ' '.join(matrix_lines).split()
    if len(tokens) % 2 != 0:
        msg = 'expected the alignment to be a list of (taxon, sequence) pairs'
        raise NexusError(msg)
    # write the alignment in fasta format and then parse it
    alignment_out = StringIO()
    for header, sequence in iterutils.chopped(tokens, 2):
        sequence = sequence.upper()
        unexpected_letters = set(sequence) - set('ACGT')
        if unexpected_letters:
            msg = 'unexpected sequence character(s): %s' % list(
                    unexpected_letters)
            raise NexusError(msg)
        print >> alignment_out, '>%s' % header
        print >> alignment_out, sequence
    self.alignment = Fasta.Alignment(StringIO(alignment_out.getvalue()))
def gen_untyped_rows(fin):
    """
    Yield the whitespace separated fields of each stripped line.
    @param fin: the lines to read, passed to iterutils.stripped_lines
    """
    for stripped_line in iterutils.stripped_lines(fin):
        fields = stripped_line.split()
        yield fields
def get_response_content(fs):
    """
    Report on a rate matrix built from user supplied energies.
    The report shows the rate matrix, a transition matrix at a large
    finite time, their eigendecompositions, the stationary
    distribution, and identicality computations for this rate matrix
    and for a few hard-coded test rate matrices.
    @param fs: form data whose energies attribute has one energy per line
    @return: the text of the report without trailing whitespace
    @raise ValueError: if an energy is not a float
    or if more than 100 energies are given
    """
    # read the energies from the form data
    energies = []
    for line in iterutils.stripped_lines(fs.energies.splitlines()):
        try:
            energy = float(line)
        except ValueError:
            raise ValueError('invalid energy: %s' % line)
        energies.append(energy)
    n = len(energies)
    if n > 100:
        raise ValueError('too many energies')
    # compute the rate matrix
    R = np.zeros((n, n))
    for row in range(n):
        for col in range(n):
            rate = math.exp(-(energies[col] - energies[row]))
            R[row, col] = rate
    # Each diagonal entry is exp(0) = 1 at this point,
    # so subtracting the row sum and adding 1 sets the diagonal
    # to the negative sum of the off-diagonal entries of the row.
    for i, r in enumerate(R):
        R[i, i] = -np.sum(r) + 1
    # get the transition matrix at large finite time
    large_t = 1000.0
    T = scipy.linalg.expm(R * large_t)
    # eigendecompose
    Wr, Vr = scipy.linalg.eig(R, left=False, right=True)
    Wl, Vl = scipy.linalg.eig(R, left=True, right=False)
    # Get the left eigenvector associated with the stationary
    # distribution, which has the eigenvalue of smallest magnitude.
    # Using argmin avoids comparing the eigenvectors themselves,
    # which min() over (magnitude, vector) pairs does on a tie.
    pi_eigenvector = Vl[:, int(np.argmin(np.abs(Wl)))]
    # get the stationary distribution itself
    total = np.sum(pi_eigenvector)
    pi_arr = np.array([v / total for v in pi_eigenvector])
    # print things
    np.set_printoptions(linewidth=300)
    out = StringIO()

    def section(title, value):
        # write a title, a value, and a blank line
        print >> out, title
        print >> out, value
        print >> out

    section('rate matrix:', R)
    section('rate matrix row sums:', np.sum(R, axis=1))
    section('eigenvalues:', Wr)
    section('corresponding orthonormal right eigenvectors (columns):', Vr)
    section('eigenvalues:', Wl)
    section('corresponding orthonormal left eigenvectors (columns):', Vl)
    section(
            'L2 normalized eigenvector associated with stationary distn:',
            pi_eigenvector)
    section('L1 renormalized vector (the stationary distribution):', pi_arr)
    print >> out
    # eigendecompose the transition matrix
    Wr, Vr = scipy.linalg.eig(T, left=False, right=True)
    Wl, Vl = scipy.linalg.eig(T, left=True, right=False)
    section('transition matrix for t=%f:' % large_t, T)
    section('transition matrix row sums:', np.sum(T, axis=1))
    section('eigenvalues:', Wr)
    section('corresponding orthonormal right eigenvectors (columns):', Vr)
    section('eigenvalues:', Wl)
    section('corresponding orthonormal left eigenvectors (columns):', Vl)
    section(
            'incorrect reconstitution of the transition matrix:',
            ndot(Vr, np.diag(Wr), Vl.T))
    print >> out
    # Use the known properties of reversibility to symmetrize the matrix.
    t = 3
    coeffs, rates, c = get_identicality_params(R)
    section(
            'brute identicality computation for t=%f:' % t,
            get_numerical_identicality(R, t))
    section(
            'sophisticated identicality computation for t=%f:' % t,
            get_symbolic_identicality(coeffs, rates, c, t))
    print >> out
    # Try another couple rate matrices.
    e2 = math.exp(2)
    en2 = math.exp(-2)
    rate_matrices = [
            np.array([[-2.0, 2.0], [2.0, -2.0]]),
            np.array([[-1.0, 1.0], [3.0, -3.0]]),
            np.array([[-1, 1, 0], [1, -2, 1], [0, 1, -1]]),
            np.array([[-en2, en2, 0], [e2, -2 * e2, e2], [0, en2, -en2]]),
            ]
    t = 3.0
    for R in rate_matrices:
        coeffs, rates, c = get_identicality_params(R)
        section('test rate matrix:', R)
        section('eigenvalues:', scipy.linalg.eigvals(R))
        section('stationary distribution:', R_to_distn(R))
        section(
                'brute identicality computation for t=%f:' % t,
                get_numerical_identicality(R, t))
        section(
                'sophisticated identicality computation for t=%f:' % t,
                get_symbolic_identicality(coeffs, rates, c, t))
        section(
                'identicality derivative for t=%f:' % t,
                get_identicality_derivative(coeffs, rates, t))
        print >> out
    # return the message
    return out.getvalue().rstrip()
def get_response_content(fs):
    """
    Remove the requested vertices from a weighted graph and report the result.
    @param fs: form data; fs.graph is text with one
    'vertex vertex weight' row per line, fs.vertices is text with one
    vertex name per line, and the flags fs.funky, fs.funky_corrected,
    fs.ohm and fs.conductance select the transformation to apply
    @return: text with one 'name name weight' row per reduced edge
    @raise HandlingError: if a row is malformed, a weight is not a positive
    number, an edge is repeated or is a self loop, a vertex is named for
    removal twice or is not in the graph, or no transformation is selected
    """
    # read the edge triples (vertex name, vertex name, edge weight)
    edge_triples = []
    for line in iterutils.stripped_lines(fs.graph.splitlines()):
        fields = line.split()
        if len(fields) != 3:
            raise HandlingError(
                    'each graph row should have three elements '
                    'but found this line: ' + line)
        try:
            weight = float(fields[2])
        except ValueError:
            raise HandlingError(
                    'edge weights should be floating point numbers')
        # The negated comparison also rejects NaN,
        # which would slip past a plain 'weight <= 0' test.
        if not weight > 0:
            raise HandlingError('edge weights should be positive')
        # each triple stays a list, as in the rows handed to the helpers
        edge_triples.append([fields[0], fields[1], weight])
    # get the set of directed edges to check for redundant or invalid input
    directed_edges = set()
    for a, b, weight in edge_triples:
        if a == b:
            raise HandlingError(
                    'vertices should not have edges connecting to themselves')
        if (a, b) in directed_edges:
            raise HandlingError('each edge should be given only once')
        if (b, a) in directed_edges:
            raise HandlingError(
                    'each edge should be given in only one direction')
        directed_edges.add((a, b))
    # get the lexicographically ordered list of vertex names
    vertex_names = set()
    for edge in directed_edges:
        vertex_names.update(edge)
    name_to_index = dict(
            (name, i) for i, name in enumerate(sorted(vertex_names)))
    # read the set of vertices that the user wants to remove
    names_to_remove = set()
    for name in iterutils.stripped_lines(fs.vertices.splitlines()):
        if name in names_to_remove:
            raise HandlingError(
                    'vertices should be named for removal at most once')
        names_to_remove.add(name)
    # The vertex names for removal must be a subset
    # of the vertex names in the graph.
    weird_names = names_to_remove - vertex_names
    if weird_names:
        raise HandlingError(
                'some vertices named for removal '
                'were not found in the graph: ' + str(weird_names))
    # get the ordered list of vertex names that will remain
    remaining_names = sorted(vertex_names - names_to_remove)
    # Pick the transformation requested on the form.
    # Without the final branch an unselected method would surface
    # as an UnboundLocalError further down.
    if fs.funky:
        transform = get_funky_transformation
    elif fs.funky_corrected:
        transform = get_corrected_funky_transformation
    elif fs.ohm:
        transform = get_ohm_transformation
    elif fs.conductance:
        transform = get_conductance_transformation
    else:
        raise HandlingError('no transformation method was selected')
    reduced_edge_triples = transform(
            edge_triples, name_to_index, remaining_names)
    # write the reduced edge triples
    out = StringIO()
    for name_a, name_b, weight in reduced_edge_triples:
        print >> out, name_a, name_b, weight
    # write the response
    return out.getvalue()
def get_response_content(fs):
    """
    Build a plain-text report about the rate matrix defined by some energies.
    The report shows the rate matrix, its left and right eigendecompositions,
    the stationary distribution, the transition matrix at a large time,
    identicality computations for the rate matrix,
    and the same kind of summary for a few hard-coded test rate matrices.
    @param fs: form data; fs.energies is text with one energy per line
    @return: the report text with trailing whitespace stripped
    @raise ValueError: if an energy is not a number, if no energies are
    given, or if more than 100 energies are given
    """
    # read the energies from the form data
    energies = []
    for line in iterutils.stripped_lines(fs.energies.splitlines()):
        try:
            energies.append(float(line))
        except ValueError:
            raise ValueError('invalid energy: %s' % line)
    n = len(energies)
    if not n:
        # fail early instead of handing an empty matrix to the solvers
        raise ValueError('no energies were provided')
    if n > 100:
        raise ValueError('too many energies')
    # compute the rate matrix
    R = np.zeros((n, n))
    for row, row_energy in enumerate(energies):
        for col, col_energy in enumerate(energies):
            R[row, col] = math.exp(-(col_energy - row_energy))
    # Each diagonal entry currently holds exp(0) == 1, so adding 1 to the
    # negated row sum leaves minus the sum of the off-diagonal rates.
    for i, r in enumerate(R):
        R[i, i] = -np.sum(r) + 1
    # get the transition matrix at large finite time
    large_t = 1000.0
    T = scipy.linalg.expm(R*large_t)
    # eigendecompose
    Wr, Vr = scipy.linalg.eig(R, left=False, right=True)
    Wl, Vl = scipy.linalg.eig(R, left=True, right=False)
    # Get the left eigenvector whose eigenvalue is smallest in magnitude.
    # Using argmin avoids comparing (magnitude, array) tuples, which raises
    # when two magnitudes tie and the arrays themselves get compared.
    pi_eigenvector = Vl[:, np.argmin(np.abs(Wl))]
    # get the stationary distribution itself
    pi_arr = pi_eigenvector / np.sum(pi_eigenvector)
    # print things
    np.set_printoptions(linewidth=300)
    out = StringIO()

    def emit(label, value):
        # write a label line, the value, and a blank separator line
        print >> out, label
        print >> out, value
        print >> out

    emit('rate matrix:', R)
    emit('rate matrix row sums:', np.sum(R, axis=1))
    emit('eigenvalues:', Wr)
    emit('corresponding orthonormal right eigenvectors (columns):', Vr)
    emit('eigenvalues:', Wl)
    emit('corresponding orthonormal left eigenvectors (columns):', Vl)
    emit(
            'L2 normalized eigenvector associated with stationary distn:',
            pi_eigenvector)
    emit('L1 renormalized vector (the stationary distribution):', pi_arr)
    print >> out
    # eigendecompose the transition matrix
    Wr, Vr = scipy.linalg.eig(T, left=False, right=True)
    Wl, Vl = scipy.linalg.eig(T, left=True, right=False)
    emit('transition matrix for t=%f:' % large_t, T)
    emit('transition matrix row sums:', np.sum(T, axis=1))
    emit('eigenvalues:', Wr)
    emit('corresponding orthonormal right eigenvectors (columns):', Vr)
    emit('eigenvalues:', Wl)
    emit('corresponding orthonormal left eigenvectors (columns):', Vl)
    # ndot is presumably a chained matrix product -- TODO confirm
    emit(
            'incorrect reconstitution of the transition matrix:',
            ndot(Vr, np.diag(Wr), Vl.T))
    print >> out
    # Use the known properties of reversibility to symmetrize the matrix.
    # Note that t is an integer here and a float for the test matrices below.
    t = 3
    coeffs, rates, c = get_identicality_params(R)
    emit(
            'brute identicality computation for t=%f:' % t,
            get_numerical_identicality(R, t))
    emit(
            'sophisticated identicality computation for t=%f:' % t,
            get_symbolic_identicality(coeffs, rates, c, t))
    print >> out
    # Try another couple rate matrices.
    e2 = math.exp(2)
    en2 = math.exp(-2)
    rate_matrices = [
            np.array([[-2.0, 2.0], [2.0, -2.0]]),
            np.array([[-1.0, 1.0], [3.0, -3.0]]),
            np.array([[-1, 1, 0], [1, -2, 1], [0, 1, -1]]),
            # disabled alternatives:
            #np.array([[-4.0, 4.0, 0], [1, -2, 1], [0, 4, -4]])
            #np.array([[-1, 1, 0], [7, -14, 7], [0, 1, -1]])
            np.array([[-en2, en2, 0], [e2, -2*e2, e2], [0, en2, -en2]])]
    t = 3.0
    for test_R in rate_matrices:
        coeffs, rates, c = get_identicality_params(test_R)
        emit('test rate matrix:', test_R)
        emit('eigenvalues:', scipy.linalg.eigvals(test_R))
        emit('stationary distribution:', R_to_distn(test_R))
        emit(
                'brute identicality computation for t=%f:' % t,
                get_numerical_identicality(test_R, t))
        emit(
                'sophisticated identicality computation for t=%f:' % t,
                get_symbolic_identicality(coeffs, rates, c, t))
        emit(
                'identicality derivative for t=%f:' % t,
                get_identicality_derivative(coeffs, rates, t))
        print >> out
    # return the message
    return out.getvalue().rstrip()