def merge_fabricated_subsets(subset_list):
    """Merge fabricated subsets, preserving their centroids and lnls.

    Returns a new Subset whose column set is the union of all the input
    subsets' columns, whose lnl is the sum of their best_lnl values, and
    whose centroid is the element-wise mean of their centroids.
    """
    columns = set()
    lnl = 0
    # All centroids are assumed to share the dimensionality of the first
    centroid_dim = len(subset_list[0].centroid)
    centroid = [0] * centroid_dim
    for sub in subset_list:
        columns |= sub.column_set
        lnl += sub.best_lnl
        # BUG FIX: the original wrote "for observation in centroid:
        # observation += sub.centroid[number]", which only rebinds the
        # loop variable and never writes back into the list -- the
        # accumulated centroid stayed all zeros. Accumulate by index.
        for i in range(centroid_dim):
            centroid[i] += sub.centroid[i]
    # Now just take the average of each centroid to be the centroid of the
    # new subset
    centroid = [x / len(subset_list) for x in centroid]
    new_sub = subset.Subset(sub.cfg, columns)
    # Add the centroid and sum of the lnls to the subset. TODO: create
    # functions to update these variables in the subset rather than messing
    # with them directly
    new_sub.centroid = centroid
    new_sub.lnl = lnl
    return new_sub
def create_scheme(cfg, scheme_name, scheme_description):
    """Build one scheme from a grouping pattern over the partitions.

    scheme_description is a list like [0, 1, 2, 3, 4, 5, 6, 7] giving, for
    each partition index, the group it belongs to. Raises SchemeError if
    the pattern length does not match the number of defined partitions.
    """
    partition_count = len(cfg.partitions)  # partitions defined by the user
    # The pattern must assign a group to every partition
    if len(scheme_description) != partition_count:
        log.error("There's a problem with the description of scheme %s"
                  % scheme_name)
        raise SchemeError
    # Collect partition indexes by their group number
    grouped = {}
    for part_index, group_id in enumerate(scheme_description):
        grouped.setdefault(group_id, []).append(part_index)
    # One subset per group, each built from that group's partitions
    created_subsets = [
        subset.Subset(*[cfg.partitions[i] for i in indexes])
        for indexes in grouped.values()
    ]
    return Scheme(cfg, str(scheme_name), created_subsets,
                  description=scheme_description)
def define_user_subset(self, text, loc, part_def):
    """Create a user-defined subset from (start, stop, step) ranges.

    Config definitions are 1-based and inclusive, whereas range() is
    0-based and excludes the stop value; subtracting 1 from start deals
    with both conversions at once. Raises ParserError when the ranges
    overlap each other.
    """
    columns = []
    description = []
    for begin, end, stride in part_def.parts:
        columns.extend(range(begin - 1, end, stride))
        # Keep the original 1-based definition around for reporting
        description.append((begin, end, stride))
    column_set = set(columns)
    # A size mismatch means some column was produced more than once
    if len(column_set) != len(columns):
        raise ParserError(
            text, loc, "Block '%s' has internal overlap" % part_def.name)
    user_subset = subset.Subset(self.cfg, column_set)
    user_subset.add_description([part_def.name], description)
    self.cfg.user_subsets.append(user_subset)
    self.cfg.user_subsets_by_name[part_def.name] = user_subset
def make_clustered_scheme(start_scheme, scheme_name, subsets_to_cluster, cfg):
    """Produce a new scheme in which the given subsets are merged into one.

    Every other subset of start_scheme is carried over unchanged; the
    clustered subsets are replaced by a single subset containing all of
    their partitions.
    """
    # 1. Gather every partition from the subsets being joined
    merged_partitions = []
    for s in subsets_to_cluster:
        merged_partitions.extend(s.partitions)
    newsub = subset.Subset(*merged_partitions)
    # 2. Copy the scheme's subsets, drop the ones we just merged...
    remaining = [s for s in start_scheme.subsets]
    for s in subsets_to_cluster:
        remaining.remove(s)
    # ...and append the merged subset in their place
    remaining.append(newsub)
    return scheme.Scheme(cfg, str(scheme_name), remaining)
def define_subset(self, text, loc, subset_def):
    """Create a subset from the named partitions and record it.

    The subset is appended to self.subsets for use when the schema is
    defined later; a SubsetError is surfaced as a ParserError.
    """
    try:
        # Resolve each partition name to its partition object
        named_parts = [self.cfg.partitions[nm] for nm in subset_def[0]]
        new_sub = subset.Subset(*named_parts)
        self.subsets.append(new_sub)
    except subset.SubsetError:
        raise ParserError(text, loc, "Error creating subset...")
def make_tree(self, user_path):
    """Build (or reuse) the starting tree for the analysis.

    Writes a filtered alignment (only the columns covered by the defined
    subsets), finalises the partition definitions, then either reuses an
    existing tree, copies in a user-supplied topology, or estimates a new
    topology -- and finally estimates branch lengths on it. The resulting
    tree path is stored on self.tree_path.

    :param user_path: optional path to a user-supplied topology file;
        when None or "", a topology is estimated instead.
    """
    # Begin by making a filtered alignment, containing ONLY those columns
    # that are defined in the subsets
    subset_with_everything = subset.Subset(*list(self.cfg.partitions))
    self.filtered_alignment = SubsetAlignment(self.alignment,
                                              subset_with_everything)
    self.filtered_alignment_path = os.path.join(self.cfg.start_tree_path,
                                                'filtered_source.phy')
    self.filtered_alignment.write(self.filtered_alignment_path)
    # Now we've written this alignment, we need to lock everything in
    # place, no more adding partitions, or changing them from now on.
    self.cfg.partitions.check_against_alignment(self.alignment)
    self.cfg.partitions.finalise()
    # We start by copying the alignment
    self.alignment_path = os.path.join(self.cfg.start_tree_path,
                                       'source.phy')
    # Now check for the tree; make_tree_path derives the expected tree
    # filename from the filtered alignment's path
    tree_path = self.cfg.processor.make_tree_path(
        self.filtered_alignment_path)
    if self.need_new_tree(tree_path) == True:
        log.debug("Estimating new starting tree, no old tree found")
        # If we have a user tree, then use that, otherwise, create a
        # topology. Clear out stale files first, keeping only the
        # alignments we just wrote.
        util.clean_out_folder(self.cfg.start_tree_path,
                              keep=["filtered_source.phy", "source.phy"])
        if user_path is not None and user_path != "":
            # Copy it into the start tree folder
            log.info("Using user supplied topology at %s", user_path)
            topology_path = os.path.join(self.cfg.start_tree_path,
                                         'user_topology.phy')
            self.cfg.processor.dupfile(user_path, topology_path)
        else:
            log.debug("didn't find tree at %s, making a new one" %
                      tree_path)
            topology_path = self.cfg.processor.make_topology(
                self.filtered_alignment_path, self.cfg.datatype,
                self.cfg.cmdline_extras)
        # Now estimate branch lengths on the chosen/estimated topology
        tree_path = self.cfg.processor.make_branch_lengths(
            self.filtered_alignment_path,
            topology_path,
            self.cfg.datatype,
            self.cfg.cmdline_extras)
    self.tree_path = tree_path
    log.info("Starting tree with branch lengths is here: %s",
             self.tree_path)
def model_to_scheme(model, scheme_name, cfg):
    """Turn a model definition e.g. [0, 1, 2, 3, 4] into a scheme"""
    # Bucket the partition indexes by the group number the model assigns
    groups = {}
    for part_index, group_id in enumerate(model):
        groups.setdefault(group_id, []).append(part_index)
    # Each bucket of partition indexes becomes one subset
    created_subsets = [
        subset.Subset(*[cfg.partitions[i] for i in idxs])
        for idxs in groups.values()
    ]
    return Scheme(cfg, str(scheme_name), created_subsets)
def _add_codelist_subset(self, concept, node):
    """Attach a codelist (CT) subset parsed from a CMAP concept to a node.

    The concept's "long-comment" holds the subset terms, e.g. for a CD
    node: SITTING (C62122) default; SUPINE (C62167); STANDING (C62166).
    Nodes without a long-comment are left untouched.

    :param concept: CMAP concept element
    :param node: node, or vertex, in the internal graph
    """
    subset_terms = concept.get("long-comment", "")
    if not subset_terms:
        # no subset to add to the node
        return
    # node.label looks like "NAME (C-CODE)": name before the paren,
    # c-code inside it (strip()[:-1] drops the trailing ")")
    label_pieces = node.label.split("(")
    cl_subset = SUB.Subset(c_code=label_pieces[1].strip()[:-1],
                           name=label_pieces[0].strip())
    # Terms are ";"-separated; each parses to (value, c-code, default)
    for raw_term in subset_terms.split(";"):
        fields = self._parse_cd_subset(raw_term)
        cl_subset.add_term(c_code=fields[1], sub_val=fields[0],
                           default=fields[2])
    node.add_ct_subset(cl_subset)
def merge_subsets(subset_list):
    """Combine several subsets into one merged subset.

    The new subset's columns are the union of all input columns; the
    inputs' names and descriptions are concatenated onto it (unless the
    Subset constructor handed back an already-described cached subset).
    """
    merged_columns = set()
    all_names = []
    all_descriptions = []
    for sub in subset_list:
        merged_columns.update(sub.column_set)
        all_descriptions += sub.description
        all_names += sub.names
    newsub = subset.Subset(sub.cfg, merged_columns)
    # A cache hit may return a subset that already carries a description;
    # only describe it when it doesn't
    if not newsub.names:
        newsub.add_description(all_names, all_descriptions)
    return newsub
def split_subset(a_subset, cluster_list):
    """Split a subset according to a cluster list.

    Each cluster in cluster_list is a collection of 1-based site indexes
    into a_subset.columns. Returns one new Subset per cluster, built from
    the columns those sites refer to.
    """
    ordered_columns = a_subset.columns
    list_of_subsets = []
    for cluster in cluster_list:
        # Translate the 1-based site numbers into actual column values
        column_set = {ordered_columns[site - 1] for site in cluster}
        # (The original kept an unused "tracker" counter and a throwaway
        # intermediate list of column sets here; both removed.)
        list_of_subsets.append(subset.Subset(a_subset.cfg, column_set))
    return list_of_subsets
def generate_all_schemes(cfg):
    """Convert the abstract schema given by the algorithm into subsets"""
    log.info("Generating all possible schemes for the partitions...")
    partition_count = len(cfg.partitions)  # partitions defined by user
    # Enumerate every possible grouping pattern for this many partitions
    all_schemes = submodels.get_submodels(partition_count)
    total = len(all_schemes)
    scheme_list = []
    # Schemes are numbered from 1 in the order the patterns come back
    for scheme_number, pattern in enumerate(all_schemes, start=1):
        # Bucket partition indexes by the group the pattern assigns them
        groups = {}
        for part_index, group_id in enumerate(pattern):
            groups.setdefault(group_id, []).append(part_index)
        # Each bucket becomes one subset of the scheme
        created_subsets = [
            subset.Subset(*[cfg.partitions[i] for i in idxs])
            for idxs in groups.values()
        ]
        scheme_list.append(
            Scheme(cfg, str(scheme_number), created_subsets))
        log.debug("Created scheme %d of %d" % (scheme_number, total))
    return scheme_list