Ejemplo n.º 1
0
def merge_fabricated_subsets(subset_list):
    """Merge fabricated subsets, preserving their centroids and lnls.

    The merged subset covers the union of the inputs' columns. Its centroid
    is the element-wise mean of the input centroids and its lnl is the sum of
    the inputs' ``best_lnl`` values.

    :param subset_list: non-empty sequence of subsets, each exposing
        ``column_set``, ``best_lnl``, ``centroid`` and ``cfg``
    :return: a ``subset.Subset`` over the merged columns with ``centroid``
        and ``lnl`` set on it
    :raises IndexError: if ``subset_list`` is empty, or if a centroid has
        fewer dimensions than the first subset's centroid
    """
    columns = set()
    lnl = 0

    # The first subset fixes how many dimensions the centroid has
    centroid = [0] * len(subset_list[0].centroid)

    for sub in subset_list:
        columns |= sub.column_set
        lnl += sub.best_lnl
        # Accumulate by index. Adding to the loop variable of
        # ``for value in centroid`` only rebinds that variable and never
        # writes back into the list, which left the centroid at all zeros.
        for dim, running_total in enumerate(centroid):
            centroid[dim] = running_total + sub.centroid[dim]

    # Now just take the average of each dimension to be the centroid of the
    # new subset. A float divisor keeps this a true mean even where ``/`` on
    # two ints would floor.
    count = float(len(subset_list))
    centroid = [total / count for total in centroid]

    # ``sub`` is the last subset visited; its cfg is used for the new subset
    new_sub = subset.Subset(sub.cfg, columns)

    # Add the centroid and sum of the lnls to the subset. TODO: create
    # functions to update these variables in the subset rather than messing
    # with them directly
    new_sub.centroid = centroid
    new_sub.lnl = lnl
    return new_sub
Ejemplo n.º 2
0
def create_scheme(cfg, scheme_name, scheme_description):
    """
    Build a single scheme from a grouping description.

    ``scheme_description`` holds one label per partition, in partition order,
    e.g. [0,1,2,3,4,5,6,7]. Partitions whose labels are equal end up in the
    same subset.

    Raises SchemeError (after logging) when the description does not have
    exactly one entry per partition in ``cfg.partitions``.
    """
    # The description must label every user-defined partition
    if len(cfg.partitions) != len(scheme_description):
        log.error("There's a problem with the description of scheme %s" %
                  scheme_name)
        raise SchemeError

    # Bucket the partition indexes by the label they were given
    groups = {}
    for position, label in enumerate(scheme_description):
        if label not in groups:
            groups[label] = []
        groups[label].append(position)

    # Every bucket of indexes becomes one subset of the matching partitions
    created_subsets = [
        subset.Subset(*[cfg.partitions[i] for i in members])
        for members in groups.values()
    ]

    return Scheme(cfg, str(scheme_name), created_subsets,
                  description=scheme_description)
Ejemplo n.º 3
0
    def define_user_subset(self, text, loc, part_def):
        """Turn a block's (start, stop, step) tuples into a column subset.

        The subset is created, given the block's name and ranges as its
        description, and registered on ``self.cfg`` both in ``user_subsets``
        and in ``user_subsets_by_name``. A ParserError is raised if the
        block's ranges select any column more than once.
        """
        # Positions in the config are 1-based and inclusive, whereas range()
        # is 0-based and excludes its end point. Moving only the start down
        # by one corrects for both differences at once.
        selected = []
        ranges = []
        for start, stop, step in part_def.parts:
            # Remember the range as written, for the description
            ranges.append((start, stop, step))
            selected += range(start - 1, stop, step)

        # Duplicates vanish in the set, so a size mismatch means the ranges
        # of this block overlap each other
        unique_columns = set(selected)
        if len(unique_columns) != len(selected):
            raise ParserError(
                text, loc, "Block '%s' has internal overlap" % part_def.name)

        new_subset = subset.Subset(self.cfg, unique_columns)
        new_subset.add_description([part_def.name], ranges)

        self.cfg.user_subsets.append(new_subset)
        self.cfg.user_subsets_by_name[part_def.name] = new_subset
Ejemplo n.º 4
0
def make_clustered_scheme(start_scheme, scheme_name, subsets_to_cluster, cfg):
    """Return a scheme in which ``subsets_to_cluster`` are merged into one.

    The new scheme holds every subset of ``start_scheme`` except the ones
    being clustered, plus a single new subset built from all of their
    partitions. ``list.remove`` raises ValueError if a subset to cluster is
    not part of ``start_scheme.subsets``.
    """
    # Pool the partitions of every subset that is being joined
    pooled_parts = []
    for member in subsets_to_cluster:
        pooled_parts.extend(member.partitions)
    merged = subset.Subset(*pooled_parts)

    # Work on a copy so the starting scheme's own subsets are not modified
    remaining = list(start_scheme.subsets)
    for member in subsets_to_cluster:
        remaining.remove(member)

    # The merged subset takes the place of the ones removed
    remaining.append(merged)

    return scheme.Scheme(cfg, str(scheme_name), remaining)
Ejemplo n.º 5
0
    def define_subset(self, text, loc, subset_def):
        """Create a subset from the partition names in ``subset_def[0]``.

        The new subset is appended to ``self.subsets``. A
        ``subset.SubsetError`` raised along the way is reported as a
        ParserError at the given text and location.
        """
        try:
            # Resolve every name to its partition in the configuration
            members = []
            for name in subset_def[0]:
                members.append(self.cfg.partitions[name])

            # Collected here until the scheme that uses them is defined
            new_subset = subset.Subset(*members)
            self.subsets.append(new_subset)
        except subset.SubsetError:
            raise ParserError(text, loc, "Error creating subset...")
Ejemplo n.º 6
0
    def make_tree(self, user_path):
        """Write the filtered alignment and make sure a starting tree exists.

        A subset is built from every partition in the configuration and the
        alignment for that subset is written to 'filtered_source.phy' in the
        start tree folder. The partitions are then checked against the
        alignment and finalised. If ``need_new_tree`` says a tree is needed,
        a topology is taken from ``user_path`` when one is given, or made by
        the processor otherwise, and branch lengths are estimated on it. The
        resulting tree path is stored in ``self.tree_path``.

        :param user_path: path to a user supplied topology; ``None`` or an
            empty string means a topology should be made instead
        """
        # Begin by making a filtered alignment, containing ONLY those columns
        # that are defined in the subsets
        subset_with_everything = subset.Subset(*list(self.cfg.partitions))
        self.filtered_alignment = SubsetAlignment(self.alignment,
                                                  subset_with_everything)
        self.filtered_alignment_path = os.path.join(self.cfg.start_tree_path,
                                                    'filtered_source.phy')
        self.filtered_alignment.write(self.filtered_alignment_path)

        # Now we've written this alignment, we need to lock everything in
        # place, no more adding partitions, or changing them from now on.
        self.cfg.partitions.check_against_alignment(self.alignment)
        self.cfg.partitions.finalise()

        # Record the path of the unfiltered alignment in the start tree
        # folder. NOTE(review): only the path is set here, nothing is copied
        # or written by this method -- confirm where 'source.phy' is created.
        self.alignment_path = os.path.join(self.cfg.start_tree_path,
                                           'source.phy')

        # Now check for the tree: ask the processor where the tree for the
        # filtered alignment would be
        tree_path = self.cfg.processor.make_tree_path(
            self.filtered_alignment_path)

        if self.need_new_tree(tree_path) == True:
            log.debug("Estimating new starting tree, no old tree found")

            # Clean the start tree folder first, asking for the two
            # alignment files to be kept. Then, if we have a user tree, use
            # that, otherwise, create a topology
            util.clean_out_folder(self.cfg.start_tree_path,
                                  keep=["filtered_source.phy", "source.phy"])

            if user_path is not None and user_path != "":
                # Copy it into the start tree folder
                log.info("Using user supplied topology at %s", user_path)
                topology_path = os.path.join(self.cfg.start_tree_path,
                                             'user_topology.phy')
                self.cfg.processor.dupfile(user_path, topology_path)
            else:
                log.debug("didn't find tree at %s, making a new one" %
                          tree_path)
                topology_path = self.cfg.processor.make_topology(
                    self.filtered_alignment_path, self.cfg.datatype,
                    self.cfg.cmdline_extras)

            # Now estimate branch lengths; the path returned here replaces
            # the one computed above
            tree_path = self.cfg.processor.make_branch_lengths(
                self.filtered_alignment_path, topology_path, self.cfg.datatype,
                self.cfg.cmdline_extras)

        self.tree_path = tree_path
        log.info("Starting tree with branch lengths is here: %s",
                 self.tree_path)
Ejemplo n.º 7
0
def model_to_scheme(model, scheme_name, cfg):
    """Build a scheme from a model definition such as [0, 1, 2, 3, 4].

    ``model`` gives one label per partition, in partition order; partitions
    that share a label are placed in the same subset.
    """
    # Collect the partition indexes under the label each one was given
    groups = {}
    for position, label in enumerate(model):
        if label in groups:
            groups[label].append(position)
        else:
            groups[label] = [position]

    # Each group of indexes is turned into a subset of those partitions
    created_subsets = []
    for members in groups.values():
        partitions = [cfg.partitions[i] for i in members]
        created_subsets.append(subset.Subset(*partitions))

    return Scheme(cfg, str(scheme_name), created_subsets)
Ejemplo n.º 8
0
 def _add_codelist_subset(self, concept, node):
     """
     for cmap nodes with CT subsets (e.g. CDs) create a subset and add it to the node
     :param concept: CMAP concept element
     :param node: node, or vertex, in the internal graph
     """
     subset_terms = concept.get("long-comment", "")
     if not subset_terms:
         return  # no subset to add to the node
     # example long-comment on CD node: SITTING (C62122) default; SUPINE (C62167); STANDING (C62166)
     cl_subset = SUB.Subset(c_code=node.label.split("(")[1].strip()[:-1],
                            name=node.label.split("(")[0].strip())
     for term_value in subset_terms.split(";"):
         parts = self._parse_cd_subset(term_value)
         cl_subset.add_term(c_code=parts[1],
                            sub_val=parts[0],
                            default=parts[2])
     node.add_ct_subset(cl_subset)
Ejemplo n.º 9
0
def merge_subsets(subset_list):
    """Combine the given subsets into one subset covering all their columns.

    The names and descriptions of the inputs are concatenated, in iteration
    order, and attached to the merged subset unless it already has names.
    """
    merged_columns = set()
    all_names = []
    all_descriptions = []

    # Gather the columns, and the labelling that goes along with them
    for sub in subset_list:
        merged_columns |= sub.column_set
        all_descriptions.extend(sub.description)
        all_names.extend(sub.names)

    # ``sub`` is whichever subset was visited last; its cfg is reused
    merged = subset.Subset(sub.cfg, merged_columns)

    # The subset handed back may already carry names (we might get back a
    # cache hit), in which case its description is left alone
    if not merged.names:
        merged.add_description(all_names, all_descriptions)

    return merged
Ejemplo n.º 10
0
def split_subset(a_subset, cluster_list):
    """Split a subset into one new subset per cluster.

    Every cluster is a collection of site numbers. Site ``s`` selects the
    entry at index ``s - 1`` of ``a_subset.columns``. The new subsets are
    returned as a list, in the order of ``cluster_list``.
    """
    source_columns = a_subset.columns

    # First work out the set of columns for every cluster...
    column_groups = [
        set(source_columns[site - 1] for site in cluster)
        for cluster in cluster_list
    ]

    # ...and only then build the subsets, sharing the original's cfg
    return [subset.Subset(a_subset.cfg, group) for group in column_groups]
Ejemplo n.º 11
0
def generate_all_schemes(cfg):
    """
    Turn every abstract grouping produced by the algorithm into a Scheme.

    ``submodels.get_submodels`` is asked for the groupings for the number of
    partitions in ``cfg.partitions``. Each grouping becomes one Scheme, named
    "1", "2", ... in the order the groupings are returned.
    """

    log.info("Generating all possible schemes for the partitions...")

    # One pattern per scheme, for as many partitions as the user defined
    patterns = submodels.get_submodels(len(cfg.partitions))

    scheme_list = []
    for number, pattern in enumerate(patterns, 1):
        # Partitions that were given the same label belong together
        groups = {}
        for position, label in enumerate(pattern):
            groups.setdefault(label, []).append(position)

        # Each group of indexes becomes a subset of those partitions
        created_subsets = []
        for members in groups.values():
            partitions = [cfg.partitions[i] for i in members]
            created_subsets.append(subset.Subset(*partitions))

        scheme_list.append(Scheme(cfg, str(number), created_subsets))

        log.debug("Created scheme %d of %d" % (number, len(patterns)))

    return scheme_list