Esempio n. 1
0
def _make_term_column_builders(terms, num_column_counts, cat_levels_contrasts):
    # Sort each term into a bucket based on the set of numeric factors it
    # contains:
    term_buckets = OrderedDict()
    bucket_ordering = []
    for term in terms:
        num_factors = []
        for factor in term.factors:
            if factor in num_column_counts:
                num_factors.append(factor)
        bucket = frozenset(num_factors)
        if bucket not in term_buckets:
            bucket_ordering.append(bucket)
        term_buckets.setdefault(bucket, []).append(term)
    # Special rule: if there is a no-numerics bucket, then it always comes
    # first:
    if frozenset() in term_buckets:
        bucket_ordering.remove(frozenset())
        bucket_ordering.insert(0, frozenset())
    term_to_column_builders = {}
    new_term_order = []
    # Then within each bucket, work out which sort of contrasts we want to use
    # for each term to avoid redundancy
    for bucket in bucket_ordering:
        bucket_terms = term_buckets[bucket]
        # Sort by degree of interaction
        bucket_terms.sort(key=lambda t: len(t.factors))
        new_term_order += bucket_terms
        used_subterms = set()
        for term in bucket_terms:
            column_builders = []
            factor_codings = pick_contrasts_for_term(term, num_column_counts, used_subterms)
            # Construct one _ColumnBuilder for each subterm
            for factor_coding in factor_codings:
                builder_factors = []
                num_columns = {}
                cat_contrasts = {}
                # In order to preserve factor ordering information, the
                # coding_for_term just returns dicts, and we refer to
                # the original factors to figure out which are included in
                # each subterm, and in what order
                for factor in term.factors:
                    # Numeric factors are included in every subterm
                    if factor in num_column_counts:
                        builder_factors.append(factor)
                        num_columns[factor] = num_column_counts[factor]
                    elif factor in factor_coding:
                        builder_factors.append(factor)
                        levels, contrast = cat_levels_contrasts[factor]
                        # This is where the default coding is set to
                        # Treatment:
                        coded = code_contrast_matrix(factor_coding[factor], levels, contrast, default=Treatment)
                        cat_contrasts[factor] = coded
                column_builder = _ColumnBuilder(builder_factors, num_columns, cat_contrasts)
                column_builders.append(column_builder)
            term_to_column_builders[term] = column_builders
    return new_term_order, term_to_column_builders
Esempio n. 2
0
def _make_subterm_infos(terms,
                        num_column_counts,
                        cat_levels_contrasts):
    # Sort each term into a bucket based on the set of numeric factors it
    # contains:
    term_buckets = OrderedDict()
    bucket_ordering = []
    for term in terms:
        num_factors = []
        for factor in term.factors:
            if factor in num_column_counts:
                num_factors.append(factor)
        bucket = frozenset(num_factors)
        if bucket not in term_buckets:
            bucket_ordering.append(bucket)
        term_buckets.setdefault(bucket, []).append(term)
    # Special rule: if there is a no-numerics bucket, then it always comes
    # first:
    if frozenset() in term_buckets:
        bucket_ordering.remove(frozenset())
        bucket_ordering.insert(0, frozenset())
    term_to_subterm_infos = OrderedDict()
    new_term_order = []
    # Then within each bucket, work out which sort of contrasts we want to use
    # for each term to avoid redundancy
    for bucket in bucket_ordering:
        bucket_terms = term_buckets[bucket]
        # Sort by degree of interaction
        bucket_terms.sort(key=lambda t: len(t.factors))
        new_term_order += bucket_terms
        used_subterms = set()
        for term in bucket_terms:
            subterm_infos = []
            factor_codings = pick_contrasts_for_term(term,
                                                     num_column_counts,
                                                     used_subterms)
            # Construct one SubtermInfo for each subterm
            for factor_coding in factor_codings:
                subterm_factors = []
                contrast_matrices = {}
                subterm_columns = 1
                # In order to preserve factor ordering information, the
                # coding_for_term just returns dicts, and we refer to
                # the original factors to figure out which are included in
                # each subterm, and in what order
                for factor in term.factors:
                    # Numeric factors are included in every subterm
                    if factor in num_column_counts:
                        subterm_factors.append(factor)
                        subterm_columns *= num_column_counts[factor]
                    elif factor in factor_coding:
                        subterm_factors.append(factor)
                        levels, contrast = cat_levels_contrasts[factor]
                        # This is where the default coding is set to
                        # Treatment:
                        coded = code_contrast_matrix(factor_coding[factor],
                                                     levels, contrast,
                                                     default=Treatment)
                        contrast_matrices[factor] = coded
                        subterm_columns *= coded.matrix.shape[1]
                subterm_infos.append(SubtermInfo(subterm_factors,
                                                       contrast_matrices,
                                                       subterm_columns))
            term_to_subterm_infos[term] = subterm_infos
    assert new_term_order == list(term_to_subterm_infos)
    return term_to_subterm_infos