Example 1
def test_make_factor_cxts(self):
    fct1 = fca.Concept({'g1', 'g3'}, {'m2', 'm1'})
    fct2 = fca.Concept({'g2'}, {'m2', 'm3'})
    # Decompose into an objects-x-factors context and a factors-x-attributes context.
    cxt_objs_fcts, cxt_fcts_atts = fca.factors.make_factor_cxts((fct1, fct2))
    assert cxt_objs_fcts == fca.Context([[1, 0], [0, 1], [1, 0]],
                                        ['g1', 'g2', 'g3'], ['f0', 'f1'])
    assert cxt_fcts_atts == fca.Context([[1, 1, 0], [0, 1, 1]],
                                        ['f0', 'f1'], ['m1', 'm2', 'm3'])
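
The test pins down the contract of make_factor_cxts: the two returned cross tables form a Boolean matrix decomposition of the original context. A minimal self-contained sketch of that property with plain 0/1 lists (the variable names are illustrative, not part of fca):

# Boolean product: reconstructed[i][j] = OR_k (objs_fcts[i][k] AND fcts_atts[k][j])
objs_fcts = [[1, 0], [0, 1], [1, 0]]   # objects x factors (g1..g3 vs f0, f1)
fcts_atts = [[1, 1, 0], [0, 1, 1]]     # factors x attributes (f0, f1 vs m1..m3)

reconstructed = [
    [int(any(objs_fcts[i][k] and fcts_atts[k][j] for k in range(2)))
     for j in range(3)]
    for i in range(3)
]
print(reconstructed)  # [[1, 1, 0], [0, 1, 1], [1, 1, 0]] - g1/g3 carry m1,m2; g2 carries m2,m3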
Example 2
from typing import Set, Tuple

def _oplus(D_objs: Set[str], y: str,
           cxt: 'fca.Context',
           U: Set[Tuple[str, str]]):
    # One "D (+) y" step of Belohlavek & Vychodil's Algorithm 2: close the
    # candidate attribute set and collect the still-uncovered pairs (those
    # in U) that the resulting concept would cover.
    yprime = cxt.get_attribute_extent(y)
    Dy_prime = D_objs & yprime
    Dy_primeprime = cxt.oprime(Dy_prime)
    cpt = fca.Concept(extent=Dy_prime, intent=Dy_primeprime)
    return set(cpt.pairs()) & U
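
To see what _oplus computes without the fca machinery, here is a toy rendition of the same step on a context stored as a set of (object, attribute) pairs; the helper names are illustrative and mirror what cxt.get_attribute_extent and cxt.oprime do:

I = {('g1', 'm1'), ('g1', 'm2'), ('g2', 'm2'),
     ('g2', 'm3'), ('g3', 'm1'), ('g3', 'm2')}

def attribute_extent(y):
    # objects that have attribute y (cf. cxt.get_attribute_extent)
    return {g for g, m in I if m == y}

def object_intent(objs):
    # attributes shared by every object in objs (cf. cxt.oprime)
    return {m for m in {m for _, m in I}
            if all((g, m) in I for g in objs)}

U = set(I)                    # every pair is still uncovered
D_objs = {'g1', 'g2', 'g3'}   # extent of the empty attribute set D
Dy = D_objs & attribute_extent('m2')
covered = {(g, m) for g in Dy for m in object_intent(Dy)} & U
print(sorted(covered))  # the 3 pairs the concept ({g1,g2,g3}, {m2}) would cover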
Example 3
def get_sense(factor):
    # Rank each shared term by the sum of its positions in every document's
    # prediction list; doc2preds and n_sense_indicators are presumably
    # defined in the enclosing scope.
    docs = list(factor.extent)
    term_rank = dict()
    common_terms = factor.intent
    for term in common_terms:
        term_rank[term] = sum(doc2preds[doc].index(term) for doc in docs)
    sense_ids = sorted(term_rank,
                       key=lambda x: term_rank[x])[:n_sense_indicators]
    sense = fca.Concept(docs, set(sense_ids))
    return sense
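
A minimal sketch of that rank-sum selection with hypothetical doc2preds data (per-document prediction lists, best prediction first) and n_sense_indicators:

doc2preds = {
    'doc1': ['bank', 'shore', 'river', 'money'],
    'doc2': ['shore', 'bank', 'river', 'credit'],
}
n_sense_indicators = 2
common_terms = {'bank', 'shore', 'river'}

# Lower rank sum = the term sits higher in the prediction lists overall.
term_rank = {t: sum(doc2preds[d].index(t) for d in doc2preds)
             for t in common_terms}
# bank: 0+1=1, shore: 1+0=1, river: 2+2=4
print(sorted(term_rank, key=term_rank.get)[:n_sense_indicators])
# -> ['bank', 'shore'] (in some order); river is dropped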
Example 4
import itertools


def algorithm2_weighted(cxt, fidelity=1):
    """
    Algorithm2 from article{
    title = "Discovery of optimal factors in binary data via a novel method of matrix decomposition ",
    journal = "Journal of Computer and System Sciences ",
    volume = "76",
    number = "1",
    pages = "3 - 20",
    year = "2010",
    doi = "http://dx.doi.org/10.1016/j.jcss.2009.05.002",
    url = "http://www.sciencedirect.com/science/article/pii/S0022000009000415",
    author = "Radim Belohlavek and Vilem Vychodil"}

    Extensions:
    Fidelity of coverage - stop when fidelity level is covered by factors
    """
    len_objs_initial = len(cxt.objects)
    len_atts_initial = len(cxt.attributes)

    def score(obj_att_pairs):
        objs = {x[0] for x in obj_att_pairs}
        atts = {x[1] for x in obj_att_pairs}
        score = len(objs) * len(atts) / (len_objs_initial * len_atts_initial)
        return score

    U = set(cxt.object_attribute_pairs)
    len_initial = len(U)
    while (len_initial - len(U)) / len_initial < fidelity:
        D = set()
        V = 0
        to_remove = set()
        while True:
            D_objs = cxt.aprime(D)  # aprime: attributes -> objects; _oplus expects an object set
            ls_measures = [(score(_oplus(D_objs, j, cxt, U)), j)
                           for j in set(cxt.attributes) - D]
            if ls_measures:
                maxDj = max(ls_measures, key=lambda x: x[0])
            else:
                maxDj = (0, None)
            if maxDj[0] > V:
                j_score, j = maxDj
                Dj = D | {j}
                C = cxt.aprime(Dj)
                D = cxt.oprime(C)
                to_remove = set(itertools.product(C, D)) & U
                V = score(to_remove)  # keep V in the same units as the selection score
            else:
                break
        if len(to_remove) == 0:
            raise RuntimeError(
                f'Algorithm stuck, something went wrong, pairs left {len(U)}')
        U -= to_remove
        yield fca.Concept(C, D), j_score
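
A usage sketch, assuming algorithm2_weighted is exposed in fca.factors like make_factor_cxts in Example 1; the generator yields (concept, score) pairs until the requested fidelity is reached:

import fca

cxt = fca.Context([[1, 0], [0, 1], [1, 0]],
                  ['g1', 'g2', 'g3'], ['f0', 'f1'])
# Stop once 90% of the crosses are covered by the factors found so far.
for factor, score in fca.factors.algorithm2_weighted(cxt, fidelity=0.9):
    print(sorted(factor.extent), sorted(factor.intent), round(score, 3))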
Example 5
def end_element(name):
    # expat end-element handler: commits the accumulated character-data
    # buffer when an <object> or <attribute> closes, and assembles a
    # Concept when a <concept> closes.
    global cs, new_intent, new_extent, new_meta
    global new_obj, new_attr, buffer
    if name == "object":
        if new_obj:
            d_objects[new_obj] = buffer
            objects.append(buffer)
            new_obj = None
            buffer = ""
    elif name == "attribute":
        if new_attr:
            d_attributes[new_attr] = buffer
            attributes.append(buffer)
            new_attr = None
            buffer = ""
    elif name == "concept":
        new_concept = fca.Concept(new_extent, new_intent)
        new_concept.meta = new_meta
        cs.append(new_concept)

        new_extent = []
        new_intent = []
        new_meta = {}
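
A sketch of how such a handler is wired into the stdlib expat parser; the matching start_element and character_data handlers (which would set new_obj, new_attr, buffer, new_extent and friends) are assumed to exist:

import xml.parsers.expat

parser = xml.parsers.expat.ParserCreate()
parser.StartElementHandler = start_element    # assumed counterpart handler
parser.CharacterDataHandler = character_data  # assumed: accumulates into buffer
parser.EndElementHandler = end_element
with open('lattice.xml', 'rb') as f:          # hypothetical input file
    parser.ParseFile(f)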
Example 6
import itertools


def algorithm2_w_condition(cxt, fidelity: float = 1,
                           allow_repetitions=True,
                           min_atts_and_objs=3, objs_ge_atts=False):
    """
    Algorithm2 from article{
    title = "Discovery of optimal factors in binary data via a novel method of matrix decomposition ",
    journal = "Journal of Computer and System Sciences ",
    volume = "76",
    number = "1",
    pages = "3 - 20",
    year = "2010",
    doi = "http://dx.doi.org/10.1016/j.jcss.2009.05.002",
    url = "http://www.sciencedirect.com/science/article/pii/S0022000009000415",
    author = "Radim Belohlavek and Vilem Vychodil"}

    :param objs_ge_atts: should the number of objects be greater or equal to
            the number of attributes in the output factors
    :param min_atts_and_objs: minimum number of attributes and objects in the
            output factors
    :param fidelity: stops when this fraction of crosses in the table are covered
    :param allow_repeatitions: exclude attributes in already obtained factors
            from further consideration - they still may appear in the closure
    """
    def good_factor(cpt: 'fca.Concept'):
        if objs_ge_atts:
            return len(cpt.extent) >= len(cpt.intent) >= min_atts_and_objs
        else:
            return len(cpt.extent) >= min_atts_and_objs and \
                   len(cpt.intent) >= min_atts_and_objs

    U = set(cxt.object_attribute_pairs)
    len_initial = len(U)
    removed_atts = set()
    removed_objs = set()
    if not len_initial:
        return
    while (len_initial - len(U)) / len_initial < fidelity:
        D = set()
        C = set(cxt.objects)
        V = 0
        to_remove = set()
        available_atts = {x[1] for x in U} - removed_atts
        while True:
            Dprime = cxt.aprime(D)
            ls_measures = [(len(_oplus(Dprime, j, cxt, U)), j)
                           for j in available_atts - D]
            if not ls_measures:
                return
            maxDj = max(ls_measures, key=lambda x: x[0])
            if maxDj[0] > V or not good_factor(cpt=fca.Concept(C, D)):  # update the values
                j_score, j = maxDj
                Dj = D | {j}
                C = cxt.aprime(Dj)
                if len(C) < min_atts_and_objs or not (available_atts - D):  # early restart
                    U = {u for u in U if u[1] not in Dj}
                    removed_atts |= Dj
                    break
                D = cxt.oprime(C)
                to_remove_U = set(itertools.product(C, D)) & U
                V = len(to_remove_U)
                if not allow_repetitions:
                    to_remove = (set(itertools.product(C, cxt.attributes)) |
                                 set(itertools.product(cxt.objects, D))) & U
                else:
                    to_remove = to_remove_U
            elif good_factor(cpt=fca.Concept(C, D)):
                if len(to_remove) == 0:
                    raise Exception(
                        f'Algorithm stuck, something went wrong, pairs left '
                        f'{len(U)}')
                U -= to_remove
                yield fca.Concept(C, D), len(to_remove) / len_initial, (len_initial - len(U)) / len_initial
                break
            else:
                # unreachable: the two branches above are exhaustive
                assert False
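
A usage sketch (module location assumed, as in Example 4): every yielded item is a triple of the factor concept, the share of crosses that factor covers, and the cumulative coverage so far:

import fca

cxt = fca.Context([[1, 1, 0], [1, 1, 1], [0, 1, 1], [1, 1, 0]],
                  ['g1', 'g2', 'g3', 'g4'], ['m1', 'm2', 'm3'])
for concept, share, covered in fca.factors.algorithm2_w_condition(
        cxt, fidelity=0.6, min_atts_and_objs=2):
    print(sorted(concept.extent), sorted(concept.intent), share, covered)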
Example 7
def test_norris(self):
    cl = fca.ConceptLattice(self.small_cxt)
    assert fca.Concept(self.small_cxt.objects, []) in cl
    assert fca.Concept([], self.small_cxt.attributes) in cl
    assert len(cl) > 2
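
The same invariants on a concrete context (the lattice always contains the top and bottom concepts; here they have an empty common intent and an empty extent respectively):

import fca

cxt = fca.Context([[1, 0], [0, 1], [1, 0]],
                  ['g1', 'g2', 'g3'], ['f0', 'f1'])
cl = fca.ConceptLattice(cxt)
assert fca.Concept(cxt.objects, []) in cl      # top: the objects share no attribute here
assert fca.Concept([], cxt.attributes) in cl   # bottom: no object has every attribute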
Example 8
def induce(contexts: List[str],
           target_start_end_tuples: List[Tuple[int, int]],
           titles: List[str] = None,
           target_pos: str = None,
           n_sense_descriptors=5,
           lang='eng',
           top_n_pred=100,
           min_number_contexts_for_fca_clustering=3,
           min_sub_len=3,
           verbose=False,
           logger=None) -> List[fca.Concept]:
    """
    The function induces sense(s) of the target from a collection of contexts.
    This function always returns a result. If the proper clustering does not
    produce any factors then the most common predictions are output.

    :param min_sub_len: min length of produced substitute
    :param contexts: the contexts themselves
    :param target_start_end_tuples: the (start index, end index) pairs
        indicating the target in the respective context.
        len(contexts) == len(target_start_end_tuples)
    :param top_n_pred: how many predictions are produced for each context
    :param titles: Titles of contexts
    :param n_sense_descriptors: how many sense indicators - subset of all
        predictions - are output for each sense
    :param target_pos: the desired part of speech of predictions
    :param lang: language. Used for POS tagging and lemmatization of predictions
    :param min_number_contexts_for_fca_clustering: minimum number of contexts
        to try the fca clustering. If there are only 1 or 2 then it often does
        not make sense to cluster.
    """
    if logger is None:
        logger = local_logger
    if len(contexts) != len(target_start_end_tuples):
        raise ValueError(f'Length of contexts {len(contexts)} is not equal to '
                         f'the length of start and end indices list '
                         f'{len(target_start_end_tuples)}.')

    subs = iter_substitutes(
        contexts,
        target_start_end_tuples,
        titles=titles,
        th_substitute_len=min_sub_len,
        top_n=top_n_pred,
        target_pos=target_pos,
        lang=lang,
    )
    if verbose:
        subs = tqdm(subs, total=len(contexts))
    predicted = {
        title: top_pred_m + top_pred_unm
        for title, top_pred_m, top_pred_unm in subs
    }

    senses = []
    target_phrase_in_first_context = contexts[0][
        target_start_end_tuples[0][0]:target_start_end_tuples[0][1]]
    if len(contexts) >= min_number_contexts_for_fca_clustering:
        senses = fca_cluster(predicted, n_sense_indicators=n_sense_descriptors)
        logger.debug(
            f'For {target_phrase_in_first_context} with {len(contexts)} contexts '
            f'fca_cluster produced {len(senses)} senses.')
    if not senses:  # fca_cluster did not produce results
        all_predicted = sum(predicted.values(), [])
        top_predicted = [
            x[0] for x in Counter(all_predicted).most_common(
                min(top_n_pred, n_sense_descriptors))
        ]
        senses = [
            fca.Concept(intent=top_predicted, extent=list(predicted.keys()))
        ]
        logger.debug(
            f'For {target_phrase_in_first_context} with {len(contexts)} contexts '
            f'most common {len(top_predicted)} predictions are '
            f'taken as sense indicators.')
    return senses
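
A call-shape sketch for induce; the contexts and character offsets are illustrative, and a substitute-prediction backend is required by iter_substitutes, so this will not run without that machinery:

contexts = [
    'He sat on the bank of the river.',
    'The bank approved the loan.',
    'They fished from the bank at dawn.',
]
spans = [(c.index('bank'), c.index('bank') + len('bank')) for c in contexts]
senses = induce(contexts, spans, n_sense_descriptors=5, lang='eng')
for sense in senses:
    # extent: context titles grouped under this sense; intent: its indicator terms
    print(sorted(sense.extent), sorted(sense.intent))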