Example #1
0
def main():
    """Run a self-training classifier for each predefined parameter setting
    and print one tab-separated row of token-evaluation statistics per run.
    """
    # Load the pickled targets/scores, feature matrix, annotated passages
    # and per-token feature matrix produced by earlier pipeline stages.
    with open(TARGETS_PATH, "rb") as f:
        target_array, scores = pickle.load(f)
        target_list = target_array.tolist()
    with open(FMAT_PATH, "rb") as f:
        fmat = pickle.load(f)
    with open(PASSAGES_PATH, "rb") as f:
        passages = pickle.load(f)
    with open(TOKENS_FMAT, "rb") as f:
        tokens_fmat = pickle.load(f)
    passages = passages[:NUM_PASSAGES]
    terminals, token_labels = tokeneval.get_terminals_labels(passages)
    tokens = [x.text for x in terminals]

    # Evaluate each parameter setting in turn; progress goes to stderr so
    # stdout stays a clean TSV of results.
    for i, params in enumerate(PARAMS):
        sys.stderr.write("{} {}\n".format(METHOD, i))
        clas, _, _ = classify.self_train_classifier(
            fmat,
            scores,
            target_array,
            params,
            method=METHOD,
            c_param=CLS_PRM,
            nu_param=CLS_PRM,
            learn_rate=CLS_PRM,
            n_estimators=500,
        )
        # NOTE(review): the original also built a `target_labels` list here
        # (thresholded scores + predicted labels), but it was never passed to
        # evaluate_with_classifier — the dead computation has been removed.
        stats = tokeneval.evaluate_with_classifier(
            tokens, token_labels, target_list, tokens_fmat, clas
        )
        # Row format: the parameter values followed by the size of each stat group.
        print("\t".join([str(x) for x in params] + [str(len(x)) for x in stats]))
def main():
    """Train one classifier and print evaluation statistics as a TSV row.

    Uses token-level evaluation when a per-token feature matrix is available,
    otherwise falls back to type-level evaluation over predicted labels.
    """
    with open(PASSAGES_PATH, "rb") as f:
        passages = pickle.load(f)[:NUM_PASSAGES]
    terminals, token_labels = tokeneval.get_terminals_labels(passages)
    tokens = [terminal.text for terminal in terminals]

    clas = classify.train_classifier(
        FMAT[: len(LABELS)],
        LABELS,
        METHOD,
        c_param=PARAM,
        nu_param=PARAM,
        learn_rate=PARAM,
        n_estimators=500,
    )
    if TOKENS_FMAT is None:
        # Type-level evaluation: known labels plus classifier predictions
        # for the unlabeled remainder of the feature matrix.
        target_labels = LABELS.tolist()
        target_labels += classify.predict_labels(clas, FMAT[len(LABELS) :]).tolist()
        stats = tokeneval.evaluate_with_type(tokens, token_labels, TARGETS, target_labels)
    else:
        # Token-level evaluation driven directly by the classifier.
        stats = tokeneval.evaluate_with_classifier(tokens, token_labels, TARGETS, TOKENS_FMAT, clas)

    print("\t".join(str(group_len) for group_len in (len(s) for s in stats)))
Example #3
0
def main():
    """Sample random parameter settings and print type-level evaluation
    statistics for a self-trained classifier at each setting.
    """
    # Load targets/scores, the feature matrix, and the annotated passages.
    with open(TARGETS_PATH, "rb") as f:
        target_array, scores = pickle.load(f)
        target_list = target_array.tolist()
    with open(FMAT_PATH, "rb") as f:
        fmat = pickle.load(f)
    with open(PASSAGES_PATH, "rb") as f:
        passages = pickle.load(f)
    passages = passages[:34]
    terminals, token_labels = tokeneval.get_terminals_labels(passages)
    tokens = [terminal.text for terminal in terminals]

    # One self-training run and evaluation per sampled parameter setting.
    for params in params_generator(50000):
        clas, _, _ = classify.self_train_classifier(fmat, scores, target_array, params)
        # Known labels come from thresholding the scores; the rest of the
        # feature matrix gets classifier predictions.
        target_labels = [int(score >= classify.PRE_LABELS_THRESH) for score in scores]
        target_labels.extend(classify.predict_labels(clas, fmat[len(scores):]))
        stats = tokeneval.evaluate_with_type(tokens, token_labels, target_list, target_labels)
        row = [str(p) for p in params] + [str(len(s)) for s in stats]
        print("\t".join(row))
Example #4
0
def main():
    """Build a per-token feature matrix from annotated passages and pickle it.

    Feature groups (column order is significant and fixed by the append order
    below): morphological/dictionary features, high-frequency-word dictionary
    features, function-word context flags, and light-verb context flags —
    each group included only when its USE_* toggle is set.
    """
    with open(PASSAGES_PATH, "rb") as f:
        passages = pickle.load(f)
    passages = passages[:NUM_PASSAGES]
    terminals, token_labels = tokeneval.get_terminals_labels(passages)
    # Each entry is (token, pre_context, post_context) with a 2-token window.
    tokens_context = tokeneval.get_context(terminals, context=2)
    tokens = [x[0] for x in tokens_context]
    lemmas = [tokeneval.lemmatize(token, TARGETS) for token in tokens]
    # extract_dict_features expects tuples, so wrap each lemma in a 1-tuple.
    lemmas_tuples = [(lemma,) for lemma in lemmas]
    form_ident = lex.FormIdentifier(COLLINS_PATH, WIKT_PATH)

    # First calculate all features which are computed together
    # (batch extraction returns one space-separated 0/1 string per feature;
    # zip(*res) transposes to one feature tuple per token).
    if USE_MORPH_DICT:
        res = features.extract_dict_features(lemmas_tuples, COLLINS_PATH)
        res = [x.split(' ') for x in res]
        res = [[int(x) for x in y] for y in res]
        dict_features = list(zip(*res))
    if USE_HFW:
        res = features.extract_hfw_dict_features(lemmas_tuples, COLLINS_PATH,
                                                 HFW)
        res = [x.split(' ') for x in res]
        res = [[int(x) for x in y] for y in res]
        hfw_features = list(zip(*res))

    # Creating a list of features for each token
    all_res = []
    print("finished init")
    for i, (token, pre_context, post_context) in enumerate(tokens_context):
        if i % 100 == 0: print(i)  # progress indicator every 100 tokens
        lemma = lemmas[i]
        res = []
        if USE_MORPH_DICT:
            # Binary suffix/prefix indicators plus the batch dictionary features.
            res += [int(lemma.endswith(suffix)) for suffix in SUFFIXES]
            res += [int(lemma.startswith(prefix)) for prefix in PREFIXES]
            res.append(int(form_ident.is_dual_vn(lemma)))
            res.extend(dict_features[i])
        if USE_HFW:
            res.extend(hfw_features[i])
        if USE_FUNCWORDS:
            # For each function-word set: one flag for the token immediately
            # before and one for the token immediately after.
            for funcwords in FUNCWORDS:
                if pre_context and pre_context[0].lower() in funcwords:
                    res.append(1)
                else:
                    res.append(0)
                if post_context and post_context[0].lower() in funcwords:
                    res.append(1)
                else:
                    res.append(0)
        if USE_LIGHTVERBS:
            # For each light-verb set: flags checking the two tokens before
            # and the two tokens after the current token.
            for lightverbs in LIGHTVERBS:
                if ((pre_context and pre_context[0].lower() in lightverbs) or
                        (len(pre_context) > 1 and
                         pre_context[1].lower() in lightverbs)):
                    res.append(1)
                else:
                    res.append(0)
                if ((post_context and post_context[0].lower() in lightverbs) or
                        (len(post_context) > 1 and
                         post_context[1].lower() in lightverbs)):
                    res.append(1)
                else:
                    res.append(0)

        all_res.append(res)

    # Converting to numpy matrix
    fmat = np.array(all_res)
    with open(FMAT_PATH, 'wb') as f:
        pickle.dump(fmat, f)