def main():
    # Getting required data
    with open(TARGETS_PATH, "rb") as f:
        target_array, scores = pickle.load(f)
    target_list = target_array.tolist()
    with open(FMAT_PATH, "rb") as f:
        fmat = pickle.load(f)
    with open(PASSAGES_PATH, "rb") as f:
        passages = pickle.load(f)
    with open(TOKENS_FMAT, "rb") as f:
        tokens_fmat = pickle.load(f)
    passages = passages[:NUM_PASSAGES]
    terminals, token_labels = tokeneval.get_terminals_labels(passages)
    tokens = [x.text for x in terminals]

    # Running through random parameter settings
    # for i, params in enumerate(params_generator(NUM_SAMPLING)):
    for i, params in enumerate(PARAMS):
        sys.stderr.write('{} {}\n'.format(METHOD, i))
        clas, _, _ = classify.self_train_classifier(
            fmat, scores, target_array, params, method=METHOD,
            c_param=CLS_PRM, nu_param=CLS_PRM, learn_rate=CLS_PRM,
            n_estimators=500)
        target_labels = [int(x >= classify.PRE_LABELS_THRESH) for x in scores]
        target_labels += list(classify.predict_labels(clas, fmat[len(scores):]))
        stats = tokeneval.evaluate_with_classifier(tokens, token_labels,
                                                   target_list, tokens_fmat, clas)
        print("\t".join([str(x) for x in params] + [str(len(x)) for x in stats]))
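# The main() above relies on imports and module-level constants defined
# elsewhere in the original script (TARGETS_PATH, FMAT_PATH, PASSAGES_PATH,
# TOKENS_FMAT, NUM_PASSAGES, METHOD, PARAMS, CLS_PRM). A minimal sketch of
# what that preamble could look like -- every path and value below is a
# placeholder assumption, not the original setting:
#
#   import pickle
#   import sys
#
#   import classify    # project-local classification helpers
#   import tokeneval   # project-local token-level evaluation helpers
#
#   TARGETS_PATH = "data/targets.pkl"     # pickled (target_array, scores) pair
#   FMAT_PATH = "data/fmat.pkl"           # pickled type-level feature matrix
#   PASSAGES_PATH = "data/passages.pkl"   # pickled annotated passages
#   TOKENS_FMAT = "data/tokens_fmat.pkl"  # pickled token-level feature matrix
#   NUM_PASSAGES = 100                    # number of passages to evaluate on
#   METHOD = "svm"                        # classifier family passed to classify
#   CLS_PRM = 1.0                         # shared classifier hyperparameter
#   PARAMS = [(0.1,), (1.0,)]             # self-training settings to sweep
#
#   if __name__ == '__main__':
#       main()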
def main():
    with open(PASSAGES_PATH, "rb") as f:
        passages = pickle.load(f)
    passages = passages[:NUM_PASSAGES]
    terminals, token_labels = tokeneval.get_terminals_labels(passages)
    tokens = [x.text for x in terminals]

    clas = classify.train_classifier(FMAT[:len(LABELS)], LABELS, METHOD,
                                     c_param=PARAM, nu_param=PARAM,
                                     learn_rate=PARAM, n_estimators=500)
    if TOKENS_FMAT is not None:  # use token-level evaluation, not type-level
        stats = tokeneval.evaluate_with_classifier(tokens, token_labels,
                                                   TARGETS, TOKENS_FMAT, clas)
    else:
        target_labels = LABELS.tolist()
        target_labels += classify.predict_labels(clas, FMAT[len(LABELS):]).tolist()
        stats = tokeneval.evaluate_with_type(tokens, token_labels,
                                             TARGETS, target_labels)
    print("\t".join(str(len(x)) for x in stats))
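# Unlike the self-training sweep above, this variant assumes FMAT, LABELS,
# TARGETS, TOKENS_FMAT, PARAM and METHOD are already bound at module level,
# and falls back to type-level evaluation when no token feature matrix is
# supplied. A hypothetical setup block, with placeholder paths and values:
#
#   import pickle
#
#   import classify
#   import tokeneval
#
#   with open("data/fmat.pkl", "rb") as f:      # placeholder path
#       FMAT = pickle.load(f)
#   with open("data/targets.pkl", "rb") as f:   # placeholder path
#       TARGETS, LABELS = pickle.load(f)        # LABELS: numpy label array
#   TOKENS_FMAT = None    # set to a token feature matrix for token evaluation
#   PARAM = 1.0           # placeholder classifier hyperparameter
#   METHOD = "svm"        # placeholder classifier family
#   PASSAGES_PATH = "data/passages.pkl"
#   NUM_PASSAGES = 100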
def main():
    with open(PASSAGES_PATH, "rb") as f:
        passages = pickle.load(f)
    passages = passages[:NUM_PASSAGES]
    terminals, token_labels = tokeneval.get_terminals_labels(passages)
    tokens_context = tokeneval.get_context(terminals, context=2)
    tokens = [x[0] for x in tokens_context]
    lemmas = [tokeneval.lemmatize(token, TARGETS) for token in tokens]
    lemmas_tuples = [(lemma,) for lemma in lemmas]
    form_ident = lex.FormIdentifier(COLLINS_PATH, WIKT_PATH)

    # First calculate all features which are computed together
    if USE_MORPH_DICT:
        res = scene_features.extract_dict_features(lemmas_tuples, COLLINS_PATH)
        res = [x.split(' ') for x in res]
        res = [[int(x) for x in y] for y in res]
        dict_features = list(zip(*res))
    if USE_HFW:
        res = scene_features.extract_hfw_dict_features(lemmas_tuples,
                                                       COLLINS_PATH, HFW)
        res = [x.split(' ') for x in res]
        res = [[int(x) for x in y] for y in res]
        hfw_features = list(zip(*res))

    # Creating a list of features for each token
    all_res = []
    print("finished init")
    for i, (token, pre_context, post_context) in enumerate(tokens_context):
        if i % 100 == 0:
            print(i)
        lemma = lemmas[i]
        res = []
        if USE_MORPH_DICT:
            res += [int(lemma.endswith(suffix)) for suffix in SUFFIXES]
            res += [int(lemma.startswith(prefix)) for prefix in PREFIXES]
            res.append(int(form_ident.is_dual_vn(lemma)))
            res.extend(dict_features[i])
        if USE_HFW:
            res.extend(hfw_features[i])
        if USE_FUNCWORDS:
            for funcwords in FUNCWORDS:
                if pre_context and pre_context[0].lower() in funcwords:
                    res.append(1)
                else:
                    res.append(0)
                if post_context and post_context[0].lower() in funcwords:
                    res.append(1)
                else:
                    res.append(0)
        if USE_LIGHTVERBS:
            for lightverbs in LIGHTVERBS:
                if ((pre_context and pre_context[0].lower() in lightverbs) or
                        (len(pre_context) > 1 and
                         pre_context[1].lower() in lightverbs)):
                    res.append(1)
                else:
                    res.append(0)
                if ((post_context and post_context[0].lower() in lightverbs) or
                        (len(post_context) > 1 and
                         post_context[1].lower() in lightverbs)):
                    res.append(1)
                else:
                    res.append(0)
        all_res.append(res)

    # Converting to numpy matrix
    fmat = np.array(all_res)
    with open(FMAT_PATH, 'wb') as f:
        pickle.dump(fmat, f)
def main():
    with open(PASSAGES_PATH, "rb") as f:
        passages = pickle.load(f)
    passages = passages[:NUM_PASSAGES]
    terminals, token_labels = tokeneval.get_terminals_labels(passages)
    tokens_context = tokeneval.get_context(terminals, context=2)
    tokens = [x[0] for x in tokens_context]
    lemmas = [tokeneval.lemmatize(token, TARGETS) for token in tokens]
    lemmas_tuples = [(lemma,) for lemma in lemmas]
    form_ident = lex.FormIdentifier(COLLINS_PATH, WIKT_PATH)

    # First calculate all features which are computed together
    if USE_MORPH_DICT:
        res = features.extract_dict_features(lemmas_tuples, COLLINS_PATH)
        res = [x.split(' ') for x in res]
        res = [[int(x) for x in y] for y in res]
        dict_features = list(zip(*res))
    if USE_HFW:
        res = features.extract_hfw_dict_features(lemmas_tuples,
                                                 COLLINS_PATH, HFW)
        res = [x.split(' ') for x in res]
        res = [[int(x) for x in y] for y in res]
        hfw_features = list(zip(*res))

    # Creating a list of features for each token
    all_res = []
    print("finished init")
    for i, (token, pre_context, post_context) in enumerate(tokens_context):
        if i % 100 == 0:
            print(i)
        lemma = lemmas[i]
        res = []
        if USE_MORPH_DICT:
            res += [int(lemma.endswith(suffix)) for suffix in SUFFIXES]
            res += [int(lemma.startswith(prefix)) for prefix in PREFIXES]
            res.append(int(form_ident.is_dual_vn(lemma)))
            res.extend(dict_features[i])
        if USE_HFW:
            res.extend(hfw_features[i])
        if USE_FUNCWORDS:
            for funcwords in FUNCWORDS:
                if pre_context and pre_context[0].lower() in funcwords:
                    res.append(1)
                else:
                    res.append(0)
                if post_context and post_context[0].lower() in funcwords:
                    res.append(1)
                else:
                    res.append(0)
        if USE_LIGHTVERBS:
            for lightverbs in LIGHTVERBS:
                if ((pre_context and pre_context[0].lower() in lightverbs) or
                        (len(pre_context) > 1 and
                         pre_context[1].lower() in lightverbs)):
                    res.append(1)
                else:
                    res.append(0)
                if ((post_context and post_context[0].lower() in lightverbs) or
                        (len(post_context) > 1 and
                         post_context[1].lower() in lightverbs)):
                    res.append(1)
                else:
                    res.append(0)
        all_res.append(res)

    # Converting to numpy matrix
    fmat = np.array(all_res)
    with open(FMAT_PATH, 'wb') as f:
        pickle.dump(fmat, f)
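# The two feature-extraction variants above are identical except for the
# module supplying the dictionary features (scene_features vs. features).
# Both assume a module-level configuration block; a sketch with placeholder
# values only -- none of these are the original settings:
#
#   import pickle
#
#   import numpy as np
#
#   import features    # or scene_features, depending on the variant
#   import lex         # provides FormIdentifier
#   import tokeneval
#
#   PASSAGES_PATH = "data/passages.pkl"  # pickled annotated passages
#   FMAT_PATH = "data/fmat.pkl"          # output path for the feature matrix
#   COLLINS_PATH = "data/collins.txt"    # Collins dictionary resource
#   WIKT_PATH = "data/wikt.txt"          # Wiktionary resource
#   NUM_PASSAGES = 100
#   TARGETS = ["run", "walk"]                  # placeholder target lemmas
#   SUFFIXES = ("ing", "ion", "ment")          # placeholder suffix features
#   PREFIXES = ("re", "un")                    # placeholder prefix features
#   FUNCWORDS = [{"the", "a"}, {"of", "in"}]   # placeholder function-word sets
#   LIGHTVERBS = [{"take", "make", "have"}]    # placeholder light-verb sets
#   HFW = ["be", "do"]                         # placeholder high-frequency words
#   USE_MORPH_DICT = USE_HFW = USE_FUNCWORDS = USE_LIGHTVERBS = True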