import nltk
from prepare_data import get_sentences, normalize_special_characters, parse_sents, remove_articles
import os
from code.lib import utils
import argparse
from code.lib.eval_predicted_text import eval_predicted_text
import codecs
from code.lib.Tree import Tree
#from extract_features import np_selector
from collections import defaultdict

# Project settings are read once, at import time.
SETTINGS = utils.read_settings()
# Both paths are built from entries in the 'paths' section of the settings.
MANUAL_DATA_PATH = os.path.join(SETTINGS.get('paths', 'dataCorrector'), 'manual')
LOG_RESULTS_PATH = os.path.join(SETTINGS.get('paths', 'logModelResults'), 'bnc_manual_comparison')
# NOTE(review): presumably the number of article positions in the manually
# annotated data -- confirm where this count comes from.
TOTAL_ART_POSITIONS = 163


def compare_multiple_predictions(orig_sent, pred_sents):
    """
    Render a condensed representation of all predictions for one original
    sentence:
    "sell-through market, as [the, .,.,a/an,X] retail sector is ..."

    To a large extent this duplicates code from
    code.lib.eval_predicted_text :(

    Args:
        orig_sent: the original sentence; it is tokenized with
            ``nltk.word_tokenize``.
        pred_sents: iterable of predicted sentences, each tokenized the same
            way.

    NOTE(review): only the tokenization step is visible in this part of the
    file; ``ARTICLE_TOKENS`` and ``error_repr`` are not used in the visible
    code, so the comparison logic presumably follows -- confirm.
    """
    ARTICLE_TOKENS = ('a', 'an', 'a/an', 'the')
    error_repr = []
    # The parameter name is rebound: from here on ``orig_sent`` is a token list.
    orig_sent = nltk.word_tokenize(orig_sent)
    predict_sents = [
        nltk.word_tokenize(predict_sent) for predict_sent in pred_sents
    ]
import numpy as np
import pandas
import os
from code.lib.utils import read_settings, save_csr_matrix
from .convert_bools import convert_bools
from .embeddings import postprocess_embeddings
from sklearn.feature_extraction import DictVectorizer
import scipy.sparse
import pickle

# Project settings are read once, at import time.
SETTINGS = read_settings()

# NOTE(review): judging by the "_as_list" names these are features whose value
# is a list of categories rather than a single one -- confirm against the
# feature extractor.
CATEGORICAL_LIST_FEATURES = (
    'a_hypernyms',
    'b_pos_after_head_as_list',
    'b_pos_before_head_as_list',
    'b_words_after_head_as_list',
    'b_words_after_np_as_list',
    'b_words_before_head_as_list',
    'b_words_before_np_as_list'
)
TRAIN_DATASET_NAME, HELDOUT_DATASET_NAME, TEST_DATASET_NAME = 'train', 'heldout', 'test'
# NOTE(review): presumably feature values seen fewer times than this cutoff
# are replaced by OUT_OF_VOCABULARY_TOKEN -- confirm at the point of use.
RARE_FEATURE_VALUE_CUTOFF = 5
OUT_OF_VOCABULARY_TOKEN = '-OOV-'


def postproces_dataframes(datasets):
    """Drop the "fake" (informative-only) columns from every dataset.

    Every column whose name starts with an underscore is removed from each
    data frame. ``datasets`` is modified in place: each value is replaced by
    the data frame without those columns.

    Args:
        datasets: dict mapping a dataset name to a pandas DataFrame.

    ``assert_columns_match`` (not defined in the visible part of this file)
    is called on ``datasets`` before and after the clean-up.
    """
    assert_columns_match(datasets)
    print("Cleaning fake (informative) features...")
    # Only values of existing keys are reassigned, so the dict does not change
    # size while it is being iterated.
    for dataset_type, dataset in datasets.items():
        print("- for dataset {}".format(dataset_type))
        fake_columns = [
            f_name for f_name in dataset.columns if f_name.startswith("_")
        ]
        # Use the ``columns`` keyword: passing the axis positionally
        # (``drop(labels, 1)``) raises TypeError on pandas >= 2.0.
        datasets[dataset_type] = dataset.drop(columns=fake_columns)
    assert_columns_match(datasets)
    print("Done")