Example #1
import nltk
from prepare_data import get_sentences, normalize_special_characters, parse_sents, remove_articles
import os
from code.lib import utils
import argparse
from code.lib.eval_predicted_text import eval_predicted_text
import codecs
from code.lib.Tree import Tree
#from extract_features import np_selector
from collections import defaultdict

SETTINGS = utils.read_settings()

MANUAL_DATA_PATH = os.path.join(SETTINGS.get('paths', 'dataCorrector'),
                                'manual')
LOG_RESULTS_PATH = os.path.join(SETTINGS.get('paths', 'logModelResults'),
                                'bnc_manual_comparison')
TOTAL_ART_POSITIONS = 163


def compare_multiple_predictions(orig_sent, pred_sents):
    """
    Vykresli smrsklou reprezentaci vsech predikci pro jednu originalni vetu: "sell-through market, as [the, .,.,a/an,X] retail sector is ..."
    Do znacne miry duplikace kodu z code.lib.eval_predicted_text :(
    """
    ARTICLE_TOKENS = ('a', 'an', 'a/an', 'the')
    error_repr = []
    orig_sent = nltk.word_tokenize(orig_sent)
    predict_sents = [
        nltk.word_tokenize(predict_sent) for predict_sent in pred_sents
    ]
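    # The rest of this function is not shown in the original listing; a minimal
    # sketch of the remainder, assuming the original and predicted token
    # sequences are aligned position by position (the real code may realign
    # tokens around inserted or deleted articles instead):
    for position, orig_token in enumerate(orig_sent):
        predicted = [s[position] if position < len(s) else 'X' for s in predict_sents]
        if orig_token.lower() in ARTICLE_TOKENS or any(t.lower() in ARTICLE_TOKENS for t in predicted):
            error_repr.append('[{}]'.format(', '.join(predicted)))  # collapse the article slot
        else:
            error_repr.append(orig_token)
    return ' '.join(error_repr)
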
import numpy as np
import pandas
import os
from code.lib.utils import read_settings, save_csr_matrix
from .convert_bools import convert_bools
from .embeddings import postprocess_embeddings
from sklearn.feature_extraction import DictVectorizer
import scipy.sparse
import pickle


SETTINGS = read_settings()
CATEGORICAL_LIST_FEATURES = (
    'a_hypernyms',
    'b_pos_after_head_as_list', 'b_pos_before_head_as_list', 'b_words_after_head_as_list',
    'b_words_after_np_as_list', 'b_words_before_head_as_list', 'b_words_before_np_as_list'
)
TRAIN_DATASET_NAME, HELDOUT_DATASET_NAME, TEST_DATASET_NAME = 'train', 'heldout', 'test'
RARE_FEATURE_VALUE_CUTOFF = 5
OUT_OF_VOCABULARY_TOKEN = '-OOV-'


def postproces_dataframes(datasets):
    assert_columns_match(datasets)

    print("Cleaning fake (informative) features...")
    for dataset_type, dataset in datasets.items():
        print("- for dataset {}".format(dataset_type))
        datasets[dataset_type] = dataset.drop(
            [f_name for f_name in dataset.columns if f_name.startswith("_")], axis=1)
    assert_columns_match(datasets)
    print("Done")