Example #1
from src.data import debates
# Assumed import path: Example #2 shows create_tokenizer living in src.models.models;
# create_embedding is presumed to sit alongside it.
from src.models.models import create_tokenizer, create_embedding


def main():
    # Each cross-validation split is a (test_debate, test_sentences, train_sentences) tuple.
    data_sets = debates.get_for_crossvalidation()

    # Fit the tokenizer on the test and train sentences of the first split.
    texts = [sentence.text for sentence in data_sets[0][1]]
    texts.extend([sentence.text for sentence in data_sets[0][2]])

    tokenizer, word_index = create_tokenizer(texts)
    create_embedding('/usr/users/oliverren/meng/check-worthy/data/glove/glove.6B.50d.txt', word_index)
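The create_tokenizer and create_embedding helpers are defined elsewhere in the repository. A minimal sketch of what they could look like, assuming a Keras Tokenizer and the standard plain-text GloVe format; the bodies and the dim parameter are illustrative, only the call signatures mirror the snippet above.

import numpy as np
from keras.preprocessing.text import Tokenizer

def create_tokenizer(texts):
    # Fit a word-level tokenizer and expose its word -> index mapping.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    return tokenizer, tokenizer.word_index

def create_embedding(glove_path, word_index, dim=50):
    # Read GloVe vectors ("word v1 v2 ... vN" per line) into a dict.
    vectors = {}
    with open(glove_path) as f:
        for line in f:
            parts = line.split()
            vectors[parts[0]] = np.asarray(parts[1:], dtype='float32')
    # Row i holds the vector of the word with index i (index 0 is reserved for padding).
    matrix = np.zeros((len(word_index) + 1, dim))
    for word, i in word_index.items():
        if word in vectors:
            matrix[i] = vectors[word]
    return matrix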
Example #2
import sys
sys.path.append('/usr/users/oliverren/meng/check-worthy')

import numpy as np
from sklearn.metrics import (average_precision_score, precision_score,
                             recall_score, roc_auc_score)
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint

from src.data import debates
from src.models import models
from src.stats import rank_metrics as rm
from src.features.feature_sets import get_serialized_pipeline

data_sets = debates.get_for_crossvalidation()

# Fit the tokenizer on every available sentence: all debates plus all speeches.
texts = [
    sentence.text for sentence in debates.read_all_debates(sep_by_deb=False)
]
texts.extend([
    sentence.text for sentence in debates.read_all_speeches(sep_by_deb=False)
])

tokenizer, word_index = models.create_tokenizer(texts)

MAX_SENTENCE_LENGTH = max(len(sentence.split()) for sentence in texts)

folder = 'speeches/'
# Experiments: one hidden layer, dropout 0.3, softmax activation,
# fixed vs. trainable (dynamic) word embeddings (300d GloVe); see the sketch below.
results = []
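A minimal sketch of the configuration described in the comment above, assuming a Keras Sequential model fed with an embedding matrix built as in Example #1; build_model, the hidden-layer size, and the optimizer are illustrative assumptions rather than the repository's models module.

from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Flatten

def build_model(embedding_matrix, max_len, trainable_embeddings=False):
    # "Fixed vs dynamic" embeddings: toggle whether the 300d GloVe weights are updated.
    vocab_size, dim = embedding_matrix.shape
    model = Sequential()
    model.add(Embedding(vocab_size, dim, weights=[embedding_matrix],
                        input_length=max_len, trainable=trainable_embeddings))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))   # the single hidden layer (size is illustrative)
    model.add(Dropout(0.3))
    model.add(Dense(2, activation='softmax'))  # check-worthy vs. not check-worthy
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

Inputs would be padded to MAX_SENTENCE_LENGTH, e.g. pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_SENTENCE_LENGTH), matching the pad_sequences import at the top of this example.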
Example #3
import copy
import json
import os

# Project-specific helpers (DEBATES, CONFIG, read_debates, get_experimential_pipeline,
# get_for_crossvalidation, run, get_all_metrics, counting_feat, knn_similarity)
# come from the repository's own modules in the full script.
serialize = False
if serialize:
    # Compute every feature once over all debates and cache the non-trainable
    # ones on disk, one JSON dump per feature name.
    all_debates = []
    trainable_feats = counting_feat.BagOfTfIDF.FEATS + knn_similarity.TrainSearch.FEATS

    for debate in DEBATES:
        all_debates += read_debates(debate)
    all_feats = get_experimential_pipeline(all_debates, to_matrix=False).fit_transform(all_debates)
    for feat_name in all_feats[0].features.keys():
        if feat_name in trainable_feats:
            continue
        feat_dict = {}
        for _x in all_feats:
            feat_dict[str(_x.id) + _x.debate.name] = _x.features[feat_name]
        # Merge with any previously dumped values instead of overwriting them.
        if os.path.isfile(CONFIG['features_dump_dir'] + feat_name):
            old_dict = json.loads(open(CONFIG['features_dump_dir'] + feat_name).read())
        else:
            old_dict = {}
        old_dict.update(feat_dict)
        with open(CONFIG['features_dump_dir'] + feat_name, "w") as out:
            out.write(json.dumps(old_dict))
else:
    # Cross-validation over debates: train on the remaining debates, evaluate on
    # the held-out one, then report metrics over all splits.
    results = []
    for test_deb, test, train in get_for_crossvalidation():
        split_results = run(test, train)
        results.append(split_results)
        get_all_metrics(copy.deepcopy([split_results]), agreement=1)
    get_all_metrics(results, agreement=1)
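The serialize branch above writes one JSON dump per feature name; Example #2 imports get_serialized_pipeline, which presumably reads these dumps back instead of recomputing the features. A minimal sketch of reading one dump, where load_cached_feature and the 'sentiment' feature name are hypothetical:

import json
import os

def load_cached_feature(dump_dir, feat_name):
    # Hypothetical helper: load one per-feature JSON dump written above,
    # keyed by str(sentence.id) + debate.name.
    path = os.path.join(dump_dir, feat_name)
    if not os.path.isfile(path):
        return {}
    with open(path) as f:
        return json.load(f)

# Usage sketch:
# cache = load_cached_feature(CONFIG['features_dump_dir'], 'sentiment')
# value = cache[str(sentence.id) + sentence.debate.name]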