def main():
    """Fit a tokenizer on the first cross-validation split and build a GloVe embedding.

    Pools the sentence text from parts [1] and [2] of the first CV split
    (presumably the test/train halves — confirm against `get_for_crossvalidation`),
    fits a tokenizer on them, and materializes a 50d GloVe embedding matrix for
    the resulting vocabulary.
    """
    splits = debates.get_for_crossvalidation()
    first_split = splits[0]
    # Gather the raw sentence strings from both parts of the first split.
    sentences = [s.text for part in (first_split[1], first_split[2]) for s in part]
    tokenizer, word_index = create_tokenizer(sentences)
    create_embedding('/usr/users/oliverren/meng/check-worthy/data/glove/glove.6B.50d.txt', word_index)
import sys

# Make the project root importable before pulling in `src.*` modules.
sys.path.append('/usr/users/oliverren/meng/check-worthy')

import numpy as np
from sklearn.metrics import (average_precision_score, precision_score,
                             recall_score, roc_auc_score)
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint

from src.data import debates
from src.models import models
from src.stats import rank_metrics as rm
from src.features.feature_sets import get_serialized_pipeline

data_sets = debates.get_for_crossvalidation()

# Pool sentence text from the entire corpus (debates + speeches) so the
# tokenizer's vocabulary covers everything the models may see.
texts = [sentence.text for sentence in debates.read_all_debates(sep_by_deb=False)]  # debates
# Pass a generator to extend() — no need to build an intermediate list.
texts.extend(sentence.text for sentence in debates.read_all_speeches(sep_by_deb=False))  # speeches

tokenizer, word_index = models.create_tokenizer(texts)

# Longest sentence (in whitespace tokens) across the corpus; used for padding.
# max() over a generator avoids materializing a throwaway list.
MAX_SENTENCE_LENGTH = max(len(sentence.split()) for sentence in texts)

# Output subdirectory for this experiment run.
folder = 'speeches/'
# tests: one hidden layer, dropout 0.3, softmax activation, fixed vs dynamic word embedding (300d glove)
results = []
# Toggle: True = serialize (cache) per-sentence feature values to JSON files;
# False = run the cross-validation evaluation.
serialize = False

if serialize:
    # Feature-serialization pass: compute every feature over all debates and
    # dump each non-trainable feature to its own JSON file, keyed by
    # "<sentence id><debate name>".
    all_debates = []
    trainable_feats = counting_feat.BagOfTfIDF.FEATS + knn_similarity.TrainSearch.FEATS
    for debate in DEBATES:
        all_debates += read_debates(debate)
    all_feats = get_experimential_pipeline(all_debates, to_matrix=False).fit_transform(all_debates)

    for feat_name in all_feats[0].features.keys():
        if feat_name in trainable_feats:
            # Trainable features depend on the train split, so caching them
            # across splits would be incorrect — skip.
            continue
        feat_dict = {}
        for _x in all_feats:
            feat_dict[str(_x.id) + _x.debate.name] = _x.features[feat_name]
        dump_path = CONFIG['features_dump_dir'] + feat_name
        if os.path.isfile(dump_path):
            # BUGFIX: use a context manager — the original leaked the file
            # handle opened by `json.loads(open(...).read())`.
            with open(dump_path) as dump_in:
                old_dict = json.loads(dump_in.read())
        else:
            old_dict = {}
        # Merge freshly computed values over any previously serialized ones.
        old_dict.update(feat_dict)
        with open(dump_path, "w") as out:
            out.write(json.dumps(old_dict))
else:
    # Cross-validation evaluation: run each split, printing per-split metrics
    # as we go and the aggregate metrics at the end.
    results = []
    for test_deb, test, train in get_for_crossvalidation():
        split_results = run(test, train)
        results.append(split_results)
        # Deep-copy so metric computation cannot mutate the stored results.
        get_all_metrics(copy.deepcopy([split_results]), agreement=1)
    get_all_metrics(results, agreement=1)