def prepare_data():
    """Load all unlabeled posts and preprocess them in parallel.

    A post counts as unlabeled when its ``ID_Post`` does not occur in the
    ``Annotations`` table. Every selected ``(Headline, Body)`` row is passed
    to ``preprocess`` in a pool of worker processes.

    Returns:
        list: one ``preprocess`` result per unlabeled post, in query order.
    """
    logger.debug('Fetching unlabeled posts from database')
    sql = '''
        SELECT Headline, Body
        FROM Posts
        WHERE ID_Post NOT IN (
            SELECT DISTINCT ID_Post
            FROM Annotations
        )
    '''
    con = sqlite3.connect(conf.CORPUSDB)
    try:
        cursor = con.execute(sql)
        pool = multiprocessing.Pool()
        try:
            # pool.map consumes the whole cursor before it returns, so the
            # connection can safely be closed afterwards.
            posts = pool.map(preprocess, cursor)
        finally:
            # Always release the worker processes, even if preprocessing
            # raised in one of them.
            pool.close()
            pool.join()
    finally:
        con.close()
    return posts
def get_post_documents():
    """Yield one ``TaggedDocument`` per unlabeled post.

    Posts whose ``ID_Post`` does not occur in ``Annotations`` are read in
    chunks of 100000 rows. For each chunk, the text (headline and body joined
    by a space, NULLs replaced by empty strings) is run through ``normalize``
    and then ``micro_tokenize`` in a pool of worker processes.

    Yields:
        TaggedDocument: the token list of a post, tagged with ``[ID_Post]``.
    """
    logger.debug('Fetching unlabeled posts from database')
    sql = '''
        SELECT ID_Post, COALESCE(Headline, '') || ' ' || COALESCE(Body, '')
        FROM Posts
        WHERE ID_Post NOT IN (
            SELECT DISTINCT ID_Post
            FROM Annotations
        )
    '''
    con = sqlite3.connect(conf.CORPUSDB)
    pool = multiprocessing.Pool()
    try:
        cursor = con.execute(sql)
        while True:
            rows = cursor.fetchmany(100000)
            if not rows:
                break
            logger.debug('Normalizing and tokenizing')
            texts = [row[1] for row in rows]
            wordlists = pool.map(micro_tokenize, pool.map(normalize, texts))
            # pool.map preserves input order, so rows and wordlists line up.
            for row, words in zip(rows, wordlists):
                yield TaggedDocument(words, [row[0]])
    finally:
        # Runs on normal exhaustion, on errors, and when the consumer drops
        # the generator early, so workers and the DB handle never leak.
        pool.close()
        pool.join()
        con.close()
    logger.debug('End of generator')
from gensim.models.word2vec import Word2Vec
import numpy
from sklearn.cluster import KMeans

from customlogging import logger
import conf


# Script entry point: load the word2vec model stored under conf.W2V_DIR and
# cluster its word vectors with k-means.
# NOTE(review): `os` and `sys` are used below but are not imported in the
# lines visible here -- confirm they are imported further up in the file.
if __name__ == '__main__':
    w2vmodelfile = os.path.join(conf.W2V_DIR, 'model')
    # Abort with a hint when the model has not been trained yet.
    if not os.path.exists(w2vmodelfile):
        print('Word2vec model file "%s" not found.' % w2vmodelfile)
        print('Did you run train_word2vec.py?')
        sys.exit(1)

    logger.debug('Loading word embedding')
    emb = Word2Vec.load(w2vmodelfile)
    vocab = emb.index2word
    wordvecs = emb.wv.syn0

    # Add an UNK word at the origin of the embedding space: append the token
    # to the vocabulary and a matching all-zero row to the vector matrix, so
    # the UNK vector sits at the last index of both.
    # NOTE(review): if `index2word` hands back the model's own list, this
    # append mutates the loaded model in place -- confirm that is intended.
    vocab.append('UNK')
    wordvecs = numpy.vstack((wordvecs, numpy.zeros(wordvecs.shape[1])))

    # Cluster count, seed and iteration limit all come from conf.
    clusterer = KMeans(n_clusters=conf.BOCID_NCLUSTERS,
                       random_state=conf.SEED,
                       max_iter=conf.BOCID_CLUSTITER,
                       n_jobs=-1)
    logger.debug('Starting clustering')
    # Presumably one cluster ID per row of `wordvecs` (i.e. per vocabulary
    # entry, UNK included) -- verify against the code that follows.
    VC = clusterer.fit_predict(wordvecs)

    logger.debug('Matching words to cluster IDs')
int(y_pred[i])]) return resultrows if __name__ == '__main__': if not os.path.exists(conf.RESULTDB): con_results = sqlite3.connect(conf.RESULTDB) con_results.execute(conf.RESULTDB_SETUP) else: con_results = sqlite3.connect(conf.RESULTDB) cats = get_categories() folds = get_folds() sql = 'INSERT INTO Results VALUES(?, ?, ?, ?, ?, ?)' for method in methodmodules.keys(): logger.debug('-' * 40) logger.debug('Method %s', method) logger.debug('Computing results for %d categories and %d folds...' % (len(cats), len(folds))) jobs = [] for c in cats: for fold in folds: jobs.append([method, c, fold]) # LSTM runs on GPU, where all memory is needed for a single job. Hence, # we need to run each job sequentially. if method == 'LSTM': results = list(itertools.starmap(evaluate, jobs)) # For all other methods, we can spawn parallel processes. else:
SELECT DISTINCT ID_Post FROM Annotations ) ''' r = con.execute(sql) pool = multiprocessing.Pool() posts = pool.map(preprocess, r) return posts if __name__ == '__main__': if not os.path.exists(conf.W2V_DIR): os.mkdir(conf.W2V_DIR) sentences = prepare_data() logger.debug('word2vec training...') logging.basicConfig(format='%(asctime)s [word2vec]: %(message)s', level=logging.INFO) model = word2vec.Word2Vec(sentences, size=conf.W2V_DIMS, window=5, min_count=5, seed=conf.SEED, workers=1, iter=conf.W2V_EPOCHS) model.delete_temporary_training_data( replace_word_vectors_with_normalized=True) outfile = os.path.join(conf.W2V_DIR, 'model') logger.debug('Storing word2vec object to "%s"' % outfile) model.save(fname_or_handle=outfile, separately=None, pickle_protocol=3) logger.debug('Finished.')
def _encode_wordlists(wordlists, n_rows, emb):
    """Turn token lists into a zero-padded, truncated matrix of embedding IDs.

    Returns an ``(n_rows, conf.LSTM_MAXPOSTLEN)`` int32 matrix plus an array
    holding the token count of every post. Words missing from the embedding
    get the ID ``len(emb.vocab)``, the row appended for unknown words.
    """
    maxlen = conf.LSTM_MAXPOSTLEN
    unk_id = len(emb.vocab)
    ids = numpy.zeros((n_rows, maxlen), dtype=numpy.int32)
    lengths = numpy.zeros(n_rows)
    for i, words in enumerate(wordlists):
        # NOTE(review): the stored length is the full token count and is not
        # clipped to LSTM_MAXPOSTLEN -- confirm the model copes with lengths
        # larger than the padded width.
        lengths[i] = len(words)
        for j, w in enumerate(words):
            if j >= maxlen:
                break
            ids[i, j] = emb.vocab[w].index if w in emb else unk_id
    return ids, lengths


def _train_epoch(sess, model, X, y, X_lengths):
    """Run one training epoch; return (total loss, predictions, targets)."""
    batchsize = conf.LSTM_BATCHSIZE
    totalloss = 0.0
    predictions = []
    true = []
    for inp, out, leng in stratified_batch_generator(X, y, X_lengths,
                                                     batchsize):
        # The graph is fed fixed-size batches, so a short final batch is
        # padded with zero rows; their outputs are dropped again below.
        extra = batchsize - len(inp)
        if extra > 0:
            inp = numpy.vstack((inp, numpy.zeros((extra, inp.shape[1]))))
            out = numpy.vstack((out, numpy.zeros((extra, out.shape[1]))))
            leng = numpy.concatenate((leng, numpy.zeros(extra)))
        _, loss, pred = sess.run(
            [model.minimize, model.cross_entropy, model.prediction],
            {
                model.data: inp,
                model.target: out,
                model.lengths: leng,
                model.dropout_lstm: conf.LSTM_DROPOUT_LSTM,
                model.dropout_fully: conf.LSTM_DROPOUT_FULLY,
            }
        )
        pred = list(numpy.argmax(pred, axis=1))
        true.extend(out)
        if extra > 0:
            pred = pred[:-extra]
            true = true[:-extra]
        predictions.extend(pred)
        totalloss += loss
    return totalloss, predictions, true


def _predict_in_batches(sess, model, data, lengths):
    """Predict class indices for ``data`` batch by batch, dropout disabled."""
    batchsize = conf.LSTM_BATCHSIZE
    predictions = []
    for start in range(0, len(data), batchsize):
        inp = data[start:start + batchsize]
        leng = lengths[start:start + batchsize]
        # Pad a short final batch to full size and strip the padding's
        # predictions afterwards.
        extra = batchsize - len(inp)
        if extra > 0:
            inp = numpy.vstack((inp, numpy.zeros((extra, inp.shape[1]))))
            leng = numpy.concatenate((leng, numpy.zeros(extra)))
        pred = sess.run(
            model.prediction,
            {
                model.data: inp,
                model.lengths: leng,
                model.dropout_lstm: 1.0,
                model.dropout_fully: 1.0,
            }
        )
        pred = list(numpy.argmax(pred, axis=1))
        if extra > 0:
            pred = pred[:-extra]
        predictions.extend(pred)
    return predictions


def evaluate(cat, fold, txt_train, txt_test, y_train, y_test):
    """Train the LSTM classifier on one fold and return test predictions.

    The training texts are preprocessed, mapped to word2vec embedding IDs
    and split 80/20 (stratified) into training and validation data. After
    every epoch the model is scored on the validation and the test set; the
    test predictions of the epoch with the best validation F1 are kept.
    Training always runs for ``conf.LSTM_EPOCHS`` epochs. A plot of the loss
    and score curves is written to ``conf.LSTM_PLOTDIR``.

    Args:
        cat: category identifier, used only in the plot file name.
        fold: fold number, used only in the plot file name.
        txt_train: training texts, passed to ``preprocess``.
        txt_test: test texts, passed to ``preprocess``.
        y_train: numpy array of training labels.
        y_test: numpy array of test labels.

    Returns:
        list: predicted class indices for the test set, taken from the epoch
        with the highest validation F1 (empty if no epoch was run).
    """
    pool = multiprocessing.Pool()
    try:
        wordlists_train = pool.map(preprocess, txt_train)
        wordlists_test = pool.map(preprocess, txt_test)
    finally:
        # Release the workers even if preprocessing fails.
        pool.close()
        pool.join()

    emb = Word2Vec.load(os.path.join(conf.W2V_DIR, 'model'))

    # add point at origin for unknown words
    emb.wv.syn0 = numpy.vstack((
        emb.wv.syn0,
        numpy.zeros(emb.wv.syn0.shape[1], dtype=numpy.float32)))

    # replace words with embedding IDs, zero-padding and truncation
    X, X_lengths = _encode_wordlists(wordlists_train, len(y_train), emb)
    test_X, test_lengths = _encode_wordlists(wordlists_test, len(y_test), emb)

    # one-hot encode y
    enc = OneHotEncoder()
    y = enc.fit_transform(y_train.reshape(-1, 1)).todense()
    test_y = enc.transform(y_test.reshape(-1, 1)).todense()

    # split training data 80/20 into training and validation data for early
    # stopping
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2,
                                      random_state=conf.SEED)
    train_i, vali_i = next(splitter.split(X, y_train))
    X_vali = X[vali_i, :]
    y_vali = y[vali_i, :]
    vali_lengths = X_lengths[vali_i]
    X = X[train_i, :]
    y = y[train_i, :]
    X_lengths = X_lengths[train_i]

    numpy.random.seed(conf.SEED)
    tf.set_random_seed(conf.SEED)

    model = LSTMModel(emb, y.shape[1])

    # Creating the session with a tf.ConfigProto that sets
    # inter_op_parallelism_threads=1 and intra_op_parallelism_threads=1, in
    # combination with
    #     export CUDA_VISIBLE_DEVICES=""
    # in the shell, disables all parallelism, which leads to reproducible
    # results but takes a very long time to complete.
    sess = tf.Session()
    try:
        sess.run(model.init_op)

        losses = []
        f1s_train = []
        precisions_vali = []
        recalls_vali = []
        f1s_vali = []
        precisions_test = []
        recalls_test = []
        f1s_test = []
        best_vali_f1 = -1.0
        best_y_pred = []
        for epoch in range(conf.LSTM_EPOCHS):
            totalloss, predictions, true = _train_epoch(
                sess, model, X, y, X_lengths)
            losses.append(totalloss)
            true = numpy.argmax(true, axis=1)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore',
                                      category=UndefinedMetricWarning)
                f1s_train.append(f1_score(true, predictions))

            # validation set scores
            # The metric functions take (y_true, y_pred); passing them the
            # other way round swaps precision and recall.
            predictions = _predict_in_batches(sess, model, X_vali,
                                              vali_lengths)
            true = numpy.argmax(y_vali, axis=1)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore',
                                      category=UndefinedMetricWarning)
                precisions_vali.append(precision_score(true, predictions))
                recalls_vali.append(recall_score(true, predictions))
                f1s_vali.append(f1_score(true, predictions))

            # test set scores
            predictions = _predict_in_batches(sess, model, test_X,
                                              test_lengths)
            true = numpy.argmax(test_y, axis=1)
            with warnings.catch_warnings():
                warnings.simplefilter('ignore',
                                      category=UndefinedMetricWarning)
                precisions_test.append(precision_score(true, predictions))
                recalls_test.append(recall_score(true, predictions))
                f1s_test.append(f1_score(true, predictions))

            # "early stopping" (not really stopping): remember the test
            # predictions of the epoch with the best validation F1
            if f1s_vali[-1] > best_vali_f1:
                best_y_pred = predictions
                best_vali_f1 = f1s_vali[-1]
                logger.debug('New best Validation F1: %f', best_vali_f1)
            logger.debug('Epoch %3d of %3d, total loss = %.4f, ' +
                         'F1_train = %.4f, F1_test = %.4f',
                         epoch + 1, conf.LSTM_EPOCHS, totalloss,
                         f1s_train[-1], f1s_test[-1])

        os.makedirs(conf.LSTM_PLOTDIR, exist_ok=True)
        plotfile = os.path.join(conf.LSTM_PLOTDIR,
                                'plot_%s_%d.png' % (cat, fold))
        plot_losses_f1s(
            losses,
            f1s_train,
            precisions_vali,
            recalls_vali,
            f1s_vali,
            precisions_test,
            recalls_test,
            f1s_test,
            plotfile
        )
    finally:
        # Free the session and graph even on failure, so the next job starts
        # from a clean default graph.
        sess.close()
        del model
        tf.reset_default_graph()
    return best_y_pred
logger.debug('End of generator') if __name__ == '__main__': logging.basicConfig(format='%(asctime)s [doc2vec] : %(message)s', level=logging.INFO) d2v = Doc2Vec(dm=1, size=conf.D2V_DIMS, negative=5, iter=1, alpha=conf.D2V_ALPHA, seed=conf.SEED, workers=1) logger.debug('Building doc2vec vocabulary...') d2v.build_vocab(get_post_documents()) logger.debug('doc2vec training...') alpha = conf.D2V_ALPHA alpha_delta = (conf.D2V_ALPHA - conf.D2V_MINALPHA) / conf.D2V_EPOCHS for i in range(conf.D2V_EPOCHS): logger.debug('Epoch %d of %d (alpha = %f)', i + 1, conf.D2V_EPOCHS, alpha) d2v.alpha = alpha d2v.train(get_post_documents(), report_delay=10.0) alpha -= alpha_delta if not os.path.exists(conf.D2V_DIR): os.mkdir(conf.D2V_DIR) outfile = os.path.join(conf.D2V_DIR, 'model')