import logging
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# load_w2v_model, load_nytimes and train are helpers defined elsewhere in this repo.


def main():
    model_file = '../../paper/data/srwe_model/wiki_small.w2v.model'
    nytimes_file = '../gen_data/nytimes/news_corpus'
    model = load_w2v_model(model_file, logging, nparray=True)
    corpus_vec, corpus_label = load_nytimes(nytimes_file, model)
    labels = list(set(corpus_label))
    X_train, X_test, y_train, y_test = train_test_split(
        corpus_vec, corpus_label, test_size=0.2, random_state=42)
    logging.info('train size: %d, test size: %d' % (len(y_train), len(y_test)))

    # Train one binary classifier per label (one-vs-rest).
    clfs = {}
    for label in labels:
        clfs[label] = train(label, X_train, X_test, y_train, y_test)

    # Predict the label whose classifier gives the highest positive-class probability.
    y_pred = []
    for each in X_test:
        pred_res = []
        for label in clfs:
            pred_res.append((clfs[label].predict_proba(each.reshape(1, -1))[0][1], label))
        sorted_pred = sorted(pred_res, key=lambda x: x[0], reverse=True)
        y_pred.append(sorted_pred[0][1])

    # precision_recall_fscore_support returns a 4-tuple, not 5; pass the label
    # list explicitly so the per-class rows line up with the label names.
    present_labels = sorted(set(y_test) | set(y_pred))
    precision, recall, f_score, support = precision_recall_fscore_support(
        y_test, y_pred, labels=present_labels)
    for l, p, r, f in zip(present_labels, precision, recall, f_score):
        print '%s\t%.4lf\t%.4lf\t%.4lf' % (l, p, r, f)
    precision, recall, f_score, support = precision_recall_fscore_support(
        y_test, y_pred, average='macro')
    print 'Macro\t%.4lf\t%.4lf\t%.4lf' % (precision, recall, f_score)
    precision, recall, f_score, support = precision_recall_fscore_support(
        y_test, y_pred, average='micro')
    print 'Micro\t%.4lf\t%.4lf\t%.4lf' % (precision, recall, f_score)
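# train() is not shown above. A minimal sketch of what it is assumed to do in
# the one-vs-rest scheme -- binarize the labels and fit a logistic regression
# for one class -- is given below; LogisticRegression and the binarization are
# assumptions, not necessarily the repo's actual implementation.
from sklearn.linear_model import LogisticRegression


def train(label, X_train, X_test, y_train, y_test):
    # Binarize: 1 for documents carrying this label, 0 for all others.
    y_train_bin = [1 if y == label else 0 for y in y_train]
    y_test_bin = [1 if y == label else 0 for y in y_test]
    clf = LogisticRegression()
    clf.fit(X_train, y_train_bin)
    logging.info('%s test accuracy: %.4f' % (label, clf.score(X_test, y_test_bin)))
    return clf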
import joblib  # older sklearn versions: from sklearn.externals import joblib

# clfs, w2v_model, dimension and fetch_logger are module-level globals;
# load_w2v_model and get_vec_len are helpers defined elsewhere in this repo.


def load_models(labels):
    """Lazily load the per-label LR classifiers and the word2vec model."""
    global clfs, w2v_model, dimension
    path = '/home/zhangbaihan/srwe/evaluation/clf_model/%s.model'
    model_file = '/home/zhangbaihan/paper/data/srwe_model/wiki_small.w2v.100.r.0.00001.s.0.00009.model'
    if not clfs:
        keys = labels.keys()
        fetch_logger.info('loading LR models...')
        for label in keys:
            fetch_logger.info('loading %s.model ...' % label)
            clfs[label] = joblib.load(path % label)
    if not w2v_model:
        fetch_logger.info('loading w2v model...')
        w2v_model = load_w2v_model(model_file, None, nparray=True)
        dimension = get_vec_len(w2v_model)
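# A hypothetical serving-side caller for load_models(): average the word
# vectors of an already-tokenized text and rank labels by each classifier's
# positive-class probability. classify() and the mean-of-vectors document
# representation are illustrative assumptions, not part of this repo.
import numpy as np


def classify(words, labels):
    load_models(labels)
    vecs = [w2v_model[w] for w in words if w in w2v_model]
    if not vecs:
        return None
    doc_vec = np.mean(vecs, axis=0).reshape(1, -1)
    scores = [(clfs[label].predict_proba(doc_vec)[0][1], label) for label in clfs]
    return max(scores)[1]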
import logging
import scipy.stats


def main():
    word_pair, simi = load_standard('./wordsim353_annotator1.txt')
    #model = load_w2v_model('../../paper/word2vec/vec.txt', logging)
    model_path = '../../paper/data/srwe_model/wiki_small.w2v.r.0.001.model'
    model = load_w2v_model(model_path, logging)
    new_simi = []
    for pair in word_pair:
        if pair[0] not in model or pair[1] not in model:
            # Parenthesize the conditional so the message is formatted with
            # whichever word of the pair is out of vocabulary.
            logging.error('%s not in vocab.' % (pair[0] if pair[0] not in model else pair[1]))
            new_simi.append(0.0)
            continue
        new_simi.append(similarity(model[pair[0]], model[pair[1]]))
    print model_path
    # Spearman rank correlation between human judgements and model similarities.
    res = scipy.stats.spearmanr(simi, new_simi)
    print res
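# similarity() and load_standard() are helpers defined elsewhere in this repo.
# Sketches under the usual assumptions -- cosine similarity between embedding
# vectors, and a tab-separated "word1<TAB>word2<TAB>score" layout for the
# wordsim353 file -- might look like this:
import numpy as np


def similarity(v1, v2):
    # Cosine similarity between two embedding vectors.
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))


def load_standard(path):
    # Parse human-annotated word pairs and their similarity scores.
    word_pair, simi = [], []
    with open(path) as f:
        for line in f:
            fields = line.strip().split('\t')
            word_pair.append((fields[0], fields[1]))
            simi.append(float(fields[2]))
    return word_pair, simi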