Example #1
import logging

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support

# load_w2v_model, load_nytimes and train are project helpers defined elsewhere.
def main():
    model_file = '../../paper/data/srwe_model/wiki_small.w2v.model'
    nytimes_file = '../gen_data/nytimes/news_corpus'
    model = load_w2v_model(model_file, logging, nparray=True)
    corpus_vec, corpus_label = load_nytimes(nytimes_file, model)
    labels = list(set(corpus_label))
    X_train, X_test, y_train, y_test = train_test_split(corpus_vec, corpus_label, test_size=0.2, random_state=42)
    logging.info('train size: %d, test size:%d' % (len(y_train), len(y_test)))
    clfs = {}
    for label in labels:
        clfs[label] = train(label, X_train, X_test, y_train, y_test)

    y_pred = []
    for each in X_test:
        pred_res = []
        for label in clfs:
            pred_res.append((clfs[label].predict_proba(each.reshape(1, -1))[0][1], label))
        sorted_pred = sorted(pred_res, key=lambda x: x[0], reverse=True)
        y_pred.append(sorted_pred[0][1])
    # precision_recall_fscore_support returns a 4-tuple, not 5; pass labels=
    # explicitly so the per-class rows line up with the label names.
    precision, recall, f_score, support = precision_recall_fscore_support(y_test, y_pred, labels=labels)
    for l, p, r, f in zip(labels, precision, recall, f_score):
        print('%s\t%.4f\t%.4f\t%.4f' % (l, p, r, f))

    precision, recall, f_score, _ = precision_recall_fscore_support(y_test, y_pred, average='macro')
    print('Macro\t%.4f\t%.4f\t%.4f' % (precision, recall, f_score))
    precision, recall, f_score, _ = precision_recall_fscore_support(y_test, y_pred, average='micro')
    print('Micro\t%.4f\t%.4f\t%.4f' % (precision, recall, f_score))
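The train() helper is not shown in this example. A minimal sketch, assuming one binary one-vs-rest LogisticRegression per label, which is consistent with the predict_proba(...)[0][1] lookup in the prediction loop above:

import numpy as np
from sklearn.linear_model import LogisticRegression

def train(label, X_train, X_test, y_train, y_test):
    # Binarize the labels: 1 for the target label, 0 for all others.
    y_train_bin = np.array([1 if y == label else 0 for y in y_train])
    y_test_bin = np.array([1 if y == label else 0 for y in y_test])
    clf = LogisticRegression()
    clf.fit(X_train, y_train_bin)
    # Report held-out accuracy for this label's binary classifier.
    print('%s acc: %.4f' % (label, clf.score(X_test, y_test_bin)))
    return clf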
Example #2
import joblib

# Module-level state filled in lazily by load_models(); the initial values
# here are assumed. load_w2v_model, get_vec_len and fetch_logger are defined
# elsewhere in the project.
clfs = {}
w2v_model = None
dimension = 0

def load_models(labels):
    global clfs, w2v_model, dimension
    path = '/home/zhangbaihan/srwe/evaluation/clf_model/%s.model'
    model_file = '/home/zhangbaihan/paper/data/srwe_model/wiki_small.w2v.100.r.0.00001.s.0.00009.model'
    if not clfs:
        keys = labels.keys()
        fetch_logger.info('loading LR models...')
        for label in keys:
            fetch_logger.info('loading %s.model ...' % label)
            clfs[label] = joblib.load(path % label)
    if not w2v_model:
        fetch_logger.info('loading w2v model...')
        w2v_model = load_w2v_model(model_file, None, nparray=True)
    dimension = get_vec_len(w2v_model)
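How this lazily loaded state might be used is not shown. A hypothetical sketch: classify a list of tokens by averaging their word vectors and ranking labels by positive-class probability, mirroring the prediction loop in Example #1 (text_to_vec and classify are illustrative names, not part of the original code):

import numpy as np

def text_to_vec(words):
    # Average the vectors of in-vocabulary words; zero vector if none match.
    vecs = [w2v_model[w] for w in words if w in w2v_model]
    if not vecs:
        return np.zeros(dimension)
    return np.mean(vecs, axis=0)

def classify(words):
    vec = text_to_vec(words).reshape(1, -1)
    # Highest positive-class probability across the per-label classifiers wins.
    scores = [(clf.predict_proba(vec)[0][1], label) for label, clf in clfs.items()]
    return max(scores)[1]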
Example #3
import logging
import scipy.stats

# load_standard, load_w2v_model and similarity are project helpers defined
# elsewhere in the repository.
def main():
    word_pair, simi = load_standard('./wordsim353_annotator1.txt')
    #model = load_w2v_model('../../paper/word2vec/vec.txt', logging)
    model_path = '../../paper/data/srwe_model/wiki_small.w2v.r.0.001.model'
    model = load_w2v_model(model_path, logging)
    new_simi = []
    for pair in word_pair:
        if pair[0] not in model or pair[1] not in model:
            # Parenthesize the conditional so the message is always formatted.
            logging.error('%s not in vocab.' % (pair[0] if pair[0] not in model else pair[1]))
            new_simi.append(0.0)
            continue
        new_simi.append(similarity(model[pair[0]], model[pair[1]]))
    print(model_path)
    res = scipy.stats.spearmanr(simi, new_simi)
    print(res)
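Example #3 depends on load_standard() and similarity(), neither of which is shown. A minimal sketch, assuming similarity() is cosine similarity and that wordsim353_annotator1.txt holds tab-separated "word1<TAB>word2<TAB>score" lines; the project's actual helpers may differ:

import numpy as np

def similarity(v1, v2):
    # Cosine similarity between two word vectors.
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

def load_standard(path):
    # Returns the word pairs and their human-annotated similarity scores.
    word_pair, simi = [], []
    with open(path) as fin:
        for line in fin:
            w1, w2, score = line.strip().split('\t')
            word_pair.append((w1, w2))
            simi.append(float(score))
    return word_pair, simi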