def train_eval(mode, model_file, descriptions_file, neg_words_mult=2.,
               lbda=50, min_words=50, eval_lines=5000, eval_words=10):
    """Train an entity model and evaluate retrieval on held-out words.

    Each description line is shuffled (with a fixed seed for
    reproducibility); the first ``eval_words`` indices of the first
    ``eval_lines`` lines are held out as evaluation items, and the
    remainder of every line is fed to training.
    """
    model = load_word2vec_model(model_file, mmap='r')
    if mode == 'centroid':
        entity_model = EntityModelCentroid()
    elif mode == 'lr':
        # Cumulative word-frequency table used for negative sampling.
        counts = [model.vocab[w].count for w in model.index2word]
        entity_model = EntityModelLR(np.cumsum(counts), neg_words_mult, lbda)
    else:
        raise Exception('unsupported mode %s' % mode)

    rng = random.Random(1729)
    eval_items = []

    def sampled_word_seqs():
        # Splits each line into a held-out prefix (eval) and a training tail.
        seqs = read_entity_word_seqs(descriptions_file, model, min_words)
        for line_no, (entity, t, word_idxs) in enumerate(seqs):
            rng.shuffle(word_idxs)
            if line_no < eval_lines:
                eval_items.append(
                    (entity, word_idxs[:eval_words], len(word_idxs)))
            # Note: the first eval_words indices are withheld from
            # training on every line, matching the original behavior.
            yield entity, t, word_idxs[eval_words:]

    entity_model.train(model, sampled_word_seqs())
    evaluate_retrieval(model, entity_model, eval_items)
def eval(model_file, lr_entity_file, centroid_entity_file):
    """Interactive prompt comparing LR and centroid entity models.

    Reads queries from stdin (emacs-style line editing) until EOF and
    prints the two models' top entities side by side for each query.
    """
    import readline
    readline.parse_and_bind('set editing-mode emacs')

    model = load_word2vec_model(model_file, mmap='r')
    lr_entity_model = EntityModel.load(lr_entity_file, mmap='r')
    centroid_entity_model = EntityModel.load(centroid_entity_file, mmap='r')
    # Lower-cased forms are used for case-insensitive query matching.
    norm_entities = [(name.lower(), name)
                     for name in lr_entity_model.entities]

    while True:
        try:
            line = raw_input('> ').strip()
        except EOFError:
            break
        words, entities = parse_query(norm_entities, line)
        lr_top = top_entities(model, lr_entity_model, entities, words)
        centroid_top = top_entities(
            model, centroid_entity_model, entities, words)
        for (lr_score, lr_ent), (c_score, c_ent) in zip(lr_top, centroid_top):
            print('%-50s%10.3f | %-50s%10.3f'
                  % (lr_ent, lr_score, c_ent, c_score))
def train_eval(mode, model_file, descriptions_file, neg_words_mult=2.,
               lbda=50, min_words=50, eval_lines=5000, eval_words=10):
    """Train an entity model while holding out words for retrieval eval.

    The first ``eval_lines`` descriptions each contribute their first
    ``eval_words`` shuffled word indices to the evaluation set; every
    line's remaining indices go to training.
    """
    model = load_word2vec_model(model_file, mmap='r')

    def _build_entity_model():
        # One place for the mode -> model-constructor decision.
        if mode == 'centroid':
            return EntityModelCentroid()
        if mode == 'lr':
            bins = np.cumsum(
                [model.vocab[word].count for word in model.index2word])
            return EntityModelLR(bins, neg_words_mult, lbda)
        raise Exception('unsupported mode %s' % mode)

    entity_model = _build_entity_model()
    rng = random.Random(1729)  # fixed seed -> reproducible held-out split
    eval_items = []

    def sampled_word_seqs():
        stream = read_entity_word_seqs(descriptions_file, model, min_words)
        for i, (entity, t, word_idxs) in enumerate(stream):
            rng.shuffle(word_idxs)
            if i < eval_lines:
                eval_items.append(
                    (entity, word_idxs[:eval_words], len(word_idxs)))
            yield entity, t, word_idxs[eval_words:]

    entity_model.train(model, sampled_word_seqs())
    evaluate_retrieval(model, entity_model, eval_items)
def quant(input_file, output_template=None, target_err=0.1, transform=True,
          test_accuracy=None):
    """Quantize a word2vec model and report compression/accuracy stats.

    Optionally writes the quantized vectors (text) and the dequantized
    model to files derived from ``output_template``, and, if
    ``test_accuracy`` is given, scores the dequantized model on that
    question file. A JSON summary is printed to stdout.
    """
    model = load_word2vec_model(input_file, mmap='r')
    (q, pred_bits, zeros, avg_err,
     quant_syn0, dequant_model) = quantize(model, target_err, transform)

    total = quant_syn0.size
    pred_bps = float(pred_bits) / total   # predicted bits per symbol
    avg_zeros = float(zeros) / total      # fraction of zeroed entries

    if output_template is not None:
        # Encode the quantization settings into the output name.
        suffix = 'tr' if transform else 'nt'
        output_filename = '%s.e%.3f.%s' % (output_template, target_err, suffix)
        with open(output_filename + '.txt', 'w') as fout:
            save_vectors(fout, model.index2word, quant_syn0, q)
        dequant_model.save(output_filename + '.model')

    acc = None
    if test_accuracy is not None:
        acc = fast_accuracy(dequant_model.vocab, dequant_model.syn0,
                            test_accuracy, restrict=100000)

    summary = OrderedDict([
        ('q', q),
        ('transform', transform),
        ('pred_bps', float(pred_bps)),
        ('avg_zeros', float(avg_zeros)),
        ('avg_err', float(avg_err)),
        ('accuracy', acc),
    ])
    print(json.dumps(summary))
def load_resources_for_infer_one(domain):
    """Populate the global ``res_for_one`` with everything needed for
    single-item sentiment inference in the given domain.

    Loads, in order: the third-party general opinion lexicon from
    MongoDB, the user-defined aspect tag list, the word2vec model, and
    the pre-built (aspect, opinion) pair polarities for ``domain``.

    Fix: the original used bare string literals as comments — those are
    no-op expression statements, not comments — and they are now real
    ``#`` comments (translated to English). An unused intermediate
    local was also removed. Behavior is unchanged.
    """
    # Third-party general sentiment lexicon.
    # NOTE(review): find_one() returns None when no document matches;
    # this (like the original) would then raise TypeError — confirm the
    # documents are guaranteed to exist before hardening.
    general_opinion_doc = mongodb_client.db['opinion_resources'].find_one(
        {'doc_type': 'general_opinion'})
    res_for_one.general_opinion = general_opinion_doc['lexicon']
    # User-defined aspect dictionary.
    res_for_one.user_defined_aspect = TAG_LIST
    # word2vec embedding model.
    res_for_one.word2vec_model = utils.load_word2vec_model()
    # Pre-built pair polarities for this domain.
    build_pairs_query_res = mongodb_client.db['opinion_build_pairs'].find_one(
        {'domain': domain})
    res_for_one.pair_polarity = build_pairs_query_res['pair_polarity']
def train(mode, model_file, descriptions_file, output_file=None,
          neg_words_mult=2., lbda=50, min_words=1):
    """Train an entity model ('centroid' or 'lr') on a descriptions file
    and, if ``output_file`` is given, save the result there.
    """
    model = load_word2vec_model(model_file, mmap='r')
    if mode == 'centroid':
        entity_model = EntityModelCentroid()
    elif mode == 'lr':
        # Cumulative vocabulary counts drive negative-word sampling.
        word_counts = [model.vocab[w].count for w in model.index2word]
        entity_model = EntityModelLR(np.cumsum(word_counts),
                                     neg_words_mult, lbda)
    else:
        raise Exception('unsupported mode %s' % mode)

    sequences = read_entity_word_seqs(descriptions_file, model, min_words)
    entity_model.train(model, sequences)
    if output_file is not None:
        entity_model.save(output_file)
def train(mode, model_file, descriptions_file, output_file=None,
          neg_words_mult=2., lbda=50, min_words=1):
    """Train a 'centroid' or 'lr' entity model over description lines.

    The trained model is saved to ``output_file`` when one is provided.
    """
    model = load_word2vec_model(model_file, mmap='r')

    if mode == 'centroid':
        entity_model = EntityModelCentroid()
    elif mode == 'lr':
        freq_cdf = np.cumsum([model.vocab[token].count
                              for token in model.index2word])
        entity_model = EntityModelLR(freq_cdf, neg_words_mult, lbda)
    else:
        raise Exception('unsupported mode %s' % mode)

    entity_model.train(
        model, read_entity_word_seqs(descriptions_file, model, min_words))

    if output_file is not None:
        entity_model.save(output_file)
def quant(input_file, output_template=None, target_err=0.1, transform=True,
          test_accuracy=None):
    """Quantize a word2vec model, optionally persist the results, and
    print a one-line JSON summary of the compression statistics.

    When ``output_template`` is set, the quantized vectors are written
    as text and the dequantized model is saved alongside. When
    ``test_accuracy`` names a question file, the dequantized model is
    scored on it (restricted to the 100k most frequent words).
    """
    model = load_word2vec_model(input_file, mmap='r')
    q, pred_bits, zeros, avg_err, quant_syn0, dequant_model = quantize(
        model, target_err, transform)

    size = quant_syn0.size
    pred_bps = float(pred_bits) / size
    avg_zeros = float(zeros) / size

    if output_template is not None:
        output_filename = '%s.e%.3f.%s' % (
            output_template, target_err, 'tr' if transform else 'nt')
        # Text dump of the quantized vectors, then the dequantized model.
        with open(output_filename + '.txt', 'w') as fout:
            save_vectors(fout, model.index2word, quant_syn0, q)
        dequant_model.save(output_filename + '.model')

    if test_accuracy is not None:
        acc = fast_accuracy(dequant_model.vocab, dequant_model.syn0,
                            test_accuracy, restrict=100000)
    else:
        acc = None

    print(json.dumps(OrderedDict([
        ('q', q),
        ('transform', transform),
        ('pred_bps', float(pred_bps)),
        ('avg_zeros', float(avg_zeros)),
        ('avg_err', float(avg_err)),
        ('accuracy', acc),
    ])))
vector = get_vector(analyzer, sentence) if not vector is None: X.append(vector) y.append(data[ix][1]) return (X, y) if __name__=="__main__": REPORT = "report.txt" # data[0] -> text # data[1] -> label (1 or 0) # data[2] -> id data, sentences = get_data("train.json") model = load_word2vec_model("GoogleNews-vectors-negative300.bin") ta = TextAnalyzer(model) X, y = get_X_y(ta, data, sentences) clf = svm.SVC() clf.fit(X, y) test_data, test_sentences = get_data("test.json") X, y = get_X_y(ta, test_data, test_sentences) test_ids = [item[2] for item in test_data] true_pos = false_pos = true_neg = false_neg = 0 y_pred = list()
def accuracy(input_file, questions_file, restrict=100000):
    """Score a saved word2vec model on an analogy-question file and
    print the result as JSON (vocabulary restricted to the ``restrict``
    most frequent words).
    """
    model = load_word2vec_model(input_file, mmap='r')
    result = fast_accuracy(model.vocab, model.syn0, questions_file, restrict)
    print(json.dumps(result))