def vectorize(path_data, path_sent, path_label, mode):
    """Turn the text/label columns of path_data into aligned feature files.

    When mode is 'train', the word-embedding table and the label-index map
    are rebuilt first; the aligned output goes to path_sent and path_label.
    """
    tags = flat_read(path_data, 'label')
    texts = flat_read(path_data, 'text')
    if mode == 'train':
        embed(texts, path_word2ind, path_word_vec, path_embed)
        label2ind(tags, path_label_ind)
    align(texts, tags, path_sent, path_label)
def statistic(path_train):
    """Dump frequency tables (characters, text lengths, labels) for the training set."""
    texts = flat_read(path_train, 'text')
    tags = flat_read(path_train, 'label')
    # character-level vocabulary: fold every text into one long string
    chars = ''.join(texts)
    lengths = [len(t) for t in texts]
    count(path_vocab_freq, chars, 'vocab')
    count(path_len_freq, lengths, 'text_len')
    count(path_label_freq, tags, 'label')
def statistic(path_train):
    """Dump frequency tables (word vocabulary, doc lengths, labels) for the training set."""
    docs = flat_read(path_train, 'cut_doc')
    tags = flat_read(path_train, 'label')
    # word-level vocabulary over all pre-cut (space-separated) documents
    tokens = ' '.join(docs).split()
    lengths = [len(d.split()) for d in docs]
    count(path_vocab_freq, tokens, 'vocab')
    count(path_len_freq, lengths, 'doc_len')
    count(path_label_freq, tags, 'label')
def vectorize(path_data, path_sent, path_label, mode):
    """Vectorize flagged texts at character granularity and align with labels."""
    raw_texts = flat_read(path_data, 'text')
    flagged = add_flag(raw_texts)
    # each flagged sentence becomes its sequence of characters
    char_seqs = [list(s) for s in flagged]
    tags = flat_read(path_data, 'label')
    if mode == 'train':
        embed(char_seqs, path_word_ind, path_word_vec, path_embed)
        label2ind(tags, path_label_ind)
    align(char_seqs, tags, path_sent, path_label)
def vectorize_triple(path_data, path_triple):
    """Align anchor/positive/negative sentence triples and pickle them as a tuple."""
    columns = ('anc', 'pos', 'neg')
    # one aligned sequence matrix per column, in (anc, pos, neg) order
    triples = tuple(align(flat_read(path_data, col)) for col in columns)
    with open(path_triple, 'wb') as f:
        pk.dump(triples, f)
def vectorize(path_data, path_sent, path_label, mode):
    """Pad the text column into index sequences and pickle them with the raw labels."""
    texts = flat_read(path_data, 'text')
    tags = flat_read(path_data, 'label')
    if mode == 'train':
        embed(texts, path_word2ind, path_word_vec, path_embed)
    seq_mat = align(texts)
    with open(path_sent, 'wb') as f:
        pk.dump(seq_mat, f)
    with open(path_label, 'wb') as f:
        pk.dump(tags, f)
def statistic(path_train):
    """Dump character/length/label frequency tables and print a corpus-size metric."""
    texts = flat_read(path_train, 'text')
    tags = flat_read(path_train, 'label')
    chars = ''.join(texts)
    lengths = [len(t) for t in texts]
    count(path_vocab_freq, chars, 'vocab')
    count(path_len_freq, lengths, 'text_len')
    count(path_label_freq, tags, 'label')
    # sentence count divided by the median sentence length, truncated
    metric = int(len(texts) / np.median(lengths))
    print('sent / word_per_sent: %d' % metric)
def statistic(path_train):
    """Dump poet/title/character/length frequency tables for the poem corpus."""
    poets = flat_read(path_train, 'poet')
    titles = flat_read(path_train, 'title')
    texts = flat_read(path_train, 'text')
    chars = ''.join(texts)
    lengths = [len(t) for t in texts]
    count(path_poet_freq, poets, 'poet')
    count(path_title_freq, titles, 'title')
    count(path_vocab_freq, chars, 'vocab')
    count(path_len_freq, lengths, 'text_len')
def featurize(path_data, path_sent, path_label, mode):
    """Build merged bow/svd features for sentence pairs and pickle them with labels."""
    first_sents = flat_read(path_data, 'text1')
    second_sents = flat_read(path_data, 'text2')
    tags = flat_read(path_data, 'label')
    # featurize both columns in one batch, then merge back into pair features
    feats = merge(sent2feat(first_sents + second_sents, path_bow, path_svd, mode))
    tags = np.array(tags)
    with open(path_sent, 'wb') as f:
        pk.dump(feats, f)
    with open(path_label, 'wb') as f:
        pk.dump(tags, f)
def vectorize(path_data, path_pair, path_label, mode):
    """Pad both sentence columns into index-sequence pairs and pickle them with labels."""
    first_sents = flat_read(path_data, 'text1')
    second_sents = flat_read(path_data, 'text2')
    tags = flat_read(path_data, 'label')
    if mode == 'train':
        # fit the embedding on both columns together
        embed(first_sents + second_sents, path_word2ind, path_word_vec, path_embed)
    pairs = (align(first_sents), align(second_sents))
    tags = np.array(tags)
    with open(path_pair, 'wb') as f:
        pk.dump(pairs, f)
    with open(path_label, 'wb') as f:
        pk.dump(tags, f)
def featurize(path_data, path_sent, path_label, mode):
    """Build bow/svd features for texts and pickle them with integer label indices.

    In 'train' mode the label-index map is rebuilt before it is reloaded
    and applied.
    """
    sents = flat_read(path_data, 'text')
    labels = flat_read(path_data, 'label')
    sent_feats = sent2feat(sents, path_bow, path_svd, mode)
    if mode == 'train':
        label2ind(labels, path_label_ind)
    with open(path_label_ind, 'rb') as f:
        label_inds = pk.load(f)
    # idiom fix: comprehension instead of a manual append loop;
    # a KeyError here means an unseen label at non-train time
    inds = np.array([label_inds[label] for label in labels])
    with open(path_sent, 'wb') as f:
        pk.dump(sent_feats, f)
    with open(path_label, 'wb') as f:
        pk.dump(inds, f)
def vectorize(path_data, path_sent, path_label, mode):
    """Pad texts into index sequences, map labels to integer indices, pickle both.

    In 'train' mode the embedding table and label-index map are rebuilt first.
    """
    sents = flat_read(path_data, 'text')
    labels = flat_read(path_data, 'label')
    if mode == 'train':
        embed(sents, path_word2ind, path_word_vec, path_embed)
        label2ind(labels, path_label_ind)
    pad_seqs = align(sents)
    with open(path_label_ind, 'rb') as f:
        label_inds = pk.load(f)
    # idiom fix: comprehension instead of a manual append loop;
    # a KeyError here means an unseen label at non-train time
    inds = np.array([label_inds[label] for label in labels])
    with open(path_sent, 'wb') as f:
        pk.dump(pad_seqs, f)
    with open(path_label, 'wb') as f:
        pk.dump(inds, f)
def vectorize(paths, mode, update):
    """Vectorize flagged texts into shifted (input, label) sequences.

    Training mode may refresh word2vec vectors (when update is true) and
    always rebuilds the embedding table.
    """
    marked = add_flag(flat_read(paths['data'], 'text'))
    if mode == 'train':
        if update:
            word2vec(marked, path_word_vec)
        embed(marked, path_word2ind, path_word_vec, path_embed)
    sents, labels = shift(marked)
    # CNN input keeps the extra position; RNN input and labels do not
    for seqs, key, extra in ((sents, 'cnn_sent', True),
                             (sents, 'rnn_sent', False),
                             (labels, 'label', False)):
        align(seqs, paths[key], extra=extra)
def merge(names, path_slot_dir, path_extra, path_cut_word):
    """Merge entity words from `names`, the slot-dictionary directory, and the
    extra-entity file, then write the deduplicated set to path_cut_word.

    Bug fix: copy `names` instead of aliasing it, so the caller's list is no
    longer mutated by the extend() calls below.
    """
    entitys = list(names)
    for file_name in os.listdir(path_slot_dir):
        entitys.extend(load_word(os.path.join(path_slot_dir, file_name)))
    # extra entities are stored as space-separated strings, one per row
    for entity_str in flat_read(path_extra, 'entity'):
        entitys.extend(entity_str.split())
    with open(path_cut_word, 'w') as f:
        for entity in set(entitys):
            f.write(entity + '\n')
def featurize(path_data, path_sent, mode):
    """Compute tf-idf vectors for pre-cut documents and pickle them.

    Training mode fits and saves the dictionary and tf-idf model; otherwise
    both are loaded from disk before vectorizing.
    """
    token_lists = [doc.split() for doc in flat_read(path_data, 'cut_doc')]
    if mode == 'train':
        word2ind = Dictionary(token_lists)
        bows = [word2ind.doc2bow(tokens) for tokens in token_lists]
        tfidf = Tfidf(bows)
        with open(path_word2ind, 'wb') as f:
            pk.dump(word2ind, f)
        with open(path_tfidf, 'wb') as f:
            pk.dump(tfidf, f)
    else:
        with open(path_word2ind, 'rb') as f:
            word2ind = pk.load(f)
        with open(path_tfidf, 'rb') as f:
            tfidf = pk.load(f)
        bows = [word2ind.doc2bow(tokens) for tokens in token_lists]
    with open(path_sent, 'wb') as f:
        pk.dump(tfidf[bows], f)
# NOTE(review): this span begins mid-function — the enclosing model-definition
# starts before the visible chunk; only its final two statements are shown.
    model = Model([input1, input2, input3], output)
    return model


def load_model(name, embed_mat, seq_len):
    """Build the named model and load its saved weights, matched by layer name."""
    model = define_model(name, embed_mat, seq_len)
    model.load_weights(map_item(name, paths), by_name=True)
    return model


# Script-level setup: test texts plus pickled labels, embedding matrix,
# and pre-built pair/flag features.
seq_len = 30

path_test = 'data/test.csv'
path_label = 'feat/label_test.pkl'
path_embed = 'feat/embed.pkl'
texts = flat_read(path_test, 'text')
with open(path_label, 'rb') as f:
    labels = pk.load(f)
with open(path_embed, 'rb') as f:
    embed_mat = pk.load(f)

path_test_pair = 'data/test_pair.csv'
# NOTE(review): the pair/flag pickles carry a '_train' suffix while the csv is
# the test split — looks inconsistent; confirm against the featurize step.
path_pair = 'feat/pair_train.pkl'
path_flag = 'feat/flag_train.pkl'
text1s = flat_read(path_test_pair, 'text1')
text2s = flat_read(path_test_pair, 'text2')
with open(path_pair, 'rb') as f:
    pairs = pk.load(f)
with open(path_flag, 'rb') as f:
    flags = pk.load(f)
def load_model(name, embed_mat, seq_len):
    """Build the named model and load its saved weights."""
    model = define_model(name, embed_mat, seq_len)
    model.load_weights(map_item(name, paths))
    return model


# Script-level setup: test texts plus pickled labels, embedding matrix,
# and pre-built triple features.
seq_len = 30
detail = False

path_test = 'data/test.csv'
path_label = 'feat/label_test.pkl'
path_embed = 'feat/embed.pkl'
texts = flat_read(path_test, 'text')
with open(path_label, 'rb') as f:
    labels = pk.load(f)
with open(path_embed, 'rb') as f:
    embed_mat = pk.load(f)
# ind_labels is defined elsewhere in the original file (not visible here)
class_num = len(ind_labels)

path_test_triple = 'data/test_triple.csv'
path_triple = 'feat/triple_test.pkl'
anc_texts = flat_read(path_test_triple, 'anc')
pos_texts = flat_read(path_test_triple, 'pos')
neg_texts = flat_read(path_test_triple, 'neg')
with open(path_triple, 'rb') as f:
    triples = pk.load(f)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from match import predict
from util import flat_read, map_item

# Evaluation script: per-label precision/recall for each matcher backend.
path_test = 'data/test.csv'
texts = flat_read(path_test, 'text')
labels = flat_read(path_test, 'label')
label_set = sorted(list(set(labels)))
class_num = len(label_set)

paths = {'edit': 'metric/edit.csv',
         'cos': 'metric/cos.csv'}


def test(name, texts, labels, thre):
    """Predict every text with the named matcher and write per-label metrics.

    NOTE(review): `f1` is computed but not used within the visible chunk —
    presumably printed/returned past the end of this span.
    """
    preds = list()
    for text, label in zip(texts, labels):
        pred = predict(text, name, thre)
        preds.append(pred)
    precs = precision_score(labels, preds, average=None, labels=label_set)
    recs = recall_score(labels, preds, average=None, labels=label_set)
    with open(map_item(name, paths), 'w') as f:
        f.write('label,prec,rec' + '\n')
        for i in range(class_num):
            f.write('%s,%.2f,%.2f\n' % (label_set[i], precs[i], recs[i]))
    f1 = f1_score(labels, preds, average='weighted')
from build import tensorize
from classify import ind_labels, models
from util import flat_read, map_item

# Evaluation script for the torch classifiers; inference runs on CPU.
device = torch.device('cpu')

detail = False

path_test = 'data/test.csv'
path_sent = 'feat/sent_test.pkl'
path_label = 'feat/label_test.pkl'
texts = flat_read(path_test, 'text')
with open(path_sent, 'rb') as f:
    sents = pk.load(f)
with open(path_label, 'rb') as f:
    labels = pk.load(f)
class_num = len(ind_labels)

paths = {'dnn': 'metric/dnn.csv',
         'cnn': 'metric/cnn.csv',
         'rnn': 'metric/rnn.csv'}


def test(name, sents, labels):
    # NOTE(review): this definition is cut off at the end of the visible
    # chunk; only its first two statements are shown.
    sents, labels = tensorize([sents, labels], device)
    model = map_item(name, models)
def fit(path_train):
    """Fit both retrieval back-ends on the cut training texts."""
    corpus = flat_read(path_train, 'cut_text')
    tags = flat_read(path_train, 'label')
    link_fit(corpus, tags, path_word_sent)
    freq_fit(corpus, path_bow, path_svd, path_sent_vec)
def fit(path_train):
    """Fit the rank and frequency models on the cut training documents."""
    corpus = flat_read(path_train, 'cut_doc')
    tags = flat_read(path_train, 'label')
    rank_fit(corpus, tags, path_rank)
    freq_fit(corpus, tags, path_freq, path_tfidf)
# Script-level setup: dictionary resources, training features, and per-model
# prediction caches.
seq_len = 30

path_stop_word = 'dict/stop_word.txt'
path_type_dir = 'dict/word_type'
path_homo = 'dict/homonym.csv'
path_syno = 'dict/synonym.csv'
stop_word_re = load_word_re(path_stop_word)
word_type_re = load_type_re(path_type_dir)
homo_dict = load_word_pair(path_homo)
syno_dict = load_word_pair(path_syno)

path_train = 'data/train.csv'
path_label = 'feat/label_train.pkl'
path_embed = 'feat/embed.pkl'
path_word2ind = 'model/word2ind.pkl'
texts = flat_read(path_train, 'text')
with open(path_label, 'rb') as f:
    labels = pk.load(f)
with open(path_embed, 'rb') as f:
    embed_mat = pk.load(f)
with open(path_word2ind, 'rb') as f:
    word2ind = pk.load(f)

paths = {'dnn': 'cache/dnn.pkl',
         'cnn': 'cache/cnn.pkl',
         'rnn': 'cache/rnn.pkl'}
# Bug fix: each model now loads its own cache file; previously the 'cnn' and
# 'rnn' entries both reloaded the 'dnn' cache (copy-paste error).
caches = {name: load_cache(map_item(name, paths)) for name in ('dnn', 'cnn', 'rnn')}