def add_sent_fred_feature(self, data_type): """ Idea from https://www.kaggle.com/jturkewitz/magic-features-0-03-gain (Quora Question Pairs Competition) Magic features based on question frequency. The idea behind it is that a question asked frequently has a higher chance of being duplicated. """ feat_file = self.format_feature_file(data_type, 'sent_fred') if os.path.exists(feat_file): features = pickle_load(feat_file) else: sents_dict, p_vc, h_vc = self.get_sent_freq() data = pd.DataFrame(self.get_data(data_type)) data['p_hash'] = data['premise'].map(sents_dict) data['h_hash'] = data['hypothesis'].map(sents_dict) data['p_freq'] = data['p_hash'].map( lambda x: p_vc.get(x, 0) + h_vc.get(x, 0)) data['h_freq'] = data['h_hash'].map( lambda x: p_vc.get(x, 0) + h_vc.get(x, 0)) data['freq_mean'] = (data['p_freq'] + data['h_freq']) / 2 data['freq_cross'] = data['p_freq'] * data['h_freq'] data['p_freq_sq'] = data['p_freq'] * data['p_freq'] data['h_freq_sq'] = data['h_freq'] * data['h_freq'] features = data[[ 'p_freq', 'h_freq', 'freq_mean', 'freq_cross', 'p_freq_sq', 'h_freq_sq' ]].values pickle_dump(feat_file, features) return features
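# A minimal standalone sketch of the frequency ("magic") feature idea used above,
# on toy data with plain pandas; the toy sentences below are illustrative only and
# not part of the original pipeline.
import pandas as pd

toy = pd.DataFrame({
    'premise': ['a cat sits', 'a cat sits', 'dogs bark'],
    'hypothesis': ['a cat is sitting', 'dogs bark', 'a cat sits'],
})
# Map every distinct sentence to an integer id, then count how often each id
# appears on either side: sentences that recur across pairs get high counts.
sents = pd.concat([toy['premise'], toy['hypothesis']]).drop_duplicates().reset_index(drop=True)
sent2id = pd.Series(sents.index.values, index=sents.values).to_dict()
toy['p_hash'] = toy['premise'].map(sent2id)
toy['h_hash'] = toy['hypothesis'].map(sent2id)
p_vc = toy['p_hash'].value_counts().to_dict()
h_vc = toy['h_hash'].value_counts().to_dict()
toy['p_freq'] = toy['p_hash'].map(lambda x: p_vc.get(x, 0) + h_vc.get(x, 0))
toy['h_freq'] = toy['h_hash'].map(lambda x: p_vc.get(x, 0) + h_vc.get(x, 0))
print(toy[['p_freq', 'h_freq']])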
def tfidf_model(self): print('Logging Info - Get Tf-idf model...') tfidf_model_path = os.path.join(FEATURE_DIR, '{}_tfidf.model').format(self.genre) dict_path = os.path.join(FEATURE_DIR, '{}_tfidf.dict').format(self.genre) if os.path.exists(tfidf_model_path): dictionary = pickle_load(dict_path) tfidf_model = TfidfModel.load(tfidf_model_path) else: corpus = [ text.split() for text in self.train_data['premise'] + self.train_data['hypothesis'] + self.dev_data['premise'] + self.dev_data['hypothesis'] + self.test_data['premise'] + self.test_data['hypothesis'] ] dictionary = corpora.Dictionary(corpus) corpus = [dictionary.doc2bow(text) for text in corpus] tfidf_model = TfidfModel(corpus) del corpus tfidf_model.save(tfidf_model_path) pickle_dump(dict_path, dictionary) return dictionary, tfidf_model
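# A minimal sketch of how the gensim Dictionary / TfidfModel pair built in
# tfidf_model is typically queried; the toy corpus is illustrative, not the NLI data.
from gensim import corpora
from gensim.models import TfidfModel

toy_corpus = [['a', 'cat', 'sits'], ['a', 'dog', 'barks'], ['a', 'cat', 'barks']]
dictionary = corpora.Dictionary(toy_corpus)
bow_corpus = [dictionary.doc2bow(doc) for doc in toy_corpus]
tfidf = TfidfModel(bow_corpus)
# Transforming a new sentence yields a list of (token_id, tfidf_weight) pairs.
print(tfidf[dictionary.doc2bow(['a', 'cat', 'sleeps'])])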
def main(args): set_seed(args.seed) elapsed, context_predictions = context_selection(args, args.context_config_json) logger.info( f"Finished context selection, got {len(context_predictions)} paragraphs " f"({elapsed:.2f}s elapsed)") elapsed, qa_predictions = question_answering(args, args.qa_config_json, context_predictions) logger.info( f"Finished question answering, got {len(qa_predictions)} answer spans " f"({elapsed:.2f}s elapsed)") predictions = { d["id"]: d["answer"] for d in map(postprocess, qa_predictions) } json_dump(predictions, args.predict_json, ensure_ascii=False) if args.tmp_dir: args.tmp_dir.mkdir(parents=True, exist_ok=True) pickle_dump(context_predictions, args.tmp_dir / "context_predictions.pkl") pickle_dump(qa_predictions, args.tmp_dir / "qa_predictions.pkl")
def add_tfidf_feature(self, data_type): feat_file = self.format_feature_file(data_type, 'tfidf') if os.path.exists(feat_file): features = pickle_load(feat_file) else: dictionary, tfidf_model = self.tfidf_model() features = list() for premise, hypothesis in zip( self.get_data(data_type)['premise'], self.get_data(data_type)['hypothesis']): premise = premise.split() hypothesis = hypothesis.split() p_tfidf = dict(tfidf_model[dictionary.doc2bow(premise)]) h_tfidf = dict(tfidf_model[dictionary.doc2bow(hypothesis)]) features.append([ np.sum(list(p_tfidf.values())), np.sum(list(h_tfidf.values())), np.mean(list(p_tfidf.values())), np.mean(list(h_tfidf.values())) ]) features = np.array(features) pickle_dump(feat_file, features) print('Logging Info - {} : tfidf feature shape : {}'.format( data_type, features.shape)) return features
def add_similarity_feature(self, data_type, feat_type, sim_func): feat_file = self.format_feature_file(data_type, feat_type) if os.path.exists(feat_file): features = pickle_load(feat_file) else: len_dist_feat = np.array([ sim_func(p, h) for p, h in zip( self.get_data(data_type)['premise'], self.get_data(data_type)['hypothesis']) ]) features = self.check_and_expand_shape(len_dist_feat) pickle_dump(feat_file, features) print('Logging Info - {} : {} feature shape : {}'.format( data_type, feat_type, features.shape)) return features
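# A minimal sketch of the pairwise-similarity pattern in add_similarity_feature,
# with a hypothetical Jaccard word-overlap standing in for sim_func; the shape
# expansion mirrors what check_and_expand_shape presumably does for scalar features.
import numpy as np

def jaccard_sim(premise, hypothesis):
    p, h = set(premise.split()), set(hypothesis.split())
    return len(p & h) / len(p | h) if (p | h) else 0.0

premises = ['a cat sits', 'dogs bark loudly']
hypotheses = ['a cat is sitting', 'dogs bark']
feat = np.array([jaccard_sim(p, h) for p, h in zip(premises, hypotheses)])
if feat.ndim == 1:                 # scalar similarity -> a single feature column
    feat = np.expand_dims(feat, axis=-1)
print(feat.shape)                  # (2, 1)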
def gen_all_features(self, data_type, scaled=False): if scaled: feat_file = self.format_feature_file(data_type, 'all_scaled') else: feat_file = self.format_feature_file(data_type, 'all') if os.path.exists(feat_file): features = pickle_load(feat_file) else: features = list() feat_types = [('len_dis', length_distance), ('lcs_seq', lcs_seq_norm), ('lcs_str', lcs_str_1_norm), ('edit_dist', edit_distance), ('jaro', jaro_distance), ('jaro_winkler', jaro_winkler_dist), ('fuzz', fuzzy), ('simhash', simhash), ('w_share', word_share), ('w_ngram_dist', word_ngram_distance), ('c_ngram_ol', char_ngram_overlap), ('w_ngram_ol', word_ngram_overlap)] for feat_type, sim_func in feat_types: features.append( self.add_similarity_feature(data_type, feat_type, sim_func)) features.append( self.add_weighted_word_ngram_overlap_feature(data_type)) features.append(self.add_tfidf_feature(data_type)) features.append(self.add_word_power_feature(data_type)) features.append(self.add_graph_feature(data_type)) features = np.concatenate(features, axis=-1) if scaled: scaler = StandardScaler() features = scaler.fit_transform(features) joblib.dump( scaler, os.path.join(FEATURE_DIR, '{}_scaler.model'.format(self.genre))) pickle_dump(feat_file, features) print('Logging Info - {} : all feature shape : {}'.format( data_type, features.shape)) return features
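# A minimal sketch of the scaling step in gen_all_features: fit a StandardScaler,
# persist it with joblib, and reload it later so other splits can be transformed
# with the same statistics. The file name and toy matrices are illustrative only;
# fitting on the training split and reusing the scaler elsewhere is one common pattern.
import joblib
import numpy as np
from sklearn.preprocessing import StandardScaler

train_feats = np.array([[1.0, 10.0], [2.0, 20.0], [3.0, 30.0]])
dev_feats = np.array([[1.5, 15.0]])

scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_feats)   # fit mean/std on one split
joblib.dump(scaler, 'toy_scaler.model')            # persist for later reuse

scaler = joblib.load('toy_scaler.model')
dev_scaled = scaler.transform(dev_feats)           # same statistics applied to new data
print(train_scaled.shape, dev_scaled.shape)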
def add_word_power_feature(self, data_type): feat_file = self.format_feature_file(data_type, 'word_power') if os.path.exists(feat_file): features = pickle_load(feat_file) else: power_word = self.get_power_word() num_least = 100 features = list() for premise, hypothesis in zip( self.get_data(data_type)['premise'], self.get_data(data_type)['hypothesis']): premise = premise.split() hypothesis = hypothesis.split() rate = [1.0, 1.0] share_words = list(set(premise).intersection(set(hypothesis))) for word in share_words: if word not in power_word: continue if power_word[word][0] * power_word[word][ 5] < num_least: # the shared word must appear on both sides of at least num_least pairs continue rate[0] *= (1.0 - power_word[word][6] ) # shared word, but the pair is not correct (label != 2) p_diff = list(set(premise).difference(set(hypothesis))) h_diff = list(set(hypothesis).difference(set(premise))) all_diff = set(p_diff + h_diff) for word in all_diff: if word not in power_word: continue if power_word[word][0] * power_word[word][ 3] < num_least: # the word must appear on only one side of at least num_least pairs continue rate[1] *= (1.0 - power_word[word][4] ) # non-shared word, but the pair is correct (label == 2) rate = [1 - num for num in rate] features.append(rate) features = np.array(features) pickle_dump(feat_file, features) print('Logging Info - {} : word_power feature shape : {}'.format( data_type, features.shape)) return features
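# A minimal sketch of how the two "word power" rates above behave, using hand-made
# power_word entries in the 7-slot layout documented in get_power_word; the words
# and numbers are illustrative only.
power_word = {
    'paris': [200.0, 0.01, 0.5, 0.2, 0.1, 0.6, 0.9],
    'berlin': [300.0, 0.02, 0.4, 0.5, 0.7, 0.1, 0.3],
}
num_least = 100
rate = [1.0, 1.0]
shared, one_sided = ['paris'], ['berlin']
for word in shared:
    stats = power_word.get(word)
    # only trust words that co-occur on both sides in at least num_least pairs
    if stats and stats[0] * stats[5] >= num_least:
        rate[0] *= 1.0 - stats[6]          # accumulate (1 - per-word "correct" ratio)
for word in one_sided:
    stats = power_word.get(word)
    if stats and stats[0] * stats[3] >= num_least:
        rate[1] *= 1.0 - stats[4]
features = [1 - r for r in rate]           # flip so a larger value means stronger combined evidence
print(features)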
def generate_graph(self): print('Logging Info - Get graph...') sent2id_path = os.path.join(FEATURE_DIR, '{}_graph_sent2id.pkl'.format(self.genre)) graph_path = os.path.join(FEATURE_DIR, '{}_graph.pkl'.format(self.genre)) if os.path.exists(graph_path): sent2id = pickle_load(sent2id_path) graph = pickle_load(graph_path) else: sent2id = {} # sentence to id graph = nx.Graph() for data_type in ['train', 'dev', 'test']: for premise, hypothesis in zip( self.get_data(data_type)['premise'], self.get_data(data_type)['hypothesis']): if premise not in sent2id: sent2id[premise] = len(sent2id) if hypothesis not in sent2id: sent2id[hypothesis] = len(sent2id) p_id = sent2id[premise] h_id = sent2id[hypothesis] match = 0.0 premise = premise.split() hypothesis = hypothesis.split() for w1 in premise: if w1 in hypothesis: match += 1 if len(premise) + len(hypothesis) == 0: weight = 0.0 else: weight = 2.0 * (match / (len(premise) + len(hypothesis))) graph.add_edge(p_id, h_id, weight=weight) pickle_dump(sent2id_path, sent2id) pickle_dump(graph_path, graph) return sent2id, graph
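# A minimal sketch of the sentence graph built in generate_graph, on toy data:
# nodes are sentence ids, edges connect premise/hypothesis pairs, and the edge
# weight is the normalized word overlap 2 * matches / (len_p + len_h).
import networkx as nx

pairs = [('a cat sits', 'a cat is sitting'), ('a cat sits', 'dogs bark')]
sent2id, g = {}, nx.Graph()
for premise, hypothesis in pairs:
    p_id = sent2id.setdefault(premise, len(sent2id))
    h_id = sent2id.setdefault(hypothesis, len(sent2id))
    p_tok, h_tok = premise.split(), hypothesis.split()
    match = sum(1 for w in p_tok if w in h_tok)
    weight = 2.0 * match / (len(p_tok) + len(h_tok)) if (p_tok or h_tok) else 0.0
    g.add_edge(p_id, h_id, weight=weight)
print(g.degree[sent2id['a cat sits']])   # how many pairs this sentence appears in
print(g[0][1]['weight'])                 # word-overlap weight of the first pair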
def add_weighted_word_ngram_overlap_feature(self, data_type): feat_file = self.format_feature_file(data_type, 'w_ngram_ol_tfidf') if os.path.exists(feat_file): features = pickle_load(feat_file) else: dictionary, tfidf_model = self.tfidf_model() idf_model = tfidf_model.idfs features = list() for premise, hypothesis in zip( self.get_data(data_type)['premise'], self.get_data(data_type)['hypothesis']): premise = premise.split() p_tfidf = dict(tfidf_model[dictionary.doc2bow(premise)]) input_premise = [ (word, idf_model.get(dictionary.token2id.get(word, 0), 0.0), p_tfidf.get(dictionary.token2id.get(word, 0), 0.0)) for word in premise ] hypothesis = hypothesis.split() h_tfidf = dict(tfidf_model[dictionary.doc2bow(hypothesis)]) input_hypothesis = [ (word, idf_model.get(dictionary.token2id.get(word, 0), 0.0), h_tfidf.get(dictionary.token2id.get(word, 0), 0.0)) for word in hypothesis ] features.append( weighted_word_ngram_overlap(input_premise, input_hypothesis)) features = np.array(features) pickle_dump(feat_file, features) print('Logging Info - {} : w_ngram_ol_tfidf feature shape : {}'.format( data_type, features.shape)) return features
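# weighted_word_ngram_overlap itself is defined elsewhere; the hypothetical helper
# below only sketches the general idea for unigrams: overlap where each word counts
# by its idf weight instead of 1, given (word, idf, tfidf) tuples like the ones
# built above (the tfidf slot is unused in this sketch).
def weighted_unigram_overlap(input_premise, input_hypothesis):
    p_weights = {word: idf for word, idf, _ in input_premise}
    h_weights = {word: idf for word, idf, _ in input_hypothesis}
    shared = set(p_weights) & set(h_weights)
    total = sum(p_weights.values()) + sum(h_weights.values())
    if total == 0:
        return 0.0
    return sum(p_weights[w] + h_weights[w] for w in shared) / total

p = [('a', 0.1, 0.05), ('cat', 2.3, 0.9), ('sits', 1.7, 0.6)]
h = [('a', 0.1, 0.05), ('cat', 2.3, 0.9), ('sleeps', 2.0, 0.8)]
print(weighted_unigram_overlap(p, h))   # rare shared words dominate the score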
def get_sent_freq(self): print('Logging Info - Get sentence frequency...') sents_dict_path = os.path.join(FEATURE_DIR, '{}_sent_dict.pkl'.format(self.genre)) p_vc_path = os.path.join(FEATURE_DIR, '{}_premise_vc.pkl'.format(self.genre)) h_vc_path = os.path.join(FEATURE_DIR, '{}_hypothesis_vc.pkl'.format(self.genre)) if os.path.exists(p_vc_path): sents_dict = pickle_load(sents_dict_path) p_vc = pickle_load(p_vc_path) h_vc = pickle_load(h_vc_path) else: train_data = pd.DataFrame(self.train_data) dev_data = pd.DataFrame(self.dev_data) test_data = pd.DataFrame(self.test_data) all_data = pd.concat([train_data, dev_data, test_data]) df1 = all_data[['premise']] df2 = all_data[['hypothesis']] df2.rename(columns={'hypothesis': 'premise'}, inplace=True) train_sents = pd.concat([df1, df2]) train_sents.drop_duplicates(subset=['premise'], inplace=True) train_sents.reset_index(inplace=True, drop=True) sents_dict = pd.Series(train_sents.index.values, index=train_sents.premise.values).to_dict() all_data['p_hash'] = all_data['premise'].map(sents_dict) all_data['h_hash'] = all_data['hypothesis'].map(sents_dict) p_vc = all_data.p_hash.value_counts().to_dict() h_vc = all_data.h_hash.value_counts().to_dict() pickle_dump(sents_dict_path, sents_dict) pickle_dump(p_vc_path, p_vc) pickle_dump(h_vc_path, h_vc) del train_data, dev_data, test_data, all_data return sents_dict, p_vc, h_vc
word_ids_test = create_token_ids_matrix(word_tokenizer, raw_data[variation], config.word_max_len) # prepare n-gram input vectorizer = pickle_load( format_filename(PROCESSED_DATA_DIR, VECTORIZER_TEMPLATE, variation=variation, type='binary', level='char', ngram_range=(2, 3))) n_gram_test = vectorizer.transform(raw_data[variation]) pickle_dump( format_filename(PROCESSED_DATA_DIR, TEST_DATA_TEMPLATE, variation=variation), {'sentence': test_data}) pickle_dump( format_filename(PROCESSED_DATA_DIR, TEST_IDS_MATRIX_TEMPLATE, variation=variation, level='word'), {'sentence': word_ids_test}) pickle_dump( format_filename(PROCESSED_DATA_DIR, TEST_NGRAM_DATA_TEMPLATE, variation=variation, type='binary', level='char', ngram_range=(2, 3)), {'sentence': n_gram_test})
def get_power_word(self): """ Compute the discriminative power of each word in the data, in the following format: word --> [0. number of pairs containing the word, 1. ratio of pairs containing the word, 2. ratio of correct pairs, 3. ratio of one-sided pairs, 4. ratio of correct one-sided pairs, 5. ratio of two-sided pairs, 6. ratio of correct two-sided pairs] """ print('Logging Info - Get power word...') words_power_path = os.path.join(FEATURE_DIR, '{}_power_word.pkl'.format(self.genre)) if os.path.exists(words_power_path): words_power = pickle_load(words_power_path) else: words_power = {} x_a = [ text.split() for text in self.train_data['premise'] + self.dev_data['premise'] + self.test_data['premise'] ] x_b = [ text.split() for text in self.train_data['hypothesis'] + self.dev_data['hypothesis'] + self.test_data['hypothesis'] ] y = self.train_data['label'] + self.dev_data[ 'label'] + self.test_data['label'] for i in range(len(x_a)): label = y[i] q1_words = x_a[i] q2_words = x_b[i] all_words = set(q1_words + q2_words) q1_words = set(q1_words) q2_words = set(q2_words) for word in all_words: if word not in words_power: words_power[word] = [0. for _ in range(7)] words_power[word][0] += 1. # count of pairs containing the word words_power[word][1] += 1. # count of pairs containing the word, normalized to a ratio below if ((word in q1_words) and (word not in q2_words)) or ((word not in q1_words) and (word in q2_words)): words_power[word][3] += 1. # count of one-sided pairs if 0 == label: words_power[word][2] += 1. # count of correct pairs words_power[word][4] += 1. # count of correct one-sided pairs if (word in q1_words) and (word in q2_words): words_power[word][5] += 1. # count of two-sided pairs if 2 == label: words_power[word][2] += 1. # count of correct pairs words_power[word][6] += 1. # count of correct two-sided pairs for word in words_power: words_power[word][1] /= len(x_a) # ratio of pairs containing the word = count / total number of pairs words_power[word][2] /= words_power[word][ 0] # ratio of correct pairs = correct count / pairs containing the word if words_power[word][3] > 1e-6: words_power[word][4] /= words_power[word][ 3] # correct one-sided ratio = correct one-sided count / one-sided count words_power[word][3] /= words_power[word][ 0] # one-sided ratio = one-sided count / pairs containing the word if words_power[word][5] > 1e-6: words_power[word][6] /= words_power[word][ 5] # correct two-sided ratio = correct two-sided count / two-sided count words_power[word][5] /= words_power[word][ 0] # two-sided ratio = two-sided count / pairs containing the word del x_a, x_b, y pickle_dump(words_power_path, words_power) return words_power
if not os.path.exists(PROCESSED_DATA_DIR): os.makedirs(PROCESSED_DATA_DIR) if not os.path.exists(LOG_DIR): os.makedirs(LOG_DIR) if not os.path.exists(MODEL_SAVED_DIR): os.makedirs(MODEL_SAVED_DIR) if not os.path.exists(SUBMIT_DIR): os.makedirs(SUBMIT_DIR) if not os.path.exists(IMG_DIR): os.makedirs(IMG_DIR) # load knowledge base data mention_to_entity, entity_to_mention, entity_desc, entity_type = load_kb_data( KB_FILENAME) pickle_dump( format_filename(PROCESSED_DATA_DIR, MENTION_TO_ENTITY_FILENAME), mention_to_entity) pickle_dump(format_filename(PROCESSED_DATA_DIR, ENTITY_DESC_FILENAME), entity_desc) pickle_dump(format_filename(PROCESSED_DATA_DIR, ENTITY_TYPE_FILENAME), entity_type) pickle_dump( format_filename(PROCESSED_DATA_DIR, ENTITY_TO_MENTION_FILENAME), entity_to_mention) # load training data train_data = load_train_data(CCKS_TRAIN_FILENAME) # prepare character embedding char_vocab, idx2char, char_corpus = load_char_vocab_and_corpus( entity_desc, train_data)
def prepare_skip_ngram_feature(vectorizer_type, level, ngram, skip_k, train_data, dev_data, variation): if level not in ['word', 'char']: raise ValueError('Vectorizer Level Not Understood: {}'.format(level)) if vectorizer_type == 'binary': vectorizer = CountVectorizer(binary=True, tokenizer=make_skip_tokenize( ngram, skip_k, level)) elif vectorizer_type == 'tf': vectorizer = CountVectorizer(binary=False, tokenizer=make_skip_tokenize( ngram, skip_k, level)) elif vectorizer_type == 'tfidf': vectorizer = TfidfVectorizer(tokenizer=make_skip_tokenize(ngram, skip_k, level)) else: raise ValueError( 'Vectorizer Type Not Understood: {}'.format(vectorizer_type)) train_ngram_feature = vectorizer.fit_transform(train_data['sentence']) train_ngram_data = { 'sentence': train_ngram_feature, 'label': train_data['label'] } dev_ngram_feature = vectorizer.transform(dev_data['sentence']) dev_ngram_data = { 'sentence': dev_ngram_feature, 'label': dev_data['label'] } print( 'Logging info - {}_{}vectorizer_{}_{}_{} : train_skip_ngram_feature shape: {}, ' 'dev_skip_ngram_feature shape: {}'.format(variation, vectorizer_type, level, ngram, skip_k, train_ngram_feature.shape, dev_ngram_feature.shape)) # pickle can't serialize lambda functions, so use dill instead: https://github.com/uqfoundation/dill with open( format_filename(PROCESSED_DATA_DIR, VECTORIZER_TEMPLATE, variation=variation, type=vectorizer_type, level=level, ngram_range='%d_%d' % (ngram, skip_k)), 'wb') as writer: dill.dump(vectorizer, writer) pickle_dump( format_filename(PROCESSED_DATA_DIR, TRAIN_NGRAM_DATA_TEMPLATE, variation=variation, type=vectorizer_type, level=level, ngram_range='%d_%d' % (ngram, skip_k)), train_ngram_data) pickle_dump( format_filename(PROCESSED_DATA_DIR, DEV_NGRAM_DATA_TEMPLATE, variation=variation, type=vectorizer_type, level=level, ngram_range='%d_%d' % (ngram, skip_k)), dev_ngram_data) return vectorizer, train_ngram_data, dev_ngram_data
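# make_skip_tokenize is defined elsewhere in the project; the hypothetical version
# below only sketches one way a k-skip-n-gram tokenizer could work, so the
# CountVectorizer calls above have something concrete to picture. It is an
# assumption, not the project's actual implementation.
from itertools import combinations

def make_skip_tokenize(ngram, skip_k, level):
    def tokenize(text):
        units = text.split() if level == 'word' else list(text)
        tokens = []
        # pick ngram positions inside each window of size ngram + skip_k
        for start in range(len(units) - ngram + 1):
            window = units[start:start + ngram + skip_k]
            for idx in combinations(range(len(window)), ngram):
                if idx[0] == 0:   # anchor on the window start to avoid duplicates
                    tokens.append(' '.join(window[i] for i in idx))
        return tokens
    return tokenize

print(make_skip_tokenize(2, 1, 'word')('the cat sat down'))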
def prepare_ngram_feature(vectorizer_type, level, ngram_range, train_data, dev_data, variation): if level not in ['word', 'char', 'char_wb']: raise ValueError('Vectorizer Level Not Understood: {}'.format(level)) if not isinstance(ngram_range, tuple): raise ValueError('ngram_range should be a tuple, got {}'.format( type(ngram_range))) if vectorizer_type == 'binary': vectorizer = CountVectorizer(binary=True, analyzer=level, ngram_range=ngram_range) elif vectorizer_type == 'tf': vectorizer = CountVectorizer(binary=False, analyzer=level, ngram_range=ngram_range) elif vectorizer_type == 'tfidf': vectorizer = TfidfVectorizer(analyzer=level, ngram_range=ngram_range) else: raise ValueError( 'Vectorizer Type Not Understood: {}'.format(vectorizer_type)) train_ngram_feature = vectorizer.fit_transform(train_data['sentence']) train_ngram_data = { 'sentence': train_ngram_feature, 'label': train_data['label'] } dev_ngram_feature = vectorizer.transform(dev_data['sentence']) dev_ngram_data = { 'sentence': dev_ngram_feature, 'label': dev_data['label'] } print( 'Logging info - {}_{}vectorizer_{}_{} : train_ngram_feature shape: {}, ' 'dev_ngram_feature shape: {}'.format(variation, vectorizer_type, level, ngram_range, train_ngram_feature.shape, dev_ngram_feature.shape)) pickle_dump( format_filename(PROCESSED_DATA_DIR, VECTORIZER_TEMPLATE, variation=variation, type=vectorizer_type, level=level, ngram_range=ngram_range), vectorizer) pickle_dump( format_filename(PROCESSED_DATA_DIR, TRAIN_NGRAM_DATA_TEMPLATE, variation=variation, type=vectorizer_type, level=level, ngram_range=ngram_range), train_ngram_data) pickle_dump( format_filename(PROCESSED_DATA_DIR, DEV_NGRAM_DATA_TEMPLATE, variation=variation, type=vectorizer_type, level=level, ngram_range=ngram_range), dev_ngram_data) return vectorizer, train_ngram_data, dev_ngram_data
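# A minimal sketch of the character-level n-gram features produced by
# prepare_ngram_feature, using the same sklearn vectorizers; the toy sentences
# are illustrative only.
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

train_sents = ['a cat sits on the mat', 'dogs bark loudly']
dev_sents = ['a dog barks']

binary_vec = CountVectorizer(binary=True, analyzer='char', ngram_range=(2, 3))
train_feat = binary_vec.fit_transform(train_sents)   # vocabulary fitted on train
dev_feat = binary_vec.transform(dev_sents)           # dev reuses that vocabulary
print(train_feat.shape, dev_feat.shape)

tfidf_vec = TfidfVectorizer(analyzer='char_wb', ngram_range=(1, 3))
print(tfidf_vec.fit_transform(train_sents).shape)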
def add_graph_feature(self, data_type): feat_file = self.format_feature_file(data_type, 'graph') if os.path.exists(feat_file): graph_features = pickle_load(feat_file) else: sent2id, graph = self.generate_graph() n2clique = {} cliques = [] for clique in nx.find_cliques(graph): for n in clique: if n not in n2clique: n2clique[n] = [] n2clique[n].append(len(cliques)) cliques.append(clique) n2cc = {} ccs = [] for cc in nx.connected_components(graph): for n in cc: n2cc[n] = len(ccs) ccs.append(cc) pagerank = nx.pagerank(graph, alpha=0.9, max_iter=100) hits_h, hits_a = nx.hits(graph, max_iter=100) indegree_features = list() clique_features = list() cc_features = list() pagerank_features = list() hits_features = list() shortestpath_features = list() # neighbor_features = list() for premise, hypothesis in zip( self.get_data(data_type)['premise'], self.get_data(data_type)['hypothesis']): p_id = sent2id[premise] h_id = sent2id[hypothesis] # graph in-degree features indegree_features.append( [graph.degree[p_id], graph.degree[h_id]]) # clique features edge_max_clique_size = 0 num_clique = 0 for clique_id in n2clique[p_id]: if h_id in cliques[clique_id]: edge_max_clique_size = max(edge_max_clique_size, len(cliques[clique_id])) num_clique += 1 clique_features.append([edge_max_clique_size, num_clique]) lnode_max_clique_size = 0 rnode_max_clique_size = 0 for clique_id in n2clique[p_id]: lnode_max_clique_size = max(lnode_max_clique_size, len(cliques[clique_id])) for clique_id in n2clique[h_id]: rnode_max_clique_size = max(rnode_max_clique_size, len(cliques[clique_id])) clique_features[-1] += [ lnode_max_clique_size, rnode_max_clique_size, max(lnode_max_clique_size, rnode_max_clique_size), min(lnode_max_clique_size, rnode_max_clique_size) ] # connected components features cc_features.append([len(ccs[n2cc[p_id]])]) # page rank features pr1 = pagerank[p_id] * 1e6 pr2 = pagerank[h_id] * 1e6 pagerank_features.append( [pr1, pr2, max(pr1, pr2), min(pr1, pr2), (pr1 + pr2) / 2.]) # graph hits features h1 = hits_h[p_id] * 1e6 h2 = hits_h[h_id] * 1e6 a1 = hits_a[p_id] * 1e6 a2 = hits_a[h_id] * 1e6 hits_features.append([ h1, h2, a1, a2, max(h1, h2), max(a1, a2), min(h1, h2), min(a1, a2), (h1 + h2) / 2., (a1 + a2) / 2. ]) # graph shortest path features shortest_path = -1 weight = graph[p_id][h_id]['weight'] graph.remove_edge(p_id, h_id) if nx.has_path(graph, p_id, h_id): shortest_path = nx.dijkstra_path_length(graph, p_id, h_id) graph.add_edge(p_id, h_id, weight=weight) shortestpath_features.append([shortest_path]) # graph neighbour features # l = [] # r = [] # l_nb = graph.neighbors(p_id) # r_nb = graph.neighbors(h_id) # for n in l_nb: # if (n != h_id) and (n != p_id): # l.append(graph[p_id][n]['weight']) # for n in r_nb: # if (n != h_id) and (n != p_id): # r.append(graph[h_id][n]['weight']) # if len(l) == 0 or len(r) == 0: # neighbor_features.append([0.0] * 11) # else: # neighbor_features.append(l + r + # [len(list((set(l_nb).union(set(r_nb))) ^ (set(l_nb) ^ set(r_nb))))]) graph_features = np.concatenate( (np.array(indegree_features), np.array(clique_features), np.array(cc_features), np.array(pagerank_features), np.array(hits_features), np.array(shortestpath_features)), axis=-1) pickle_dump(feat_file, graph_features) print('Logging Info - {} : graph feature shape : {}'.format( data_type, graph_features.shape)) return graph_features
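# A minimal sketch of the networkx calls behind the graph features above, run on
# a toy 4-node sentence graph; the node ids and weights are illustrative only.
import networkx as nx

g = nx.Graph()
g.add_edge(0, 1, weight=0.8)
g.add_edge(1, 2, weight=0.5)
g.add_edge(0, 2, weight=0.6)
g.add_edge(2, 3, weight=0.9)

print(list(nx.find_cliques(g)))                 # maximal cliques, e.g. [0, 1, 2] and [2, 3]
print(g.degree[0], g.degree[2])                 # node degrees used as degree features
print(nx.pagerank(g, alpha=0.9))                # PageRank score per node
hubs, authorities = nx.hits(g, max_iter=100)    # HITS hub / authority scores
print(hubs[0], authorities[0])

# shortest path between a pair with their direct edge temporarily removed
w = g[0][1]['weight']
g.remove_edge(0, 1)
print(nx.dijkstra_path_length(g, 0, 1) if nx.has_path(g, 0, 1) else -1)
g.add_edge(0, 1, weight=w)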
def main(): process_conf = ProcessConfig() # create directory if not os.path.exists(PROCESSED_DATA_DIR): os.makedirs(PROCESSED_DATA_DIR) if not os.path.exists(LOG_DIR): os.makedirs(LOG_DIR) if not os.path.exists(MODEL_SAVED_DIR): os.makedirs(MODEL_SAVED_DIR) if not os.path.exists(IMG_DIR): os.makedirs(IMG_DIR) # load SNLI, MultiNLI and MLI datasets data_train, data_dev, data_test = load_data() print('Logging Info - Data: train - {}, dev - {}, test - {}'.format(data_train.shape, data_dev.shape, data_test.shape)) for genre in GENRES: if genre not in data_train.index: continue analyze_result = {} genre_train = data_train.loc[genre] genre_dev = data_dev.loc[genre] genre_test = data_test.loc[genre] # might be None print('Logging Info - Genre: {}, train - {}, dev - {}, test - {}'.format(genre, genre_train.shape, genre_dev.shape, genre_test.shape)) analyze_result.update({'train_set': len(genre_train), 'dev_set': len(genre_dev), 'test_set': 0 if genre_test is None else len(genre_test)}) genre_train_data = process_data(genre_train, process_conf.clean, process_conf.stem) genre_dev_data = process_data(genre_dev, process_conf.clean, process_conf.stem) # class distribution analysis train_label_distribution = analyze_class_distribution(genre_train_data['label']) analyze_result.update(dict(('train_cls_{}'.format(cls), percent) for cls, percent in train_label_distribution.items())) dev_label_distribution = analyze_class_distribution(genre_dev_data['label']) analyze_result.update(dict(('dev_cls_{}'.format(cls), percent) for cls, percent in dev_label_distribution.items())) # create tokenizer and vocabulary sentences_train = genre_train_data['premise'] + genre_train_data['hypothesis'] sentences_dev = genre_dev_data['premise'] + genre_dev_data['hypothesis'] word_tokenizer = Tokenizer(lower=process_conf.lowercase, filters='', char_level=False) char_tokenizer = Tokenizer(lower=process_conf.lowercase, filters='', char_level=True) word_tokenizer.fit_on_texts(sentences_train) # just fit on train data char_tokenizer.fit_on_texts(sentences_train) print('Logging Info - Genre: {}, word_vocab: {}, char_vocab: {}'.format(genre, len(word_tokenizer.word_index), len(char_tokenizer.word_index))) analyze_result.update({'word_vocab': len(word_tokenizer.word_index), 'char_vocab': len(char_tokenizer.word_index)}) # length analysis word_len_distribution, word_max_len = analyze_len_distribution(sentences_train, level='word') analyze_result.update(dict(('word_{}'.format(k), v) for k, v in word_len_distribution.items())) char_len_distribution, char_max_len = analyze_len_distribution(sentences_train, level='char') analyze_result.update(dict(('char_{}'.format(k), v) for k, v in char_len_distribution.items())) train_word_ids = create_data_matrices(word_tokenizer, genre_train_data, process_conf.padding, process_conf.truncating, process_conf.n_class, word_max_len) train_char_ids = create_data_matrices(char_tokenizer, genre_train_data, process_conf.padding, process_conf.truncating, process_conf.n_class, char_max_len) dev_word_ids = create_data_matrices(word_tokenizer, genre_dev_data, process_conf.padding, process_conf.truncating, process_conf.n_class, word_max_len) dev_char_ids = create_data_matrices(char_tokenizer, genre_dev_data, process_conf.padding, process_conf.truncating, process_conf.n_class, char_max_len) # create embedding matrix from pretrained word vectors glove_cc = load_trained(EXTERNAL_WORD_VECTORS_FILENAME['glove_cc'], word_tokenizer.word_index) fasttext_cc = load_trained(EXTERNAL_WORD_VECTORS_FILENAME['fasttext_cc'], 
word_tokenizer.word_index) fasttext_wiki = load_trained(EXTERNAL_WORD_VECTORS_FILENAME['fasttext_wiki'], word_tokenizer.word_index) # create embedding matrix by training on nil dataset w2v_nil = train_w2v(sentences_train+sentences_dev, lambda x: x.split(), word_tokenizer.word_index) c2v_nil = train_w2v(sentences_train+sentences_dev, lambda x: list(x), char_tokenizer.word_index) w_fasttext_nil = train_fasttext(sentences_train + sentences_dev, lambda x: x.split(), word_tokenizer.word_index) c_fasttext_nil = train_fasttext(sentences_train + sentences_dev, lambda x: list(x), char_tokenizer.word_index) w_glove_nil = train_glove(sentences_train + sentences_dev, lambda x: x.split(), word_tokenizer.word_index) c_glove_nil = train_glove(sentences_train + sentences_dev, lambda x: list(x), char_tokenizer.word_index) # save pre-process data pickle_dump(format_filename(PROCESSED_DATA_DIR, TRAIN_DATA_TEMPLATE, genre), genre_train_data) pickle_dump(format_filename(PROCESSED_DATA_DIR, DEV_DATA_TEMPLATE, genre), genre_dev_data) pickle_dump(format_filename(PROCESSED_DATA_DIR, TRAIN_IDS_MATRIX_TEMPLATE, genre, 'word'), train_word_ids) pickle_dump(format_filename(PROCESSED_DATA_DIR, TRAIN_IDS_MATRIX_TEMPLATE, genre, 'char'), train_char_ids) pickle_dump(format_filename(PROCESSED_DATA_DIR, DEV_IDS_MATRIX_TEMPLATE, genre, 'word'), dev_word_ids) pickle_dump(format_filename(PROCESSED_DATA_DIR, DEV_IDS_MATRIX_TEMPLATE, genre, 'char'), dev_char_ids) np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'glove_cc'), glove_cc) np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'fasttext_cc'), fasttext_cc) np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'fasttext_wiki'), fasttext_wiki) np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'w2v_nil'), w2v_nil) np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'c2v_nil'), c2v_nil) np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'w_fasttext_nil'), w_fasttext_nil) np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'c_fasttext_nil'), c_fasttext_nil) np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'w_glove_nil'), w_glove_nil) np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, genre, 'c_glove_nil'), c_glove_nil) pickle_dump(format_filename(PROCESSED_DATA_DIR, TOKENIZER_TEMPLATE, genre, 'word'), word_tokenizer) pickle_dump(format_filename(PROCESSED_DATA_DIR, TOKENIZER_TEMPLATE, genre, 'char'), char_tokenizer) pickle_dump(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, genre, 'word'), word_tokenizer.word_index) pickle_dump(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, genre, 'char'), char_tokenizer.word_index) if genre_test is not None: genre_test_data = process_data(genre_test, process_conf.clean, process_conf.stem) test_label_distribution = analyze_class_distribution(genre_test_data['label']) analyze_result.update( dict(('test_cls_%d' % cls, percent) for cls, percent in test_label_distribution.items())) test_word_ids = create_data_matrices(word_tokenizer, genre_test_data, process_conf.padding, process_conf.truncating, process_conf.n_class, word_max_len) test_char_ids = create_data_matrices(char_tokenizer, genre_test_data, process_conf.padding, process_conf.truncating, process_conf.n_class, char_max_len) pickle_dump(format_filename(PROCESSED_DATA_DIR, TEST_DATA_TEMPLATE, genre), genre_test_data) 
pickle_dump(format_filename(PROCESSED_DATA_DIR, TEST_IDS_MATRIX_TEMPLATE, genre, 'word'), test_word_ids) pickle_dump(format_filename(PROCESSED_DATA_DIR, TEST_IDS_MATRIX_TEMPLATE, genre, 'char'), test_char_ids) # save analyze result analyze_result['timestamp'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime()) write_log(format_filename(LOG_DIR, ANALYSIS_LOG_TEMPLATE, genre), analyze_result)
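# create_data_matrices, used in main() above, is defined elsewhere; the snippet
# below is only a hedged sketch of what the tokenizer-to-id-matrix step generally
# looks like with the Keras Tokenizer this pipeline fits, plus pad_sequences for
# the padding/truncating settings passed in; the sentences are illustrative only.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

train_sents = ['a cat sits on the mat', 'dogs bark loudly']
tokenizer = Tokenizer(lower=True, filters='', char_level=False)
tokenizer.fit_on_texts(train_sents)                      # vocabulary from train only
ids = tokenizer.texts_to_sequences(['a cat barks'])      # words -> integer ids (unknown words dropped)
padded = pad_sequences(ids, maxlen=10, padding='post', truncating='post')
print(padded.shape, padded[0])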
def process_data(): config = ModelConfig() # create dir if not path.exists(PROCESSED_DATA_DIR): os.makedirs(PROCESSED_DATA_DIR) if not path.exists(LOG_DIR): os.makedirs(LOG_DIR) if not path.exists(MODEL_SAVED_DIR): os.makedirs(MODEL_SAVED_DIR) if not path.exists(IMG_DIR): os.makedirs(IMG_DIR) # load datasets data_train, data_dev = load_data() print('Logging Info - Data: train - {}, dev - {}'.format( data_train.shape, data_dev.shape)) for variation in VARIATIONS: if variation not in data_train.index: continue analyze_result = {} variation_train = data_train.loc[variation] variation_dev = data_dev.loc[variation] print('Logging Info - Variation: {}, train - {}, dev - {}'.format( variation, variation_train.shape, variation_dev.shape)) analyze_result.update({ 'train_set': len(variation_train), 'dev_set': len(variation_dev) }) variation_train_data = get_sentence_label(variation_train) variation_dev_data = get_sentence_label(variation_dev) if config.data_augment: variation_train_data = augment_data(variation_train_data) variation += '_aug' # class distribution analysis train_label_distribution = analyze_class_distribution( variation_train_data['label']) analyze_result.update( dict(('train_cls_{}'.format(cls), percent) for cls, percent in train_label_distribution.items())) dev_label_distribution = analyze_class_distribution( variation_dev_data['label']) analyze_result.update( dict(('dev_cls_{}'.format(cls), percent) for cls, percent in dev_label_distribution.items())) # create tokenizer and vocabulary sentences_train = variation_train_data['sentence'] sentences_dev = variation_dev_data['sentence'] word_tokenizer = Tokenizer(char_level=False) char_tokenizer = Tokenizer(char_level=True) word_tokenizer.fit_on_texts(sentences_train) char_tokenizer.fit_on_texts(sentences_train) print('Logging Info - Variation: {}, word_vocab: {}, char_vocab: {}'.
format(variation, len(word_tokenizer.word_index), len(char_tokenizer.word_index))) analyze_result.update({ 'word_vocab': len(word_tokenizer.word_index), 'char_vocab': len(char_tokenizer.word_index) }) # length analysis word_len_distribution, word_max_len = analyze_len_distribution( sentences_train, level='word') analyze_result.update( dict(('word_{}'.format(k), v) for k, v in word_len_distribution.items())) char_len_distribution, char_max_len = analyze_len_distribution( sentences_train, level='char') analyze_result.update( dict(('char_{}'.format(k), v) for k, v in char_len_distribution.items())) one_hot = False if config.loss_function == 'binary_crossentropy' else True train_word_ids = create_data_matrices(word_tokenizer, variation_train_data, config.n_class, one_hot, word_max_len) train_char_ids = create_data_matrices(char_tokenizer, variation_train_data, config.n_class, one_hot, char_max_len) dev_word_ids = create_data_matrices(word_tokenizer, variation_dev_data, config.n_class, one_hot, word_max_len) dev_char_ids = create_data_matrices(char_tokenizer, variation_dev_data, config.n_class, one_hot, char_max_len) # create embedding matrix by training on dataset w2v_data = train_w2v(sentences_train + sentences_dev, lambda x: x.split(), word_tokenizer.word_index) c2v_data = train_w2v(sentences_train + sentences_dev, lambda x: list(x), char_tokenizer.word_index) w_fasttext_data = train_fasttext(sentences_train + sentences_dev, lambda x: x.split(), word_tokenizer.word_index) c_fasttext_data = train_fasttext(sentences_train + sentences_dev, lambda x: list(x), char_tokenizer.word_index) # w_glove_data = train_glove(sentences_train+sentences_dev, lambda x: x.split(), word_tokenizer.word_index) # c_glove_data = train_glove(sentences_train+sentences_dev, lambda x: list(x), char_tokenizer.word_index) # save pre-process data pickle_dump( format_filename(PROCESSED_DATA_DIR, TRAIN_DATA_TEMPLATE, variation=variation), variation_train_data) pickle_dump( format_filename(PROCESSED_DATA_DIR, DEV_DATA_TEMPLATE, variation=variation), variation_dev_data) pickle_dump( format_filename(PROCESSED_DATA_DIR, TRAIN_IDS_MATRIX_TEMPLATE, variation=variation, level='word'), train_word_ids) pickle_dump( format_filename(PROCESSED_DATA_DIR, TRAIN_IDS_MATRIX_TEMPLATE, variation=variation, level='char'), train_char_ids) pickle_dump( format_filename(PROCESSED_DATA_DIR, DEV_IDS_MATRIX_TEMPLATE, variation=variation, level='word'), dev_word_ids) pickle_dump( format_filename(PROCESSED_DATA_DIR, DEV_IDS_MATRIX_TEMPLATE, variation=variation, level='char'), dev_char_ids) np.save( format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, variation=variation, type='w2v_data'), w2v_data) np.save( format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, variation=variation, type='c2v_data'), c2v_data) np.save( format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, variation=variation, type='w_fasttext_data'), w_fasttext_data) np.save( format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, variation=variation, type='c_fasttext_data'), c_fasttext_data) # np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, variation=variation, # type='w_glove_data'), w_glove_data) # np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, variation=variation, # type='c_glove_data'), c_glove_data) pickle_dump( format_filename(PROCESSED_DATA_DIR, TOKENIZER_TEMPLATE, variation=variation, level='word'), word_tokenizer) pickle_dump( format_filename(PROCESSED_DATA_DIR, TOKENIZER_TEMPLATE, 
variation=variation, level='char'), char_tokenizer) pickle_dump( format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, variation=variation, level='word'), word_tokenizer.word_index) pickle_dump( format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, variation=variation, level='char'), char_tokenizer.word_index) # prepare ngram feature for vectorizer_type in ['binary', 'tf', 'tfidf']: for level in ['char', 'word']: for ngram_range in [(1, 1), (2, 2), (3, 3), (2, 3), (1, 3), (2, 4), (1, 4), (4, 4), (5, 5), (6, 6), (7, 7), (8, 8)]: prepare_ngram_feature(vectorizer_type, level, ngram_range, variation_train_data, variation_dev_data, variation) # prepare skip ngram features for vectorizer_type in ['binary', 'tf', 'tfidf']: for level in ['word', 'char']: for ngram in [2, 3]: for skip_k in [1, 2, 3]: prepare_skip_ngram_feature(vectorizer_type, level, ngram, skip_k, variation_train_data, variation_dev_data, variation) # prepare pos ngram variation_train_pos_data = { 'sentence': [ get_pos(sentence) for sentence in variation_train_data['sentence'] ], 'label': variation_train_data['label'] } variation_dev_pos_data = { 'sentence': [get_pos(sentence) for sentence in variation_dev_data['sentence']], 'label': variation_dev_data['label'] } for vectorizer_type in ['binary', 'tf', 'tfidf']: for level in ['word']: for ngram_range in [(1, 1), (2, 2), (3, 3)]: prepare_ngram_feature(vectorizer_type, level, ngram_range, variation_train_pos_data, variation_dev_pos_data, variation + '_pos') # save analyze result write_log( format_filename(LOG_DIR, ANALYSIS_LOG_TEMPLATE, variation=variation), analyze_result)