def preprocess_qa(source, nlp, w2vec_model, row_num=None, test=False):
    answer_n = len(source.answer_names)
    if row_num is None:
        quest_n_train = source.train_data.shape[0]
        quest_n_test = source.test_data.shape[0]
    else:
        quest_n_train, quest_n_test = row_num, row_num
    _helpScripts.print_msg('generate tree representations for qa')
    train_questions = source.train_data['question'].values[:row_num]
    train_answers = source.train_data[source.answer_names].values[:row_num]
    if test:
        test_questions = source.test_data['question'].values[:row_num]
        test_answers = source.test_data[source.answer_names].values[:row_num]
        sents_all = np.concatenate((train_questions, train_answers.flatten(),
                                    test_questions, test_answers.flatten()))
    else:
        sents_all = np.concatenate((train_questions, train_answers.flatten()))
    words, weights_trees, vocab, root_inds, weights_ = create_trees_all(sents_all, nlp)
    word_n = np.shape(weights_trees)[1]
    wc_num = len(weights_)
    mask_root_ind = get_root_ind_mask(root_inds, length=word_n)
    mask = np.zeros_like(mask_root_ind)
    for i, ri in enumerate(root_inds):
        mask[i, range(ri[-1] + 1)] = 1
    input_ = np.array([sents_to_w2v(sent, w2vec_model, word_n) for sent in words],
                      dtype='float32')
    # the first quest_n_train * (1 + answer_n) rows hold the train questions and answers
    train_q, train_a = unflaffen_qa(input_[:quest_n_train * (1 + answer_n)],
                                    quest_n_train, answer_n)
    mask_train_ri_q, mask_train_ri_a = unflaffen_qa(
        mask_root_ind[:quest_n_train * (1 + answer_n)], quest_n_train, answer_n)
    mask_train_q, mask_train_a = unflaffen_qa(mask[:quest_n_train * (1 + answer_n)],
                                              quest_n_train, answer_n)
    wt_train_q, wt_train_a = unflaffen_qa(weights_trees[:quest_n_train * (1 + answer_n)],
                                          quest_n_train, answer_n)
    if not test:  # no test sentences were processed above
        test_q, test_a = None, None
        mask_test_ri_q, mask_test_ri_a = None, None
        mask_test_q, mask_test_a = None, None
        wt_test_q, wt_test_a = None, None
    else:
        test_q, test_a = unflaffen_qa(input_[quest_n_train * (1 + answer_n):],
                                      quest_n_train, answer_n)
        mask_test_ri_q, mask_test_ri_a = unflaffen_qa(
            mask_root_ind[quest_n_train * (1 + answer_n):], quest_n_train, answer_n)
        mask_test_q, mask_test_a = unflaffen_qa(mask[quest_n_train * (1 + answer_n):],
                                                quest_n_train, answer_n)
        wt_test_q, wt_test_a = unflaffen_qa(weights_trees[quest_n_train * (1 + answer_n):],
                                            quest_n_train, answer_n)
    correct_answ_ = source.train_data['correctAnswer'].values[:row_num]
    targets = int32([np.where(answ == np.array(source.answer_sym))[0][0]
                     for answ in correct_answ_])
    return (train_q, train_a, test_q, test_a,
            wt_train_q, wt_train_a, wt_test_q, wt_test_a,
            mask_train_q, mask_train_a, mask_test_q, mask_test_a,
            mask_train_ri_q, mask_train_ri_a, mask_test_ri_q, mask_test_ri_a,
            vocab, word_n, wc_num, targets)
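# A minimal sketch (not the repo's unflaffen_qa, which lives in a helper module) of how
# the flattened layout built in preprocess_qa can be split back into question / answer
# blocks. The concatenation order is [q_1 .. q_N, a_1A, a_1B, ..., a_ND], so the first N
# rows are questions and the next N * answer_n rows are answers in row-major order.
def _unflatten_qa_sketch(block, quest_n, answer_n):
    questions = block[:quest_n]
    answers = block[quest_n:quest_n * (1 + answer_n)].reshape(
        (quest_n, answer_n) + block.shape[1:])
    return questions, answers

# e.g. for 2 questions with 4 answers, 7 words per sentence and 3-dim vectors:
# q, a = _unflatten_qa_sketch(np.zeros((10, 7, 3), dtype='float32'), 2, 4)
# q.shape == (2, 7, 3); a.shape == (2, 4, 7, 3)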
def pre_proc_quest(source, vocab, quest_data_file, row_num=None):
    #-----------------------------------------------tokenize question / answer data
    _helpScripts.print_msg('generate question / answer data')
    quest_n_train = source.train_data.shape[0]
    quest_n_test = source.test_data.shape[0]
    answer_sym = ['A', 'B', 'C', 'D']
    answer_names = ['answer' + s for s in answer_sym]
    train_quest_tok = np.array([nltk.word_tokenize(sent)
                                for sent in source.train_data['question'].values])
    test_quest_tok = np.array([nltk.word_tokenize(sent)
                               for sent in source.test_data['question'].values])
    train_quest_sl = [len(t) for t in train_quest_tok]
    test_quest_sl = [len(t) for t in test_quest_tok]
    QUEST_SENT_L = np.max(train_quest_sl + test_quest_sl)
    train_answ_tok = np.transpose(np.array([[nltk.word_tokenize(sent)
                                             for sent in source.train_data[a].values]
                                            for a in answer_names]))
    test_answ_tok = np.transpose(np.array([[nltk.word_tokenize(sent)
                                            for sent in source.test_data[a].values]
                                           for a in answer_names]).reshape((-1, 4)))
    train_answ_sl = [len(t) for t in train_answ_tok.flatten()]
    test_answ_sl = [len(t) for t in test_answ_tok.flatten()]
    ANSW_SENT_L = np.max(train_answ_sl + test_answ_sl)
    print 'max question length: {}, max answer length: {}'.format(QUEST_SENT_L, ANSW_SENT_L)
    #-------------------------------------------------------generate the input data
    _helpScripts.print_msg('generate indexing for questions', allign='left')
    PAD_TOKEN_IND = len(vocab) - 1
    train_quest = np.expand_dims(
        int16(sents2indPar(train_quest_tok, vocab, proc=8, max_l=QUEST_SENT_L)), 1)
    test_quest = np.expand_dims(
        int16(sents2indPar(test_quest_tok, vocab, proc=8, max_l=QUEST_SENT_L)), 1)
    _helpScripts.print_msg('generate indexing for answers', allign='left')
    train_answ = int16(sents2indPar(train_answ_tok.reshape((-1)), vocab, proc=8,
                                    max_l=ANSW_SENT_L)).reshape((-1, 4, ANSW_SENT_L))
    test_answ = int16(sents2indPar(test_answ_tok.reshape((-1)), vocab, proc=8,
                                   max_l=ANSW_SENT_L)).reshape((-1, 4, ANSW_SENT_L))
    correct_answ_ = source.train_data['correctAnswer'].values
    train_correct_answ = float32([np.where(answ == np.array(answer_sym))[0][0]
                                  for answ in correct_answ_])
    #-------------------------------------------------------------save data to file
    _helpScripts.print_msg('save data to file ' + quest_data_file, allign='left')
    with open(quest_data_file, 'wb') as f:
        cPickle.dump((train_quest, train_answ, train_correct_answ,
                      test_quest, test_answ), f, protocol=2)
    _helpScripts.print_msg('completed')
    return (train_quest, train_answ, train_correct_answ, test_quest, test_answ)
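# Hedged sketch of the token -> vocabulary-index mapping that sents2indPar is used for
# above. The real helper runs in parallel with `proc` workers; this single-process
# version is only an assumed equivalent: unknown tokens and padding both fall back to
# PAD_TOKEN_IND = len(vocab) - 1, and every sentence is padded to max_l indices.
def _sents_to_ind_sketch(sents_tok, vocab, max_l):
    pad_ind = len(vocab) - 1
    word2ind = {w: i for i, w in enumerate(vocab)}
    out = np.full((len(sents_tok), max_l), pad_ind, dtype='int16')
    for r, toks in enumerate(sents_tok):
        for c, tok in enumerate(toks[:max_l]):
            out[r, c] = word2ind.get(tok, pad_ind)
    return out

# _sents_to_ind_sketch([['what', 'is', 'osmosis'], ['diffusion']], VOCAB, QUEST_SENT_L)
# -> int16 array of shape (2, QUEST_SENT_L)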
def pre_process_context(context_data_file, vocab_size=VOCAB_SIZE, row_num=None):
    #-----------------------------------------------------------load whoosh results
    _helpScripts.print_msg('load whoosh results')
    with open(XAPIAN_DIR + 'train.pickle', 'rb') as f:
        wh_pred_answ_train, wh_scores_train, wh_themes_train = pickle.load(f)
    with open(XAPIAN_DIR + 'test.pickle', 'rb') as f:
        wh_pred_answ_test, wh_scores_test, wh_themes_test = pickle.load(f)
    if row_num is not None:
        wh_pred_answ_train, wh_scores_train, wh_themes_train = \
            wh_pred_answ_train[:row_num], wh_scores_train[:row_num], wh_themes_train[:row_num]
        wh_pred_answ_test, wh_scores_test, wh_themes_test = \
            wh_pred_answ_test[:row_num], wh_scores_test[:row_num], wh_themes_test[:row_num]
    themes_train_ = [wh_th.flatten() for wh_th in wh_themes_train]
    themes_train = set([item for sublist in themes_train_ for item in sublist])
    themes_test_ = [wh_th.flatten() for wh_th in wh_themes_test]
    themes_test = set([item for sublist in themes_test_ for item in sublist])
    themes_all = themes_train | themes_test
    themes_all -= set([0])
    themes_name_all = set([os.path.splitext(os.path.basename(th))[0]
                           for th in list(themes_all)])
    print '{} articles are used from whoosh results'.format(len(themes_name_all))
    #--------------------------------------------------------------tokenize results
    _helpScripts.print_msg('tokenize results')
    sentences_, art_names = tokenize_docs(list(themes_all), mode='append',
                                          add_sep=False,
                                          max_sent_length=MAX_SENT_LENGTH,
                                          min_sent_length=MIN_SENT_LENGTH)
    sentences = []
    sentences.extend(list(itertools.chain(*sentences_)))
    sent_l = [len(s) for s in sentences]
    print '{} sentences with sentence length: min = {}, max = {}, median = {}'.format(
        len(sent_l), np.min(sent_l), np.max(sent_l), np.median(sent_l))
    #-------------------------------------------------------------------make source
    _helpScripts.print_msg('make source')
    source = _aai_data.Source(lower=True, create_sents=False)
    sentences.extend(source.sent_train_tokens)
    sentences.extend(source.sent_test_tokens)
    quest_n_train = source.train_data.shape[0]
    quest_n_test = source.test_data.shape[0]
    print 'there are {} train questions and {} test questions'.format(quest_n_train,
                                                                      quest_n_test)
    #-----------------------------------------------------------generate vocabulary
    _helpScripts.print_msg('generate vocabulary')
    VOCAB = generate_vocab(sentences, vocab_size=vocab_size)
    vocab_size = len(VOCAB)
    PAD_TOKEN_IND = len(VOCAB) - 1
    #----------------------------------------------------------------select themes
    _helpScripts.print_msg('select themes')
    _helpScripts.print_msg('for train', allign='left')
    themes_train = select_themes(wh_scores_train, wh_themes_train,
                                 themes_quest=ARTICLES_QUEST)
    themes_train_names = [[os.path.splitext(os.path.basename(th))[0] for th in th_q]
                          for th_q in themes_train]
    _helpScripts.print_msg('for test', allign='left')
    themes_test = select_themes(wh_scores_test, wh_themes_test,
                                themes_quest=ARTICLES_QUEST)
    themes_test_names = [[os.path.splitext(os.path.basename(th))[0] for th in th_q]
                         for th_q in themes_test]
    #-------------------------------------------------------------generate indices
    _helpScripts.print_msg('generate indices')
    _helpScripts.print_msg('for whole context', allign='left')
    sent_count = [[len(sents) for sents in sentences_[i]]
                  for i in range(len(sentences_)) if art_names[i] in themes_all]
    sent_count = list(itertools.chain(*sent_count))
    # MAX_ART_THEME = np.median(sent_count)
    sentences_ = np.array(sentences_)
    # themes_all = list(themes_all)
    article_inds = get_art_word_ind(sentences_, VOCAB, MAX_SENT_ART,
                                    MAX_SENT_LENGTH, PAD_TOKEN_IND)
    #-------------------------------------------------------------save data to file
    _helpScripts.print_msg('save data to file ' + context_data_file, allign='left')
    with open(context_data_file, 'wb') as f:
        cPickle.dump((VOCAB, sentences_, art_names, source, article_inds,
                      themes_train, themes_test), f, protocol=2)
    _helpScripts.print_msg('completed')
    return (VOCAB, sentences_, art_names, source, article_inds,
            themes_train, themes_test)
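# Usage sketch for pre_process_context (illustrative: the output path below is made up,
# and XAPIAN_DIR, VOCAB_SIZE and the other module-level constants must already be set).
# Running it on a handful of questions is a cheap way to exercise the whoosh pickles,
# tokenisation and vocabulary generation end to end.
def _smoke_test_context(row_num=20):
    (VOCAB, sentences_, art_names, source, article_inds,
     themes_train, themes_test) = pre_process_context(
        './data/context_data_small.cpickle', vocab_size=VOCAB_SIZE, row_num=row_num)
    print 'vocab: {} tokens, {} articles, article_inds shape: {}'.format(
        len(VOCAB), len(art_names), np.shape(article_inds))
    return article_inds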
""" return LL.get_output(encode_layer, inputs=X).eval() def get_layer_by_name(net, name): """ from https://github.com/mikesj-public/convolutional_autoencoder/blob/master/mnist_conv_autoencode.ipynb """ for i, layer in enumerate(net.get_all_layers()): if layer.name == name: return layer, i return None, None if __name__ == "__main__": _helpScripts.print_msg('start at time ' + strftime("%Y-%m-%d %H:%M:%S")) # print strftime("%Y-%m-%d %H:%M:%S") startTime = datetime.datetime.now() MAX_SAMPLES = None os.chdir('/Users/lexx/Documents/Work/Kaggle/AllenAIScience/') if not os.path.isdir(saveDir): os.makedirs(saveDir) #--------------------------------------------------generate / load context data context_data_file = saveDir + 'context_data.cpickle' if os.path.isfile(context_data_file): _helpScripts.print_msg('load context data from file: ' + context_data_file) with open(context_data_file, 'rb') as f: (VOCAB, sentences_, art_names, source, article_inds, themes_train, themes_test) = cPickle.load(f) else: (VOCAB, sentences_, art_names, source,
    return scores__


def predict_answer(answer_sym, scores):
    pred_answ_ = np.argmax(scores, axis=1)
    pred_answ = [answer_sym[i] for i in pred_answ_]
    return pred_answ


if __name__ == "__main__":
    # os.chdir('/Users/lexx/Documents/Work/Kaggle/AllenAIScience/')
    # corpus_tokens = get_all_corpus(search_dir)
    xapian_dir = './data/xapian/{0}/'.format(XAPIAN_VERSION)
    pmi_dir = './data/pmi/{0}/'.format(PMI_VERSION)
    # ------------------------------------------------------------------load results
    _helpScripts.print_msg('load results')
    _helpScripts.print_msg('pmi version {}'.format(PMI_VERSION), allign='left')
    with open(pmi_dir + 'train.pickle', 'rb') as f:
        (pmi_pred_answ_train, pmi_train, pmi_colls_train) = cPickle.load(f)
    with open(pmi_dir + 'test.pickle', 'rb') as f:
        (pmi_pred_answ_test, pmi_test, pmi_colls_test) = cPickle.load(f)
    _helpScripts.print_msg('xapian version {}'.format(XAPIAN_VERSION), allign='left')
    with open(xapian_dir + 'train.pickle', 'rb') as f:
        (x_pred_answ_train, x_train_scores, x_train_paths) = pickle.load(f)
    with open(xapian_dir + 'test.pickle', 'rb') as f:
        (x_pred_answ_test, x_test_scores, x_test_paths) = pickle.load(f)
    # ------------------------------------------------------------------------source
    source = _aai_data.Source(create_sents=False)
    # ---------------------------------------------------------------combine results
    x_train_scores_ = combine_scores_x(x_train_scores, mode='sum',
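# Stand-alone check of predict_answer: the argmax over the four answer columns picks
# the letter with the highest combined score.
_example_scores = np.array([[0.1, 0.7, 0.1, 0.1],
                            [0.4, 0.1, 0.3, 0.2]])
assert predict_answer(['A', 'B', 'C', 'D'], _example_scores) == ['B', 'A']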
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
#                     level=logging.INFO)
logger = logging.getLogger()
fhandler = logging.FileHandler(filename='./data/w2v_models/mylog.log', mode='a')
formatter = logging.Formatter('%(asctime)s : %(levelname)s : %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.INFO)

answer_sym = ['A', 'B', 'C', 'D']
answer_names = ['answer' + s for s in answer_sym]
# quest_num_train = source.train_data.shape[0]
if not os.path.isfile(w2vec_model_name):
    _helpScripts.print_msg('prepare the model')
    w2vec_model = Word2Vec(size=500, alpha=0.1, window=5, min_count=5,
                           max_vocab_size=None, sample=1e-5, seed=1, workers=16,
                           min_alpha=0.0001, sg=1, hs=1, negative=5, cbow_mean=0,
                           iter=2, null_word=0, trim_rule=None)
    # if os.path.isfile('./data/w2v_models/sentences'):
    #     with open(sent_file, 'rb') as f:
    #         sentences = pickle.load(f)
    # else:
    wiki_dir = './data/corpus/wiki_text_mod/'
    ck12_full_spl_dir = u'./data/corpus/ck12_full_themes_mod/'
    ck12_wiki_spl_dir = u'./data/corpus/wiki_text_ck12_themes_mod/'
    _helpScripts.print_msg('tokenize wiki', allign='left')
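# Hedged sketch of how a trained skip-gram model like the one configured above can score
# a question/answer pair: gensim's Word2Vec.n_similarity compares two token lists, and
# filtering to in-vocabulary tokens avoids KeyErrors for words dropped by min_count=5.
def _w2v_answer_score_sketch(model, quest_tokens, answ_tokens):
    q = [t for t in quest_tokens if t in model.vocab]
    a = [t for t in answ_tokens if t in model.vocab]
    if not q or not a:
        return 0.0
    return model.n_similarity(q, a)

# scores = [_w2v_answer_score_sketch(w2vec_model, quest_tok, answ_tok)
#           for answ_tok in answ_toks]  # np.argmax(scores) gives the predicted letter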
    return leave_idx


if __name__ == '__main__':
    os.chdir('/Users/lexx/Documents/Work/Kaggle/AllenAIScience/')
    save_dir = './data/generated_sents/'
    source = _aai_data.Source()
    art_dirs_pre = u'./data/corpus/'
    wiki_dir = u'wiki_text_mod_spl_50/'
    ck12_full_spl_dir = u'ck12_full_themes_mod_spl_50/'
    ck12_wiki_spl_dir = u'wiki_text_ck12_themes_mod_spl_50/'
    art_dirs = [wiki_dir, ck12_full_spl_dir, ck12_wiki_spl_dir]

    _helpScripts.print_msg('Generate sentences')
    (sents_true, sents_false) = generate_sents_par(art_dirs_pre + art_dirs[2],
                                                   pos_count=1, neg_count=3,
                                                   rng=np.random.RandomState(156),
                                                   save_dir=save_dir + art_dirs[2])
    sents_true = np.array(sents_true)
    sents_false = np.array(sents_false)
    full_idx = get_full_sents(sents_true, sents_false)
    sents_true = sents_true[full_idx]
    sents_false = sents_false[full_idx]


def temp():
    art_files = [glob.glob(art_dirs_pre + art_dir + '*.txt')
                 for art_dir in art_dirs]
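# Hedged sketch: one plausible way to turn the generated positive / negative sentence
# pairs above into a flat labelled set (1 for true sentences, 0 for corrupted ones).
# Illustrative only; it assumes sents_false holds neg_count corrupted sentences per
# true sentence, and the repo's downstream scripts may arrange the data differently.
def _label_generated_sents_sketch(sents_true, sents_false):
    neg = list(np.asarray(sents_false).ravel())
    sents = list(sents_true) + neg
    labels = np.array([1] * len(sents_true) + [0] * len(neg), dtype='int8')
    return sents, labels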
""" return LL.get_output(encode_layer, inputs=X).eval() def get_layer_by_name(net, name): """ from https://github.com/mikesj-public/convolutional_autoencoder/blob/master/mnist_conv_autoencode.ipynb """ for i, layer in enumerate(net.get_all_layers()): if layer.name == name: return layer, i return None, None if __name__ == '__main__': _helpScripts.print_msg('start at time ' + strftime("%Y-%m-%d %H:%M:%S")) # print strftime("%Y-%m-%d %H:%M:%S") startTime = datetime.datetime.now() # os.chdir('/Users/lexx/Documents/Work/Kaggle/AllenAIScience/') if not os.path.isdir(saveDir): os.makedirs(saveDir) nlp = _spEnglish(data_dir=os.environ.get('SPACY_DATA', _spLOCAL_DATA_DIR)) #--------------------------------------------------------------load w2vec model w2vec_model_name = W2VEC_DIR + 'm{}'.format(W2VEC_VERSION) _helpScripts.print_msg('load the word2vec model from', w2vec_model_name) w2vec_model = Word2Vec.load(w2vec_model_name) w2vec_model.init_sims(replace=True) repr_length = w2vec_model.syn0.shape[1] #--------------------------------------------------generate / load context data #-------------------------------------------------------------------make source