def process_data(dataset: str, neighbor_sample_size: int, K: int):
    drug_vocab = {}
    entity_vocab = {}
    relation_vocab = {}

    read_entity2id_file(ENTITY2ID_FILE[dataset], drug_vocab, entity_vocab)

    examples_file = format_filename(PROCESSED_DATA_DIR, DRUG_EXAMPLE, dataset=dataset)
    # each example holds positive and negative samples: [drug1, drug2, interaction]
    examples = read_example_file(EXAMPLE_FILE[dataset], SEPARATOR[dataset], drug_vocab)
    print(len(examples))
    np.save(examples_file, examples)

    adj_entity, adj_relation = read_kg(KG_FILE[dataset], entity_vocab, relation_vocab,
                                       neighbor_sample_size)

    pickle_dump(format_filename(PROCESSED_DATA_DIR, DRUG_VOCAB_TEMPLATE, dataset=dataset),
                drug_vocab)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, ENTITY_VOCAB_TEMPLATE, dataset=dataset),
                entity_vocab)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, RELATION_VOCAB_TEMPLATE, dataset=dataset),
                relation_vocab)

    adj_entity_file = format_filename(PROCESSED_DATA_DIR, ADJ_ENTITY_TEMPLATE, dataset=dataset)
    np.save(adj_entity_file, adj_entity)
    print('Logging Info - Saved:', adj_entity_file)

    adj_relation_file = format_filename(PROCESSED_DATA_DIR, ADJ_RELATION_TEMPLATE, dataset=dataset)
    np.save(adj_relation_file, adj_relation)
    print('Logging Info - Saved:', adj_relation_file)

    cross_validation(K, examples, dataset, neighbor_sample_size)
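The snippet above (and several below) relies on small project utilities such as `format_filename` and `pickle_dump` that are not shown in this collection. A minimal sketch of what they are assumed to look like, inferred only from how they are called here (names and argument order are assumptions):

import os
import pickle

def format_filename(base_dir, template, **kwargs):
    # Assumed helper: fill a filename template (e.g. '{dataset}_drug_vocab.pkl')
    # with keyword arguments and join it onto the processed-data directory.
    return os.path.join(base_dir, template.format(**kwargs))

def pickle_dump(filename, obj):
    # Assumed helper: serialize obj to filename. Note that other projects in this
    # collection use the reversed (obj, filename) argument order.
    with open(filename, 'wb') as f:
        pickle.dump(obj, f)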
def eval_deprecated():
    """Partially deprecated."""
    k = opt.k
    train_node = TrainNode(opt)
    idx2bin = {}
    dataset = utils.load_data('query')
    dataset = dataset.to(device)
    dsnode_path = opt.dsnode_path + str(opt.n_clusters)
    dsnode = utils.pickle_load(dsnode_path)
    print('dsnode {}'.format(dsnode))
    # idx2bin maps the index of a query in the entire dataset to the index of its leaf bin
    train_node.train(dataset, dsnode, idx2bin)
    # the eval root holds the answer-set indices and bins needed for evaluation
    eval_root = train_node.create_eval_tree()
    idx2bin = eval_root.idx2bin

    # serialize
    print('train.py - serializing model evaluation tree...')
    eval_root_path = osp.join(opt.data_dir, 'model_eval_root')
    utils.pickle_dump(eval_root, eval_root_path)

    # evaluate
    queryset = utils.load_data('query')
    neighbors = utils.load_data('answers')
    acc, probe_count = eval_model(eval_root, queryset, neighbors, opt)
    print('train.py - Query set prediction acc {} probe count {}'.format(acc, probe_count))
def get_validation_split(data_file, training_file, validation_file, data_split=0.8,
                         overwrite=False):
    """
    Splits the data into the training and validation indices lists.
    :param data_file: pytables hdf5 data file
    :param training_file: pickle file storing the training indices
    :param validation_file: pickle file storing the validation indices
    :param data_split: fraction of samples assigned to the training split
    :param overwrite: if True, recreate the split even if the files already exist
    :return: (training indices, validation indices)
    """
    if overwrite or not os.path.exists(training_file):
        print("Creating validation split...")
        nb_samples = data_file.root.data.shape[0]
        sample_list = list(range(nb_samples))
        training_list, validation_list = split_list(sample_list, split=data_split)
        pickle_dump(training_list, training_file)
        pickle_dump(validation_list, validation_file)
        return training_list, validation_list
    else:
        print("Loading previous validation split...")
        return pickle_load(training_file), pickle_load(validation_file)
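`split_list`, `pickle_dump` and `pickle_load` above are assumed to come from the project's utility module; a plausible minimal version is sketched below (the shuffle behavior is an assumption):

import pickle
import random

def split_list(input_list, split=0.8, shuffle_list=True):
    # Assumed helper: split a list of sample indices into training/validation parts.
    if shuffle_list:
        random.shuffle(input_list)
    n_training = int(len(input_list) * split)
    return input_list[:n_training], input_list[n_training:]

def pickle_dump(item, out_file):
    with open(out_file, 'wb') as opened_file:
        pickle.dump(item, opened_file)

def pickle_load(in_file):
    with open(in_file, 'rb') as opened_file:
        return pickle.load(opened_file)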
def main():
    model_file = os.path.join(modeldir, 'commoncrawl_fr-en.bin')
    trajectory_file = os.path.join(datadir, 'eolss-train.trajectories+scores.txt')
    con = Connection(configuration=client_conf)
    con.set_globals(trajectory_file=trajectory_file, model_file=model_file, gamma=gamma)
    con.run(load_data)
    trajectories.compute_scores(phi, gamma)
    # in fitted-value iteration we care only about s'
    transitions = [(s, r) for _, s, _, r in trajectories.SBIRL(phi)]
    shuffle(transitions)
    regressor = None
    for k in range(n_iterations):
        print('Iteration', k)
        con.set_globals(regressor=regressor)
        training_set = con.map(training_, transitions)
        regressor = get_regressor(training_set)
        pickle_dump(regressor, 'output/regressor.{}.pickle'.format(k + 1))
def repetitive(directory='.'):
    stats = {'repetitive_count': {}, 'query_count': {}}
    repetitive_queries = set()
    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue
        project_type_name = repo.project_type.name
        for action in Action.objects.filter(attempt=repo.latest_successful_attempt):
            queries = [query.content.strip() for query in Query.objects.filter(action=action)]
            for i in range(1, len(queries)):
                if queries[i] == queries[i - 1]:
                    repetitive_queries.add(queries[i])
                    print(project_type_name)
                    print(queries[i])
                    print()
                    stats['repetitive_count'][project_type_name] = \
                        stats['repetitive_count'].get(project_type_name, 0) + 1
            stats['query_count'][project_type_name] = \
                stats['query_count'].get(project_type_name, 0) + len(queries)
    pickle_dump(directory, 'repetitive_queries', repetitive_queries)
    dump_all_stats(directory, stats)
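This project calls `pickle_dump(directory, name, obj)` and `dump_all_stats(directory, stats)`, neither of which is shown here. A hedged sketch of what they might look like, inferred only from the call sites (the file naming is an assumption):

import os
import pickle

def pickle_dump(directory, name, obj):
    # Assumed helper: dump obj to <directory>/<name>.pickle.
    with open(os.path.join(directory, name + '.pickle'), 'wb') as f:
        pickle.dump(obj, f)

def dump_all_stats(directory, stats):
    # Assumed helper: persist each collected stats dict under its own name.
    for name, values in stats.items():
        pickle_dump(directory, name, values)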
def cache_sites(se_sites_path, api_key):
    url = 'https://api.stackexchange.com/2.2/sites'
    params = {'pagesize': 100}
    json_items = call_api(url, params)
    sites = []
    for item in json_items:
        api_name = item.get('api_site_parameter')
        sites.append(api_name)
    utils.pickle_dump(se_sites_path, sites)
    print('cached se site list to file')
def pickle_self(self):
    self.total_time = time() - self.t0
    self.url_file.write('\nUrl counts: {}\nDuplicate counts: {}'.format(
        self.url_count, self.duplicate_count))
    self.url_file.write('\nTime taken: {}'.format(self.total_time))
    if self.depth_reached > self.max_depth:
        self.depth_reached -= 1
    self.url_file.write('\nDepth reached: {}'.format(self.depth_reached))
    self.url_file.close()
    self.url_file = None
    self.conn.close()
    utils.pickle_dump(self.state_path, self)
def check_refresh_complete(self):
    if self.rs.refresh_status["no_need_update"]:
        self.status_changed(self.orig_office_status)
        return False
    if self.rs.refresh_status["last_notification"] == False:
        return True
    if self.last_nid == self.rs.last_nid and int(self.last_nid) != 0:
        logging.debug("self.last_nid == self.rs.last_nid")
        self.status_changed(self.orig_office_status)
        return False
    if self.rs.refresh_status["current_status"] == True:
        self.rs.refresh_status["current_status"] = False
        self.current_status = self.rs.current_status
        self.refresh_status_changed(defs.CURRENT_STATUS_COMPLETED)
    if self.rs.refresh_status["notification"] == True and \
            self.rs.refresh_status["comments"] == True:
        self.rs.refresh_status["notification"] = False
        self.rs.refresh_status["comments"] = False
        self.notification = self.rs.notification
        self.status = self.rs.status
        self.refresh_status_changed(defs.NOTIFICATION_COMMENTS_COMPLETED)
    if self.rs.refresh_status["users_icon"] == True:
        self.rs.refresh_status["users_icon"] = False
        self.user_ids = self.rs.user_ids
        self.users = self.rs.users
        self.refresh_status_changed(defs.USERS_ICON_COMPLETED)
    if self.rs.refresh_status["apps_icon"] == True:
        self.rs.refresh_status["apps_icon"] = False
        self.app_ids = self.rs.app_ids
        self.applications = self.rs.applications
        self.refresh_status_changed(defs.APPS_ICON_COMPLETED)
    if self.rs.isAlive():
        return True
    logging.debug("completed")
    self.last_nid = self.rs.last_nid
    for k in self.rs.refresh_status:
        self.rs.refresh_status[k] = False
    path = self.local_data_dir + "/cache.pickle"
    utils.pickle_dump(self, path)
    self.status_changed(self.orig_office_status)
    return False
def save_checkpoint(model, infos, optimizer, append='tr'):
    if len(append) > 0:
        append = '-' + append
    # create checkpoint_path if it doesn't exist
    if not os.path.isdir(opt.checkpoint_path):
        os.makedirs(opt.checkpoint_path)
    checkpoint_path = os.path.join(opt.checkpoint_path, 'model%s.pth' % (append))
    torch.save(model.state_dict(), checkpoint_path)
    print("model saved to {}".format(checkpoint_path))
    with open(os.path.join(opt.checkpoint_path, 'infos%s.pkl' % (append)), 'wb') as f:
        pickle_dump(infos, f)
    optimizer_path = os.path.join(opt.checkpoint_path, 'optimizer%s.pth' % (append))
    torch.save(optimizer.state_dict(), optimizer_path)
def main(params):
    imgs = json.load(open(params['input_json'], 'r'))
    itow = json.load(open(params['dict_json'], 'r'))['ix_to_word']
    wtoi = {w: i for i, w in itow.items()}
    imgs = imgs['images']
    ngram_words, ngram_idxs, ref_len = build_dict(imgs, wtoi, params)
    utils.pickle_dump({'document_frequency': ngram_words, 'ref_len': ref_len},
                      open(params['output_pkl'] + '-words.p', 'wb'))
    utils.pickle_dump({'document_frequency': ngram_idxs, 'ref_len': ref_len},
                      open(params['output_pkl'] + '-idxs.p', 'wb'))
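Unlike most snippets in this collection, the two above pass an already-open binary file object to `pickle_dump` rather than a path. A purely illustrative wrapper that tolerates both calling styles (not the actual utils implementation of any of these projects):

import pickle

def pickle_dump(obj, file_or_path):
    # Illustrative only: accept either an open binary file handle or a filesystem path.
    if hasattr(file_or_path, 'write'):
        pickle.dump(obj, file_or_path)
    else:
        with open(file_or_path, 'wb') as f:
            pickle.dump(obj, f)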
def cv_split(train_data, dev_data, cate3_vocab, fold=5, balanced=True, random_state=42):
    def indexing_data(data, indices):
        part_data = {}
        for k in data.keys():
            part_data[k] = [data[k][i] for i in indices]
        return part_data

    all_data = {}
    for key in train_data.keys():
        all_data[key] = train_data[key] + dev_data[key]

    # some categories in the validation set are not in cate3_vocab
    cate3_id_list = [cate3_vocab.get(cate3, 0) for cate3 in all_data['cate3']]
    index_range = np.arange(len(all_data['id']))

    if balanced:
        kf = StratifiedKFold(n_splits=fold, shuffle=True, random_state=random_state)
    else:
        kf = KFold(n_splits=fold, shuffle=True, random_state=random_state)

    for idx, (train_index, dev_index) in enumerate(kf.split(index_range, cate3_id_list)):
        train_data_fold = indexing_data(all_data, train_index)
        dev_data_fold = indexing_data(all_data, dev_index)
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR, TRAIN_CV_DATA_TEMPLATE,
                            random=random_state, fold=fold, index=idx),
            train_data_fold)
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR, DEV_CV_DATA_TEMPLATE,
                            random=random_state, fold=fold, index=idx),
            dev_data_fold)
def get_usr_mov_features(model: Model, params_file_path, poster_path):
    usr_pkl = {}
    mov_pkl = {}

    # load the saved parameters into the model and switch to eval() mode
    model_state_dict = load_params(params_file_path)
    model.load_dict(model_state_dict)
    model.eval()

    # fetch the full dataset
    dataset = model.Dataset.dataset
    for i in range(len(dataset)):
        # get the user data, movie data and rating for this sample;
        # only users and movies that appear in the samples are converted here --
        # in production the full data from the business system could be used instead
        usr_info, mov_info, score = dataset[i]['usr_info'], dataset[i]['mov_info'], dataset[i]['scores']
        usrid = str(usr_info['usr_id'])
        movid = str(mov_info['mov_id'])

        # compute the user feature and cache it in the usr_pkl dict
        if usrid not in usr_pkl.keys():
            usr_id_v = list2tensor(usr_info['usr_id'], [1])
            usr_age_v = list2tensor(usr_info['age'], [1])
            usr_gender_v = list2tensor(usr_info['gender'], [1])
            usr_job_v = list2tensor(usr_info['job'], [1])
            usr_in = [usr_id_v, usr_gender_v, usr_age_v, usr_job_v]
            usr_feat = model.get_usr_feat(usr_in)
            usr_pkl[usrid] = usr_feat.numpy()

        # compute the movie feature and cache it in the mov_pkl dict
        if movid not in mov_pkl.keys():
            mov_id_v = list2tensor(mov_info['mov_id'], [1])
            mov_tit_v = list2tensor(mov_info['title'], [1, 1, 15])
            mov_cat_v = list2tensor(mov_info['category'], [1, 6])
            mov_in = [mov_id_v, mov_cat_v, mov_tit_v, None]
            mov_feat = model.get_mov_feat(mov_in)
            mov_pkl[movid] = mov_feat.numpy()

    # save the features to disk
    pickle_dump(usr_pkl, './usr_feat.pkl')
    pickle_dump(mov_pkl, './mov_feat.pkl')
    print("usr & mov features saved!!!")
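The dumped `usr_feat.pkl` / `mov_feat.pkl` dictionaries are presumably consumed later for similarity-based recommendation. A minimal sketch of how they might be loaded and ranked (the cosine-similarity ranking and all names below are assumptions, not part of the original code):

import pickle
import numpy as np

def load_feature_dict(path):
    with open(path, 'rb') as f:
        return pickle.load(f)

usr_feats = load_feature_dict('./usr_feat.pkl')
mov_feats = load_feature_dict('./mov_feat.pkl')

def top_k_movies(usr_id, k=10):
    # Rank movies for one user by cosine similarity of the saved feature vectors.
    u = usr_feats[str(usr_id)].flatten()
    scores = {}
    for mov_id, m in mov_feats.items():
        m = m.flatten()
        scores[mov_id] = float(np.dot(u, m) / (np.linalg.norm(u) * np.linalg.norm(m) + 1e-8))
    return sorted(scores, key=scores.get, reverse=True)[:k]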
def main(args):
    config_path = args.config
    name = args.name + "_{}".format(int(time.time()))
    config = Config(config_path)

    if not os.path.exists(TRAINING_RESULTS):
        os.makedirs(TRAINING_RESULTS)
    if not os.path.exists(MODELS_PATH):
        os.makedirs(MODELS_PATH)
    if not os.path.exists(CV_PARAMS_PATH):
        os.makedirs(CV_PARAMS_PATH)

    X_train, X_test, y_train, y_test = config.get_data_from_config()
    grid = config.get_estimator_from_config()
    grid.fit(X_train, y_train)
    terminal_break()
    print("Training finished")

    predictions = grid.predict(X_test)
    report = classification_report(y_test, predictions)
    report_path = os.path.join(TRAINING_RESULTS, name + '_report.txt')
    print("Classification Report stored in {}".format(report_path))
    print(report)
    with open(report_path, 'w') as f:
        f.write(report)

    model_path = os.path.join(MODELS_PATH, name + '_model.pkl')
    print("\n Pickling and saving best model at {}".format(model_path))
    pickle_dump(grid.best_estimator_, model_path)

    cv_params_and_score = {
        'best_score': grid.best_score_,
        'best_params': grid.best_params_
    }
    params_path = os.path.join(CV_PARAMS_PATH, name + '_params.txt')
    dict_dump(cv_params_and_score, params_path)
def main():
    args = get_args()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    itos, stoi = generate_vocab_mappings(CHAR_VOCAB_PATH)
    print('len vocabulary:', len(stoi))

    model = WordNLM(args.word_embedding_size, len(itos), args.hidden_dim, args.layer_num)
    model.to(device)
    if args.load_from is not None:
        if args.load_from == "LSTM":
            weight_path = MODELS_HOME + "/" + args.load_from + ".pth.tar"
        else:
            weight_path = MODELS_HOME + args.load_from
        model = load_WordNLM_model(weight_path, model, device, args.load_from)
    else:
        assert False
    model.eval()

    if args.test == "gender":
        parameters = {"gender_model": model, "gender_device": device, "vocab_mapping": stoi}
    elif args.test == "syntax":
        path = DATASETS_PATHS[args.dataset]
        parameters = {"path": path, "syntactic_model": model, "syntactic_device": device,
                      "vocab_mapping": stoi}
    elif args.test == "test2":
        pass

    result = TESTS[args.test](**parameters)
    print(result)

    if args.test == "gender":
        result_name = BASE_RESULTS_PATH + args.load_from + "_" + RESULTS_PATHS["gender"]
    elif args.test == "syntax":
        result_name = BASE_RESULTS_PATH + args.load_from + "_" + RESULTS_PATHS[args.dataset]
    pickle_dump(result, result_name)
    return result
def process_data(dataset: str, neighbor_sample_size: int):
    user_vocab = {}
    item_vocab = {}
    entity_vocab = {}
    relation_vocab = {}

    read_item2entity_file(ITEM2ENTITY_FILE[dataset], item_vocab, entity_vocab)
    train_data, dev_data, test_data = read_rating_file(RATING_FILE[dataset],
                                                       SEPARATOR[dataset],
                                                       THRESHOLD[dataset],
                                                       user_vocab, item_vocab)
    adj_entity, adj_relation = read_kg(KG_FILE[dataset], entity_vocab, relation_vocab,
                                       neighbor_sample_size)

    pickle_dump(format_filename(PROCESSED_DATA_DIR, USER_VOCAB_TEMPLATE, dataset=dataset),
                user_vocab)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, ITEM_VOCAB_TEMPLATE, dataset=dataset),
                item_vocab)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, ENTITY_VOCAB_TEMPLATE, dataset=dataset),
                entity_vocab)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, RELATION_VOCAB_TEMPLATE, dataset=dataset),
                relation_vocab)

    train_data_file = format_filename(PROCESSED_DATA_DIR, TRAIN_DATA_TEMPLATE, dataset=dataset)
    np.save(train_data_file, train_data)
    print('Logging Info - Saved:', train_data_file)

    dev_data_file = format_filename(PROCESSED_DATA_DIR, DEV_DATA_TEMPLATE, dataset=dataset)
    np.save(dev_data_file, dev_data)
    print('Logging Info - Saved:', dev_data_file)

    test_data_file = format_filename(PROCESSED_DATA_DIR, TEST_DATA_TEMPLATE, dataset=dataset)
    np.save(test_data_file, test_data)
    print('Logging Info - Saved:', test_data_file)

    adj_entity_file = format_filename(PROCESSED_DATA_DIR, ADJ_ENTITY_TEMPLATE, dataset=dataset)
    np.save(adj_entity_file, adj_entity)
    print('Logging Info - Saved:', adj_entity_file)

    adj_relation_file = format_filename(PROCESSED_DATA_DIR, ADJ_RELATION_TEMPLATE, dataset=dataset)
    np.save(adj_relation_file, adj_relation)
    print('Logging Info - Saved:', adj_relation_file)
import pandas as pd

from param_config import config
from utils import pickle_load, pickle_dump

if __name__ == '__main__':
    print('Generating aisle features...')
    order_products_prior = pickle_load(config.order_products_prior_path)
    products = pickle_load(config.products_path)
    order_products_prior = pd.merge(order_products_prior, products,
                                    on='product_id', how='left')

    aisle_feat = pd.DataFrame()
    aisle_feat['aisle_order_num'] = order_products_prior.groupby('aisle_id').size()
    aisle_feat['aisle_reorder_num'] = order_products_prior.groupby('aisle_id')['reordered'].sum()
    aisle_feat['aisle_reorder_ratio'] = (aisle_feat['aisle_reorder_num'] /
                                         aisle_feat['aisle_order_num'])
    aisle_feat['aisle_average_add_to_cart_order'] = \
        order_products_prior.groupby('aisle_id')['add_to_cart_order'].mean()

    feats = [
        'aisle_order_num', 'aisle_reorder_num', 'aisle_reorder_ratio',
        'aisle_average_add_to_cart_order'
    ]
    pickle_dump(aisle_feat[feats], '{}/aisle_feat.pkl'.format(config.feat_folder))
    print('Done - aisle features')
    df = pd.merge(df, user_product_recent_feat,
                  left_on=['user_id', 'product_id'], right_index=True, how='left')
    df = pd.merge(df, user_product_dependent_feat,
                  left_on=['user_id', 'product_id'], right_index=True, how='left')
    df = pd.merge(df, user_aisle_feat,
                  left_on=['user_id', 'aisle_id'], right_index=True, how='left')
    df = pd.merge(df, user_department_feat,
                  left_on=['user_id', 'department_id'], right_index=True, how='left')
    return df


x_train_feat = merge_features(x_train)
x_test_feat = merge_features(x_test)

pickle_dump(x_train_feat, '{}/x_train_feat.pkl'.format(config.output_folder))
pickle_dump(x_test_feat, '{}/x_test_feat.pkl'.format(config.output_folder))
print('Done')
def process_predict(file_folder, word_cut_func, is_en, file_name='output.csv'): checkOS() # isFirstTime = True glove_vectors, glove_embed_dim = load_glove_format('./raw_data/glove.42B.300d.txt') config = Config() print('preprocessing: ', file_folder) # nlp = spacy.load("en_core_web_sm") # nlp.tokenizer = Tokenizer(nlp.vocab) train_data = pd.read_csv(os.path.join(file_folder, file_name), header=0, index_col=None) train_data['content'] = train_data['content'].astype(str) train_data['aspect'] = train_data['aspect'].astype(str) if isUnix: train_data['word_list'] = train_data['content'].parallel_apply(word_cut_func) train_data['char_list'] = train_data['content'].parallel_apply(lambda x: list(x)) train_data['aspect_word_list'] = train_data['aspect'].parallel_apply(word_cut_func) train_data['aspect_char_list'] = train_data['aspect'].parallel_apply(lambda x: list(x)) else: train_data['word_list'] = train_data['content'].apply(word_cut_func) train_data['char_list'] = train_data['content'].apply(lambda x: list(x)) train_data['aspect_word_list'] = train_data['aspect'].apply(word_cut_func) train_data['aspect_char_list'] = train_data['aspect'].apply(lambda x: list(x)) print('size of training set:', len(train_data)) word_corpus = train_data['word_list'].values.tolist() char_corpus = train_data['char_list'].values.tolist() aspect_corpus = train_data['aspect'].values.tolist() aspect_text_word_corpus = train_data['aspect_word_list'].values.tolist() aspect_text_char_corpus = train_data['aspect_char_list'].values.tolist() # build vocabulary print('building vocabulary...') word_vocab = build_vocabulary(word_corpus, start_id=1) char_vocab = build_vocabulary(char_corpus, start_id=1) aspect_vocab = build_vocabulary(aspect_corpus, start_id=0) aspect_text_word_vocab = build_vocabulary(aspect_text_word_corpus, start_id=1) aspect_text_char_vocab = build_vocabulary(aspect_text_char_corpus, start_id=1) pickle_dump(word_vocab, os.path.join(file_folder, 'word_vocab.pkl')) pickle_dump(char_vocab, os.path.join(file_folder, 'char_vocab.pkl')) pickle_dump(aspect_vocab, os.path.join(file_folder, 'aspect_vocab.pkl')) pickle_dump(aspect_text_word_vocab, os.path.join(file_folder, 'aspect_text_word_vocab.pkl')) pickle_dump(aspect_text_char_vocab, os.path.join(file_folder, 'aspect_text_char_vocab.pkl')) print('finished building vocabulary!') print('len of word vocabulary:', len(word_vocab)) print('sample of word vocabulary:', list(word_vocab.items())[:10]) print('len of char vocabulary:', len(char_vocab)) print('sample of char vocabulary:', list(char_vocab.items())[:10]) print('len of aspect vocabulary:', len(aspect_vocab)) print('sample of aspect vocabulary:', list(aspect_vocab.items())[:10]) print('len of aspect text word vocabulary:', len(aspect_text_word_vocab)) print('sample of aspect text word vocabulary:', list(aspect_text_word_vocab.items())[:10]) print('len of aspect text char vocabulary:', len(aspect_text_char_vocab)) print('sample of aspect text char vocabulary:', list(aspect_text_char_vocab.items())[:10]) # prepare embedding print('preparing embedding...') word_w2v = build_embedding(word_corpus, word_vocab, config.word_embed_dim) aspect_word_w2v = build_aspect_embedding(aspect_vocab, word_cut_func, word_vocab, word_w2v) aspect_text_word_w2v = build_aspect_text_embedding(aspect_text_word_vocab, word_vocab, word_w2v) char_w2v = build_embedding(char_corpus, char_vocab, config.word_embed_dim) aspect_char_w2v = build_aspect_embedding(aspect_vocab, lambda x: list(x), char_vocab, char_w2v) aspect_text_char_w2v = 
build_aspect_text_embedding(aspect_text_char_vocab, char_vocab, char_w2v) np.save(os.path.join(file_folder, 'word_w2v.npy'), word_w2v) np.save(os.path.join(file_folder, 'aspect_word_w2v.npy'), aspect_word_w2v) np.save(os.path.join(file_folder, 'aspect_text_word_w2v.npy'), aspect_text_word_w2v) np.save(os.path.join(file_folder, 'char_w2v.npy'), char_w2v) np.save(os.path.join(file_folder, 'aspect_char_w2v.npy'), aspect_char_w2v) np.save(os.path.join(file_folder, 'aspect_text_char_w2v.npy'), aspect_text_char_w2v) print('finished preparing embedding!') print('shape of word_w2v:', word_w2v.shape) print('sample of word_w2v:', word_w2v[:2, :5]) print('shape of char_w2v:', char_w2v.shape) print('sample of char_w2v:', char_w2v[:2, :5]) print('shape of aspect_word_w2v:', aspect_word_w2v.shape) print('sample of aspect_word_w2v:', aspect_word_w2v[:2, :5]) print('shape of aspect_char_w2v:', aspect_char_w2v.shape) print('sample of aspect_char_w2v:', aspect_char_w2v[:2, :5]) print('shape of aspect_text_word_w2v:', aspect_text_word_w2v.shape) print('sample of aspect_text_word_w2v:', aspect_text_word_w2v[:2, :5]) print('shape of aspect_text_char_w2v:', aspect_text_char_w2v.shape) print('sample of aspect_text_char_w2v:', aspect_text_char_w2v[:2, :5]) if is_en: word_glove = build_glove_embedding(word_vocab, glove_vectors, glove_embed_dim) aspect_word_glove = build_aspect_embedding(aspect_vocab, word_cut_func, word_vocab, word_glove) aspect_text_word_glove = build_aspect_text_embedding(aspect_text_word_vocab, word_vocab, word_glove) np.save(os.path.join(file_folder, 'word_glove.npy'), word_glove) np.save(os.path.join(file_folder, 'aspect_word_glove.npy'), aspect_word_glove) np.save(os.path.join(file_folder, 'aspect_text_word_glove.npy'), aspect_text_word_glove) print('shape of word_glove:', word_glove.shape) print('sample of word_glove:', word_glove[:2, :5]) print('shape of aspect_word_glove:', aspect_word_glove.shape) print('sample of aspect_word_glove:', aspect_word_glove[:2, :5]) print('shape of aspect_text_word_glove:', aspect_text_word_glove.shape) print('sample of aspect_text_word_glove:', aspect_text_word_glove[:2, :5]) # prepare input print('preparing text input...') if isUnix: train_word_input = train_data['word_list'].parallel_apply( lambda x: [word_vocab.get(word, len(word_vocab)+1) for word in x]).values.tolist() train_char_input = train_data['char_list'].parallel_apply( lambda x: [char_vocab.get(char, len(char_vocab)+1) for char in x]).values.tolist() else: train_word_input = train_data['word_list'].apply( lambda x: [word_vocab.get(word, len(word_vocab)+1) for word in x]).values.tolist() train_char_input = train_data['char_list'].apply( lambda x: [char_vocab.get(char, len(char_vocab)+1) for char in x]).values.tolist() pickle_dump(train_word_input, os.path.join(file_folder, 'train_word_input.pkl')) pickle_dump(train_char_input, os.path.join(file_folder, 'train_char_input.pkl')) print('finished preparing text input!') print('preparing aspect input...') if isUnix: train_aspect_input = train_data['aspect'].parallel_apply(lambda x: [aspect_vocab[x]]).values.tolist() else: train_aspect_input = train_data['aspect'].apply(lambda x: [aspect_vocab[x]]).values.tolist() pickle_dump(train_aspect_input, os.path.join(file_folder, 'train_aspect_input.pkl')) print('finished preparing aspect input!') print('preparing aspect text input...') if isUnix: train_aspect_text_word_input = train_data['aspect_word_list'].parallel_apply( lambda x: [aspect_text_word_vocab.get(word, len(aspect_text_word_vocab) + 1) for word in 
x]).values.tolist() train_aspect_text_char_input = train_data['aspect_char_list'].parallel_apply( lambda x: [aspect_text_char_vocab.get(char, len(aspect_text_char_vocab) + 1) for char in x]).values.tolist() else: train_aspect_text_word_input = train_data['aspect_word_list'].apply( lambda x: [aspect_text_word_vocab.get(word, len(aspect_text_word_vocab) + 1) for word in x]).values.tolist() train_aspect_text_char_input = train_data['aspect_char_list'].apply( lambda x: [aspect_text_char_vocab.get(char, len(aspect_text_char_vocab) + 1) for char in x]).values.tolist() pickle_dump(train_aspect_text_word_input, os.path.join(file_folder, 'train_word_aspect_input.pkl')) pickle_dump(train_aspect_text_char_input, os.path.join(file_folder, 'train_char_aspect_input.pkl')) print('finished preparing aspect text input!') if 'from' in train_data.columns: print('preparing left text input, right text input & position input...') train_word_input_l, train_word_input_r, train_word_input_r_with_pad, train_word_mask, train_word_pos_input, \ train_word_offset_input, train_char_input_l, train_char_input_r, train_char_input_r_with_pad, \ train_char_mask, train_char_pos_input, train_char_offset_input = split_text_and_get_loc_info(train_data, word_vocab, char_vocab, word_cut_func) pickle_dump(train_word_input_l, os.path.join(file_folder, 'train_word_input_l.pkl')) pickle_dump(train_word_input_r, os.path.join(file_folder, 'train_word_input_r.pkl')) pickle_dump(train_word_input_r_with_pad, os.path.join(file_folder, 'train_word_input_r_with_pad.pkl')) pickle_dump(train_word_mask, os.path.join(file_folder, 'train_word_mask.pkl')) pickle_dump(train_word_pos_input, os.path.join(file_folder, 'train_word_pos_input.pkl')) pickle_dump(train_word_offset_input, os.path.join(file_folder, 'train_word_offset_input.pkl')) pickle_dump(train_char_input_l, os.path.join(file_folder, 'train_char_input_l.pkl')) pickle_dump(train_char_input_r, os.path.join(file_folder, 'train_char_input_r.pkl')) pickle_dump(train_char_input_r_with_pad, os.path.join(file_folder, 'train_char_input_r_with_pad.pkl')) pickle_dump(train_char_mask, os.path.join(file_folder, 'train_char_mask.pkl')) pickle_dump(train_char_pos_input, os.path.join(file_folder, 'train_char_pos_input.pkl')) pickle_dump(train_char_offset_input, os.path.join(file_folder, 'train_char_offset_input.pkl')) # prepare output print('preparing output....') pickle_dump(train_data['sentiment'].values.tolist(), os.path.join(file_folder, 'train_label.pkl')) print('finished preparing output!')
def pre_process(file_folder, word_cut_func, is_en): checkOS() print('preprocessing: ', file_folder) train_data = pd.read_csv(os.path.join(file_folder, 'train.csv'), header=0, index_col=None) train_data['content'] = train_data['content'].astype(str) train_data['aspect'] = train_data['aspect'].astype(str) print("checking for null obj",train_data['content'].isnull().sum()) print("checking for null obj",train_data['aspect'].isnull().sum()) if isUnix: train_data['word_list'] = train_data['content'].parallel_apply(word_cut_func) train_data['char_list'] = train_data['content'].parallel_apply(lambda x: list(x)) train_data['aspect_word_list'] = train_data['aspect'].parallel_apply(word_cut_func) train_data['aspect_char_list'] = train_data['aspect'].parallel_apply(lambda x: list(x)) else: train_data['word_list'] = train_data['content'].apply(word_cut_func) train_data['char_list'] = train_data['content'].apply(lambda x: list(x)) train_data['aspect_word_list'] = train_data['aspect'].apply(word_cut_func) train_data['aspect_char_list'] = train_data['aspect'].apply(lambda x: list(x)) valid_data = pd.read_csv(os.path.join(file_folder, 'valid.csv'), header=0, index_col=None) valid_data['content'] = valid_data['content'].astype(str) valid_data['aspect'] = valid_data['aspect'].astype(str) if isUnix: valid_data['word_list'] = valid_data['content'].parallel_apply(word_cut_func) valid_data['char_list'] = valid_data['content'].parallel_apply(lambda x: list(x)) valid_data['aspect_word_list'] = valid_data['aspect'].parallel_apply(word_cut_func) valid_data['aspect_char_list'] = valid_data['aspect'].parallel_apply(lambda x: list(x)) else: valid_data['word_list'] = valid_data['content'].apply(word_cut_func) valid_data['char_list'] = valid_data['content'].apply(lambda x: list(x)) valid_data['aspect_word_list'] = valid_data['aspect'].apply(word_cut_func) valid_data['aspect_char_list'] = valid_data['aspect'].apply(lambda x: list(x)) test_data = pd.read_csv(os.path.join(file_folder, 'test.csv'), header=0, index_col=None) test_data['content'] = test_data['content'].astype(str) test_data['aspect'] = test_data['aspect'].astype(str) if isUnix: test_data['word_list'] = test_data['content'].parallel_apply(word_cut_func) test_data['char_list'] = test_data['content'].parallel_apply(lambda x: list(x)) test_data['aspect_word_list'] = test_data['aspect'].parallel_apply(word_cut_func) test_data['aspect_char_list'] = test_data['aspect'].parallel_apply(lambda x: list(x)) else: test_data['word_list'] = test_data['content'].apply(word_cut_func) test_data['char_list'] = test_data['content'].apply(lambda x: list(x)) test_data['aspect_word_list'] = test_data['aspect'].apply(word_cut_func) test_data['aspect_char_list'] = test_data['aspect'].apply(lambda x: list(x)) print('size of training set:', len(train_data)) print('size of valid set:', len(valid_data)) print('size of test set:', len(test_data)) word_corpus = np.concatenate((train_data['word_list'].values, valid_data['word_list'].values, test_data['word_list'].values)).tolist() char_corpus = np.concatenate((train_data['char_list'].values, valid_data['char_list'].values, test_data['char_list'].values)).tolist() aspect_corpus = np.concatenate((train_data['aspect'].values, valid_data['aspect'].values, test_data['aspect'].values)).tolist() aspect_text_word_corpus = np.concatenate((train_data['aspect_word_list'].values, valid_data['aspect_word_list'].values, test_data['aspect_word_list'].values)).tolist() aspect_text_char_corpus = np.concatenate((train_data['aspect_char_list'].values, 
valid_data['aspect_char_list'].values, test_data['aspect_char_list'].values)).tolist() # build vocabulary print('building vocabulary...') word_vocab = build_vocabulary(word_corpus, start_id=1) char_vocab = build_vocabulary(char_corpus, start_id=1) aspect_vocab = build_vocabulary(aspect_corpus, start_id=0) aspect_text_word_vocab = build_vocabulary(aspect_text_word_corpus, start_id=1) aspect_text_char_vocab = build_vocabulary(aspect_text_char_corpus, start_id=1) pickle_dump(word_vocab, os.path.join(file_folder, 'word_vocab.pkl')) pickle_dump(char_vocab, os.path.join(file_folder, 'char_vocab.pkl')) pickle_dump(aspect_vocab, os.path.join(file_folder, 'aspect_vocab.pkl')) pickle_dump(aspect_text_word_vocab, os.path.join(file_folder, 'aspect_text_word_vocab.pkl')) pickle_dump(aspect_text_char_vocab, os.path.join(file_folder, 'aspect_text_char_vocab.pkl')) print('finished building vocabulary!') print('len of word vocabulary:', len(word_vocab)) print('sample of word vocabulary:', list(word_vocab.items())[:10]) print('len of char vocabulary:', len(char_vocab)) print('sample of char vocabulary:', list(char_vocab.items())[:10]) print('len of aspect vocabulary:', len(aspect_vocab)) print('sample of aspect vocabulary:', list(aspect_vocab.items())[:10]) print('len of aspect text word vocabulary:', len(aspect_text_word_vocab)) print('sample of aspect text word vocabulary:', list(aspect_text_word_vocab.items())[:10]) print('len of aspect text char vocabulary:', len(aspect_text_char_vocab)) print('sample of aspect text char vocabulary:', list(aspect_text_char_vocab.items())[:10]) # prepare embedding print('preparing embedding...') word_w2v = build_embedding(word_corpus, word_vocab, config.word_embed_dim) aspect_word_w2v = build_aspect_embedding(aspect_vocab, word_cut_func, word_vocab, word_w2v) aspect_text_word_w2v = build_aspect_text_embedding(aspect_text_word_vocab, word_vocab, word_w2v) char_w2v = build_embedding(char_corpus, char_vocab, config.word_embed_dim) aspect_char_w2v = build_aspect_embedding(aspect_vocab, lambda x: list(x), char_vocab, char_w2v) aspect_text_char_w2v = build_aspect_text_embedding(aspect_text_char_vocab, char_vocab, char_w2v) np.save(os.path.join(file_folder, 'word_w2v.npy'), word_w2v) np.save(os.path.join(file_folder, 'aspect_word_w2v.npy'), aspect_word_w2v) np.save(os.path.join(file_folder, 'aspect_text_word_w2v.npy'), aspect_text_word_w2v) np.save(os.path.join(file_folder, 'char_w2v.npy'), char_w2v) np.save(os.path.join(file_folder, 'aspect_char_w2v.npy'), aspect_char_w2v) np.save(os.path.join(file_folder, 'aspect_text_char_w2v.npy'), aspect_text_char_w2v) print('finished preparing embedding!') print('shape of word_w2v:', word_w2v.shape) print('sample of word_w2v:', word_w2v[:2, :5]) print('shape of char_w2v:', char_w2v.shape) print('sample of char_w2v:', char_w2v[:2, :5]) print('shape of aspect_word_w2v:', aspect_word_w2v.shape) print('sample of aspect_word_w2v:', aspect_word_w2v[:2, :5]) print('shape of aspect_char_w2v:', aspect_char_w2v.shape) print('sample of aspect_char_w2v:', aspect_char_w2v[:2, :5]) print('shape of aspect_text_word_w2v:', aspect_text_word_w2v.shape) print('sample of aspect_text_word_w2v:', aspect_text_word_w2v[:2, :5]) print('shape of aspect_text_char_w2v:', aspect_text_char_w2v.shape) print('sample of aspect_text_char_w2v:', aspect_text_char_w2v[:2, :5]) if is_en: word_glove = build_glove_embedding(word_vocab, glove_vectors, glove_embed_dim) aspect_word_glove = build_aspect_embedding(aspect_vocab, word_cut_func, word_vocab, word_glove) 
aspect_text_word_glove = build_aspect_text_embedding(aspect_text_word_vocab, word_vocab, word_glove) np.save(os.path.join(file_folder, 'word_glove.npy'), word_glove) np.save(os.path.join(file_folder, 'aspect_word_glove.npy'), aspect_word_glove) np.save(os.path.join(file_folder, 'aspect_text_word_glove.npy'), aspect_text_word_glove) print('shape of word_glove:', word_glove.shape) print('sample of word_glove:', word_glove[:2, :5]) print('shape of aspect_word_glove:', aspect_word_glove.shape) print('sample of aspect_word_glove:', aspect_word_glove[:2, :5]) print('shape of aspect_text_word_glove:', aspect_text_word_glove.shape) print('sample of aspect_text_word_glove:', aspect_text_word_glove[:2, :5]) # prepare input print('preparing text input...') if isUnix: train_word_input = train_data['word_list'].parallel_apply( lambda x: [word_vocab.get(word, len(word_vocab)+1) for word in x]).values.tolist() train_char_input = train_data['char_list'].parallel_apply( lambda x: [char_vocab.get(char, len(char_vocab)+1) for char in x]).values.tolist() valid_word_input = valid_data['word_list'].parallel_apply( lambda x: [word_vocab.get(word, len(word_vocab)+1) for word in x]).values.tolist() valid_char_input = valid_data['char_list'].parallel_apply( lambda x: [char_vocab.get(char, len(char_vocab)+1) for char in x]).values.tolist() test_word_input = test_data['word_list'].parallel_apply( lambda x: [word_vocab.get(word, len(word_vocab)+1) for word in x]).values.tolist() test_char_input = test_data['char_list'].parallel_apply( lambda x: [char_vocab.get(char, len(char_vocab)+1) for char in x]).values.tolist() else: train_word_input = train_data['word_list'].apply( lambda x: [word_vocab.get(word, len(word_vocab)+1) for word in x]).values.tolist() train_char_input = train_data['char_list'].apply( lambda x: [char_vocab.get(char, len(char_vocab)+1) for char in x]).values.tolist() valid_word_input = valid_data['word_list'].apply( lambda x: [word_vocab.get(word, len(word_vocab)+1) for word in x]).values.tolist() valid_char_input = valid_data['char_list'].apply( lambda x: [char_vocab.get(char, len(char_vocab)+1) for char in x]).values.tolist() test_word_input = test_data['word_list'].apply( lambda x: [word_vocab.get(word, len(word_vocab)+1) for word in x]).values.tolist() test_char_input = test_data['char_list'].apply( lambda x: [char_vocab.get(char, len(char_vocab)+1) for char in x]).values.tolist() pickle_dump(train_word_input, os.path.join(file_folder, 'train_word_input.pkl')) pickle_dump(train_char_input, os.path.join(file_folder, 'train_char_input.pkl')) pickle_dump(valid_word_input, os.path.join(file_folder, 'valid_word_input.pkl')) pickle_dump(valid_char_input, os.path.join(file_folder, 'valid_char_input.pkl')) pickle_dump(test_word_input, os.path.join(file_folder, 'test_word_input.pkl')) pickle_dump(test_char_input, os.path.join(file_folder, 'test_char_input.pkl')) print('finished preparing text input!') print('length analysis of text word input:') analyze_len_distribution(train_word_input, valid_word_input, test_word_input) print('length analysis of text char input') analyze_len_distribution(train_char_input, valid_char_input, test_char_input) print('preparing aspect input...') if isUnix: train_aspect_input = train_data['aspect'].parallel_apply(lambda x: [aspect_vocab[x]]).values.tolist() valid_aspect_input = valid_data['aspect'].parallel_apply(lambda x: [aspect_vocab[x]]).values.tolist() test_aspect_input = test_data['aspect'].parallel_apply(lambda x: [aspect_vocab[x]]).values.tolist() else: 
train_aspect_input = train_data['aspect'].apply(lambda x: [aspect_vocab[x]]).values.tolist() valid_aspect_input = valid_data['aspect'].apply(lambda x: [aspect_vocab[x]]).values.tolist() test_aspect_input = test_data['aspect'].apply(lambda x: [aspect_vocab[x]]).values.tolist() pickle_dump(train_aspect_input, os.path.join(file_folder, 'train_aspect_input.pkl')) pickle_dump(valid_aspect_input, os.path.join(file_folder, 'valid_aspect_input.pkl')) pickle_dump(test_aspect_input, os.path.join(file_folder, 'test_aspect_input.pkl')) print('finished preparing aspect input!') print('preparing aspect text input...') if isUnix: train_aspect_text_word_input = train_data['aspect_word_list'].parallel_apply( lambda x: [aspect_text_word_vocab.get(word, len(aspect_text_word_vocab) + 1) for word in x]).values.tolist() train_aspect_text_char_input = train_data['aspect_char_list'].parallel_apply( lambda x: [aspect_text_char_vocab.get(char, len(aspect_text_char_vocab) + 1) for char in x]).values.tolist() valid_aspect_text_word_input = valid_data['aspect_word_list'].parallel_apply( lambda x: [aspect_text_word_vocab.get(word, len(aspect_text_word_vocab) + 1) for word in x]).values.tolist() valid_aspect_text_char_input = valid_data['aspect_char_list'].parallel_apply( lambda x: [aspect_text_char_vocab.get(char, len(aspect_text_char_vocab) + 1) for char in x]).values.tolist() test_aspect_text_word_input = test_data['aspect_word_list'].parallel_apply( lambda x: [aspect_text_word_vocab.get(word, len(aspect_text_word_vocab) + 1) for word in x]).values.tolist() test_aspect_text_char_input = test_data['aspect_char_list'].parallel_apply( lambda x: [aspect_text_char_vocab.get(char, len(aspect_text_char_vocab) + 1) for char in x]).values.tolist() else: train_aspect_text_word_input = train_data['aspect_word_list'].apply( lambda x: [aspect_text_word_vocab.get(word, len(aspect_text_word_vocab) + 1) for word in x]).values.tolist() train_aspect_text_char_input = train_data['aspect_char_list'].apply( lambda x: [aspect_text_char_vocab.get(char, len(aspect_text_char_vocab) + 1) for char in x]).values.tolist() valid_aspect_text_word_input = valid_data['aspect_word_list'].apply( lambda x: [aspect_text_word_vocab.get(word, len(aspect_text_word_vocab) + 1) for word in x]).values.tolist() valid_aspect_text_char_input = valid_data['aspect_char_list'].apply( lambda x: [aspect_text_char_vocab.get(char, len(aspect_text_char_vocab) + 1) for char in x]).values.tolist() test_aspect_text_word_input = test_data['aspect_word_list'].apply( lambda x: [aspect_text_word_vocab.get(word, len(aspect_text_word_vocab) + 1) for word in x]).values.tolist() test_aspect_text_char_input = test_data['aspect_char_list'].apply( lambda x: [aspect_text_char_vocab.get(char, len(aspect_text_char_vocab) + 1) for char in x]).values.tolist() pickle_dump(train_aspect_text_word_input, os.path.join(file_folder, 'train_word_aspect_input.pkl')) pickle_dump(train_aspect_text_char_input, os.path.join(file_folder, 'train_char_aspect_input.pkl')) pickle_dump(valid_aspect_text_word_input, os.path.join(file_folder, 'valid_word_aspect_input.pkl')) pickle_dump(valid_aspect_text_char_input, os.path.join(file_folder, 'valid_char_aspect_input.pkl')) pickle_dump(test_aspect_text_word_input, os.path.join(file_folder, 'test_word_aspect_input.pkl')) pickle_dump(test_aspect_text_char_input, os.path.join(file_folder, 'test_char_aspect_input.pkl')) print('finished preparing aspect text input!') print('length analysis of aspect text word input:') 
analyze_len_distribution(train_aspect_text_word_input, valid_aspect_text_word_input, test_aspect_text_word_input) print('length analysis of aspect text char input') analyze_len_distribution(train_aspect_text_char_input, valid_aspect_text_char_input, test_aspect_text_char_input) if 'from' in train_data.columns: print('preparing left text input, right text input & position input...') train_word_input_l, train_word_input_r, train_word_input_r_with_pad, train_word_mask, train_word_pos_input, \ train_word_offset_input, train_char_input_l, train_char_input_r, train_char_input_r_with_pad, \ train_char_mask, train_char_pos_input, train_char_offset_input = split_text_and_get_loc_info(train_data, word_vocab, char_vocab, word_cut_func) pickle_dump(train_word_input_l, os.path.join(file_folder, 'train_word_input_l.pkl')) pickle_dump(train_word_input_r, os.path.join(file_folder, 'train_word_input_r.pkl')) pickle_dump(train_word_input_r_with_pad, os.path.join(file_folder, 'train_word_input_r_with_pad.pkl')) pickle_dump(train_word_mask, os.path.join(file_folder, 'train_word_mask.pkl')) pickle_dump(train_word_pos_input, os.path.join(file_folder, 'train_word_pos_input.pkl')) pickle_dump(train_word_offset_input, os.path.join(file_folder, 'train_word_offset_input.pkl')) pickle_dump(train_char_input_l, os.path.join(file_folder, 'train_char_input_l.pkl')) pickle_dump(train_char_input_r, os.path.join(file_folder, 'train_char_input_r.pkl')) pickle_dump(train_char_input_r_with_pad, os.path.join(file_folder, 'train_char_input_r_with_pad.pkl')) pickle_dump(train_char_mask, os.path.join(file_folder, 'train_char_mask.pkl')) pickle_dump(train_char_pos_input, os.path.join(file_folder, 'train_char_pos_input.pkl')) pickle_dump(train_char_offset_input, os.path.join(file_folder, 'train_char_offset_input.pkl')) valid_word_input_l, valid_word_input_r, valid_word_input_r_with_pad, valid_word_mask, valid_word_pos_input, \ valid_word_offset_input, valid_char_input_l, valid_char_input_r, valid_char_input_r_with_pad, \ valid_char_mask, valid_char_pos_input, valid_char_offset_input = split_text_and_get_loc_info(valid_data, word_vocab, char_vocab, word_cut_func) pickle_dump(valid_word_input_l, os.path.join(file_folder, 'valid_word_input_l.pkl')) pickle_dump(valid_word_input_r, os.path.join(file_folder, 'valid_word_input_r.pkl')) pickle_dump(valid_word_input_r_with_pad, os.path.join(file_folder, 'valid_word_input_r_with_pad.pkl')) pickle_dump(valid_word_mask, os.path.join(file_folder, 'valid_word_mask.pkl')) pickle_dump(valid_word_pos_input, os.path.join(file_folder, 'valid_word_pos_input.pkl')) pickle_dump(valid_word_offset_input, os.path.join(file_folder, 'valid_word_offset_input.pkl')) pickle_dump(valid_char_input_l, os.path.join(file_folder, 'valid_char_input_l.pkl')) pickle_dump(valid_char_input_r, os.path.join(file_folder, 'valid_char_input_r.pkl')) pickle_dump(valid_char_input_r_with_pad, os.path.join(file_folder, 'valid_char_input_r_with_pad.pkl')) pickle_dump(valid_char_mask, os.path.join(file_folder, 'valid_char_mask.pkl')) pickle_dump(valid_char_pos_input, os.path.join(file_folder, 'valid_char_pos_input.pkl')) pickle_dump(valid_char_offset_input, os.path.join(file_folder, 'valid_char_offset_input.pkl')) test_word_input_l, test_word_input_r, test_word_input_r_with_pad, test_word_mask, test_word_pos_input, \ test_word_offset_input, test_char_input_l, test_char_input_r, test_char_input_r_with_pad, test_char_mask, \ test_char_pos_input, test_char_offset_input = split_text_and_get_loc_info(test_data, word_vocab, char_vocab, 
word_cut_func) pickle_dump(test_word_input_l, os.path.join(file_folder, 'test_word_input_l.pkl')) pickle_dump(test_word_input_r, os.path.join(file_folder, 'test_word_input_r.pkl')) pickle_dump(test_word_input_r_with_pad, os.path.join(file_folder, 'test_word_input_r_with_pad.pkl')) pickle_dump(test_word_mask, os.path.join(file_folder, 'test_word_mask.pkl')) pickle_dump(test_word_pos_input, os.path.join(file_folder, 'test_word_pos_input.pkl')) pickle_dump(test_word_offset_input, os.path.join(file_folder, 'test_word_offset_input.pkl')) print("Test Word Output") pickle_dump(test_char_input_l, os.path.join(file_folder, 'test_char_input_l.pkl')) pickle_dump(test_char_input_r, os.path.join(file_folder, 'test_char_input_r.pkl')) pickle_dump(test_char_input_r_with_pad, os.path.join(file_folder, 'test_char_input_r_with_pad.pkl')) pickle_dump(test_char_mask, os.path.join(file_folder, 'test_char_mask.pkl')) pickle_dump(test_char_pos_input, os.path.join(file_folder, 'test_char_pos_input.pkl')) pickle_dump(test_char_offset_input, os.path.join(file_folder, 'test_char_offset_input.pkl')) print('length analysis of left text word input:') analyze_len_distribution(train_word_input_l, valid_word_input_l, test_word_input_l) print('length analysis of left text char input') analyze_len_distribution(train_char_input_l, valid_char_input_l, test_char_input_l) print('length analysis of right text word input:') analyze_len_distribution(train_word_input_r, valid_word_input_r, test_word_input_r) print('length analysis of right text char input') analyze_len_distribution(train_char_input_r, valid_char_input_r, test_char_input_r) # prepare output print('preparing output....') pickle_dump(train_data['sentiment'].values.tolist(), os.path.join(file_folder, 'train_label.pkl')) pickle_dump(valid_data['sentiment'].values.tolist(), os.path.join(file_folder, 'valid_label.pkl')) if 'sentiment' in test_data.columns: pickle_dump(test_data['sentiment'].values.tolist(), os.path.join(file_folder, 'test_label.pkl')) print('finished preparing output!') print('class analysis of training set:') analyze_class_distribution(train_data['sentiment'].values.tolist()) print('class analysis of valid set:') analyze_class_distribution(valid_data['sentiment'].values.tolist()) if 'sentiment' in test_data.columns: print('class analysis of test set:') analyze_class_distribution(valid_data['sentiment'].values.tolist())
orders = pickle_load(config.orders_path)
train_orders = orders[orders.eval_set == 'train'][['order_id', 'user_id']].copy()
test_orders = orders[orders.eval_set == 'test'][['order_id', 'user_id']].copy()

user_product_pair = order_products_prior[['user_id', 'product_id']].drop_duplicates()
train_df = pd.merge(train_orders, user_product_pair, on='user_id')
test_df = pd.merge(test_orders, user_product_pair, on='user_id')

order_products_train = order_products_train[['order_id', 'product_id', 'reordered']]
train_df = pd.merge(train_df, order_products_train,
                    on=['order_id', 'product_id'], how='left')
train_df['reordered'] = train_df['reordered'].fillna(0).astype(int)

x_train = train_df[['order_id', 'user_id', 'product_id']]
y_train = train_df['reordered']
x_test = test_df

pickle_dump(x_train, '{}/x_train.pkl'.format(config.output_folder))
pickle_dump(y_train, '{}/y_train.pkl'.format(config.output_folder))
pickle_dump(x_test, '{}/x_test.pkl'.format(config.output_folder))
print('Done - dataset construction')
import sys

import numpy as np
import pandas as pd

sys.path.append('../')
from param_config import config
from utils import pickle_load, pickle_dump

if __name__ == '__main__':
    print('Generating user_department features...')
    order_products_prior = pickle_load(config.order_products_prior_path)
    products = pickle_load(config.products_path)
    order_products_prior = pd.merge(order_products_prior, products,
                                    on='product_id', how='left')

    ud_feat = pd.DataFrame()
    ud_feat['ud_first_order'] = order_products_prior.groupby(
        ['user_id', 'department_id'])['order_number_before_last_order'].max()
    ud_feat['ud_last_order'] = order_products_prior.groupby(
        ['user_id', 'department_id'])['order_number_before_last_order'].min()
    ud_feat['ud_distinct_order_num'] = order_products_prior.groupby(
        ['user_id', 'department_id'])['order_id'].nunique()
    ud_feat['ud_distinct_product_num'] = order_products_prior.groupby(
        ['user_id', 'department_id'])['product_id'].nunique()

    feats = ['ud_first_order', 'ud_last_order', 'ud_distinct_order_num',
             'ud_distinct_product_num']
    pickle_dump(ud_feat[feats], '{}/user_department_feat.pkl'.format(config.feat_folder))
    print('Done - user_department features')
order_feat['order_days_since_prior_order_diff'] = \
    (orders[orders.eval_set != 'prior'].set_index('user_id')['order_days_since_prior_order'] -
     orders[orders.eval_set == 'prior'].groupby('user_id')['order_days_since_prior_order'].mean()).values

recent_orders = orders.groupby('user_id').tail(2)
df = pd.DataFrame()
df['order_delta_day_diff'] = np.abs(
    recent_orders.groupby('user_id')['order_dow'].apply(np.diff).apply(
        lambda x: x[0])).map(lambda x: min(x, 7 - x))
df['order_delta_hour_diff'] = np.abs(
    recent_orders.groupby('user_id')['order_hour_of_day'].apply(np.diff).apply(
        lambda x: x[0])).map(lambda x: min(x, 24 - x))
order_feat = pd.merge(order_feat, df, left_on='user_id', right_index=True)

# Fill NaN: for 'order_days_since_prior_order_ratio', NaN means the numerator and the
# denominator are both zero, so fill with 1
order_feat['order_days_since_prior_order_ratio'].fillna(1, inplace=True)

# Generate the feature based on order_number
order_products_prior = pickle_load(config.order_products_prior_path)
order_number_reorder_ratio = order_products_prior.groupby('order_number')['reordered'].mean().to_frame()
order_number_reorder_ratio.columns = ['order_number_reorder_ratio']
order_feat = pd.merge(order_feat, order_number_reorder_ratio,
                      left_on='order_number', right_index=True, how='left')

order_feat.set_index('order_id', inplace=True)
feats = ['order_dow', 'order_hour_of_day', 'order_days_since_prior_order',
         'order_weekend', 'order_hour_of_day_bin_id',
         'order_days_since_prior_order_ratio', 'order_days_since_prior_order_diff',
         'order_number_reorder_ratio', 'order_delta_day_diff', 'order_delta_hour_diff']
pickle_dump(order_feat[feats], '{}/order_feat.pkl'.format(config.feat_folder))
print('Done - order features')
up_feat['up_first_order_proportion'] = up_feat['up_first_order'] / up_feat['user_order_num']
up_feat['up_last_order_proportion'] = up_feat['up_last_order'] / up_feat['user_order_num']
up_feat['up_average_order_proportion'] = up_feat['up_average_order'] / up_feat['user_order_num']
up_feat['up_last_order_proportion_ratio'] = (up_feat['up_last_order_proportion'] /
                                             up_feat['product_average_order_distance'])

# features based on order_days_before_last_order and other features
up_feat['up_first_order_days_proportion'] = up_feat['up_first_order_days'] / up_feat['user_order_days']
up_feat['up_last_order_days_proportion'] = up_feat['up_last_order_days'] / up_feat['user_order_days']
up_feat['up_average_order_days_proportion'] = up_feat['up_average_order_days'] / up_feat['user_order_days']
up_feat['up_last_order_days_proportion_ratio'] = \
    up_feat['up_last_order_days_proportion'] / up_feat['up_average_order_days_distance']

feats = [
    'up_order_num_ratio', 'up_order_num_proportion',
    'up_average_add_to_cart_order_ratio', 'up_first_order_proportion',
    'up_last_order_proportion', 'up_average_order_proportion',
    'up_last_order_proportion_ratio', 'up_first_order_days_proportion',
    'up_last_order_days_proportion', 'up_average_order_days_proportion',
    'up_last_order_days_proportion_ratio'
]
pickle_dump(up_feat[feats],
            '{}/user_product_dependent_feat.pkl'.format(config.feat_folder))
print('Done - user_product features based on other features')
def transaction_stats(directory='.'):
    stats = {'transaction_count': {}, 'transaction_query_count': {},
             'transaction_read_count': {}, 'transaction_write_count': {}}
    transactions = []
    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue
        project_type_name = repo.project_type.name
        if project_type_name not in stats['transaction_count']:
            stats['transaction_count'][project_type_name] = []
        if project_type_name not in stats['transaction_query_count']:
            stats['transaction_query_count'][project_type_name] = []
        if project_type_name not in stats['transaction_read_count']:
            stats['transaction_read_count'][project_type_name] = []
        if project_type_name not in stats['transaction_write_count']:
            stats['transaction_write_count'][project_type_name] = []
        for action in Action.objects.filter(attempt=repo.latest_successful_attempt):
            transaction = ''
            query_count = 0
            transaction_count = 0
            for query in Query.objects.filter(action=action):
                content_upper = query.content.upper()
                if ('BEGIN' in content_upper or 'START TRANSACTION' in content_upper
                        or 'SET AUTOCOMMIT=0' in content_upper):
                    transaction = query.content + '\n'
                    query_count = 1
                elif transaction != '':
                    transaction += query.content + '\n'
                    query_count += 1
                    if 'COMMIT' in content_upper:
                        transaction = transaction.strip('\n')
                        # count the completed transaction
                        transaction_count += 1
                        # for each transaction, count the reads and writes
                        read_count = len(re.findall('SELECT', transaction.upper()))
                        stats['transaction_read_count'][project_type_name].append(read_count)
                        write_count = 0
                        for keyword in ['INSERT', 'DELETE', 'UPDATE']:
                            write_count += len(re.findall(keyword, transaction.upper()))
                        stats['transaction_write_count'][project_type_name].append(write_count)
                        # for each transaction, count the queries (excluding BEGIN/COMMIT)
                        query_count -= 2
                        stats['transaction_query_count'][project_type_name].append(query_count)
                        try:
                            transactions.append((repo.name, repo.project_type.name, transaction))
                        except Exception:
                            pass
                        transaction = ''
            if transaction_count > 0:
                stats['transaction_count'][project_type_name].append(transaction_count)
    pickle_dump(directory, 'transactions', transactions)
    dump_all_stats(directory, stats)
def run_main(height_preset, ds, qu, neigh, opt):
    if height_preset == 1:
        n_clusters_l = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768]  #, 16384, 32768, 60000] #65536]
        n_clusters_l = [1 << 16]
        n_clusters_l = [16, 256]  #[16]
        n_clusters_l = [16]
        #n_clusters_l = [1<<8]
    elif height_preset == 2:
        n_clusters_l = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]  #2
        n_clusters_l = [16, 256]  #[16]
        n_clusters_l = [256]
    elif height_preset == 3:
        n_clusters_l = [2, 4, 8, 16, 32, 64]
        n_clusters_l = [2]
    elif height_preset in range(11):
        n_clusters_l = [2]
    else:
        raise Exception('No n_clusters for height {}'.format(height_preset))
    print('HEIGHT: {} n_clusters: {}'.format(height_preset, n_clusters_l))
    #if height_preset != 1 and opt.itq:
    #    raise Exception('Height must be 1 if using ITQ')
    force_height = True
    k = opt.k
    n_repeat = opt.n_repeat_km
    n_repeat = 1
    neigh = neigh[:, 0:k]
    ht2cutsz = defaultdict(list)
    #acc_mx = np.zeros((len(n_clusters_l), len(n_bins_l)))
    #probe_mx = np.zeros((len(n_clusters_l), len(n_bins_l)))
    n_clusters_l_len = len(n_clusters_l)
    acc_mx = [[] for i in range(n_clusters_l_len)]
    probe_mx = [[] for i in range(n_clusters_l_len)]
    probe95_mx = [[] for i in range(n_clusters_l_len)]
    max_bin_count = 0
    start_time = time.time()
    serial_data = {}
    serial_data['k'] = k
    if opt.pca or opt.rp or opt.itq or opt.st:
        # only 1-bin probe makes sense in these settings
        opt.max_bin_count = 1
    for i, n_clusters in enumerate(n_clusters_l):
        if force_height:
            height = height_preset
            serial_data['height'] = height
        else:
            height = math.floor(math.log(len(ds), n_clusters))
        bin_count = 40  #1
        acc = 0
        probe = 0
        #if opt.itq or opt.pca or opt.rp:
        #    # only 1-bin probe makes sense in these settings
        #    opt.max_bin_count = 1
        # keep expanding the number of bins until acc reaches e.g. 0.97
        while acc < opt.acc_thresh and bin_count <= min(n_clusters, opt.max_bin_count):
            acc = 0
            probe = 0
            probe95 = 0
            for l in range(n_repeat):
                cur_acc, cur_probe, cur_probe95 = run_kmeans(ds, qu, neigh, bin_count,
                                                             n_clusters, height, ht2cutsz, opt)
                acc += cur_acc
                probe += cur_probe
                probe95 += cur_probe95
            acc /= n_repeat
            probe /= n_repeat
            probe95 /= n_repeat
            #bin_count += 1
            bin_count += 1
            acc_mx[i].append(acc)
            probe_mx[i].append(probe)
            probe95_mx[i].append(probe95)
        max_bin_count = max(max_bin_count, bin_count - 1)
    end_time = time.time()
    serial_data['acc_mx'] = acc_mx
    serial_data['probe_mx'] = probe_mx
    serial_data['max_loyd'] = max_loyd
    serial_data['km_method'] = km_method
    serial_data['ht2cutsz'] = ht2cutsz
    print_output = True
    if print_output:
        print('total computation time: {} hrs'.format((end_time - start_time) / 3600))
        print('acc {}'.format(acc_mx))
        print('probe count {}'.format(probe_mx))
        print('ht2cutsz {}'.format(ht2cutsz))
    row_label = ['{} clusters'.format(i) for i in n_clusters_l]
    col_label = ['{} bins'.format(i + 1) for i in range(max_bin_count)]
    acc_mx0 = acc_mx
    probe_mx0 = probe_mx
    probe95_mx0 = probe95_mx
    acc_mx = np.zeros((n_clusters_l_len, max_bin_count))
    probe_mx = np.zeros((n_clusters_l_len, max_bin_count))
    probe95_mx = np.zeros((n_clusters_l_len, max_bin_count))
    for i in range(len(n_clusters_l)):
        for j in range(len(acc_mx0[i])):
            acc_mx[i][j] = acc_mx0[i][j]
            probe_mx[i][j] = probe_mx0[i][j]
            probe95_mx[i][j] = probe95_mx0[i][j]
    #[acc_mx[i][j] = acc_mx0[i][j] for j in range(len(acc_mx0[i])) for i in range(len(n_clusters_l))]
    #[probe_mx[i][j] = probe_mx0[i][j] for j in range(len(probe_mx0[i])) for i in range(len(n_clusters_l))]
    acc_md = utils.mxs2md([np.around(acc_mx, 3), np.rint(probe_mx), np.rint(probe95_mx)],
                          row_label, col_label)
    cur_method = 'k-means'
    if opt.pca:
        cur_method = 'PCA Tree'
    elif opt.st:
        cur_method = 'ST'
    elif opt.itq:
        cur_method = 'ITQ'
    elif opt.rp:
        cur_method = 'Random Projection'
    elif opt.cplsh:
        cur_method = 'Cross Polytope LSH'
    if opt.write_res:  #False
        if opt.glove:
            res_path = os.path.join('results', 'linear2_glove.md')
        elif opt.glove_c:
            res_path = os.path.join('results', 'linear2_glove_c.md')
        elif opt.sift:
            res_path = os.path.join('results', 'linear2_sift.md')
        elif opt.sift_c:
            res_path = os.path.join('results', 'linear2_sift_c.md')
        else:
            res_path = os.path.join('results', 'linear2_mnist.md')
        with open(res_path, 'a') as file:
            msg = '\n\n{} **For k = {}, height {}, method {}, max_iter: {}**\n\n'.format(
                str(date.today()), k, height, cur_method, max_loyd)
            if opt.itq:
                msg = '\n\n*ITQ*' + msg
            file.write(msg)
            file.write(acc_md)
    if print_output:
        print('acc_md\n {} \n'.format(acc_md))
    if opt.glove:
        pickle_path = os.path.join(data_dir, 'glove', 'kmeans_ht{}.pkl'.format(height))
        json_path = os.path.join(data_dir, 'glove', 'kmeans_ht{}.json'.format(height))
    elif opt.glove_c:
        pickle_path = os.path.join(data_dir, 'glove_c', 'kmeans_ht{}.pkl'.format(height))
        json_path = os.path.join(data_dir, 'glove_c', 'kmeans_ht{}.json'.format(height))
    elif opt.sift:
        pickle_path = os.path.join(data_dir, 'sift', 'kmeans_ht{}.pkl'.format(height))
        json_path = os.path.join(data_dir, 'sift', 'kmeans_ht{}.json'.format(height))
    elif opt.sift_c:
        pickle_path = os.path.join(data_dir, 'sift_c', 'kmeans_ht{}.pkl'.format(height))
        json_path = os.path.join(data_dir, 'sift_c', 'kmeans_ht{}.json'.format(height))
    else:
        pickle_path = os.path.join(data_dir, 'kmeans_ht{}.pkl'.format(height))
        json_path = os.path.join(data_dir, 'kmeans_ht{}.json'.format(height))
    if False:  # march
        utils.pickle_dump(serial_data, pickle_path)
    with open(json_path, 'w') as file:
        json.dump(serial_data, file)
    return acc_mx, probe_mx, probe95_mx
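# A small illustrative reader (not part of the original) for the results run_main
# serializes: the JSON file holds plain lists and dicts (k, height, acc_mx, probe_mx, ...),
# assuming json_path points at one of the kmeans_ht{height}.json files written above.
import json

def load_run_results(json_path):
    with open(json_path, 'r') as f:
        serial_data = json.load(f)
    # accuracy and probe-count matrices, indexed [cluster setting][bin count - 1]
    return serial_data['acc_mx'], serial_data['probe_mx']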
def _save_cache():
    global sw_df, roll_mat_csr, roll_index, y_true, D_START, D_END
    pickle_dump((sw_df, roll_mat_csr, roll_index, y_true, D_START, D_END), EVAL_CACHE_FILE)
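# Illustrative counterpart (not shown in the original): reload the tuple written by
# _save_cache, assuming pickle_dump writes a standard pickle file at EVAL_CACHE_FILE;
# the fields are unpacked in the same order they were dumped.
import pickle

def _load_cache():
    global sw_df, roll_mat_csr, roll_index, y_true, D_START, D_END
    with open(EVAL_CACHE_FILE, 'rb') as f:
        sw_df, roll_mat_csr, roll_index, y_true, D_START, D_END = pickle.load(f)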
def pre_process(file_folder, word_cut_func, is_en, start, end):
    # test_data = pd.read_csv(os.path.join('./data/twitter', 'test.csv'), header=0, index_col=None, encoding='unicode_escape')
    # print(type(test_data['content']))
    # test_data = pd.read_csv('data\sample\sample_data.csv', header=0, index_col=None, encoding='unicode_escape')
    # print(test_data)
    print('preprocessing: ', file_folder)
    # NOTE: test_data_content and test_data_aspect are assumed to be defined in the
    # surrounding module; the CSV loading above is commented out.
    test_data_word_list = word_cut_func(test_data_content)
    test_data_char_list = test_data_content
    test_data_aspect_word_list = word_cut_func(test_data_aspect)
    test_data_aspect_char_list = test_data_aspect
    # train_data['aspect_word_list'] = train_data['aspect'].apply(word_cut_func)
    # train_data['aspect_char_list'] = train_data['aspect'].apply(lambda x: list(x))

    print('building vocabulary...')
    word_corpus = test_data_word_list
    char_corpus = test_data_char_list
    aspect_corpus = test_data_aspect
    aspect_text_word_corpus = test_data_aspect_word_list
    aspect_text_char_corpus = test_data_aspect_char_list
    word_vocab = build_vocabulary(word_corpus, start_id=1)
    char_vocab = build_vocabulary(char_corpus, start_id=1)
    aspect_vocab = build_vocabulary(aspect_corpus, start_id=0)
    aspect_text_word_vocab = build_vocabulary(aspect_text_word_corpus, start_id=1)
    aspect_text_char_vocab = build_vocabulary(aspect_text_char_corpus, start_id=1)
    # print(word_vocab.get(word, len(word_vocab)+1) for word in (word_vocab))
    pickle_dump(word_vocab, os.path.join(file_folder, 'word_vocab2.pkl'))
    pickle_dump(char_vocab, os.path.join(file_folder, 'char_vocab2.pkl'))
    pickle_dump(aspect_vocab, os.path.join(file_folder, 'aspect_vocab2.pkl'))
    pickle_dump(aspect_text_word_vocab, os.path.join(file_folder, 'aspect_text_word_vocab2.pkl'))
    pickle_dump(aspect_text_char_vocab, os.path.join(file_folder, 'aspect_text_char_vocab2.pkl'))

    # prepare embedding
    print('preparing embedding...')
    word_w2v = build_embedding(word_corpus, word_vocab, config.word_embed_dim)
    aspect_word_w2v = build_aspect_embedding(aspect_vocab, word_cut_func, word_vocab, word_w2v)
    aspect_text_word_w2v = build_aspect_text_embedding(aspect_text_word_vocab, word_vocab, word_w2v)
    char_w2v = build_embedding(char_corpus, char_vocab, config.word_embed_dim)
    aspect_char_w2v = build_aspect_embedding(aspect_vocab, lambda x: list(x), char_vocab, char_w2v)
    aspect_text_char_w2v = build_aspect_text_embedding(aspect_text_char_vocab, char_vocab, char_w2v)
    np.save(os.path.join(file_folder, 'word_w2v.npy'), word_w2v)
    np.save(os.path.join(file_folder, 'aspect_word_w2v.npy'), aspect_word_w2v)
    np.save(os.path.join(file_folder, 'aspect_text_word_w2v.npy'), aspect_text_word_w2v)
    np.save(os.path.join(file_folder, 'char_w2v.npy'), char_w2v)
    np.save(os.path.join(file_folder, 'aspect_char_w2v.npy'), aspect_char_w2v)
    np.save(os.path.join(file_folder, 'aspect_text_char_w2v.npy'), aspect_text_char_w2v)
    print('finished preparing embedding!')

    if is_en:
        word_glove = build_glove_embedding(word_vocab, glove_vectors, glove_embed_dim)
        aspect_word_glove = build_aspect_embedding(aspect_vocab, word_cut_func, word_vocab, word_glove)
        aspect_text_word_glove = build_aspect_text_embedding(aspect_text_word_vocab, word_vocab, word_glove)
        np.save(os.path.join(file_folder, 'word_glove.npy'), word_glove)
        np.save(os.path.join(file_folder, 'aspect_word_glove.npy'), aspect_word_glove)
        np.save(os.path.join(file_folder, 'aspect_text_word_glove.npy'), aspect_text_word_glove)
        # print('shape of word_glove:', word_glove.shape)
        # print('sample of word_glove:', word_glove[:2, :5])
        # print('shape of aspect_word_glove:', aspect_word_glove.shape)
        # print('sample of aspect_word_glove:', aspect_word_glove[:2, :5])
        # print('shape of aspect_text_word_glove:', aspect_text_word_glove.shape)
        # print('sample of aspect_text_word_glove:', aspect_text_word_glove[:2, :5])

    # prepare input
    print('preparing text input...')
    g = lambda x: [word_vocab.get(word, len(word_vocab) + 1) for word in x]
    f = lambda x: [char_vocab.get(char, len(char_vocab) + 1) for char in x]
    test_word_input = g(test_data_word_list)
    test_char_input = f(test_data_char_list)
    pickle_dump(test_word_input, os.path.join(file_folder, 'test_word_input2.pkl'))
    pickle_dump(test_char_input, os.path.join(file_folder, 'test_char_input2.pkl'))
    print('finished preparing text input!')

    print('preparing aspect input...')
    # train_aspect_input = train_data['aspect'].apply(lambda x: [aspect_vocab[x]]).values.tolist()
    # valid_aspect_input = valid_data['aspect'].apply(lambda x: [aspect_vocab[x]]).values.tolist()
    test_aspect_input = test_data_aspect
    # pickle_dump(train_aspect_input, os.path.join(file_folder, 'train_aspect_input.pkl'))
    # pickle_dump(valid_aspect_input, os.path.join(file_folder, 'valid_aspect_input.pkl'))
    pickle_dump(test_aspect_input, os.path.join(file_folder, 'test_aspect_input.pkl'))
    print('finished preparing aspect input!')

    print('preparing aspect text input...')
    x = lambda x: [aspect_text_word_vocab.get(word, len(aspect_text_word_vocab) + 1) for word in x]
    y = lambda x: [aspect_text_char_vocab.get(char, len(aspect_text_char_vocab) + 1) for char in x]
    test_aspect_text_word_input = x(test_data_aspect_word_list)
    test_aspect_text_char_input = y(test_data_aspect_char_list)
    pickle_dump(test_aspect_text_word_input, os.path.join(file_folder, 'test_word_aspect_input.pkl'))
    pickle_dump(test_aspect_text_char_input, os.path.join(file_folder, 'test_char_aspect_input.pkl'))
    print('finished preparing aspect text input!')

    test_word_input_l, test_word_input_r, test_word_input_r_with_pad, test_word_mask, test_word_pos_input, \
        test_word_offset_input, test_char_input_l, test_char_input_r, test_char_input_r_with_pad, test_char_mask, \
        test_char_pos_input, test_char_offset_input = split_text_and_get_loc_info(word_vocab, char_vocab,
                                                                                  word_cut_func, start, end)
    pickle_dump(test_word_input_l, os.path.join(file_folder, 'test_word_input_l2.pkl'))
    pickle_dump(test_word_input_r, os.path.join(file_folder, 'test_word_input_r2.pkl'))
    pickle_dump(test_word_input_r_with_pad, os.path.join(file_folder, 'test_word_input_r_with_pad2.pkl'))
    pickle_dump(test_word_mask, os.path.join(file_folder, 'test_word_mask2.pkl'))
    pickle_dump(test_word_pos_input, os.path.join(file_folder, 'test_word_pos_input2.pkl'))
    pickle_dump(test_word_offset_input, os.path.join(file_folder, 'test_word_offset_input2.pkl'))
    pickle_dump(test_char_input_l, os.path.join(file_folder, 'test_char_input_l2.pkl'))
    pickle_dump(test_char_input_r, os.path.join(file_folder, 'test_char_input_r2.pkl'))
    pickle_dump(test_char_input_r_with_pad, os.path.join(file_folder, 'test_char_input_r_with_pad2.pkl'))
    pickle_dump(test_char_mask, os.path.join(file_folder, 'test_char_mask2.pkl'))
    pickle_dump(test_char_pos_input, os.path.join(file_folder, 'test_char_pos_input2.pkl'))
    pickle_dump(test_char_offset_input, os.path.join(file_folder, 'test_char_offset_input2.pkl'))

    # prepare output
    # if 'sentiment' in test_data.columns:
    #     pickle_dump(test_data['sentiment'].values.tolist(), os.path.join(file_folder, 'test_label.pkl'))
    print('finished preparing output!')
def process_data(dataset: str, config: ProcessConfig):
    train_file = NER_TRAIN_FILE[dataset]
    dev_file = NER_DEV_FILE.get(dataset, None)
    test_file = NER_TEST_FILE.get(dataset, None)

    print('Logging Info - Loading ner data...')
    if dev_file is None and test_file is None:
        train_data, dev_data, test_data = load_ner_data(train_file, config.normalized, config.lower,
                                                        split_mode=2)
    elif dev_file is None:
        train_data, dev_data = load_ner_data(train_file, config.normalized, config.lower, split_mode=1)
        test_data = load_ner_data(test_file, config.normalized, config.lower)
    elif test_file is None:
        train_data, test_data = load_ner_data(train_file, config.normalized, config.lower, split_mode=1)
        dev_data = load_ner_data(dev_file, config.normalized, config.lower)
    else:
        train_data = load_ner_data(train_file, config.normalized, config.lower)
        dev_data = load_ner_data(dev_file, config.normalized, config.lower)
        test_data = load_ner_data(test_file, config.normalized, config.lower)

    print('Logging Info - Loading gazetteer and generating trie...')
    gaze_tries = dict()
    for gaze_file in GAZETTEER_FILES[dataset]:
        gaze_name = os.path.basename(gaze_file)
        gaze_tries[gaze_name] = load_gaze_trie(gaze_file, config.normalized, config.lower)

    print('Logging Info - Generating matching entity...')
    search_entity(train_data, gaze_tries)
    search_entity(dev_data, gaze_tries)
    search_entity(test_data, gaze_tries)

    print('Logging Info - Generating corpus...')
    char_corpus = [text_example.tokens for text_example in train_data + dev_data + test_data]
    fw_bigram_corpus = [text_example.fw_bigrams for text_example in train_data + dev_data + test_data]
    bw_bigram_corpus = [text_example.bw_bigrams for text_example in train_data + dev_data + test_data]
    tag_corpus = [text_example.tags for text_example in train_data + dev_data + test_data]

    print('Logging Info - Generating vocabulary...')
    char_vocab, idx2char = build_vocab(char_corpus)
    fw_bigram_vocab, idx2fw_bigram = build_vocab(fw_bigram_corpus)
    bw_bigram_vocab, idx2bw_bigram = build_vocab(bw_bigram_corpus)
    tag_vocab, idx2tag = build_tag_vocab(tag_corpus)

    print('Logging Info - Preparing embedding...')
    c2v = train_w2v(char_corpus, char_vocab, embedding_dim=config.char_embed_dim)
    c_fasttext = train_fasttext(char_corpus, char_vocab, embedding_dim=config.char_embed_dim)
    c_glove = train_glove(char_corpus, char_vocab, embedding_dim=config.char_embed_dim)
    fw_bi2v = train_w2v(fw_bigram_corpus, fw_bigram_vocab, embedding_dim=config.bigram_embed_dim)
    fw_bifasttext = train_fasttext(fw_bigram_corpus, fw_bigram_vocab, embedding_dim=config.bigram_embed_dim)
    fw_biglove = train_glove(fw_bigram_corpus, fw_bigram_vocab, embedding_dim=config.bigram_embed_dim)
    bw_bi2v = train_w2v(bw_bigram_corpus, bw_bigram_vocab, embedding_dim=config.bigram_embed_dim)
    bw_bifasttext = train_fasttext(bw_bigram_corpus, bw_bigram_vocab, embedding_dim=config.bigram_embed_dim)
    bw_biglove = train_glove(bw_bigram_corpus, bw_bigram_vocab, embedding_dim=config.bigram_embed_dim)

    print('Logging Info - Saving processed data...')
    pickle_dump(format_filename(PROCESSED_DATA_DIR, TRAIN_DATA_TEMPLATE, dataset=dataset), train_data)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, DEV_DATA_TEMPLATE, dataset=dataset), dev_data)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, TEST_DATA_TEMPLATE, dataset=dataset), test_data)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, dataset=dataset, level='char'),
                char_vocab)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, dataset=dataset, level='fw_bigram'),
                fw_bigram_vocab)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, dataset=dataset, level='bw_bigram'),
                bw_bigram_vocab)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE, dataset=dataset, level='tag'),
                tag_vocab)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, IDX2TOKEN_TEMPLATE, dataset=dataset, level='char'),
                idx2char)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, IDX2TOKEN_TEMPLATE, dataset=dataset, level='fw_bigram'),
                idx2fw_bigram)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, IDX2TOKEN_TEMPLATE, dataset=dataset, level='bw_bigram'),
                idx2bw_bigram)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, IDX2TOKEN_TEMPLATE, dataset=dataset, level='tag'),
                idx2tag)
    np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, dataset=dataset, type='c2v'), c2v)
    np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, dataset=dataset, type='c_fasttext'),
            c_fasttext)
    np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, dataset=dataset, type='c_glove'),
            c_glove)
    np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, dataset=dataset, type='fw_bi2v'),
            fw_bi2v)
    np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, dataset=dataset, type='fw_bifasttext'),
            fw_bifasttext)
    np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, dataset=dataset, type='fw_biglove'),
            fw_biglove)
    np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, dataset=dataset, type='bw_bi2v'),
            bw_bi2v)
    np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, dataset=dataset, type='bw_bifasttext'),
            bw_bifasttext)
    np.save(format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE, dataset=dataset, type='bw_biglove'),
            bw_biglove)
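# Illustrative sketch (not part of the original pipeline) of consuming the artifacts saved
# above, assuming pickle_load is the read-side twin of pickle_dump and format_filename
# resolves to the same paths; np.save appends '.npy' when the target name lacks an extension.
def load_char_resources(dataset: str):
    char_vocab = pickle_load(format_filename(PROCESSED_DATA_DIR, VOCABULARY_TEMPLATE,
                                             dataset=dataset, level='char'))
    c2v_path = format_filename(PROCESSED_DATA_DIR, EMBEDDING_MATRIX_TEMPLATE,
                               dataset=dataset, type='c2v')
    c2v = np.load(c2v_path if c2v_path.endswith('.npy') else c2v_path + '.npy')
    return char_vocab, c2v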
            sleep(sleep_time)
        else:
            # let the caller display the error details
            raise
    return


if __name__ == '__main__':
    pdf_dir = utils.check_argv_path(sys.argv)
    pdf_path_list = utils.get_path_list(pdf_dir, 'pdf')
    # roughly detect and skip files that have already been processed
    # pdf_path_list = [p for p in pdf_path_list if not os.path.basename(p).startswith('[')]
    amazon_url_list = []
    for pdf_path in pdf_path_list:
        isbn = pdf_to_isbn(pdf_path)
        if isbn:
            try:
                amazon_items = fetch_amazon_item(isbn)
            except HTTPError as e:
                print('Failed to fetch item info:', pdf_path, isbn)
                continue
            if amazon_items:
                amazon_url = get_amazon_url(amazon_items)
                print(amazon_url)
                amazon_url_list.append(amazon_url)
                # save the pickle periodically, just in case
                if len(amazon_url_list) % 10 == 0:
                    utils.pickle_dump(amazon_url_list, filename='amazon_url_list.pickel')
            # sleep to stay under the API rate limit
            sleep(2)
    utils.pickle_dump(amazon_url_list, filename='amazon_url_list.pickel')
        'product_id').size()
    product_feat['product_first_reorder_num'] = order_products_prior[
        order_products_prior.user_product_order_number == 1].groupby(
            'product_id').size()
    product_feat['product_first_reorder_num'].fillna(0, inplace=True)  # fillna
    product_feat['product_user_order_only_once_num'] = \
        product_feat['product_first_order_num'] - product_feat['product_first_reorder_num']
    product_feat['product_user_order_only_once_ratio'] = \
        product_feat['product_user_order_only_once_num'] / product_feat['product_first_order_num']
    product_feat['product_reorder_ratio'] = product_feat[
        'product_first_reorder_num'] / product_feat['product_first_order_num']
    product_feat['product_average_user_reorder_num'] = product_feat[
        'product_reorder_num'] / product_feat['product_first_order_num']
    product_feat['product_average_add_to_cart_order'] = order_products_prior.groupby(
        'product_id')['add_to_cart_order'].mean()

    feats = [
        'product_order_num', 'product_reorder_num', 'product_reorder_frequency',
        'product_first_order_num', 'product_first_reorder_num',
        'product_reorder_ratio', 'product_user_order_only_once_num',
        'product_user_order_only_once_ratio', 'product_average_user_reorder_num',
        'product_average_add_to_cart_order'
    ]
    pickle_dump(product_feat[feats], '{}/product_feat.pkl'.format(config.feat_folder))
    print('Done - product features')
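# Illustrative read-back of the feature table dumped above; pandas can load a pickled
# DataFrame directly, assuming product_feat.pkl was written with the standard pickle
# protocol and config.feat_folder is unchanged.
import pandas as pd

product_feat = pd.read_pickle('{}/product_feat.pkl'.format(config.feat_folder))
print(product_feat.shape)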