Example #1
def process_data(dataset: str, neighbor_sample_size: int, K: int):
    drug_vocab = {}
    entity_vocab = {}
    relation_vocab = {}

    read_entity2id_file(ENTITY2ID_FILE[dataset], drug_vocab, entity_vocab)

    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        DRUG_VOCAB_TEMPLATE,
                        dataset=dataset), drug_vocab)
    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        ENTITY_VOCAB_TEMPLATE,
                        dataset=dataset), entity_vocab)

    examples_file = format_filename(PROCESSED_DATA_DIR,
                                    DRUG_EXAMPLE,
                                    dataset=dataset)
    examples = read_example_file(EXAMPLE_FILE[dataset], SEPARATOR[dataset],
                                 drug_vocab)
    print(len(examples))
    # examples contains both positive and negative samples
    # each example: [drug1, drug2, interaction]
    np.save(examples_file, examples)

    adj_entity_file = format_filename(PROCESSED_DATA_DIR,
                                      ADJ_ENTITY_TEMPLATE,
                                      dataset=dataset)
    adj_relation_file = format_filename(PROCESSED_DATA_DIR,
                                        ADJ_RELATION_TEMPLATE,
                                        dataset=dataset)

    adj_entity, adj_relation = read_kg(KG_FILE[dataset], entity_vocab,
                                       relation_vocab, neighbor_sample_size)

    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        DRUG_VOCAB_TEMPLATE,
                        dataset=dataset), drug_vocab)
    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        ENTITY_VOCAB_TEMPLATE,
                        dataset=dataset), entity_vocab)
    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        RELATION_VOCAB_TEMPLATE,
                        dataset=dataset), relation_vocab)
    adj_entity_file = format_filename(PROCESSED_DATA_DIR,
                                      ADJ_ENTITY_TEMPLATE,
                                      dataset=dataset)
    np.save(adj_entity_file, adj_entity)
    print('Logging Info - Saved:', adj_entity_file)

    adj_relation_file = format_filename(PROCESSED_DATA_DIR,
                                        ADJ_RELATION_TEMPLATE,
                                        dataset=dataset)
    np.save(adj_relation_file, adj_relation)
    print('Logging Info - Saved:', adj_relation_file)
    cross_validation(K, examples, dataset, neighbor_sample_size)
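All of these snippets rely on a small project-specific `pickle_dump` helper (usually paired with `pickle_load`), and its signature varies from repo to repo: some take `(path, obj)` as in Example #1, others `(obj, path)` as in Example #4, and a few pass an already opened file object as in Example #9. A minimal sketch of such a helper pair, matching the `(path, obj)` order used in Example #1; this is an assumption, not the implementation from any particular repository:

```python
import pickle


def pickle_dump(filename, obj):
    # Hypothetical helper matching the (path, obj) call style of Example #1.
    with open(filename, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)
    print('Logging Info - Saved:', filename)


def pickle_load(filename):
    # Read the pickled object back from disk.
    with open(filename, 'rb') as f:
        return pickle.load(f)
```

Swap the argument order, or accept a file object instead of a path, to match whichever convention the surrounding example uses.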
Example #2
def eval_deprecated():
    '''partially deprecated'''
    k = opt.k
    train_node = TrainNode(opt)
    idx2bin = {}
    dataset = utils.load_data('query') ### use dataset eventually
    dataset = dataset.to(device)
    dsnode_path = opt.dsnode_path + str(opt.n_clusters)
    #print('dsnode path {}'.format(dsnode_path))
    dsnode = utils.pickle_load(dsnode_path)
    print('dsnode {}'.format(dsnode))
    
    train_node.train(dataset, dsnode, idx2bin)
    #idx (of query) in entire dataset, bin is idx of leaf bin.
    
    eval_root = train_node.create_eval_tree()
    idx2bin = eval_root.idx2bin
    #eval root should contain dict for answers set indices and bin #, for evaluation.
        
    #serialize
    print('train.py - serializing model evaluation tree...')
    eval_root_path = osp.join(opt.data_dir, 'model_eval_root')
    utils.pickle_dump(eval_root, eval_root_path)

    ## evaluate ##    
    queryset = utils.load_data('query')
    neighbors = utils.load_data('answers')
    acc, probe_count = eval_model(eval_root, queryset, neighbors, opt)
    print('train.py - Query set prediction acc {} probe count {}'.format(acc, probe_count))
Example #3
def get_validation_split(data_file,
                         training_file,
                         validation_file,
                         data_split=0.8,
                         overwrite=False):
    """
    Splits the data into the training and validation indices list.
    :param data_file: pytables hdf5 data file
    :param training_file:
    :param validation_file:
    :param data_split:
    :param overwrite:
    :return:
    """
    if overwrite or not os.path.exists(training_file):
        print("Creating validation split...")
        nb_samples = data_file.root.data.shape[0]
        sample_list = list(range(nb_samples))
        training_list, validation_list = split_list(sample_list,
                                                    split=data_split)
        pickle_dump(training_list, training_file)
        pickle_dump(validation_list, validation_file)
        return training_list, validation_list
    else:
        print("Loading previous validation split...")
        return pickle_load(training_file), pickle_load(validation_file)
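The helper `split_list` is not shown above; given how it is called, a minimal sketch of what it might do is to shuffle the sample indices and cut them at the `split` fraction. This is an assumption, not the project's actual code:

```python
import random


def split_list(input_list, split=0.8, shuffle_list=True):
    # Hypothetical helper: optionally shuffle, then cut the list at the split fraction.
    if shuffle_list:
        random.shuffle(input_list)
    n_training = int(len(input_list) * split)
    return input_list[:n_training], input_list[n_training:]
```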
Example #4
def main():
    model_file = os.path.join(modeldir, 'commoncrawl_fr-en.bin')
    trajectory_file = os.path.join(datadir,
                                   'eolss-train.trajectories+scores.txt')

    con = Connection(configuration=client_conf)
    con.set_globals(trajectory_file=trajectory_file,
                    model_file=model_file,
                    gamma=gamma)
    con.run(load_data)

    trajectories.compute_scores(phi, gamma)
    transitions = [(s, r) for _, s, _, r in trajectories.SBIRL(phi)
                   ]  # in fitted-value iteration we care only about s'
    shuffle(transitions)

    regressor = None

    for k in range(n_iterations):
        print('Iteration', k)
        con.set_globals(regressor=regressor)

        training_set = con.map(training_, transitions)
        regressor = get_regressor(training_set)

        pickle_dump(regressor, 'output/regressor.{}.pickle'.format(k + 1))
Example #5
def repetitive(directory='.'):
    stats = {'repetitive_count': {}, 'query_count': {}}

    repetitive_queries = set()

    for repo in Repository.objects.exclude(latest_successful_attempt=None):
        if filter_repository(repo):
            continue

        project_type_name = repo.project_type.name

        for action in Action.objects.filter(
                attempt=repo.latest_successful_attempt):
            queries = [query.content.strip()
                       for query in Query.objects.filter(action=action)]
            for i in range(1, len(queries)):
                if queries[i] == queries[i - 1]:
                    repetitive_queries.add(queries[i])
                    print(project_type_name)
                    print(queries[i])
                    print()
                    stats['repetitive_count'][
                        project_type_name] = stats['repetitive_count'].get(
                            project_type_name, 0) + 1
            stats['query_count'][project_type_name] = stats['query_count'].get(
                project_type_name, 0) + len(queries)

    pickle_dump(directory, 'repetitive_queries', repetitive_queries)

    dump_all_stats(directory, stats)
Example #6
def cache_sites(se_sites_path, api_key):
    url = 'https://api.stackexchange.com/2.2/sites'
    params = {'pagesize': 100}
    json_items = call_api(url, params)
    sites = []
    for item in json_items:
        api_name = item.get('api_site_parameter')
        sites.append(api_name)
    utils.pickle_dump(se_sites_path, sites)
    print('cached se site list to file')
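`call_api` is an external helper; a rough sketch of how such a call could page through the Stack Exchange `/sites` endpoint with `requests`, collecting `items` until `has_more` is false. The pagination logic and parameter names below are assumptions, not the original `call_api`:

```python
import requests


def call_api(url, params, api_key=None):
    # Hypothetical pager: fetch every page and return the accumulated 'items' list.
    items, page = [], 1
    while True:
        query = dict(params, page=page)
        if api_key:
            query['key'] = api_key
        response = requests.get(url, params=query)
        response.raise_for_status()
        payload = response.json()
        items.extend(payload.get('items', []))
        if not payload.get('has_more'):
            return items
        page += 1
```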
Example #7
    def pickle_self(self):
        self.total_time = time() - self.t0
        self.url_file.write('\nUrl counts: {}\nDuplicate counts: {}'.format(
            self.url_count, self.duplicate_count))
        self.url_file.write('\nTime taken: {}'.format(self.total_time))
        if self.depth_reached > self.max_depth:
            self.depth_reached -= 1
        self.url_file.write('\nDepth reached: {}'.format(self.depth_reached))
        self.url_file.close()
        self.url_file = None
        self.conn.close()
        utils.pickle_dump(self.state_path, self)
Example #8
    def check_refresh_complete (self):
        if self.rs.refresh_status["no_need_update"]:
            self.status_changed (self.orig_office_status)
            return False

        if self.rs.refresh_status["last_notification"] == False:
            return True

        if self.last_nid == self.rs.last_nid and int (self.last_nid) != 0:
            logging.debug ("self.last_nid == self.rs.last_nid")
            self.status_changed (self.orig_office_status)
            return False

        if self.rs.refresh_status["current_status"] == True:
            self.rs.refresh_status["current_status"] = False
            self.current_status = self.rs.current_status
            self.refresh_status_changed (defs.CURRENT_STATUS_COMPLETED)

        if self.rs.refresh_status["notification"] == True and \
                self.rs.refresh_status["comments"] == True:
            self.rs.refresh_status["notification"] = False
            self.rs.refresh_status["comments"] = False
            self.notification = self.rs.notification
            self.status = self.rs.status
            self.refresh_status_changed (defs.NOTIFICATION_COMMENTS_COMPLETED)

        if self.rs.refresh_status["users_icon"] == True:
            self.rs.refresh_status["users_icon"] = False
            self.user_ids = self.rs.user_ids
            self.users = self.rs.users
            self.refresh_status_changed (defs.USERS_ICON_COMPLETED)

        if self.rs.refresh_status["apps_icon"] == True:
            self.rs.refresh_status["apps_icon"] = False
            self.app_ids = self.rs.app_ids
            self.applications = self.rs.applications
            self.refresh_status_changed (defs.APPS_ICON_COMPLETED)

        if self.rs.isAlive ():
            return True

        logging.debug ("completed")
        self.last_nid = self.rs.last_nid
        for k in self.rs.refresh_status:
            self.rs.refresh_status[k] = False

        path = self.local_data_dir + "/cache.pickle"
        utils.pickle_dump (self, path)

        self.status_changed (self.orig_office_status)
        return False
Example #9
def save_checkpoint(model, infos, optimizer, append='tr'):
    if len(append) > 0:
        append = '-' + append
    # create the checkpoint directory if it doesn't exist yet
    if not os.path.isdir(opt.checkpoint_path):
        os.makedirs(opt.checkpoint_path)
    checkpoint_path = os.path.join(opt.checkpoint_path,
                                   'model%s.pth' % (append))
    torch.save(model.state_dict(), checkpoint_path)
    print("model saved to {}".format(checkpoint_path))
    with open(os.path.join(opt.checkpoint_path, 'infos%s.pkl' % (append)),
              'wb') as f:
        pickle_dump(infos, f)
    optimizer_path = os.path.join(opt.checkpoint_path,
                                  'optimizer%s.pth' % (append))
    torch.save(optimizer.state_dict(), optimizer_path)
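For completeness, the state saved above can be restored with the matching `torch.load` / `pickle.load` calls. A hedged usage sketch, assuming the default `append='tr'` suffix, the same `opt.checkpoint_path`, existing `model`/`optimizer` instances, and that `pickle_dump` wrote a plain pickle stream:

```python
import os
import pickle

import torch

checkpoint_dir = opt.checkpoint_path  # same namespace as in save_checkpoint (assumption)
# The checkpoints hold state_dicts, so load them into matching model/optimizer objects.
model.load_state_dict(torch.load(os.path.join(checkpoint_dir, 'model-tr.pth')))
optimizer.load_state_dict(torch.load(os.path.join(checkpoint_dir, 'optimizer-tr.pth')))
with open(os.path.join(checkpoint_dir, 'infos-tr.pkl'), 'rb') as f:
    infos = pickle.load(f)
```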
Example #10
def main(params):

    imgs = json.load(open(params['input_json'], 'r'))
    itow = json.load(open(params['dict_json'], 'r'))['ix_to_word']
    wtoi = {w: i for i, w in itow.items()}

    imgs = imgs['images']

    ngram_words, ngram_idxs, ref_len = build_dict(imgs, wtoi, params)

    utils.pickle_dump({
        'document_frequency': ngram_words,
        'ref_len': ref_len
    }, open(params['output_pkl'] + '-words.p', 'wb'))
    utils.pickle_dump({
        'document_frequency': ngram_idxs,
        'ref_len': ref_len
    }, open(params['output_pkl'] + '-idxs.p', 'wb'))
Example #11
def cv_split(train_data,
             dev_data,
             cate3_vocab,
             fold=5,
             balanced=True,
             random_state=42):
    def indexing_data(data, indices):
        part_data = {}
        for k in data.keys():
            part_data[k] = [data[k][i] for i in indices]
        return part_data

    all_data = {}
    for key in train_data.keys():
        all_data[key] = train_data[key] + dev_data[key]

    # some category in validation set is not in cate3_vocab
    cate3_id_list = [cate3_vocab.get(cate3, 0) for cate3 in all_data['cate3']]
    index_range = np.arange(len(all_data['id']))

    if balanced:
        kf = StratifiedKFold(n_splits=fold,
                             shuffle=True,
                             random_state=random_state)
    else:
        kf = KFold(n_splits=fold, shuffle=True, random_state=random_state)

    for idx, (train_index,
              dev_index) in enumerate(kf.split(index_range, cate3_id_list)):
        train_data_fold = indexing_data(all_data, train_index)
        dev_data_fold = indexing_data(all_data, dev_index)

        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            TRAIN_CV_DATA_TEMPLATE,
                            random=random_state,
                            fold=fold,
                            index=idx), train_data_fold)
        pickle_dump(
            format_filename(PROCESSED_DATA_DIR,
                            DEV_CV_DATA_TEMPLATE,
                            random=random_state,
                            fold=fold,
                            index=idx), dev_data_fold)
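`format_filename` together with the `*_TEMPLATE` constants is used throughout these examples to build output paths. A minimal sketch of what it might do, assuming the templates are ordinary `str.format` patterns (the template string below is hypothetical):

```python
import os


def format_filename(base_dir, template, **kwargs):
    # Hypothetical helper: fill the filename template and join it onto the data directory.
    return os.path.join(base_dir, template.format(**kwargs))


# e.g. format_filename(PROCESSED_DATA_DIR, 'train_cv{random}_{fold}_{index}.pkl',
#                      random=42, fold=5, index=0)
```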
Example #12
def get_usr_mov_features(model: Model, params_file_path, poster_path):
    usr_pkl = {}
    mov_pkl = {}

    # Load the saved parameters into the model and switch it to evaluation mode with eval()
    model_state_dict = load_params(params_file_path)
    model.load_dict(model_state_dict)
    model.eval()
    # Get the full dataset
    dataset = model.Dataset.dataset

    for i in range(len(dataset)):
        # Get the user info, movie info and rating for this sample.
        # Only users and movies that appear in the samples are converted here;
        # in practice the full data from the production system can be used.
        usr_info, mov_info, score = dataset[i]['usr_info'], dataset[i][
            'mov_info'], dataset[i]['scores']
        usrid = str(usr_info['usr_id'])
        movid = str(mov_info['mov_id'])

        # Compute the user features from the user info and cache them in the usr_pkl dict
        if usrid not in usr_pkl.keys():
            usr_id_v = list2tensor(usr_info['usr_id'], [1])
            usr_age_v = list2tensor(usr_info['age'], [1])
            usr_gender_v = list2tensor(usr_info['gender'], [1])
            usr_job_v = list2tensor(usr_info['job'], [1])

            usr_in = [usr_id_v, usr_gender_v, usr_age_v, usr_job_v]
            usr_feat = model.get_usr_feat(usr_in)
            usr_pkl[usrid] = usr_feat.numpy()

        # Compute the movie features from the movie info and cache them in the mov_pkl dict
        if movid not in mov_pkl.keys():
            mov_id_v = list2tensor(mov_info['mov_id'], [1])
            mov_tit_v = list2tensor(mov_info['title'], [1, 1, 15])
            mov_cat_v = list2tensor(mov_info['category'], [1, 6])
            mov_in = [mov_id_v, mov_cat_v, mov_tit_v, None]
            mov_feat = model.get_mov_feat(mov_in)
            mov_pkl[movid] = mov_feat.numpy()

    # Save the features to local files
    pickle_dump(usr_pkl, './usr_feat.pkl')
    pickle_dump(mov_pkl, './mov_feat.pkl')
    print("usr & mov features saved!!!")
Example #13
def get_validation_split(data_file,
                         training_file,
                         validation_file,
                         data_split=0.8,
                         overwrite=False):
    """
    """
    if overwrite or not os.path.exists(training_file):
        print("Creating validation split...")
        nb_samples = data_file.root.data.shape[0]
        sample_list = list(range(nb_samples))
        training_list, validation_list = split_list(sample_list,
                                                    split=data_split)
        pickle_dump(training_list, training_file)
        pickle_dump(validation_list, validation_file)
        return training_list, validation_list
    else:
        print("Loading previous validation split...")
        return pickle_load(training_file), pickle_load(validation_file)
Example #14
def main():
    model_file = os.path.join(modeldir, 'commoncrawl_fr-en.bin')
    trajectory_file = os.path.join(datadir, 'eolss-train.trajectories+scores.txt')

    con = Connection(configuration=client_conf)
    con.set_globals(trajectory_file=trajectory_file, model_file=model_file, gamma=gamma)
    con.run(load_data)

    trajectories.compute_scores(phi, gamma)
    transitions = [(s, r) for _, s, _, r in trajectories.SBIRL(phi)]  # in fitted-value iteration we care only about s'
    shuffle(transitions)

    regressor = None

    for k in range(n_iterations):
        print('Iteration', k)
        con.set_globals(regressor=regressor)

        training_set = con.map(training_, transitions)
        regressor = get_regressor(training_set)

        pickle_dump(regressor, 'output/regressor.{}.pickle'.format(k + 1))
Example #15
def main(args):
    config_path = args.config
    name = args.name + "_{}".format(int(time.time()))
    config = Config(config_path)

    if not os.path.exists(TRAINING_RESULTS):
        os.makedirs(TRAINING_RESULTS)

    if not os.path.exists(MODELS_PATH):
        os.makedirs(MODELS_PATH)

    if not os.path.exists(CV_PARAMS_PATH):
        os.makedirs(CV_PARAMS_PATH)

    X_train, X_test, y_train, y_test = config.get_data_from_config()
    grid = config.get_estimator_from_config()
    grid.fit(X_train, y_train)
    terminal_break()
    print("Training finished")

    predictions = grid.predict(X_test)
    report = classification_report(y_test, predictions)
    report_path = os.path.join(TRAINING_RESULTS, name + '_report.txt')
    print("Classification Report stored in {}".format(report_path))
    print(report)

    with open(report_path, 'w') as f:
        f.write(report)
    model_path = os.path.join(MODELS_PATH, name + '_model.pkl')
    print("\n Pickling and saving best model at {}".format(model_path))
    pickle_dump(grid.best_estimator_, model_path)

    cv_params_and_score = {
        'best_score': grid.best_score_,
        'best_params': grid.best_params_
    }
    params_path = os.path.join(CV_PARAMS_PATH, name + '_params.txt')
    dict_dump(cv_params_and_score, params_path)
Example #16
def main():
    args = get_args()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    itos, stoi = generate_vocab_mappings(CHAR_VOCAB_PATH)
    print('len vocabulary:', len(stoi))
    model = WordNLM(args.word_embedding_size, len(itos), args.hidden_dim, args.layer_num)
    model.to(device)
    if args.load_from is not None:
        if args.load_from == "LSTM":
            weight_path = MODELS_HOME+"/"+args.load_from+".pth.tar"
        else:
            weight_path = MODELS_HOME + args.load_from
        model = load_WordNLM_model(weight_path, model, device, args.load_from)
    else:
        assert False
    model.eval()

    if args.test == "gender":
        parameters = {"gender_model": model, "gender_device": device, "vocab_mapping": stoi}
    elif args.test == "syntax":
        path = DATASETS_PATHS[args.dataset]
        parameters = {"path": path, "syntactic_model": model, "syntactic_device": device, "vocab_mapping": stoi}
    elif args.test == "test2":
        pass

    result = TESTS[args.test](**parameters)
    print(result)

    if args.test == "gender":
        result_name = BASE_RESULTS_PATH + args.load_from + "_" + RESULTS_PATHS["gender"]
    elif args.test == "syntax":
        result_name = BASE_RESULTS_PATH + args.load_from + "_" + RESULTS_PATHS[args.dataset]

    pickle_dump(result, result_name)

    return result
Example #17
def process_data(dataset: str, neighbor_sample_size: int):
    user_vocab = {}
    item_vocab = {}
    entity_vocab = {}
    relation_vocab = {}

    read_item2entity_file(ITEM2ENTITY_FILE[dataset], item_vocab, entity_vocab)
    train_data, dev_data, test_data = read_rating_file(RATING_FILE[dataset], SEPARATOR[dataset],
                                                       THRESHOLD[dataset], user_vocab, item_vocab)
    adj_entity, adj_relation = read_kg(KG_FILE[dataset], entity_vocab, relation_vocab,
                                       neighbor_sample_size)

    pickle_dump(format_filename(PROCESSED_DATA_DIR, USER_VOCAB_TEMPLATE, dataset=dataset),
                user_vocab)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, ITEM_VOCAB_TEMPLATE, dataset=dataset),
                item_vocab)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, ENTITY_VOCAB_TEMPLATE, dataset=dataset),
                entity_vocab)
    pickle_dump(format_filename(PROCESSED_DATA_DIR, RELATION_VOCAB_TEMPLATE, dataset=dataset),
                relation_vocab)

    train_data_file = format_filename(PROCESSED_DATA_DIR, TRAIN_DATA_TEMPLATE, dataset=dataset)
    np.save(train_data_file, train_data)
    print('Logging Info - Saved:', train_data_file)

    dev_data_file = format_filename(PROCESSED_DATA_DIR, DEV_DATA_TEMPLATE, dataset=dataset)
    np.save(dev_data_file, dev_data)
    print('Logging Info - Saved:', dev_data_file)

    test_data_file = format_filename(PROCESSED_DATA_DIR, TEST_DATA_TEMPLATE, dataset=dataset)
    np.save(test_data_file, test_data)
    print('Logging Info - Saved:', test_data_file)

    adj_entity_file = format_filename(PROCESSED_DATA_DIR, ADJ_ENTITY_TEMPLATE, dataset=dataset)
    np.save(adj_entity_file, adj_entity)
    print('Logging Info - Saved:', adj_entity_file)

    adj_relation_file = format_filename(PROCESSED_DATA_DIR, ADJ_RELATION_TEMPLATE, dataset=dataset)
    np.save(adj_relation_file, adj_relation)
    print('Logging Info - Saved:', adj_relation_file)
Example #18
import pandas as pd

from param_config import config
from utils import pickle_load, pickle_dump

if __name__ == '__main__':
    print('Generating aisle features...')
    order_products_prior = pickle_load(config.order_products_prior_path)
    products = pickle_load(config.products_path)
    order_products_prior = pd.merge(order_products_prior,
                                    products,
                                    on='product_id',
                                    how='left')

    aisle_feat = pd.DataFrame()
    aisle_feat['aisle_order_num'] = order_products_prior.groupby(
        'aisle_id').size()
    aisle_feat['aisle_reorder_num'] = order_products_prior.groupby(
        'aisle_id')['reordered'].sum()
    aisle_feat['aisle_reorder_ratio'] = aisle_feat[
        'aisle_reorder_num'] / aisle_feat['aisle_order_num']

    aisle_feat[
        'aisle_average_add_to_cart_order'] = order_products_prior.groupby(
            'aisle_id')['add_to_cart_order'].mean()

    feats = [
        'aisle_order_num', 'aisle_reorder_num', 'aisle_reorder_ratio',
        'aisle_average_add_to_cart_order'
    ]
    pickle_dump(aisle_feat[feats],
                '{}/aisle_feat.pkl'.format(config.feat_folder))
    print('Done - aisle features')
Example #19
        df = pd.merge(df,
                      user_product_recent_feat,
                      left_on=['user_id', 'product_id'],
                      right_index=True,
                      how='left')
        df = pd.merge(df,
                      user_product_dependent_feat,
                      left_on=['user_id', 'product_id'],
                      right_index=True,
                      how='left')

        df = pd.merge(df,
                      user_aisle_feat,
                      left_on=['user_id', 'aisle_id'],
                      right_index=True,
                      how='left')
        df = pd.merge(df,
                      user_department_feat,
                      left_on=['user_id', 'department_id'],
                      right_index=True,
                      how='left')
        return df

    x_train_feat = merge_features(x_train)
    x_test_feat = merge_features(x_test)

    pickle_dump(x_train_feat,
                '{}/x_train_feat.pkl'.format(config.output_folder))
    pickle_dump(x_test_feat, '{}/x_test_feat.pkl'.format(config.output_folder))
    print('Done')
Example #20
def process_predict(file_folder, word_cut_func, is_en, file_name='output.csv'):
	checkOS()
	# isFirstTime = True
	glove_vectors, glove_embed_dim = load_glove_format('./raw_data/glove.42B.300d.txt')
	config = Config()
	print('preprocessing: ', file_folder)
	# nlp = spacy.load("en_core_web_sm")
	# nlp.tokenizer = Tokenizer(nlp.vocab)
	train_data = pd.read_csv(os.path.join(file_folder, file_name), header=0, index_col=None)
	train_data['content'] = train_data['content'].astype(str)
	train_data['aspect'] = train_data['aspect'].astype(str)
	if isUnix:
		train_data['word_list'] = train_data['content'].parallel_apply(word_cut_func)
		train_data['char_list'] = train_data['content'].parallel_apply(lambda x: list(x))
		train_data['aspect_word_list'] = train_data['aspect'].parallel_apply(word_cut_func)
		train_data['aspect_char_list'] = train_data['aspect'].parallel_apply(lambda x: list(x))
	else:
		train_data['word_list'] = train_data['content'].apply(word_cut_func)
		train_data['char_list'] = train_data['content'].apply(lambda x: list(x))
		train_data['aspect_word_list'] = train_data['aspect'].apply(word_cut_func)
		train_data['aspect_char_list'] = train_data['aspect'].apply(lambda x: list(x))



	print('size of training set:', len(train_data))

	word_corpus = train_data['word_list'].values.tolist()
	char_corpus = train_data['char_list'].values.tolist()
	aspect_corpus = train_data['aspect'].values.tolist()
	aspect_text_word_corpus = train_data['aspect_word_list'].values.tolist()
	aspect_text_char_corpus = train_data['aspect_char_list'].values.tolist()

	# build vocabulary
	print('building vocabulary...')
	word_vocab = build_vocabulary(word_corpus, start_id=1)
	char_vocab = build_vocabulary(char_corpus, start_id=1)
	aspect_vocab = build_vocabulary(aspect_corpus, start_id=0)
	aspect_text_word_vocab = build_vocabulary(aspect_text_word_corpus, start_id=1)
	aspect_text_char_vocab = build_vocabulary(aspect_text_char_corpus, start_id=1)
	pickle_dump(word_vocab, os.path.join(file_folder, 'word_vocab.pkl'))
	pickle_dump(char_vocab, os.path.join(file_folder, 'char_vocab.pkl'))
	pickle_dump(aspect_vocab, os.path.join(file_folder, 'aspect_vocab.pkl'))
	pickle_dump(aspect_text_word_vocab, os.path.join(file_folder, 'aspect_text_word_vocab.pkl'))
	pickle_dump(aspect_text_char_vocab, os.path.join(file_folder, 'aspect_text_char_vocab.pkl'))
	print('finished building vocabulary!')
	print('len of word vocabulary:', len(word_vocab))
	print('sample of word vocabulary:', list(word_vocab.items())[:10])
	print('len of char vocabulary:', len(char_vocab))
	print('sample of char vocabulary:', list(char_vocab.items())[:10])
	print('len of aspect vocabulary:', len(aspect_vocab))
	print('sample of aspect vocabulary:', list(aspect_vocab.items())[:10])
	print('len of aspect text word vocabulary:', len(aspect_text_word_vocab))
	print('sample of aspect text word vocabulary:', list(aspect_text_word_vocab.items())[:10])
	print('len of aspect text char vocabulary:', len(aspect_text_char_vocab))
	print('sample of aspect text char vocabulary:', list(aspect_text_char_vocab.items())[:10])

	# prepare embedding
	print('preparing embedding...')
	word_w2v = build_embedding(word_corpus, word_vocab, config.word_embed_dim)
	aspect_word_w2v = build_aspect_embedding(aspect_vocab, word_cut_func, word_vocab, word_w2v)
	aspect_text_word_w2v = build_aspect_text_embedding(aspect_text_word_vocab, word_vocab, word_w2v)
	char_w2v = build_embedding(char_corpus, char_vocab, config.word_embed_dim)
	aspect_char_w2v = build_aspect_embedding(aspect_vocab, lambda x: list(x), char_vocab, char_w2v)
	aspect_text_char_w2v = build_aspect_text_embedding(aspect_text_char_vocab, char_vocab, char_w2v)
	np.save(os.path.join(file_folder, 'word_w2v.npy'), word_w2v)
	np.save(os.path.join(file_folder, 'aspect_word_w2v.npy'), aspect_word_w2v)
	np.save(os.path.join(file_folder, 'aspect_text_word_w2v.npy'), aspect_text_word_w2v)
	np.save(os.path.join(file_folder, 'char_w2v.npy'), char_w2v)
	np.save(os.path.join(file_folder, 'aspect_char_w2v.npy'), aspect_char_w2v)
	np.save(os.path.join(file_folder, 'aspect_text_char_w2v.npy'), aspect_text_char_w2v)

	print('finished preparing embedding!')
	print('shape of word_w2v:', word_w2v.shape)
	print('sample of word_w2v:', word_w2v[:2, :5])
	print('shape of char_w2v:', char_w2v.shape)
	print('sample of char_w2v:', char_w2v[:2, :5])
	print('shape of aspect_word_w2v:', aspect_word_w2v.shape)
	print('sample of aspect_word_w2v:', aspect_word_w2v[:2, :5])
	print('shape of aspect_char_w2v:', aspect_char_w2v.shape)
	print('sample of aspect_char_w2v:', aspect_char_w2v[:2, :5])
	print('shape of aspect_text_word_w2v:', aspect_text_word_w2v.shape)
	print('sample of aspect_text_word_w2v:', aspect_text_word_w2v[:2, :5])
	print('shape of aspect_text_char_w2v:', aspect_text_char_w2v.shape)
	print('sample of aspect_text_char_w2v:', aspect_text_char_w2v[:2, :5])

	if is_en:
		word_glove = build_glove_embedding(word_vocab, glove_vectors, glove_embed_dim)
		aspect_word_glove = build_aspect_embedding(aspect_vocab, word_cut_func, word_vocab, word_glove)
		aspect_text_word_glove = build_aspect_text_embedding(aspect_text_word_vocab, word_vocab, word_glove)
		np.save(os.path.join(file_folder, 'word_glove.npy'), word_glove)
		np.save(os.path.join(file_folder, 'aspect_word_glove.npy'), aspect_word_glove)
		np.save(os.path.join(file_folder, 'aspect_text_word_glove.npy'), aspect_text_word_glove)
		print('shape of word_glove:', word_glove.shape)
		print('sample of word_glove:', word_glove[:2, :5])
		print('shape of aspect_word_glove:', aspect_word_glove.shape)
		print('sample of aspect_word_glove:', aspect_word_glove[:2, :5])
		print('shape of aspect_text_word_glove:', aspect_text_word_glove.shape)
		print('sample of aspect_text_word_glove:', aspect_text_word_glove[:2, :5])

	# prepare input
	print('preparing text input...')
	if isUnix:
		train_word_input = train_data['word_list'].parallel_apply(
		lambda x: [word_vocab.get(word, len(word_vocab)+1) for word in x]).values.tolist()
		train_char_input = train_data['char_list'].parallel_apply(
		lambda x: [char_vocab.get(char, len(char_vocab)+1) for char in x]).values.tolist()
	else:
		train_word_input = train_data['word_list'].apply(
		lambda x: [word_vocab.get(word, len(word_vocab)+1) for word in x]).values.tolist()
		train_char_input = train_data['char_list'].apply(
		lambda x: [char_vocab.get(char, len(char_vocab)+1) for char in x]).values.tolist()

	pickle_dump(train_word_input, os.path.join(file_folder, 'train_word_input.pkl'))
	pickle_dump(train_char_input, os.path.join(file_folder, 'train_char_input.pkl'))
	print('finished preparing text input!')


	print('preparing aspect input...')
	if isUnix:
		train_aspect_input = train_data['aspect'].parallel_apply(lambda x: [aspect_vocab[x]]).values.tolist()
	else:
		train_aspect_input = train_data['aspect'].apply(lambda x: [aspect_vocab[x]]).values.tolist()        
	pickle_dump(train_aspect_input, os.path.join(file_folder, 'train_aspect_input.pkl'))
	print('finished preparing aspect input!')

	print('preparing aspect text input...')
	if isUnix:
		train_aspect_text_word_input = train_data['aspect_word_list'].parallel_apply(
		lambda x: [aspect_text_word_vocab.get(word, len(aspect_text_word_vocab) + 1) for word in x]).values.tolist()
		train_aspect_text_char_input = train_data['aspect_char_list'].parallel_apply(
		lambda x: [aspect_text_char_vocab.get(char, len(aspect_text_char_vocab) + 1) for char in x]).values.tolist()
	else:
		train_aspect_text_word_input = train_data['aspect_word_list'].apply(
		lambda x: [aspect_text_word_vocab.get(word, len(aspect_text_word_vocab) + 1) for word in x]).values.tolist()
		train_aspect_text_char_input = train_data['aspect_char_list'].apply(
		lambda x: [aspect_text_char_vocab.get(char, len(aspect_text_char_vocab) + 1) for char in x]).values.tolist()

	pickle_dump(train_aspect_text_word_input, os.path.join(file_folder, 'train_word_aspect_input.pkl'))
	pickle_dump(train_aspect_text_char_input, os.path.join(file_folder, 'train_char_aspect_input.pkl'))
	print('finished preparing aspect text input!')

	if 'from' in train_data.columns:
		print('preparing left text input, right text input & position input...')
		train_word_input_l, train_word_input_r, train_word_input_r_with_pad, train_word_mask, train_word_pos_input, \
			train_word_offset_input, train_char_input_l, train_char_input_r, train_char_input_r_with_pad, \
			train_char_mask, train_char_pos_input, train_char_offset_input = split_text_and_get_loc_info(train_data,
																										 word_vocab,
																										 char_vocab,
																										 word_cut_func)
		pickle_dump(train_word_input_l, os.path.join(file_folder, 'train_word_input_l.pkl'))
		pickle_dump(train_word_input_r, os.path.join(file_folder, 'train_word_input_r.pkl'))
		pickle_dump(train_word_input_r_with_pad, os.path.join(file_folder, 'train_word_input_r_with_pad.pkl'))
		pickle_dump(train_word_mask, os.path.join(file_folder, 'train_word_mask.pkl'))
		pickle_dump(train_word_pos_input, os.path.join(file_folder, 'train_word_pos_input.pkl'))
		pickle_dump(train_word_offset_input, os.path.join(file_folder, 'train_word_offset_input.pkl'))
		pickle_dump(train_char_input_l, os.path.join(file_folder, 'train_char_input_l.pkl'))
		pickle_dump(train_char_input_r, os.path.join(file_folder, 'train_char_input_r.pkl'))
		pickle_dump(train_char_input_r_with_pad, os.path.join(file_folder, 'train_char_input_r_with_pad.pkl'))
		pickle_dump(train_char_mask, os.path.join(file_folder, 'train_char_mask.pkl'))
		pickle_dump(train_char_pos_input, os.path.join(file_folder, 'train_char_pos_input.pkl'))
		pickle_dump(train_char_offset_input, os.path.join(file_folder, 'train_char_offset_input.pkl'))


	# prepare output
	print('preparing output....')
	pickle_dump(train_data['sentiment'].values.tolist(), os.path.join(file_folder, 'train_label.pkl'))
	print('finished preparing output!')
Example #21
def pre_process(file_folder, word_cut_func, is_en):
	checkOS()
	print('preprocessing: ', file_folder)
	train_data = pd.read_csv(os.path.join(file_folder, 'train.csv'), header=0, index_col=None)
	train_data['content'] = train_data['content'].astype(str)
	train_data['aspect'] = train_data['aspect'].astype(str)
	print("checking for null obj",train_data['content'].isnull().sum())
	print("checking for null obj",train_data['aspect'].isnull().sum())

	if isUnix:
		train_data['word_list'] = train_data['content'].parallel_apply(word_cut_func)
		train_data['char_list'] = train_data['content'].parallel_apply(lambda x: list(x))
		train_data['aspect_word_list'] = train_data['aspect'].parallel_apply(word_cut_func)
		train_data['aspect_char_list'] = train_data['aspect'].parallel_apply(lambda x: list(x))
	else:
		train_data['word_list'] = train_data['content'].apply(word_cut_func)
		train_data['char_list'] = train_data['content'].apply(lambda x: list(x))
		train_data['aspect_word_list'] = train_data['aspect'].apply(word_cut_func)
		train_data['aspect_char_list'] = train_data['aspect'].apply(lambda x: list(x))



	valid_data = pd.read_csv(os.path.join(file_folder, 'valid.csv'), header=0, index_col=None)
	valid_data['content'] = valid_data['content'].astype(str)
	valid_data['aspect'] = valid_data['aspect'].astype(str)
	if isUnix:
		valid_data['word_list'] = valid_data['content'].parallel_apply(word_cut_func)
		valid_data['char_list'] = valid_data['content'].parallel_apply(lambda x: list(x))
		valid_data['aspect_word_list'] = valid_data['aspect'].parallel_apply(word_cut_func)
		valid_data['aspect_char_list'] = valid_data['aspect'].parallel_apply(lambda x: list(x))
	else:
		valid_data['word_list'] = valid_data['content'].apply(word_cut_func)
		valid_data['char_list'] = valid_data['content'].apply(lambda x: list(x))
		valid_data['aspect_word_list'] = valid_data['aspect'].apply(word_cut_func)
		valid_data['aspect_char_list'] = valid_data['aspect'].apply(lambda x: list(x))
	test_data = pd.read_csv(os.path.join(file_folder, 'test.csv'), header=0, index_col=None)
	test_data['content'] = test_data['content'].astype(str)
	test_data['aspect'] = test_data['aspect'].astype(str)

	if isUnix:
		test_data['word_list'] = test_data['content'].parallel_apply(word_cut_func)
		test_data['char_list'] = test_data['content'].parallel_apply(lambda x: list(x))
		test_data['aspect_word_list'] = test_data['aspect'].parallel_apply(word_cut_func)
		test_data['aspect_char_list'] = test_data['aspect'].parallel_apply(lambda x: list(x))
	else:
		test_data['word_list'] = test_data['content'].apply(word_cut_func)
		test_data['char_list'] = test_data['content'].apply(lambda x: list(x))
		test_data['aspect_word_list'] = test_data['aspect'].apply(word_cut_func)
		test_data['aspect_char_list'] = test_data['aspect'].apply(lambda x: list(x))


	print('size of training set:', len(train_data))
	print('size of valid set:', len(valid_data))
	print('size of test set:', len(test_data))

	word_corpus = np.concatenate((train_data['word_list'].values, valid_data['word_list'].values,
								  test_data['word_list'].values)).tolist()
	char_corpus = np.concatenate((train_data['char_list'].values, valid_data['char_list'].values,
								  test_data['char_list'].values)).tolist()
	aspect_corpus = np.concatenate((train_data['aspect'].values, valid_data['aspect'].values,
									test_data['aspect'].values)).tolist()
	aspect_text_word_corpus = np.concatenate((train_data['aspect_word_list'].values,
											  valid_data['aspect_word_list'].values,
											  test_data['aspect_word_list'].values)).tolist()
	aspect_text_char_corpus = np.concatenate((train_data['aspect_char_list'].values,
											  valid_data['aspect_char_list'].values,
											  test_data['aspect_char_list'].values)).tolist()

	# build vocabulary
	print('building vocabulary...')
	word_vocab = build_vocabulary(word_corpus, start_id=1)
	char_vocab = build_vocabulary(char_corpus, start_id=1)
	aspect_vocab = build_vocabulary(aspect_corpus, start_id=0)
	aspect_text_word_vocab = build_vocabulary(aspect_text_word_corpus, start_id=1)
	aspect_text_char_vocab = build_vocabulary(aspect_text_char_corpus, start_id=1)
	pickle_dump(word_vocab, os.path.join(file_folder, 'word_vocab.pkl'))
	pickle_dump(char_vocab, os.path.join(file_folder, 'char_vocab.pkl'))
	pickle_dump(aspect_vocab, os.path.join(file_folder, 'aspect_vocab.pkl'))
	pickle_dump(aspect_text_word_vocab, os.path.join(file_folder, 'aspect_text_word_vocab.pkl'))
	pickle_dump(aspect_text_char_vocab, os.path.join(file_folder, 'aspect_text_char_vocab.pkl'))
	print('finished building vocabulary!')
	print('len of word vocabulary:', len(word_vocab))
	print('sample of word vocabulary:', list(word_vocab.items())[:10])
	print('len of char vocabulary:', len(char_vocab))
	print('sample of char vocabulary:', list(char_vocab.items())[:10])
	print('len of aspect vocabulary:', len(aspect_vocab))
	print('sample of aspect vocabulary:', list(aspect_vocab.items())[:10])
	print('len of aspect text word vocabulary:', len(aspect_text_word_vocab))
	print('sample of aspect text word vocabulary:', list(aspect_text_word_vocab.items())[:10])
	print('len of aspect text char vocabulary:', len(aspect_text_char_vocab))
	print('sample of aspect text char vocabulary:', list(aspect_text_char_vocab.items())[:10])

	# prepare embedding
	print('preparing embedding...')
	word_w2v = build_embedding(word_corpus, word_vocab, config.word_embed_dim)
	aspect_word_w2v = build_aspect_embedding(aspect_vocab, word_cut_func, word_vocab, word_w2v)
	aspect_text_word_w2v = build_aspect_text_embedding(aspect_text_word_vocab, word_vocab, word_w2v)
	char_w2v = build_embedding(char_corpus, char_vocab, config.word_embed_dim)
	aspect_char_w2v = build_aspect_embedding(aspect_vocab, lambda x: list(x), char_vocab, char_w2v)
	aspect_text_char_w2v = build_aspect_text_embedding(aspect_text_char_vocab, char_vocab, char_w2v)
	np.save(os.path.join(file_folder, 'word_w2v.npy'), word_w2v)
	np.save(os.path.join(file_folder, 'aspect_word_w2v.npy'), aspect_word_w2v)
	np.save(os.path.join(file_folder, 'aspect_text_word_w2v.npy'), aspect_text_word_w2v)
	np.save(os.path.join(file_folder, 'char_w2v.npy'), char_w2v)
	np.save(os.path.join(file_folder, 'aspect_char_w2v.npy'), aspect_char_w2v)
	np.save(os.path.join(file_folder, 'aspect_text_char_w2v.npy'), aspect_text_char_w2v)

	print('finished preparing embedding!')
	print('shape of word_w2v:', word_w2v.shape)
	print('sample of word_w2v:', word_w2v[:2, :5])
	print('shape of char_w2v:', char_w2v.shape)
	print('sample of char_w2v:', char_w2v[:2, :5])
	print('shape of aspect_word_w2v:', aspect_word_w2v.shape)
	print('sample of aspect_word_w2v:', aspect_word_w2v[:2, :5])
	print('shape of aspect_char_w2v:', aspect_char_w2v.shape)
	print('sample of aspect_char_w2v:', aspect_char_w2v[:2, :5])
	print('shape of aspect_text_word_w2v:', aspect_text_word_w2v.shape)
	print('sample of aspect_text_word_w2v:', aspect_text_word_w2v[:2, :5])
	print('shape of aspect_text_char_w2v:', aspect_text_char_w2v.shape)
	print('sample of aspect_text_char_w2v:', aspect_text_char_w2v[:2, :5])

	if is_en:
		word_glove = build_glove_embedding(word_vocab, glove_vectors, glove_embed_dim)
		aspect_word_glove = build_aspect_embedding(aspect_vocab, word_cut_func, word_vocab, word_glove)
		aspect_text_word_glove = build_aspect_text_embedding(aspect_text_word_vocab, word_vocab, word_glove)
		np.save(os.path.join(file_folder, 'word_glove.npy'), word_glove)
		np.save(os.path.join(file_folder, 'aspect_word_glove.npy'), aspect_word_glove)
		np.save(os.path.join(file_folder, 'aspect_text_word_glove.npy'), aspect_text_word_glove)
		print('shape of word_glove:', word_glove.shape)
		print('sample of word_glove:', word_glove[:2, :5])
		print('shape of aspect_word_glove:', aspect_word_glove.shape)
		print('sample of aspect_word_glove:', aspect_word_glove[:2, :5])
		print('shape of aspect_text_word_glove:', aspect_text_word_glove.shape)
		print('sample of aspect_text_word_glove:', aspect_text_word_glove[:2, :5])

	# prepare input
	print('preparing text input...')
	if isUnix:
		train_word_input = train_data['word_list'].parallel_apply(
		lambda x: [word_vocab.get(word, len(word_vocab)+1) for word in x]).values.tolist()
		train_char_input = train_data['char_list'].parallel_apply(
		lambda x: [char_vocab.get(char, len(char_vocab)+1) for char in x]).values.tolist()
		valid_word_input = valid_data['word_list'].parallel_apply(
		lambda x: [word_vocab.get(word, len(word_vocab)+1) for word in x]).values.tolist()
		valid_char_input = valid_data['char_list'].parallel_apply(
		lambda x: [char_vocab.get(char, len(char_vocab)+1) for char in x]).values.tolist()
		test_word_input = test_data['word_list'].parallel_apply(
		lambda x: [word_vocab.get(word, len(word_vocab)+1) for word in x]).values.tolist()
		test_char_input = test_data['char_list'].parallel_apply(
		lambda x: [char_vocab.get(char, len(char_vocab)+1) for char in x]).values.tolist()
	else:
		train_word_input = train_data['word_list'].apply(
		lambda x: [word_vocab.get(word, len(word_vocab)+1) for word in x]).values.tolist()
		train_char_input = train_data['char_list'].apply(
		lambda x: [char_vocab.get(char, len(char_vocab)+1) for char in x]).values.tolist()
		valid_word_input = valid_data['word_list'].apply(
		lambda x: [word_vocab.get(word, len(word_vocab)+1) for word in x]).values.tolist()
		valid_char_input = valid_data['char_list'].apply(
		lambda x: [char_vocab.get(char, len(char_vocab)+1) for char in x]).values.tolist()
		test_word_input = test_data['word_list'].apply(
		lambda x: [word_vocab.get(word, len(word_vocab)+1) for word in x]).values.tolist()
		test_char_input = test_data['char_list'].apply(
		lambda x: [char_vocab.get(char, len(char_vocab)+1) for char in x]).values.tolist()

	pickle_dump(train_word_input, os.path.join(file_folder, 'train_word_input.pkl'))
	pickle_dump(train_char_input, os.path.join(file_folder, 'train_char_input.pkl'))
	pickle_dump(valid_word_input, os.path.join(file_folder, 'valid_word_input.pkl'))
	pickle_dump(valid_char_input, os.path.join(file_folder, 'valid_char_input.pkl'))
	pickle_dump(test_word_input, os.path.join(file_folder, 'test_word_input.pkl'))
	pickle_dump(test_char_input, os.path.join(file_folder, 'test_char_input.pkl'))
	print('finished preparing text input!')
	print('length analysis of text word input:')
	analyze_len_distribution(train_word_input, valid_word_input, test_word_input)
	print('length analysis of text char input')
	analyze_len_distribution(train_char_input, valid_char_input, test_char_input)

	print('preparing aspect input...')
	if isUnix:
		train_aspect_input = train_data['aspect'].parallel_apply(lambda x: [aspect_vocab[x]]).values.tolist()
		valid_aspect_input = valid_data['aspect'].parallel_apply(lambda x: [aspect_vocab[x]]).values.tolist()
		test_aspect_input = test_data['aspect'].parallel_apply(lambda x: [aspect_vocab[x]]).values.tolist()
	else:
		train_aspect_input = train_data['aspect'].apply(lambda x: [aspect_vocab[x]]).values.tolist()
		valid_aspect_input = valid_data['aspect'].apply(lambda x: [aspect_vocab[x]]).values.tolist()
		test_aspect_input = test_data['aspect'].apply(lambda x: [aspect_vocab[x]]).values.tolist()        
	pickle_dump(train_aspect_input, os.path.join(file_folder, 'train_aspect_input.pkl'))
	pickle_dump(valid_aspect_input, os.path.join(file_folder, 'valid_aspect_input.pkl'))
	pickle_dump(test_aspect_input, os.path.join(file_folder, 'test_aspect_input.pkl'))
	print('finished preparing aspect input!')

	print('preparing aspect text input...')
	if isUnix:      
		train_aspect_text_word_input = train_data['aspect_word_list'].parallel_apply(
			lambda x: [aspect_text_word_vocab.get(word, len(aspect_text_word_vocab) + 1) for word in x]).values.tolist()
		train_aspect_text_char_input = train_data['aspect_char_list'].parallel_apply(
			lambda x: [aspect_text_char_vocab.get(char, len(aspect_text_char_vocab) + 1) for char in x]).values.tolist()
		valid_aspect_text_word_input = valid_data['aspect_word_list'].parallel_apply(
			lambda x: [aspect_text_word_vocab.get(word, len(aspect_text_word_vocab) + 1) for word in x]).values.tolist()
		valid_aspect_text_char_input = valid_data['aspect_char_list'].parallel_apply(
			lambda x: [aspect_text_char_vocab.get(char, len(aspect_text_char_vocab) + 1) for char in x]).values.tolist()
		test_aspect_text_word_input = test_data['aspect_word_list'].parallel_apply(
			lambda x: [aspect_text_word_vocab.get(word, len(aspect_text_word_vocab) + 1) for word in x]).values.tolist()
		test_aspect_text_char_input = test_data['aspect_char_list'].parallel_apply(
			lambda x: [aspect_text_char_vocab.get(char, len(aspect_text_char_vocab) + 1) for char in x]).values.tolist()
	else:
		train_aspect_text_word_input = train_data['aspect_word_list'].apply(
		lambda x: [aspect_text_word_vocab.get(word, len(aspect_text_word_vocab) + 1) for word in x]).values.tolist()
		train_aspect_text_char_input = train_data['aspect_char_list'].apply(
		lambda x: [aspect_text_char_vocab.get(char, len(aspect_text_char_vocab) + 1) for char in x]).values.tolist()
		valid_aspect_text_word_input = valid_data['aspect_word_list'].apply(
		lambda x: [aspect_text_word_vocab.get(word, len(aspect_text_word_vocab) + 1) for word in x]).values.tolist()
		valid_aspect_text_char_input = valid_data['aspect_char_list'].apply(
		lambda x: [aspect_text_char_vocab.get(char, len(aspect_text_char_vocab) + 1) for char in x]).values.tolist()
		test_aspect_text_word_input = test_data['aspect_word_list'].apply(
		lambda x: [aspect_text_word_vocab.get(word, len(aspect_text_word_vocab) + 1) for word in x]).values.tolist()
		test_aspect_text_char_input = test_data['aspect_char_list'].apply(
		lambda x: [aspect_text_char_vocab.get(char, len(aspect_text_char_vocab) + 1) for char in x]).values.tolist()

	pickle_dump(train_aspect_text_word_input, os.path.join(file_folder, 'train_word_aspect_input.pkl'))
	pickle_dump(train_aspect_text_char_input, os.path.join(file_folder, 'train_char_aspect_input.pkl'))
	pickle_dump(valid_aspect_text_word_input, os.path.join(file_folder, 'valid_word_aspect_input.pkl'))
	pickle_dump(valid_aspect_text_char_input, os.path.join(file_folder, 'valid_char_aspect_input.pkl'))
	pickle_dump(test_aspect_text_word_input, os.path.join(file_folder, 'test_word_aspect_input.pkl'))
	pickle_dump(test_aspect_text_char_input, os.path.join(file_folder, 'test_char_aspect_input.pkl'))
	print('finished preparing aspect text input!')
	print('length analysis of aspect text word input:')
	analyze_len_distribution(train_aspect_text_word_input, valid_aspect_text_word_input, test_aspect_text_word_input)
	print('length analysis of aspect text char input')
	analyze_len_distribution(train_aspect_text_char_input, valid_aspect_text_char_input, test_aspect_text_char_input)

	if 'from' in train_data.columns:
		print('preparing left text input, right text input & position input...')
		train_word_input_l, train_word_input_r, train_word_input_r_with_pad, train_word_mask, train_word_pos_input, \
			train_word_offset_input, train_char_input_l, train_char_input_r, train_char_input_r_with_pad, \
			train_char_mask, train_char_pos_input, train_char_offset_input = split_text_and_get_loc_info(train_data,
																										 word_vocab,
																										 char_vocab,
																										 word_cut_func)
		pickle_dump(train_word_input_l, os.path.join(file_folder, 'train_word_input_l.pkl'))
		pickle_dump(train_word_input_r, os.path.join(file_folder, 'train_word_input_r.pkl'))
		pickle_dump(train_word_input_r_with_pad, os.path.join(file_folder, 'train_word_input_r_with_pad.pkl'))
		pickle_dump(train_word_mask, os.path.join(file_folder, 'train_word_mask.pkl'))
		pickle_dump(train_word_pos_input, os.path.join(file_folder, 'train_word_pos_input.pkl'))
		pickle_dump(train_word_offset_input, os.path.join(file_folder, 'train_word_offset_input.pkl'))
		pickle_dump(train_char_input_l, os.path.join(file_folder, 'train_char_input_l.pkl'))
		pickle_dump(train_char_input_r, os.path.join(file_folder, 'train_char_input_r.pkl'))
		pickle_dump(train_char_input_r_with_pad, os.path.join(file_folder, 'train_char_input_r_with_pad.pkl'))
		pickle_dump(train_char_mask, os.path.join(file_folder, 'train_char_mask.pkl'))
		pickle_dump(train_char_pos_input, os.path.join(file_folder, 'train_char_pos_input.pkl'))
		pickle_dump(train_char_offset_input, os.path.join(file_folder, 'train_char_offset_input.pkl'))

		valid_word_input_l, valid_word_input_r, valid_word_input_r_with_pad, valid_word_mask, valid_word_pos_input, \
			valid_word_offset_input, valid_char_input_l, valid_char_input_r, valid_char_input_r_with_pad, \
			valid_char_mask, valid_char_pos_input, valid_char_offset_input = split_text_and_get_loc_info(valid_data,
																										 word_vocab,
																										 char_vocab,
																										 word_cut_func)
		pickle_dump(valid_word_input_l, os.path.join(file_folder, 'valid_word_input_l.pkl'))
		pickle_dump(valid_word_input_r, os.path.join(file_folder, 'valid_word_input_r.pkl'))
		pickle_dump(valid_word_input_r_with_pad, os.path.join(file_folder, 'valid_word_input_r_with_pad.pkl'))
		pickle_dump(valid_word_mask, os.path.join(file_folder, 'valid_word_mask.pkl'))
		pickle_dump(valid_word_pos_input, os.path.join(file_folder, 'valid_word_pos_input.pkl'))
		pickle_dump(valid_word_offset_input, os.path.join(file_folder, 'valid_word_offset_input.pkl'))
		pickle_dump(valid_char_input_l, os.path.join(file_folder, 'valid_char_input_l.pkl'))
		pickle_dump(valid_char_input_r, os.path.join(file_folder, 'valid_char_input_r.pkl'))
		pickle_dump(valid_char_input_r_with_pad, os.path.join(file_folder, 'valid_char_input_r_with_pad.pkl'))
		pickle_dump(valid_char_mask, os.path.join(file_folder, 'valid_char_mask.pkl'))
		pickle_dump(valid_char_pos_input, os.path.join(file_folder, 'valid_char_pos_input.pkl'))
		pickle_dump(valid_char_offset_input, os.path.join(file_folder, 'valid_char_offset_input.pkl'))

		test_word_input_l, test_word_input_r, test_word_input_r_with_pad, test_word_mask, test_word_pos_input, \
			test_word_offset_input, test_char_input_l, test_char_input_r, test_char_input_r_with_pad, test_char_mask, \
			test_char_pos_input, test_char_offset_input = split_text_and_get_loc_info(test_data, word_vocab,
																					  char_vocab, word_cut_func)
		pickle_dump(test_word_input_l, os.path.join(file_folder, 'test_word_input_l.pkl'))
		pickle_dump(test_word_input_r, os.path.join(file_folder, 'test_word_input_r.pkl'))
		pickle_dump(test_word_input_r_with_pad, os.path.join(file_folder, 'test_word_input_r_with_pad.pkl'))
		pickle_dump(test_word_mask, os.path.join(file_folder, 'test_word_mask.pkl'))
		pickle_dump(test_word_pos_input, os.path.join(file_folder, 'test_word_pos_input.pkl'))
		pickle_dump(test_word_offset_input, os.path.join(file_folder, 'test_word_offset_input.pkl'))
		print("Test Word Output")
		pickle_dump(test_char_input_l, os.path.join(file_folder, 'test_char_input_l.pkl'))
		pickle_dump(test_char_input_r, os.path.join(file_folder, 'test_char_input_r.pkl'))
		pickle_dump(test_char_input_r_with_pad, os.path.join(file_folder, 'test_char_input_r_with_pad.pkl'))
		pickle_dump(test_char_mask, os.path.join(file_folder, 'test_char_mask.pkl'))
		pickle_dump(test_char_pos_input, os.path.join(file_folder, 'test_char_pos_input.pkl'))
		pickle_dump(test_char_offset_input, os.path.join(file_folder, 'test_char_offset_input.pkl'))

		print('length analysis of left text word input:')
		analyze_len_distribution(train_word_input_l, valid_word_input_l, test_word_input_l)
		print('length analysis of left text char input')
		analyze_len_distribution(train_char_input_l, valid_char_input_l, test_char_input_l)
		print('length analysis of right text word input:')
		analyze_len_distribution(train_word_input_r, valid_word_input_r, test_word_input_r)
		print('length analysis of right text char input')
		analyze_len_distribution(train_char_input_r, valid_char_input_r, test_char_input_r)

	# prepare output
	print('preparing output....')
	pickle_dump(train_data['sentiment'].values.tolist(), os.path.join(file_folder, 'train_label.pkl'))
	pickle_dump(valid_data['sentiment'].values.tolist(), os.path.join(file_folder, 'valid_label.pkl'))
	if 'sentiment' in test_data.columns:
		pickle_dump(test_data['sentiment'].values.tolist(), os.path.join(file_folder, 'test_label.pkl'))
	print('finished preparing output!')
	print('class analysis of training set:')
	analyze_class_distribution(train_data['sentiment'].values.tolist())
	print('class analysis of valid set:')
	analyze_class_distribution(valid_data['sentiment'].values.tolist())
	if 'sentiment' in test_data.columns:
		print('class analysis of test set:')
		analyze_class_distribution(test_data['sentiment'].values.tolist())
Example #22
    orders = pickle_load(config.orders_path)

    train_orders = orders[orders.eval_set == 'train'][['order_id',
                                                       'user_id']].copy()
    test_orders = orders[orders.eval_set == 'test'][['order_id',
                                                     'user_id']].copy()

    user_product_pair = order_products_prior[['user_id',
                                              'product_id']].drop_duplicates()

    train_df = pd.merge(train_orders, user_product_pair, on='user_id')
    test_df = pd.merge(test_orders, user_product_pair, on='user_id')

    order_products_train = order_products_train[[
        'order_id', 'product_id', 'reordered'
    ]]
    train_df = pd.merge(train_df,
                        order_products_train,
                        on=['order_id', 'product_id'],
                        how='left')
    train_df['reordered'] = train_df['reordered'].fillna(0).astype(int)

    x_train = train_df[['order_id', 'user_id', 'product_id']]
    y_train = train_df['reordered']

    x_test = test_df

    pickle_dump(x_train, '{}/x_train.pkl'.format(config.output_folder))
    pickle_dump(y_train, '{}/y_train.pkl'.format(config.output_folder))
    pickle_dump(x_test, '{}/x_test.pkl'.format(config.output_folder))
    print('Done - dataset construction')
Example #23
import sys
import numpy as np
import pandas as pd

sys.path.append('../')
from param_config import config
from utils import pickle_load, pickle_dump

if __name__ == '__main__':
    print('Generating user_department features...')
    order_products_prior = pickle_load(config.order_products_prior_path)
    products = pickle_load(config.products_path)

    order_products_prior = pd.merge(order_products_prior, products, on='product_id', how='left')

    ud_feat = pd.DataFrame()
    ud_feat['ud_first_order'] = order_products_prior.groupby(["user_id", "department_id"])['order_number_before_last_order'].max()
    ud_feat['ud_last_order'] = order_products_prior.groupby(["user_id", "department_id"])['order_number_before_last_order'].min()
    ud_feat['ud_distinct_order_num'] = order_products_prior.groupby(['user_id', 'department_id']).order_id.nunique()
    ud_feat['ud_distinct_product_num'] = order_products_prior.groupby(['user_id', 'department_id'])['product_id'].nunique()

    feats = ['ud_first_order', 'ud_last_order', 'ud_distinct_order_num', 'ud_distinct_product_num']
    pickle_dump(ud_feat[feats], '{}/user_department_feat.pkl'.format(config.feat_folder))
    print('Done - user_department features')
Example #24
    order_feat['order_days_since_prior_order_diff'] = \
            (orders[orders.eval_set != 'prior'].set_index('user_id')['order_days_since_prior_order'] - \
            orders[orders.eval_set == 'prior'].groupby('user_id')['order_days_since_prior_order'].mean()).values

    recent_orders = orders.groupby('user_id').tail(2)
    df = pd.DataFrame()
    df['order_delta_day_diff'] = np.abs(recent_orders.groupby('user_id')['order_dow'].apply(np.diff).apply(
            lambda x:x[0])).map(lambda x: min(x, 7-x))
    df['order_delta_hour_diff'] = np.abs(recent_orders.groupby('user_id')['order_hour_of_day'].apply(np.diff).apply(
            lambda x:x[0])).map(lambda x: min(x, 24-x))
    order_feat = pd.merge(order_feat, df, left_on='user_id', right_index=True)

    # Fill NaN: 'order_days_since_prior_order_ratio' is NaN only when both the numerator and the denominator are zero, so fill with 1
    order_feat['order_days_since_prior_order_ratio'].fillna(1, inplace=True)

    # Generate the feature based on order_number
    order_products_prior = pickle_load(config.order_products_prior_path)
    order_number_reorder_ratio = order_products_prior.groupby('order_number')['reordered'].mean().to_frame()
    order_number_reorder_ratio.columns = ['order_number_reorder_ratio']
    order_feat = pd.merge(order_feat, order_number_reorder_ratio, left_on='order_number', right_index=True, how='left')
    order_feat.set_index('order_id', inplace=True)


    feats = ['order_dow', 'order_hour_of_day', 'order_days_since_prior_order', 'order_weekend', 'order_hour_of_day_bin_id',
             'order_days_since_prior_order_ratio', 'order_days_since_prior_order_diff', 'order_number_reorder_ratio',
             'order_delta_day_diff', 'order_delta_hour_diff']

    pickle_dump(order_feat[feats], '{}/order_feat.pkl'.format(config.feat_folder))
    print('Done - order features')
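The order_delta_day_diff and order_delta_hour_diff features wrap the raw difference around the week and the day, so a Saturday-to-Sunday gap counts as 1 rather than 6. A standalone check of the min(x, period - x) trick used above:

def circular_diff(a, b, period):
    # absolute difference on a circle of the given period (7 for day-of-week, 24 for hour-of-day)
    d = abs(a - b)
    return min(d, period - d)


assert circular_diff(6, 0, 7) == 1    # Saturday vs Sunday
assert circular_diff(23, 1, 24) == 2  # 23:00 vs 01:00
print('circular distance checks passed')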
    up_feat['up_first_order_proportion'] = up_feat['up_first_order'] / up_feat[
        'user_order_num']
    up_feat['up_last_order_proportion'] = up_feat['up_last_order'] / up_feat[
        'user_order_num']
    up_feat['up_average_order_proportion'] = up_feat[
        'up_average_order'] / up_feat['user_order_num']
    up_feat['up_last_order_proportion_ratio'] = up_feat[
        'up_last_order_proportion'] / up_feat['product_average_order_distance']

    # features based on order_days_before_last_order and other features
    up_feat['up_first_order_days_proportion'] = up_feat[
        'up_first_order_days'] / up_feat['user_order_days']
    up_feat['up_last_order_days_proportion'] = up_feat[
        'up_last_order_days'] / up_feat['user_order_days']
    up_feat['up_average_order_days_proportion'] = up_feat[
        'up_average_order_days'] / up_feat['user_order_days']
    up_feat['up_last_order_days_proportion_ratio'] = \
            up_feat['up_last_order_days_proportion'] / up_feat['up_average_order_days_distance']

    feats = [
        'up_order_num_ratio', 'up_order_num_proportion',
        'up_average_add_to_cart_order_ratio', 'up_first_order_proportion',
        'up_last_order_proportion', 'up_average_order_proportion',
        'up_last_order_proportion_ratio', 'up_first_order_days_proportion',
        'up_last_order_days_proportion', 'up_average_order_days_proportion',
        'up_last_order_days_proportion_ratio'
    ]
    pickle_dump(
        up_feat[feats],
        '{}/user_product_dependent_feat.pkl'.format(config.feat_folder))
    print('Done - user_product features based on other features')
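The per-group feature frames dumped in these snippets (order_feat, product_feat, user_department_feat, user_product_dependent_feat, ...) presumably get merged into one design matrix before model training. A sketch of such an assembly step; the file names come from the dumps above, but the merge keys and index layouts are assumptions:

import pickle

import pandas as pd


def load_pkl(path):
    with open(path, 'rb') as f:
        return pickle.load(f)


# candidate rows: one per (order_id, user_id, product_id)
x_train = load_pkl('x_train.pkl')

# merge each feature block on its natural key (keys assumed from the groupby used to build it)
x_train = x_train.merge(load_pkl('order_feat.pkl'),
                        left_on='order_id', right_index=True, how='left')
x_train = x_train.merge(load_pkl('product_feat.pkl'),
                        left_on='product_id', right_index=True, how='left')
x_train = x_train.merge(load_pkl('user_product_dependent_feat.pkl'),
                        left_on=['user_id', 'product_id'], right_index=True, how='left')
print(x_train.shape)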
Exemple #26
0
def transaction_stats(directory = '.'):
    stats = {'transaction_count': {}, 'transaction_query_count': {}, 'transaction_read_count': {}, 'transaction_write_count': {}}

    transactions = []

    for repo in Repository.objects.exclude(latest_successful_attempt = None):
        if filter_repository(repo):
            continue

        project_type_name = repo.project_type.name
        if project_type_name not in stats['transaction_count']:
            stats['transaction_count'][project_type_name] = []
        if project_type_name not in stats['transaction_query_count']:
            stats['transaction_query_count'][project_type_name] = []
        if project_type_name not in stats['transaction_read_count']:
            stats['transaction_read_count'][project_type_name] = []
        if project_type_name not in stats['transaction_write_count']:
            stats['transaction_write_count'][project_type_name] = []
        

        for action in Action.objects.filter(attempt = repo.latest_successful_attempt):
            transaction = ''
            query_count = 0
            transaction_count = 0

            for query in Query.objects.filter(action = action):
                if 'BEGIN' in query.content.upper() or 'START TRANSACTION' in query.content.upper() or 'SET AUTOCOMMIT=0' in query.content.upper():
                    transaction = query.content + '\n'
                    query_count = 1
                elif transaction != '':
                    transaction += query.content + '\n'
                    query_count += 1
                    if 'COMMIT' in query.content.upper():
                        transaction = transaction.strip('\n')
                    
                        # count this completed transaction for the current action
                        transaction_count += 1

                        # for each transaction, count the number of read/write
                        read_count = len(re.findall('SELECT', transaction.upper()))
                        stats['transaction_read_count'][project_type_name].append(read_count)
                        write_count = 0
                        for keyword in ['INSERT', 'DELETE', 'UPDATE']:
                            write_count += len(re.findall(keyword, transaction.upper()))
                        stats['transaction_write_count'][project_type_name].append(write_count)
                        
                        # count the queries inside the transaction, excluding the BEGIN and COMMIT statements
                        query_count -= 2
                        stats['transaction_query_count'][project_type_name].append(query_count)

                        try:
                            transactions.append((repo.name, repo.project_type.name, transaction))
                        except Exception:
                            # skip transactions that cannot be recorded (e.g. encoding issues)
                            pass

                        transaction = ''

            if transaction_count > 0:
                stats['transaction_count'][project_type_name].append(transaction_count)

    pickle_dump(directory, 'transactions', transactions)

    dump_all_stats(directory, stats)
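The inner loop above buffers statements between a BEGIN-like marker and COMMIT, then counts reads and writes inside each buffered transaction. The same idea, stripped of the Django ORM so it can run on a plain list of SQL strings:

import re


def split_transactions(queries):
    # group consecutive statements between a BEGIN-like marker and COMMIT
    transactions, current = [], None
    for q in queries:
        upper = q.upper()
        if 'BEGIN' in upper or 'START TRANSACTION' in upper or 'SET AUTOCOMMIT=0' in upper:
            current = [q]
        elif current is not None:
            current.append(q)
            if 'COMMIT' in upper:
                transactions.append('\n'.join(current))
                current = None
    return transactions


queries = ['BEGIN', 'SELECT * FROM t', 'UPDATE t SET a = 1', 'COMMIT']
for txn in split_transactions(queries):
    reads = len(re.findall('SELECT', txn.upper()))
    writes = sum(len(re.findall(kw, txn.upper())) for kw in ['INSERT', 'DELETE', 'UPDATE'])
    print(reads, 'reads,', writes, 'writes,', len(txn.splitlines()) - 2, 'inner queries')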
def run_main(height_preset, ds, qu, neigh, opt):

    if height_preset == 1:
        # earlier sweeps used [2, 4, ..., 32768], 1 << 16 and [16, 256];
        # only the final assignment took effect, so keep the effective value
        n_clusters_l = [16]
    elif height_preset == 2:
        # earlier sweeps used [2, 4, ..., 1024] and [16, 256]
        n_clusters_l = [256]
    elif height_preset == 3:
        # earlier sweeps used [2, 4, ..., 64]
        n_clusters_l = [2]
    elif height_preset in range(11):
        n_clusters_l = [2]
    else:
        raise Exception('No n_clusters for height {}'.format(height_preset))

    print('HEIGHT: {} n_clusters: {}'.format(height_preset, n_clusters_l))

    #if height_preset != 1 and opt.itq:
    #        raise Exception('Height must be 1 if using ITQ')

    force_height = True

    k = opt.k
    n_repeat = 1  # overrides opt.n_repeat_km: a single k-means repeat per setting
    neigh = neigh[:, 0:k]
    ht2cutsz = defaultdict(list)

    #acc_mx = np.zeros((len(n_clusters_l), len(n_bins_l)))
    #probe_mx = np.zeros((len(n_clusters_l), len(n_bins_l)))
    n_clusters_l_len = len(n_clusters_l)
    acc_mx = [[] for i in range(n_clusters_l_len)]
    probe_mx = [[] for i in range(n_clusters_l_len)]
    probe95_mx = [[] for i in range(n_clusters_l_len)]
    max_bin_count = 0
    start_time = time.time()
    serial_data = {}
    serial_data['k'] = k

    if opt.pca or opt.rp or opt.itq or opt.st:
        #only 1-bin probe makes sense in these settings
        opt.max_bin_count = 1
    for i, n_clusters in enumerate(n_clusters_l):
        if force_height:
            height = height_preset
            serial_data['height'] = height
        else:
            height = math.floor(math.log(len(ds), n_clusters))
        bin_count = 1  # start probing with a single bin and expand (a leftover debug value of 40 stood here)

        acc = 0
        probe = 0
        #if opt.itq or opt.pca or opt.rp:
        #        #only 1-bin probe makes sense in these settings
        #        opt.max_bin_count = 1

        #keep expanding number of bins until acc reaches e.g. 0.97
        while acc < opt.acc_thresh and bin_count <= min(
                n_clusters, opt.max_bin_count):
            acc = 0
            probe = 0
            probe95 = 0
            for l in range(n_repeat):
                cur_acc, cur_probe, cur_probe95 = run_kmeans(
                    ds, qu, neigh, bin_count, n_clusters, height, ht2cutsz,
                    opt)
                acc += cur_acc
                probe += cur_probe
                probe95 += cur_probe95
            acc /= n_repeat
            probe /= n_repeat
            probe95 /= n_repeat

            bin_count += 1
            acc_mx[i].append(acc)
            probe_mx[i].append(probe)
            probe95_mx[i].append(probe95)

        max_bin_count = max(max_bin_count, bin_count - 1)
    end_time = time.time()
    serial_data['acc_mx'] = acc_mx
    serial_data['probe_mx'] = probe_mx
    serial_data['max_loyd'] = max_loyd
    serial_data['km_method'] = km_method
    serial_data['ht2cutsz'] = ht2cutsz

    print_output = True

    if print_output:
        print('total computation time: {} hrs'.format(
            (end_time - start_time) / 3600))
        print('acc {}'.format(acc_mx))
        print('probe count {}'.format(probe_mx))
        print('ht2cutsz {}'.format(ht2cutsz))

    row_label = ['{} clusters'.format(i) for i in n_clusters_l]

    col_label = ['{} bins'.format(i + 1) for i in range(max_bin_count)]
    acc_mx0 = acc_mx
    probe_mx0 = probe_mx
    probe95_mx0 = probe95_mx
    acc_mx = np.zeros((n_clusters_l_len, max_bin_count))
    probe_mx = np.zeros((n_clusters_l_len, max_bin_count))
    probe95_mx = np.zeros((n_clusters_l_len, max_bin_count))

    for i in range(len(n_clusters_l)):
        for j in range(len(acc_mx0[i])):
            acc_mx[i][j] = acc_mx0[i][j]
            probe_mx[i][j] = probe_mx0[i][j]
            probe95_mx[i][j] = probe95_mx0[i][j]
    #[acc_mx[i][j] = acc_mx0[i][j] for j in range(len(acc_mx0[i])) for i in range(len(n_clusters_l))]
    #[probe_mx[i][j] = probe_mx0[i][j] for j in range(len(probe_mx0[i])) for i in range(len(n_clusters_l))]

    acc_md = utils.mxs2md(
        [np.around(acc_mx, 3),
         np.rint(probe_mx),
         np.rint(probe95_mx)], row_label, col_label)

    cur_method = 'k-means'
    if opt.pca:
        cur_method = 'PCA Tree'
    elif opt.st:
        cur_method = 'ST'
    elif opt.itq:
        cur_method = 'ITQ'
    elif opt.rp:
        cur_method = 'Random Projection'
    elif opt.cplsh:
        cur_method = 'Cross Polytope LSH'

    if opt.write_res:  #False
        if opt.glove:
            res_path = os.path.join('results', 'linear2_glove.md')
        elif opt.glove_c:
            res_path = os.path.join('results', 'linear2_glove_c.md')
        elif opt.sift:
            res_path = os.path.join('results', 'linear2_sift.md')
        elif opt.sift_c:
            res_path = os.path.join('results', 'linear2_sift_c.md')
        else:
            res_path = os.path.join('results', 'linear2_mnist.md')
        with open(res_path, 'a') as file:
            msg = '\n\n{} **For k = {}, height {}, method {}, max_iter: {}**\n\n'.format(
                str(date.today()), k, height, cur_method, max_loyd)
            if opt.itq:
                msg = '\n\n*ITQ*' + msg
            file.write(msg)
            file.write(acc_md)
    if print_output:
        print('acc_md\n {} \n'.format(acc_md))

    if opt.glove:
        pickle_path = os.path.join(data_dir, 'glove',
                                   'kmeans_ht{}.pkl'.format(height))
        json_path = os.path.join(data_dir, 'glove',
                                 'kmeans_ht{}.json'.format(height))
    elif opt.glove_c:
        pickle_path = os.path.join(data_dir, 'glove_c',
                                   'kmeans_ht{}.pkl'.format(height))
        json_path = os.path.join(data_dir, 'glove_c',
                                 'kmeans_ht{}.json'.format(height))
    elif opt.sift:
        pickle_path = os.path.join(data_dir, 'sift',
                                   'kmeans_ht{}.pkl'.format(height))
        json_path = os.path.join(data_dir, 'sift',
                                 'kmeans_ht{}.json'.format(height))
    elif opt.sift_c:
        pickle_path = os.path.join(data_dir, 'sift_c',
                                   'kmeans_ht{}.pkl'.format(height))
        json_path = os.path.join(data_dir, 'sift_c',
                                 'kmeans_ht{}.json'.format(height))
    else:
        pickle_path = os.path.join(data_dir, 'kmeans_ht{}.pkl'.format(height))
        json_path = os.path.join(data_dir, 'kmeans_ht{}.json'.format(height))

    if False:  #march
        utils.pickle_dump(serial_data, pickle_path)
    with open(json_path, 'w') as file:
        json.dump(serial_data, file)

    return acc_mx, probe_mx, probe95_mx
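run_main expects an opt namespace carrying many switches (k, n_repeat_km, acc_thresh, max_bin_count, the pca/rp/itq/st/cplsh method flags, the dataset flags, write_res, ...). A hypothetical invocation built with types.SimpleNamespace, purely to show the expected argument shapes; the field values are made up, and run_kmeans, utils and data_dir must already exist in the module:

from types import SimpleNamespace

import numpy as np

opt = SimpleNamespace(
    k=10, n_repeat_km=1, acc_thresh=0.97, max_bin_count=40,
    pca=False, rp=False, itq=False, st=False, cplsh=False,
    glove=False, glove_c=False, sift=False, sift_c=False,
    write_res=False,
)
ds = np.random.randn(1000, 64).astype(np.float32)    # base vectors
qu = np.random.randn(100, 64).astype(np.float32)     # query vectors
neigh = np.random.randint(0, 1000, size=(100, 10))   # ground-truth neighbor ids
# acc_mx, probe_mx, probe95_mx = run_main(1, ds, qu, neigh, opt)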
Exemple #28
0
def _save_cache():
    global sw_df, roll_mat_csr, roll_index, y_true, D_START, D_END
    pickle_dump((sw_df, roll_mat_csr, roll_index, y_true,  D_START, D_END), EVAL_CACHE_FILE)
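A matching loader for the cache written by _save_cache, assuming the same EVAL_CACHE_FILE constant and pickle_load helper are available at module level:

def _load_cache():
    # restore the globals written by _save_cache, in the same order they were dumped
    global sw_df, roll_mat_csr, roll_index, y_true, D_START, D_END
    sw_df, roll_mat_csr, roll_index, y_true, D_START, D_END = pickle_load(EVAL_CACHE_FILE)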
Exemple #29
0
def pre_process(file_folder, word_cut_func, is_en, start, end):
    # NOTE: test_data_content and test_data_aspect are assumed to be module-level globals
    # holding a single raw review text and its aspect term; they are not defined in this
    # snippet, and the commented-out read below hints at where they originally came from.
    # test_data = pd.read_csv(os.path.join('./data/twitter', 'test.csv'), header=0, index_col=None, encoding='unicode_escape')
    print('preprocessing: ', file_folder)

    test_data_word_list = word_cut_func(test_data_content)
    test_data_char_list = test_data_content
    test_data_aspect_word_list = word_cut_func(test_data_aspect)
    test_data_aspect_char_list = test_data_aspect
    #train_data['aspect_word_list'] = train_data['aspect'].apply(word_cut_func)
    #train_data['aspect_char_list'] = train_data['aspect'].apply(lambda x: list(x))
    print('building vocabulary...')
    word_corpus = (test_data_word_list)
    char_corpus = (test_data_char_list)
    aspect_corpus = (test_data_aspect)
    aspect_text_word_corpus = test_data_aspect_word_list
    aspect_text_char_corpus = test_data_aspect_char_list

    word_vocab = build_vocabulary(word_corpus, start_id=1)
    char_vocab = build_vocabulary(char_corpus, start_id=1)
    aspect_vocab = build_vocabulary(aspect_corpus, start_id=0)
    aspect_text_word_vocab = build_vocabulary(aspect_text_word_corpus, start_id=1)
    aspect_text_char_vocab = build_vocabulary(aspect_text_char_corpus, start_id=1)
    #print(word_vocab.get(word, len(word_vocab)+1) for word in (word_vocab))

    pickle_dump(word_vocab, os.path.join(file_folder, 'word_vocab2.pkl'))
    pickle_dump(char_vocab, os.path.join(file_folder, 'char_vocab2.pkl'))
    pickle_dump(aspect_vocab, os.path.join(file_folder, 'aspect_vocab2.pkl'))
    pickle_dump(aspect_text_word_vocab, os.path.join(file_folder, 'aspect_text_word_vocab2.pkl'))
    pickle_dump(aspect_text_char_vocab, os.path.join(file_folder, 'aspect_text_char_vocab2.pkl'))

    # prepare embedding
    print('preparing embedding...')
    word_w2v = build_embedding(word_corpus, word_vocab, config.word_embed_dim)
    aspect_word_w2v = build_aspect_embedding(aspect_vocab, word_cut_func, word_vocab, word_w2v)
    aspect_text_word_w2v = build_aspect_text_embedding(aspect_text_word_vocab, word_vocab, word_w2v)
    char_w2v = build_embedding(char_corpus, char_vocab, config.word_embed_dim)
    aspect_char_w2v = build_aspect_embedding(aspect_vocab, lambda x: list(x), char_vocab, char_w2v)
    aspect_text_char_w2v = build_aspect_text_embedding(aspect_text_char_vocab, char_vocab, char_w2v)
    np.save(os.path.join(file_folder, 'word_w2v.npy'), word_w2v)
    np.save(os.path.join(file_folder, 'aspect_word_w2v.npy'), aspect_word_w2v)
    np.save(os.path.join(file_folder, 'aspect_text_word_w2v.npy'), aspect_text_word_w2v)
    np.save(os.path.join(file_folder, 'char_w2v.npy'), char_w2v)
    np.save(os.path.join(file_folder, 'aspect_char_w2v.npy'), aspect_char_w2v)
    np.save(os.path.join(file_folder, 'aspect_text_char_w2v.npy'), aspect_text_char_w2v)
    print('finished preparing embedding!')

    if is_en:
        word_glove = build_glove_embedding(word_vocab, glove_vectors, glove_embed_dim)
        aspect_word_glove = build_aspect_embedding(aspect_vocab, word_cut_func, word_vocab, word_glove)
        aspect_text_word_glove = build_aspect_text_embedding(aspect_text_word_vocab, word_vocab, word_glove)
        np.save(os.path.join(file_folder, 'word_glove.npy'), word_glove)
        np.save(os.path.join(file_folder, 'aspect_word_glove.npy'), aspect_word_glove)
        np.save(os.path.join(file_folder, 'aspect_text_word_glove.npy'), aspect_text_word_glove)
        #print('shape of word_glove:', word_glove.shape)
        #print('sample of word_glove:', word_glove[:2, :5])
        #print('shape of aspect_word_glove:', aspect_word_glove.shape)
        #print('sample of aspect_word_glove:', aspect_word_glove[:2, :5])
        #print('shape of aspect_text_word_glove:', aspect_text_word_glove.shape)
        #print('sample of aspect_text_word_glove:', aspect_text_word_glove[:2, :5])

    # prepare input
    print('preparing text input...')
    word_to_id = lambda tokens: [word_vocab.get(word, len(word_vocab) + 1) for word in tokens]
    char_to_id = lambda chars: [char_vocab.get(char, len(char_vocab) + 1) for char in chars]
    test_word_input = word_to_id(test_data_word_list)
    test_char_input = char_to_id(test_data_char_list)
     
    pickle_dump(test_word_input, os.path.join(file_folder, 'test_word_input2.pkl'))
    pickle_dump(test_char_input, os.path.join(file_folder, 'test_char_input2.pkl'))
    print('finished preparing text input!')

    print('preparing aspect input...')
    #train_aspect_input = train_data['aspect'].apply(lambda x: [aspect_vocab[x]]).values.tolist()
    #valid_aspect_input = valid_data['aspect'].apply(lambda x: [aspect_vocab[x]]).values.tolist()
    test_aspect_input = test_data_aspect
    #pickle_dump(train_aspect_input, os.path.join(file_folder, 'train_aspect_input.pkl'))
    #pickle_dump(valid_aspect_input, os.path.join(file_folder, 'valid_aspect_input.pkl'))
    pickle_dump(test_aspect_input, os.path.join(file_folder, 'test_aspect_input.pkl'))
    print('finished preparing aspect input!')

    print('preparing aspect text input...')
    aspect_word_to_id = lambda tokens: [aspect_text_word_vocab.get(word, len(aspect_text_word_vocab) + 1) for word in tokens]
    aspect_char_to_id = lambda chars: [aspect_text_char_vocab.get(char, len(aspect_text_char_vocab) + 1) for char in chars]
    test_aspect_text_word_input = aspect_word_to_id(test_data_aspect_word_list)
    test_aspect_text_char_input = aspect_char_to_id(test_data_aspect_char_list)

    pickle_dump(test_aspect_text_word_input, os.path.join(file_folder, 'test_word_aspect_input.pkl'))
    pickle_dump(test_aspect_text_char_input, os.path.join(file_folder, 'test_char_aspect_input.pkl'))
    print('finished preparing aspect text input!')

    test_word_input_l, test_word_input_r, test_word_input_r_with_pad, test_word_mask, test_word_pos_input, \
            test_word_offset_input, test_char_input_l, test_char_input_r, test_char_input_r_with_pad, test_char_mask, \
            test_char_pos_input, test_char_offset_input = split_text_and_get_loc_info(word_vocab,
                                                                                      char_vocab, word_cut_func,start,end)
    pickle_dump(test_word_input_l, os.path.join(file_folder, 'test_word_input_l2.pkl'))
    pickle_dump(test_word_input_r, os.path.join(file_folder, 'test_word_input_r2.pkl'))
    pickle_dump(test_word_input_r_with_pad, os.path.join(file_folder, 'test_word_input_r_with_pad2.pkl'))
    pickle_dump(test_word_mask, os.path.join(file_folder, 'test_word_mask2.pkl'))
    pickle_dump(test_word_pos_input, os.path.join(file_folder, 'test_word_pos_input2.pkl'))
    pickle_dump(test_word_offset_input, os.path.join(file_folder, 'test_word_offset_input2.pkl'))
    pickle_dump(test_char_input_l, os.path.join(file_folder, 'test_char_input_l2.pkl'))
    pickle_dump(test_char_input_r, os.path.join(file_folder, 'test_char_input_r2.pkl'))
    pickle_dump(test_char_input_r_with_pad, os.path.join(file_folder, 'test_char_input_r_with_pad2.pkl'))
    pickle_dump(test_char_mask, os.path.join(file_folder, 'test_char_mask2.pkl'))
    pickle_dump(test_char_pos_input, os.path.join(file_folder, 'test_char_pos_input2.pkl'))
    pickle_dump(test_char_offset_input, os.path.join(file_folder, 'test_char_offset_input2.pkl'))

    # prepare output
    #if 'sentiment' in test_data.columns:
    #    pickle_dump(test_data['sentiment'].values.tolist(), os.path.join(file_folder, 'test_label.pkl'))
    print('finished preparing output!')
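build_vocabulary is called above with a start_id argument but never shown. A plausible minimal implementation, assuming it assigns consecutive integer ids to distinct tokens in order of first appearance, starting from start_id:

def build_vocabulary(corpus, start_id=1):
    # corpus is an iterable of tokens (iterating a string yields its characters)
    vocab = {}
    for token in corpus:
        if token not in vocab:
            vocab[token] = start_id + len(vocab)
    return vocab


print(build_vocabulary(['the', 'cat', 'sat', 'the'], start_id=1))  # {'the': 1, 'cat': 2, 'sat': 3}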
def process_data(dataset: str, config: ProcessConfig):
    train_file = NER_TRAIN_FILE[dataset]
    dev_file = NER_DEV_FILE.get(dataset, None)
    test_file = NER_TEST_FILE.get(dataset, None)

    print('Logging Info - Loading ner data...')
    if dev_file is None and test_file is None:
        train_data, dev_data, test_data = load_ner_data(train_file,
                                                        config.normalized,
                                                        config.lower,
                                                        split_mode=2)
    elif dev_file is None:
        train_data, dev_data = load_ner_data(train_file,
                                             config.normalized,
                                             config.lower,
                                             split_mode=1)
        test_data = load_ner_data(test_file, config.normalized, config.lower)
    elif test_file is None:
        train_data, test_data = load_ner_data(train_file,
                                              config.normalized,
                                              config.lower,
                                              split_mode=1)
        dev_data = load_ner_data(dev_file, config.normalized, config.lower)
    else:
        train_data = load_ner_data(train_file, config.normalized, config.lower)
        dev_data = load_ner_data(dev_file, config.normalized, config.lower)
        test_data = load_ner_data(test_file, config.normalized, config.lower)

    print('Logging Info - Loading gazetteer and generating trie...')
    gaze_tries = dict()
    for gaze_file in GAZETTEER_FILES[dataset]:
        gaze_name = os.path.basename(gaze_file)
        gaze_tries[gaze_name] = load_gaze_trie(gaze_file, config.normalized,
                                               config.lower)

    print('Logging Info - Generating matching entity...')
    search_entity(train_data, gaze_tries)
    search_entity(dev_data, gaze_tries)
    search_entity(test_data, gaze_tries)

    print('Logging Info - Generating corpus...')
    char_corpus = [
        text_example.tokens
        for text_example in train_data + dev_data + test_data
    ]
    fw_bigram_corpus = [
        text_example.fw_bigrams
        for text_example in train_data + dev_data + test_data
    ]
    bw_bigram_corpus = [
        text_example.bw_bigrams
        for text_example in train_data + dev_data + test_data
    ]
    tag_corpus = [
        text_example.tags for text_example in train_data + dev_data + test_data
    ]

    print('Logging Info - Generating vocabulary...')
    char_vocab, idx2char = build_vocab(char_corpus)
    fw_bigram_vocab, idx2fw_bigram = build_vocab(fw_bigram_corpus)
    bw_bigram_vocab, idx2bw_bigram = build_vocab(bw_bigram_corpus)
    tag_vocab, idx2tag = build_tag_vocab(tag_corpus)

    print('Logging Info - Preparing embedding...')
    c2v = train_w2v(char_corpus,
                    char_vocab,
                    embedding_dim=config.char_embed_dim)
    c_fasttext = train_fasttext(char_corpus,
                                char_vocab,
                                embedding_dim=config.char_embed_dim)
    c_glove = train_glove(char_corpus,
                          char_vocab,
                          embedding_dim=config.char_embed_dim)
    fw_bi2v = train_w2v(fw_bigram_corpus,
                        fw_bigram_vocab,
                        embedding_dim=config.bigram_embed_dim)
    fw_bifasttext = train_fasttext(fw_bigram_corpus,
                                   fw_bigram_vocab,
                                   embedding_dim=config.bigram_embed_dim)
    fw_biglove = train_glove(fw_bigram_corpus,
                             fw_bigram_vocab,
                             embedding_dim=config.bigram_embed_dim)
    bw_bi2v = train_w2v(bw_bigram_corpus,
                        bw_bigram_vocab,
                        embedding_dim=config.bigram_embed_dim)
    bw_bifasttext = train_fasttext(bw_bigram_corpus,
                                   bw_bigram_vocab,
                                   embedding_dim=config.bigram_embed_dim)
    bw_biglove = train_glove(bw_bigram_corpus,
                             bw_bigram_vocab,
                             embedding_dim=config.bigram_embed_dim)

    print('Logging Info - Saving processed data...')
    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        TRAIN_DATA_TEMPLATE,
                        dataset=dataset), train_data)
    pickle_dump(
        format_filename(PROCESSED_DATA_DIR, DEV_DATA_TEMPLATE,
                        dataset=dataset), dev_data)
    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        TEST_DATA_TEMPLATE,
                        dataset=dataset), test_data)

    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        VOCABULARY_TEMPLATE,
                        dataset=dataset,
                        level='char'), char_vocab)
    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        VOCABULARY_TEMPLATE,
                        dataset=dataset,
                        level='fw_bigram'), fw_bigram_vocab)
    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        VOCABULARY_TEMPLATE,
                        dataset=dataset,
                        level='bw_bigram'), bw_bigram_vocab)
    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        VOCABULARY_TEMPLATE,
                        dataset=dataset,
                        level='tag'), tag_vocab)

    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        IDX2TOKEN_TEMPLATE,
                        dataset=dataset,
                        level='char'), idx2char)
    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        IDX2TOKEN_TEMPLATE,
                        dataset=dataset,
                        level='fw_bigram'), idx2fw_bigram)
    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        IDX2TOKEN_TEMPLATE,
                        dataset=dataset,
                        level='bw_bigram'), idx2bw_bigram)
    pickle_dump(
        format_filename(PROCESSED_DATA_DIR,
                        IDX2TOKEN_TEMPLATE,
                        dataset=dataset,
                        level='tag'), idx2tag)

    np.save(
        format_filename(PROCESSED_DATA_DIR,
                        EMBEDDING_MATRIX_TEMPLATE,
                        dataset=dataset,
                        type='c2v'), c2v)
    np.save(
        format_filename(PROCESSED_DATA_DIR,
                        EMBEDDING_MATRIX_TEMPLATE,
                        dataset=dataset,
                        type='c_fasttext'), c_fasttext)
    np.save(
        format_filename(PROCESSED_DATA_DIR,
                        EMBEDDING_MATRIX_TEMPLATE,
                        dataset=dataset,
                        type='c_glove'), c_glove)
    np.save(
        format_filename(PROCESSED_DATA_DIR,
                        EMBEDDING_MATRIX_TEMPLATE,
                        dataset=dataset,
                        type='fw_bi2v'), fw_bi2v)
    np.save(
        format_filename(PROCESSED_DATA_DIR,
                        EMBEDDING_MATRIX_TEMPLATE,
                        dataset=dataset,
                        type='fw_bifasttext'), fw_bifasttext)
    np.save(
        format_filename(PROCESSED_DATA_DIR,
                        EMBEDDING_MATRIX_TEMPLATE,
                        dataset=dataset,
                        type='fw_biglove'), fw_biglove)
    np.save(
        format_filename(PROCESSED_DATA_DIR,
                        EMBEDDING_MATRIX_TEMPLATE,
                        dataset=dataset,
                        type='bw_bi2v'), bw_bi2v)
    np.save(
        format_filename(PROCESSED_DATA_DIR,
                        EMBEDDING_MATRIX_TEMPLATE,
                        dataset=dataset,
                        type='bw_bifasttext'), bw_bifasttext)
    np.save(
        format_filename(PROCESSED_DATA_DIR,
                        EMBEDDING_MATRIX_TEMPLATE,
                        dataset=dataset,
                        type='bw_biglove'), bw_biglove)
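format_filename and the *_TEMPLATE constants come from the project's config module and are not shown. A sketch of the likely convention, assuming each template is a format string whose placeholders (dataset, level, type, ...) are filled from keyword arguments; the template strings below are hypothetical:

import os

# hypothetical templates; the real ones live in the project's config module
VOCABULARY_TEMPLATE = '{dataset}_{level}_vocab.pkl'
EMBEDDING_MATRIX_TEMPLATE = '{dataset}_{type}_embeddings.npy'


def format_filename(base_dir, template, **kwargs):
    # join the processed-data directory with the filled-in template
    return os.path.join(base_dir, template.format(**kwargs))


print(format_filename('data/processed', VOCABULARY_TEMPLATE, dataset='msra', level='char'))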
                sleep(sleep_time)
            else:
                # let the caller display the error details
                raise
    return

if __name__ == '__main__':
    pdf_dir = utils.check_argv_path(sys.argv)
    pdf_path_list = utils.get_path_list(pdf_dir, 'pdf')
    # roughly detect already-processed files and exclude them
    # pdf_path_list = [p for p in pdf_path_list if not os.path.basename(p).startswith('[')]
    amazon_url_list = []
    for pdf_path in pdf_path_list:
        isbn = pdf_to_isbn(pdf_path)
        if isbn:
            try:
                amazon_items = fetch_amazon_item(isbn)
            except HTTPError as e:
                print('Failed to fetch item information:', pdf_path, isbn)
                continue
            if amazon_items:
                amazon_url = get_amazon_url(amazon_items)
                print(amazon_url)
                amazon_url_list.append(amazon_url)
                # periodically save the pickle just in case
                if len(amazon_url_list) % 10 == 0:
                    utils.pickle_dump(amazon_url_list, filename='amazon_url_list.pickel')
                # sleep to stay under the API rate limit
                sleep(2)

    utils.pickle_dump(amazon_url_list, filename='amazon_url_list.pickel')
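The truncated block at the top of this snippet (sleep(sleep_time) ... else: raise) is the tail of a retry loop around the Amazon API call. A self-contained sketch of what such a wrapper might look like; the retry count, sleep_time and the exception type are assumptions:

from time import sleep
from urllib.error import HTTPError


def fetch_with_retry(fetch_func, isbn, max_retries=3, sleep_time=5):
    # retry transient HTTP errors a few times, then re-raise for the caller to report
    for attempt in range(max_retries):
        try:
            return fetch_func(isbn)
        except HTTPError:
            if attempt < max_retries - 1:
                sleep(sleep_time)
            else:
                # let the caller display the error details
                raise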
            'product_id').size()
    product_feat['product_first_reorder_num'] = order_products_prior[
        order_products_prior.user_product_order_number == 1].groupby(
            'product_id').size()
    product_feat['product_first_reorder_num'].fillna(0, inplace=True)  # fillna

    product_feat['product_user_order_only_once_num'] = \
            product_feat['product_first_order_num'] - product_feat['product_first_reorder_num']
    product_feat['product_user_order_only_once_ratio'] = \
            product_feat['product_user_order_only_once_num'] / product_feat['product_first_order_num']

    product_feat['product_reorder_ratio'] = product_feat[
        'product_first_reorder_num'] / product_feat['product_first_order_num']
    product_feat['product_average_user_reorder_num'] = product_feat[
        'product_reorder_num'] / product_feat['product_first_order_num']
    product_feat[
        'product_average_add_to_cart_order'] = order_products_prior.groupby(
            'product_id')['add_to_cart_order'].mean()

    feats = [
        'product_order_num', 'product_reorder_num',
        'product_reorder_frequency', 'product_first_order_num',
        'product_first_reorder_num', 'product_reorder_ratio',
        'product_user_order_only_once_num',
        'product_user_order_only_once_ratio',
        'product_average_user_reorder_num', 'product_average_add_to_cart_order'
    ]

    pickle_dump(product_feat[feats],
                '{}/product_feat.pkl'.format(config.feat_folder))
    print('Done - product features')