def re_generator(files: Dict[str, tuple], args):
    """Generates files for RE"""
    for filename, data in files.items():
        generate_re_input_files(ehr_records=data[0],
                                ade_records=data[1],
                                filename=args.target_dir + filename + '.' + args.ext,
                                max_len=args.max_seq_len,
                                sep=args.sep,
                                is_test=data[2],
                                is_label=data[3])

    save_pickle(args.target_dir + 'train',
                {"EHR": files['train'][0], "ADE": files['train'][1]})
    save_pickle(args.target_dir + 'test',
                {"EHR": files['test'][0], "ADE": files['test'][1]})

    print("\nGenerating files successful. Files generated: ",
          'train.tsv,', 'dev.tsv,', 'test.tsv,', 'test_labels.tsv,',
          'train_rel.pkl,', 'test_rel.pkl,', 'test_labels_rel.pkl',
          sep=' ')
def pred_error(f_pred, prepare_data, data, iterator, verbose=False, is_test_phase=False):
    """Just compute the error.

    f_pred: Theano function computing the prediction.
    prepare_data: usual prepare_data for that dataset.
    """
    valid_err = 0
    for _, valid_index in iterator:
        x, mask, y = prepare_data([data[0][t] for t in valid_index],
                                  np.array(data[1])[valid_index],
                                  maxlen=None)
        preds = f_pred(x, mask)
        if is_test_phase:
            print(preds)
            utils.save_pickle(("%0.3f" % np.random.rand()) + "pred.pickle", preds)
        targets = np.array(data[1])[valid_index]
        valid_err += (preds == targets).sum()
    valid_err = 1. - np_floatX(valid_err) / len(data[0])

    return valid_err
def main(nlp, glove_dir):
    """Filter out sentences that are too short to be meaningful or far longer
    than the rest of our data.

    Parameters
    ----------
    nlp: spacy.lang.en.English
        Spacy parser used for tokenization.
    glove_dir: str
        Location to load glove vectors from.
    """
    # Load and split data.
    dtypes = dict(text=object, sex='category', age=np.int8)
    df = pd.read_csv('data/sentences.csv', dtype=dtypes, usecols=dtypes.keys())
    df['sex'] = (df.sex == 'male') * 1
    lengths = df.text.str.split().str.len()
    df = df[(lengths >= 5) & (lengths <= 50)]
    data = train_val_test_split(df.text, df[['sex', 'age']], train_p=.99,
                                val_p=.005, state=1, shuffle=True)

    # Order: x_train, x_val, x_test, y_train, y_val, y_test
    save_pickle(data, 'split_data')

    # w2count, w2idx, i2w, and w2vec will be pickled for easy access.
    build_word_mappings(data[0], nlp, glove_dir)
def cal_test_additional_chars(test_data_path, label_additional_chars, test_save_path):
    test_data_file_names = os.listdir(test_data_path)
    lengths = len(test_data_file_names)
    test_data_additional_chars = set()
    # new_extra_chars = set("/﹒–é/▲‧♥♡∩×『2〉×.è◆……①&")
    extra_chars = set(
        "!#$%&\()*+,-./:;<=>?@[\\]^_`{|}~!#¥%&?《》{}“”,:‘’。()·、;【】/……﹒–")
    for index in range(lengths):
        test_data_dir = os.path.join(test_data_path, str(index) + '.txt')
        with open(test_data_dir, 'r', encoding='utf-8') as f1:
            lines_text = f1.readlines()
        raw_text = ''
        for line_text in lines_text:
            raw_text += line_text
        test_data_additional_chars.update(
            re.findall(u'[^\u4e00-\u9fa5a-zA-Z0-9\*]', str(raw_text)))
    # drop special characters that already appear in the labels
    additional_chars = test_data_additional_chars.difference(label_additional_chars)
    # drop some extra punctuation marks
    additional_chars = additional_chars.difference(extra_chars)
    # additional_chars = additional_chars.difference(new_extra_chars)  # drop some extra punctuation marks
    save_pickle(additional_chars, test_save_path)  # persist as a pickle
    additional_chars = load_pickle(test_save_path)
    return additional_chars, test_data_additional_chars, label_additional_chars
def save_imdb_tfidf():
    """For comparison purposes only. Apply tfidf to the imdb data and save the
    resulting dataset. Not used in this project.
    """
    train_pos_files = glob.glob('data/imdb/train/pos/*')
    train_neg_files = glob.glob('data/imdb/train/neg/*')
    test_pos_files = glob.glob('data/imdb/test/pos/*')
    test_neg_files = glob.glob('data/imdb/test/neg/*')

    vocab = get_imdb_vocab()
    tfidf = TfidfVectorizer(input='filename', stop_words='english', max_df=0.5,
                            vocabulary=vocab, sublinear_tf=True)

    total_train = train_pos_files + train_neg_files
    x_train = tfidf.fit_transform(total_train)
    y_train = np.concatenate(
        (np.ones(len(train_pos_files)), np.zeros(len(train_neg_files))))

    total_test = test_pos_files + test_neg_files
    x_test = tfidf.transform(total_test)
    y_test = np.concatenate(
        (np.ones(len(test_pos_files)), np.zeros(len(test_neg_files))))

    train_data = (x_train, y_train)
    test_data = (x_test, y_test)
    data = {'train': train_data, 'test': test_data}
    save_pickle('data/imdb_tfidf.pkl', data)
def _commit(self, thot=None):
    if self._persist and thot:
        save_pickle(thot, '%s%s.obj' % (constants.MEMORY_PATH, thot.t_name))
    if self._persist and not thot:
        save_pickle(self, '%s%s.obj' % (constants.MEMORY_PATH, self._name))
    return
def read_data_e2(data_dir):
    main_dir = glob.glob(data_dir + '/*/*')
    print(main_dir)
    for fl in main_dir:
        # print("Participant id is: ", fl.strip().split('/')[-2])
        participant = fl.strip().split("/")[-2]
        exp = fl.strip().split("/")[-3]
        print(fl.split('/')[-1])
        if 'example' in fl.split('/')[-1]:
            ff = spio.loadmat(fl, squeeze_me=True)
            ff_2 = spio.loadmat(fl, squeeze_me=False)
            disc_pr()
            sents = ff['keySentences']
            part_topic_id = ff['labelsPassageForEachSentence']
            topic_id = ff['labelsPassageCategory']
            topics = ff['keyPassageCategory']
            part_of_topics = ff['keyPassages']
            vxl = ff['examples']
            mtd = ff_2['meta']
            topic_id = [x for x, number in zip(topic_id, len(topic_id) * [4])
                        for _ in range(number)]
            data_dict = {}
            for id, el in enumerate(part_topic_id):
                # (sentence, subtopic (e.g. Apple), topic (e.g. Fruit)): voxels
                data_dict[(sents[id], part_of_topics[el - 1], topics[topic_id[id] - 1])] = vxl[id]
            save_pickle(data_dict, '../data_processed/' + exp + '_proc/' + participant
                        + '/' + fl.strip().split("/")[-1])
            save_pickle(mtd, '../data_processed/' + exp + '_proc/' + participant
                        + '/' + fl.strip().split("/")[-1] + '_meta')
def save_shap_val(hp_filename, filename, name, SAVE_DIR, train_data, test_data,
                  test_labels, use_gpu=True, background_length=100, padding_length=512):
    hp_d = 'models/{}.pkl'.format(hp_filename)
    hp_path = utils.get_abs_path(SAVE_DIR, hp_d)
    d = utils.load_pickle(hp_path)

    model_d = 'models/{}.pkl'.format(filename)
    model_path = utils.get_abs_path(SAVE_DIR, model_d)
    model = init_model(train_data, d, model_path, use_gpu=use_gpu)

    features_l, importance_l = [], []
    features = 'features/{}_shap_all_features.pkl'.format(name)
    feature_path = utils.get_abs_path(SAVE_DIR, features)
    scores = 'feature_importance/{}_shap_all_scores.pkl'.format(name)
    model_path = utils.get_abs_path(SAVE_DIR, scores)

    features_l, importance_l = get_lstm_shap(
        model, train_data, test_data, background_length=background_length,
        padding_length=padding_length, feature_path=feature_path,
        model_path=model_path)

    utils.save_pickle(features_l, feature_path)
    utils.save_pickle(importance_l, model_path)
def to_sequence(df, attrs=['sku_ID', 'if_order', 'request_time', 'brand_ID'],
                num_clicks=1000000):
    '''
    Convert a sorted dataframe into sequences:
        [[user_ID1, [sku_ID_1, sku_ID2, ...], [time1, time2, ...], [False, True, ...]],
         [user_ID2, [sku_ID_3, sku_ID4, ...], [time3, time4, ...], [False, False, ...]]]
    '''
    num_clicks = min(num_clicks, len(df))
    df = df.query('user_ID != "-"')
    same_user_indicator = df['user_ID'].shift(1) == df['user_ID']
    same_user_indicator.iloc[0] = True

    sequences = []
    sequence = [None] + [[] for attr in attrs]
    sequence[0] = df['user_ID'].iloc[0]
    for i in tqdm(range(0, num_clicks)):
        user = df['user_ID'].iloc[i]
        if_same_user = same_user_indicator.iloc[i]
        if not if_same_user:
            sequences.append(sequence)
            sequence = [None] + [[] for attr in attrs]
            sequence[0] = user
        for _, attr in enumerate(attrs):
            attr_value = df[attr].iloc[i]
            sequence[_ + 1].append(attr_value)

    utils.save_pickle(sequences, 'click_sequence.pk')
    sequences.append(sequence)
    return sequences
def deal_with_postag(data_list, mode='full'):
    if len(data_list) == 1 and mode == 'train':
        cache_postag = get_config_values('cache', 'postag_train')
    elif len(data_list) == 1 and mode == 'dev':
        cache_postag = get_config_values('cache', 'postag_dev')
    elif len(data_list) == 2 and mode == 'mix':
        cache_postag = get_config_values('cache', 'postag_mix')
    elif len(data_list) == 3 and mode == 'full':
        cache_postag = get_config_values('cache', 'postag_full')
    else:
        logger.warn('Found data format wrong when dealing with postag...')
    if not os.path.exists(cache_postag):
        logger.info("dealing with postag...")
        postag = []
        for dataset in tqdm(data_list):
            for line in dataset:
                postag.append([[
                    Converter('zh-hans').convert(word['word'].strip().replace(' ', '')),
                    word['pos'],
                    len(word['word'])
                ] for word in line['postag']])
        save_pickle(cache_postag, postag)
    else:
        logger.info("loading with postag...")
        postag = load_pickle(cache_postag)
    logger.info("postag total num: {0}".format(len(postag)))
    logger.info("postag 5: {0}".format(postag[:5]))
    return postag
def deal_with_text(data_list, mode='full'):
    if len(data_list) == 1 and mode == 'train':
        cache_text = get_config_values('cache', 'text_train')
    elif len(data_list) == 1 and mode == 'dev':
        cache_text = get_config_values('cache', 'text_dev')
    elif len(data_list) == 2 and mode == 'mix':
        cache_text = get_config_values('cache', 'text_mix')
    elif len(data_list) == 3 and mode == 'full':
        cache_text = get_config_values('cache', 'text_full')
    else:
        logger.warn('Found data format wrong when dealing with text...')
    if not os.path.exists(cache_text):
        logger.info("dealing with text...")
        text = []
        for dataset in tqdm(data_list):
            text.extend([
                Converter('zh-hans').convert(line['text']) for line in dataset
            ])
        save_pickle(cache_text, text)
    else:
        logger.info("loading with text...")
        text = load_pickle(cache_text)
    logger.info("text total num: {0}".format(len(text)))
    return text
def read_data_e1(data_dir):
    main_dir = glob.glob(data_dir + '/*/*')
    for fl in main_dir:
        # print("Participant id is: ", fl.strip().split('/')[-2])
        ff = spio.loadmat(fl, squeeze_me=True)
        ff_nv2 = spio.loadmat(fl, squeeze_me=False)
        assert check_list(ff['labelsConcept']), "False ordered data"
        mtd = ff_nv2['meta']
        # print(mtd.dtype)
        participant = fl.strip().split("/")[-2]
        exp = fl.strip().split("/")[-3]
        print(fl.split('/')[-1])
        if 'data' in fl.split('/')[-1]:
            ff['labelsPOS'] = [ff['keyPOS'][x - 1] for x in ff['labelsPOS']]
            pos = ff['labelsPOS']
            wds = ff['keyConcept']
            vxl = ff['examples']
            cnc = ff['labelsConcreteness']
            mtd = ff['meta']
            data_dict = {}
            for el in ff['labelsConcept']:
                id = el - 1
                data_dict[(wds[id], pos[id], cnc[id])] = vxl[id]
                # print((wds[id], pos[id], cnc[id]))
            save_pickle(data_dict, '../data_processed/' + exp + '_proc/' + participant
                        + '/' + fl.strip().split("/")[-1])
            save_pickle(mtd, '../data_processed/' + exp + '_proc/' + participant
                        + '/' + fl.strip().split("/")[-1] + '_meta')
def transform(zip_file, save_dir=None):
    """Refactor file directories, rename images and partition the
    train/val/test set.
    """
    train_test_split_file = osp.join(save_dir, 'train_test_split.pkl')
    train_test_split = save_images(zip_file, save_dir, train_test_split_file)
    # train_test_split = load_pickle(train_test_split_file)

    # partition train/val/test set
    trainval_ids = list(
        set([parse_new_im_name(n, 'id')
             for n in train_test_split['trainval_im_names']]))
    # Sort ids, so that id-to-label mapping remains the same when running
    # the code on different machines.
    trainval_ids.sort()
    trainval_ids2labels = dict(zip(trainval_ids, range(len(trainval_ids))))
    partitions = partition_train_val_set(train_test_split['trainval_im_names'],
                                         parse_new_im_name, num_val_ids=100)
    train_im_names = partitions['train_im_names']
    train_ids = list(
        set([parse_new_im_name(n, 'id')
             for n in partitions['train_im_names']]))
    # Sort ids, so that id-to-label mapping remains the same when running
    # the code on different machines.
    train_ids.sort()
    train_ids2labels = dict(zip(train_ids, range(len(train_ids))))

    # A mark is used to denote whether the image is from
    #   query (mark == 0), or
    #   gallery (mark == 1), or
    #   multi query (mark == 2) set
    val_marks = [0, ] * len(partitions['val_query_im_names']) \
              + [1, ] * len(partitions['val_gallery_im_names'])
    val_im_names = list(partitions['val_query_im_names']) \
                 + list(partitions['val_gallery_im_names'])

    test_im_names = list(train_test_split['q_im_names']) \
                  + list(train_test_split['gallery_im_names'])
    test_marks = [0, ] * len(train_test_split['q_im_names']) \
               + [1, ] * len(train_test_split['gallery_im_names'])

    partitions = {
        'trainval_im_names': train_test_split['trainval_im_names'],
        'trainval_ids2labels': trainval_ids2labels,
        'train_im_names': train_im_names,
        'train_ids2labels': train_ids2labels,
        'val_im_names': val_im_names,
        'val_marks': val_marks,
        'test_im_names': test_im_names,
        'test_marks': test_marks
    }
    partition_file = osp.join(save_dir, 'partitions.pkl')
    save_pickle(partitions, partition_file)
    print('Partition file saved to {}'.format(partition_file))
def save_lime_coef(filename, model_name, SAVE_DIR, train_dev_tokens, test_tokens,
                   d_file=None):
    model = 'models/{}.pkl'.format(filename)
    path = utils.get_abs_path(SAVE_DIR, model)
    if 'svm' in model_name:
        model = utils.load_pickle(path, encoding=False)
    else:
        if model_name == 'lstm_att':
            hp_d = 'models/{}.pkl'.format(d_file)
            hp_path = utils.get_abs_path(SAVE_DIR, hp_d)
            d = utils.load_pickle(hp_path)
            model = init_model(train_dev_tokens, d, path)
        else:
            model = utils.load_pickle(path)

    features_l, importance_l = get_lime(model, test_tokens, model_name)

    features = 'features/{}_lime_all_features.pkl'.format(model_name)
    path = utils.get_abs_path(SAVE_DIR, features)
    utils.save_pickle(features_l, path)

    scores = 'feature_importance/{}_lime_all_scores.pkl'.format(model_name)
    path = utils.get_abs_path(SAVE_DIR, scores)
    utils.save_pickle(importance_l, path)
def train(model_info):
    os.makedirs(STACK_MODEL_DIR_v2, exist_ok=True)
    df = pd.read_pickle(
        os.path.join(NEW_DATA_V3_DIR, 'data-last1year-withsetinfo-extend1.pkl'))
    data_title_distance = pd.read_pickle(
        os.path.join(NEW_DATA_V3_DIR, 'data-title-distance-df.pkl'))
    all_data = np.concatenate(
        (df.values, data_title_distance.values.reshape(-1, 1)), axis=1)
    df = pd.DataFrame(data=all_data,
                      columns=list(df.columns) + list(data_title_distance.columns))
    print(df.head())
    df = shuffle(df, random_state=RANDOM_SEED)
    print(df.head())

    train_data = df[model_info['cols']].values
    train_y = df['label'].values
    print(train_data.shape)

    ss = StandardScaler()
    train_data = ss.fit_transform(train_data)
    save_pickle(ss, model_info['ss_path'])

    models = model_info['model']
    params = model_info['model_param']
    sm = StackModel(models, params)
    sm.fit(train_data, train_y)
    save_pickle(sm, model_info['model_path'])
    return sm
def train(
    name,
    model_init_fn,
    train_data,
    dev_data,
    test_data,
    prep_fn=prepare_minibatch,
):
    orig_name = name
    print("----------------------")
    print(f"TRAINING: {name}")
    print("----------------------")
    total_results = train_model(
        name,
        model_init_fn,
        optimizer_fn,
        num_iterations=NUM_ITERATIONS,
        patience=None,
        eval_every=EVAL_EVERY,
        prep_fn=prep_fn,
        eval_fn=evaluate,
        batch_fn=get_minibatch,
        batch_size=BATCH_SIZE,
        eval_batch_size=BATCH_SIZE,
        train_data=train_data,
        dev_data=dev_data,
        test_data=test_data,
    )
    utils.save_pickle(f"{name}_results.pkl", total_results)
def build_word_mappings(x_train, nlp, glove_dir):
    """Generate word to count, word to index, and word to vector mappings."""
    # Map each token to the # of times it appears in the corpus.
    tokens = [
        item for t in nlp(' '.join(x_train.values),
                          disable=['parser', 'tagger', 'ner'])
        for item in [t.text.strip()] if item
    ]
    w2count = dict(filter(lambda x: x[1] > 4, Counter(tokens).items()))
    save_pickle(tokens, 'tokens')
    save_pickle(w2count, 'w2count')

    # Construct w2idx dict and i2w list.
    w2idx = {
        k: i for i, (k, v) in enumerate(
            sorted(w2count.items(), key=lambda x: x[1], reverse=True), 2)
    }
    w2idx['<PAD>'] = 0
    w2idx['<UNK>'] = 1
    i2w = [k for k, v in sorted(w2idx.items(), key=lambda x: x[1])]
    save_pickle(w2idx, 'w2idx')
    save_pickle(i2w, 'i2w')

    # Load word vectors and filter to include words in our vocab.
    w2vec = load_glove(300, glove_dir)
    w2vec = {k: v for k, v in w2vec.items() if k in w2idx}
    save_pickle(w2vec, 'w2vec')
def get_arguments():
    args = build_parser()

    # set random seed for reproducible experiments
    # reference: https://github.com/pytorch/pytorch/issues/7068
    random.seed(args.random_seed)
    numpy.random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)
    # these flags can affect performance, select carefully
    # torch.backends.cudnn.deterministic = True
    # torch.backends.cudnn.benchmark = False

    os.makedirs(args.save_path, exist_ok=True)
    if args.train_flag:
        os.makedirs(os.path.join(args.save_path, 'training_log'), exist_ok=True)
    else:
        loaded_args = load_pickle(
            os.path.join(os.path.dirname(args.model_load), 'argument.pickle'))
        args = update_arguments_for_eval(args, loaded_args)

    # cuda setting
    os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID'
    os.environ['CUDA_VISIBLE_DEVICES'] = ', '.join(map(str, args.gpu_no))

    with open(os.path.join(args.save_path, 'argument.txt'), 'w') as f:
        for key, value in sorted(vars(args).items()):
            f.write('%s: %s' % (key, value) + '\n')
    save_pickle(os.path.join(args.save_path, 'argument.pickle'), args)

    return args
def ner_generator(files: Dict[str, tuple], args) -> None:
    """Generates files for NER"""
    # Generate train, dev, test files
    for filename, data in files.items():
        generate_input_files(ehr_records=data[0],
                             ade_records=data[1],
                             filename=args.target_dir + filename + '.' + args.ext,
                             max_len=args.max_seq_len,
                             sep=args.sep)
        save_pickle(args.target_dir + filename, {"EHR": data[0], "ADE": data[1]})

    # Generate labels file
    with open(args.target_dir + 'labels.txt', 'w') as file:
        output_labels = map(lambda x: x + '\n', labels)
        file.writelines(output_labels)

    filenames = [
        name for files in map(lambda x: [x + '.' + args.ext, x + '.pkl'],
                              list(files.keys()))
        for name in files
    ]
    print("\nGenerating files successful. Files generated: ",
          ', '.join(filenames), ', labels.txt', sep='')
def create_w2ts(w2hs, path):
    '''Create a dictionary that maps a whale id to its training samples.'''
    w2ts_path = path['root'] + path['w2ts']
    train_ps_path = path['root'] + path['train_ps']
    if isfile(w2ts_path):
        print(w2ts_path, 'exists! Load it!')
        w2ts = load_pickle(w2ts_path)
        train = load_pickle(train_ps_path)
    else:
        train = []  # A list of training image ids
        for hs in w2hs.values():
            if len(hs) > 1:
                train += hs
        random.shuffle(train)
        train_set = set(train)

        w2ts = {}  # Associate the image ids from train to each whale id.
        for w, hs in w2hs.items():
            for h in hs:
                if h in train_set:
                    if w not in w2ts:
                        w2ts[w] = []
                    if h not in w2ts[w]:
                        w2ts[w].append(h)
        for w, ts in w2ts.items():
            w2ts[w] = np.array(ts)

        save_pickle(w2ts, w2ts_path)
        save_pickle(train, train_ps_path)
    return w2ts, train
def merge_loop(double_set, list_name, file=None):
    """
    Merge cliques repeatedly until no further merge is possible.
    :param double_set:
    :return: maximum clique size, the final set of cliques
    """
    bestSet = set()
    oldSet = double_set
    num_list = []
    count_list = []
    group_list = []
    while len(oldSet) > 0:
        print('clique size:', len(list(oldSet)[0]))
        print('count:', len(oldSet))
        print(oldSet)
        num_list.append(len(list(oldSet)[0]))
        count_list.append(len(oldSet))
        group_list.append(oldSet)
        bestSet = oldSet
        oldSet = merge_group(oldSet, double_set)
    if file is not None:
        group_list = utils.num_2_word(list_name, group_list)
        utils.write_csv(['clique size', 'count', 'cliques'], file, num_list,
                        count_list, group_list)
        utils.save_pickle(file + '.pkl', group_list)
    return len(list(bestSet)[0]), bestSet
def create_training_labels(type_, path=os.path.join(path_train, "semcor+omsti.json")):
    ###########################################################################
    # This function creates a txt file with sentences with a specific label
    # and a vocabulary with all the seen labels.
    #
    # Input:
    #   type_: a label used to choose the type of label
    #   path: path of the json file
    #
    # Output:
    #   None
    ###########################################################################

    # create a list with sentences of the considered labels for the whole training set
    dictionary = create_labels_words()
    data = utils.load_json(path)
    data = list(
        map(partial(sentence_from_dictionaries, training_sentence=False), data))
    sentences = []
    labels = set()
    for sentence in data:
        single_sentence = []
        for word in sentence.split():
            # insert the current word
            if type(dictionary.get(word, word)) != list:
                single_sentence.append(word)
            # insert the corresponding label for the current word
            else:
                single_sentence.append(str(dictionary.get(word, word)[type_]))
                labels.add(str(dictionary.get(word, word)[type_]))
        sentences.append(single_sentence)

    # create the vocabulary of seen labels, adding the ids for the padding,
    # the unseen labels and the unlabelled words
    vocabulary = {
        value: key for key, value in dict(enumerate(labels, 3)).items()
    }
    vocabulary["<PAD>"] = "0"
    vocabulary["<UNSEEN>"] = "1"
    vocabulary["<WORD>"] = "2"

    # exchange strings with ids
    sentences = list(
        map(
            lambda sentence: ' '.join(
                str(vocabulary.get(word, word)) for word in sentence),
            sentences))

    utils.save_txt(sentences, path_train + name_training_file[type_][0])
    utils.save_pickle(vocabulary, "../resources/" + name_training_file[type_][1])
def process_babi_dataset(save, print_dict=False):
    file = open('dialog-bAbI-tasks/dialog-babi-task5-full-dialogs-trn.txt', 'r')
    text = file.readlines()
    file.close()
    system_acts = load_pickle('system_acts.pickle')

    uttr_dict = {'<BEGIN>': [set()]}
    for act in system_acts:
        uttr_dict[act] = [set()]

    prev_uttr = '<BEGIN>'
    for uttr in text:
        if uttr == '\n':
            prev_uttr = '<BEGIN>'
        for act in system_acts:
            if prev_uttr == '':
                prev_uttr = act
                continue
            if act in uttr:
                user_uttr = re.sub(r'\d+', '', uttr.split(act)[0]).strip()
                uttr_dict[prev_uttr][0].add(user_uttr)
                prev_uttr = act

    if save:
        save_pickle(uttr_dict, 'simulator_uttrs.pickle')
    if print_dict:
        for k, v in uttr_dict.items():
            print(k, v, '\n')
def get_entity_word_dict(self):
    data_path = r'./similarity/entity_word_dict.pkl'
    if os.path.exists(data_path):
        word_vector = load_pickle(data_path)
    else:
        word_vector = self.get_dict_key_num(self.data_dict)
        save_pickle(data_path, word_vector)
    return word_vector
def get_all_sentence_vector(self):
    data_path = r'./similarity/all_sentence_vector.pkl'
    if os.path.exists(data_path):
        all_sentence_vector = load_pickle(data_path)
    else:
        all_sentence_vector = self.get_data_dict_vector(self.data_dict)
        save_pickle(data_path, all_sentence_vector)
    return all_sentence_vector
def main(_):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model, downstream_loader = setup()
    model.to(device).eval()
    goal_emb, distance_scale = embed(model, downstream_loader, device)
    utils.save_pickle(FLAGS.experiment_path, goal_emb, "goal_emb.pkl")
    utils.save_pickle(FLAGS.experiment_path, distance_scale, "distance_scale.pkl")
def save_att_weights(word_score_ds, save_dir):
    features_l, importance_l = get_att_weights(word_score_ds)

    features_file_name = 'features/lstm_att_weights_all_features.pkl'
    path = utils.get_abs_path(save_dir, features_file_name)
    utils.save_pickle(features_l, path)

    scores_file_name = 'feature_importance/lstm_att_weights_all_scores.pkl'
    path = utils.get_abs_path(save_dir, scores_file_name)
    utils.save_pickle(importance_l, path)
def process_example_phrases(save, print_dict=False):
    from openpyxl import load_workbook

    def cell(row, col):
        return sh[ALPH[col - 1] + str(row)].value

    uttr_dict = {}
    uttr_dict['<SILENT>'] = [set()]
    uttr_dict['any preference on a type of cuisine'] = [set(), set()]
    uttr_dict['api_call'] = [set()]
    uttr_dict['great let me do the reservation'] = [set()]
    uttr_dict['hello what can i help you with today'] = [set()]
    uttr_dict['here it is '] = [set(), set()]
    uttr_dict['how many people would be in your party'] = [set()]
    uttr_dict["i'm on it"] = [set()]
    uttr_dict['is there anything i can help you with'] = [set()]
    uttr_dict['ok let me look into some options for you'] = [set()]
    uttr_dict['sure is there anything else to update'] = [set()]
    uttr_dict['sure let me find an other option for you'] = [set()]
    uttr_dict['what do you think of this option: '] = [set()]
    uttr_dict['where should it be'] = [set()]
    uttr_dict['which price range are looking for'] = [set(), set()]
    uttr_dict["you're welcome"] = [set()]
    uttr_dict['<BEGIN>'] = [set()]

    wb = load_workbook(filename='user_simulator_phrases.xlsx')
    sh = wb['Phrases']

    col = 0
    for phrase, phrase_sets in uttr_dict.items():
        col += 1
        row = 2
        while True:
            phrase = cell(row, col)
            if phrase is None:
                break
            phrase_sets[0].add(phrase)
            row += 1
        # only necessary when context_vector can be != [1, 1, 1, 1]
        if len(phrase_sets) > 1:
            row = 2
            col += 1
            while True:
                if cell(row, col) is None:
                    break
                phrase_sets[1].add(cell(row, col))
                row += 1

    if save:
        save_pickle(uttr_dict, 'example_phrases_dict.pickle')
    if print_dict:
        for k, v in uttr_dict.items():
            print(k, v, '\n')
def dset2dict(dset, name):
    dset_dict = {}
    for idx, question in enumerate(dset):
        dset_dict[question['qid']] = question
    save_pickle(
        dset_dict,
        os.path.expanduser("~/kable_management/data/tvqa/" + name + "_dict.pickle"))
    print(name, "Done")
def save_xgb_impt(file, name, SAVE_DIR):
    model = 'models/{}.pkl'.format(file)
    path = utils.get_abs_path(SAVE_DIR, model)
    print('model path: {}'.format(path))
    pipeline = utils.load_pickle(path)
    xgb_impt_d = get_xgb_impt_d(pipeline)

    features = 'features/{}_impt_all_features.pkl'.format(name)
    path = utils.get_abs_path(SAVE_DIR, features)
    utils.save_pickle(xgb_impt_d, path)
def from_file(cls, faces, data_file_name, n_eigs):
    pickle = get_pickle(data_file_name)
    if pickle is not None:
        logging.info('using previously calculated facespace')
        return cls(faces, n_eigs=n_eigs, face_space=pickle)
    else:
        logging.info('No previous facespace was found')
        eig_face = cls(faces, n_eigs=n_eigs)
        save_pickle(eig_face.entire_face_space, data_file_name)
        return eig_face
def _save(self, min_delta=0):
    if self.text_tokens_len + min_delta < len(self.text_tokens):
        print('_save 1: %7d = %7d + %4d %s' % (
            len(self.text_tokens), self.text_tokens_len,
            len(self.text_tokens) - self.text_tokens_len,
            self.text_tokens_path))
        save_json(self.text_tokens_path, self.text_tokens)
        self.text_tokens_len = len(self.text_tokens)
    if self.token_vector_len + 2 * min_delta < len(self.token_vector):
        print('_save 2: %7d = %7d + %4d %s' % (
            len(self.token_vector), self.token_vector_len,
            len(self.token_vector) - self.token_vector_len,
            self.token_vector_path))
        save_pickle(self.token_vector_path, self.token_vector)
        self.token_vector_len = len(self.token_vector)
def compute_codes(args):
    """Computes at most 10,000 x 10 tracks.

    N is the index in the MSD, e.g.:
        if N = 1: tracks computed from 100,000 to 199,999
        if N = 5: tracks computed from 500,000 to 599,999
    """
    track_ids = args["track_ids"]
    maindir = args["maindir"]
    d = args["d"]
    N = args["N"]
    clique_ids = args["clique_ids"]
    outdir = args["outdir"]
    origcodesdir = args["origcodesdir"]
    pca_n = args["pca_n"]
    norm = args["norm"]

    MAX = 1e5 / 1
    ITER = 1e4 / 1

    for it in range(10):
        logger.info("Computing %d of 10 iteration" % it)
        start_idx = int(N * MAX + it * ITER)
        end_idx = int(start_idx + ITER)
        codes = []
        strN = str(N)
        if N < 10:
            strN = "0" + str(N)
        out_file = os.path.join(outdir, strN) + str(it) + "-msd-codes.pk"
        if origcodesdir is None:
            origcodes = None
        else:
            origcodes_file = os.path.join(origcodesdir, strN) + str(it) + \
                "-msd-codes.pk"
            origcodes = utils.load_pickle(origcodes_file)[0][0]
            # origcodes = utils.load_pickle(origcodes_file)[0]
        if d == "":
            codes = compute_codes_orig_it(track_ids, maindir, clique_ids,
                                          start_idx, end_idx)
        else:
            codes = compute_codes_it(track_ids, maindir, d, clique_ids,
                                     start_idx, end_idx, origcodes=origcodes,
                                     norm=norm)
        utils.save_pickle(codes, out_file)
def main():
    # Args parser
    parser = argparse.ArgumentParser(
        description="Evaluates the average rank and mean AP for the test SHS "
                    "over the entire MSD",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("msd_dir", action="store",
                        help="Million Song Dataset main directory")
    parser.add_argument("-dictfile", action="store", default="",
                        help="Pickle to the learned dictionary")
    parser.add_argument("-outdir", action="store", default="msd_codes",
                        help="Output directory for the features")
    parser.add_argument("-N", action="store", default=10, type=int,
                        help="Number of processors to use when computing "
                             "the codes for 1M tracks,")
    parser.add_argument("-lda", action="store", default=None,
                        help="LDA file")
    parser.add_argument("-pca", nargs=2, metavar=('f.pkl', 'n'), default=(None, 0),
                        help="pca model saved in a pickle file, "
                             "use n dimensions")
    parser.add_argument("-codes", action="store", nargs=2, default=[None, 0],
                        dest="codesdir", metavar=("msd_codes/", "n"),
                        help="Path to the folder with all the codes and "
                             "version to evaluate")
    parser.add_argument("-orig_codes", action="store", default=None,
                        dest="origcodesdir",
                        help="Path to the folder with all the codes without "
                             "dimensionality reduction")
    parser.add_argument("-norm", action="store_true", dest="norm", default=False,
                        help="Normalize before LDA/PCA or not")

    args = parser.parse_args()
    start_time = time.time()
    maindir = args.msd_dir
    shsf = "SHS/shs_dataset_test.txt"

    global lda
    global pca

    # sanity checks
    utils.assert_file(maindir)
    utils.assert_file(shsf)
    utils.create_dir(args.outdir)

    # read cliques and all tracks
    cliques, all_tracks = utils.read_shs_file(shsf)
    track_ids = utils.load_pickle("SHS/track_ids_test.pk")
    clique_ids = utils.load_pickle("SHS/clique_ids_test.pk")

    # read codes file
    codesdir = args.codesdir[0]
    if codesdir is not None:
        if os.path.isfile(codesdir):
            c = utils.load_pickle(codesdir)
            feats = c[0]
            track_ids = c[1]
            clique_ids = c[2]
        else:
            feats, track_ids, clique_ids = load_codes(
                codesdir, lda_idx=int(args.codesdir[1]))
        logger.info("Codes files read")
        print(feats.shape)
    else:
        # Read PCA file
        if args.pca[0] is not None:
            pca = utils.load_pickle(args.pca[0])[int(args.pca[1])]

        # read LDA file
        lda_file = args.lda
        if lda_file is not None:
            lda = utils.load_pickle(lda_file)

        utils.assert_file(args.dictfile)

        # Prepare multiprocessing computation
        input = []
        pool = Pool(processes=args.N)
        for n in range(args.N):
            arg = {}
            arg["track_ids"] = track_ids
            arg["maindir"] = maindir
            arg["d"] = args.dictfile
            arg["N"] = n
            arg["clique_ids"] = clique_ids
            arg["outdir"] = args.outdir
            arg["origcodesdir"] = args.origcodesdir
            arg["pca_n"] = int(args.pca[1])
            arg["norm"] = args.norm
            input.append(arg)

        # Start computing the codes
        pool.map(compute_codes, input)

        # Done!
        logger.info("Codes computation done!")
        logger.info("Took %.2f seconds" % (time.time() - start_time))
        sys.exit()

    # Scores
    feats, clique_ids, track_ids = utils.clean_feats(feats, clique_ids, track_ids)
    stats = score(feats, clique_ids, N=len(all_tracks))

    # TODO: change file name
    utils.save_pickle(stats, "stats.pk")

    # done
    logger.info('Average rank per track: %.2f, clique: %.2f, MAP: %.2f%%'
                % (anst.average_rank_per_track(stats),
                   anst.average_rank_per_clique(stats),
                   anst.mean_average_precision(stats) * 100))
    logger.info("Done! Took %.2f seconds" % (time.time() - start_time))
def main():
    # Args parser
    parser = argparse.ArgumentParser(
        description="Cover song ID on the training Second Hand Song dataset",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("msd_dir", action="store",
                        help="Million Song Dataset main directory")
    parser.add_argument("-dictfile", action="store", default="",
                        help="Pickle to the learned dictionary")
    parser.add_argument("-lda", action="store", nargs=2, default=[None, 0],
                        help="LDA file and version", metavar=('lda.pkl', 'n'))
    parser.add_argument("-codes", action="store", default=None, dest="codesfile",
                        help="Pickle to the features file")
    parser.add_argument("-f", action="store", default="", dest="featfile",
                        help="Pickle to the final features")
    parser.add_argument("-pca", nargs=2, metavar=('f.pkl', 'n'), default=("", 0),
                        help="pca model saved in a pickle file, "
                             "use n dimensions")

    args = parser.parse_args()
    start_time = time.time()
    maindir = args.msd_dir
    shsf = "SHS/shs_dataset_train.txt"
    dictfile = args.dictfile

    # sanity checks
    utils.assert_file(dictfile)
    utils.assert_file(maindir)
    utils.assert_file(shsf)

    # read clique ids and track ids
    cliques, all_tracks = utils.read_shs_file(shsf)
    track_ids = all_tracks.keys()
    clique_ids = np.asarray(utils.compute_clique_idxs(track_ids, cliques))
    logger.info("Track ids and clique ids read")
    utils.save_pickle(clique_ids, "SHS/clique_ids_train.pk")
    utils.save_pickle(track_ids, "SHS/track_ids_train.pk")

    # read LDA file
    lda_file = args.lda[0]
    if lda_file is not None:
        lda_file = utils.load_pickle(lda_file)
        logger.info("LDA file read")

    # read codes file
    codesfile = args.codesfile
    if codesfile is not None:
        codesfile = utils.load_pickle(codesfile)
        logger.info("Codes file read")

    # Compute features if needed
    if args.featfile == "":
        feats = compute_feats(track_ids, maindir, dictfile,
                              lda_file=lda_file, lda_n=int(args.lda[1]),
                              codes=codesfile, pca=args.pca[0],
                              pca_n=int(args.pca[1]))
    else:
        feats = utils.load_pickle(args.featfile)

    # Apply PCA
    pcafile = args.pca[0]
    pcadim = int(args.pca[1])
    if pcafile != "" and False:
        trainedpca = utils.load_pickle(pcafile)
        assert pcadim > 0
        logger.info('trained pca loaded')
        pcafeats = np.zeros((feats.shape[0], pcadim))
        for i, feat in enumerate(feats):
            pcafeats[i] = trainedpca.apply_newdata(feat, ndims=pcadim)
        feats = pcafeats

    # Scores
    feats, clique_ids, track_ids = utils.clean_feats(feats, clique_ids, track_ids)
    stats = score(feats, clique_ids)

    # Save data
    if dictfile == "":
        dictfile = "thierry"  # For saving purposes
    utils.save_pickle(stats, "results/stats-" + os.path.basename(dictfile) + ".pk")

    # done
    logger.info('Average rank per track: %.2f, clique: %.2f, MAP: %.2f%%'
                % (anst.average_rank_per_track(stats),
                   anst.average_rank_per_clique(stats),
                   anst.mean_average_precision(stats) * 100))
    logger.info("Done! Took %.2f seconds" % (time.time() - start_time))
def compute_feats(track_ids, maindir, d, lda_file=None, lda_n=0, codes=None,
                  ver=True, pca="", pca_n=0):
    """Computes the features using the dictionary d. If it doesn't exist,
    computes them using Thierry's method.

    The improved pipeline is composed of 11 steps:

        1.- Beat Synchronous Chroma
        2.- L2-Norm
        3.- Shingle (PATCH_LEN: 75 x 12)
        4.- 2D-FFT
        5.- L2-Norm
        6.- Log-Scale
        7.- Sparse Coding
        8.- Shrinkage
        9.- Median Aggregation
        10.- Dimensionality Reduction
        11.- L2-Norm

    The original method by Thierry doesn't include steps 5, 6, 7, 8 and 11.
    """
    if d != "":
        fx = load_transform(d)
        K = int(d.split("_")[1].split("E")[1])
    else:
        K = PATCH_LEN

    if codes is None:
        compute_codes = True
        codes = np.ones((len(track_ids), K)) * np.nan
    else:
        compute_codes = False
        K = codes[0].shape[0]

    if lda_file is not None:
        if lda_n == 0:
            n_comp = 50
        elif lda_n == 1:
            n_comp = 100
        elif lda_n == 2:
            n_comp = 200
    else:
        n_comp = K

    if pca != "":
        pca = utils.load_pickle(pca)
        pca = pca[pca_n]

    final_feats = np.ones((codes.shape[0], n_comp)) * np.nan
    orig_feats = []
    for cnt, tid in enumerate(track_ids):
        if compute_codes:
            path = utils.path_from_tid(maindir, tid)

            # 1.- Beat Synchronous Chroma
            # 2.- L2-Norm
            # 3.- Shingle (PATCH_LEN: 75 x 12)
            # 4.- 2D-FFT
            feats = utils.extract_feats(path)
            # orig_feats.append(feats)  # Store orig feats
            if feats is None:
                continue

            if d != "":
                # 5.- L2-Norm
                # 6.- Log-Scale
                # 7.- Sparse Coding
                # 8.- Shrinkage
                H = fx(feats)
            else:
                H = feats

            # 9.- Median Aggregation
            H = np.median(H, axis=0)
        else:
            H = codes[cnt]

        if compute_codes:
            codes[cnt] = H.copy()

        if pca != "":
            H = pca.transform(H)

        # Apply LDA if needed
        if lda_file is not None:
            # H = dan_tools.chromnorm(H.reshape(H.shape[0], 1)).squeeze()
            # 10.- Dimensionality Reduction
            H = lda_file[lda_n].transform(H)

        # 11.- L2-Norm
        final_feats[cnt] = dan_tools.chromnorm(H.reshape(H.shape[0], 1)).squeeze()

        if ver:
            if cnt % 50 == 1:
                logger.info("----Computing features %.1f%%" %
                            (cnt / float(len(track_ids)) * 100))

    if d == "":
        d = "orig"  # For saving purposes

    # Save codes
    utils.create_dir("results")
    if compute_codes:
        utils.save_pickle(codes, "results/codes-" + os.path.basename(d) + ".pk")

    # Save features
    # utils.save_pickle(orig_feats, "results/feats-" + os.path.basename(d) + ".pk")
    logger.info("Features Computed")
    return final_feats
for key, value in wavs.items():
    if key not in test_wavs:
        test_wavs[key] = []
    test_wavs[key].extend(value)

test_nums = [len(test_wavs[each]) for each in test_wavs.keys()]
print("test total words: {}".format(np.asarray(test_nums).sum()))

print("merging")
joint = dict()
for key in train_wavs.keys():
    for word in train_wavs[key]:
        if key not in joint:
            joint[key] = []
        joint[key].append(word)
for key in test_wavs.keys():
    for word in test_wavs[key]:
        if key not in joint:
            joint[key] = []
        joint[key].append(word)

total_nums = 0
for key in joint.keys():
    total_nums = total_nums + len(joint[key])
    for word in joint[key]:
        if len(word) == 0:
            print(key, word)
print("merge total words: {}".format(total_nums))
utils.save_pickle('merge.pkl', joint)
    reduced_train_sift = concatenate(train_sift, axis=0)
    test_sift = removing_null(test_sift_with_null, test_labels)
    reduced_test_sift = concatenate(test_sift, axis=0)
    all_sift = concatenate((reduced_train_sift, reduced_test_sift), axis=0)
    nfeatures = all_sift.shape[0]

    k = 1000
    kmeans = MiniBatchKMeans(n_clusters=k, init="k-means++", n_init=10,
                             max_iter=100, init_size=1000, batch_size=1000)
    kmeans.fit(all_sift)
    if SAVE:
        save_pickle(prefix + "kmeans.pkl", kmeans)
    # train_predicted = kmeans.predict(reduced_train_sift)
    # test_predicted = kmeans.predict(reduced_test_sift)
    # train_hist_features = get_histogram(k, train_sift, train_predicted)
    # test_hist_features = get_histogram(k, test_sift, test_predicted)
elif descriptor == "spSIFT":
    k = 1000
    if os.path.isfile(prefix + "train_sift.pkl"):
        kmeans = load_pickle(prefix + "kmeans.pkl")
        train_sift = load_pickle(prefix + "train_sift.pkl")
        test_sift = load_pickle(prefix + "test_sift.pkl")
        reduced_train_sift = concatenate(train_sift, axis=0)
        sys.exit(1)
    MARGIN_SIZE = n
except ValueError:
    print('margin size must be an integer!')
    sys.exit(1)

utils.ensure_dir(outputdir)

for d in os.listdir(inputdir):
    path = os.path.join(inputdir, d)
    if not os.path.isdir(path):
        continue
    pngs = [fn for fn in os.listdir(path) if fn.endswith('.png')]
    data = np.ndarray(shape=(len(pngs), IMG_SIZE + 2 * MARGIN_SIZE,
                             IMG_SIZE + 2 * MARGIN_SIZE))
    start = time()
    for idx, png in enumerate(pngs):
        resized = utils.read_resize_image(os.path.join(path, png), IMG_SIZE)
        data[idx, :, :] = utils.add_margins(resized, MARGIN_SIZE)
    end = time()
    print('{}: {} images read and resized to {} in {:.3f}s. Saving...'.format(
        path, len(data), data[0].shape, end - start))
    utils.save_pickle({'data': data, 'image_size': IMG_SIZE,
                       'margin_size': MARGIN_SIZE}, d + '.pickle', outputdir)
print("------------------Request and Answer DataFrame Generating!") request=training_order_df.groupby(['date','time_of_day','start_dist_id'])['order_id'].count().reset_index() request.columns=['date','time_of_day','start_dist_id','request'] answer_training_order_df=training_order_df.dropna(axis=0,subset=['driver_id']) answer=answer_training_order_df.groupby(['date','time_of_day','start_dist_id'])['order_id'].count().reset_index() answer.columns=['date','time_of_day','start_dist_id','answer'] request_answer=pd.merge(request,answer,how='left',on=['date','time_of_day','start_dist_id']) request_answer.fillna(0,inplace=True) request_answer['gap']=request_answer['request']-request_answer['answer'] request_answer['date']=request_answer['date'].apply(lambda x:datetime.strptime(x,"%Y-%m-%d")) request_answer['day_of_week']=request_answer['date'].apply(lambda x:datetime.weekday(x)) utils.save_pickle(request_answer,"request_answer") print("------------------Model Training!") data_X=request_answer[['time_of_day','start_dist_id','day_of_week']] data_Y=request_answer['gap'] X_train,X_test,Y_train,Y_test=cross_validation.train_test_split(data_X,data_Y,test_size=0.2,random_state=0) #training_order_df=utils.generate_order_df(aim="predict") pred_df=pd.read_csv("./data/test_set_1/read_me_1.txt",sep='\t', names=['origin']) pred_df['time_of_day']=pred_df['origin'].apply(lambda x: int(x[11:])) pred_df['date']=pred_df['origin'].apply(lambda x: datetime.strptime(x[:10],"%Y-%m-%d")) pred_df['day_of_week']=pred_df['date'].apply(lambda x:datetime.weekday(x)) temp=[] for i in np.arange(start=1,stop=67,step=1):
print('`' * 80)
for k, shape, score, columns in results:
    f1 = score[1]
    col = set(columns)
    d_f1 = f1 - last_f1
    d_col = list(col - last_col)
    print('%4d: %-20s %g' % (k, d_col, d_f1))
    last_f1 = f1
    last_col = col

if True:
    # X = X[X.columns[:5]]
    beam_size = 3
    max_items = -1
    (X_train, y_train), (X_test, y_test) = resample(X, y, sample_fraction=1.0)
    m_scores = beam_search_feature(X_train, y_train, X_test, y_test,
                                   beam_size=3, max_items=-1)
    save_pickle('m_scores.pkl', m_scores)
    print('*' * 80)
    print('beam_size=%d, max_items=%d' % (beam_size, max_items))
    last_f1 = 0
    for m, cols_scores in m_scores:
        f1 = cols_scores[0][1][1]
        print('%3d: f1=%.3f improvement=%+.3f' % (m, f1, f1 - last_f1))
        last_f1 = f1
        for i, (c, s) in enumerate(cols_scores[:3]):
            print('%5d: %s %s' % (i, s, c))

if False:
    columns = ['Parent Region', 'Reseller Tier', 'resellerDiscountPercentage', 'type']
    show_splits(X, y, columns)
for review in clean_test_reviews:
    for word in review:
        if d.check(word) and word not in dicts:
            dicts[word] = len(dicts)

encode_train_reviews = []
for review in clean_train_reviews:
    each = []
    for word in review:
        if word in dicts:
            each.append(dicts[word])
    if len(each) != 0:
        encode_train_reviews.append(each)

encode_test_reviews = []
for review in clean_test_reviews:
    each = []
    for word in review:
        if word in dicts:
            each.append(dicts[word])
    if len(each) != 0:
        encode_test_reviews.append(each)

assert len(encode_train_reviews) == len(train_labels)
train = (encode_train_reviews, train_labels)
utils.save_pickle("encode_train_reviews.pickle", train)
utils.save_pickle("encode_test_reviews.pickle", encode_test_reviews)
utils.save_pickle("dicts.pickle", dicts)
nlabels = len(label_map)
train, train_lbl = reshape(train, train_lbl, nlabels)
valid, valid_lbl = reshape(valid, valid_lbl, nlabels)
test, test_lbl = reshape(test, test_lbl, nlabels)

# store all in a dict and pickle it
data = {
    'train': train,
    'train_lbl': train_lbl,
    'valid': valid,
    'valid_lbl': valid_lbl,
    'test': test,
    'test_lbl': test_lbl,
}
utils.save_pickle(data, outfile)

print()
print('Train: {}'.format(len(train)))
print('Valid: {}'.format(len(valid)))
print('Test: {}'.format(len(test)))
print('Classes: {}'.format(len(label_map)))
print()
print('Dataset written to {}'.format(outfile))

# save label map, mean image and sizes
meta = {
    'label_map': label_map,
    'mean_image': mean_image,
    'image_size': image_size,
    'margin_size': margin_size
# ngrams, as suggested in the NBSVM paper.

if False:
    s = '''It turns out "why is" that's using. Doesn't it? Can't i'''
    t = tokenize(s)
    print(t)
    assert False

print('Tokenization:')
t0 = time.clock()
train_tokens = [tokenize(s, token_vector) for s in train[COMMENT]]
print('train_tokens: %.1f sec %.2f sec / token' % (
    time.clock() - t0, (time.clock() - t0) / len(train_tokens)))
t0 = time.clock()
test_tokens = [tokenize(s, token_vector) for s in test[COMMENT]]
print('test_tokens: %.1f sec %.2f sec / token' % (
    time.clock() - t0, (time.clock() - t0) / len(test_tokens)))

save_pickle('token.vector.pkl', token_vector)
save_json('train.tokens.json', train_tokens)
save_json('test.tokens.json', test_tokens)

token_vector = load_pickle('token.vector.pkl')
train_tokens = load_json('train.tokens.json')
test_tokens = load_json('test.tokens.json')


def compute_ngram_vector(token_list, n):
    """Compute an embedding vector for all n-grams in token_list."""
    vec = np.zeros((n, SPACY_VECTOR_SIZE), dtype=np.float64)
    n_vecs = len(token_list) - n + 1
    for i in range(n_vecs):
        for j in range(n):
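# ---------------------------------------------------------------------------
# All of the snippets above call a small save_pickle/load_pickle pair from each
# project's own utils module, and the argument order differs between projects
# (some call save_pickle(obj, path), others save_pickle(path, obj)). The pair
# below is only an illustrative sketch of what such helpers typically look
# like, assuming the (obj, path) ordering; it is not taken from any of the
# projects quoted above.
# ---------------------------------------------------------------------------
import pickle


def save_pickle(obj, path):
    """Serialize obj to path using pickle's highest protocol."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)


def load_pickle(path):
    """Load and return the object pickled at path."""
    with open(path, 'rb') as f:
        return pickle.load(f)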