def read_yeast():
    # Read the yeast data
    dataOff = data_reader('./data/datayeastoff.txt')
    dataOff = np.delete(dataOff, 0, 0)  # delete the first row (misread headers)
    dataOn = data_reader('./data/datayeaston.txt')
    dataOn = np.delete(dataOn, 0, 0)  # delete the first row (misread headers)

    # merged_data = np.vstack((dataOn, dataOff))  # merge the on + off datasets

    # Return a list of time series segments
    data_list = []
    data_list.append(dataOn)   # append the first time series segment
    data_list.append(dataOff)  # append the second time series segment

    # Set the true incidence matrix defined by the literature
    # true_inc = [
    #     [0, 1, 0, 0, 0],
    #     [0, 0, 1, 1, 0],
    #     [1, 0, 0, 1, 1],
    #     [0, 1, 0, 0, 0],
    #     [1, 0, 0, 0, 0]
    # ]
    true_inc = [
        [0, 0, 1, 0, 1],
        [1, 0, 0, 1, 0],
        [0, 1, 0, 0, 0],
        [0, 1, 1, 0, 0],
        [0, 0, 1, 0, 0]
    ]
    return data_list, true_inc
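# The loader above assumes a `data_reader` helper that parses a
# whitespace-delimited numeric text file into a 2-D NumPy array, with an
# unparseable header row that the caller drops via np.delete. A minimal
# sketch under that assumption (not the project's actual reader) follows.

import numpy as np

def data_reader(path):
    # Sketch only: genfromtxt turns a non-numeric header row into NaNs,
    # which read_yeast() then removes with np.delete(..., 0, 0).
    return np.genfromtxt(path)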
def _init(self):
    """Init common dic, access vocab by dictionary attr or load method."""
    common_dic = data_reader(self._common_dic_path)
    vocab = common_dic.strip().split('\n')
    for word in vocab:
        if word:
            self.dictionary.add(word)
    logging.info("Initialized `{}` common words from file `{}`.".format(
        len(vocab), self._common_dic_path))
def run(conf, only_testmode):
    if -1 in conf.firstN:
        reader = data_reader(data_dir=conf.data_dir, filename='train',
                             batch_size=conf.batch)
    else:
        reader = data_reader_firstN(data_dir=conf.data_dir, filename='train',
                                    batch_size=conf.batch, from_to=conf.firstN)
    conf.class_divpnt = reader.class_divpnt
    conf.n_tracks = reader.num_tracks
def add_from_file(self, vocab_file):
    """Add words to the common dictionary from a vocab file.

    Args:
        vocab_file: user vocab file path.
    """
    user_dic = data_reader(vocab_file)
    vocab = user_dic.strip().split('\n')
    with codecs.open(self._common_dic_path, 'a+') as fo:
        for word in vocab:
            if word not in self.dictionary:
                self.dictionary.add(word)
                fo.write('\n' + word)
                logging.info("Added word `{}` to file `{}`.".format(
                    word, self._common_dic_path))
def main():
    print('Starting...')
    data = data_reader(args.target)
    # solution = LocalSearch(op_idx=2)
    # solution = VariableLocalSearch(op_idx1=0, op_idx2=2, keep_invariant=1000, keep_invariant_max=2000)
    # solution = SA(op_idx=0, init_coeff=0.9, init_inner_time=200, stop_temp=1e-2, alpha=0.98)
    solution = GA(population_size=200, cross_rate=[0.3, 0.5],
                  mutation_rate=[0.1, 0.5], keep_invariant=50)
    tsp = TSP(solution, data, euclidean_dist)
    # `threshhold` keeps the spelling used by TSP.run's signature.
    tsp.run(threshhold=args.thresh, savepath=args.savepath,
            save_freq=args.save_freq, print_freq=args.print_freq,
            max_iteration=args.max_itr)
    if args.savepath is not None:
        generate_gif(args.savepath)
        plot(args.savepath)
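# main() hands a `euclidean_dist` callable to TSP. The function is not shown
# in this snippet; a plausible minimal version, assuming it takes two city
# coordinate pairs, is sketched below.

import numpy as np

def euclidean_dist(a, b):
    # Sketch only: straight-line distance between two 2-D points.
    return np.linalg.norm(np.asarray(a) - np.asarray(b))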
def remove_from_file(self, vocab_file):
    """Remove words from the common dictionary by vocab file.

    Args:
        vocab_file: user vocab file path.
    """
    user_dic = data_reader(vocab_file)
    vocab = user_dic.strip().split('\n')
    for word in vocab:
        if word in self.dictionary:
            self.dictionary.remove(word)
            logging.info("Removed word `{}` from file `{}`.".format(
                word, self._common_dic_path))
    with codecs.open(self._common_dic_path, 'w') as fo:
        for word in self.dictionary:
            fo.write(word + '\n')
def __init__(self, corpus_file, common_words_file=None, min_candidate_len=2,
             max_candidate_len=5, least_cnt_threshold=5,
             solid_rate_threshold=0.018, entropy_threshold=1.92,
             all_words=False):
    if not corpus_file:
        raise ValueError("Corpus file is empty, please specify a corpus file path.")
    self._document = data_reader(corpus_file, cn_only=True)
    self._common_dic = common_words_file
    self._min_candidate_len = min_candidate_len
    self._max_candidate_len = max_candidate_len
    self._least_cnt_threshold = least_cnt_threshold
    self._solid_rate_threshold = solid_rate_threshold
    self._entropy_threshold = entropy_threshold
    self._all_words = all_words
    if not self._all_words:
        self.dictionary = Dictionary(self._common_dic)
    else:
        logging.warning("Extract-all-words mode: if you only want new words, "
                        "set all_words=False for new-words mode.")
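# For context, constructing the extractor might look like the sketch below.
# The class name is not visible in this snippet, so `WordExtractor` is a
# placeholder, and the file paths are illustrative only.

extractor = WordExtractor(
    corpus_file='corpus.txt',
    common_words_file='common_words.txt',
    solid_rate_threshold=0.018,  # cohesion floor for candidate n-grams
    entropy_threshold=1.92,      # boundary-entropy floor
)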
def get_dev_examples(self, data_dir):
    """Load dev examples."""
    return data_reader(os.path.join(self.data_dir, "dev.tsv"), self.vocab,
                       self.num_examples, "dev")
def get_train_examples(self, data_dir, epoch=1):
    """Load training examples."""
    return data_reader(os.path.join(self.data_dir, "train.tsv"), self.vocab,
                       self.num_examples, "train", epoch)
def run(conf, only_testmode):
    if -1 in conf.firstN:
        reader = data_reader(data_dir=conf.data_dir, filename='train',
                             batch_size=conf.batch)
    else:
        reader = data_reader_firstN(data_dir=conf.data_dir, filename='train',
                                    batch_size=conf.batch, from_to=conf.firstN)
    conf.n_tracks = reader.num_tracks
    conf.n_input = reader.num_items
    conf.n_output = reader.num_items
    conf.charsize = reader.num_char
    conf.strmaxlen = reader.max_title_len

    kp_range = conf.input_kp
    test_seed = conf.test_seed
    update_seed = conf.update_seed

    readers_test = {}
    for seed in test_seed:
        readers_test[seed] = data_reader_test(data_dir=conf.data_dir,
                                              filename=seed,
                                              batch_size=conf.batch,
                                              test_num=conf.testsize)

    info = None
    model = None
    print(conf.n_input)
    model_title = None
    if conf.mode == 'pretrain':
        info = '[pretrain mode]'
        model = DAE_tied(conf)
    elif conf.mode == 'dae':
        if only_testmode:
            conf.initval = conf.save
        info = '[dae mode]'
        model = DAE(conf)
    elif conf.mode == 'title':
        info = '[title mode]'
        model_title = get_model(conf)
        model = DAE_title(conf, model_title.output)

    info += ' start at ' + str(datetime.datetime.now())
    log_write(conf, '*' * 10)
    log_write(conf, info)

    model.fit()
    sess = tf.Session()
    sess.run(model.init_op)
    saver = tf.train.Saver()

    epoch = 0
    max_eval = 0.0
    iter = 0
    loss = 0.0

    # If test mode is specified, just evaluate the result; no training session.
    if only_testmode:
        log_write(conf, '<<only test mode>>')
        if conf.mode == 'title':
            saver.restore(sess, conf.save)
        for seed_num, reader_test in readers_test.items():
            log_write(conf, "seed num: " + seed_num)
            rprec, ndcg, rsc = eval(reader_test, conf, sess, model, model_title)
            r = show_result(rprec, ndcg, rsc)
            log_write(conf, r)
        return

    while True:
        start_idx = reader.train_idx
        trk_positions, art_positions, y_positions, titles, trk_val, art_val = reader.next_batch()
        end_idx = reader.train_idx
        input_kp = random.uniform(kp_range[0], kp_range[-1])

        if conf.mode in ['pretrain', 'dae']:
            rand_int = np.random.randint(2)
            if rand_int == 0:
                _, l = sess.run([model.optimizer, model.cost],
                                feed_dict={model.x_positions: trk_positions,
                                           model.x_ones: trk_val,
                                           model.y_positions: y_positions,
                                           model.y_ones: np.ones(len(y_positions)),
                                           model.keep_prob: conf.kp,
                                           model.input_keep_prob: input_kp})
            elif rand_int == 1:
                _, l = sess.run([model.optimizer, model.cost],
                                feed_dict={model.x_positions: art_positions,
                                           model.x_ones: art_val,
                                           model.y_positions: y_positions,
                                           model.y_ones: np.ones(len(y_positions)),
                                           model.keep_prob: conf.kp,
                                           model.input_keep_prob: input_kp})
        elif conf.mode == 'title':
            _, l = sess.run([model.optimizer, model.cost],
                            feed_dict={model.x_positions: y_positions,
                                       model.x_ones: np.ones(len(y_positions)),
                                       model.y_positions: y_positions,
                                       model.y_ones: np.ones(len(y_positions)),
                                       model_title.titles: titles,
                                       model.keep_prob: conf.kp,
                                       model_title.keep_prob: conf.title_kp,
                                       model.input_keep_prob: input_kp,
                                       model.titles_use: [[1]] * conf.batch})
        loss += l
        iter += 1

        if start_idx > end_idx or end_idx == 0:
            epoch += 1
            loss = loss / iter
            if epoch >= 0:
                log_write(conf, "epoch " + str(epoch))
                log_write(conf, "training loss: " + str(loss))
                cur_eval = 0
                for seed_num, reader_test in readers_test.items():
                    log_write(conf, "seed num: " + seed_num)
                    rprec, ndcg, rsc = eval(reader_test, conf, sess, model, model_title)
                    r = show_result(rprec, ndcg, rsc)
                    log_write(conf, r)
                    if seed_num in update_seed:
                        cur_eval += rprec
                if cur_eval >= max_eval:
                    if conf.mode in ['pretrain', 'dae']:
                        model.save_model(sess)
                    elif conf.mode == 'title':
                        saver.save(sess, conf.save)
                    max_eval = cur_eval
                    log_write(conf, "The highest score is updated. "
                                    "Parameters are saved.")
            loss = 0
            iter = 0
            if epoch == conf.epochs:
                break
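# run() leans on helpers such as show_result() that are not shown in this
# snippet. A guess at its shape, assuming it only formats the three metrics
# returned by eval() (the real function may report more):

def show_result(rprec, ndcg, rsc):
    # Sketch only: pretty-print R-precision, NDCG, and recommended-song clicks.
    return "rprec: {:.4f}, ndcg: {:.4f}, rsc: {:.4f}".format(rprec, ndcg, rsc)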
def get_test_examples(self, data_dir, epoch):
    """Load test examples."""
    return data_reader(os.path.join(self.data_dir, "test.tsv"), self.vocab,
                       self.num_examples, "infer", epoch)
def get_dev_examples(self, data_dir, epoch):
    """Load dev examples."""
    return data_reader(os.path.join(self.data_dir, "dev.tsv"), self.vocab,
                       self.num_examples, "dev", epoch)
def get_dev_examples(self, data_dir, epoch, shuffle):
    """Load dev examples."""
    return data_reader(os.path.join(self.data_dir, "dev.tsv"), self.vocab,
                       self.num_examples, "dev", epoch, shuffle)
def get_test_examples(self, data_dir):
    """Load test examples."""
    return data_reader(os.path.join(self.data_dir, "test.tsv"), self.vocab,
                       self.num_examples, "test")
def get_train_examples(self, data_dir, epoch):
    """Load training examples."""
    return data_reader(os.path.join(self.data_dir, "train.tsv"), self.vocab,
                       self.num_examples, "train", epoch)
def get_infer_examples(self, data_dir):
    """Load infer queries."""
    return data_reader(os.path.join(self.data_dir, "infer.tsv"), self.vocab,
                       self.num_examples, "infer")
def get_train_examples(self, data_dir, epoch, shuffle):
    """Load training examples."""
    return data_reader(os.path.join(self.data_dir, "train.tsv"), self.vocab,
                       self.num_examples, "train", epoch, shuffle)
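# All of the get_*_examples wrappers above delegate to a
# data_reader(path, vocab, num_examples, phase, epoch, shuffle) generator
# whose implementation is not shown. A schematic version under those
# signature assumptions (the real reader presumably tokenizes fields with
# `vocab` and tracks counts in `num_examples`):

import random

def data_reader(file_path, vocab, num_examples, phase, epoch=1, shuffle=False):
    # Sketch only: yield tab-separated rows for `epoch` passes over the file,
    # optionally shuffling within each pass.
    def reader():
        for _ in range(epoch):
            with open(file_path) as f:
                lines = [line.rstrip('\n') for line in f]
            if shuffle:
                random.shuffle(lines)
            for line in lines:
                yield line.split('\t')
    return reader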