sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

#set topic vector size and load word embedding model if given
if cf.word_embedding_model:
    print "Loading word embedding model..."
    mword = g.Word2Vec.load(cf.word_embedding_model)
    cf.word_embedding_size = mword.vector_size

#first pass to collect vocabulary information
print "First pass on train corpus to collect vocabulary stats..."
idxvocab, vocabxid, tm_ignore = gen_vocab(dummy_symbols, cf.train_corpus, cf.stopwords, cf.vocab_minfreq, \
    cf.vocab_maxfreq, cf.verbose)

#second pass to collect train/valid data for topic and language model
print "Processing train corpus to collect sentence and document data..."
train_sents, train_docs, train_docids, train_stats = gen_data(vocabxid, dummy_symbols, tm_ignore, cf.train_corpus, \
    cf.tm_sent_len, cf.lm_sent_len, cf.verbose, False)
print "Processing valid corpus to collect sentence and document data..."
valid_sents, valid_docs, valid_docids, valid_stats = gen_data(vocabxid, dummy_symbols, tm_ignore, cf.valid_corpus, \
    cf.tm_sent_len, cf.lm_sent_len, cf.verbose, False)

#labels given for documents
train_labels, valid_labels, num_classes = None, None, 0
if hasattr(cf, "train_target") and hasattr(cf, "valid_target"):
    train_labels = [int(item) for item in open(cf.train_target).readlines()]
    valid_labels = [int(item) for item in open(cf.valid_target).readlines()]
    num_classes = max(set(train_labels)) + 1
    cf.num_classes = num_classes

#tags given for documents
train_tags, valid_tags, tagxid, tag_len = None, None, {}, 0
if hasattr(cf, "train_tag") and hasattr(cf, "valid_tag"):
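#--- illustrative sketch only; toy_gen_vocab below is hypothetical, not the
#repo's gen_vocab. The first pass above is assumed to build three structures:
#idxvocab (id -> word), vocabxid (word -> id), and tm_ignore (ids the topic
#model skips, e.g. stopwords and overly frequent words). A minimal version of
#that pass over a whitespace-tokenised corpus might look like:
from collections import Counter

def toy_gen_vocab(corpus_path, stopwords, minfreq, maxfreq_ratio):
    freq = Counter()
    total = 0
    for line in open(corpus_path):
        for w in line.lower().split():
            freq[w] += 1
            total += 1
    idxvocab = ["<pad>", "<s>", "</s>"]  #dummy symbols take the first ids
    idxvocab += [w for w, c in freq.items() if c >= minfreq]
    vocabxid = dict((w, i) for i, w in enumerate(idxvocab))
    #topic model ignores stopwords and words above the max-frequency ratio
    tm_ignore = set(vocabxid[w] for w in idxvocab
                    if w in stopwords or freq[w] > maxfreq_ratio * total)
    return idxvocab, vocabxid, tm_ignore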
texts, labels = load_conll(config.train_path, config.labels_index)
val_texts, val_labels = load_conll(config.dev_path, config.labels_index)
test_texts, test_labels = load_conll(config.test_path, config.labels_index)

# =====================
# build char cnn
# =====================
index_char = load_index(config.char_index)

MAX_WORD_LENGTH = config.word_length
wl = MAX_WORD_LENGTH
# gen_data returns the char-indexed docs plus running max sentence/word lengths
train_char, sl, wl = gen_data(texts, 0, wl, index_char)
val_char, sl, wl = gen_data(val_texts, sl, wl, index_char)
test_char, sl, wl = gen_data(test_texts, sl, wl, index_char)

# round the max sequence length up to an even number
MAX_SEQUENCE_LENGTH = sl
if MAX_SEQUENCE_LENGTH % 2 == 1:
    MAX_SEQUENCE_LENGTH += 1
print(MAX_WORD_LENGTH)
print(MAX_SEQUENCE_LENGTH)

train_data_char = pad_data(train_char, MAX_SEQUENCE_LENGTH, MAX_WORD_LENGTH)
val_data_char = pad_data(val_char, MAX_SEQUENCE_LENGTH, MAX_WORD_LENGTH)
test_data_char = pad_data(test_char, MAX_SEQUENCE_LENGTH, MAX_WORD_LENGTH)
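# --- illustrative sketch only; toy_pad_data below is hypothetical, not the
# repo's pad_data. It is assumed to zero-pad every document to
# MAX_SEQUENCE_LENGTH words and every word to MAX_WORD_LENGTH char ids,
# yielding the dense (num_docs, seq_len, word_len) tensor the char CNN expects:
import numpy as np

def toy_pad_data(docs, max_seq_len, max_word_len):
    out = np.zeros((len(docs), max_seq_len, max_word_len), dtype="int32")
    for i, doc in enumerate(docs):
        for j, word in enumerate(doc[:max_seq_len]):
            chars = word[:max_word_len]
            out[i, j, :len(chars)] = chars  # remaining slots stay zero (pad)
    return out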
" ".join([idxvocab[item] for item in s]) + "\n") ###### #main# ###### #load the vocabulary vocab = cPickle.load(open(os.path.join(args.model_dir, "vocab.pickle"))) idxvocab, tm_ignore, dummy_symbols = vocab[0], vocab[1], vocab[2] pad_symbol, start_symbol, end_symbol = cf.dummy_symbols[0], cf.dummy_symbols[ 1], cf.dummy_symbols[2] vocabxid = dict([(y, x) for x, y in enumerate(idxvocab)]) # input_doc similar to corpus in train sents, docs, docids, stats = gen_data(vocabxid, cf.dummy_symbols, tm_ignore, input_doc) print "Vocab size =", len(idxvocab) labels = None tags = None with tf.Graph().as_default(), tf.Session() as sess: initializer = tf.contrib.layers.xavier_initializer(seed=cf.seed) with tf.variable_scope("model", reuse=None, initializer=initializer): tm = TM(is_training=False, vocab_size=len(idxvocab), batch_size=cf.batch_size, \ num_steps=cf.tm_sent_len, num_classes=cf.num_classes, config=cf) if cf.topic_number > 0 else None lm = LM(is_training=False, vocab_size=len(idxvocab), batch_size=cf.batch_size, \ num_steps=cf.lm_sent_len, config=cf, reuse_conv_variables=True) if cf.rnn_hidden_size > 0 else None #load tensorflow model saver = tf.train.Saver()
parser.add_argument("-d", "--distance", default=0, type=int) parser.add_argument("-l", "--latency", default=40, type=int) parser.add_argument("-s", "--states", default=4, type=int) parser.add_argument("-c", "--clusters", default=5, type=int) parser.add_argument("--max-iter", default=5, type=int) args = parser.parse_args() stock = args.stock.history(period="max") period = args.range latency = args.latency num_states = args.states num_clusters = args.clusters max_iter = args.max_iter pred_dist = args.distance orig, data = util.gen_data(stock) pred_change = predict(data, 4, period[0], period[1], pred_dist=pred_dist, num_states=num_states, num_clusters=num_clusters, max_iter=max_iter, latency=latency) pred_val = [ orig.Open[np.where(orig.index == period[0])[0] - pred_dist].values[0] * (1 + pred_change[0]) ] for i in range(1, len(pred_change)):
#load config
cf_dict = pickle.load(open(os.path.join(args.model_dir, "config.pickle"), 'rb'))
if "num_classes" not in cf_dict:
    cf_dict["num_classes"] = 0
if "num_tags" not in cf_dict:
    cf_dict["num_tags"] = 0
    cf_dict["tag_len"] = 0
    cf_dict["tag_embedding_size"] = 0
ModelConfig = namedtuple("ModelConfig", " ".join(cf_dict.keys()))
cf = ModelConfig(**cf_dict)

#parse and collect the documents
if args.input_doc:
    sents, docs, docids, stats = gen_data(vocabxid, dummy_symbols, tm_ignore, args.input_doc, \
        cf.tm_sent_len, cf.lm_sent_len, cf.verbose, False)

    #print document statistics
    print("Vocab size =", len(idxvocab))
    print_corpus_stats("Documents statistics", sents, docs, stats)

#collect the labels given for the documents
if args.input_label:
    labels = [int(item) for item in open(args.input_label).readlines()]
else:
    labels = None

#collect the tags
if cf.num_tags > 0:
    if not args.input_tag:
        sys.stderr.write("Error: Saved model is trained with document tags; " + \
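#--- minimal, self-contained sketch of the namedtuple config pattern above;
#the toy_* names are hypothetical. Backfilling missing keys before building
#the namedtuple keeps configs pickled by older code compatible with newer code:
from collections import namedtuple

toy_dict = {"batch_size": 64, "seed": 1}
toy_dict.setdefault("num_classes", 0)  #default for configs saved before labels existed
ToyConfig = namedtuple("ToyConfig", " ".join(toy_dict.keys()))
toy_cf = ToyConfig(**toy_dict)
assert toy_cf.num_classes == 0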
    return wm


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("stock", type=yf.Ticker)
    parser.add_argument("-tp", "--train-period", default=240, type=int)
    parser.add_argument("-l", "--latency", default=40, type=int)
    parser.add_argument("-s", "--states", default=4, type=int)
    parser.add_argument("-c", "--clusters", default=5, type=int)
    parser.add_argument("--max_iter", default=5, type=int)
    args = parser.parse_args()

    stock = args.stock.history(period="max")
    train_period = args.train_period
    latency = args.latency
    num_states = args.states
    num_clusters = args.clusters
    max_iter = args.max_iter

    data = util.gen_data(stock)[1]
    pred = predict(data,
                   4,
                   train_period,
                   num_states=num_states,
                   num_clusters=num_clusters,
                   max_iter=max_iter,
                   latency=latency)
    #use abs() so a negative prediction prints "-3.21" rather than "--3.21"
    print("Prediction for tomorrow: " + ("+" if pred >= 0 else "-") +
          str(round(abs(pred) * 100, 2)))
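# --- example invocation (hedged: the script filename is hypothetical) ---
# Assuming this file is saved as predict_next.py, a run with the defaults
# above would look like:
#   python predict_next.py AAPL -tp 240 -l 40 -s 4 -c 5 --max_iter 5
# argparse constructs yf.Ticker("AAPL") from the positional argument, and the
# script then fetches that ticker's full price history before predicting.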