sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

#set topic vector size and load word embedding model if given
if cf.word_embedding_model:
    print "Loading word embedding model..."
    mword = g.Word2Vec.load(cf.word_embedding_model)
    cf.word_embedding_size = mword.vector_size

#first pass to collect vocabulary information
print "First pass on train corpus to collect vocabulary stats..."
idxvocab, vocabxid, tm_ignore = gen_vocab(dummy_symbols, cf.train_corpus, cf.stopwords, cf.vocab_minfreq, \
    cf.vocab_maxfreq, cf.verbose)
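# A hedged sketch of the kind of pass gen_vocab presumably makes, assuming it
# returns (idxvocab, vocabxid, tm_ignore): an index-to-word list, a word-to-index
# map, and the set of word ids the topic model ignores (stopwords plus words
# outside the [minfreq, maxfreq] band). Everything below is illustrative, not the
# project's actual implementation.
from collections import Counter

def build_vocab_sketch(tokenized_docs, dummy_symbols, stopwords, minfreq, maxfreq_frac):
    freq = Counter(w for doc in tokenized_docs for w in doc)
    idxvocab = list(dummy_symbols) + \
        sorted(w for w, c in freq.items() if c >= minfreq and w not in dummy_symbols)
    vocabxid = dict((w, i) for i, w in enumerate(idxvocab))
    maxfreq = maxfreq_frac * sum(freq.values())  # treat vocab_maxfreq as a corpus fraction
    tm_ignore = set(vocabxid[w] for w in idxvocab
                    if w in stopwords or w in dummy_symbols or freq[w] > maxfreq)
    return idxvocab, vocabxid, tm_ignore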

#second pass to collect train/valid data for topic and language model
print "Processing train corpus to collect sentence and document data..."
train_sents, train_docs, train_docids, train_stats = gen_data(vocabxid, dummy_symbols, tm_ignore, cf.train_corpus, \
    cf.tm_sent_len, cf.lm_sent_len, cf.verbose, False)
print "Processing valid corpus to collect sentence and document data..."
valid_sents, valid_docs, valid_docids, valid_stats = gen_data(vocabxid, dummy_symbols, tm_ignore, cf.valid_corpus, \
    cf.tm_sent_len, cf.lm_sent_len, cf.verbose, False)

#labels given for documents
train_labels, valid_labels, num_classes = None, None, 0
if hasattr(cf, "train_target") and hasattr(cf, "valid_target"):
    # label files are expected to contain one integer class id per line
    train_labels = [int(item) for item in open(cf.train_target).readlines()]
    valid_labels = [int(item) for item in open(cf.valid_target).readlines()]
    # assumes class ids are 0-indexed, so the largest id + 1 gives the class count
    num_classes = max(train_labels) + 1
cf.num_classes = num_classes

#tags given for documents
train_tags, valid_tags, tagxid, tag_len = None, None, {}, 0
if hasattr(cf, "train_tag") and hasattr(cf, "valid_tag"):
Example #2
texts, labels = load_conll(config.train_path, config.labels_index)
val_texts, val_labels = load_conll(config.dev_path, config.labels_index)
# texts, labels = load_conll('keras_data/1.txt')
test_texts, test_labels = load_conll(config.test_path, config.labels_index)

# =====================
# build char cnn
# =====================
index_char = load_index(config.char_index)
# print(index_char)

MAX_WORD_LENGTH = config.word_length
wl = MAX_WORD_LENGTH

train_char, sl, wl = gen_data(texts, 0, wl, index_char)
val_char, sl, wl = gen_data(val_texts, sl, wl, index_char)
test_char, sl, wl = gen_data(test_texts, sl, wl, index_char)

MAX_SEQUENCE_LENGTH = sl

# keep the sequence length even (presumably so that a later stride-2 pooling /
# downsampling step in the char CNN divides it cleanly)
if MAX_SEQUENCE_LENGTH % 2 == 1:
    MAX_SEQUENCE_LENGTH += 1
print(MAX_WORD_LENGTH)
print(MAX_SEQUENCE_LENGTH)

train_data_char = pad_data(train_char, MAX_SEQUENCE_LENGTH, MAX_WORD_LENGTH)
val_data_char = pad_data(val_char, MAX_SEQUENCE_LENGTH, MAX_WORD_LENGTH)
test_data_char = pad_data(test_char, MAX_SEQUENCE_LENGTH, MAX_WORD_LENGTH)

# print(np.shape(train_char))
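# The example is cut off before the model itself is defined. A minimal, hedged
# sketch of a character-level CNN encoder consuming the padded tensors above
# (shape: num_sentences x MAX_SEQUENCE_LENGTH x MAX_WORD_LENGTH of char ids)
# might look like the following, assuming TensorFlow 2.x / Keras; the layer
# sizes and names are illustrative, not taken from config.
import tensorflow as tf
from tensorflow.keras import layers

def build_char_cnn(num_chars, max_seq_len, max_word_len,
                   char_dim=30, filters=64, kernel_size=3):
    char_input = layers.Input(shape=(max_seq_len, max_word_len), dtype="int32")
    # embed each character id: (batch, seq, word, char_dim)
    x = layers.Embedding(num_chars, char_dim)(char_input)
    # convolve over the characters of each word independently
    x = layers.TimeDistributed(
        layers.Conv1D(filters, kernel_size, padding="same", activation="relu"))(x)
    # max-pool over characters, leaving one vector per word: (batch, seq, filters)
    x = layers.TimeDistributed(layers.GlobalMaxPooling1D())(x)
    return tf.keras.Model(char_input, x)

# e.g. char_encoder = build_char_cnn(len(index_char) + 1, MAX_SEQUENCE_LENGTH, MAX_WORD_LENGTH)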
Example #3
                             " ".join([idxvocab[item] for item in s]) + "\n")


######
#main#
######

#load the vocabulary
vocab = cPickle.load(open(os.path.join(args.model_dir, "vocab.pickle"), "rb"))
idxvocab, tm_ignore, dummy_symbols = vocab[0], vocab[1], vocab[2]
pad_symbol, start_symbol, end_symbol = \
    cf.dummy_symbols[0], cf.dummy_symbols[1], cf.dummy_symbols[2]
vocabxid = dict((w, i) for i, w in enumerate(idxvocab))

# input_doc similar to corpus in train
sents, docs, docids, stats = gen_data(vocabxid, cf.dummy_symbols, tm_ignore,
                                      input_doc)
print "Vocab size =", len(idxvocab)

labels = None
tags = None

with tf.Graph().as_default(), tf.Session() as sess:
    initializer = tf.contrib.layers.xavier_initializer(seed=cf.seed)
    with tf.variable_scope("model", reuse=None, initializer=initializer):
        tm = TM(is_training=False, vocab_size=len(idxvocab), batch_size=cf.batch_size, \
            num_steps=cf.tm_sent_len, num_classes=cf.num_classes, config=cf) if cf.topic_number > 0 else None
        lm = LM(is_training=False, vocab_size=len(idxvocab), batch_size=cf.batch_size, \
            num_steps=cf.lm_sent_len, config=cf, reuse_conv_variables=True) if cf.rnn_hidden_size > 0 else None

    #load tensorflow model
    saver = tf.train.Saver()
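    # The source example ends here. Restoring the saved weights would presumably
    # look something like the line below, assuming a standard checkpoint layout
    # under args.model_dir (the exact checkpoint path is not shown above).
    saver.restore(sess, tf.train.latest_checkpoint(args.model_dir))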
Example #4
    parser.add_argument("-d", "--distance", default=0, type=int)
    parser.add_argument("-l", "--latency", default=40, type=int)
    parser.add_argument("-s", "--states", default=4, type=int)
    parser.add_argument("-c", "--clusters", default=5, type=int)
    parser.add_argument("--max-iter", default=5, type=int)
    args = parser.parse_args()

    stock = args.stock.history(period="max")
    period = args.range
    latency = args.latency
    num_states = args.states
    num_clusters = args.clusters
    max_iter = args.max_iter
    pred_dist = args.distance

    orig, data = util.gen_data(stock)
    pred_change = predict(data,
                          4,
                          period[0],
                          period[1],
                          pred_dist=pred_dist,
                          num_states=num_states,
                          num_clusters=num_clusters,
                          max_iter=max_iter,
                          latency=latency)
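    # seed the predicted price series with the actual opening price pred_dist
    # rows before the start of the prediction window, scaled by the first
    # predicted fractional change (price * (1 + change))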
    pred_val = [
        orig.Open[np.where(orig.index == period[0])[0] - pred_dist].values[0] *
        (1 + pred_change[0])
    ]

    for i in range(1, len(pred_change)):
Example #5
#load config
cf_dict = pickle.load(open(os.path.join(args.model_dir, "config.pickle"),
                           'rb'))
if "num_classes" not in cf_dict:
    cf_dict["num_classes"] = 0
if "num_tags" not in cf_dict:
    cf_dict["num_tags"] = 0
    cf_dict["tag_len"] = 0
    cf_dict["tag_embedding_size"] = 0
ModelConfig = namedtuple("ModelConfig", " ".join(cf_dict.keys()))
cf = ModelConfig(**cf_dict)

#parse and collect the documents
if args.input_doc:
    sents, docs, docids, stats = gen_data(vocabxid, dummy_symbols, tm_ignore, args.input_doc, \
        cf.tm_sent_len, cf.lm_sent_len, cf.verbose, False)
    #print document statistics
    print("Vocab size =", len(idxvocab))
    print_corpus_stats("Documents statistics", sents, docs, stats)

#collect the labels
#labels given for documents
if args.input_label:
    labels = [int(item) for item in open(args.input_label).readlines()]
else:
    labels = None

#collect the tags
if cf.num_tags > 0:
    if not args.input_tag:
        sys.stderr.write("Error: Saved model is trained with document tags; " + \
Example #6
    return wm


if (__name__ == "__main__"):
    parser = argparse.ArgumentParser()
    parser.add_argument("stock", type=yf.Ticker)
    parser.add_argument("-tp", "--train-period", default=240, type=int)
    parser.add_argument("-l", "--latency", default=40, type=int)
    parser.add_argument("-s", "--states", default=4, type=int)
    parser.add_argument("-c", "--clusters", default=5, type=int)
    parser.add_argument("--max_iter", default=5, type=int)
    args = parser.parse_args()

    stock = args.stock.history(period="max")
    train_period = args.train_period
    latency = args.latency
    num_states = args.states
    num_clusters = args.clusters
    max_iter = args.max_iter

    data = util.gen_data(stock)[1]
    pred = predict(data,
                   4,
                   train_period,
                   num_states=num_states,
                   num_clusters=num_clusters,
                   max_iter=max_iter,
                   latency=latency)
    print("Prediction for tomorrow: " + ("+" if pred >= 0 else "-") +
          str(round(pred * 100, 2)))
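# Hypothetical invocation (script name and ticker assumed, not from the source):
#   python hmm_stock_predict.py MSFT --train-period 240 --states 4 --clusters 5 --max_iter 5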