def save_headlines(headlines):
    """Clean every headline and pass the resulting rows to ``add_headlines``.

    ``headlines`` is walked as ``headlines[stock][source][date]`` -> iterable
    of raw headline strings. For each stock the word/replacement pairs of the
    ``dictionary`` table (longest word first) are fetched once and handed to
    ``clean_headline`` together with every raw headline. One batch of
    ``(stock, date, source, cleaned, raw, -999)`` tuples is submitted per stock.
    """
    for ticker, by_source in headlines.items():
        with db() as (conn, cur):
            cur.execute(
                "SELECT word, replacement FROM dictionary WHERE stock=? ORDER BY LENGTH(word) DESC",
                [ticker])
            replacements = cur.fetchall()

        # NOTE(review): -999 looks like a placeholder for a value filled in
        # later -- confirm against add_headlines.
        rows = [
            (ticker, day, origin, clean_headline(raw, replacements), raw, -999)
            for origin, by_date in by_source.items()
            for day, raw_items in by_date.items()
            for raw in raw_items
        ]
        add_headlines(rows)
def get_embedding_matrix(tokenizer, pretrained_file='glove.840B.300d.txt', purge=False):
    """Load Vectors from Glove File

    Builds a ``(vocab_size + 1, emb_size)`` matrix (both sizes are read from
    the enclosing module) in which row ``i`` holds the pretrained vector of
    the word that ``tokenizer.word_index`` maps to ``i``. Rows of words that
    have no pretrained vector stay zero.

    Args:
        tokenizer: object exposing ``word_index`` (word -> row index).
        pretrained_file: vector file name, joined onto ``../data``. A falsy
            value skips loading entirely.
        purge: when true, for every tokenizer word without a vector that is
            not present in the ``dictionary`` table under stock ``"none"``,
            all headlines whose content contains the word are deleted.

    Returns:
        ``(embedding_matrix, glove_db)``. ``glove_db`` maps each kept word to
        its vector and is ``None`` when no file was requested.
    """
    print("Loading WordVecs...")

    embedding_matrix = np.zeros((vocab_size + 1, emb_size))

    if not pretrained_file:
        return embedding_matrix, None

    ## Load Glove File (Super Slow) ##
    glove_db = {}

    with open(os.path.join('..', 'data', pretrained_file), 'r', encoding="utf-8") as glove:
        for line in glove:
            values = line.rstrip().split(' ')
            word = values[0].replace('-', '').replace('_', '').lower()

            # Filter before parsing: tokens that themselves contain spaces
            # (e.g. ". . .") leave non-numeric pieces in values[1:], which
            # used to abort the whole load with a ValueError.
            if not word.isalpha():
                continue

            try:
                coefs = np.asarray(values[1:], dtype='float32')
            except ValueError:
                # Malformed row (multi-token entry starting with a word).
                continue

            glove_db[word] = coefs

    print('Loaded WordVectors...' + str(len(glove_db)))

    ## Set Embeddings ##
    for word, i in tokenizer.word_index.items():
        embedding_vector = glove_db.get(word)

        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        elif purge:
            with db() as (conn, cur):
                cur.execute(
                    "SELECT 1 FROM dictionary WHERE word=? AND stock=?",
                    [word, "none"])

                if not cur.fetchall():
                    print("Purge..." + word)
                    cur.execute("DELETE FROM headlines WHERE content LIKE ?",
                                ["%" + word + "%"])
                    conn.commit()

    return embedding_matrix, glove_db
# Flask front end for a search page. Only the start of the module is visible
# here; the body of `search` continues beyond this fragment.
from flask import Flask, request, render_template
from Database import db, get_json, idf_db
from wtforms import Form, StringField
from collections import defaultdict
import math
import time

app = Flask(__name__)

# Time the three Database loaders that run at import.
t1 = time.time()
db = db()  # NOTE(review): rebinding shadows the imported `db` callable with its result
j = get_json()
i = idf_db()
t2 = time.time()
print("Read time: " + str(t2 - t1))


class SearchBar(Form):
    """Form with a single text field labelled "Search: "."""
    word = StringField(label="Search: ")


@app.route('/', methods=['GET', 'POST'])
def index():
    """Render index.html with a SearchBar bound to the submitted form data."""
    word = SearchBar(request.form)
    return render_template("index.html", word=word)


@app.route('/Search')
def search():
    # Everything after the first '=' of the request path is taken as the
    # query and split on '+' into individual words.
    words = request.full_path[request.full_path.find('=') + 1:].split("+")
    t3 = time.time()
def make_headline_to_effect_data():
    """
    Headline -> Effect

    Creates essentially the X, Y data for the embedding model to use
    when analyzing/encoding headlines. Returns a list of headlines and
    a list of corresponding 'effects' which represent a change in the stock price.

    For every stock in ``stocks`` and every tick date on or after that
    stock's earliest headline date:

    * headlines dated between ``add_time(event_date, -14)`` and the date are
      collected; dates with fewer than ``sample_size`` headlines are skipped;
    * the ``tick_window`` most recent tick rows are standardized per column;
    * the effect is the relative change from the latest ``adjclose`` to the
      average ``adjclose`` between ``add_time(event_date, 1)`` and
      ``add_time(event_date, 4)``;
    * ``len(headlines) // sample_size`` random samples are drawn without
      replacement, weighting each headline by ``1 / (age_in_days + 1)``.

    Returns:
        ``(all_headlines, tick_hists, effects, test_indexes)``. Each entry of
        ``all_headlines`` is a list of ``(date, source, cleaned_text,
        age_in_days)`` tuples; the two arrays are aligned with it, and
        ``test_indexes`` holds the positions whose event date is later than
        ``test_cutoff``.
    """
    all_headlines, all_tick_hist, all_effects, test_indexes = [], [], [], []

    with db() as (conn, cur):

        for stock in stocks:

            ## Headline For Every Date ##

            cur.execute(
                "SELECT DISTINCT date FROM headlines WHERE stock=? ORDER BY date ASC LIMIT 1",
                [stock])
            first_headline = cur.fetchall()
            if not first_headline:
                # Stock without any stored headline: indexing [0][0] here
                # used to raise IndexError and abort the whole build.
                continue
            start_date = first_headline[0][0]

            cur.execute(
                "SELECT DISTINCT date FROM ticks WHERE stock=? AND date >= ? ORDER BY date ASC",
                [stock, start_date])
            dates = [row[0] for row in cur.fetchall()]

            for date in tqdm_notebook(dates, desc=stock):

                ## Collect Headlines ##

                event_date = datetime.strptime(date, '%Y-%m-%d')

                cur.execute(
                    "SELECT date, source, rawcontent FROM headlines WHERE stock=? AND date BETWEEN ? AND ? ORDER BY date DESC",
                    [stock, add_time(event_date, -14), date])
                headlines = [
                    (h_date, source, clean(content),
                     (event_date - datetime.strptime(h_date, '%Y-%m-%d')).days)
                    for (h_date, source, content) in cur.fetchall() if content
                ]

                if len(headlines) < sample_size:
                    continue

                ## Find corresponding tick data ##

                cur.execute(
                    """SELECT open, high, low, adjclose, volume FROM ticks WHERE stock=? AND date BETWEEN ? AND ? ORDER BY date DESC""",
                    [
                        stock,
                        add_time(event_date, -30 - tick_window),
                        add_time(event_date, 0)
                    ])
                before_headline_ticks = cur.fetchall()[:tick_window]

                if len(before_headline_ticks) != tick_window:
                    continue

                cur.execute(
                    """SELECT AVG(adjclose) FROM ticks WHERE stock=? AND date BETWEEN ? AND ? ORDER BY date""",
                    [stock, add_time(event_date, 1), add_time(event_date, 4)])
                after_headline_ticks = cur.fetchall()

                if not after_headline_ticks:
                    continue

                # Rows are newest first, so row 0 is the latest known tick.
                previous_tick = before_headline_ticks[0][3]
                result_tick = after_headline_ticks[0][0]

                if not previous_tick or not result_tick:
                    continue

                # Standardize each tick column over the window.
                tick_hist = np.array(before_headline_ticks)
                tick_hist -= np.mean(tick_hist, axis=0)
                tick_hist /= np.std(tick_hist, axis=0)

                ## Create training example ##

                # Newer headlines (smaller age) get a larger sampling weight.
                probs = np.array([1 / (headline[3] + 1) for headline in headlines])
                probs = probs / np.sum(probs)

                num_samples = len(headlines) // sample_size

                effect = [(result_tick - previous_tick) / previous_tick]

                for _ in range(num_samples):

                    indexes = np.random.choice(np.arange(len(headlines)),
                                               sample_size,
                                               replace=False,
                                               p=probs)

                    sample = [headlines[idx] for idx in indexes]

                    if event_date > test_cutoff:  # Mark as Test Example
                        test_indexes.append(len(all_headlines))

                    all_headlines.append(sample)
                    all_tick_hist.append(tick_hist)
                    all_effects.append(effect)

    return all_headlines, np.array(all_tick_hist), np.array(
        all_effects), np.array(test_indexes)
def predict(stock, model=None, toke=None, current_date=None, predict_date=None):
    """Predict the price effect for ``stock`` from its recent headlines and ticks.

    Args:
        stock: ticker used to filter the ``headlines`` and ``ticks`` tables.
        model: fitted model exposing ``predict``; loaded from
            ``../models/media-headlines-ticks-<model_type>.h5`` when omitted.
        toke: tokenizer passed to ``encode_sentences``; unpickled from
            ``../models/toke2-tick.pkl`` when omitted.
        current_date: ``datetime`` the prediction is made for (default: now).
        predict_date: accepted for backward compatibility; not used by this
            function.

    Returns:
        ``(predictions, prices)``: column 0 of the model output for each
        sampled headline group, and
        ``predictions * 0.023 * latest_adjclose + latest_adjclose``.

    Raises:
        IndexError: if no tick rows exist in the look-back window.
    """
    import keras.metrics
    # Expose the custom metric under keras.metrics before load_model runs.
    keras.metrics.correct_sign_acc = correct_sign_acc

    # Load only what the caller did not supply. Previously a missing tokenizer
    # also replaced a model that had been passed in (and vice versa).
    if toke is None:
        # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
        with open(os.path.join('..', 'models', 'toke2-tick.pkl'),
                  'rb') as toke_file:
            toke = pickle.load(toke_file)

    if model is None:
        model = load_model(
            os.path.join('..', 'models',
                         'media-headlines-ticks-' + model_type + '.h5'))

    if not current_date:
        current_date = datetime.today()

    all_headlines, all_tick_hist = [], []

    with db() as (conn, cur):

        event_date = current_date
        date = datetime.strftime(event_date, '%Y-%m-%d')

        cur.execute(
            "SELECT date, source, rawcontent FROM headlines WHERE stock=? AND date BETWEEN ? AND ? ORDER BY date DESC",
            [stock, add_time(event_date, -14), date])
        headlines = [(h_date, source, clean(content),
                      (event_date - datetime.strptime(h_date, '%Y-%m-%d')).days)
                     for (h_date, source, content) in cur.fetchall() if content]

        ## Find corresponding tick data ##

        cur.execute(
            """SELECT open, high, low, adjclose, volume FROM ticks WHERE stock=? AND date BETWEEN ? AND ? ORDER BY date DESC""",
            [
                stock,
                add_time(event_date, -30 - tick_window),
                add_time(event_date, 0)
            ])
        before_headline_ticks = cur.fetchall()[:tick_window]

        if not before_headline_ticks:
            # Same exception type as the bare index below used to raise,
            # but with a message that names the cause.
            raise IndexError("no tick data for " + str(stock) + " up to " + date)

        # Rows are newest first, so row 0 carries the latest adjclose.
        actual_current = before_headline_ticks[0][3]

        tick_hist = np.array(before_headline_ticks)
        tick_hist -= np.mean(tick_hist, axis=0)
        tick_hist /= np.std(tick_hist, axis=0)

        ## Create training example ##

        # Newer headlines (smaller age in days) get a larger sampling weight.
        probs = [1 / (headline[3] + 1) for headline in headlines]
        probs /= np.sum(probs)

        num_samples = len(headlines) // sample_size

        for _ in range(num_samples):

            indexes = np.random.choice(np.arange(len(headlines)),
                                       sample_size,
                                       replace=False,
                                       p=probs)

            sample = [headlines[idx] for idx in indexes]

            all_headlines.append(sample)
            all_tick_hist.append(tick_hist)

    ## Process ##

    encoded_headlines, toke = encode_sentences(all_headlines,
                                               tokenizer=toke,
                                               max_length=max_length)

    tick_hists = np.array(all_tick_hist)

    predictions = model.predict([encoded_headlines, tick_hists])[:, 0]

    # NOTE(review): 0.023 presumably undoes the scaling applied to the
    # training targets -- confirm against the training code.
    prices = predictions * 0.023 * actual_current + actual_current

    return predictions, prices
## Options ##

stock = 'INTC'
current_date = '2018-03-08'
predict_date = '2018-03-09'

## Run ##

predictions, prices = predict(
    stock,
    current_date=datetime.strptime(current_date, '%Y-%m-%d'),
    predict_date=datetime.strptime(predict_date, '%Y-%m-%d'))

## Find Actual Value ##

with db() as (conn, cur):

    cur.execute(
        """SELECT adjclose FROM ticks WHERE stock=? AND date BETWEEN ? AND ? ORDER BY date ASC LIMIT 1""",
        [
            stock,
            add_time(datetime.strptime(predict_date, '%Y-%m-%d'), 0),
            add_time(datetime.strptime(predict_date, '%Y-%m-%d'), 6)
        ])
    after_headline_ticks = cur.fetchall()

    # -1 marks "no tick recorded yet" for the prediction window. The explicit
    # emptiness check replaces a bare `except:` that would also have hidden
    # unrelated errors (including KeyboardInterrupt).
    if after_headline_ticks:
        actual_result = after_headline_ticks[0][0]
    else:
        actual_result = -1
def make_headline_to_effect_data():
    """
    Headline -> Effect

    Creates essentially the X, Y data for the embedding model to use
    when analyzing/encoding headlines. Returns a list of headlines and
    a list of corresponding 'effects' which represent a change in the stock price.

    One example is produced per headline of every stock in ``stocks`` whose
    content contains between 5 and 40 spaces, provided ``tick_window`` tick
    rows exist before it and an average ``adjclose`` exists for the days
    ``add_time(event_date, 1)`` .. ``add_time(event_date, 4)``.

    The effect depends on ``model_type``:

    * ``'regression'``: relative price change divided by 0.023, multiplied
      by 4 when the headline's ``sentimentlabel`` (-1 or 1) has the same sign
      and divided by 4 when it does not;
    * otherwise: one-hot ``[1, 0]`` (up) or ``[0, 1]`` (not up), replaced by
      ``[.5, .5]`` when a -1/1 label contradicts the price direction.

    Returns:
        ``(meta, headlines, tick_hists, effects)`` where ``meta`` holds
        ``(source, weekday)`` tuples and the last two are NumPy arrays.
    """
    meta, headlines, tick_hists, effects = [], [], [], []

    with db() as (conn, cur):

        for stock in stocks:

            print("Fetching Stock..." + stock)

            ## Go through all the headlines ##

            cur.execute(
                "SELECT date, source, content, sentimentlabel FROM headlines WHERE stock=?",
                [stock])
            headline_query = cur.fetchall()

            for (date, source, content, label) in headline_query:

                if not content or not (5 <= content.count(' ') <= 40):
                    continue

                event_date = datetime.strptime(
                    date, '%Y-%m-%d')  # The date of headline

                ## Find corresponding tick data ##

                cur.execute(
                    """SELECT open, high, low, adjclose, volume FROM ticks WHERE stock=? AND date BETWEEN ? AND ? ORDER BY date DESC""",
                    [
                        stock,
                        add_time(event_date, -30 - tick_window),
                        add_time(event_date, 0)
                    ])
                before_headline_ticks = cur.fetchall()[:tick_window]

                if len(before_headline_ticks) != tick_window:
                    continue

                cur.execute(
                    """SELECT AVG(adjclose) FROM ticks WHERE stock=? AND date BETWEEN ? AND ? ORDER BY date""",
                    [stock, add_time(event_date, 1), add_time(event_date, 4)])
                after_headline_ticks = cur.fetchall()

                # Check for rows BEFORE indexing; the original tested the
                # length only after after_headline_ticks[0][0] had been read.
                if not after_headline_ticks:
                    continue

                ## Create training example ##

                previous_tick = before_headline_ticks[0][3]
                result_tick = after_headline_ticks[0][0]

                if not previous_tick or not result_tick:
                    continue

                # Standardize each tick column over the window.
                tick_hist = np.array(before_headline_ticks)
                tick_hist -= np.mean(tick_hist, axis=0)
                tick_hist /= np.std(tick_hist, axis=0)

                if model_type == 'regression':

                    # Percent Diff (+Normalization Constant)
                    effect = [(result_tick - previous_tick) / previous_tick /
                              0.023]

                    # Use labels to adjust effect
                    if label in [-1, 1]:
                        if label == np.sign(effect[0]):
                            effect = [effect[0] * 4]
                        else:
                            effect = [effect[0] / 4]

                else:

                    went_up = result_tick > previous_tick
                    effect = [1., 0.] if went_up else [0., 1.]

                    # Compare the label with the price direction itself. The
                    # original compared it with sign(effect[0]), which is 0
                    # or 1, so a -1 label could never agree with a fall and
                    # every negative label was flattened to [.5, .5].
                    direction = 1 if went_up else -1
                    if label in [-1, 1] and label != direction:
                        effect = [.5, .5]

                meta.append((source, event_date.weekday()))
                headlines.append(content)
                tick_hists.append(tick_hist)
                effects.append(effect)

    return meta, headlines, np.array(tick_hists), np.array(effects)
def predict(stock,
            model=None,
            toke=None,
            current_date=None,
            predict_date=None,
            look_back=None):
    """Predict the price effect for ``stock`` from each recent headline.

    Args:
        stock: ticker used to filter the ``headlines`` and ``ticks`` tables.
        model: fitted model exposing ``predict``; loaded from
            ``../models/media-headlines-ticks-<model_type>.h5`` when omitted.
        toke: tokenizer passed to ``encode_sentences``; unpickled from
            ``../models/toke-tick.pkl`` when omitted.
        current_date: ``datetime`` the prediction is made for (default: now).
        predict_date: accepted for backward compatibility; not used.
        look_back: number of days of headlines to use; a falsy value means 3.

    Returns:
        ``(predictions, prices)``: column 0 of the model output per headline
        and ``predictions * 0.023 * latest_adjclose + latest_adjclose``.

    Raises:
        IndexError: if no tick rows exist in the look-back window.
    """
    import keras.metrics
    # Expose the custom metric under keras.metrics before load_model runs.
    keras.metrics.correct_sign_acc = correct_sign_acc

    # Load only what the caller did not supply. Previously a missing tokenizer
    # also replaced a model that had been passed in (and vice versa).
    if toke is None:
        # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
        with open(os.path.join('..', 'models', 'toke-tick.pkl'),
                  'rb') as toke_file:
            toke = pickle.load(toke_file)

    if model is None:
        model = load_model(
            os.path.join('..', 'models',
                         'media-headlines-ticks-' + model_type + '.h5'))

    vocab_size = len(toke.word_counts)

    if not current_date:
        current_date = datetime.today()

    if not look_back:
        look_back = 3

    pretick_date = add_time(current_date, -look_back)

    with db() as (conn, cur):

        ## Select Actual Stock Values ##

        cur.execute(
            """SELECT open, high, low, adjclose, volume FROM ticks WHERE stock=? AND date BETWEEN ? AND ? ORDER BY date DESC""",
            [
                stock,
                add_time(current_date, -30 - tick_window),
                add_time(current_date, 0)
            ])
        before_headline_ticks = cur.fetchall()[:tick_window]

        if not before_headline_ticks:
            # Same exception type as the bare index below used to raise,
            # but with a message that names the cause.
            raise IndexError("no tick data for " + str(stock))

        # Rows are newest first, so row 0 carries the latest adjclose.
        actual_current = before_headline_ticks[0][3]

        tick_hist = np.array(before_headline_ticks)
        tick_hist -= np.mean(tick_hist, axis=0)
        tick_hist /= np.std(tick_hist, axis=0)

        ## Find Headlines ##

        # Bind the upper bound through add_time like every other query in
        # this function instead of handing the driver a raw datetime object.
        cur.execute(
            "SELECT date, source, content FROM headlines WHERE date BETWEEN ? AND ? AND stock=?",
            [pretick_date, add_time(current_date, 0), stock])
        headlines = cur.fetchall()

    ## Process ##

    meta, test_sents = [], []

    for (date, source, content) in headlines:

        meta.append(
            [source, datetime.strptime(date, '%Y-%m-%d').weekday()])
        test_sents.append(content)

    encoded_meta, test_encoded, _ = encode_sentences(meta,
                                                     test_sents,
                                                     tokenizer=toke,
                                                     max_length=max_length,
                                                     vocab_size=vocab_size)

    # Every headline is paired with the same standardized tick history.
    tick_hists = np.array([tick_hist] * len(headlines))

    predictions = model.predict([test_encoded, tick_hists, encoded_meta])[:, 0]

    # NOTE(review): 0.023 presumably undoes the scaling applied to the
    # training targets -- confirm against the training code.
    prices = predictions * 0.023 * actual_current + actual_current

    return predictions, prices
# NOTE(review): takes `self`, so this is presumably a method of a class whose
# header is outside this fragment -- re-indent under that class when merging.
def dataIsUpdate(self):
    """Reset the workers' update flags and refresh the GUI if any was set.

    Every entry of ``self.threads`` except the last is inspected; the last
    entry is the one whose ``app.update()`` gets called. The process quits
    when ``self.GuiIsActive()`` is false.
    """
    isUpdate = False
    for i in self.threads[:-1]:
        # Remember whether any worker reported new data, then clear its flag.
        isUpdate = i.isUpdated or isUpdate
        if i.isUpdated:
            i.isUpdated = False
    if not self.GuiIsActive():
        quit()
    if isUpdate:
        self.threads[-1].app.update()


# Script section: declare the `weather` table, create the database and start
# the handler for the listed locations.
database = db('sqlite:///weather.sqlite3')
database.addTable('weather',
                  date='string',
                  mint='float',
                  maxt='float',
                  location='string',
                  humidity='float',
                  feels_like="float")
database.createBase()
locations = ["Moscow", "Volgograd", "New York"]
try:
    # NOTE(review): the meaning of the third argument (30) is not visible
    # here -- confirm against ThreadsHandler.
    ThreadsHandler(database, locations, 30).run()
except SystemExit:
    quit()
# One-off script: insert a single hard-coded news article into the
# `berita_detail` table and print the generated row id.
from Database import Database_connection as db

database = db()
# The article text is embedded verbatim in the SQL statement below.
qy = """ INSERT INTO `berita_detail` (`judul`, `waktu`, `tag`, `isi`, `sumber`) VALUES ('Mentan harap tatanan normal baru pulihkan permintaan produk pertanian', '2020-06-07', '[normal baru,new normal,petani]', ' Dengan kebijakan normal baru utamanya di sektor pariwisata diharapkan dapat memulihkan permintaan produk pertanian Jakarta (ANTARA) - Menteri Pertanian Syahrul Yasin Limpo berharap tatanan normal baru dapat mendongkrak kesejahteraan petani dan memulihkan permintaan produk pertanian dengan dimulainya aktivitas hotel, restoran, katering (Horeka) dan perkantoran. Dampak yang ditimbulkan akibat pandemi ini masih dirasakan masyarakat, termasuk para petani. Faktor yang mempengaruhi petani yakni harga produk pertanian mengalami tekanan diakibatkan oleh panen raya musim tanam pertama. \"Kondisi ini menyebabkan deflasi kelompok bahan makanan dimana jumlah bahan pangan di lapangan banyak namun permintaan berkurang berakibat langsung dengan pendapatan petani,\" kata Syahrul dalam keterangan di Jakarta, Minggu. Selain itu, petani juga dihadapkan pada gangguan distribusi akibat Pembatasan Sosial Berskala Besar (PSBB), penurunan daya beli masyarakat, melemahnya sektor ekonomi yang terkait dengan sektor pertanian seperti Horeka dan perkantoran. Menurut Mentan, selama pandemi deflasi kelompok bahan makanan masih berimplikasi positif terhadap stabilitas sosial dan politik. Untuk mengurangi dampak ke pendapatan yang diterima petani, pemerintah memberikan bantuan sosial yang dapat mengkompensasi penurunan daya beli petani yang diakibatkan oleh penurunan harga produk pertanian. \"Dengan kebijakan normal baru utamanya di sektor pariwisata diharapkan dapat memulihkan permintaan produk pertanian sehingga dapat memperbaiki harga di tingkat petani,\" kata Syahrul. 
Kementerian Pertanian (Kementan) mencatat bahwa panen raya musim pertama sukses mengamankan stok pangan sehingga tidak terjadi gejolak kenaikan harga dan tersendatnya distribusi 11 bahan pokok khususnya dalam menghadapi Ramadhan dan Hari Raya Idul Fitri. Eksport komoditas pertanian juga masih tumbuh sebesar 12,6 persen. Namun demikian, Nilai Tukar Petani (NTP) diakui memang turun akibat pandemi. Syahrul menilai kondisi ini hanya sesaat. Menurut Mentan, kunci meningkatkan NTP adalah menyeimbangkan penawaran dan permintaan. Kebijakan pemerintah untuk membuka sektor pariwisata dan aktivitas perkantoran harus dipersiapkan dengan baik karena dengan keberhasilan kebijakan ini dapat berkontribusi terhadap perbaikan harga di tingkat petani. Menghadapi fenomena yang terjadi di kalangan petani, Mentan Syahrul mengatakan bahwa pihaknya sedang melakukan berbagai upaya salah satunya melakukan pengendalian dari sisi harga pertanian melalui koordinasi Bulog dan Kementerian Perdagangan. Pewarta: Mentari Dwi Gayati Editor: Ahmad Wijaya COPYRIGHT © ANTARA 2020 (adsbygoogle = window.adsbygoogle || []).push({}); ', 'antara') """
try:
    database.kursor.execute(qy)
    database.koneksi.commit()
    # Id generated for the row that was just inserted.
    gen_id = database.kursor.lastrowid
    print(gen_id)
except Exception as ex:
    # Undo the partial transaction and report the error instead of crashing.
    database.koneksi.rollback()
    print(ex)
def make_doc_embeddings(query_range=(None, '1776-07-04', '3000-01-01'),
                        use_extra_dates=True,
                        vec_model=None):
    """
    Create document embeddings from headlines

    For every stock in ``stocks`` (or only ``query_range[0]`` when it is
    set) and every date of interest, the cleaned headlines dated between
    ``add_time(event_date, -doc_query_days)`` and that date are joined with
    ``" **NEXT** "`` into one document tagged ``"<stock> <date>"``.

    Args:
        query_range: ``(stock_or_None, start_date, end_date)`` limiting which
            headline dates seed the documents.
        use_extra_dates: also build documents for the day before and the day
            after every headline date.
        vec_model: an existing Doc2Vec model. When ``None`` a new model is
            trained on the documents; otherwise vectors are inferred with it.

    Returns:
        ``(vec_model, vectors, (docs, labels))`` where ``vectors[stock][date]``
        is the document vector.
    """
    training = vec_model is None

    if training:
        print('Creating doc embeddings...')

    docs, labels = [], []

    class LabeledLineSentence:
        """Re-iterable stream of TaggedDocument built from docs and labels."""

        def __init__(self, docs, labels):
            self.docs = docs
            self.labels = labels

        def __iter__(self):
            for doc, label in zip(self.docs, self.labels):
                yield TaggedDocument(doc.split(), [label])  # clean doc

    with db() as (conn, cur):

        q_stock, q_start, q_end = query_range

        for stock in stocks:

            ## Headline For Every Date ##

            if q_stock and q_stock != stock:
                continue

            cur.execute(
                "SELECT DISTINCT date FROM headlines WHERE stock=? AND date BETWEEN ? AND ? ORDER BY date ASC",
                [stock, q_start, q_end])
            dates = [row[0] for row in cur.fetchall()]

            if use_extra_dates:  # True headline days not enough so we create additional querys

                extra_dates = []
                for date in dates:
                    d = datetime.strptime(date, '%Y-%m-%d')
                    extra_dates.append(add_time(d, -1))
                    extra_dates.append(add_time(d, +1))

                # Neighbouring headline days generate the same extra date and
                # extras can coincide with real headline days. Drop the
                # repeats (keeping order) so one tag is not queried, trained
                # and inferred several times.
                dates = list(dict.fromkeys(dates + extra_dates))

            if training:  # Show loading bar only for training data
                date_iter = tqdm_notebook(dates, desc=stock)
            else:
                date_iter = iter(dates)

            for date in date_iter:

                ## Collect Headlines ##

                event_date = datetime.strptime(date, '%Y-%m-%d')

                cur.execute(
                    "SELECT date, source, rawcontent FROM headlines WHERE stock=? AND date BETWEEN ? AND ? ORDER BY date ASC",
                    [stock, add_time(event_date, -doc_query_days), date])
                contents = [
                    clean(content)
                    for (_date, _source, content) in cur.fetchall()
                    if content
                ]

                if not contents:
                    continue

                ## Create training example ##

                docs.append(" **NEXT** ".join(contents))
                labels.append(stock + " " + date)

    vectors = {stock: {} for stock in stocks}

    doc_iter = LabeledLineSentence(docs, labels)

    if training:

        vec_model = Doc2Vec(documents=doc_iter, **doc2vec_options)

        for label in labels:
            stock, date = label.split(" ")
            vectors[stock][date] = vec_model.docvecs[label]

    else:

        for tag_doc in doc_iter:
            vec = vec_model.infer_vector(
                tag_doc.words,
                alpha=doc2vec_options['alpha'],
                min_alpha=doc2vec_options['min_alpha'],
                steps=1000)
            stock, date = tag_doc.tags[0].split(" ")
            vectors[stock][date] = vec

    return vec_model, vectors, (docs, labels)
def make_tick_data(query_range=(None, '1776-07-04', '3000-01-01'), train=True):
    """
    Process historic tick data (high/low/close/etc..) into training examples

    For each stock in ``stocks`` (or only ``query_range[0]`` when set) and
    each tick date from the first headline date inside the range up to the
    range end, up to 52 preceding tick rows are fetched. The newest
    ``tick_window`` rows, flipped into chronological order, are standardized
    with the mean/std of all fetched rows.

    Args:
        query_range: ``(stock_or_None, start_date, end_date)``.
        train: when true, also look up the first ``adjclose`` between
            ``add_time(event_date, 1)`` and ``add_time(event_date, 4)`` and
            emit a one-hot up/down effect; dates without it are skipped.

    Returns:
        ``(tick_vecs, effect_vecs)``, both ``{stock: {date: value}}``.
        ``effect_vecs`` is only filled when ``train`` is true.
    """
    if train:
        print('Creating tick data...')

    tick_vecs = {stock: {} for stock in stocks}
    effect_vecs = {stock: {} for stock in stocks}

    with db() as (conn, cur):

        q_stock, q_start, q_end = query_range

        for stock in stocks:

            if q_stock and q_stock != stock:
                continue

            cur.execute(
                "SELECT DISTINCT date FROM headlines WHERE stock=? AND date BETWEEN ? AND ? ORDER BY date ASC LIMIT 1",
                [stock, q_start, q_end])
            first_headline = cur.fetchall()
            if not first_headline:
                # No headline inside the range: indexing [0][0] here used to
                # raise IndexError instead of yielding an empty result.
                continue
            start_date = first_headline[0][0]

            cur.execute(
                "SELECT DISTINCT date FROM ticks WHERE stock=? AND date BETWEEN ? AND ? ORDER BY date ASC",
                [stock, start_date, q_end])
            dates = [row[0] for row in cur.fetchall()]

            for date in dates:

                event_date = datetime.strptime(
                    date, '%Y-%m-%d')  # The date of headline

                ## Find corresponding tick data ##

                cur.execute(
                    """SELECT open, high, low, adjclose, volume FROM ticks WHERE stock=? AND date BETWEEN ? AND ? ORDER BY date DESC LIMIT 52""",
                    [
                        stock,
                        add_time(event_date, -80),
                        add_time(event_date, 0)
                    ])
                before_headline_ticks = cur.fetchall()

                if len(before_headline_ticks) < tick_window:
                    continue

                # Only touched when train is true, so it can never be read
                # unbound or carry a stale value from an earlier date.
                after_headline_ticks = None

                if train:

                    cur.execute(
                        """SELECT adjclose FROM ticks WHERE stock=? AND date BETWEEN ? AND ? ORDER BY date ASC LIMIT 1""",
                        [
                            stock,
                            add_time(event_date, 1),
                            add_time(event_date, 4)
                        ])
                    after_headline_ticks = cur.fetchall()

                    if not after_headline_ticks:
                        continue

                ## Create ##

                window_ticks = np.array(
                    list(reversed(before_headline_ticks[:tick_window]))
                )  # Flip so in chron. order
                fifty_ticks = np.array(
                    before_headline_ticks)  # All fetched rows, used to normalize

                previous_tick = before_headline_ticks[0][3]

                if not previous_tick:
                    continue

                window_ticks -= np.mean(fifty_ticks, axis=0)
                window_ticks /= np.std(fifty_ticks, axis=0)

                tick_vecs[stock][date] = window_ticks

                if train:
                    result_tick = after_headline_ticks[0][0]
                    if result_tick > previous_tick:
                        effect = [1., 0.]
                    else:
                        effect = [0., 1.]
                    effect_vecs[stock][date] = effect

    return tick_vecs, effect_vecs
def make_headline_to_effect_data():
    """
    Headline -> Effect

    Build one training example per usable headline of every stock in
    ``stocks``: the cleaned headline text, a ``(source, weekday)`` meta
    tuple, the standardized ``tick_window`` most recent tick rows in
    chronological order, and the relative price change to the first
    ``adjclose`` found 1-4 days later, divided by 0.023.

    Returns ``(meta, headlines, tick_hists, effects, test_indices)``; the
    last three are NumPy arrays and ``test_indices`` lists the positions
    whose headline date is later than ``test_cutoff``.
    """
    meta, headlines, tick_hists, effects, test_indices = [], [], [], [], []

    before_sql = """SELECT open, high, low, adjclose, volume FROM ticks WHERE stock=? AND date BETWEEN ? AND ? ORDER BY date DESC LIMIT 52"""
    after_sql = """SELECT adjclose FROM ticks WHERE stock=? AND date BETWEEN ? AND ? ORDER BY date ASC LIMIT 1"""

    with db() as (conn, cur):

        for stock in stocks:

            print("Fetching Stock..." + stock)

            ## Go through all the headlines ##

            cur.execute(
                "SELECT date, source, rawcontent FROM headlines WHERE stock=?",
                [stock])

            for (date, source, raw_text) in cur.fetchall():

                if not raw_text:
                    continue

                content = clean(raw_text)

                space_count = content.count(' ')
                if space_count < 5 or space_count > 35:
                    continue

                # The date of headline
                event_date = datetime.strptime(date, '%Y-%m-%d')

                ## Find corresponding tick data ##

                cur.execute(before_sql, [
                    stock,
                    add_time(event_date, -80),
                    add_time(event_date, 0)
                ])
                earlier_rows = cur.fetchall()

                if len(earlier_rows) < tick_window:
                    continue

                cur.execute(after_sql, [
                    stock,
                    add_time(event_date, 1),
                    add_time(event_date, 4)
                ])
                later_rows = cur.fetchall()

                ## Create training example ##

                if len(later_rows) == 0:
                    continue

                # Rows arrive newest first; flip the window so it is in chron. order
                newest_first = earlier_rows[:tick_window]
                window_ticks = np.array(newest_first[::-1])
                # All fetched rows are used to normalize
                fifty_ticks = np.array(earlier_rows)

                previous_tick = earlier_rows[0][3]
                result_tick = later_rows[0][0]

                if not (previous_tick and result_tick):
                    continue

                window_ticks -= np.mean(fifty_ticks, axis=0)
                window_ticks /= np.std(fifty_ticks, axis=0)

                # Percent Diff (/ Normalization Constant)
                change = (result_tick - previous_tick) / previous_tick / 0.023

                if event_date > test_cutoff:  # Mark as Test Example
                    test_indices.append(len(headlines))

                meta.append((source, event_date.weekday()))
                headlines.append(content)
                tick_hists.append(window_ticks)
                effects.append([change])

    return meta, headlines, np.array(tick_hists), np.array(effects), np.array(
        test_indices)
currently not working
    """
    # Tail of a view whose header lies outside this fragment: look up the item
    # named by the hidden form field and render the first three values
    # returned by get_selected.
    item_ID = request.form.to_dict()["hidden"]
    return render_template("add_another.html",
                           item=database.get_selected(item_ID)[:3])


@app.route('/add_another', methods=["POST"])
def add_another():
    """
    currently not working

    Collects every submitted form field whose name does not contain
    "hidden", passes them with the item id from the "hidden" field to
    ``database.add_esc`` and returns the result of ``hello()``.
    """
    addition = request.form.to_dict()
    item_ID = addition["hidden"]
    info = dict()
    for key, value in addition.items():
        # The hidden field only carries the item id; it is not item data.
        if "hidden" not in str(key):
            info[key] = value
    database.add_esc(info, item_ID)
    return hello()


if __name__ == '__main__':
    # Field names handed to the database wrapper for items and for log rows.
    item_attributes = ("name", "make", "model", "ID", "room", "teacher",
                       "condition", "manual", "movable", "description",
                       "hidden")
    log_values = ('name', 'to', 'from', 'tout', 'tin')
    database = db(log_values, item_attributes)
    # NOTE(review): debug=True while listening on 0.0.0.0 exposes the
    # debugger to the network -- confirm this is development-only.
    app.run(host='0.0.0.0', port=8000, debug=True)
    print("", file=sys.stdout)
def make_headline_to_effect_data():
    """
    Headline -> Effect

    Creates essentially the X, Y data for the embedding model to use
    when analyzing/encoding headlines. Returns a list of headlines and
    a list of corresponding 'effects' which represent a change in the stock price.

    One example is produced per headline (content length >= 16) of every
    stock in ``stocks`` for which an average ``adjclose`` exists both for the
    3 days up to the headline date and for days 1-6 after it. With
    ``model_type == 'regression'`` the effect is the relative change divided
    by 0.0044; otherwise it is one-hot ``[1, 0]`` (up) / ``[0, 1]`` (not up).

    Returns:
        ``(meta, headlines, effects)`` where ``meta`` holds
        ``(source, weekday)`` tuples and ``effects`` is a NumPy array.
    """
    meta, headlines, effects = [], [], []

    # Defined once instead of being rebuilt for every headline.
    def add_time(e, days):
        return (e + timedelta(days=days)).strftime('%Y-%m-%d')

    with db() as (conn, cur):

        for stock in stocks:

            print("Fetching Stock..." + stock)

            ## Go through all the headlines ##

            cur.execute(
                "SELECT date, source, content FROM headlines WHERE stock=? AND LENGTH(content) >= 16",
                [stock])
            headline_query = cur.fetchall()

            for (date, source, content) in headline_query:

                event_date = datetime.strptime(
                    date, '%Y-%m-%d')  # The date of headline

                ## Find corresponding tick data ##

                cur.execute(
                    """SELECT AVG(adjclose) FROM ticks WHERE stock=? AND date BETWEEN ? AND ? ORDER BY date""",
                    [stock, add_time(event_date, -3), add_time(event_date, 0)])
                before_headline_ticks = cur.fetchall()

                cur.execute(
                    """SELECT AVG(adjclose) FROM ticks WHERE stock=? AND date BETWEEN ? AND ? ORDER BY date""",
                    [stock, add_time(event_date, 1), add_time(event_date, 6)])
                after_headline_ticks = cur.fetchall()

                ## Create training example ##

                # Validate exactly the values that are used below. The
                # aggregate yields NULL (None) when no tick falls inside the
                # window.
                previous_tick = (before_headline_ticks[-1][0]
                                 if before_headline_ticks else None)
                result_tick = (after_headline_ticks[0][0]
                               if after_headline_ticks else None)

                if previous_tick is None or result_tick is None:
                    continue

                if model_type == 'regression':

                    # Percent Diff (+Normalization Constant)
                    effect = [(result_tick - previous_tick) / previous_tick /
                              0.0044]

                else:

                    if result_tick > previous_tick:
                        effect = [1., 0.]
                    else:
                        effect = [0., 1.]

                meta.append((source, event_date.weekday()))
                headlines.append(content)
                effects.append(effect)

    return meta, headlines, np.array(effects)
# Help texts use implicit string concatenation: a backslash continuation
# inside a literal would embed the next line's indentation in the text.
clp.add_argument('-a', '--all-feeds', action='store_true',
                 help="Show all feeds in output even if they don't have "
                      "any new rss items. Default is not to show them")
clp.add_argument('-f', '--feed-id', help='Only use or check this feed id')
clp.add_argument('-l', '--list', action='store_true', help='List all Feeds')
clp.add_argument('-c', '--comments', action='store_true',
                 help='Show link to feed comments (if available)')
clp.add_argument('--html', action='store_true',
                 help='Output rss list in simple html')

clargs = clp.parse_args()

# If a 'url' is given (with or without a 'title') add the link to the db.
# The old test `(title and url) or url` reduces to exactly this.
if clargs.url:
    ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    try:
        with db(dbc, db_feed_table) as db_add:
            db_add.add_feed(clargs.title, clargs.url, ts)
    # Public DB-API exception name rather than the private _exceptions module.
    except MySQLdb.OperationalError:
        print("No mysql server connection found. Exiting.")
        sys.exit()
    sys.exit()

# If output cmdline option is a filename
if clargs.output:
    try:
        # Stays open for the rest of the run: all later prints go to this file.
        sys.stdout = open(clargs.output, "w")
    except FileNotFoundError:
        print("No such file or directory '" + clargs.output + "'. Exiting")
        # sys.exit works even where the interactive-only exit() builtin is absent.
        sys.exit()

# Create the html header for font size etc if --html used