def upsert_exchangeCurrency(client, base_cur, quote_cur, data, ts, exchange):
    """Insert or replace the exchange record of one market at one timestamp.

    The document is matched on (base_cur, quote_cur, timestamp, exchange) and
    replaced as a whole; it is inserted when nothing matches.

    Expected unique index:
    db.exchanges.createIndex( { base_cur: 1, quote_cur: 1, timestamp: 1, exchange: 1 }, { unique: true } )

    Nothing is raised to the caller and nothing is returned. Every exception
    is printed; anything other than a duplicate-key error ("E11000") is also
    logged to the "DAL" error logger.
    """
    try:
        db = selectDB(client)
        dct_qry = {
            "base_cur": base_cur,
            "quote_cur": quote_cur,
            "timestamp": ts,
            "exchange": exchange,
        }
        dct = dct_qry.copy()
        dct['data'] = data
        # Collection.update() was deprecated in PyMongo 3.0 and removed in 4.0;
        # replace_one(..., upsert=True) is the equivalent whole-document upsert.
        out = db.exchanges.replace_one(dct_qry, dct, upsert=True)
        log = createLogger("exchangeProducer_info", "exchangeProducer_info")
        # raw_result is the server reply dict, i.e. what update() used to return.
        log.info(str(out.raw_result))
    except Exception as ex:
        print(ex)
        # duplicate-key errors are expected with the unique index and are not logged
        if "E11000" not in str(ex):
            print("exception: " + str(ex))
            logErr = createLogger("DAL", "DAL_error")
            logErr.critical(str(ex), exc_info=True)
def insert_newsSite_lastBuilt(client, url):
    """Record that the news site at `url` was built just now.

    Inserts a new {'url', 'ts'} document into `newsbuilds`. Failures are
    logged to the "DAL" error logger and not raised.
    """
    try:
        record = {'url': url, 'ts': dtNow()}
        selectDB(client).newsbuilds.insert_one(record)
    except Exception as err:
        createLogger("DAL", "DAL_error").critical(str(err), exc_info=True)
def store_predictions_completed(client, version, timestamp_predic):
    """Mark a (version, timestamp_predic) prediction run as completed.

    Upserts a marker document into `predictions_completed`, so repeated calls
    with the same pair do not create extra documents. Failures are logged to
    the "DAL" error logger and not raised.
    """
    try:
        db = selectDB(client)
        marker = {'version': version, 'timestamp_predic': timestamp_predic}
        # Collection.update() was removed in PyMongo 4; replace_one with
        # upsert=True is the equivalent whole-document upsert.
        db.predictions_completed.replace_one(marker, dict(marker), upsert=True)
    except Exception as ex:
        logErr = createLogger("DAL", "DAL_error")
        logErr.critical(str(ex), exc_info=True)
def store_prediction2(client, crypto, interval, timestamp_predic, timestamp, feature, featuresID, n_batch_size, n_neurons, n_window, n_epoch, data):
    """Store one prediction in `predictions`, keyed by a uid derived from its parameters.

    Expected unique index:
    db.predictions.createIndex( { uid: 1 }, { unique: true } )

    The uid is the first 8 hex digits of the MD5 of the concatenated
    parameter values (taken in key-sorted order), read as an integer. Any
    existing document with that uid is deleted before the new one is written.
    Failures are logged to the "DAL" error logger and not raised.
    """
    try:
        import collections
        db = selectDB(client)
        dct = {  # use dct to construct uid first, then add other data for update
            'crypto': crypto,
            'interval': interval,
            'timestamp_predic': timestamp_predic,
            'timestamp': timestamp,
            'feature': feature,
            'featuresID': featuresID,
            'n_batch_size': n_batch_size,
            'n_neurons': n_neurons,
            'n_window': n_window,
            'n_epoch': n_epoch,
        }
        # adding new key-values to the dct will yield duplicates because uid will be different
        dct = collections.OrderedDict(sorted(dct.items()))
        uid = int(hashlib.md5((''.join((str(x) for key, x in dct.items()))).encode()).hexdigest()[:8], 16)
        dct['uid'] = uid
        dct['data'] = data
        # Collection.remove()/update() were removed in PyMongo 4; delete_many and
        # replace_one(upsert=True) are the equivalent supported calls.
        db.predictions.delete_many({'uid': uid})
        db.predictions.replace_one({'uid': uid}, dct, upsert=True)
    except Exception as ex:
        logErr = createLogger("DAL", "DAL_error")
        logErr.critical(str(ex), exc_info=True)
def liveness_IAmAlive(client, name):
    """Upsert the heartbeat document of component `name` with the current time.

    Writes {'name', 'timestamp'} into `liveness`, replacing the previous
    heartbeat of the same name. Failures are logged to the "DAL" error logger
    and not raised.
    """
    try:
        db = selectDB(client)
        # Collection.update() was removed in PyMongo 4; replace_one with
        # upsert=True is the equivalent whole-document upsert.
        db.liveness.replace_one({'name': name}, {'name': name, 'timestamp': dtNow()}, upsert=True)
    except Exception as ex:
        logErr = createLogger("DAL", "DAL_error")
        logErr.critical(str(ex), exc_info=True)
def getJson(exchange, base_cur, quote_cur, interval, historymins, currentDateTime, sync_dict_json):
    """Fetch exchange data from the API, sharing results between processes.

    getting data from our API (OHLC, volume, sentiments, ...) depending on
    the type parameter in query.

    `sync_dict_json` is a dictionary shared among the other processes, keyed
    by request URL. A value of 0 means "some process is fetching this URL
    right now"; any other value is the parsed JSON. This prevents making the
    same API call twice for identical parameters.

    If another process holds the 0 marker, this call waits up to 60 seconds
    for its result and then fetches the URL itself.

    Returns the parsed JSON (object keys kept in response order through
    OrderedDict). Exceptions from the HTTP request or the JSON parsing
    propagate to the caller.
    """
    # url = 'https://cryptopredicted.com/api.php?type=exchangeChart&exchange='+exchange+'&base_cur='+base_cur+'&quote_cur='+quote_cur+'&historymins='+str(historymins)+'&currentDateTime='+dtToString(currentDateTime)+'&interval='+str(interval)
    # The '&quote_cur=' and '&currentDateTime=' separators had been mangled
    # into '"e_cur=' and '¤tDateTime=' (HTML-entity decoding of '&quot' and
    # '&curren'), which dropped both parameters from the request.
    url = ('https://cryptopredicted.com/PWA/api/?type=exchange&exchange=' + exchange
           + '&base_cur=' + base_cur
           + '&quote_cur=' + quote_cur
           + '&interval=' + str(interval)
           + '&historymins=' + str(historymins)
           + '&currentDateTime=' + dtToString(currentDateTime))
    log = createLogger("predictions_v1_info", "predictions_v1_info")
    log.info(url)

    # wait 60 seconds for the json (from other process), if it fails then force proceed yourself
    max_wait_seconds = 60
    deadline = time.monotonic() + max_wait_seconds
    force = False
    while url in sync_dict_json and sync_dict_json[url] == 0:
        if time.monotonic() >= deadline:
            # previously the flag was set without leaving the loop, so a
            # crashed fetcher made every waiter spin forever
            force = True
            break
        time.sleep(0.25)

    if force or url not in sync_dict_json:
        print(url)
        sync_dict_json[url] = 0  # announce that this process is fetching
        response = requests.get(url)
        js = json.loads(response.text, object_pairs_hook=OrderedDict)
        sync_dict_json[url] = js
    return sync_dict_json[url]
def func_ai_a(js, symbol):
    """Build the neural-net input matrix from the API response `js`.

    Each entry of `js` that has all of open/close/low/high/volume becomes
    one row [open, close, low, high, volume]. Entries missing any of those
    fields are reported (stdout and the "predictions_v1_error" logger) and
    left out. `symbol` is not used.

    Returns the rows as a numpy array.
    """
    # DO NOT USE RELATIVE VALUES FROM API !!! only absolute ones
    fields = ('open', 'close', 'low', 'high', 'volume')
    rows = []
    for interval_key in js:
        if all(field in js[interval_key] for field in fields):
            rows.append([js[interval_key][field] for field in fields])
            continue
        # most likely some missing interval
        print("missing data at interval:")
        print(interval_key)
        err_log = createLogger("predictions_v1_error", "predictions_v1_error")
        err_log.critical("missing data at interval:")
        err_log.critical(interval_key)
    return np.array(rows)
def store_prediction4(client, sendobj):
    """Store one prediction object in `predictions4`, keyed by a derived uid.

    Expected unique index (the original note named `predictions3`, but the
    code writes to `predictions4` -- TODO confirm which collection carries it):
    db.predictions4.createIndex( { uid: 1 }, { unique: true } )

    `sendobj` must provide: symbol, interval, timestamp, feature, featuresID,
    n_batch_size, n_neuron, n_window, n_epoch, predict_n_intervals,
    n_hiddenlay and data.

    The uid is the first 8 hex digits of the MD5 of the concatenated
    parameter values (taken in key-sorted order), read as an integer. Any
    existing document with that uid is deleted before the new one is written.
    Failures are printed, logged to the "DAL" error logger and not raised.
    """
    try:
        import collections
        db = selectDB(client)
        dct = {  # use dct to construct uid first, then add other data for update
            'symbol': sendobj['symbol'],
            'interval': sendobj['interval'],
            'timestamp': sendobj['timestamp'],
            'feature': sendobj['feature'],
            'featuresID': sendobj['featuresID'],
            'n_batch_size': sendobj['n_batch_size'],
            'n_neurons': sendobj['n_neuron'],
            'n_window': sendobj['n_window'],
            'n_epoch': sendobj['n_epoch'],
            'predict_n_intervals': sendobj['predict_n_intervals'],
            'n_hiddenlayers': sendobj['n_hiddenlay'],
        }
        # adding new key-values to the dct will yield duplicates because uid will be different
        dct = collections.OrderedDict(sorted(dct.items()))
        uid = int(hashlib.md5((''.join((str(x) for key, x in dct.items()))).encode()).hexdigest()[:8], 16)
        dct['uid'] = uid
        dct['data'] = sendobj['data']
        # Collection.remove()/update() were removed in PyMongo 4; delete_many and
        # replace_one(upsert=True) are the equivalent supported calls. The old
        # document is still removed first, as the original code did.
        db.predictions4.delete_many({'uid': uid})
        db.predictions4.replace_one({'uid': uid}, dct, upsert=True)
    except Exception as ex:
        print(ex)
        logErr = createLogger("DAL", "DAL_error")
        logErr.critical(str(ex), exc_info=True)
def update_newsSite_lastBuilt(client, id):
    """Set the `ts` field of the `newsbuilds` document with the given _id to now.

    `id` is converted with ObjectId(). Nothing is inserted when no document
    matches. Failures (including an invalid id) are logged to the "DAL"
    error logger and not raised.
    """
    try:
        db = selectDB(client)
        # Collection.update() was removed in PyMongo 4; update_one is the
        # equivalent single-document operator update.
        db.newsbuilds.update_one({'_id': ObjectId(id)}, {'$set': {'ts': dtNow()}})
    except Exception as ex:
        logErr = createLogger("DAL", "DAL_error")
        logErr.critical(str(ex), exc_info=True)
def liveness_getAll(client):
    """Return every document of the `liveness` collection as a list.

    On failure the error is logged to the "DAL" error logger and None is
    returned implicitly.
    """
    try:
        return list(selectDB(client).liveness.find())
    except Exception as err:
        createLogger("DAL", "DAL_error").critical(str(err), exc_info=True)
def check_when_newsSite_lastBuilt(client, url):
    """Return the `newsbuilds` documents whose url equals `url`, as a list.

    On failure the error is logged to the "DAL" error logger and None is
    returned implicitly.
    """
    try:
        builds = selectDB(client).get_collection('newsbuilds')
        matches = builds.find({'url': {'$eq': url}})
        return list(matches)
    except Exception as err:
        createLogger("DAL", "DAL_error").critical(str(err), exc_info=True)
def store_currency(client, crypto, USD, ts):
    """Insert one {crypto, USD, timestamp} document into `currencies`.

    Failures are logged to the "DAL" error logger and not raised.
    """
    # do not set index for 'tx' !!! otherwise conflict with different cryptos
    # NOTE(review): 'tx' presumably means the timestamp field -- confirm
    try:
        quote = {
            "crypto": crypto,
            "USD": USD,
            "timestamp": ts,
        }
        selectDB(client).currencies.insert_one(quote)
    except Exception as err:
        createLogger("DAL", "DAL_error").critical(str(err), exc_info=True)
def store_mentions_news_extended_bulk(client, arr, ts):
    """Stamp every item of `arr` with `ts` and bulk-insert them into `mentionsExtendedNews`.

    each obj in arr : {title, crypto, source, url}
    Expected unique index:
    db.mentionsExtendedNews.createIndex( { crypto: 1, url: 1 , title: 1}, { unique: true } )

    The items of `arr` are modified in place (a 'timestamp' key is set).
    Failures are logged to the "DAL" error logger and not raised; in that
    case the "added" line is not printed.
    """
    try:
        for mention in arr:
            mention['timestamp'] = ts
        collection = selectDB(client).mentionsExtendedNews
        # ordered: If false: when a single write fails, the operation will continue
        # with the remaining writes, if any, and throw an exception.
        collection.insert_many(arr, ordered=False)
        print('+ \t ' + '{} added'.format(len(arr)))
    except Exception as err:
        createLogger("DAL", "DAL_error").critical(str(err), exc_info=True)
def store_sentiments_news(client, sentiments, ts, crypto):
    """Insert one {crypto, timestamp, sentiments} document into `sentimentsNews`.

    Failures are logged to the "DAL" error logger and not raised.
    """
    try:
        entry = {
            "crypto": crypto,
            "timestamp": ts,
            "sentiments": sentiments,
        }
        selectDB(client).sentimentsNews.insert_one(entry)
    except Exception as err:
        createLogger("DAL", "DAL_error").critical(str(err), exc_info=True)
def store_mentions_news(client, count, ts, crypto, source):
    """Insert one {crypto, timestamp, mentions, source} document into `mentionsNews`.

    `count` is stored under the "mentions" key. Failures are logged to the
    "DAL" error logger and not raised.
    """
    try:
        entry = {
            "crypto": crypto,
            "timestamp": ts,
            "mentions": count,
            "source": source,
        }
        selectDB(client).mentionsNews.insert_one(entry)
    except Exception as err:
        createLogger("DAL", "DAL_error").critical(str(err), exc_info=True)
def store_volume(client, fromSymbol, fromVol24_avg, fromVol24_sum, toSymbol, toVol24_avg, toVol24_sum, ts):
    """Insert one volume document for a from/to symbol pair into `volumes`.

    The arguments are stored under keys of the same name; `ts` is stored as
    "timestamp". Failures are logged to the "DAL" error logger and not raised.
    """
    try:
        volume_doc = {
            "fromSymbol": fromSymbol,
            "fromVol24_avg": fromVol24_avg,
            "fromVol24_sum": fromVol24_sum,
            "toSymbol": toSymbol,
            "toVol24_avg": toVol24_avg,
            "toVol24_sum": toVol24_sum,
            "timestamp": ts,
        }
        selectDB(client).volumes.insert_one(volume_doc)
    except Exception as err:
        createLogger("DAL", "DAL_error").critical(str(err), exc_info=True)
def init_predictions_completed(client):
    """Backfill `predictions_completed` from the documents already in `predictions`.

    temporary function to fill this new collection based on already
    generated data.

    Every prediction whose 'feature' is 'price', 'price2' or 'price3' yields
    an upserted {version, timestamp_predic} marker with version 1, 2 or 3.
    Other documents are skipped. The server reply of each upsert is printed.
    Failures are logged to the "DAL" error logger and not raised; a failure
    stops the backfill at that document.
    """
    feature_versions = {'price': 1, 'price2': 2, 'price3': 3}
    try:
        db = selectDB(client)
        for e in db.predictions.find():
            feature = e.get('feature')
            # the isinstance guard keeps unhashable feature values from raising in the lookup
            v = feature_versions.get(feature) if isinstance(feature, str) else None
            if v is None:
                continue
            marker = {'version': v, 'timestamp_predic': e['timestamp_predic']}
            # Collection.update() was removed in PyMongo 4; replace_one with
            # upsert=True is the equivalent whole-document upsert.
            result = db.predictions_completed.replace_one(marker, dict(marker), upsert=True)
            print(result.raw_result)
    except Exception as ex:
        logErr = createLogger("DAL", "DAL_error")
        logErr.critical(str(ex), exc_info=True)
# I would also advise you to get rid of Python and use NodeJS workers instead. import datetime import pprint import json import time import collections import urllib.request import threading import sys sys.path.insert(0, '/home/cryptopredicted/') from mysettings import dtNow, createLogger import DAL log = createLogger("exchangeProducer_info", "exchangeProducer_info") logErr = createLogger("exchangeProducer_err", "exchangeProducer_err") fillFromHistory = False class upserter (threading.Thread): def __init__(self, client, newData, ts, exchange, base_cur, quote_cur): threading.Thread.__init__(self) self.client = client self.newData = newData self.ts = ts self.exchange = exchange self.base_cur = base_cur self.quote_cur = quote_cur def run(self):
import html import nltk from langdetect import detect import json import time import threading import json from datetime import datetime from kafka import KafkaConsumer, TopicPartition max_window_seconds = 60 # aggregate & update in a one minute window max_buck_len = 1000 # unless the bucket reaches a threshold then we should flush it prematurely MAX_MENTIONS_EXTENDED_PER_WINDOW = 500000 # how many mentions to keep within a single buffer/bucket (max_window_seconds || max_buck_len) try: _log = createLogger("consumerK_info", "consumerK_info") _logErr = createLogger("consumerK_error", "consumerK_error") client = DAL.openConnection() except Exception as ex: log("exception") logErr(str(ex), traceback.format_exc()) exit() def log(*params): for p in params: print(p) _log.info(p) def logErr(*params):
import json from tweepy.streaming import StreamListener from tweepy import OAuthHandler from tweepy import Stream import time import nltk import sys import os sys.path.insert(0, '/home/cryptopredicted/') import producerMgr producer = producerMgr.create_kafkaProducer() from mysettings import CRYPTO_socialKeywords, dtNow, CRYPTO_twitterProducer, createLogger import DAL logErr = createLogger("twitterProducer_error", "twitterProducer_error") log = createLogger("twitterProducer_info", "twitterProducer_info") class StdOutListener(StreamListener): def __init__(self): self.CryptoMapping = list(CRYPTO_socialKeywords.items()) self.client = DAL.openConnection() self.alive_counter = dtNow() def on_data(self, data): try: data = json.loads(data) if 'user' in data: body, url = '', '' if 'user' in data and not 'retweeted_status' in data:
# url: https://www.reddit.com/prefs/apps # client id: EzcegP77YYq7dg # client secret: CwTogkSNVPGIJFiQdWyZF_Gqqr4 import praw import json import nltk import sys import os import time sys.path.insert(0, '/home/cryptopredicted/') import producerMgr from mysettings import CRYPTO_redditProducer_subreddits, CRYPTO_socialKeywords, dtNow, createLogger import DAL logErr = createLogger("redditProducer_error", "redditProducer_error") log = createLogger("redditProducer_info", "redditProducer_info") def streamAll(): producer = producerMgr.create_kafkaProducer() subreddits = list(CRYPTO_redditProducer_subreddits.values()) # get values subreddits = [item for items in subreddits for item in items] # flatten querystring = "+".join(subreddits) log.info(querystring) CryptoMapping = list(CRYPTO_socialKeywords.items()) while True: try: client = DAL.openConnection() alive_counter = dtNow()
from pyrogram import Client, MessageHandler from pyrogram.api import types import praw import json import nltk import sys import os import time sys.path.insert(0, '/home/cryptopredicted/') import producerMgr from mysettings import CRYPTO_socialKeywords, dtNow, createLogger import DAL logErr = createLogger("telegramProducer_error", "telegramProducer_error") log = createLogger("telegramProducer_info", "telegramProducer_info") dalclient = DAL.openConnection() producer = producerMgr.create_kafkaProducer() CryptoMapping = list(CRYPTO_socialKeywords.items()) def update_handler(client, message): print(message) print(type(message)) # pyrogram message type/class log.info(message) try: DAL.liveness_IAmAlive(dalclient, "producer: telegram") if message['text'] is not None: msg = (message['text'].encode('utf-8')).decode('utf-8')
def train_predict(args=sys.argv):
    """Build every training/prediction job from the configuration and run them.

    Jobs are grouped in `arrParams` by a "uid" (currently the symbol's
    base_cur). Each group gets its own single-process pool, so jobs of one
    group run one after another while the groups run in parallel. This keeps
    the number of simultaneous processes (and the memory use) bounded when
    many configuration combinations are tried.

    After all jobs have finished or timed out (20 minutes each), every
    object in `sync_list_output` is persisted with DAL.store_predictions_v1
    and the start/end times are printed and logged. `args` is not used.
    """
    # Step 1: generate every possible combination of the configuration and
    # store it as an argument tuple under its uid.
    for HH in range(HH_max):
        for exchange in sorted(exchanges):
            for symbol in sorted(symbols, key=lambda s: s['base_cur']):
                for featuresID, dataset_func in datasets.items():
                    for n_window in n_windows:
                        for interval in intervals:
                            for n_epoch in n_epochs:
                                for n_neuron in n_neurons:
                                    for n_hiddenlay in n_hiddenlayers:
                                        for n_batch_size in n_batch_sizes:
                                            for predict_n_intervals in predict_n_intervals_arr:
                                                # NOTE(review): the second label reads ' base_cur=' although it
                                                # carries quote_cur; kept as is because it is part of the
                                                # model file name.
                                                h5fn = ''.join([
                                                    h5Dir,
                                                    'predictions_v1',
                                                    ' base_cur=', symbol['base_cur'],
                                                    ' base_cur=', symbol['quote_cur'],
                                                    ' fid=', featuresID,
                                                    ' interval=', str(interval),
                                                    ' n_window=', str(n_window),
                                                    ' n_epoch=', str(n_epoch),
                                                    ' n_batch_size=', str(n_batch_size),
                                                    ' n_neuron=', str(n_neuron),
                                                    ' predict_n_intervals=', str(predict_n_intervals),
                                                    ' n_hiddenlay=', str(n_hiddenlay),
                                                ])
                                                _dtime = adjustDatetime_realtime(
                                                    interval,
                                                    dtstart + timedelta(minutes=HH * interval))
                                                # way to parallellize processing
                                                # +"_"+symbol['quote_cur']+"_"+str(n_neuron)+"_"+str(n_window)
                                                uid = symbol['base_cur']
                                                job = (h5fn, featuresID, exchange, symbol, n_window,
                                                       interval, _dtime, predict_n_intervals, n_neuron,
                                                       n_hiddenlay, n_epoch, n_batch_size, dataset_func,
                                                       sync_dict_json, sync_list_output, seq_pred_len)
                                                arrParams.setdefault(uid, []).append(job)

    # Step 2: now that we have our array of jobs/tasks, create one
    # single-worker pool per uid and submit its jobs.
    tasks = {}
    pools = {}
    for idf, jobs in arrParams.items():
        if idf not in pools:
            pools[idf] = multiprocessing.Pool(1)
        worker = pools[idf]
        tasks[idf] = [worker.apply_async(fitAndPredict_trainAlways, tup) for tup in jobs]

    client = DAL.openConnection()
    DAL.liveness_IAmAlive(client, "producer: predictions")

    # Step 3: wait for every job; a failing or timed-out job is reported and skipped.
    for idf, pending in tasks.items():
        for task in pending:
            try:
                task.get(timeout=60 * 20)
            except KeyboardInterrupt:
                raise
            except:
                traceback.print_exc()
        pools[idf].close()

    # Step 4: persist all collected predictions in one go.
    for sendobj in sync_list_output:
        DAL.store_predictions_v1(client, sendobj)

    print("/performance/")
    print("started:")
    print(_dtnow)
    print("ended:")
    print(dtNow())
    print("/exited/")
    print("")

    log = createLogger("predictions_v1_info", "predictions_v1_info")
    log.info("/performance/")
    log.info("started:")
    log.info(str(_dtnow))
    log.info("ended:")
    log.info(str(dtNow()))
    log.info("/exited/")
    log.info("")
def func_ai_b(js, symbol):
    """Build the neural-net input matrix from `js`, with a buy/hold/sell column.

    another type of input format, whereby we also make it predict buy/sell
    positions. this is highly experimental and yielded bad results, but it
    may illustrate how such a thing is done in case you need to extend your
    own version.

    Each entry of `js` that has all of open/close/low/high/volume becomes a
    row [open, close, low, high, volume, signal]; incomplete entries are
    reported (stdout and the "predictions_v1_error" logger) and left out.
    `symbol` is not used.

    signal: 1 = buy, 2 = sell, 0 = hold. A row is a buy when, within the
    look-ahead window that follows it, the average of open and close rises
    by at least 0.5%. Those future rows become sells, and the rows between
    the buy and the last sell become holds. A row keeps the first signal it
    receives.

    Returns the rows as a numpy array; an empty array when no entry is
    usable (this used to raise IndexError).
    """
    # The window is counted from the current row, so that row plus the next
    # `lookahead - 1` rows are covered. The previous bound `j < 20` was
    # absolute: only the first 20 rows of the dataset could ever be labelled.
    lookahead = 20
    min_gain = 1.005  # price in near future increases by 0.5%

    dataset = []
    for key in js:
        # DO NOT USE RELATIVE VALUES FROM API !!! only absolute ones
        if 'open' in js[key] and 'close' in js[key] and 'low' in js[key] and 'high' in js[key] and 'volume' in js[key]:
            dataset.append([
                js[key]['open'],
                js[key]['close'],
                js[key]['low'],
                js[key]['high'],
                js[key]['volume'],
            ])
        else:
            # most likely some missing interval
            print("missing data at interval:")
            print(key)
            logErr = createLogger("predictions_v1_error", "predictions_v1_error")
            logErr.critical("missing data at interval:")
            logErr.critical(key)

    if not dataset:
        return np.array(dataset)

    L = len(dataset)
    Lentry = len(dataset[0])  # row length before a signal has been appended
    for i, x in enumerate(dataset):
        if len(x) != Lentry:
            continue  # this row already received its signal from an earlier buy
        price = (x[0] + x[1]) / 2  # avg(open ; close)
        # future rows at which selling would make a profit when buying 'now'
        jarr = [
            j for j in range(i + 1, min(L, i + lookahead))
            if (dataset[j][0] + dataset[j][1]) / 2 >= price * min_gain
        ]
        if not jarr:
            x.append(0)  # hold
            continue
        x.append(1)  # buy
        for j in jarr:
            if len(dataset[j]) == Lentry:
                dataset[j].append(2)  # sell
        # hold -- fill all gaps between first buy and possible future sells
        for j in range(i + 1, max(jarr)):
            if len(dataset[j]) == Lentry:
                dataset[j].append(0)

    return np.array(dataset)
def fitAndPredict_trainAlways(h5fn, featuresID, exchange, symbol, n_window, interval, currentDateTime, predict_n_intervals, n_neuron, n_hiddenlay, n_epoch, n_batch_size, dataset_func, sync_dict_json, sync_list_output, seq_pred_len):
    """Train (or re-train) the model stored under `h5fn` and queue its predictions.

    this is the core A.I. training and predictions part.

    Steps:
      1. If no model exists for `h5fn`: fetch a long history, create the
         model, fit it, save model and weights, then clear the Keras session.
      2. Always: fetch a short recent history, load the model and weights,
         fit again, save the weights and extrapolate future values.
      3. Wrap the predictions in a `sendobj` dict and append it to
         `sync_list_output`; nothing is written to the database here.

    Nothing is returned. KeyboardInterrupt is re-raised; every other
    exception is printed and logged to "predictions_v1_error".
    """
    # NOTE(review): `random` is imported but not used in this function.
    import random
    from keras import backend as K
    from keras.callbacks import EarlyStopping
    try:
        # if no model exists: prepare data, create model, train it, save it and clear it.
        if not modelExists(h5fn):
            historymins = f_historymins(interval, n_window, 70)  # 1000
            dataset = obtainDataset(exchange, symbol, interval, historymins, currentDateTime - timedelta(minutes=interval - 1), dataset_func, sync_dict_json)
            # number of columns per row, taken from the first row; raises on an empty dataset
            n_features = len(dataset[0])
            (train, train_X, train_y, scalers) = prepare_trainingset(dataset, n_features, n_window, seq_pred_len)
            print("creating new model: " + h5fn)
            model = createModel(h5fn, n_neuron, n_hiddenlay, n_features, n_window, seq_pred_len)
            # stop early when the training loss has not improved for 30 epochs
            early_stopping_monitor = EarlyStopping(monitor='loss', patience=30, verbose=1)
            history = model.fit(train_X, train_y, epochs=n_epoch, batch_size=n_batch_size, verbose=1, shuffle=False, callbacks=[early_stopping_monitor])  # validation_data=(test_X, test_y),
            saveModel(h5fn, model)
            saveWeights(h5fn, model)
            # saving scaler -- https://stackoverflow.com/questions/41993565/save-scaler-model-in-sklearn
            K.clear_session()
            del model

        # by now a model (already) exists; so we prepare data, load model, train it, make predictions and save the new weights.
        # notice that it's also possible to train the model once (step above), and then omit the "model.fit(...)" function,
        # whereby we don't re-train the model each new generation.
        # if you omit continuous training, you will increase performance, but whether your accuracy is retained
        # (through time) is not documented.
        # let us train once, and then just load model
        historymins = f_historymins(interval, n_window, 3)
        dataset = obtainDataset(exchange, symbol, interval, historymins, currentDateTime, dataset_func, sync_dict_json)
        n_features = len(dataset[0])
        (train, train_X, train_y, scalers) = prepare_trainingset(dataset, n_features, n_window, seq_pred_len)
        model = loadModelAndWeights(h5fn)
        # the re-training pass uses a shorter patience (20) than the initial fit (30)
        early_stopping_monitor = EarlyStopping(monitor='loss', patience=20, verbose=1)
        history = model.fit(train_X, train_y, epochs=n_epoch, batch_size=n_batch_size, verbose=1, shuffle=False, callbacks=[early_stopping_monitor])  # validation_data=(test_X, test_y),
        saveWeights(h5fn, model)
        # presumably one sequence per feature, indexed [feature][step] -- see the
        # xpolated[0..4][j] reads below; TODO confirm against make_future_predictions
        xpolated = make_future_predictions(scalers, train, n_window, n_features, predict_n_intervals, model, seq_pred_len)

        # let's prepare data to be stored into the database:
        # we use real-time datetime to make predictions, but when we persist we'll floor the datetime according to the interval
        currentDateTime = adjustDatetime(interval, currentDateTime)
        tmpdt = currentDateTime + timedelta(minutes=interval)  # timestamp of the first predicted interval
        maxdt = currentDateTime + timedelta(minutes=seq_pred_len * predict_n_intervals * interval)  # last timestamp to emit
        j = 0
        # NOTE(review): `mode` is not assigned anywhere in this function;
        # presumably a module-level global -- confirm it exists.
        sendobj = {
            'data': [],
            'base_cur': symbol['base_cur'],
            'quote_cur': symbol['quote_cur'],
            'interval': interval,
            'timestamp': currentDateTime,
            'exchange': exchange,
            'n_fid': featuresID,
            'n_batch_size': n_batch_size,
            'n_neuron': n_neuron,
            'n_window': n_window,
            'n_epoch': n_epoch,
            'n_predict_intervals': predict_n_intervals,
            'n_hiddenlay': n_hiddenlay,
            'mode': mode
        }
        # one data point per predicted interval, until either the time horizon
        # or the number of extrapolated values is exhausted
        while (tmpdt <= maxdt and j < len(xpolated[0])):
            sendobj['data'].append({
                'timestamp': tmpdt,
                'open': float(xpolated[0][j]),
                'close': float(xpolated[1][j]),
                'low': float(xpolated[2][j]),
                'high': float(xpolated[3][j]),
                'volume': float(xpolated[4][j]),
                # 'signal': float(xpolated[5][j]),
            })
            tmpdt += timedelta(minutes=interval)
            j += 1
        K.clear_session()
        del model

        # instead of writing each prediction individually, we append to a shared output list, which we process at the very end.
        # this was implemented for several reasons (we want all predictions to be updated/stored at the same time, and not with a minute delay).
        # DAL.store_predictions_v1(DAL.openConnection(), sendobj)
        sync_list_output.append(sendobj)
        print(currentDateTime)
    except KeyboardInterrupt:
        raise
    except Exception as ex:
        traceback.print_exc()
        logErr = createLogger("predictions_v1_error", "predictions_v1_error")
        logErr.critical(str(ex), exc_info=True)
import os
import multiprocessing as mp
from datetime import datetime, timedelta
import nltk
from facepy import GraphAPI
import pprint
import math
import time
import sys
sys.path.insert(0, '/home/cryptopredicted/')
from mysettings import CRYPTO_facebookPages, CRYPTO_socialKeywords, dtNow, createLogger
import producerMgr
producer = producerMgr.create_kafkaProducer()
import DAL

logErr = createLogger("facebookProducer_error", "facebookProducer_error")
log = createLogger("facebookProducer_info", "facebookProducer_info")

# Facebook Graph API only allows about 600 API calls per 600 seconds (1 call per sec).
# It is also limited to 50 (but this is handled by facepy).
# Every entry of a batch counts as one call,
# so basically we can scrape 600 pages and then sleep for 10 min.
# If we have more than 600 pages then we have to sleep(10*60) between every 600.
# However, in reality we are not going to have more than 600 pages any time soon,
# since we only analyze big/mainstream pages.
# Minimum seconds to wait between each batch: wait = len(batch)
# To be on the safe side: wait = ceil(wait*1.30), and wait = 60 if wait < 60
# (if we only have one page then we don't want to poll every 3 sec, but every minute).
post_arr = []

# SECURITY: access tokens do not belong in source control. The token is read
# from the FACEBOOK_GRAPH_TOKEN environment variable; the committed token is
# kept only as a fallback so existing deployments keep working.
# TODO: rotate the committed token and remove the fallback. A second,
# commented-out token that used to sit here was removed for the same reason.
graph = GraphAPI(os.environ.get("FACEBOOK_GRAPH_TOKEN", "109177903224114|FTDEjpb8JvBi-D67mrhwnQvZG38"))