def main():
    # Fetch input parameters.
    args = return_args()
    test_input = args.test_data_url
    X, y, x = Data.load_data(args.training_data_file, test_input)

    # Plot data.
    Plot.plot_data(X, y)

    # Train the SVM model.
    model = SVMClassifier().train(X, y)

    # Print training details.
    print('\nmodel.X: {}\nmodel.y: {}\nmodel.alphas: {}\nmodel.w: {}\n'.format(
        model.X, model.y, model.alphas, model.w))

    # Prediction.
    prediction = SVMClassifier().classify(x, model)

    # Print test details.
    print('\nx: {}\nprediction: {}\n'.format(x, prediction))

    # Plot decision boundary.
    Plot.plot_boundary(X, y, x, model)

    print('\nProcessed... {}\n\nURL Classified as: {}\n'.format(
        test_input, 'wrong url' if prediction == 0 else 'correct url'))
def FromJSON(path: str, keywordsPath: str,
             on_generate: Callable[[Data, int, int], None] = None) -> DataSet:
    rawdata = []
    keywords = {}
    with open(keywordsPath, 'r', encoding='utf-8') as f:
        keywords = json.load(f)
    with open(path, 'r', encoding='utf-8') as f:
        rawdata = json.load(f)
    if type(rawdata) is not list:
        rawdata = []
    data = []
    index = 0
    raw_len = len(rawdata)
    for d in rawdata:
        if type(d) is not dict:
            continue
        source = d.get('source', '???').lower()
        title = d.get('title', '').lower()
        text = d.get('text', '').lower()
        # Score the text by keyword hits: negative keywords subtract one,
        # positive keywords add one.
        score = 0
        for negative in keywords.get('negative', []):
            if text.find(negative.lower()) != -1:
                score -= 1
        for positive in keywords.get('positive', []):
            if text.find(positive.lower()) != -1:
                score += 1
        category = DataSet.CategoryFromScore(score)
        result = Data(text, category, source, title, score)
        if on_generate is not None:
            on_generate(result, index, raw_len)
        data.append(result)
        index += 1
    return DataSet(data)
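# Hedged usage sketch (not part of the original module): FromJSON expects the
# keywords file to provide the 'positive'/'negative' lists consumed above, and
# the data file to be a list of objects with 'source', 'title' and 'text'.
# The file names and the progress callback are hypothetical.
#
#   keywords.json: {"positive": ["excellent", "win"], "negative": ["bad", "loss"]}
#   articles.json: [{"source": "blog", "title": "...", "text": "..."}, ...]
#
#   def report_progress(item, index, total):
#       print('{}/{}: {}'.format(index + 1, total, item.category))
#
#   dataset = FromJSON('articles.json', 'keywords.json', on_generate=report_progress)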
def FromAny(text: Union[str, Iterable[str], Iterable[Data], pd.DataFrame],
            category: Union[str, Iterable[str]] = None) -> DataSet:
    data = None
    if type(text) is str and type(category) is str:
        data = DataSet([Data(text, category)])
    elif isinstance(text, pd.DataFrame):
        data = DataSet.FromDataFrame(text)
    elif isinstance(text, DataSet):
        data = text
    elif isinstance(text, IterableType):
        if type(category) is str:
            data = DataSet([Data(x, category) for x in text])
        elif isinstance(category, IterableType):
            data = DataSet(
                [Data(text[i], category[i]) for i in range(len(text))])
    if not data:
        raise Exception(
            'The given input is not supported: <{}, {}>.\n'
            'Use <str, str>, <str[], str|str[]>, '
            '<pandas.DataFrame[words, word_count] | Data[], Unknown>'.format(
                type(text), type(category)))
    return data
class Main():
    # main holds the currently running instance
    main = None

    # create all class objects
    config = Config()
    load_data = Load_data()
    data = Data()
    under = Under()

    # collect the paths of all databases
    databases_names = []
    for database_name in os.listdir('databases'):
        databases_names.append("databases/" + database_name)

    def main(self, main):
        self.main = main
        self.main.config.config(self.main)
        self.main.data.data(self.config)
        # self.main.load_data.load_data(self.main, self.config)
        self.main.check_load_files()

    def new_matches(self, name):
        if (input("Would you like to collect data by " + name +
                  " again: ").lower() == "yes"):
            return True
        else:
            return False

    def check_load_files(self):
        matchesbypermno = False
        up = False
        for file in os.listdir("."):
            if (file == ".matchesbypermno"):
                matchesbypermno = True
            elif (file == ".up"):
                up = True
        if (not matchesbypermno):
            self.main.data.match_by_permno_and_date_wrds_and_xls_or_txt(
                "permno", "permno", "crsp.wrds", "final.xls")
            up = False
        elif (self.new_matches("permno and date")):
            self.main.data.match_by_permno_and_date_wrds_and_xls_or_txt(
                "permno", "permno", "crsp.wrds", "final.xls")
            up = False
        if (not up):
            self.main.under.wrds_and_xml_or_txt_under(self.config, self.data)
        elif (self.new_up("Under Pricing?")):
            self.main.under.wrds_and_xml_or_txt_under(self.config, self.data)
def FromDataFrame(dataframe: pd.DataFrame) -> DataSet:
    data = []
    dataview = dataframe.loc
    for index in dataframe.index:
        row = dataview[index]
        # Rebuild the text by repeating each word column name by its count,
        # skipping the category and score columns.
        text = str.join(' ', [
            i * row[i] for i in row.index
            if i != Data.CATEGORY_NAME and i != Data.SCORE_NAME
        ])
        if Data.CATEGORY_NAME in row:
            score = row[Data.CATEGORY_NAME]
        elif Data.SCORE_NAME in row:
            score = row[Data.SCORE_NAME]
        else:
            score = 0
        category = DataSet.CategoryFromScore(score)
        data.append(Data(text, category=category, score=score))
    return DataSet(data)
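# Hedged usage sketch (not part of the original module): FromDataFrame expects
# one column per word holding that word's count, plus an optional category or
# score column. The column values below are hypothetical.
#
#   frame = pd.DataFrame([{'good': 2, 'bad': 0, Data.SCORE_NAME: 2},
#                         {'good': 0, 'bad': 1, Data.SCORE_NAME: -1}])
#   dataset = FromDataFrame(frame)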
def get_info():
    vehicletable = Data('Vehicles.txt', [
        'License Plate', 'Manufacturer', 'Model', 'Year', 'Location',
        'Category'
    ], [str, str, str, int, str, str])
    ordertable = Data('Orders.txt', [
        'Order ID', 'Customer', 'Vehicle', 'Start Date', 'End Date',
        'Extra Insurance', 'GPS'
    ], [ID, str, str, datetime, datetime, bool, bool])
    for item in ordertable.get_rows():
        order_list.append(item.values())
    for item in vehicletable.get_rows():
        car_list.append(item.values())
    # An order occupies its vehicle from the start date until the day after
    # the end date.
    for item in ordertable.get_rows():
        if item[3].value() <= datetime.now() < (item[4].value() +
                                                timedelta(days=1)):
            occupied.append(item[2].value())
class MultiThreadDownloader:
    def __init__(self, conf={}):
        self.conf = conf
        self.dataHandler = Data(conf)
        self.allCode = self.dataHandler.get("allCode")
        self.date = conf.get('date', Date.getDate())  # crawl the detail data for the given date
        self.sourceName = conf.get('SOURCE_NAME')
        self.threadNum = int(conf.get('THREAD_NUM', THREAD_NUM))  # number of worker threads when crawling in multi-threaded mode

    def download(self):
        """
        >>> app=MultiThreadDownloader(conf)
        >>> app.stock.allCode
        >>> app.download()
        True
        """
        logging.debug("Start downloading data...\nCrawl mode is multi.")
        conf = {}
        conf.update(self.conf)
        conf['handle'] = self.handle
        conf['date'] = self.date
        oQueue = queue.Queue()
        for code in self.allCode:
            if type(code) == int:
                code = Util.getCode(code)
            oQueue.put(code)
        for i in range(self.threadNum):
            conf["queue"] = oQueue
            multiThreadCrawlHandler = MultiThreadHandler(conf=conf)
            multiThreadCrawlHandler.setDaemon(True)
            multiThreadCrawlHandler.start()
        oQueue.join()
        return True

    def handle(self, code, date):
        raise NotImplementedError
def train():
    data = Data()
    data.read_data(filepath='data/train.csv',
                   train_size=TRAIN_SIZE,
                   validation_size=VALIDATION_SIZE,
                   convert_to_one_hot=True)
    # data.train.display_digit()
    sess = tf.InteractiveSession()

    def variable_summaries(var):
        """Attach a lot of summaries to a Tensor (for TensorBoard visualization)."""
        with tf.name_scope('summaries'):
            mean = tf.reduce_mean(var)
            tf.summary.scalar('mean', mean)
            with tf.name_scope('stddev'):
                stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
            tf.summary.scalar('stddev', stddev)
            tf.summary.scalar('max', tf.reduce_max(var))
            tf.summary.scalar('min', tf.reduce_min(var))
            tf.summary.histogram('histogram', var)

    with tf.name_scope('input'):
        input_layer = tf.placeholder(tf.float32, shape=[None, IMAGE_SIZE])
        output_layer = tf.placeholder(tf.float32, shape=[None, N_CLASSES])

    with tf.name_scope('reshape_input'):
        image_shaped_input = tf.reshape(input_layer, [-1, 28, 28, 1])
        tf.summary.image('input', image_shaped_input)

    def weight_variable(shape):
        initial = tf.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial)

    def bias_variable(shape):
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial)

    def convolution_2d(input_tensor, input_dimension, nb_filter, filter_size,
                       name, activation=tf.nn.relu):
        with tf.name_scope(name):
            with tf.name_scope('weights'):
                weights = weight_variable(
                    [filter_size, filter_size, input_dimension, nb_filter])
                variable_summaries(weights)
            with tf.name_scope('biases'):
                biases = bias_variable([nb_filter])
                variable_summaries(biases)
            with tf.name_scope('preactivation'):
                preactivate = conv2d(input_tensor, weights) + biases
                tf.summary.histogram('pre-activation', preactivate)
            activations = activation(preactivate, name='activation')
            tf.summary.histogram('activations', activations)
            return activations

    def conv2d(input_tensor, weights):
        # strided convolution: each convolutional layer also downsamples by 2
        return tf.nn.conv2d(input_tensor, weights, strides=[1, 2, 2, 1],
                            padding='SAME')

    def max_pool_2d(input_tensor, kernel_size, name):
        with tf.name_scope(name):
            # use the kernel_size argument instead of a hard-coded 2x2 window
            return tf.nn.max_pool(input_tensor,
                                  ksize=[1, kernel_size, kernel_size, 1],
                                  strides=[1, 2, 2, 1],
                                  padding='SAME')

    def fully_connected(input_tensor, image_size, nb_filter, n_units, name,
                        activation):
        with tf.name_scope(name):
            with tf.name_scope('weights'):
                weights = weight_variable(
                    [image_size * image_size * nb_filter, n_units])
                variable_summaries(weights)
            with tf.name_scope('biases'):
                biases = bias_variable([n_units])
                variable_summaries(biases)
            with tf.name_scope('preactivation'):
                input_tensor_flat = tf.reshape(
                    input_tensor, [-1, image_size * image_size * nb_filter])
                # same pattern as in convolution_2d
                preactivate = tf.matmul(input_tensor_flat, weights) + biases
                tf.summary.histogram('pre-activation', preactivate)
            if activation == 'NONE':
                return preactivate
            else:
                activations = activation(preactivate, name='activation')
                tf.summary.histogram('activations', activations)
                return activations

    with tf.name_scope('neural_network_architecture'):
        conv_1 = convolution_2d(image_shaped_input, 1, nb_filter=16,
                                filter_size=3, activation=tf.nn.relu,
                                name='convolutional_layer_1')
        conv_2 = convolution_2d(conv_1, 16, nb_filter=32, filter_size=3,
                                activation=tf.nn.relu,
                                name='convolutional_layer_2')
        pool_1 = max_pool_2d(conv_2, kernel_size=2, name='pool_layer_1')
        conv_3 = convolution_2d(pool_1, 32, nb_filter=64, filter_size=3,
                                activation=tf.nn.relu,
                                name='convolutional_layer_3')
        conv_4 = convolution_2d(conv_3, 64, nb_filter=128, filter_size=3,
                                activation=tf.nn.relu,
                                name='convolutional_layer_4')
        pool_2 = max_pool_2d(conv_4, kernel_size=2, name='pool_layer_2')
        fc_1 = fully_connected(pool_2, 1, nb_filter=128, n_units=2048,
                               activation=tf.nn.relu, name='fully_connected_1')
        fc_2 = fully_connected(fc_1, 1, nb_filter=2048, n_units=512,
                               activation=tf.nn.relu, name='fully_connected_2')

        with tf.name_scope('dropout'):
            keep_prob = tf.placeholder(tf.float32)
            tf.summary.scalar('dropout_keep_probability', keep_prob)
            dropped = tf.nn.dropout(fc_2, keep_prob)

        # keep the last layer linear: softmax_cross_entropy_with_logits below
        # expects unscaled logits, so no softmax activation here
        y = fully_connected(dropped, 1, nb_filter=512, n_units=10,
                            activation='NONE', name='fully_connected_3')

    with tf.name_scope('loss_function'):
        diff = tf.nn.softmax_cross_entropy_with_logits(labels=output_layer,
                                                       logits=y)
        with tf.name_scope('total'):
            cross_entropy = tf.reduce_mean(diff)
    tf.summary.scalar('cross_entropy', cross_entropy)

    with tf.name_scope('optimizer'):
        train_step = tf.train.AdamOptimizer(LEARNING_RATE).minimize(
            cross_entropy)

    with tf.name_scope('accuracy'):
        with tf.name_scope('correct_prediction'):
            correct_prediction = tf.equal(tf.argmax(y, 1),
                                          tf.argmax(output_layer, 1))
        with tf.name_scope('accuracy'):
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', accuracy)

    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(LOG_DIR + '/train', sess.graph)
    test_writer = tf.summary.FileWriter(LOG_DIR + '/test')
    tf.global_variables_initializer().run()

    print("\nTraining the network...")
    t = trange(EPOCHS * data.train.images.shape[0] // BATCH_SIZE)
    for i in t:
        # select a batch
        batch_x, batch_y = data.train.batch(BATCH_SIZE)
        # evaluate every 10 steps on the validation set
        if i % 10 == 0:
            summary, acc = sess.run(
                [merged, accuracy],
                feed_dict={
                    input_layer: data.validation.images,
                    output_layer: data.validation.labels,
                    keep_prob: 1.0
                })
            test_writer.add_summary(summary, i)
            print('Accuracy at step %s: %s' % (i, acc))
        else:
            # Record train set summaries, and train on the current batch
            if i % 100 == 99:
                # Record execution stats
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
                summary, _ = sess.run(
                    [merged, train_step],
                    feed_dict={
                        input_layer: batch_x,
                        output_layer: batch_y,
                        keep_prob: DROP_OUT
                    },
                    options=run_options,
                    run_metadata=run_metadata)
                train_writer.add_run_metadata(run_metadata, 'step%03d' % i)
                train_writer.add_summary(summary, i)
                print('Adding run metadata for', i)
            else:
                # Record a summary
                summary, _ = sess.run(
                    [merged, train_step],
                    feed_dict={
                        input_layer: batch_x,
                        output_layer: batch_y,
                        keep_prob: DROP_OUT
                    })
                train_writer.add_summary(summary, i)
    train_writer.close()
    test_writer.close()

    def getActivations(layer, stimuli):
        units = sess.run(layer,
                         feed_dict={
                             input_layer: np.reshape(stimuli, [1, 784],
                                                     order='F'),
                             keep_prob: 1.0
                         })
        plotNNFilter(units)

    def plotNNFilter(units):
        filters = units.shape[3]
        plt.figure(1, figsize=(20, 20))
        n_columns = 6
        n_rows = math.ceil(filters / n_columns) + 1
        for i in range(filters):
            plt.subplot(n_rows, n_columns, i + 1)
            plt.title('Filter ' + str(i))
            plt.imshow(units[0, :, :, i], interpolation="nearest", cmap="gray")

    imageToUse = data.train.images[0]
    data.train.display_digit()
    plt.imshow(np.reshape(imageToUse, [28, 28]), interpolation="nearest",
               cmap="gray")
    plt.show()
    # getActivations(conv_1, imageToUse)
    # getActivations(conv_2, imageToUse)
    # getActivations(conv_3, imageToUse)
    getActivations(conv_4, imageToUse)
    plt.show()
def __init__(self):
    Data.__init__(self)
    'agglomeration': 1/1000,
    'breakage': 1/1000})

# Define model system
system = System(case="Laboratory lactose case study",
                domain=domain,
                ode_settings=ode_settings,
                loss_settings=loss_settings,
                rate_settings=rate_settings,
                dilution=False,
                regularization=1,
                normalize=True)

# Adding sensors
system.add_sensor(name='Temperature', measured=True, controlled=True, unit='C')
system.add_sensor(name='Concentration', measured=True, controlled=False, unit='g/µL')

# Activate phenomena
system.activate_phenomena(['nucleation', 'growth'])

# Create data-set and set up data-shuffler
data = Data(case_id='Demo data')
data.load_from_pickle('demo_data')
time_series_pair = TimeSeriesPair(data=data, system=system)

# Split training and validation data
data.set_batch_pool(pool_batch_id=['Demo batch 0', 'Demo batch 1',
                                   'Demo batch 2', 'Demo batch 3'],
                    pool_type='Training')
data.set_batch_pool(pool_batch_id=['Demo batch 4', 'Demo batch 5',
                                   'Demo batch 6', 'Demo batch 7',
                                   'Demo batch 8', 'Demo batch 9'],
                    pool_type='Validation')
data.set_batch_pool(pool_batch_id=['Demo batch 4'], pool_type='Test')

# Set up hybrid training model
hybrid_model = HybridModel(system=system)

# Compile hybrid model
hybrid_model.training_model.compile(loss=hybrid_model.loss_model.loss,
                                    optimizer='Adam')

# Generate shuffled training and evaluation data
class Stock(object):
    def __init__(self, conf={}):
        self.conf = conf
        self.code = None
        self.date = None
        self.data = Data(self.conf)  # store and cache data

    def __iter__(self):
        """
        support iter function for stock.

        >>> stock.data.adv={'20110804':{'601919':{'close':'11.11'},'601920':{'close':'22.22'}}}
        >>> stock.date = None
        >>> check = True
        >>> for date in stock:
        ...     print date
        20110804
        >>> for code , price in stock['20110804']:
        ...     print code ,price
        601919 11.11
        601920 22.22
        """
        if self.date:
            # if a date is set, return the data for that date;
            # otherwise return all dates in stock
            data = self.data.get(name="adv",
                                 conf={"date": self.date, "code": "all"})
            if data:
                # pair every code for the date with its data
                result = map(lambda code: (code, data.get(code, {})), data)
                if result:
                    return iter(result)
            return iter([])
        else:
            return iter(self.data.adv.keys())

    def __getitem__(self, value):
        """
        Accessing values via [] or setDate changes the stock's reference
        code/date, whereas the index() method does not.

        >>> stock.data.adv={'20110805':{'601919':{'close':'10.0'},'601920':{'close':'22.22'}}}
        >>> stock['601919']['20110805'].close
        '10.0'
        """
        if len(value) == 6:
            self.code = value
        if len(value) == 8:
            self.date = value
        return self

    def __getattr__(self, value):
        """
        define some simple way to access data in stock.

        >>> len(stock.allCode)>1000 #and len(stock.allCode) == len(stock.info)
        True
        >>> len(stock.info) > 100
        True
        >>> stock.data.adv={'20110804':{'601919':{'close':'11.11','volume':'111','high':'12','low':'10',"sequence": [ 7.34]},'601920':{'close':'22.22'}}}
        >>> stock['601919']['20110804'].close
        '11.11'
        >>> stock.volume
        '111'
        >>> stock.high
        '12'
        >>> stock.low
        '10'
        >>> stock['20110804']['601919'].sequence
        [7.34]
        """
        result = self.data.get(name=value,
                               conf={"date": self.date, "code": self.code})
        if result is None:
            return 0
        else:
            return result

    def __len__(self):
        """
        get code length in stock data.

        >>> len(stock) > 1000
        True
        """
        return len(self.allCode)

    def index(self, index):
        self.date = Date.getDate(index, self.date)
        return self

    def ma(self, dateRange):
        """Average share price over the given date range."""
        return self.data.get(name="ma",
                             conf={"date": self.date, "code": self.code,
                                   "dateRange": dateRange}) or 0

    def max(self, dateRange):
        return self.data.get(name="max",
                             conf={"date": self.date, "code": self.code,
                                   "dateRange": dateRange}) or 0

    def min(self, dateRange):
        return self.data.get(name="min",
                             conf={"date": self.date, "code": self.code,
                                   "dateRange": dateRange}) or 0
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
from Timer import Timer
import lmfit

# Construct discretized domain object for hybrid model
domain = Domain(name='Domain')
domain.add_axis(x_min=5, x_max=100, m=30, disc_by='FeretMean', name='FeretMean')

# Create data-set and set up data-shuffler
data = Data(case_id='Laboratory lactose case study')
data.load_from_pickle(
    'C:/Users/rfjoni/PycharmProjects/ParticleModel/projects/CACE_cases/CACE_lactose_study/lactose'
)
data.batches[2].batch_id = 'Batch 1'
data.batches[3].batch_id = 'Batch 2'

# Convert time and temperature data to polynomial fit
# Batch 1
t_batch1 = [
    (measurement.time - data.batches[2].measurements[0].time).total_seconds()
    for measurement in data.batches[2].measurements
]
T_batch1 = [
    measurement.external_sensors[2].value
    for measurement in data.batches[2].measurements
nun_days = 910  # number of candles
batch_size = 1  # split into blocks
# --------------------------------------------------------------------
# instantiate objects
"""
About the data

This data was taken from BMF Bovespa; the period is intraday. Besides the
columns that form a candlestick, technical-indicator columns are attached.

Index(['Hora', 'dif', 'retracao +', 'retracao -', 'RSI', 'M22M44', 'M22M66',
       'M66M44', 'ADX', 'ATR', 'Momentum', 'CCI', 'Bears', 'Bulls', 'Stock1',
       'Stock2', 'Wilians', 'Std', 'MFI', 'target'],
      dtype='object')

The labels encode the price trend: 1 = buy, 2 = sell, 0 = no operation.
"""
data = Data(nun_days, batch_size)
entrada, entrada_trader, base, media, std = data.import_data()

labels = Labels()
data_labels = labels.index_labels(base, entrada)
print('Column names: ', data_labels.columns)
print('Count per category: ', data_labels.target.value_counts())
"""
Data normalization

Standardization gives the data zero mean and unit variance. It is good
practice, especially for distance-based algorithms such as KNN.
"""
# splitting the data
colunas = [
    'Hora', 'dif', 'retracao +', 'retracao -', 'RSI', 'M22M44', 'M22M66',
    'M66M44', 'ADX', 'ATR', 'Momentum', 'CCI', 'Bears', 'Bulls', 'Stock1',
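# A minimal sketch of the standardization described above, assuming the feature
# columns end up in `colunas` and the labelled frame is `data_labels`; using
# sklearn's StandardScaler here is an assumption, the original code may
# standardize differently.
#
#   from sklearn.preprocessing import StandardScaler
#   scaler = StandardScaler()
#   X = scaler.fit_transform(data_labels[colunas])  # zero mean, unit variance per column
#   y = data_labels['target'].values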
class Model:
    def __init__(self, params):
        self.params = params
        self.cnn = Cnn(params)
        # helper class for storing run details
        self.runs = Runs()

    def start(self):
        # tf Graph
        self.x = tf.placeholder("float", [
            None, self.params["post_padding_size"],
            self.params["comment_padding_size"], self.params["word2vec_dim"]
        ], name="input_x")
        self.y_sentiment = tf.placeholder("float", [
            None, self.params["post_padding_size"],
            self.params["n_classes_sentiment"]
        ], name="input_y_sentiment")
        self.y_topics = tf.placeholder("float", [
            None, self.params["post_padding_size"],
            self.params["n_classes_topics"]
        ], name="input_y_topics")
        self.y_emotion = tf.placeholder("float", [
            None, self.params["post_padding_size"],
            self.params["n_classes_emotion"]
        ], name="input_y_emotion")
        self.y_speech_acts = tf.placeholder("float", [
            None, self.params["post_padding_size"],
            self.params["n_classes_speech_acts"]
        ], name="input_y_speech_acts")
        self.keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        self.sequence_length = tf.placeholder(tf.int32, [None])

        fully_connected_params = {
            "out_sentiment_w": tf.Variable(
                tf.random_normal([self.params["n_hidden"],
                                  self.params["n_classes_sentiment"]])),
            "out_topics_w": tf.Variable(
                tf.random_normal([self.params["n_hidden"],
                                  self.params["n_classes_topics"]])),
            "out_emotion_w": tf.Variable(
                tf.random_normal([self.params["n_hidden"],
                                  self.params["n_classes_emotion"]])),
            "out_speech_acts_w": tf.Variable(
                tf.random_normal([self.params["n_hidden"],
                                  self.params["n_classes_speech_acts"]])),
            "out_sentiment_b": tf.Variable(
                tf.random_normal([self.params["n_classes_sentiment"]])),
            "out_topics_b": tf.Variable(
                tf.random_normal([self.params["n_classes_topics"]])),
            "out_emotion_b": tf.Variable(
                tf.random_normal([self.params["n_classes_emotion"]])),
            "out_speech_acts_b": tf.Variable(
                tf.random_normal([self.params["n_classes_speech_acts"]]))
        }

        self.lstm = Lstm(params=self.params,
                         fully_connected_params=fully_connected_params)

        # get predictions
        self.predictions = self.lstm.model(
            x=self.cnn.model(self.x, self.keep_prob),
            sequence_length=self.sequence_length,
            keep_prob=self.keep_prob)

        # define loss
        with tf.name_scope("loss_sentiment"):
            self.cost_sentiment = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(
                    logits=self.predictions["prediction_sentiment"],
                    labels=self.y_sentiment))
        with tf.name_scope("loss_topics"):
            self.cost_topics = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=self.predictions["prediction_topics"],
                    labels=self.y_topics))
        with tf.name_scope("loss_emotions"):
            self.cost_emotions = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=self.predictions["prediction_emotion"],
                    labels=self.y_emotion))
        with tf.name_scope("loss_speech_acts"):
            self.cost_speech_acts = tf.reduce_mean(
                tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=self.predictions["prediction_speech_acts"],
                    labels=self.y_speech_acts))

        # define optimizer
        self.optimizer = tf.train.AdamOptimizer(
            learning_rate=self.params["learning_rate"]).minimize(
                self.cost_sentiment + self.cost_topics + self.cost_emotions +
                self.cost_speech_acts)

        # evaluate model
        with tf.name_scope("accuracy_sentiment"):
            correct_pred_sentiment = tf.equal(
                tf.argmax(self.predictions["prediction_sentiment"], 1),
                tf.argmax(self.y_sentiment, 1))
            self.accuracy_sentiment = tf.reduce_mean(
                tf.cast(correct_pred_sentiment, tf.float32))
        with tf.name_scope("accuracy_topics"):
            correct_pred_topics = tf.equal(
                tf.round(tf.nn.sigmoid(self.predictions["prediction_topics"])),
                tf.round(self.y_topics))
            self.accuracy_topics = tf.reduce_mean(
                tf.cast(correct_pred_topics, tf.float32))
        with tf.name_scope("accuracy_emotion"):
            correct_pred_emotion = tf.equal(
                tf.round(tf.nn.sigmoid(
                    self.predictions["prediction_emotion"])),
                tf.round(self.y_emotion))
            self.accuracy_emotion = tf.reduce_mean(
                tf.cast(correct_pred_emotion, tf.float32))
        with tf.name_scope("accuracy_speech_acts"):
            correct_pred_speech_acts = tf.equal(
                tf.round(tf.nn.sigmoid(
                    self.predictions["prediction_speech_acts"])),
                tf.round(self.y_speech_acts))
            self.accuracy_speech_acts = tf.reduce_mean(
                tf.cast(correct_pred_speech_acts, tf.float32))

        # initializing the variables
        self.init = tf.global_variables_initializer()

        # 'Saver' op to save and restore all the variables
        self.saver = tf.train.Saver()

        # get data object
        self.data = Data(
            filename='data/word2vec/wiki.hr.vec',
            comment_padding_size=self.params["comment_padding_size"],
            post_padding_size=self.params["post_padding_size"],
            word2vec_dim=self.params["word2vec_dim"],
            binary_sentiment=self.params["binary_sentiment"])

        self.runs.create_run()

        # START LEARNING!!!
        self.learn()

    def learn(self):
        with tf.Session() as sess:
            self.sess = sess
            # initialize session
            sess.run(self.init)
            step = 1
            counter = 0
            for epoch in range(0, self.params["max_epoch"]):
                if epoch % self.params["evaluate_every"] == 0:
                    self.evaluate()
                    self.runs.save_model(sess=self.sess, saver=self.saver)
                batch_x = []
                batch_seq_length = []
                batch_y_sentiment = []
                batch_y_topics = []
                batch_y_emotions = []
                batch_y_speech_acts = []
                with open('data/threads/splits/split-0/train.txt',
                          encoding="UTF-8") as f:
                    for line in f:
                        (x, sequence_length_next, y_sentiment_next,
                         y_topics_next, y_emotion_next,
                         y_speech_acts_next) = self.data.get_next(line)
                        batch_x.append(x)
                        batch_seq_length.append(sequence_length_next)
                        batch_y_sentiment.append(y_sentiment_next)
                        batch_y_topics.append(y_topics_next)
                        batch_y_emotions.append(y_emotion_next)
                        batch_y_speech_acts.append(y_speech_acts_next)
                        counter += 1
                        if len(batch_x) == self.params["batch_size"]:
                            # turn input to np.array
                            batch_x = np.array(batch_x)
                            batch_y_sentiment = np.array(batch_y_sentiment)
                            batch_seq_length = np.array(batch_seq_length)
                            # reshape input
                            batch_x = batch_x.reshape(
                                (self.params["batch_size"],
                                 self.params["post_padding_size"],
                                 self.params["comment_padding_size"],
                                 self.params["word2vec_dim"]))
                            batch_y_sentiment = batch_y_sentiment.reshape(
                                (self.params["batch_size"],
                                 self.params["post_padding_size"],
                                 self.params["n_classes_sentiment"]))
                            # TRAIN HERE
                            sess.run(
                                self.optimizer,
                                feed_dict={
                                    self.x: batch_x,
                                    self.y_sentiment: batch_y_sentiment,
                                    self.y_topics: batch_y_topics,
                                    self.y_emotion: batch_y_emotions,
                                    self.y_speech_acts: batch_y_speech_acts,
                                    self.sequence_length: batch_seq_length,
                                    self.keep_prob:
                                        self.params["keep_prob_global_train"]
                                })
                            step += 1
                            if step % self.params["display_step"] == 0:
                                # SENTIMENT
                                acc_sentiment = sess.run(
                                    self.accuracy_sentiment,
                                    feed_dict={
                                        self.x: batch_x,
                                        self.y_sentiment: batch_y_sentiment,
                                        self.sequence_length: batch_seq_length,
                                        self.keep_prob:
                                            self.params["keep_prob_global_train"]
                                    })
                                loss_sentiment = sess.run(
                                    self.cost_sentiment,
                                    feed_dict={
                                        self.x: batch_x,
                                        self.y_sentiment: batch_y_sentiment,
                                        self.sequence_length: batch_seq_length,
                                        self.keep_prob:
                                            self.params["keep_prob_global_train"]
                                    })
                                print("Epoch: " + str(epoch + 1) +
                                      " Iteration: " +
                                      str(step * self.params["batch_size"]))
                                print("[SENTIMENT] " +
                                      " Minibatch Loss= {:.4f}".format(
                                          loss_sentiment) +
                                      ", Minibatch Accuracy= {:.4f}".format(
                                          acc_sentiment))
                                # EMOTIONS
                                acc_emotion = sess.run(
                                    self.accuracy_emotion,
                                    feed_dict={
                                        self.x: batch_x,
                                        self.y_emotion: batch_y_emotions,
                                        self.sequence_length: batch_seq_length,
                                        self.keep_prob:
                                            self.params["keep_prob_global_train"]
                                    })
                                loss_emotion = sess.run(
                                    self.cost_emotions,
                                    feed_dict={
                                        self.x: batch_x,
                                        self.y_emotion: batch_y_emotions,
                                        self.sequence_length: batch_seq_length,
                                        self.keep_prob:
                                            self.params["keep_prob_global_train"]
                                    })
                                print("[EMOTION] " +
                                      " Minibatch Loss= {:.4f}".format(
                                          loss_emotion) +
                                      ", Minibatch Accuracy= {:.4f}".format(
                                          acc_emotion))
                                # TOPICS
                                acc_topics = sess.run(
                                    self.accuracy_topics,
                                    feed_dict={
                                        self.x: batch_x,
                                        self.y_topics: batch_y_topics,
                                        self.sequence_length: batch_seq_length,
                                        self.keep_prob:
                                            self.params["keep_prob_global_train"]
                                    })
                                loss_topics = sess.run(
                                    self.cost_topics,
                                    feed_dict={
                                        self.x: batch_x,
                                        self.y_topics: batch_y_topics,
                                        self.sequence_length: batch_seq_length,
                                        self.keep_prob:
                                            self.params["keep_prob_global_train"]
                                    })
                                print("[TOPICS] " +
                                      " Minibatch Loss= {:.4f}".format(
                                          loss_topics) +
                                      ", Minibatch Accuracy= {:.4f}".format(
                                          acc_topics))
                                # SPEECH ACTS
                                acc_speech_acts = sess.run(
                                    self.accuracy_speech_acts,
                                    feed_dict={
                                        self.x: batch_x,
                                        self.y_speech_acts: batch_y_speech_acts,
                                        self.sequence_length: batch_seq_length,
                                        self.keep_prob:
                                            self.params["keep_prob_global_train"]
                                    })
                                loss_speech_acts = sess.run(
                                    self.cost_speech_acts,
                                    feed_dict={
                                        self.x: batch_x,
                                        self.y_speech_acts: batch_y_speech_acts,
                                        self.sequence_length: batch_seq_length,
                                        self.keep_prob:
                                            self.params["keep_prob_global_train"]
                                    })
                                print("[SPEECH ACTS] " +
                                      "Minibatch Loss= {:.4f}".format(
                                          loss_speech_acts) +
                                      ", Minibatch Accuracy= {:.4f}".format(
                                          acc_speech_acts))
                                print("")
                            # RESET BATCH
                            batch_x = []
                            batch_seq_length = []
                            batch_y_sentiment = []
                            batch_y_topics = []
                            batch_y_emotions = []
                            batch_y_speech_acts = []
            self.evaluate()

    def evaluate(self):
        evaluate = Evaluate(self.data, self.params, self.predictions,
                            self.sess, self.x, self.sequence_length,
                            self.keep_prob, self.y_sentiment, self.y_emotion,
                            self.y_topics, self.y_speech_acts)
        evaluate.execute_evaluation("TRAIN", "train.txt")
        evaluate.execute_evaluation("TEST", "test.txt")