def train_models(self):
    MODELS_TO_TRAIN = [self.model_1_pre_process, self.model_2_pre_process]
    optimal_thetas = [[] for _ in xrange(len(MODELS_TO_TRAIN))]
    for i, model_preprocessor in enumerate(MODELS_TO_TRAIN):
        processor = DataProcessor()
        processor.load_input()
        processor.load_output()
        # add intercept term
        processor.input = np.insert(processor.input, 0, 1, 1)
        processor.input = processor.input.astype('float64')
        processor.input = model_preprocessor(processor.input)
        n = processor.input.shape[1] - 1
        theta_init = np.zeros(n + 1)
        theta_init = np.matrix(theta_init)
        theta_init = theta_init.transpose()
        processor.split_to_training_test(self.TRAINING_TEST_DATA_SPLIT_RATIO)
        X = processor.training_input
        for j in xrange(processor.num_labels):
            y = (processor.training_output == j)
            y = y.astype(int)
            model = LogisticRegression(theta_init, X, y)
            optimizer = GradientDescent(model)
            theta_optimal = optimizer.find_min()
            theta_optimal = theta_optimal.transpose()
            optimal_thetas[i].append(theta_optimal.tolist()[0])
    # theta indices i,j specify the ith model and jth classification type
    with open(self.OUTPUT_FILE, "w") as f:
        for i, thetas in enumerate(optimal_thetas):
            for j, theta in enumerate(thetas):
                theta_str = ','.join(map(str, theta))
                f.write("theta_%d_%d=%s\n" % (i, j, theta_str))
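# A minimal sketch (not part of the original class) of how the
# "theta_<i>_<j>=v1,v2,..." lines written above could be loaded back into
# per-model, per-class weight vectors; the helper name load_thetas is
# hypothetical.
def load_thetas(path):
    thetas = {}
    with open(path) as f:
        for line in f:
            key, _, values = line.strip().partition("=")
            # a key such as "theta_0_2" means model index 0, class index 2
            _, i, j = key.split("_")
            thetas[(int(i), int(j))] = [float(v) for v in values.split(",")]
    return thetas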
def __init__(self, start_date, stop_date, file_path, c_logger=None, data_processor=None):
    """
    Init method of 'JsonReportGenerator' class.

    :param start_date: The start date.
    :param stop_date: The end date.
    :param file_path: Path of the generated file.
    :param c_logger: Logger instance; a default logger is created if omitted.
    :param data_processor: Instance of DataProcessor module (optional).
    """
    self.start_date = start_date.replace(" ", "")
    self.stop_date = stop_date.replace(" ", "")
    self.file_path = file_path
    self.c_logger = c_logger if c_logger else self.__set_up_default_logger()
    self.data_processor = (
        data_processor if data_processor else DataProcessor(c_logger=self.c_logger)
    )
def TestVocabMapping():
    dataFile = "./dataset/samples/qa-dump-1460090355004_new.json"
    wordToIdFile = "./wordToId.json"
    idToWordFile = "./idToWord.json"
    dataProvider = DataProcessor(dataFile)
    dataProvider.BuildVocab()
    dataProvider.SaveVocab(wordToIdFile, idToWordFile)
    dataProvider.LoadVocab(wordToIdFile, idToWordFile)
    dataProvider.TranslateWordToIdPerArticle()
    data = dataProvider.data
    for title in data.keys():
        article = data[title]
        sentencesInId = article["textInSentencesInId"]
        sentencesInWordsFromId = dataProvider.TranslateIdToWord(sentencesInId)
        sentencesInWords = SentenceToWord(article["textInSentences"])
        for s0, s1 in zip(sentencesInWords, sentencesInWordsFromId):
            assert len(s0) == len(s1)
            for w0, w1 in zip(s0, s1):
                assert w0 == w1
    print "Vocab Mapping test passed!"
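# A self-contained sketch of the round-trip invariant the test above checks:
# mapping words to ids and back should reproduce the original tokens for
# in-vocabulary words. The names below are illustrative, not the actual
# DataProcessor API.
def build_vocab(sentences):
    word_to_id, id_to_word = {}, {}
    for sentence in sentences:
        for word in sentence:
            if word not in word_to_id:
                idx = len(word_to_id)
                word_to_id[word] = idx
                id_to_word[idx] = word
    return word_to_id, id_to_word

sentences = [["the", "cat", "sat"], ["the", "dog", "ran"]]
w2i, i2w = build_vocab(sentences)
for s in sentences:
    assert [i2w[w2i[w]] for w in s] == s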
def main():
    terminal_command = sys.argv[1:]
    terminal_parser = TerminalParser()
    terminal_parser.add_argument('-i', '--input', default=[], nargs='+')
    terminal_parser.add_argument('-o', '--output', default=[], nargs='+')
    input_files = terminal_parser.get_list_of_input_files(terminal_command)
    output_files = terminal_parser.get_list_of_output_files(terminal_command)
    data_processor = DataProcessor(input_files)
    file_with_min_col = data_processor.get_file_with_min_amount_of_columns()
    columns_of_result_file = data_processor.get_sorted_columns_of_result_file()
    data_composer = DataComposer(input_files, output_files)
    data_composer.record_first_file_content_into_basic_result_file(
        file_with_min_col, columns_of_result_file)
    data_composer.record_leftovers_files_into_basic_result(columns_of_result_file)
    data_composer.sort_basic_results_file_content()
    data_composer.record_advanced_results_based_on_basic()
def __init__(self, config):
    self.config = config
    self.sess_model_list = []
    self.graph_list = []
    self.signature_def_list = []
    self._read_sessions(self.config.predict.model_dirs,
                        self.config.predict.model_tag)
    if self.config.predict.cascade_model_dirs and self.config.predict.use_cascade_model:
        self._read_sessions(self.config.predict.cascade_model_dirs,
                            self.config.predict.model_tag)
    self.model_weights = []
    for model_weight in self.config.predict.model_weights:
        self.model_weights.append(float(model_weight))
    assert len(self.model_weights) == len(self.sess_model_list)
    self.data_processor = DataProcessor(config)
    self.data_processor.load_all_dict()
    self.feature_debug_file = codecs.open("feature_debug.txt", "w",
                                          encoding=util.CHARSET)
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    knowledge_tree = KnowledgeTree(FLAGS.graph_path)
    data_processor = DataProcessor(FLAGS.data_path, knowledge_tree,
                                   FLAGS.max_sequence, FLAGS.max_entity)
    question_feature, entity_feature, labels = data_processor.get_training_samples()
    train_numbers = len(question_feature)
    training_steps = int(train_numbers / FLAGS.train_batch_size * FLAGS.train_epoch)
    input_fn = input_fn_builder(question_feature, entity_feature, labels)
    valid_question, valid_entity, valid_label = data_processor.get_valid_samples()
    valid_numbers = len(valid_question)
    valid_steps = int(valid_numbers / FLAGS.train_batch_size)
    evaluate_fn = input_fn_builder(valid_question, valid_entity, valid_label)
    model_fn = model_fn_builder(hidden_size=256, fc_size=100, num_labels=2)
    config = tf.estimator.RunConfig(save_checkpoints_steps=300,
                                    log_step_count_steps=10,
                                    save_summary_steps=10,
                                    keep_checkpoint_max=10)
    estimator = tf.estimator.Estimator(model_dir=FLAGS.output_dir,
                                       model_fn=model_fn,
                                       config=config)
    train_spec = tf.estimator.TrainSpec(input_fn=input_fn, max_steps=training_steps)
    eval_spec = tf.estimator.EvalSpec(input_fn=evaluate_fn, steps=valid_steps,
                                      throttle_secs=10)
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
def train():
    # dataset parameters
    batch_size = 20
    # sentence length
    step_num = 40
    data_processor = DataProcessor()
    train_data, test_data = data_processor.load_dataset(batch_size, step_num)
    # model parameters
    # word embedding dimension
    word_embed_dim = 100
    # vocabulary size
    n_words = data_processor.n_words
    # number of tags
    num_tag = data_processor.num_tags
    types_embed_dim = 20
    subtypes_embed_dim = 20
    embed_path = "./data/100.utf8"
    # define the model
    initial_embed = data_processor.load_word2vec(embed_path, 100)
    model = Model(n_words, num_tag, initial_embed=initial_embed)
    # define the optimizer
    opti = tf.keras.optimizers.Adam(learning_rate=0.0001)
    start = None
    for epoch in range(5):
        for idx, batch in enumerate(train_data):
            if start is None:
                start = time.time()
            loss_sum, loss_mean = train_step(opti, batch, model)
            if (idx + 1) % 100 == 0:
                ends = time.time()
                cost = ends - start
                start = time.time()
                weights = model.get_weights()
                with open("./model/model.pkl", "wb") as fw:
                    pickle.dump(weights, fw)
                print(idx + 1, "---->", loss_mean.numpy(), "---> time cost: ", cost)
                test_when_train(data_processor, model, test_data)
def _write(self, responses, write_clients):
    for client in write_clients:
        if client in responses:
            try:
                response = responses[client]
                if response != '':
                    data_processor = DataProcessor()
                    p_data = data_processor.make_parsed_data(response)
                    msg = data_processor.form_message(p_data)
                    if msg != '':
                        client.send(msg)
                    else:
                        print("ERROR: wrong client config or class description format")
                else:
                    client.close()
                    self._clients.remove(client)
            except Exception:
                print('client %s %s disconnected.' % (client.fileno(),
                                                      client.getpeername()))
                client.close()
                self._clients.remove(client)
def test_form_message(self):
    test_data_processor = DataProcessor()
    test_parsed_data = {
        "Class": {
            "Name": "User",
            "Initialization": [{}],
            "Methods": [{
                "Method": "get_apples",
                "Attributes": ["apples"]
            }, {
                "Method": "give_apples",
                "Attributes": ["apples"]
            }],
            "Attributes": ["apples"]
        }
    }
    test_message = {
        "Class": "class User(object):\n",
        "Init": "    def __init__(self):\n        self._apples = None\n\n",
        "Methods": [{
            "Method": "    def get_apples(self, apples):\n        pass\n\n"
        }, {
            "Method": "    def give_apples(self, apples):\n        pass\n\n"
        }, {
            "Method": "    def set_apples(self, apples):\n        self._apples = apples\n\n"
        }, {
            "Method": "    def get_apples(self):\n        return self._apples\n\n"
        }]
    }
    test_message = json.dumps(test_message)
    self.assertEqual(test_data_processor.form_message(test_parsed_data),
                     test_message.encode('utf-8'))
def __init__(
    self,
    main_window,
    c_logger=None,
    data_processor=None,
    graph_settings=None,
    graph_settings_file_path=None,
):
    """
    Init method of the 'MainWindow' class.

    :param main_window: Instance of the main Tk window.
    :param c_logger: Logger instance (ColoredLogger type is recommended).
                     Default is MAIN_LOGGER (Global variable).
    :param data_processor: Instance of DataProcessor module.
    :param graph_settings: Instance of the graph settings parser.
    :param graph_settings_file_path: Path of the used graph settings config file.
    """
    self.c_logger = c_logger if c_logger else self.__set_up_default_logger()
    self.main_window = main_window
    self.c_logger.info("Get main window: {}".format(main_window))
    self.c_logger.info("Creating DataProcessor instance.")
    self.data_processor = (
        data_processor if data_processor else DataProcessor(c_logger=self.c_logger)
    )
    self.c_logger.info("DataProcessor instance successfully created.")
    self.graph_settings_config_parser = graph_settings
    self.graph_settings_file_path = graph_settings_file_path
    self.graph_settings_top_level_window = None
    self.__create_new_record_gui_section()
    self.__create_visualisation_gui_section()
    self.__start_visualisation()
def test_parse_attributes(self):
    data = {
        "Attribute keywords": ["have ", "must have ", "has "],
        "Method keywords": ["can ", "should "],
        "Initialization keywords": {
            "Attribute keywords": ["is initializing by setting ", "by default get "],
            "Attribute values keywords": [" as ", " equal to ", " = "]
        }
    }
    description_config = DescriptionConfig(data)
    test_data_processor = DataProcessor()
    test_line1 = "Client have apples"
    test_line2 = "have apples, oranges and bananas"
    test_output1 = ["apples"]
    test_output2 = ["apples", "oranges", "bananas"]
    self.assertEqual(
        test_data_processor.parse_attributes(test_line1, description_config),
        test_output1)
    self.assertEqual(
        test_data_processor.parse_attributes(test_line2, description_config),
        test_output2)
def __init__(
    self,
    main_window,
    c_logger=None,
    data_processor=None,
):
    """
    Init method of the 'MetricsTab' class.

    :param main_window: Instance of the main Tk window.
    :param c_logger: Logger instance (ColoredLogger type is recommended).
                     Default is MAIN_LOGGER (Global variable).
    :param data_processor: Instance of DataProcessor module.
    """
    super(MetricsTab, self).__init__()
    self.c_logger = c_logger if c_logger else self.__set_up_default_logger()
    self.main_window = main_window
    self.c_logger.info("Get main window: {}".format(main_window))
    self.c_logger.info("Creating DataProcessor instance.")
    self.data_processor = (
        data_processor if data_processor else DataProcessor(c_logger=self.c_logger)
    )
    self.c_logger.info("DataProcessor instance successfully created.")
    self.__generate_complete_gui()
    self.date_range = None
def generate_imgs(self, df, scales_path):
    global dict_images
    # Group share
    img_path = DataProcessor().get_group_share(df, df, fig_num)
    table_path = DataProcessor().get_group_share_table(df, df, fig_num)
    dict_images['group_share_chart'] = img_path
    dict_images['group_share_table'] = table_path
    # Client share
    img_path = DataProcessor().get_client_share(df, df, fig_num)
    table_path = DataProcessor().get_client_share_table(df, df, fig_num)
    dict_images['client_share_chart'] = img_path
    dict_images['client_share_table'] = table_path
    # Supplier share
    img_path = DataProcessor().get_supplier_share(df, df, fig_num)  # TODO: change to supplier share
    table_path = DataProcessor().get_supplier_share_table(df, df, fig_num)  # TODO: change to supplier table
    dict_images['supplier_share_chart'] = img_path
    dict_images['supplier_share_table'] = table_path
import json

from data_source import DataSource
from data_processor import DataProcessor
from svg import Svg
from svg_wrapper import SvgWrapper
from field_names import *
from config import config

planet_data = DataSource().get()
star_data, maxima = DataProcessor(planet_data, config['star_count'],
                                  config['sort_order']).get_star_data()
svg_wrapper = SvgWrapper(Svg(), maxima)

# Save the data used to generate the SVG - for debugging purposes
if config['dump_data']:
    with open(config['dump_data_file'], 'w') as f:
        f.write(json.dumps(star_data, indent=4))

for star in star_data:
    svg_wrapper.add_star(star)

out_file = config['out_file']
svg_wrapper.save(out_file)
print('Render complete:', out_file)
def run(self):
    """Run the Scrape TMT articles script."""
    behaviour_df, behaviour_matrix, df_articles, df_users = (
        DataProcessor().generate_reading_behaviour_matrix())
    ModelFitter(behaviour_df, behaviour_matrix, df_articles, df_users).fit_model()
def create_data_processor(measurement, options):
    """Factory function that can be used to switch to a specialized class
    depending on the options and the measurement(s) to be processed.
    """
    return DataProcessor(measurement, options)
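# The docstring above promises option-dependent dispatch, although the body
# currently always returns the base class. A hedged sketch of how such a
# factory might branch; FastDataProcessor and options.use_fast_path are
# illustrative assumptions, not part of the original code.
class FastDataProcessor(DataProcessor):  # hypothetical specialization
    pass

def create_data_processor_sketch(measurement, options):
    if getattr(options, "use_fast_path", False):
        return FastDataProcessor(measurement, options)
    return DataProcessor(measurement, options)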
def train(config):
    # training and test configurations are basically the same
    config_test = copy.deepcopy(config)
    config_test.batch_size = 10
    config_test.seq_length = 1

    # process the training corpus (if not done yet) and return the training
    # batches and other info
    train_data = DataProcessor(config.train_file, config.batch_size,
                               config.seq_length, True, '<unk>', history_size=1)
    test_data = DataProcessor(config_test.test_file, config_test.batch_size,
                              config_test.seq_length, False, '<unk>', history_size=1)
    config.vocab_size = train_data.vocab_size
    config_test.vocab_size = train_data.vocab_size

    # save the training configuration for future need
    if not os.path.isdir(config.save_dir):
        os.makedirs(config.save_dir)
    try:
        with open(os.path.join(config.save_dir, 'config.pkl'), 'wb') as f:
            cPickle.dump(config, f)
    except IOError:
        print("ERROR: Could not open and/or write the config file {}".format(
            os.path.join(config.save_dir, 'config.pkl')))

    with tf.Graph().as_default():
        # create the LM graph for training
        with tf.name_scope("Train"):
            with tf.variable_scope("Model", reuse=None):
                model_train = LM(config, True)
        # create the LM graph for testing with shared parameters
        with tf.name_scope("Test"):
            with tf.variable_scope("Model", reuse=True):
                model_test = LM(config_test, False)

        # run the training/testing
        with tf.Session() as session:
            session.run(tf.global_variables_initializer())
            test_perplexity = model_test.run_model(session, test_data,
                                                   eval_op=None,
                                                   verbosity=10000,
                                                   verbose=True)
            print("\n[INFO] Starting perplexity of test set: %.3f" % test_perplexity)
            print('========================\n')

            # model saving manager
            saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)

            # loop over all epochs
            for e in range(config.num_epochs):
                # reset/define the epoch parameters
                lr_decay = config.decay_rate ** max(e + 1 - config.max_epoch, 0.0)
                session.run(tf.assign(model_train.lr, config.learning_rate * lr_decay))
                print("[INFO] Epoch: %d, Learning rate: %.3f \n" %
                      (e + 1, session.run(model_train.lr)))

                train_perplexity = model_train.run_model(session, train_data,
                                                         eval_op=model_train.train_op,
                                                         verbosity=50000,
                                                         verbose=True)
                test_perplexity = model_test.run_model(session, test_data)
                print("\n[SUMMARY] Epoch: {} | Train Perplexity: {:.3f} | "
                      "Test Perplexity: {:.3f} \n".format(e + 1, train_perplexity,
                                                          test_perplexity))
                print('========================')

                # save the model after each epoch
                model_path = os.path.join(config.save_dir, 'model.ckpt')
                saver.save(session, model_path, global_step=(e + 1))

            # save the final model
            model_path = os.path.join(config.save_dir, 'model.ckpt')
            saver.save(session, model_path)
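# The schedule above scales the base learning rate by
# decay_rate ** max(e + 1 - max_epoch, 0): flat for the first max_epoch
# epochs, then geometric decay. A standalone sketch of the same arithmetic
# (the constants are illustrative assumptions):
base_lr, decay_rate, max_epoch = 1.0, 0.5, 3
for e in range(6):
    lr = base_lr * decay_rate ** max(e + 1 - max_epoch, 0.0)
    print(e + 1, lr)  # epochs 1-3: 1.0, then 0.5, 0.25, 0.125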
            output, state = decoder_cell(input_seq, state)
            output = tf.layers.dense(output, self.vocab_size, activation=None,
                                     reuse=tf.AUTO_REUSE, name="to_vector")
            output = tf.argmax(tf.nn.softmax(output, axis=-1), axis=-1)
            # append the predicted token to the outputs
            outputs.append(output)
            input_seq = tf.nn.embedding_lookup(self.vocab_embedding, output)
        outputs = tf.convert_to_tensor(outputs, dtype=tf.float32)
        outputs = tf.transpose(outputs, [1, 0])
        return outputs

    def build_cost(self, outputs):
        # target_label = tf.one_hot(self.answer_label, depth=self.vocab_size, dtype=tf.float32)
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=outputs, labels=self.answer_label)
        loss = tf.multiply(self.answer_mask, loss)
        loss = tf.reduce_mean(loss)
        return loss


if __name__ == "__main__":
    data_processor = DataProcessor("./data/QA_data/varicocele/",
                                   "./data/QA_data/varicocele/varicocele.json",
                                   word2vec="./data/word2vec/varicocele")
    seq2seq = Seq2Seq(data_processor.start_token, data_processor.vocab_embedding)
import os

from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
from sqlalchemy.exc import IntegrityError

from fact_models import FactArtistByYear, FactGenreByYear, FactSongByYear
from data_processor import DataProcessor
from config import Session, dataSource

session = Session()
dirname = os.path.dirname(__file__)
ds = DataProcessor(os.path.join(dirname, dataSource))
ds.process()

for i, row in ds.artistsByYear().iterrows():
    record = FactArtistByYear(year=row[0], artist=row[1], titles=row[2])
    try:
        session.add(record)
        session.commit()
    except IntegrityError:
        print('FactArtistByYear Record exists for year {}'.format(row[0]))
        session.rollback()

query = session.query(FactArtistByYear)
print('{} records exist in FactArtistByYear'.format(query.count()))

for i, row in ds.genreByYear().iterrows():
    record = FactGenreByYear(year=row[0], genre=row[1], titles=row[2])
    try:
        session.add(record)
        session.commit()
    except IntegrityError:
        # handled analogously to the FactArtistByYear loop above
        print('FactGenreByYear Record exists for year {}'.format(row[0]))
        session.rollback()
def test_read_file_valid_file(self):
    file_path = "../input/Border_Crossing_Entry_Data.csv"
    dp = DataProcessor(input_file_name=file_path)
    self.assertIsNotNone(dp.data)
import tensorflow as tf

import prices as price
from data_processor import DataProcessor

start = "2003-01-01"
end = "2018-01-01"
price.get_price('AAPL', start, end)

process = DataProcessor("AAPL.csv", 0.9)
process.gen_test(10)
process.gen_train(10)

X_train = process.X_train / 200
Y_train = process.Y_train / 200
X_test = process.X_test / 200
Y_test = process.Y_test / 200

model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(100, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(100, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(1, activation=tf.nn.relu))
model.compile(optimizer="adam", loss="mean_squared_error")
model.fit(X_train, Y_train, epochs=100)
print(model.evaluate(X_test, Y_test))
import os

from db_handler import DBHandler
from flask import Flask, request
from data_processor import DataProcessor
from flaskthreads import AppContextThread

weather_app = Flask(__name__)
env_config = os.getenv("APP_SETTINGS", "config.DevelopmentConfig")
weather_app.config.from_object(env_config)
data_processor = DataProcessor(weather_app)
db_handler = DBHandler.get_instance(weather_app)
weather_app.app_context().push()


@weather_app.route('/', methods=['GET'])
def welcome_to_service():
    return "Welcome to my weather service!"


@weather_app.route('/pre_process', methods=['GET'])
def pre_process():
    thread = AppContextThread(target=data_processor.process_files)
    thread.start()
    return "Pre-processing csv files..."


@weather_app.route('/weather/data', methods=['GET'])
def get_data_by_location():
    data = request.get_json()
    try:
        if 'lon' in data and 'lat' in data:
    with open(output_submit_file, "w") as writer:
        for i, pred in enumerate(predicts_all):
            json_d = {}
            json_d['id'] = i
            json_d['label'] = str(pred)
            writer.write(json.dumps(json_d) + '\n')
    print('inference over')


def parse_arguments(arg):
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default="sse",
                        help='Choose model to train.')
    return parser.parse_args(arg)


if __name__ == '__main__':
    args = parse_arguments(sys.argv[1:])
    config = Config()
    print('Loading word vectors.....')
    data_processor = DataProcessor(config)
    executor = Executor(config)
    print('Starting training.....')
    model_name = args.model_name
    config.model_save_path = f'saveModel/{model_name}.pt'
    # train the model
    train(config, model_name, data_processor, executor)
    # run inference
    inference(config, model_name, data_processor, executor)
def main(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()

    data_processor = DataProcessor("./data/conll04_train.json",
                                   "./data/conll04_dev.json",
                                   "./data/conll04_test.json")
    (train_examples, dev_examples), vocabulary = data_processor.get_conll_examples(
        do_training=True)
    logger.info("Example format test")
    logger.info("Orig id: %d" % train_examples[0].orig_id)
    logger.info("Tokens: %s" % (" ".join(train_examples[0].tokens)))
    logger.info("Label: %s" % (" ".join(train_examples[0].label)))

    train_features, train_label = data_processor.convert_example_to_features(
        train_examples, vocabulary)
    dev_features, dev_label = data_processor.convert_example_to_features(
        dev_examples, vocabulary)
    weight_matrix = data_processor.build_lookup_matrix(vocabulary)

    train_data = TensorDataset(train_features, train_label)
    train_dataloader = DataLoader(train_data, batch_size=args.batch_size, shuffle=True)
    train_batches = [batch for batch in train_dataloader]
    dev_data = TensorDataset(dev_features, dev_label)
    dev_dataloader = DataLoader(dev_data, batch_size=args.batch_size)
    eval_step = max(1, len(train_batches) // 5)

    if args.with_crf:
        logger.info("Running %s" % "BiLSTM+CRF")
        model = BiLSTM_CRF(weight_matrix, args.hidden_size,
                           data_processor.label_to_id, "<START>", "<STOP>")
    else:
        logger.info("Running %s" % "BiLSTM")
        model = BiLSTM(weight_matrix, args.hidden_size, args.num_of_tags)
    model.to(device)
    if n_gpu > 1 and not args.with_crf:
        model = torch.nn.DataParallel(model)
    optimizer = Adam(model.parameters(), lr=0.01, weight_decay=0.)

    tr_loss = 0
    tr_num_steps = 0
    max_score = 0.0
    start_time = time.time()
    for epoch in range(args.num_train_epochs):
        model.train()
        logger.info("Start epoch #{} (lr = {})...".format(epoch, 0.01))
        for step, batch in enumerate(train_batches):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_label = batch
            if args.with_crf:
                loss = model.neg_log_likelihood(input_ids, input_label)
            else:
                outputs = model(input_ids)
                loss = loss_fn(outputs, input_label)
            if n_gpu > 1:
                loss = loss.mean()
            tr_loss += loss.item()
            tr_num_steps += 1
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            if (step + 1) % eval_step == 0:
                logger.info(
                    'Epoch: {}, Step: {} / {}, used_time = {:.2f}s, loss = {:.6f}'
                    .format(epoch, step + 1, len(train_batches),
                            time.time() - start_time, tr_loss / tr_num_steps))
                save_model = False
                if args.do_eval:
                    score = evaluate(args, model, device, dev_label, dev_dataloader)
                    print("F1 score: %.6f" % score)
                    model.train()
                    if score > max_score:
                        max_score = score
                        save_model = True
                        logger.info("!!! Best dev %s (lr=%s, epoch=%d): %.6f" %
                                    ("F1", str(0.01), epoch, score))
                else:
                    save_model = True
                if save_model:
                    model_to_save = model.module if hasattr(model, 'module') else model
                    output_model_file = os.path.join(args.output_dir,
                                                     "pytorch_model.bin")
                    torch.save(model_to_save.state_dict(), output_model_file)
                    if max_score:
                        with open(os.path.join(args.output_dir,
                                               "eval_results.txt"), "w") as writer:
                            writer.write("Best eval result: F1 = %.4f" % max_score)

    if args.do_eval:
        [test_examples], _ = data_processor.get_conll_examples(do_training=False)
        test_features, test_label = data_processor.convert_example_to_features(
            test_examples, vocabulary)
        test_data = TensorDataset(test_features, test_label)
        test_dataloader = DataLoader(test_data, batch_size=args.batch_size)
        if args.with_crf:
            model = BiLSTM_CRF(weight_matrix, args.hidden_size,
                               data_processor.label_to_id, "<START>", "<STOP>")
        else:
            model = BiLSTM(weight_matrix, args.hidden_size, args.num_of_tags)
        model.load_state_dict(
            torch.load(os.path.join(args.output_dir, "pytorch_model.bin")))
        model.eval()
        model = model.to(device)
        eval_result_file = os.path.join(args.output_dir, "eval_results.txt")
        if os.path.isfile(eval_result_file):
            with open(eval_result_file) as f:
                line = f.readline()
                logger.info(line)
        test_score = evaluate(args, model, device, test_label, test_dataloader)
        result = "test result: F1 = %.6f" % test_score
        logger.info(result)
print("----------epoch/epochs: {}/{}----------".format( epoch, epochs)) print("Train Loss: {}, Train Acc: {}".format( train_loss, train_acc)) val_acc = eval(model, loss_func, dev_loader) if val_acc >= best_val_acc: best_val_acc = val_acc best_model_params = copy.deepcopy(model.state_dict()) model.load_state_dict(best_model_params) return model if __name__ == "__main__": config = Config() processor = DataProcessor(config.data_path) train_examples = processor.get_train_examples(config.candidates_set_size) dev_examples = processor.get_dev_examples(config.candidates_set_size) train_dataset_tokens = processor.get_dataset_tokens(train_examples) dev_dataset_tokens = processor.get_dataset_tokens(dev_examples) if not os.path.exists(config.vocab_path) or config.update_vocab: processor.create_vocab(train_dataset_tokens, config.vocab_path) train_dataset_indices, vocab_size = processor.get_dataset_indices( train_dataset_tokens, config.vocab_path, config.vocab_size) dev_dataset_indices, _ = processor.get_dataset_indices( dev_dataset_tokens, config.vocab_path, config.vocab_size) config.vocab_size = vocab_size # 实际词表大小
            loss_val += loss.item() * datas.size(0)
            # take the index of the highest predicted probability
            preds = torch.argmax(preds, dim=1)
            labels = torch.argmax(labels, dim=1)
            corrects += torch.sum(preds == labels).item()
        train_loss = loss_val / len(train_loader.dataset)
        train_acc = corrects / len(train_loader.dataset)
        if epoch % 2 == 0:
            print("Train Loss: {}, Train Acc: {}".format(train_loss, train_acc))
        test_acc = test(model, test_loader, loss_func)
        if best_val_acc < test_acc:
            best_val_acc = test_acc
            best_model_params = copy.deepcopy(model.state_dict())
    model.load_state_dict(best_model_params)
    return model


processor = DataProcessor()
train_datasets, test_datasets = processor.get_datasets(vocab_size=vocab_size,
                                                       embedding_size=embedding_size,
                                                       max_len=sentence_max_len)
train_loader = torch.utils.data.DataLoader(train_datasets, batch_size=batch_size,
                                           shuffle=True)
test_loader = torch.utils.data.DataLoader(test_datasets, batch_size=batch_size,
                                          shuffle=True)

model = BiLSTMModel(embedding_size, hidden_size, num_layers, num_directions,
                    num_classes)
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_func = nn.BCELoss()
model = train(model, train_loader, test_loader, optimizer, loss_func, epochs)
def main(c_logger=None):
    if not c_logger:
        # Set up the main logger instance.
        path_of_log_file = os.path.join(PATH_OF_FILE_DIR, "..", "..", "logs",
                                        "main_log.log")
        c_logger = ColoredLogger(os.path.basename(__file__),
                                 log_file_path=path_of_log_file)

    if TEST_RUNNING:
        data_processor_instance = DataProcessor(config=TEST_CONFIG_FILE,
                                                c_logger=c_logger)
        graph_config_parser = set_up_graph_settings_config_parser(
            c_logger=c_logger, config_file=TEST_GRAPH_CONFIG_FILE)
        user_info_parser = set_up_user_info_config_parser(
            c_logger=c_logger, config_file=TEST_USER_INFO_CONFIG_FILE)
    else:
        data_processor_instance = DataProcessor(c_logger=c_logger)
        graph_config_parser = set_up_graph_settings_config_parser(c_logger=c_logger)
        user_info_parser = set_up_user_info_config_parser(c_logger=c_logger)

    window = tk.Tk()
    window.iconphoto(False, tk.PhotoImage(file=PATH_OF_WINDOW_ICON))
    window.title("Time reporting")

    # create a custom ttk theme (based on 'alt') to fix the downarrow button issue
    style = ttk.Style()
    style.theme_create(
        "MyStyle",
        parent="alt",
        settings={
            "TNotebook": {"configure": {"tabmargins": [2, 5, 2, 0]}},
            "TNotebook.Tab": {"configure": {"padding": [50, 2]}},
        },
    )
    style.theme_use("MyStyle")

    note = ttk.Notebook(window)
    main_tab = tk.Frame(note)
    report_config_tab = tk.Frame(note)
    user_config_tab = tk.Frame(note)
    metrics_tab = tk.Frame(note)
    note.add(main_tab, text="Main")
    note.add(report_config_tab, text="Report")
    note.add(user_config_tab, text="User Config")
    note.add(metrics_tab, text="Metrics")
    note.pack(expand=True, fill=tk.BOTH)

    main_exit_button = tk.Button(
        window,
        width=30,
        text="EXIT",
        bg="grey60",
        activebackground="red",
        font="Helvetica 12 bold",
        command=lambda: quit_from_app(window),
    )
    main_exit_button.pack(fill=tk.X)

    main_tab_module.MainWindow(
        main_tab,
        c_logger=c_logger,
        data_processor=data_processor_instance,
        graph_settings=graph_config_parser,
        graph_settings_file_path=GRAPH_CONFIG_FILE,
    )
    report_tab_module.ReportConfigTab(report_config_tab,
                                      c_logger=c_logger,
                                      data_processor=data_processor_instance)
    user_tab_module.UserConfigTab(
        user_config_tab,
        c_logger=c_logger,
        user_info_parser=user_info_parser,
        user_info_config_file_path=USER_INFO_CONFIG_FILE,
    )
    metrics_tab_module.MetricsTab(
        metrics_tab,
        c_logger=c_logger,
        data_processor=data_processor_instance,
    )

    window.protocol("WM_DELETE_WINDOW", lambda: quit_from_app(window))
    window.mainloop()
# This file is an interface to use the model to get recommendations.
# Run this file to get recommendations based on a TV show.
import os
import pickle
import logging
from sys import stdin

import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

from data_processor import DataProcessor

show_data_processor = DataProcessor()


def scrape_data():
    # delete any csv files that currently exist
    if os.path.exists("data/tv.csv"):
        os.remove("data/tv.csv")
    if os.path.exists("logging/metacritic_scraper.log"):
        os.remove("logging/metacritic_scraper.log")

    print('Scraping metacritic')
    # run the metacritic scraper
    import metacritic_scraper

    # delete the related csv file
    if os.path.exists("data/tv_shows_with_features.csv"):
        os.remove("data/tv_shows_with_features.csv")
def test_data_processor():
    num_obs = 2000
    data = pd.DataFrame(
        np.random.randn(num_obs).tolist(),
        columns=["Return"],
        index=[fake.date_time_between_dates(
            datetime_start=datetime(2020, 3, 13, 14, 58, 57),
            datetime_end=datetime(2020, 3, 20, 14, 58, 57),
            tzinfo=None) for x in range(num_obs)])
    # pp(data.Return['2020-03-13 19:55:49.743080':'2020-03-15 13:00:00.866140'])

    z = DataProcessor(data)(TimeFreqFilter(TimePeriod.MINUTE, 15))(
        rolling_mean, col_name="Return", n=5).data
    # pp(z.Return['2020-03-13 19:55:49.743080':'2020-03-15 13:00:00.866140'])

    z2 = DataProcessor(data)(TimeFreqFilter(TimePeriod.HOUR, 1))(
        "between_time", '08:30', '16:30')(
        lambda x: x.rename(columns={"Return": "RETURN"})).data
    # pp(z2.head(5))
    # pp(z2.tail(5))

    z3 = DataProcessor(data)("between_time", '15:59', '16:30')(
        TimeFreqFilter(TimePeriod.BUSINESS_DAY))(
        lambda x: x[x.Return > 0.0])
    # pp(z3.head(5))
    # pp(z3.tail(5))

    z2 = DataProcessor(data).time_freq(TimePeriod.HOUR, 1). \
        between_time('08:30', '16:30').data
    # pp(z2.Return['2020-03-13 19:55:49.743080':'2020-03-15 13:00:00.866140'])

    z2 = DataProcessor(data) \
        (partial(lambda x, y, z: z.loc[x:y], '2020-03-13 08:00', '2020-03-17 08:00')) \
        ("between_time", '08:15', '16:30') \
        (lambda x: x[x.Return > 0.0]) \
        [TimeFreqFilter(TimePeriod.MINUTE, 5, starting=datetime(2017, 6, 1, 8, 15, 0)),
         [DataProcessor.first, np.max, np.min, DataProcessor.last,
          np.median, np.mean, np.std],
         "Return"] \
        (lambda x: x.rename(columns={'amax': 'HIGH', 'amin': 'LOW', 'mean': 'MEAN',
                                     'median': 'MEDIAN', 'first': 'OPEN',
                                     'last': 'CLOSE', 'std': 'STD'})).data
    # pp(z2['2020-03-13 12:00':'2020-03-16 13:00'])
    # pp(z2.head(5).HIGH - z2.head(5).LOW)
    # pp(z2.columns.values)

    z3 = DataProcessor(data).between_time('11:30', '14:00').shift_to_new_column(
        "L1_LOG_RET", "Return", 1).data
    # pp(z3.tail(5))

    z3 = DataProcessor(data).between_time('08:01', '18:30').time_freq(
        TimePeriod.BUSINESS_DAY).positive_column(value_column="Return").data
    # pp(z3.tail(5))

    z3 = DataProcessor(data).index('2020-03-13 19:55:49.743080',
                                   '2020-03-15 13:00:00.866140'). \
        between_time('08:15', '16:30').positive_column(value_column="Return"). \
        summarize_intervals(TimeFreqFilter(TimePeriod.MINUTE, 5,
                                           starting=datetime(2020, 3, 13, 19, 0, 0)),
                            [DataProcessor.first, np.max, np.min, DataProcessor.last,
                             np.median, np.mean, np.std],
                            "Return"). \
        rename_columns(['amax', 'amin', 'mean', 'median', 'first', 'last', 'std'],
                       ['HIGH', 'LOW', 'MEAN', 'MEDIAN', 'OPEN', 'CLOSE', 'STD']).data
    # pp(z3.HIGH - z3.LOW)
    # pp(z3.tail(5))

    z2 = DataProcessor(data).index('2020-03-13 19:55', '2020-03-15 13:00'). \
        between_time('08:15', '16:30').positive_column(value_column="Return"). \
        summarize_intervals(TimeFreqFilter(TimePeriod.MINUTE, 30,
                                           starting=datetime(2020, 3, 14, 8, 0, 0)),
                            [DataProcessor.first, np.max, np.min, DataProcessor.last,
                             np.median, np.mean, np.std],
                            "Return"). \
        rename_columns(['amax', 'amin', 'mean', 'median', 'first', 'last', 'std'],
                       ['HIGH', 'LOW', 'MEAN', 'MEDIAN', 'OPEN', 'CLOSE', 'STD'])(
        lambda x: x[~np.isnan(x.STD)]).data
    # pp(z2.tail(5))

    z2 = DataProcessor(data) \
        (partial(lambda x, y, z: z.loc[x:y], '2020-03-13 19:55', '2020-03-15 13:00')) \
        ("between_time", '08:15', '16:30') \
        (lambda x: x[x.Return > 0.0]) \
        [TimeFreqFilter(TimePeriod.MINUTE, 30, starting=datetime(2020, 3, 14, 8, 0, 0)),
         [DataProcessor.first, np.max, np.min, DataProcessor.last,
          np.median, np.mean, np.std],
         "Return"] \
        (lambda x: x.rename(columns={'amax': 'HIGH', 'amin': 'LOW', 'mean': 'MEAN',
                                     'median': 'MEDIAN', 'first': 'OPEN',
                                     'last': 'CLOSE', 'std': 'STD'})) \
        (partial(duplicate_col, "MEAN", "LogReturn_MEAN")) \
        (partial(duplicate_col, "STD", "LogReturn_STD")) \
        (partial(shift_colname, 'LogReturn_MEAN', -1)) \
        (partial(shift_colname, 'LogReturn_STD', -1)) \
        (lambda x: x[~np.isnan(x.LogReturn_STD) & ~np.isnan(x.STD) &
                     ~np.isnan(x.LogReturn_STD_F1)]).data
import sys
import time
from datetime import datetime, timedelta

from data_processor import DataProcessor

utils = utilities()
logger = utils.formatLogger("BEGIN ETL PROCESS")
logger.info("BEGINNING ETL PROCESS")

end_of_period = None
if len(sys.argv) > 3:
    end_of_period = datetime.strptime(sys.argv[1], '%Y-%m-%d')
    logger.info("SETTING END OF PERIOD DATE - " + str(end_of_period))
    time.sleep(2)
    raw_file_path = sys.argv[2]
    logger.info("SETTING RAW DATA FILE PATH TO - " + str(raw_file_path))
    time.sleep(2)
    processed_file_path = sys.argv[3]
    logger.info("SETTING PROCESSED DATA FILE PATH TO - " + str(processed_file_path))
    time.sleep(2)
else:
    logger.error("ENTER END-OF-PERIOD & DATA FILE PATH")
    exit()

data_processor = DataProcessor(end_of_period, raw_file_path, processed_file_path)
data_processor.process_staging_data()