def test_accuracy_calculator_valid():
    """
    Test accuracy_calculator() --happy path

    Reads the recommendation and test-data fixtures named in ``test_conf``
    and checks that the result's ``shape`` is ``(2, 3)``.
    """
    fixture_root = project_path + "/"
    rec_fixture = fixture_root + test_conf['rec']
    test_fixture = fixture_root + test_conf['test_data']

    recommendations = read_csv(rec_fixture)
    held_out = read_csv(test_fixture)

    expected_shape = (2, 3)
    assert accuracy_calculator(recommendations, held_out).shape == expected_shape
def main(args):
    """
    main function: perform data augmentation on the clean data and save the
    original plus augmented lines to csv

    The ``line`` column is passed to ``augment`` together with the ``aug``
    section of the configuration. The returned lines are paired positionally
    with the input ``label`` column. The output has the columns ``label``,
    ``line`` and ``type``, where ``type`` is ``'original'`` or
    ``'augmented'``.

    :param args: (argparse) user-input configuration; ``config``, ``input``
        and ``output`` are read as paths relative to ``project_path``
    """
    config_path = project_path + "/" + args.config
    input_data_path = project_path + "/" + args.input
    output_data_path = project_path + "/" + args.output

    config = load_config(config_path)

    # load data
    df = read_csv(input_data_path)
    lines = list(df['line'])
    characters = list(df['label'])

    augmented = augment(lines, config['aug'])

    # Union original lines and augmented lines.
    # .assign() returns a new frame, so the 'type' tag is never written into
    # a slice of df (the original pattern triggers SettingWithCopyWarning).
    original_df = df[['label', 'line']].assign(type='original')
    augmented_df = pd.DataFrame(
        list(zip(characters, augmented)),
        columns=['label', 'line']).assign(type='augmented')

    result = pd.concat([original_df, augmented_df])
    save_csv(result, output_data_path)
def main(args):
    """
    main function for the train/test split step

    Loads the configuration and the input csv, calls ``split`` with the
    ``split_data`` section of the configuration and writes the two returned
    parts to csv. Errors are logged rather than raised.

    :param args: (argparse) user-input configuration file
    """
    try:
        root = project_path + "/"
        config_path = root + args.config
        input_data_path = root + args.input
        train_path = root + args.output_train
        test_path = root + args.output_test

        settings = load_config(config_path)
        train_part, test_part = split(read_csv(input_data_path),
                                      **settings['split_data'])

        # Write to output file
        save_csv(train_part, train_path)
        save_csv(test_part, test_path)
    except ValueError as err:
        message = ("ValueError: " + str(err) +
                   " Please validate Values in the configuration file.")
        logger.error(message)
    except Exception as err:
        message = "Unexpected error occurred when splitting data: " + str(err)
        logger.error(message)
def main(args):
    """
    main function to generate new text with gpt2

    Prefixes are sampled with ``random.choices`` (seeded from the
    configuration) from the ``raw_line`` column of the input csv. Each prefix
    is passed to ``gpt2.generate`` and the collected outputs are saved to csv
    under a ``raw_line`` column. The checkpoint is whatever ``gpt2.load_gpt2``
    loads when called without further arguments (the original note says
    /checkpoint/run1 by default). Errors are logged rather than raised.

    :param args: (argparse) user-input configuration file
    """
    try:
        root = project_path + "/"
        config_path = root + args.config
        input_data_path = root + args.input
        output_data_path = root + args.output

        settings = load_config(config_path)

        # load data
        seed_pool = list(read_csv(input_data_path)['raw_line'])

        random.seed(settings['generate']['random_seed'])
        prefixes = random.choices(seed_pool, k=settings['generate']['num'])

        session = gpt2.start_tf_sess()
        gpt2.load_gpt2(session)

        generated = [
            gpt2.generate(session, prefix=prefix,
                          **settings['generate']['generator'])
            for prefix in prefixes
        ]

        save_csv(pd.DataFrame(generated, columns=['raw_line']),
                 output_data_path)
    except Exception as err:
        message = ("Unexpected error occurred when generating dialogues with gpt2: "
                   + str(err))
        logger.error(message)
def main(args):
    """
    main function to load cleaned data, conduct eda, visualize most important
    tokens with tfidf score

    Everything printed by the EDA helpers is captured in the output file. One
    figure per configured group is saved as
    ``<project_path>/EDA/top_words_<i>.png``. stdout is restored and the
    report file is closed when the function returns, including on error.
    Errors are logged rather than raised.

    :param args: (argparse) user-input configuration file; ``config``,
        ``input`` and ``output`` are read as paths relative to
        ``project_path``
    """
    # Local import keeps this block self-contained.
    import contextlib

    try:
        config_path = project_path + "/" + args.config
        input_data_path = project_path + "/" + args.input
        output_data_path = project_path + "/" + args.output

        config = load_config(config_path)

        # load data
        df = read_csv(input_data_path)
        # Plain column assignment replaces the column, so the int dtype is
        # kept; `.loc[:, col] = ...` may write in place and keep the old
        # dtype on newer pandas versions.
        df['season'] = df['season'].astype('int')

        # Redirect prints into the report only for the duration of the EDA.
        # The original rebound sys.stdout permanently and never closed the
        # file.
        with open(output_data_path, 'w') as report, \
                contextlib.redirect_stdout(report):
            check_balance(df)
            check_linelen(df, config['eda']['quantile'])

            for i, group in enumerate(config['eda']['groups']):
                df_top_words = most_important_words(
                    df, group, **config['eda']['top_n_words'])
                fig = plot_tfidf_classfeats_h(df_top_words)
                fig.savefig('{}/EDA/top_words_{}.png'.format(project_path, i))
    except Exception as e:
        logger.error("Unexpected error occurred when eda: " + str(e))
def main(args):
    """
    main function to evaluate recommendations

    Reads the recommendation csv and the test-data csv, passes both to
    ``accuracy_calculator`` and saves the result to csv. Errors are logged
    rather than raised.

    :param args: (argparse) user-input configuration file
    """
    try:
        root = project_path + "/"
        rec_path = root + args.rec
        test_data_path = root + args.test
        output_data_path = root + args.output

        recommendations = read_csv(rec_path)
        held_out = read_csv(test_data_path)

        # Write to output file
        save_csv(accuracy_calculator(recommendations, held_out),
                 output_data_path)
    except Exception as err:
        message = "Unexpected error occurred when evaluation: " + str(err)
        logger.error(message)
def main(args):
    """
    main function to run the market basket analysis and save the
    recommendations to csv

    The trained result is joined with the product table three times (on
    ``StockCode``, ``rec1`` and ``rec2``). ``conf1``/``conf2`` are multiplied
    by 100 and rounded to two decimals, and only the columns listed under
    ``result_columns`` in the configuration are kept. Errors are logged
    rather than raised.

    :param args: (argparse) user-input configuration file
    """
    try:
        root = project_path + "/"
        config_path = root + args.config
        input_data_path = root + args.input
        output_data_path = root + args.output
        product_path = root + conf.PRODUCT_DIM

        settings = load_config(config_path)
        transactions = read_csv(input_data_path)
        products = read_csv(product_path)

        recommendations = train(transactions, **settings['train'])

        # Join product object table to get the name and price.
        for left_key in ("StockCode", "rec1", "rec2"):
            recommendations = join_info(recommendations, products, left_key,
                                        "StockCode")

        # format conf
        for column in ('conf1', 'conf2'):
            recommendations[column] = round(recommendations[column] * 100, 2)

        recommendations = recommendations[settings["result_columns"]]

        # Write to output file
        save_csv(recommendations, output_data_path)
    except KeyError as err:
        logger.error("KeyError: " + str(err))
    except ValueError as err:
        message = ("ValueError: " + str(err) +
                   " Please validate Values in the configuration file.")
        logger.error(message)
    except Exception as err:
        message = ("Unexpected error occurred when making recommendations: "
                   + str(err))
        logger.error(message)
def main(args):
    """
    main function to create the object table for products from the cleaned
    transactions

    Calls ``product_dim`` on the input csv with the ``product_dim`` section
    of the configuration and saves the result to csv. Errors are logged
    rather than raised.

    :param args: (argparse) user-input configuration file
    """
    try:
        root = project_path + "/"
        config_path = root + args.config
        input_data_path = root + args.input
        output_data_path = root + args.output

        settings = load_config(config_path)
        transactions = read_csv(input_data_path)

        # Write to output file
        save_csv(product_dim(transactions, **settings['product_dim']),
                 output_data_path)
    except Exception as err:
        message = ("Unexpected error occurred when creating object table for products: "
                   + str(err))
        logger.error(message)
def main(args):
    """
    main function to load raw data, clean it and save the cleaned data to csv

    Calls ``clean`` on the input csv with the ``clean`` section of the
    configuration. Errors are logged rather than raised.

    :param args: (argparse) user-input configuration file
    """
    try:
        root = project_path + "/"
        config_path = root + args.config
        input_data_path = root + args.input
        output_data_path = root + args.output

        settings = load_config(config_path)
        raw = read_csv(input_data_path)

        # Write to output file
        save_csv(clean(raw, **settings['clean']), output_data_path)
    except KeyError as err:
        logger.error("KeyError: " + str(err))
    except Exception as err:
        message = "Unexpected error occurred when cleaning data: " + str(err)
        logger.error(message)
def main(args):
    """
    main function to load cleaned data, create baskets and save the baskets
    to csv

    Calls ``create_basket`` on the input csv with the ``create_basket``
    section of the configuration; the result is saved with ``index=True``.
    Errors are logged rather than raised.

    :param args: (argparse) user-input configuration file
    """
    try:
        root = project_path + "/"
        config_path = root + args.config
        input_data_path = root + args.input
        output_data_path = root + args.output

        settings = load_config(config_path)
        cleaned = read_csv(input_data_path)
        baskets = create_basket(cleaned, **settings['create_basket'])

        # Write to output file
        save_csv(baskets, output_data_path, index=True)
    except KeyError as err:
        logger.error("KeyError: " + str(err))
    except ValueError as err:
        message = ("ValueError: " + str(err) +
                   " Please validate Values in the configuration file.")
        logger.error(message)
    except Exception as err:
        message = "Unexpected error occurred when creating basket: " + str(err)
        logger.error(message)
def _train_one_epoch(model, dataloader, optimizer, scheduler, device, epoch):
    """
    Run one training pass over ``dataloader`` and return the mean batch loss.

    :param model: called as ``model(input_ids=..., attention_mask=...,
        labels=...)``; the first element of its output is used as the loss
    :param dataloader: iterable of batches whose first three items are the
        input ids, the attention mask and the labels
    :param optimizer: optimizer, stepped once per batch
    :param scheduler: learning-rate scheduler, stepped once per batch
    :param device: torch device the batch tensors are moved to
    :param epoch: (int) epoch number, only used to label the progress bar
    :return: (float) summed batch losses divided by ``len(dataloader)``
    """
    model.train()
    loss_train_total = 0

    progress_bar = tqdm(dataloader,
                        desc='Epoch {:1d}'.format(epoch),
                        leave=False,
                        disable=False)
    for batch in progress_bar:
        model.zero_grad()

        # Move the batch once; the original moved every tensor a second time.
        batch = tuple(b.to(device) for b in batch)
        outputs = model(input_ids=batch[0],
                        attention_mask=batch[1],
                        labels=batch[2])

        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()

        # Clip the gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # len(batch) is the number of tensors in the batch tuple, not a
        # sample count, so the loss is displayed undivided.
        progress_bar.set_postfix(
            {'training_loss': '{:.3f}'.format(batch_loss)})

    return loss_train_total / len(dataloader)


def main(args):
    """
    main function to fine-tune the bert classification model

    Steps: load the data, label-encode ``label``, split into train/test,
    tokenize, train for the requested number of epochs, evaluate after every
    epoch and save the model and tokenizer.

    Outputs:
      * ``<evaluation>/<model_name>/<model_name>.txt`` -- everything printed
        to stdout during training and evaluation
      * ``<evaluation>/<model_name>/<model_name>-<epoch>.png`` -- figure
        returned by ``run_evaluation`` for each epoch
      * ``<evaluation>/<model_name>/<model_name>_loss.png`` -- figure
        returned by ``plot_loss``
      * ``<model>/<model_name>/`` -- saved model and tokenizer

    ``max_length``, ``batch_size`` and ``num_epoch`` given on the command
    line take precedence over the values in the configuration file. Training
    runs on CUDA when available and falls back to CPU otherwise. Errors are
    logged rather than raised.

    :param args: (argparse) user-input configuration file
    """
    # Local import keeps this block self-contained.
    import contextlib

    try:
        config_path = project_path + "/" + args.config
        input_data_path = project_path + "/" + args.input
        model_path = project_path + "/" + args.model
        evaluation_path = project_path + "/" + args.evaluation

        config = load_config(config_path)
        bert_config = config['bert']

        # Setting seeds before anything draws random numbers. The original
        # seeded only after the model had been created, so any random
        # initialisation done by from_pretrained was not reproducible.
        seed = bert_config['seed']
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        # load data
        df = read_csv(input_data_path)

        # Encode the classes for BERT.
        encoder = preprocessing.LabelEncoder()
        df['label'] = encoder.fit_transform(df['label'])

        # Split data into training and test sets.
        X_train, X_test, y_train, y_test = training_test_split(
            df, **bert_config['training_test_split'])

        # Bert tokenization
        logger.info("Tokenizing...")
        tokenizer = transformers.BertTokenizer.from_pretrained(
            'bert-base-uncased', do_lower_case=True)

        # Command-line values override the configuration file.
        max_length = (int(args.max_length) if args.max_length
                      else bert_config['max_length'])
        batch_size = (int(args.batch_size) if args.batch_size
                      else bert_config['batch_size'])
        epochs = (int(args.num_epoch) if args.num_epoch
                  else bert_config['num_epoch'])

        # DataLoaders for running the model
        dataloader_train = pro_pipline(X_train, tokenizer, max_length,
                                       bert_config['tokenize'], batch_size,
                                       y_train)
        dataloader_test = pro_pipline(X_test, tokenizer, max_length,
                                      bert_config['tokenize'], batch_size,
                                      y_test)

        # Initialize the model.
        model = transformers.BertForSequenceClassification.from_pretrained(
            "bert-base-uncased",
            num_labels=df['label'].nunique(),
            output_attentions=False,
            output_hidden_states=False)

        # Setting optimizer
        optimizer = AdamW(model.parameters(), **bert_config['optimizer'])

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=len(dataloader_train) * epochs)

        # The run name encodes the hyper-parameters; it is used for both the
        # evaluation directory and the saved-model directory.
        model_name = 'max_length' + str(max_length) + 'batch_size' + str(
            batch_size) + 'num_epoch' + str(epochs)
        e_dir = evaluation_path + "/" + model_name
        os.makedirs(e_dir, exist_ok=True)

        logger.info("Training... and evaluations will be saved into %s", e_dir)

        # Fall back to CPU instead of failing when CUDA is not available.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        model.to(device)

        complete_epoch, training_loss, test_accuracy = [], [], []

        # Write prints to .txt -- only for the duration of training and
        # evaluation. The original rebound sys.stdout permanently and never
        # closed the file.
        with open(e_dir + "/" + model_name + '.txt', 'w') as log_file, \
                contextlib.redirect_stdout(log_file):
            for epoch in tqdm(range(1, epochs + 1)):
                loss_train_avg = _train_one_epoch(model, dataloader_train,
                                                  optimizer, scheduler,
                                                  device, epoch)

                # training loss
                tqdm.write(f'\nEpoch {epoch}')
                training_loss.append(loss_train_avg)
                tqdm.write(f'Training loss: {loss_train_avg}')

                # evaluate the model
                eval_plot, val_accuracy = run_evaluation(
                    dataloader_test, model, device, encoder)
                eval_plot.savefig(e_dir + "/" + model_name + '-' + str(epoch) +
                                  '.png')
                test_accuracy.append(val_accuracy)
                complete_epoch.append(epoch)

            loss_plt = plot_loss(complete_epoch, training_loss, test_accuracy)
            loss_plt.savefig(e_dir + "/" + model_name + '_loss' + '.png')

        # save the model for future use/retrain
        output_dir = model_path + '/' + model_name + "/"
        os.makedirs(output_dir, exist_ok=True)
        logger.info("Saving model to %s", output_dir)

        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
    except KeyError as e3:
        logger.error("KeyError: " + str(e3))
    except Exception as e:
        logger.error("Unexpected error occurred when training with Bert: " +
                     str(e))
import config.config as config

# Project root: two directory levels above this file
project_path = path.dirname(path.dirname(path.abspath(__file__)))
test_config_path = project_path + "/" + config.TEST_YAML
func_config_path = project_path + "/" + config.CONFIG_YAML

# Test configuration and function configuration, both read from .yaml
with open(test_config_path, "r") as f:
    test_conf = yaml.load(f, Loader=yaml.FullLoader)

with open(func_config_path, "r") as f:
    func_conf = yaml.load(f, Loader=yaml.FullLoader)

# Raw fixture for the clean() unit test, loaded once at import time
raw_data_path = project_path + "/" + test_conf['raw_data']
raw_df = read_csv(raw_data_path)


def test_clean_valid():
    """
    Test clean --happy path

    Cleans the raw fixture with the ``clean`` settings from the function
    configuration and compares the result, cast to str, with the expected
    csv read with ``dtype=str``.
    """
    expected_path = project_path + "/" + test_conf['clean_data']
    expected = pd.read_csv(expected_path, dtype=str)

    actual = clean(raw_df, **func_conf['clean']).astype(str)

    assert expected.equals(actual)