Python read_csv Exemples, src.helper.read_csv Python Exemples

Exemple #1

0

Afficher le fichier

def test_accuracy_calculator_valid():
    """
    Happy-path check for accuracy_calculator().

    Reads the two csv files named by the 'rec' and 'test_data' entries of the
    test configuration, feeds them to accuracy_calculator() and checks that
    the returned table has 2 rows and 3 columns.
    """
    root = project_path + "/"
    rec_file = root + test_conf['rec']
    test_file = root + test_conf['test_data']

    recommendations = read_csv(rec_file)
    held_out = read_csv(test_file)

    expected_shape = (2, 3)
    assert accuracy_calculator(recommendations, held_out).shape == expected_shape

Exemple #2

0

Afficher le fichier

Fichier : augmentation.py Projet : rachelzhaolp/F.R.I.E.N.D.S_dialogue_generator_and_classifier

def main(args):
    """
    main function perform data augmentation with clean data and save the
    original rows together with the augmented rows to csv

    The input csv must provide the columns 'line' and 'label'. The output
    holds the columns 'label', 'line' and 'type', where 'type' is either
    'original' or 'augmented'. Unlike the other entry points, errors are not
    caught here and propagate to the caller.

    :param args: (argparse) user input; `config`, `input` and `output` are
        paths that get appended to project_path
    """
    root = project_path + "/"
    config_path = root + args.config
    input_data_path = root + args.input
    output_data_path = root + args.output

    config = load_config(config_path)

    # load data
    df = read_csv(input_data_path)

    lines = list(df['line'])
    characters = list(df['label'])

    augmented = augment(lines, config['aug'])

    # Union original lines and augmented lines.
    # assign() returns a new frame, so the column subset of `df` is never
    # mutated in place (the in-place form triggers pandas'
    # SettingWithCopyWarning when the input has additional columns).
    original_rows = df[['label', 'line']].assign(type='original')

    # NOTE(review): zip() stops at the shorter sequence, so this presumes
    # augment() returns exactly one line per input line - confirm.
    augmented_rows = pd.DataFrame(list(zip(characters, augmented)),
                                  columns=['label', 'line'])
    augmented_rows = augmented_rows.assign(type='augmented')

    result = pd.concat([original_rows, augmented_rows])

    save_csv(result, output_data_path)

Exemple #3

0

Afficher le fichier

def main(args):
    """
    Split the input data into a train and a test set and write both to csv.

    The keyword arguments for split() come from the 'split_data' section of
    the configuration file. Errors are logged and not re-raised.

    :param args: (argparse) user input; `config`, `input`, `output_train` and
        `output_test` are paths that get appended to project_path
    """
    try:
        root = project_path + "/"
        config_path = root + args.config
        input_data_path = root + args.input
        out_train_path = root + args.output_train
        out_test_path = root + args.output_test

        settings = load_config(config_path)
        data = read_csv(input_data_path)
        train_part, test_part = split(data, **settings['split_data'])

        # Write both partitions, train first.
        outputs = ((train_part, out_train_path), (test_part, out_test_path))
        for frame, target in outputs:
            save_csv(frame, target)
    except ValueError as bad_value:
        hint = " Please validate Values in the configuration file."
        logger.error("ValueError: " + str(bad_value) + hint)
    except Exception as unexpected:
        prefix = "Unexpected error occurred when splitting data: "
        logger.error(prefix + str(unexpected))

Exemple #4

0

Afficher le fichier

Fichier : gpt_generator.py Projet : rachelzhaolp/F.R.I.E.N.D.S_dialogue_generator_and_classifier

def main(args):
    """
    Generate new dialogue lines with a gpt2 model and save them to csv.

    Prompts are drawn (with replacement, seeded from the configuration) from
    the 'raw_line' column of the input csv; each prompt is passed to
    gpt2.generate() as `prefix`. The model is loaded with load_gpt2()'s
    defaults (the original author notes this is /checkpoint/run1).
    Errors are logged and not re-raised.

    :param args: (argparse) user input; `config`, `input` and `output` are
        paths that get appended to project_path
    """
    try:
        root = project_path + "/"
        config_path = root + args.config
        input_data_path = root + args.input
        output_data_path = root + args.output

        settings = load_config(config_path)

        # Pick the prompts.
        source = read_csv(input_data_path)
        candidates = list(source['raw_line'])
        generate_conf = settings['generate']
        random.seed(generate_conf['random_seed'])
        prompts = random.choices(candidates, k=generate_conf['num'])

        session = gpt2.start_tf_sess()
        gpt2.load_gpt2(session)

        # One generate() call per prompt, results kept in prompt order.
        generated = [
            gpt2.generate(session,
                          prefix=prompt,
                          **generate_conf['generator'])
            for prompt in prompts
        ]

        save_csv(pd.DataFrame(generated, columns=['raw_line']),
                 output_data_path)

    except Exception as unexpected:
        prefix = "Unexpected error occurred when generating dialogues with gpt2: "
        logger.error(prefix + str(unexpected))

Exemple #5

0

Afficher le fichier

def main(args):
    """
    main function to load cleaned data, conduct eda, visualize most important
    tokens with tfidf score

    Everything the eda helpers print is captured in the `output` file; one
    figure per configured group is saved as
    <project_path>/EDA/top_words_<index>.png. sys.stdout is restored and the
    report file closed when the function returns, including on error.
    Errors are logged and not re-raised.

    :param args: (argparse) user input; `config`, `input` and `output` are
        paths that get appended to project_path
    """
    from contextlib import redirect_stdout

    try:
        root = project_path + "/"
        config_path = root + args.config
        input_data_path = root + args.input
        output_data_path = root + args.output

        config = load_config(config_path)

        # load data
        df = read_csv(input_data_path)
        # Plain column assignment replaces the column, so the int dtype
        # sticks; `df.loc[:, col] = ...` may write in place and keep the old
        # dtype on recent pandas versions.
        df['season'] = df['season'].astype('int')

        # Redirect prints into the report file only for the duration of the
        # analysis instead of rebinding sys.stdout for the whole process.
        with open(output_data_path, 'w') as report, redirect_stdout(report):
            check_balance(df)
            check_linelen(df, config['eda']['quantile'])

            for i, group in enumerate(config['eda']['groups']):
                df_top_words = most_important_words(
                    df, group, **config['eda']['top_n_words'])
                fig = plot_tfidf_classfeats_h(df_top_words)
                fig.savefig('{}/EDA/top_words_{}.png'.format(project_path, i))

    except Exception as e:
        logger.error("Unexpected error occurred when eda: " + str(e))

Exemple #6

0

Afficher le fichier

def main(args):
    """
    Evaluate recommendations against test data and save the result to csv.

    Both inputs are read with read_csv(), passed to accuracy_calculator() and
    the value it returns is written with save_csv(). Errors are logged and
    not re-raised.

    :param args: (argparse) user input; `rec`, `test` and `output` are paths
        that get appended to project_path
    """
    try:
        root = project_path + "/"
        rec_path = root + args.rec
        test_data_path = root + args.test
        output_data_path = root + args.output

        recommendations = read_csv(rec_path)
        held_out = read_csv(test_data_path)

        # Compute the scores and write them to the output file.
        save_csv(accuracy_calculator(recommendations, held_out),
                 output_data_path)
    except Exception as unexpected:
        prefix = "Unexpected error occurred when evaluation: "
        logger.error(prefix + str(unexpected))

Exemple #7

0

Afficher le fichier

def main(args):
    """
    Run the market basket analysis and save the recommendations to csv.

    train() is run on the input data with the 'train' section of the
    configuration. Its result is joined with the product table three times
    (on 'StockCode', 'rec1' and 'rec2'), the 'conf1'/'conf2' columns are
    scaled by 100 and rounded to 2 decimals, and only the columns listed
    under 'result_columns' are written. Errors are logged and not re-raised.

    :param args: (argparse) user input; `config`, `input` and `output` are
        paths that get appended to project_path
    """
    try:
        root = project_path + "/"
        config_path = root + args.config
        input_data_path = root + args.input
        output_data_path = root + args.output
        product_path = root + conf.PRODUCT_DIM

        settings = load_config(config_path)
        transactions = read_csv(input_data_path)
        product = read_csv(product_path)

        final_results = train(transactions, **settings['train'])

        # Join the product table once per key column; the product side is
        # always matched on "StockCode".
        for left_key in ("StockCode", "rec1", "rec2"):
            final_results = join_info(final_results, product, left_key,
                                      "StockCode")

        # format conf
        for conf_col in ('conf1', 'conf2'):
            final_results[conf_col] = round(final_results[conf_col] * 100, 2)

        final_results = final_results[settings["result_columns"]]

        # Write to output file
        save_csv(final_results, output_data_path)
    except KeyError as missing_key:
        logger.error("KeyError: " + str(missing_key))
    except ValueError as bad_value:
        hint = " Please validate Values in the configuration file."
        logger.error("ValueError: " + str(bad_value) + hint)
    except Exception as unexpected:
        prefix = "Unexpected error occurred when making recommendations: "
        logger.error(prefix + str(unexpected))

Exemple #8

0

Afficher le fichier

Fichier : product_dim.py Projet : sabyadg/Product-Recommendation-for-Online-Grocery-WebAPP

def main(args):
    """
    Build the product object table from the cleaned transactions and save it
    to csv.

    product_dim() receives the input data plus the 'product_dim' section of
    the configuration as keyword arguments. Errors are logged and not
    re-raised.

    :param args: (argparse) user input; `config`, `input` and `output` are
        paths that get appended to project_path
    """
    try:
        root = project_path + "/"
        config_path = root + args.config
        input_data_path = root + args.input
        output_data_path = root + args.output

        settings = load_config(config_path)
        transactions = read_csv(input_data_path)

        # Build the table and write it to the output file.
        save_csv(product_dim(transactions, **settings['product_dim']),
                 output_data_path)
    except Exception as unexpected:
        prefix = "Unexpected error occurred when creating object table for products: "
        logger.error(prefix + str(unexpected))

Exemple #9

0

Afficher le fichier

Fichier : clean.py Projet : sabyadg/Product-Recommendation-for-Online-Grocery-WebAPP

def main(args):
    """
    Load the raw data, clean it and save the cleaned data to csv.

    clean() receives the raw data plus the 'clean' section of the
    configuration as keyword arguments. Errors are logged and not re-raised.

    :param args: (argparse) user input; `config`, `input` and `output` are
        paths that get appended to project_path
    """
    try:
        root = project_path + "/"
        config_path = root + args.config
        input_data_path = root + args.input
        output_data_path = root + args.output

        settings = load_config(config_path)
        raw = read_csv(input_data_path)

        # Clean and write to the output file.
        save_csv(clean(raw, **settings['clean']), output_data_path)
    except KeyError as missing_key:
        logger.error("KeyError: " + str(missing_key))
    except Exception as unexpected:
        prefix = "Unexpected error occurred when cleaning data: "
        logger.error(prefix + str(unexpected))

Exemple #10

0

Afficher le fichier

Fichier : create_basket.py Projet : sabyadg/Product-Recommendation-for-Online-Grocery-WebAPP

def main(args):
    """
    Load the cleaned data, create baskets and save the baskets to csv.

    create_basket() receives the input data plus the 'create_basket' section
    of the configuration as keyword arguments; the result is written with
    index=True. Errors are logged and not re-raised.

    :param args: (argparse) user input; `config`, `input` and `output` are
        paths that get appended to project_path
    """
    try:
        root = project_path + "/"
        config_path = root + args.config
        input_data_path = root + args.input
        output_data_path = root + args.output

        settings = load_config(config_path)
        cleaned = read_csv(input_data_path)
        baskets = create_basket(cleaned, **settings['create_basket'])

        # Write to output file, keeping the index.
        save_csv(baskets, output_data_path, index=True)
    except KeyError as missing_key:
        logger.error("KeyError: " + str(missing_key))
    except ValueError as bad_value:
        hint = " Please validate Values in the configuration file."
        logger.error("ValueError: " + str(bad_value) + hint)
    except Exception as unexpected:
        prefix = "Unexpected error occurred when creating basket: "
        logger.error(prefix + str(unexpected))

Exemple #11

0

Afficher le fichier

Fichier : bert_classification.py Projet : rachelzhaolp/F.R.I.E.N.D.S_dialogue_generator_and_classifier

def main(args):
    """
    main function to fine-tune the bert classification model

    Steps, in order: load the configuration and the input csv, label-encode
    the 'label' column, split into training and test sets, tokenize with
    'bert-base-uncased', train for the requested number of epochs (running
    the evaluation after every epoch), then save model and tokenizer to
    <model>/<model_name>/.

    While training runs, sys.stdout is redirected to
    <evaluation>/<model_name>/<model_name>.txt; it is restored and the file
    closed afterwards, including on error. Training runs on CUDA when it is
    available and on the CPU otherwise. Errors are logged and not re-raised.

    :param args: (argparse) user input; `config`, `input`, `model` and
        `evaluation` are paths that get appended to project_path.
        `max_length`, `batch_size` and `num_epoch` are optional overrides;
        a falsy value falls back to the 'bert' section of the configuration
    """
    from contextlib import redirect_stdout

    try:
        root = project_path + "/"
        config_path = root + args.config
        input_data_path = root + args.input
        model_path = root + args.model
        evaluation_path = root + args.evaluation

        config = load_config(config_path)

        # load data
        df = read_csv(input_data_path)
        # Encode the classes for BERT.
        encoder = preprocessing.LabelEncoder()
        df['label'] = encoder.fit_transform(df['label'])

        bert_conf = config['bert']

        # Split data into training and test sets.
        X_train, X_test, y_train, y_test = training_test_split(
            df, **bert_conf['training_test_split'])

        # Bert tokenization
        logger.info("Tokenizing...")
        tokenizer = transformers.BertTokenizer.from_pretrained(
            'bert-base-uncased', do_lower_case=True)

        # Command-line values win over the configuration file when given.
        max_length = (int(args.max_length)
                      if args.max_length else bert_conf['max_length'])
        batch_size = (int(args.batch_size)
                      if args.batch_size else bert_conf['batch_size'])

        # DataLoaders for running the model
        dataloader_train = pro_pipline(X_train, tokenizer, max_length,
                                       bert_conf['tokenize'], batch_size,
                                       y_train)
        dataloader_test = pro_pipline(X_test, tokenizer, max_length,
                                      bert_conf['tokenize'], batch_size,
                                      y_test)

        # Initialize the model.
        model = transformers.BertForSequenceClassification.from_pretrained(
            "bert-base-uncased",
            num_labels=df['label'].nunique(),
            output_attentions=False,
            output_hidden_states=False)
        # Setting optimizer
        optimizer = AdamW(model.parameters(), **bert_conf['optimizer'])

        # Setting epochs
        epochs = (int(args.num_epoch)
                  if args.num_epoch else bert_conf['num_epoch'])

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=len(dataloader_train) * epochs)

        # Setting seeds
        # NOTE(review): the seeds are set after the split, the dataloaders
        # and the model have been created, so they only affect what follows
        # - confirm this is intended.
        seed = bert_conf['seed']
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        model_name = 'max_length' + str(max_length) + 'batch_size' + str(
            batch_size) + 'num_epoch' + str(epochs)
        e_dir = evaluation_path + "/" + model_name
        # exist_ok avoids the check-then-create race.
        os.makedirs(e_dir, exist_ok=True)

        # Fall back to the CPU so the script also runs without a GPU.
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Write prints to .txt, but only while training runs.
        report_path = e_dir + "/" + model_name + '.txt'
        with open(report_path, 'w') as report, redirect_stdout(report):
            logger.info("Training... and evaluations will be saved into %s",
                        e_dir)
            model.to(device)

            complete_epoch, training_loss, test_accuracy = [], [], []

            for epoch in tqdm(range(1, epochs + 1)):
                model.train()
                loss_train_total = 0
                progress_bar = tqdm(dataloader_train,
                                    desc='Epoch {:1d}'.format(epoch),
                                    leave=False,
                                    disable=False)
                for batch in progress_bar:
                    model.zero_grad()
                    # Move every tensor of the batch to the device once.
                    batch = tuple(b.to(device) for b in batch)
                    inputs = {
                        'input_ids': batch[0],
                        'attention_mask': batch[1],
                        'labels': batch[2],
                    }
                    outputs = model(**inputs)

                    loss = outputs[0]
                    loss_train_total += loss.item()
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                    optimizer.step()
                    scheduler.step()
                    # NOTE(review): len(batch) is the number of tensors in
                    # the tuple, not the number of samples - confirm this
                    # is the intended divisor for the displayed loss.
                    progress_bar.set_postfix({
                        'training_loss':
                        '{:.3f}'.format(loss.item() / len(batch))
                    })
                # training loss
                tqdm.write(f'\nEpoch {epoch}')
                loss_train_avg = loss_train_total / len(dataloader_train)
                training_loss.append(loss_train_avg)
                tqdm.write(f'Training loss: {loss_train_avg}')
                # evaluate the model
                eval_plot, val_accuracy = run_evaluation(
                    dataloader_test, model, device, encoder)
                eval_plot.savefig(e_dir + "/" + model_name + '-' +
                                  str(epoch) + '.png')

                test_accuracy.append(val_accuracy)
                complete_epoch.append(epoch)
                # The loss plot is rewritten after every epoch.
                loss_plt = plot_loss(complete_epoch, training_loss,
                                     test_accuracy)
                loss_plt.savefig(e_dir + "/" + model_name + '_loss' + '.png')

        # save the model for future use/retrain
        output_dir = model_path + '/' + model_name + "/"
        os.makedirs(output_dir, exist_ok=True)
        logger.info("Saving model to %s", output_dir)

        # Take care of distributed/parallel training
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)

    except KeyError as e3:
        logger.error("KeyError: " + str(e3))
    except Exception as e:
        logger.error("Unexpected error occurred when training with Bert: " +
                     str(e))

Exemple #12

0

Afficher le fichier

import config.config as config

# Project root: two directory levels above this file's absolute path.
_this_file = path.abspath(__file__)
project_path = path.dirname(path.dirname(_this_file))

# Locations of the two yaml files named by the config module.
_root = project_path + "/"
test_config_path = _root + config.TEST_YAML
func_config_path = _root + config.CONFIG_YAML

# Test configuration first, then the function configuration.
with open(test_config_path, "r") as f:
    test_conf = yaml.load(f, yaml.FullLoader)
with open(func_config_path, "r") as f:
    func_conf = yaml.load(f, yaml.FullLoader)

# Raw input shared by the clean() unit test; read once at import time.
raw_data_path = _root + test_conf['raw_data']
raw_df = read_csv(raw_data_path)


def test_clean_valid():
    """
    Happy-path check for clean().

    Cleans the module-level raw_df with the 'clean' section of the function
    configuration, casts the result to str and compares it with the expected
    csv (the 'clean_data' entry of the test configuration) read as str.
    """
    expected_file = project_path + "/" + test_conf['clean_data']
    expected = pd.read_csv(expected_file, dtype=str)

    actual = clean(raw_df, **func_conf['clean']).astype(str)

    assert expected.equals(actual)