Example #1
import os

import utils  # local module providing try_mkdir

# true_folder, false_folder, and image_postfix are module-level constants
# defined elsewhere in the original source (not shown in this snippet).
def divid_images_in_folder(folder, labels):
    """Move each labelled image into a true/ or false/ subfolder."""
    utils.try_mkdir(os.path.join(folder, true_folder))
    utils.try_mkdir(os.path.join(folder, false_folder))

    for f in os.listdir(folder):
        if f.endswith(image_postfix):
            fid = int(os.path.splitext(f)[0])
            if labels[fid] == 1:
                os.rename(os.path.join(folder, f),
                          os.path.join(folder, true_folder, f))
            else:
                os.rename(os.path.join(folder, f),
                          os.path.join(folder, false_folder, f))
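Every example on this page calls a small try_mkdir helper from a local utils module whose source is not shown. A minimal sketch of what it presumably does (create a directory and swallow the error if it already exists):

import os


def try_mkdir(path):
    """Create path, ignoring the error raised when it already exists."""
    try:
        os.mkdir(path)
    except FileExistsError:
        pass

If nested paths need creating in one call, os.makedirs(path, exist_ok=True) is the modern one-liner equivalent.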
Example #2
    def __init__(self):

        # model_dir, params_dir, and logs_dir are attributes set earlier in
        # the original class (not shown in this snippet).
        try_mkdir(self.model_dir)
        try_mkdir(self.params_dir)
        try_mkdir(self.logs_dir)

        self._init_data_loaders()
Example #3
    def __init__(self, df):
        self._rows = df.shape[0]
        self._directory = hash_df(df)  # cache directory named after a hash of the frame
        try_mkdir(self._directory)
        self.raw = df
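hash_df is another helper not shown on this page. A plausible sketch, assuming it derives a stable directory name from the DataFrame's contents (the 16-character truncation is arbitrary):

import hashlib

import pandas as pd


def hash_df(df):
    """Derive a short, stable directory name from a DataFrame's contents."""
    row_hashes = pd.util.hash_pandas_object(df, index=True)
    return hashlib.sha1(row_hashes.values.tobytes()).hexdigest()[:16]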
Example #4
    import argparse
    import os
    import pickle

    import pandas as pd
    # Imported so pickle can reconstruct the saved LogisticRegression models.
    from sklearn.linear_model import LogisticRegression
    from utils import try_mkdir

    parser = argparse.ArgumentParser()
    parser.add_argument('input_data_path', type=str)
    parser.add_argument('input_features_path', type=str)
    parser.add_argument('artifacts_path', type=str)
    parser.add_argument('output_data_path', type=str)
    args = parser.parse_args()

    data_df = pd.read_csv(args.input_data_path)
    pred_df = data_df[['id']]
    try_mkdir(args.output_data_path)

    for category in [
            "toxic", "severe_toxic", "obscene", "threat", "insult",
            "identity_hate"
    ]:
        with open(
                os.path.join(args.input_features_path,
                             '{}_features_test.pkl'.format(category)),
                'rb') as f:
            features = pickle.load(f)

        with open(
                os.path.join(args.artifacts_path,
                             '{}_lr.pkl'.format(category)), 'rb') as f:
            lr = pickle.load(f)
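The snippet is cut off after the model is unpickled. A plausible continuation, assuming the goal is a per-category probability table keyed by id (the predictions.csv filename is hypothetical):

        # Hypothetical continuation: score the test features and collect the
        # positive-class probability for each category.
        pred_df[category] = lr.predict_proba(features)[:, 1]

    pred_df.to_csv(os.path.join(args.output_data_path, 'predictions.csv'),
                   index=False)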
Example #5
bootstrapping_start_epoch = 4
bootstrapping_increase_coef = 1.5
bootstrapping_max_usage = 0.75
gradient_accumulation_steps = 4
LOG_NAME = 'finetune-all'
DEBUG = False
EPOCH = 3
save_freq_per_epoch = 0
save_freq_n_epoch = 1
# Exactly one of the two save-frequency settings may be non-zero.
assert bool(save_freq_n_epoch) != bool(save_freq_per_epoch)

data_dir = './data/datasets/yelp_review_polarity_csv/'  # renamed from `dir`, which shadows the builtin

batch_size_train = 32
batch_size_test = 32
try_mkdir(data_dir + 'models')
try_mkdir(data_dir + 'models/logs')
try_mkdir(data_dir + 'models/params')


print('Building data...')
label_ids, raw_train_set = pickle.load(open(data_dir + 'train_set.pk', 'rb'))
_, test_set = pickle.load(open(data_dir + 'test_set.pk', 'rb'))

_, train_set_unused = pickle.load(open(data_dir + 'train_set_unused.pk', 'rb'))

# Each optimizer step accumulates gradients over several smaller batches,
# so shrink the per-forward batch to keep the effective batch size at 32.
batch_size_train = batch_size_train // gradient_accumulation_steps
train_loader = DataLoader(raw_train_set, sampler=RandomSampler(raw_train_set), batch_size=batch_size_train)
test_loader = DataLoader(test_set, sampler=SequentialSampler(test_set), batch_size=batch_size_test)
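For context, a minimal sketch of the gradient-accumulation loop such a config typically drives; model, loss_fn, and optimizer are placeholders, not names from the original script:

optimizer.zero_grad()
for i, (seq, attn_masks, labels) in enumerate(train_loader):
    loss = loss_fn(model(seq, attn_masks), labels)
    # Scale so the accumulated gradient equals the average over the full batch.
    (loss / gradient_accumulation_steps).backward()
    if (i + 1) % gradient_accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()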

Example #6
            seq, attn_masks, labels = seq.to(device), attn_masks.to(
                device), labels.to(device)
            pred = pd.DataFrame(torch.sigmoid(bert(seq,
                                                   attn_masks).cpu()).numpy(),
                                columns=[
                                    "toxic", "severe_toxic", "obscene",
                                    "threat", "insult", "identity_hate"
                                ])
            df = pd.concat([df, pred], ignore_index=True)

    df['id'] = test_df['id']
    # Move the id column from last position to first.
    cols = df.columns.to_list()[-1:] + df.columns.to_list()[:-1]
    df = df[cols]
    return df


if __name__ == "__main__":
    import argparse
    import os

    from utils import try_mkdir

    parser = argparse.ArgumentParser()
    parser.add_argument('test_file', type=str)
    parser.add_argument('model_file', type=str)
    parser.add_argument('prediction_file', type=str)
    parser.add_argument('--maxlen', default=30, type=int)
    parser.add_argument('--batch_size', default=256, type=int)
    args = parser.parse_args()

    df = predict(args.model_file, args.test_file, args.maxlen, args.batch_size)

    try_mkdir(os.path.dirname(args.prediction_file))  # ensure the output directory exists
    df.to_csv(args.prediction_file, index=False)
Example #7
            # opt.zero_grad() is presumably called earlier in the loop (not
            # shown here); otherwise gradients would accumulate across steps.
            loss.backward()

            opt.step()

            if (it + 1) % prints_every == 0:
                print("Iteration {} of epoch {} complete. Loss : {}".format(
                    it + 1, ep + 1, loss.item()))

    return net


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('train_filename', type=str)
    parser.add_argument('model_file', type=str)
    # argparse's type=bool is a pitfall: bool('False') is True, so any value
    # passed on the command line would parse as True. Parse the string explicitly.
    parser.add_argument('--freeze', default=True,
                        type=lambda s: s.lower() in ('true', '1'))
    parser.add_argument('--maxlen', default=30, type=int)
    parser.add_argument('--batch_size', default=128, type=int)
    parser.add_argument('--max_epochs', default=1, type=int)
    parser.add_argument('--n_jobs', default=0, type=int)
    parser.add_argument('--prints_every', default=10, type=int)

    args = parser.parse_args()
    net = train_net(args.train_filename, args.freeze, args.maxlen,
                    args.batch_size, args.max_epochs, args.n_jobs,
                    args.prints_every)

    try_mkdir('/'.join(args.model_file.split('/')[:-1]))  # create the model file's parent directory
    torch.save(net.state_dict(), args.model_file)
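To use the saved weights later, the state dict is loaded back into a freshly constructed network; a sketch assuming the same architecture (Net is a placeholder for whatever class train_net builds):

# Illustrative reload; `Net` stands in for the actual network class,
# which is not shown in this snippet.
net = Net()
net.load_state_dict(torch.load(args.model_file))
net.eval()  # switch off dropout/batch-norm updates for inference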
Example #8
    import argparse
    import os
    import pickle

    import pandas as pd
    from bpemb import BPEmb
    from sklearn.feature_extraction.text import TfidfVectorizer
    from utils import try_mkdir
    from featurizers.mnb_featurizer import MNBFeaturizer

    parser = argparse.ArgumentParser()
    parser.add_argument('input_data_path', type=str)
    parser.add_argument('output_data_path', type=str)
    parser.add_argument('artifacts_path', type=str)
    parser.add_argument('--train', help='Trains featurizer', action='store_true')
    parser.add_argument('--test', help='Transforms test data', action='store_true')
    args = parser.parse_args()

    if args.test:
        data_df = pd.read_csv(args.input_data_path)
        try_mkdir(args.output_data_path)
        for category in ["toxic", "severe_toxic", "obscene", "threat",
                         "insult", "identity_hate"]:
            featurizer = MNBFeaturizer.load(os.path.join(args.artifacts_path, category))
            transformed = featurizer.transform(data_df['comment_text'])
            
            with open(os.path.join(args.output_data_path, '{}_features_test.pkl'.format(category)), 'wb') as f:
                pickle.dump(transformed, f)

    else:
        print('Reading data from {}'.format(args.input_data_path))
        data_df = pd.read_csv(args.input_data_path)
        bpemb_en = BPEmb(lang="en", dim=50, vs=200000)
        tfidf = TfidfVectorizer(tokenizer=bpemb_en.encode)
        tfidf.fit(data_df['comment_text'])

        try_mkdir(args.artifacts_path)
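The training branch is cut off after the artifacts directory is created. A plausible continuation that persists the fitted vectorizer, assuming both it and its BPEmb tokenizer pickle cleanly (the tfidf.pkl filename is hypothetical):

        # Hypothetical continuation: persist the fitted vectorizer so it can
        # be reloaded when featurizing test data.
        with open(os.path.join(args.artifacts_path, 'tfidf.pkl'), 'wb') as f:
            pickle.dump(tfidf, f)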
Example #9
    Arguments:
        df {pd.DataFrame} -- data to process
        column_name {string} -- DataFrame column to clean
    
    Returns:
        pd.DataFrame -- processed dataframe
    """
    df[column_name] = df[column_name].fillna('').str.replace('\n', ' ')
    return df


if __name__ == "__main__":
    import sys
    import os

    input_df_paths = sys.argv[1:]
    for input_df_path in input_df_paths:
        # os.path.splitext/basename are safer than splitting on '.' and
        # os.path.sep, which break on dotted filenames and '/' paths on Windows.
        input_file_name, extension = os.path.splitext(
            os.path.basename(input_df_path))
        output_file_name = input_file_name + '_prepared' + extension
        output_path = os.path.join('data', 'prepared')

        try_mkdir(output_path)

        print('processing {}'.format(input_df_path))
        df = pd.read_csv(input_df_path)
        df_clean = clean_df(df, 'comment_text')
        print('writing {} to {}'.format(output_file_name, output_path))
        df_clean.to_csv(os.path.join(output_path, output_file_name),
                        index=False)
Example #10
    import argparse
    import os
    import pickle
    from datetime import datetime

    import mlflow
    import numpy as np
    import pandas as pd
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.metrics import make_scorer, roc_auc_score
    from utils import try_mkdir
    
    parser = argparse.ArgumentParser()
    parser.add_argument('input_data_path', type=str)
    parser.add_argument('input_features_path', type=str)
    parser.add_argument('artifacts_path', type=str)
    args = parser.parse_args()  
    
    print('Reading data from {}'.format(args.input_data_path))
    data_df = pd.read_csv(args.input_data_path)
    try_mkdir(args.artifacts_path)

    for category in ["toxic", "severe_toxic", "obscene", "threat", "insult",
                     "identity_hate"]:
        with open(os.path.join(args.input_features_path, '{}_features_train.pkl'.format(category)), 'rb') as f:
            features = pickle.load(f)
            
        C_s = [0.01, 0.1, 1, 10, 100]
        model = LogisticRegressionCV(Cs=C_s,
                                     cv=5,
                                     n_jobs=-1,
                                     max_iter=1000,
                                     scoring=make_scorer(roc_auc_score))
        
        print("Fitting lr for category {}".format(category))
        model.fit(features, data_df[category])
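The snippet ends at the fit call. A plausible continuation that saves each fitted model under artifacts_path, since the '{}_lr.pkl' naming is exactly what Example #4 reads back:

        # Hypothetical continuation: persist the fitted model per category,
        # using the '{}_lr.pkl' name that Example #4 loads.
        with open(os.path.join(args.artifacts_path,
                               '{}_lr.pkl'.format(category)), 'wb') as f:
            pickle.dump(model, f)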