import os

import utils


def divide_images_in_folder(folder, labels):
    """Move each labeled image into a true/ or false/ subfolder.

    true_folder, false_folder and image_postfix are module-level settings;
    the file name (without extension) is taken as the image id.
    """
    utils.try_mkdir(os.path.join(folder, true_folder))
    utils.try_mkdir(os.path.join(folder, false_folder))
    for f in os.listdir(folder):
        if f.endswith(image_postfix):
            fid = int(os.path.splitext(f)[0])
            if labels[fid] == 1:
                os.rename(os.path.join(folder, f),
                          os.path.join(folder, true_folder, f))
            else:
                os.rename(os.path.join(folder, f),
                          os.path.join(folder, false_folder, f))
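# `try_mkdir` from utils is used throughout these scripts but never shown in
# the excerpts; a minimal sketch of what it likely is (an assumption, not the
# repo's actual implementation):
import os


def try_mkdir(path):
    """Create `path` if it does not already exist; do nothing if it does."""
    os.makedirs(path, exist_ok=True)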
def __init__(self):
    try_mkdir(self.model_dir)
    try_mkdir(self.params_dir)
    try_mkdir(self.logs_dir)
    self._init_data_loaders()
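# The attributes used above are not defined in this excerpt; a plausible
# shape for the surrounding class (all names and paths here are illustrative
# assumptions, not the repo's actual values):
class Trainer:
    model_dir = './data/models'
    params_dir = './data/models/params'
    logs_dir = './data/models/logs'

    def _init_data_loaders(self):
        # Build the train/test DataLoaders used elsewhere in the class.
        ...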
def __init__(self, df):
    self._rows = df.shape[0]
    self._directory = hash_df(df)
    try_mkdir(self._directory)
    self.raw = df
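# `hash_df` is not defined in the excerpt; a minimal sketch of a
# content-based DataFrame hash (the exact scheme is an assumption):
import hashlib

import pandas as pd


def hash_df(df):
    """Hex digest that is identical for DataFrames with identical contents."""
    row_hashes = pd.util.hash_pandas_object(df, index=True).values
    return hashlib.sha1(row_hashes.tobytes()).hexdigest()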
import argparse
import os
import pickle

import pandas as pd
from sklearn.linear_model import LogisticRegression

from utils import try_mkdir

parser = argparse.ArgumentParser()
parser.add_argument('input_data_path', type=str)
parser.add_argument('input_features_path', type=str)
parser.add_argument('artifacts_path', type=str)
parser.add_argument('output_data_path', type=str)
args = parser.parse_args()

data_df = pd.read_csv(args.input_data_path)
pred_df = data_df[['id']]
try_mkdir(args.output_data_path)

for category in ["toxic", "severe_toxic", "obscene", "threat",
                 "insult", "identity_hate"]:
    # Load the pre-computed test features and the fitted model for this label.
    with open(os.path.join(args.input_features_path,
                           '{}_features_test.pkl'.format(category)), 'rb') as f:
        features = pickle.load(f)
    with open(os.path.join(args.artifacts_path,
                           '{}_lr.pkl'.format(category)), 'rb') as f:
        lr = pickle.load(f)
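    # The excerpt ends here; one plausible continuation (column layout and
    # output file name are assumptions) scores the test features per label:
    pred_df = pred_df.assign(**{category: lr.predict_proba(features)[:, 1]})

# After the loop, write the combined predictions to the output directory:
pred_df.to_csv(os.path.join(args.output_data_path, 'predictions.csv'), index=False)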
import pickle

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from utils import try_mkdir

# Bootstrapping (self-training) schedule parameters.
bootstrapping_start_epoch = 4
bootstrapping_increase_coef = 1.5
bootstrapping_max_usage = 0.75
gradient_accumulation_steps = 4

LOG_NAME = 'finetune-all'
DEBUG = False
EPOCH = 3
save_freq_per_epoch = 0
save_freq_n_epoch = 1
# Exactly one of the two save frequencies may be set.
assert bool(save_freq_n_epoch) != bool(save_freq_per_epoch)

data_dir = './data/datasets/yelp_review_polarity_csv/'
batch_size_train = 32
batch_size_test = 32

try_mkdir(data_dir + 'models')
try_mkdir(data_dir + 'models/logs')
try_mkdir(data_dir + 'models/params')

print('Building data...')
label_ids, raw_train_set = pickle.load(open(data_dir + 'train_set.pk', 'rb'))
_, test_set = pickle.load(open(data_dir + 'test_set.pk', 'rb'))
_, train_set_unused = pickle.load(open(data_dir + 'train_set_unused.pk', 'rb'))

# The effective batch is accumulated over several steps, so the per-step
# batch size shrinks accordingly.
batch_size_train = batch_size_train // gradient_accumulation_steps
train_loader = DataLoader(raw_train_set, sampler=RandomSampler(raw_train_set),
                          batch_size=batch_size_train)
test_loader = DataLoader(test_set, sampler=SequentialSampler(test_set),
                         batch_size=batch_size_test)
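# How the bootstrapping knobs above combine is not shown in this excerpt;
# one plausible reading (a sketch under assumptions, not the repo's actual
# code) grows the usable fraction of train_set_unused geometrically:
def bootstrap_usage(epoch, base_fraction=0.1):
    """Fraction of train_set_unused to mix in at a given epoch (illustrative)."""
    if epoch < bootstrapping_start_epoch:
        return 0.0
    grown = base_fraction * bootstrapping_increase_coef ** (epoch - bootstrapping_start_epoch)
    return min(grown, bootstrapping_max_usage)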
        seq, attn_masks, labels = (seq.to(device), attn_masks.to(device),
                                   labels.to(device))
        # Detach before .numpy() so this also works outside torch.no_grad().
        pred = pd.DataFrame(
            torch.sigmoid(bert(seq, attn_masks)).detach().cpu().numpy(),
            columns=["toxic", "severe_toxic", "obscene", "threat",
                     "insult", "identity_hate"])
        df = pd.concat([df, pred], ignore_index=True)
    # Attach ids and move the 'id' column to the front.
    df['id'] = test_df['id']
    cols = df.columns.to_list()[-1:] + df.columns.to_list()[:-1]
    df = df[cols]
    return df


if __name__ == "__main__":
    import argparse

    from utils import try_mkdir

    parser = argparse.ArgumentParser()
    parser.add_argument('test_file', type=str)
    parser.add_argument('model_file', type=str)
    parser.add_argument('prediction_file', type=str)
    parser.add_argument('--maxlen', default=30, type=int)
    parser.add_argument('--batch_size', default=256, type=int)
    args = parser.parse_args()

    df = predict(args.model_file, args.test_file, args.maxlen, args.batch_size)
    # Make sure the output directory exists before writing.
    try_mkdir('/'.join(args.prediction_file.split('/')[:-1]))
    df.to_csv(args.prediction_file, index=False)
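# Example invocation (the script and file names are illustrative):
#   python predict.py data/test_prepared.csv models/bert.pt \
#       data/predictions/preds.csv --maxlen 64 --batch_size 256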
            loss.backward()
            opt.step()
            if (it + 1) % prints_every == 0:
                print("Iteration {} of epoch {} complete. Loss : {}".format(
                    it + 1, ep + 1, loss.item()))
    return net


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('train_filename', type=str)
    parser.add_argument('model_file', type=str)
    # argparse's type=bool treats every non-empty string (even "False") as
    # True, so parse the flag value explicitly.
    parser.add_argument('--freeze', default=True,
                        type=lambda s: s.lower() in ('true', '1', 'yes'))
    parser.add_argument('--maxlen', default=30, type=int)
    parser.add_argument('--batch_size', default=128, type=int)
    parser.add_argument('--max_epochs', default=1, type=int)
    parser.add_argument('--n_jobs', default=0, type=int)
    parser.add_argument('--prints_every', default=10, type=int)
    args = parser.parse_args()

    net = train_net(args.train_filename, args.freeze, args.maxlen,
                    args.batch_size, args.max_epochs, args.n_jobs,
                    args.prints_every)
    # Make sure the output directory exists before saving the weights.
    try_mkdir('/'.join(args.model_file.split('/')[:-1]))
    torch.save(net.state_dict(), args.model_file)
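# Example invocation (the script name and paths are illustrative):
#   python train.py data/train_prepared.csv models/bert.pt \
#       --freeze false --max_epochs 2 --batch_size 64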
import argparse
import os
import pickle

import pandas as pd
from bpemb import BPEmb
from sklearn.feature_extraction.text import TfidfVectorizer

from featurizers.mnb_featurizer import MNBFeaturizer
from utils import try_mkdir

parser = argparse.ArgumentParser()
parser.add_argument('input_data_path', type=str)
parser.add_argument('output_data_path', type=str)
parser.add_argument('artifacts_path', type=str)
parser.add_argument('--train', help='Trains featurizer', action='store_true')
parser.add_argument('--test', help='Transforms test data', action='store_true')
args = parser.parse_args()

if args.test:
    # Transform test data with the already-fitted per-category featurizers.
    data_df = pd.read_csv(args.input_data_path)
    try_mkdir(args.output_data_path)
    for category in ["toxic", "severe_toxic", "obscene", "threat",
                     "insult", "identity_hate"]:
        featurizer = MNBFeaturizer.load(os.path.join(args.artifacts_path, category))
        transformed = featurizer.transform(data_df['comment_text'])
        with open(os.path.join(args.output_data_path,
                               '{}_features_test.pkl'.format(category)), 'wb') as f:
            pickle.dump(transformed, f)
else:
    # Fit the shared TF-IDF vectorizer over BPE-tokenized comments.
    print('Reading data from {}'.format(args.input_data_path))
    data_df = pd.read_csv(args.input_data_path)
    bpemb_en = BPEmb(lang="en", dim=50, vs=200000)
    tfidf = TfidfVectorizer(tokenizer=bpemb_en.encode)
    tfidf.fit(data_df['comment_text'])
    try_mkdir(args.artifacts_path)
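    # The excerpt stops after fitting TF-IDF; a plausible continuation
    # (MNBFeaturizer's constructor and fit/save signatures are assumptions
    # inferred from the --test branch above) fits and saves one featurizer
    # per category:
    for category in ["toxic", "severe_toxic", "obscene", "threat",
                     "insult", "identity_hate"]:
        featurizer = MNBFeaturizer(tfidf)
        featurizer.fit(data_df['comment_text'], data_df[category])
        featurizer.save(os.path.join(args.artifacts_path, category))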
    Arguments:
        df {pd.DataFrame} -- data to process
        column_name {string} -- DataFrame column to clean

    Returns:
        pd.DataFrame -- processed dataframe
    """
    df[column_name] = df[column_name].fillna('').str.replace('\n', ' ')
    return df


if __name__ == "__main__":
    import os
    import sys

    input_df_paths = sys.argv[1:]
    for input_df_path in input_df_paths:
        # '<name>.<ext>' becomes '<name>_prepared.<ext>'.
        input_file_name, extension = input_df_path.split(os.path.sep)[-1].split('.')
        output_file_name = '.'.join([input_file_name + '_prepared', extension])
        output_path = os.path.join('data', 'prepared')
        try_mkdir(output_path)
        print('processing {}'.format(input_df_path))
        df = pd.read_csv(input_df_path)
        df_clean = clean_df(df, 'comment_text')
        print('writing {} to {}'.format(output_file_name, output_path))
        df_clean.to_csv(os.path.join(output_path, output_file_name), index=False)
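# A quick illustration of clean_df's behavior (values are made up):
#   >>> raw = pd.DataFrame({'comment_text': ['line one\nline two', None]})
#   >>> clean_df(raw, 'comment_text')['comment_text'].tolist()
#   ['line one line two', '']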
import argparse
import os
import pickle
from datetime import datetime

import mlflow
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import make_scorer, roc_auc_score

from utils import try_mkdir

parser = argparse.ArgumentParser()
parser.add_argument('input_data_path', type=str)
parser.add_argument('input_features_path', type=str)
parser.add_argument('artifacts_path', type=str)
args = parser.parse_args()

print('Reading data from {}'.format(args.input_data_path))
data_df = pd.read_csv(args.input_data_path)
try_mkdir(args.artifacts_path)

for category in ["toxic", "severe_toxic", "obscene", "threat",
                 "insult", "identity_hate"]:
    with open(os.path.join(args.input_features_path,
                           '{}_features_train.pkl'.format(category)), 'rb') as f:
        features = pickle.load(f)
    # Cross-validated search over regularization strengths, scored by ROC AUC.
    C_s = [0.01, 0.1, 1, 10, 100]
    model = LogisticRegressionCV(Cs=C_s, cv=5, n_jobs=-1, max_iter=1000,
                                 scoring=make_scorer(roc_auc_score))
    print("Fitting lr for category {}".format(category))
    model.fit(features, data_df[category])
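    # The excerpt ends after fitting; since mlflow is imported above and the
    # prediction script loads '{}_lr.pkl' from artifacts_path, a plausible
    # continuation (the metric name is an assumption) logs the best CV AUC
    # and pickles the model per category:
    best_cv_auc = float(np.max(np.mean(model.scores_[1], axis=0)))
    mlflow.log_metric('{}_cv_auc'.format(category), best_cv_auc)
    with open(os.path.join(args.artifacts_path,
                           '{}_lr.pkl'.format(category)), 'wb') as f:
        pickle.dump(model, f)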