def split_data_txt(
        training_data_output: OutputDirectory(type='AnyDirectory'),
        validation_data_output: OutputDirectory(type='AnyDirectory'),
        test_data_output: OutputDirectory(type='AnyDirectory'),
        input_dir: InputDirectory(type='AnyDirectory') = None,
        training_data_ratio=0.7,
        validation_data_ratio=0.1,
        random_split=False,
        seed=0):
    """Split a line-per-sample text dataset into train/dev/test files.

    Sizes are ``n * training_data_ratio`` and ``n * validation_data_ratio``;
    the remainder becomes the test set. Counts are printed and logged as run
    metrics, and each split is written to its own output directory as
    train.txt / dev.txt / test.txt.

    NOTE(review): ``input_dir`` is declared as a directory port but is opened
    directly as a file here — confirm the bound path is actually a text file
    (a later variant of this module joins 'data.txt' instead).
    """
    print('============================================')
    print(
        f"value of input_dir:'{input_dir}', type of input_dir:'{type(input_dir)}'"
    )
    with open(input_dir, 'r', encoding='utf-8') as f:
        data = f.readlines()
    # Seed is fixed to 0 unless random_split is requested, so the shuffle —
    # and therefore the split — is reproducible by default.
    random.seed(seed if random_split else 0)
    # list shuffle
    random.shuffle(data)
    n = len(data)
    # for logging
    run = Run.get_context()
    training_data_num = int(n * training_data_ratio)
    dev_data_num = int(n * validation_data_ratio)
    train = data[:training_data_num]
    dev = data[training_data_num:training_data_num + dev_data_num]
    test = data[training_data_num + dev_data_num:]
    print('num of total data:', len(data))
    print('num of training data:', len(train))
    print('num of validation data:', len(dev))
    print('num of test_data:', len(test))
    # for metrics
    run.log(name='num of total data', value=len(data))
    run.log(name='num of training data', value=len(train))
    run.log(name='num of validation data', value=len(dev))
    run.log(name='num of test_data', value=len(test))
    # Write each split into its own output directory, creating it if needed.
    os.makedirs(training_data_output, exist_ok=True)
    path = os.path.join(training_data_output, "train.txt")
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(train)
    print(path)
    print(os.listdir(training_data_output))
    os.makedirs(validation_data_output, exist_ok=True)
    path = os.path.join(validation_data_output, "dev.txt")
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(dev)
    print(path)
    print(os.listdir(validation_data_output))
    os.makedirs(test_data_output, exist_ok=True)
    path = os.path.join(test_data_output, "test.txt")
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(test)
    print(path)
    print(os.listdir(test_data_output))
    print('============================================')
def split_data_txt(
        training_data_output: OutputDirectory(),
        validation_data_output: OutputDirectory(),
        test_data_output: OutputDirectory(),
        input_dir: InputDirectory() = None,
        training_data_ratio=0.7,
        validation_data_ratio=0.1,
        random_split=False,
        seed=0):
    """Split <input_dir>/data.txt into train/dev/test splits.

    Each output directory receives its share of the lines as 'data.txt' plus
    copies of the shared 'label.txt' and 'word_to_index.json', so downstream
    train/eval modules are self-contained. Counts are printed and logged as
    run metrics.
    """
    print('============================================')
    print(
        f"value of input_dir:'{input_dir}', type of input_dir:'{type(input_dir)}'"
    )
    path_input_data = os.path.join(input_dir, 'data.txt')
    with open(path_input_data, 'r', encoding='utf-8') as f:
        data = f.readlines()
    # Fixed seed 0 unless a random split is requested → reproducible split.
    random.seed(seed if random_split else 0)
    random.shuffle(data)
    n = len(data)
    # for metrics
    run = Run.get_context()
    training_data_num = int(n * training_data_ratio)
    dev_data_num = int(n * validation_data_ratio)
    train = data[:training_data_num]
    dev = data[training_data_num:training_data_num + dev_data_num]
    test = data[training_data_num + dev_data_num:]
    print('num of total data:', len(data))
    print('num of training data:', len(train))
    print('num of validation data:', len(dev))
    print('num of test_data:', len(test))
    # for metrics
    run.log(name='num of total data', value=len(data))
    run.log(name='num of training data', value=len(train))
    run.log(name='num of validation data', value=len(dev))
    run.log(name='num of test_data', value=len(test))
    # Replicate the shared vocabulary and label files into every split.
    # NOTE(review): assumes the output directories already exist (created by
    # the runtime for OutputDirectory ports) — confirm.
    path_label = os.path.join(input_dir, 'label.txt')
    path_word_to_index = os.path.join(input_dir, 'word_to_index.json')
    shutil.copy(src=path_label, dst=training_data_output)
    shutil.copy(src=path_word_to_index, dst=training_data_output)
    path = os.path.join(training_data_output, "data.txt")
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(train)
    shutil.copy(src=path_label, dst=validation_data_output)
    shutil.copy(src=path_word_to_index, dst=validation_data_output)
    path = os.path.join(validation_data_output, "data.txt")
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(dev)
    shutil.copy(src=path_label, dst=test_data_output)
    shutil.copy(src=path_word_to_index, dst=test_data_output)
    path = os.path.join(test_data_output, "data.txt")
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(test)
    print('============================================')
def gdal_sample(
        # Module interface: two output directories and two input directories.
        output_dir1: OutputDirectory(),
        output_dir2: OutputDirectory(),
        input_dir1: InputDirectory(),
        input_dir2: InputDirectory()):
    """Sample module: echo the resolved input paths and preview the first
    rows of the data frame stored under input_dir1."""
    print('I am in module definition')
    for label, directory in (('input_dir1', input_dir1),
                             ('input_dir2', input_dir2)):
        print(f'{label}: {Path(directory).resolve()}')
    # Custom logic: load the tabular payload and show a small preview.
    loaded = load_data_frame_from_directory(input_dir1)
    frame = loaded.data
    print(frame.head(10))
def compare_two_models(
        the_better_model: OutputDirectory(),
        first_trained_model: InputDirectory(type='AnyDirectory') = None,
        first_trained_result: InputDirectory(type='AnyDirectory') = None,
        second_trained_model: InputDirectory(type='AnyDirectory') = None,
        second_trained_result: InputDirectory(type='AnyDirectory') = None,
):
    """Copy the BestModel whose result.json reports the higher 'acc' into
    the output directory.

    Ties go to the first model. The winner ('first'/'second') is logged as
    the run metric 'which one'.
    """
    print('=====================================================')
    print(f'input_dir: {Path(first_trained_model).resolve()}')
    print(f'input_dir: {Path(first_trained_result).resolve()}')
    print(f'input_dir: {Path(second_trained_model).resolve()}')
    print(f'input_dir: {Path(second_trained_result).resolve()}')
    # for logging
    run = Run.get_context()
    # Fix: close the result files deterministically — the original passed
    # bare open() handles into json.load and leaked them.
    path = os.path.join(first_trained_result, 'result.json')
    with open(path, 'r') as f:
        result_first = json.load(f)['acc']
    path = os.path.join(second_trained_result, 'result.json')
    with open(path, 'r') as f:
        result_second = json.load(f)['acc']
    dst = os.path.join(the_better_model, 'BestModel')
    if result_first >= result_second:
        print('choose the first model')
        run.log(name='which one', value='first')
        src = os.path.join(first_trained_model, 'BestModel')
    else:
        print('choose the second model')
        run.log(name='which one', value='second')
        src = os.path.join(second_trained_model, 'BestModel')
    shutil.copy(src=src, dst=dst)
    print('=====================================================')
def fasttext_evaluation(
        model_testing_result: OutputDirectory(type='AnyDirectory'),
        trained_model_dir: InputDirectory(type='AnyDirectory') = None,
        test_data_dir: InputDirectory(type='AnyDirectory') = None,
        char2index_dir: InputDirectory(type='AnyDirectory') = None):
    """Evaluate a trained FastText model on <test_data_dir>/test.txt and
    write {"acc": ...} to <model_testing_result>/result.json."""
    print('=====================================================')
    print(f'trained_model_dir: {Path(trained_model_dir).resolve()}')
    print(f'test_data_dir: {Path(test_data_dir).resolve()}')
    print(f'char2index_dir: {Path(char2index_dir).resolve()}')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # NOTE(review): sequence length 38 must match the training-side setting —
    # confirm when changing.
    max_len_ = 38
    path = os.path.join(test_data_dir, 'test.txt')
    test_samples = load_dataset(file_path=path,
                                max_len=max_len_,
                                char2index_dir=char2index_dir)
    test_iter = DataIter(test_samples)
    path = os.path.join(trained_model_dir, 'BestModel')
    model = torch.load(f=path)
    path = os.path.join(model_testing_result, 'result.json')
    acc_ = test(model, test_iter, device)
    # Fix: write through a context manager so the handle is closed — the
    # original passed a bare open() into json.dump and leaked it.
    with open(path, 'w') as f:
        json.dump({"acc": acc_}, f)
    print('\n============================================')
def fasttext_evaluation(
        model_testing_result: OutputDirectory(),
        trained_model_dir: InputDirectory() = None,
        test_data_dir: InputDirectory() = None):
    """Evaluate a trained FastText model on <test_data_dir>/data.txt and
    write {"acc": ...} to <model_testing_result>/result.json.

    The vocabulary (word_to_index.json), label map (label.txt) and dataset
    hyper-parameters (shared_params.json persisted at training time) are
    reloaded so tokenization matches the training setup exactly.
    """
    print('=====================================================')
    print(f'trained_model_dir: {Path(trained_model_dir).resolve()}')
    print(f'test_data_dir: {Path(test_data_dir).resolve()}')
    path_word_to_index = os.path.join(test_data_dir, 'word_to_index.json')
    word_to_index = get_vocab(path_word_to_index)
    path_label = os.path.join(test_data_dir, 'label.txt')
    map_id_label, map_label_id = get_id_label(path_label)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)
    # max_len / ngram_size must match training; they were saved there.
    path = os.path.join(trained_model_dir, 'shared_params.json')
    with open(path, 'r', encoding='utf-8') as f:
        shared_params = json.load(f)
    path = os.path.join(test_data_dir, 'data.txt')
    test_samples = load_dataset(file_path=path,
                                max_len=shared_params['max_len'],
                                ngram_size=shared_params['ngram_size'],
                                word_to_index=word_to_index,
                                map_label_id=map_label_id)
    test_iter = DataIter(samples=test_samples, shuffle=False, device=device)
    path = os.path.join(trained_model_dir, 'BestModel')
    # map_location lets a GPU-trained checkpoint load on a CPU-only node.
    model = torch.load(f=path, map_location=device)
    path = os.path.join(model_testing_result, 'result.json')
    acc_ = test(model, test_iter)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump({"acc": acc_}, f)
    print('\n============================================')
def fasttext_score(
        scored_data_output_dir: OutputDirectory(),
        fasttext_model_dir: InputDirectory() = '.'
):
    """Build a parallel-run scorer around a trained FastText model.

    Loads the vocabulary, label map, shared hyper-parameters and BestModel
    checkpoint once, then returns a ``run(files)`` callable that classifies
    the given text files, appends a (Filename, Class) parquet file to the
    output directory, and returns the predicted classes.
    """
    print('=====================================================')
    print(f'fasttext_model: {Path(fasttext_model_dir).resolve()}')
    print(f'scored_data_output_dir: {scored_data_output_dir}')
    path_word_to_index = os.path.join(fasttext_model_dir, 'word_to_index.json')
    word_to_index = get_vocab(path_word_to_index)
    path_label = os.path.join(fasttext_model_dir, 'label.txt')
    map_id_label, map_label_id = get_id_label(path_label)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)
    # max_len / ngram_size were persisted at training time.
    path = os.path.join(fasttext_model_dir, 'shared_params.json')
    with open(path, 'r', encoding='utf-8') as f:
        shared_params = json.load(f)
    path = os.path.join(fasttext_model_dir, 'BestModel')
    model = torch.load(f=path, map_location=device)

    def run(files):
        # Invoked once per mini-batch by the parallel-run framework.
        if len(files) == 0:
            return []
        with torch.no_grad():
            test_samples = load_dataset(file_path=files,
                                        max_len=shared_params['max_len'],
                                        ngram_size=shared_params['ngram_size'],
                                        word_to_index=word_to_index,
                                        map_label_id=map_label_id)
            test_iter = DataIter(samples=test_samples,
                                 batch_size=1,
                                 shuffle=False,
                                 device=device)
            results = predict_parallel(model, test_iter, map_id_label)
            dict_ = {'Filename': files, 'Class': results}
            df = pd.DataFrame(data=dict_)
            # Unique name per batch so concurrent workers never collide.
            output_file = os.path.join(scored_data_output_dir,
                                       f"{uuid4().hex}.parquet")
            df.to_parquet(output_file, index=False)
            return results

    return run
def fasttext_score_parallel(
        scored_dataset: OutputDirectory(type='AnyDirectory'),
        fasttext_model: InputDirectory(type='AnyDirectory') = '.',
        char2index_dir: InputDirectory(type='AnyDirectory') = None
):
    """Build a parallel-run scorer: load the BestModel checkpoint once and
    return a ``run(files)`` callable that classifies text files and stores
    each batch's (Filename, Class) table as a parquet file."""
    print('=====================================================')
    print(f'fasttext_model: {Path(fasttext_model).resolve()}')
    print(f'char2index_dir: {Path(char2index_dir).resolve()}')
    print(f'scored_dataset: {scored_dataset}')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # NOTE(review): must match the max_len used at training time — confirm.
    max_len_ = 38
    path = os.path.join(fasttext_model, 'BestModel')
    model = torch.load(f=path)

    def run(files):
        # Invoked once per mini-batch by the parallel-run framework.
        if len(files) == 0:
            return []
        print(f"Ready to process {len(files)} texts.")
        print('\n'.join(files))
        with torch.no_grad():
            test_samples = load_dataset_parallel(files=files,
                                                 max_len=max_len_,
                                                 char2index_dir=char2index_dir)
            test_iter = DataIter_Parallel(test_samples, shuffle=False)
            results = predict_parallel(model, test_iter, device)
            dict_ = {'Filename': files, 'Class': results}
            df = pd.DataFrame(data=dict_)
            print("Result:")
            print(df)
            # Unique file name per batch avoids collisions across workers.
            output_file = os.path.join(scored_dataset, f"{uuid4().hex}.parquet")
            df.to_parquet(output_file, index=False)
            return results

    return run
def basic_module(
        output_dir: OutputDirectory(),
        input_dir: InputDirectory() = '.',
        str_param='some_string',
):
    """Echo the inputs, then persist ``str_param`` to <output_dir>/output.txt."""
    print(f'input_dir: {Path(input_dir).resolve()}')
    print(f'str_param: {str_param}')
    destination = Path(output_dir) / "output.txt"
    with open(destination, 'w') as fout:
        fout.write(str_param)
def parallel_score_images(
        scored_dataset: OutputDirectory(),
        trained_model: InputDirectory() = None,
):
    """Build a parallel-run scorer for MNIST images.

    Loads the trained torch model once, then returns a ``run(files)``
    callable that classifies each image, writes the batch's results to a
    parquet file under ``scored_dataset``, and returns them as a list of
    per-file dicts (Filename, Class, and per-class probabilities).
    """
    # Use the path of a prepared model if trained_model is None
    if trained_model is None:
        trained_model = str(
            Path(__file__).parent /
            'tests/parallel_score_images/inputs/trained_model/')
    print("Scored dataset:", scored_dataset)
    print("Trained model:", trained_model)
    map_location = 'cpu' if not torch.cuda.is_available() else None
    model = torch.load(os.path.join(trained_model, 'model.pt'),
                       map_location=map_location)
    os.makedirs(scored_dataset, exist_ok=True)
    print("Model is loaded:", model)

    def run(files):
        if len(files) == 0:
            return []
        results = []
        nthreads = min(2 * cpu_count(), len(files))
        print(f"Ready to process {len(files)} images.")
        print('\n'.join(files))
        with ThreadPool(nthreads) as pool:
            imgs = pool.map(Image.open, files)
        for f, img in zip(files, imgs):
            # Fix: use the image already opened by the thread pool — the
            # original called Image.open(f) again here, discarding the
            # pooled work and re-reading every file serially.
            tensor = transform(img).unsqueeze(0)
            if torch.cuda.is_available():
                tensor = tensor.cuda()
            with torch.no_grad():
                output = model(tensor)
            softmax = nn.Softmax(dim=1)
            pred_probs = softmax(output).cpu().numpy()[0]
            index = torch.argmax(output, 1)[0].cpu().item()
            result = {
                'Filename': Path(f).name,
                'Class': MNIST.classes[index]
            }
            for c, prob in zip(MNIST.classes, pred_probs):
                result[f"Prob of {c}"] = prob
            results.append(result)
        columns = sorted(list(results[0].keys()))
        df = pd.DataFrame(results, columns=columns)
        print("Result:")
        print(df)
        # Unique name per batch so concurrent workers never collide.
        output_file = os.path.join(scored_dataset, f"{uuid4().hex}.parquet")
        df.to_parquet(output_file, index=False)
        return results

    return run
def slice_video(
        input_video: InputDirectory(
            description="input directory of video file") = './data/input/video',
        output_audio: OutputDirectory(
            description="output directory of audio from video"
        ) = '/data/output/video',
        output_images: OutputDirectory(
            description="output directory of images slice from video"
        ) = '/data/output/images',
):
    """Slice a video with ffmpeg: extract the audio track to
    <output_audio>/video.aac and dump per-frame JPEGs to
    <output_images>/%05d_video.jpg."""
    # Fix: pass argv lists with shell=False so paths containing spaces or
    # shell metacharacters cannot break (or inject into) the command line.
    subprocess.run(
        ["ffmpeg", "-i", str(input_video),
         "{}/video.aac".format(output_audio)],
        check=True)
    subprocess.run(
        ["ffmpeg", "-i", str(input_video),
         "{}/%05d_video.jpg".format(output_images), "-hide_banner"],
        check=True)
def train(model,
          trained_model_dir: OutputDirectory(type='AnyDirectory'),
          train_iter,
          dev_iter=None,
          epochs=20,
          learning_rate=0.0001,
          stop_patience=3,
          device=None):
    """Train ``model`` with Adam + cross-entropy, checkpointing the best
    validation loss.

    After the last batch of each epoch (when ``dev_iter`` is given) the
    model is evaluated; if validation loss improved, the whole model object
    is saved to <trained_model_dir>/BestModel, otherwise training stops
    early once ``stop_patience`` epochs pass without improvement.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    model.to(device)
    loss = torch.nn.CrossEntropyLoss()
    # (best validation loss, epoch it was achieved); None until first eval.
    min_loss_epoch = (None, None)
    stop_flag = False
    model_name = model._get_name()
    # for metrics
    run = Run.get_context()
    for epoch in range(epochs):
        loss_value_list = []
        total_iter = len(train_iter)
        for i, (btach_x, btach_y) in enumerate(train_iter):
            outputs = model(btach_x)
            optimizer.zero_grad()
            loss_value = loss(outputs, btach_y)
            loss_value.backward()
            optimizer.step()
            loss_value_list.append(loss_value.cpu().data.numpy())
            # for metrics: running mean of this epoch's batch losses
            run.log(name='CrossEntropyLoss', value=np.mean(loss_value_list))
            if i % 50 == 0:
                str_ = f"{model_name} epoch:{epoch + 1}/{epochs} step:{i + 1}/{total_iter} mean_loss:{np.mean(loss_value_list): .4f}"
                print(str_)
            if (i + 1) == total_iter and dev_iter is not None:
                # End of epoch: validate and checkpoint/early-stop.
                loss_, acc_, prec_, recall_, f1_ = evaluation(model, dev_iter)
                str_ = f" validation loss:{loss_:.4f} acc:{acc_:.4f}"
                print(str_)
                # evaluation() may switch to eval mode; restore training mode.
                model.train()
                if (min_loss_epoch[0] is None) or (min_loss_epoch[0] > loss_):
                    min_loss_epoch = (loss_, epoch)
                    os.makedirs(trained_model_dir, exist_ok=True)
                    path = os.path.join(trained_model_dir, "BestModel")
                    torch.save(obj=model, f=path)
                else:
                    if (epoch - min_loss_epoch[1]) >= stop_patience:
                        stop_flag = True
                        break
        if stop_flag is True:
            break
def sample_module(
        # The input/output port are defined using the following 4 annotations.
        # Note that you need to register data type using
        # DataType.create_data_type(ws, 'MyDirectory', description=description, is_directory=True)
        # DataType.create_data_type(ws, 'MyFile', description=description, is_directory=False)
        # See https://docs.microsoft.com/en-us/python/api/azureml-pipeline-core/azureml.pipeline.core.graph.datatype?view=azure-ml-py#create-data-type-workspace--name--description--is-directory--parent-datatypes-none-
        output_dir: OutputDirectory(type='MyDirectory'),
        output_file: OutputFile(type='MyFile'),
        input_dir: InputDirectory(type='MyDirectory') = None,
        input_file: InputFile(type='MyFile') = None,
        # The parameter with default values will be considered as annotated with such type,
        # Now we support the following 5 types: str, int, float, bool, enum
        str_param='abc',
        int_param=1,
        float_param=0.1,
        bool_param=False,
        enum_param=MyEnum.Enum0,
        # If the default value is None without annotation, it will be treated as str.
        none_param=None,
):
    """A sample module use different parameter types and customized input/output ports."""
    print(f"Arg 'input_dir' = '{input_dir}', type='{type(input_dir)}'")
    if input_dir:
        print(f"Contents of input directory:")
        print('\n'.join(f.name for f in Path(input_dir).iterdir()))
    print(f"Arg 'input_file' = {input_file}, type='{type(input_file)}'")
    print(f"Arg 'output_dir' = {output_dir}, type='{type(output_dir)}'")
    print(f"Arg 'output_file' = {output_file}, type='{type(output_file)}'")
    print(f"Arg 'str_param' = {str_param}, type='{type(str_param)}'")
    print(f"Arg 'int_param' = {int_param}, type='{type(int_param)}'")
    print(f"Arg 'float_param' = {float_param}, type='{type(float_param)}'")
    print(f"Arg 'bool_param' = {bool_param}, type='{type(bool_param)}'")
    print(f"Arg 'enum_param' = {enum_param}, type='{type(enum_param)}'")
    print(f"Arg 'none_param' = {none_param}, type='{type(none_param)}'")
    # Output payload defaults to str_param; overridden by input_file content.
    data = str_param
    if input_file:
        with open(input_file, 'r') as fin:
            data = fin.read()
        print("Content of input file:", data)
    if input_dir:
        # NOTE(review): shutil.copytree requires output_dir to not already
        # exist (pre-3.8 semantics) — confirm the runtime hands in a fresh path.
        shutil.copytree(input_dir, output_dir)
    else:
        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, "test.txt"), 'w') as fout:
            fout.write(data)
    with open(output_file, 'w') as fout:
        fout.write(data)
def fasttext_train(
        trained_model_dir: OutputDirectory(type='AnyDirectory'),
        training_data_dir: InputDirectory(type='AnyDirectory') = None,
        validation_data_dir: InputDirectory(type='AnyDirectory') = None,
        char2index_dir: InputDirectory(type='AnyDirectory') = None,
        epochs=2,
        batch_size=32,
        learning_rate=0.0005,
        embedding_dim=128):
    """Train a FastText classifier on train.txt, validating on dev.txt, and
    save the best checkpoint into ``trained_model_dir`` (done by train())."""
    print('============================================')
    print('training_data_dir:', training_data_dir)
    print('validation_data_dir:', validation_data_dir)
    c2i = get_vocab(char2index_dir)
    class_ = get_classs()
    # NOTE(review): fixed sequence length 38 must match the evaluation and
    # scoring modules — confirm before changing.
    max_len_ = 38
    n_class_ = len(class_)
    vocab_size_ = len(c2i)
    stop_patience = 5
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    path = os.path.join(training_data_dir, 'train.txt')
    train_samples = load_dataset(file_path=path,
                                 max_len=max_len_,
                                 char2index_dir=char2index_dir)
    path = os.path.join(validation_data_dir, 'dev.txt')
    dev_samples = load_dataset(file_path=path,
                               max_len=max_len_,
                               char2index_dir=char2index_dir)
    train_iter = DataIter(train_samples, batch_size)
    dev_iter = DataIter(dev_samples, batch_size)
    model = FastText(vocab_size=vocab_size_,
                     n_class=n_class_,
                     embed_dim=embedding_dim)
    start = time.time()
    train(model,
          trained_model_dir,
          train_iter,
          dev_iter=dev_iter,
          epochs=epochs,
          learning_rate=learning_rate,
          stop_patience=stop_patience,
          device=device)
    end = time.time()
    print('\nspent time: %.2f sec' % (end - start))
    print('============================================')
def add(left: InputDirectory(), right: InputDirectory(),
        output: OutputDirectory()):
    """Read one float from each input's 'data' file, log the operands and
    their sum as run metrics, and write the sum to <output>/data."""

    def read_operand(directory):
        # Each input directory holds a single file named 'data'.
        return float((Path(directory).resolve() / 'data').read_text().strip())

    left_value = read_operand(left)
    right_value = read_operand(right)
    print('left = ', left_value)
    print('right = ', right_value)
    total = left_value + right_value
    run = Run.get_context()
    run.log('result', total)
    run.log('left', left_value)
    run.log('right', right_value)
    run.flush()
    out_dir = Path(output)
    out_dir.absolute().mkdir(parents=True, exist_ok=True)
    (out_dir.resolve() / 'data').write_text(str(total))
def tokenizer(
        input_file_path: InputDirectory(description="Input text file path"),
        output_dir_path: OutputDirectory(description="Output file directory path"),
        output_to_file: IntParameter(
            description=
            "whether to interpret output_dir_path as file to write to, or folder containing file to write to"
        ) = 0,
        input_is_tsv: IntParameter(
            description="bool determining whether to use tsv related options") = 0,
        delimiter: StringParameter(
            description="optional, delimiter to use if parsing a tsv type file"
        ) = None,
        ignore_cols: IntParameter(
            description="indices of columns to ignore if parsing a tsv") = None,
        mode: EnumParameter(
            enum=EnumMode,
            description="Tokenizer to use [train, inference, spacy]") = EnumMode.
        train,
        type: EnumParameter(
            enum=EnumType,
            description="Whether to use word tokenizer or sentence tokenizer"
        ) = EnumType.word,
):
    """Forward the module parameters to tokenizer.py as command-line flags
    and execute it in-process via runpy.

    Optional flags (--delimiter / --ignore_cols) are appended only when the
    corresponding parameter is set.

    NOTE(review): parameter ``type`` shadows the builtin; kept because the
    name is part of the module's public interface.
    """
    sys.argv = [
        'tokenizer.py',
        '-i', str(input_file_path),
        '-o', str(output_dir_path),
        '--output_to_file', str(output_to_file),
        '--input_is_tsv', str(input_is_tsv),
        '-m', mode.value,
        '-t', type.value,
    ]
    if delimiter is not None:
        sys.argv += ['--delimiter', str(delimiter)]
    if ignore_cols is not None:
        sys.argv += ['--ignore_cols', str(ignore_cols)]
    print(' '.join(sys.argv))
    runpy.run_path('tokenizer.py', run_name='__main__')
def prepare_data(
        output_data: OutputDirectory(),
        input_data: InputDirectory() = None,
        str_param: str = None,
        int_param: int = 0,
        enum_param: EnumEnumParam = None,
):
    """Invoke the prepare_data.py script in-process, forwarding the module
    parameters as command-line flags; optional ones are passed only when set."""
    argv = [
        'prepare_data.py',
        '--input_data', str(input_data),
        '--output_data', str(output_data),
        '--int_param', str(int_param),
    ]
    if str_param is not None:
        argv.extend(['--str_param', str(str_param)])
    if enum_param is not None:
        argv.extend(['--enum_param', enum_param.value])
    sys.argv = argv
    print(' '.join(sys.argv))
    runpy.run_path('prepare_data.py', run_name='__main__')
def copy_files(
        output_dir: OutputDirectory(),
        input_dir: InputDirectory() = '.',
        str_param='some_string',
):
    """Validate that input_dir is a non-empty directory, then write
    str_param to <output_dir>/output.txt."""
    source = Path(input_dir)
    print(f'input_dir: {source.resolve()}')
    print(f'str_param: {str_param}')
    entries = [str(item) for item in source.iterdir()] if source.is_dir() else []
    if not entries:
        raise ValueError(f'input_dir should be an directory with files')
    target = Path(output_dir) / "output.txt"
    with open(target, 'w') as fout:
        fout.write(str_param)
def style_transform_parallel(
        model_dir: InputDirectory(
            description="saved torch model to be used for stylizing the image."),
        output_path: OutputDirectory(
            description="directory holding the output images"),
        style: StringParameter(description="style name") = None,
):
    """Build a parallel-run style-transfer scorer.

    Loads <model_dir>/<style>.pth into a TransformerNet once, then returns a
    ``run(mini_batch)`` callable that stylizes each input image and saves it
    under ``output_path`` with the original file name.
    """
    print(f'output path: {output_path}')
    print(f'Cuda available? {torch.cuda.is_available()}')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with torch.no_grad():
        style_model = TransformerNet()
        state_dict = torch.load(os.path.join(model_dir, style + ".pth"))
        # remove saved deprecated running_* keys in InstanceNorm from the checkpoint
        for k in list(state_dict.keys()):
            if re.search(r'in\d+\.running_(mean|var)$', k):
                del state_dict[k]
        style_model.load_state_dict(state_dict)
        style_model.to(device)
        print(f'Model loaded successfully. Path: {model_dir}')

    def run(mini_batch):
        # Invoked once per mini-batch by the parallel-run framework;
        # returns the list of written output paths.
        result = []
        for image_file_path in mini_batch:
            img = load_image(image_file_path)
            print(f'load image from: {image_file_path}')
            with torch.no_grad():
                # Scale to 0-255: TransformerNet expects unnormalized pixels.
                content_transform = transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Lambda(lambda x: x.mul(255))
                ])
                content_image = content_transform(img)
                content_image = content_image.unsqueeze(0).to(device)
                output = style_model(content_image).cpu()
            output_file_path = os.path.join(
                output_path, os.path.basename(image_file_path))
            save_image(output_file_path, output[0])
            result.append(output_file_path)
            print(f'transferred image saved in: {output_file_path}')
        return result

    return run
def mpi_module(
        output_dir: OutputDirectory(),
        input_dir: InputDirectory() = '.',
        param0: str = 'abc',
        param1: int = 10,
):
    """MPI demo module: every rank reports itself; rank 0 alone writes
    param0 and param1 into <output_dir>/output.txt."""
    from mpi4py import MPI
    # Dump the current locals (arguments plus the MPI import) before any
    # further names are bound — the printed set is position-sensitive.
    for name, value in locals().items():
        print(f"{name}: {value}")
    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    rank = comm.Get_rank()
    print(f"This is an MPI module, I'm rank {rank}/{size}.")
    if rank != 0:
        print("I don't return data.")
        return
    print("I will write data.")
    with open(Path(output_dir) / "output.txt", 'w') as fout:
        fout.write(param0)
        fout.write(str(param1))
def stitch_video(
        input_images: InputDirectory(description="input directory of images"),
        input_audio: InputDirectory(description="input directory of audio"),
        output_video: OutputDirectory(
            description="output directory of stitched video file")):
    """Stitch numbered JPEG frames back into an H.264 video and mux in the
    audio track previously extracted by slice_video.

    Writes video_without_audio.mp4 and then video_with_audio.mp4 into
    ``output_video``.
    """
    # Fix: the original referenced an undefined ``args`` namespace
    # (args.images_dir / args.output_dir / args.input_audio), which raises
    # NameError at runtime; use the function parameters instead.
    subprocess.run(
        "ffmpeg -framerate 30 -i {}/%05d_video.jpg -c:v libx264 -profile:v high -crf 20 -pix_fmt yuv420p "
        "-y {}/video_without_audio.mp4".format(input_images, output_video),
        shell=True,
        check=True)
    subprocess.run(
        "ffmpeg -i {}/video_without_audio.mp4 -i {}/video.aac -map 0:0 -map 1:0 -vcodec "
        "copy -acodec copy -y {}/video_with_audio.mp4".format(
            output_video, input_audio, output_video),
        shell=True,
        check=True)
def compare_two_models(the_better_model: OutputDirectory(),
                       first_trained_model: InputDirectory() = None,
                       first_trained_result: InputDirectory() = None,
                       second_trained_model: InputDirectory() = None,
                       second_trained_result: InputDirectory() = None):
    """Copy the BestModel with the higher 'acc' (ties favour the first) into
    ``the_better_model``, together with the vocabulary/label/shared-params
    files needed by downstream scoring.

    The winner ('first'/'second') is logged as run metric 'which one'.

    NOTE(review): the auxiliary files are always copied from
    first_trained_model — this assumes both models were trained from the
    same data split; confirm.
    """
    print('=====================================================')
    print(f'input_dir: {Path(first_trained_model).resolve()}')
    print(f'input_dir: {Path(first_trained_result).resolve()}')
    print(f'input_dir: {Path(second_trained_model).resolve()}')
    print(f'input_dir: {Path(second_trained_result).resolve()}')
    # for metrics
    run = Run.get_context()
    path = os.path.join(first_trained_result, 'result.json')
    with open(path, 'r', encoding='utf-8') as f:
        result_first = json.load(f)['acc']
    path = os.path.join(second_trained_result, 'result.json')
    with open(path, 'r', encoding='utf-8') as f:
        # NOTE(review): misnamed — this holds the SECOND model's accuracy.
        second_first = json.load(f)['acc']
    dst = the_better_model
    if result_first >= second_first:
        print('choose the first model')
        run.log(name='which one', value='first')
        src = os.path.join(first_trained_model, 'BestModel')
        shutil.copy(src=src, dst=dst)
    else:
        print('choose the second model')
        run.log(name='which one', value='second')
        src = os.path.join(second_trained_model, 'BestModel')
        shutil.copy(src=src, dst=dst)
    # Ship the tokenizer artifacts alongside the winning checkpoint.
    path_word_to_index = os.path.join(first_trained_model, 'word_to_index.json')
    path_label = os.path.join(first_trained_model, 'label.txt')
    path_shared_params = os.path.join(first_trained_model, 'shared_params.json')
    shutil.copy(src=path_word_to_index, dst=dst)
    shutil.copy(src=path_label, dst=dst)
    shutil.copy(src=path_shared_params, dst=dst)
    print('=====================================================')
def merge(
        cleaned_yellow_data: InputDirectory(
            description=
            "cleaned yellow data, needs to be read as pandas dataframe"),
        cleaned_green_data: InputDirectory(
            description=
            "cleaned green data, needs to be read as pandas dataframe"),
        merged_output: OutputDirectory(description="output data after merge"),
):
    """Concatenate the cleaned green and yellow taxi CSVs (green rows first)
    and write the result to <merged_output>/merged.csv."""
    green_df = pd.read_csv(cleaned_green_data)
    yellow_df = pd.read_csv(cleaned_yellow_data)
    print("Argument (output merge taxi data path): %s" % merged_output)
    # Fix: DataFrame.append was deprecated and removed in pandas 2.0;
    # pd.concat produces the same row-stacked frame.
    merge_df = pd.concat([green_df, yellow_df], ignore_index=True)
    merge_df.reset_index(inplace=True, drop=True)
    if merged_output is not None:
        os.makedirs(merged_output, exist_ok=True)
        print("merge output folder %s created" % merged_output)
        path = os.path.join(merged_output, "merged.csv")
        # to_csv returns None when a path is given; no point binding it.
        merge_df.to_csv(path)
def fasttext_train(
        trained_model_dir: OutputDirectory(type='ModelDirectory'),
        training_data_dir: InputDirectory() = None,
        validation_data_dir: InputDirectory() = None,
        epochs=1,
        batch_size=64,
        max_len=32,
        embed_dim=300,
        hidden_size=256,
        ngram_size=200000,
        dropout=0.5,
        learning_rate=0.001):
    """Train a FastText classifier on <training_data_dir>/data.txt.

    Reads the vocabulary and label map shipped with the training split,
    copies them (plus the dataset hyper-parameters) into the output
    directory so evaluation/scoring can reproduce tokenization, then runs
    train(), which checkpoints the best model into ``trained_model_dir``.
    """
    print('============================================')
    print('training_data_dir:', training_data_dir)
    print('validation_data_dir:', validation_data_dir)
    path_word_to_index = os.path.join(training_data_dir, 'word_to_index.json')
    word_to_index = get_vocab(path_word_to_index)
    path_label = os.path.join(training_data_dir, 'label.txt')
    map_id_label, map_label_id = get_id_label(path_label)
    class_num = len(map_id_label)
    vocab_size = len(word_to_index)
    stop_patience = 5
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)
    # load training dataset
    path = os.path.join(training_data_dir, 'data.txt')
    train_samples = load_dataset(file_path=path,
                                 word_to_index=word_to_index,
                                 map_label_id=map_label_id,
                                 max_len=max_len,
                                 ngram_size=ngram_size)
    train_iter = DataIter(samples=train_samples,
                          batch_size=batch_size,
                          shuffle=True,
                          device=device)
    # load validation dataset
    path = os.path.join(validation_data_dir, 'data.txt')
    dev_samples = load_dataset(file_path=path,
                               word_to_index=word_to_index,
                               map_label_id=map_label_id,
                               max_len=max_len,
                               ngram_size=ngram_size)
    dev_iter = DataIter(samples=dev_samples,
                        batch_size=batch_size,
                        shuffle=True,
                        device=device)
    model = FastText(vocab_size=vocab_size,
                     class_num=class_num,
                     dropout=dropout,
                     embed_dim=embed_dim,
                     hidden_size=hidden_size,
                     ngram_size=ngram_size)
    # watch parameters
    print(model.parameters)
    # copy word_to_index.json and label.txt for later scoring.
    # NOTE(review): assumes trained_model_dir already exists at this point
    # (created by the runtime for OutputDirectory ports) — confirm.
    shutil.copy(src=path_word_to_index, dst=trained_model_dir)
    shutil.copy(src=path_label, dst=trained_model_dir)
    # shared parameters for loading dataset
    shared_params = {'max_len': max_len, 'ngram_size': ngram_size}
    path = os.path.join(trained_model_dir, 'shared_params.json')
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(shared_params, f)
    start = time.time()
    train(model,
          trained_model_dir,
          train_iter=train_iter,
          dev_iter=dev_iter,
          epochs=epochs,
          learning_rate=learning_rate,
          stop_patience=stop_patience,
          device=device)
    end = time.time()
    print('\nduration of training process: %.2f sec' % (end - start))
    print('============================================')
def enter_num_manually(output: OutputDirectory(), num='0'):
    """Write the string ``num`` into a file named 'data' under ``output``,
    creating the directory if needed."""
    out_dir = Path(output)
    out_dir.absolute().mkdir(parents=True, exist_ok=True)
    (out_dir.resolve() / 'data').write_text(num)
def train(model,
          trained_model_dir: OutputDirectory(type='AnyDirectory'),
          train_iter,
          dev_iter=None,
          epochs=20,
          learning_rate=0.0001,
          stop_patience=3,
          device=None):
    """Train ``model`` with Adam + cross-entropy, checkpointing the best
    validation loss.

    After the last batch of each epoch (when ``dev_iter`` is given) the
    model is evaluated; if validation loss improved, the whole model object
    is saved to <trained_model_dir>/BestModel, otherwise training stops
    early once ``stop_patience`` epochs pass without improvement.
    """
    if device is None:
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    model.to(device)
    loss = torch.nn.CrossEntropyLoss()
    # (best validation loss, epoch it was achieved); None until first eval.
    min_loss_epoch = (None, None)
    stop_flag = False
    model_name = model._get_name()
    tip_str = f"\n{model_name} start training....."
    print(tip_str)
    # for logging
    run = Run.get_context()
    for epoch in range(epochs):
        loss_value_list = []
        total_iter = len(train_iter)
        for i, (x_batch, y_batch) in enumerate(train_iter):
            x_batch = torch.LongTensor(x_batch).to(device)
            y_batch = torch.LongTensor(y_batch).to(device)
            outputs = model(x_batch)
            optimizer.zero_grad()
            loss_value = loss(outputs, y_batch)
            loss_value.backward()
            optimizer.step()
            # Fix: record the batch loss BEFORE logging — the original
            # logged np.mean of the still-empty list on the first step of
            # every epoch, emitting a NaN metric.
            loss_value_list.append(loss_value.cpu().data.numpy())
            # for metrics: running mean of this epoch's batch losses
            run.log(name='CrossEntropyLoss', value=np.mean(loss_value_list))
            str_ = f"{model_name} epoch:{epoch + 1}/{epochs} step:{i + 1}/{total_iter} mean_loss:{np.mean(loss_value_list): .4f}"
            sys.stdout.write('\r' + str_)
            sys.stdout.flush()
            if (i + 1) == total_iter and dev_iter is not None:
                # End of epoch: validate, checkpoint or early-stop.
                # NOTE(review): ``eval`` here is a project helper shadowing
                # the builtin — kept, it is defined outside this block.
                loss_, acc_, prec_, recall_, f1_ = eval(
                    model, dev_iter, device)
                str_ = f" validation loss:{loss_:.4f} acc:{acc_:.4f}"
                sys.stdout.write(str_)
                sys.stdout.flush()
                print()
                model.train()
                if (min_loss_epoch[0] is None) or (min_loss_epoch[0] > loss_):
                    min_loss_epoch = (loss_, epoch)
                    os.makedirs(trained_model_dir, exist_ok=True)
                    # Fix: save to a file inside the output directory — the
                    # original passed the directory itself to torch.save
                    # (IsADirectoryError) and diverged from the sibling
                    # train() that reads/writes 'BestModel'.
                    path = os.path.join(trained_model_dir, "BestModel")
                    torch.save(obj=model, f=path)
                else:
                    if (epoch - min_loss_epoch[1]) >= stop_patience:
                        stop_flag = True
                        break
        if stop_flag is True:
            break