def compare_two_models(
        the_better_model: OutputDirectory(),
        first_trained_model: InputDirectory(type='AnyDirectory') = None,
        first_trained_result: InputDirectory(type='AnyDirectory') = None,
        second_trained_model: InputDirectory(type='AnyDirectory') = None,
        second_trained_result: InputDirectory(type='AnyDirectory') = None,
):
    print('=====================================================')
    print(f'first_trained_model: {Path(first_trained_model).resolve()}')
    print(f'first_trained_result: {Path(first_trained_result).resolve()}')
    print(f'second_trained_model: {Path(second_trained_model).resolve()}')
    print(f'second_trained_result: {Path(second_trained_result).resolve()}')
    # for logging
    run = Run.get_context()
    path = os.path.join(first_trained_result, 'result.json')
    with open(path, 'r', encoding='utf-8') as f:
        result_first = json.load(f)['acc']
    path = os.path.join(second_trained_result, 'result.json')
    with open(path, 'r', encoding='utf-8') as f:
        result_second = json.load(f)['acc']
    dst = os.path.join(the_better_model, 'BestModel')
    if result_first >= result_second:
        print('choose the first model')
        run.log(name='which one', value='first')
        src = os.path.join(first_trained_model, 'BestModel')
        shutil.copy(src=src, dst=dst)
    else:
        print('choose the second model')
        run.log(name='which one', value='second')
        src = os.path.join(second_trained_model, 'BestModel')
        shutil.copy(src=src, dst=dst)
    print('=====================================================')

def fasttext_evaluation(model_testing_result: OutputDirectory(),
                        trained_model_dir: InputDirectory() = None,
                        test_data_dir: InputDirectory() = None):
    print('=====================================================')
    print(f'trained_model_dir: {Path(trained_model_dir).resolve()}')
    print(f'test_data_dir: {Path(test_data_dir).resolve()}')
    path_word_to_index = os.path.join(test_data_dir, 'word_to_index.json')
    word_to_index = get_vocab(path_word_to_index)
    path_label = os.path.join(test_data_dir, 'label.txt')
    map_id_label, map_label_id = get_id_label(path_label)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)
    path = os.path.join(trained_model_dir, 'shared_params.json')
    with open(path, 'r', encoding='utf-8') as f:
        shared_params = json.load(f)
    path = os.path.join(test_data_dir, 'data.txt')
    test_samples = load_dataset(file_path=path,
                                max_len=shared_params['max_len'],
                                ngram_size=shared_params['ngram_size'],
                                word_to_index=word_to_index,
                                map_label_id=map_label_id)
    test_iter = DataIter(samples=test_samples, shuffle=False, device=device)
    path = os.path.join(trained_model_dir, 'BestModel')
    model = torch.load(f=path, map_location=device)
    acc_ = test(model, test_iter)
    path = os.path.join(model_testing_result, 'result.json')
    with open(path, 'w', encoding='utf-8') as f:
        json.dump({"acc": acc_}, f)
    print('\n=====================================================')

def fasttext_score_parallel(
        scored_dataset: OutputDirectory(type='AnyDirectory'),
        fasttext_model: InputDirectory(type='AnyDirectory') = '.',
        char2index_dir: InputDirectory(type='AnyDirectory') = None
):
    print('=====================================================')
    print(f'fasttext_model: {Path(fasttext_model).resolve()}')
    print(f'char2index_dir: {Path(char2index_dir).resolve()}')
    print(f'scored_dataset: {scored_dataset}')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    max_len_ = 38
    path = os.path.join(fasttext_model, 'BestModel')
    model = torch.load(f=path)

    def run(files):
        if len(files) == 0:
            return []
        print(f"Ready to process {len(files)} texts.")
        print('\n'.join(files))
        with torch.no_grad():
            test_samples = load_dataset_parallel(files=files,
                                                 max_len=max_len_,
                                                 char2index_dir=char2index_dir)
            test_iter = DataIter_Parallel(test_samples, shuffle=False)
            results = predict_parallel(model, test_iter, device)
            dict_ = {'Filename': files, 'Class': results}
            df = pd.DataFrame(data=dict_)
            print("Result:")
            print(df)
            output_file = os.path.join(scored_dataset, f"{uuid4().hex}.parquet")
            df.to_parquet(output_file, index=False)
        return results

    return run

def fasttext_evaluation(
        model_testing_result: OutputDirectory(type='AnyDirectory'),
        trained_model_dir: InputDirectory(type='AnyDirectory') = None,
        test_data_dir: InputDirectory(type='AnyDirectory') = None,
        char2index_dir: InputDirectory(type='AnyDirectory') = None):
    print('=====================================================')
    print(f'trained_model_dir: {Path(trained_model_dir).resolve()}')
    print(f'test_data_dir: {Path(test_data_dir).resolve()}')
    print(f'char2index_dir: {Path(char2index_dir).resolve()}')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    max_len_ = 38
    path = os.path.join(test_data_dir, 'test.txt')
    test_samples = load_dataset(file_path=path,
                                max_len=max_len_,
                                char2index_dir=char2index_dir)
    test_iter = DataIter(test_samples)
    path = os.path.join(trained_model_dir, 'BestModel')
    model = torch.load(f=path)
    acc_ = test(model, test_iter, device)
    path = os.path.join(model_testing_result, 'result.json')
    with open(path, 'w', encoding='utf-8') as f:
        json.dump({"acc": acc_}, f)
    print('\n=====================================================')

def gdal_sample(
        # define interface (input, output, parameters) of the module here
        output_dir1: OutputDirectory(),
        output_dir2: OutputDirectory(),
        input_dir1: InputDirectory(),
        input_dir2: InputDirectory()):
    print('I am in module definition')
    print(f'input_dir1: {Path(input_dir1).resolve()}')
    print(f'input_dir2: {Path(input_dir2).resolve()}')
    # add custom logic here
    dfd1 = load_data_frame_from_directory(input_dir1)
    data_frame1 = dfd1.data
    print(data_frame1.head(10))

def fasttext_score(
        scored_data_output_dir: OutputDirectory(),
        fasttext_model_dir: InputDirectory() = '.'
):
    print('=====================================================')
    print(f'fasttext_model: {Path(fasttext_model_dir).resolve()}')
    print(f'scored_data_output_dir: {scored_data_output_dir}')
    path_word_to_index = os.path.join(fasttext_model_dir, 'word_to_index.json')
    word_to_index = get_vocab(path_word_to_index)
    path_label = os.path.join(fasttext_model_dir, 'label.txt')
    map_id_label, map_label_id = get_id_label(path_label)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)
    path = os.path.join(fasttext_model_dir, 'shared_params.json')
    with open(path, 'r', encoding='utf-8') as f:
        shared_params = json.load(f)
    path = os.path.join(fasttext_model_dir, 'BestModel')
    model = torch.load(f=path, map_location=device)

    def run(files):
        if len(files) == 0:
            return []
        with torch.no_grad():
            test_samples = load_dataset(file_path=files,
                                        max_len=shared_params['max_len'],
                                        ngram_size=shared_params['ngram_size'],
                                        word_to_index=word_to_index,
                                        map_label_id=map_label_id)
            test_iter = DataIter(samples=test_samples,
                                 batch_size=1,
                                 shuffle=False,
                                 device=device)
            results = predict_parallel(model, test_iter, map_id_label)
            dict_ = {'Filename': files, 'Class': results}
            df = pd.DataFrame(data=dict_)
            output_file = os.path.join(scored_data_output_dir,
                                       f"{uuid4().hex}.parquet")
            df.to_parquet(output_file, index=False)
        return results

    return run

def fasttext_train(
        trained_model_dir: OutputDirectory(type='AnyDirectory'),
        training_data_dir: InputDirectory(type='AnyDirectory') = None,
        validation_data_dir: InputDirectory(type='AnyDirectory') = None,
        char2index_dir: InputDirectory(type='AnyDirectory') = None,
        epochs=2,
        batch_size=32,
        learning_rate=0.0005,
        embedding_dim=128):
    print('============================================')
    print('training_data_dir:', training_data_dir)
    print('validation_data_dir:', validation_data_dir)
    c2i = get_vocab(char2index_dir)
    class_ = get_classs()
    max_len_ = 38
    n_class_ = len(class_)
    vocab_size_ = len(c2i)
    stop_patience = 5
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    path = os.path.join(training_data_dir, 'train.txt')
    train_samples = load_dataset(file_path=path,
                                 max_len=max_len_,
                                 char2index_dir=char2index_dir)
    path = os.path.join(validation_data_dir, 'dev.txt')
    dev_samples = load_dataset(file_path=path,
                               max_len=max_len_,
                               char2index_dir=char2index_dir)
    train_iter = DataIter(train_samples, batch_size)
    dev_iter = DataIter(dev_samples, batch_size)
    model = FastText(vocab_size=vocab_size_,
                     n_class=n_class_,
                     embed_dim=embedding_dim)
    start = time.time()
    train(model,
          trained_model_dir,
          train_iter,
          dev_iter=dev_iter,
          epochs=epochs,
          learning_rate=learning_rate,
          stop_patience=stop_patience,
          device=device)
    end = time.time()
    print('\nspent time: %.2f sec' % (end - start))
    print('============================================')

def add(left: InputDirectory(), right: InputDirectory(), output: OutputDirectory()):
    l = float((Path(left).resolve() / 'data').read_text().strip())
    r = float((Path(right).resolve() / 'data').read_text().strip())
    print('left =', l)
    print('right =', r)
    m = l + r
    run = Run.get_context()
    run.log('result', m)
    run.log('left', l)
    run.log('right', r)
    run.flush()
    Path(output).absolute().mkdir(parents=True, exist_ok=True)
    with open(Path(output).resolve() / 'data', 'w') as fout:
        fout.write(str(m))

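# A minimal local sketch (hypothetical paths) of the layout the `add` module
# above expects: each input directory holds a single file named `data`
# containing one number. Run.get_context() falls back to an offline run when
# executed outside Azure ML, so this can be smoke-tested locally.
from pathlib import Path

for name, value in [('left_dir', '1.5'), ('right_dir', '2.5')]:
    Path(name).mkdir(exist_ok=True)
    (Path(name) / 'data').write_text(value)
add(left='left_dir', right='right_dir', output='sum_dir')  # writes '4.0' to sum_dir/data
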
def split_data_txt(
        training_data_output: OutputDirectory(type='AnyDirectory'),
        validation_data_output: OutputDirectory(type='AnyDirectory'),
        test_data_output: OutputDirectory(type='AnyDirectory'),
        input_dir: InputDirectory(type='AnyDirectory') = None,
        training_data_ratio=0.7,
        validation_data_ratio=0.1,
        random_split=False,
        seed=0):
    print('============================================')
    print(f"value of input_dir: '{input_dir}', type of input_dir: '{type(input_dir)}'")
    with open(input_dir, 'r', encoding='utf-8') as f:
        data = f.readlines()
    random.seed(seed if random_split else 0)
    # shuffle the list in place
    random.shuffle(data)
    n = len(data)
    # for logging
    run = Run.get_context()
    training_data_num = int(n * training_data_ratio)
    dev_data_num = int(n * validation_data_ratio)
    train = data[:training_data_num]
    dev = data[training_data_num:training_data_num + dev_data_num]
    test = data[training_data_num + dev_data_num:]
    print('num of total data:', len(data))
    print('num of training data:', len(train))
    print('num of validation data:', len(dev))
    print('num of test data:', len(test))
    # for metrics
    run.log(name='num of total data', value=len(data))
    run.log(name='num of training data', value=len(train))
    run.log(name='num of validation data', value=len(dev))
    run.log(name='num of test data', value=len(test))
    os.makedirs(training_data_output, exist_ok=True)
    path = os.path.join(training_data_output, "train.txt")
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(train)
    print(path)
    print(os.listdir(training_data_output))
    os.makedirs(validation_data_output, exist_ok=True)
    path = os.path.join(validation_data_output, "dev.txt")
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(dev)
    print(path)
    print(os.listdir(validation_data_output))
    os.makedirs(test_data_output, exist_ok=True)
    path = os.path.join(test_data_output, "test.txt")
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(test)
    print(path)
    print(os.listdir(test_data_output))
    print('============================================')

def basic_module(
        output_dir: OutputDirectory(),
        input_dir: InputDirectory() = '.',
        str_param='some_string',
):
    print(f'input_dir: {Path(input_dir).resolve()}')
    print(f'str_param: {str_param}')
    output_dir = Path(output_dir)
    with open(output_dir / "output.txt", 'w') as fout:
        fout.write(str_param)

def split_data_txt(training_data_output: OutputDirectory(),
                   validation_data_output: OutputDirectory(),
                   test_data_output: OutputDirectory(),
                   input_dir: InputDirectory() = None,
                   training_data_ratio=0.7,
                   validation_data_ratio=0.1,
                   random_split=False,
                   seed=0):
    print('============================================')
    print(f"value of input_dir: '{input_dir}', type of input_dir: '{type(input_dir)}'")
    path_input_data = os.path.join(input_dir, 'data.txt')
    with open(path_input_data, 'r', encoding='utf-8') as f:
        data = f.readlines()
    random.seed(seed if random_split else 0)
    random.shuffle(data)
    n = len(data)
    # for metrics
    run = Run.get_context()
    training_data_num = int(n * training_data_ratio)
    dev_data_num = int(n * validation_data_ratio)
    train = data[:training_data_num]
    dev = data[training_data_num:training_data_num + dev_data_num]
    test = data[training_data_num + dev_data_num:]
    print('num of total data:', len(data))
    print('num of training data:', len(train))
    print('num of validation data:', len(dev))
    print('num of test data:', len(test))
    run.log(name='num of total data', value=len(data))
    run.log(name='num of training data', value=len(train))
    run.log(name='num of validation data', value=len(dev))
    run.log(name='num of test data', value=len(test))
    # each split keeps a copy of label.txt and word_to_index.json for later steps
    path_label = os.path.join(input_dir, 'label.txt')
    path_word_to_index = os.path.join(input_dir, 'word_to_index.json')
    shutil.copy(src=path_label, dst=training_data_output)
    shutil.copy(src=path_word_to_index, dst=training_data_output)
    path = os.path.join(training_data_output, "data.txt")
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(train)
    shutil.copy(src=path_label, dst=validation_data_output)
    shutil.copy(src=path_word_to_index, dst=validation_data_output)
    path = os.path.join(validation_data_output, "data.txt")
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(dev)
    shutil.copy(src=path_label, dst=test_data_output)
    shutil.copy(src=path_word_to_index, dst=test_data_output)
    path = os.path.join(test_data_output, "data.txt")
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(test)
    print('============================================')

def parallel_score_images(
        scored_dataset: OutputDirectory(),
        trained_model: InputDirectory() = None,
):
    # Use the path of a prepared model if trained_model is None
    if trained_model is None:
        trained_model = str(
            Path(__file__).parent /
            'tests/parallel_score_images/inputs/trained_model/')
    print("Scored dataset:", scored_dataset)
    print("Trained model:", trained_model)
    map_location = 'cpu' if not torch.cuda.is_available() else None
    model = torch.load(os.path.join(trained_model, 'model.pt'),
                       map_location=map_location)
    os.makedirs(scored_dataset, exist_ok=True)
    print("Model is loaded:", model)

    def run(files):
        if len(files) == 0:
            return []
        results = []
        nthreads = min(2 * cpu_count(), len(files))
        print(f"Ready to process {len(files)} images.")
        print('\n'.join(files))
        # open the images concurrently, then reuse them in the scoring loop
        with ThreadPool(nthreads) as pool:
            imgs = pool.map(Image.open, files)
        for f, img in zip(files, imgs):
            tensor = transform(img).unsqueeze(0)
            if torch.cuda.is_available():
                tensor = tensor.cuda()
            with torch.no_grad():
                output = model(tensor)
                softmax = nn.Softmax(dim=1)
                pred_probs = softmax(output).cpu().numpy()[0]
                index = torch.argmax(output, 1)[0].cpu().item()
                result = {
                    'Filename': Path(f).name,
                    'Class': MNIST.classes[index]
                }
                for c, prob in zip(MNIST.classes, pred_probs):
                    result[f"Prob of {c}"] = prob
                results.append(result)
        columns = sorted(list(results[0].keys()))
        df = pd.DataFrame(results, columns=columns)
        print("Result:")
        print(df)
        output_file = os.path.join(scored_dataset, f"{uuid4().hex}.parquet")
        df.to_parquet(output_file, index=False)
        return results

    return run

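# A hypothetical local smoke test for the module above: calling the module
# body returns its per-mini-batch `run` callable, which can then be fed a
# small list of image paths (the paths below are placeholders, not files
# shipped with this sample).
run = parallel_score_images(scored_dataset='outputs/scored')
predictions = run(['inputs/images/0.png', 'inputs/images/1.png'])
print(predictions)
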
def fasttext_test(
        trained_model_dir: InputDirectory(type='AnyDirectory') = None,
        test_data_dir: InputDirectory(type='AnyDirectory') = None,
        char2index_dir: InputDirectory(type='AnyDirectory') = None
):
    print('============================================')
    print('test_data_dir:', test_data_dir)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    max_len_ = 38
    path = os.path.join(test_data_dir, 'test.txt')
    test_samples = load_dataset(file_path=path,
                                max_len=max_len_,
                                char2index_dir=char2index_dir)
    test_iter = DataIter(test_samples)
    path = os.path.join(trained_model_dir, 'BestModel')
    model = torch.load(f=path)
    test(model, test_iter, device)
    print('\n============================================')

def stitch_video(
        input_images: InputDirectory(description="input directory of images"),
        input_audio: InputDirectory(description="input directory of audio"),
        output_video: OutputDirectory(
            description="output directory of stitched video file")):
    # this module stitches the input images and audio back into a video with ffmpeg
    subprocess.run(
        "ffmpeg -framerate 30 -i {}/%05d_video.jpg -c:v libx264 -profile:v high -crf 20 -pix_fmt yuv420p "
        "-y {}/video_without_audio.mp4".format(input_images, output_video),
        shell=True,
        check=True)
    subprocess.run(
        "ffmpeg -i {}/video_without_audio.mp4 -i {}/video.aac -map 0:0 -map 1:0 -vcodec "
        "copy -acodec copy -y {}/video_with_audio.mp4".format(
            output_video, input_audio, output_video),
        shell=True,
        check=True)

def fasttext_score(fasttext_model: InputDirectory(type='AnyDirectory') = '.',
                   input_sentence='I like playing football very much',
                   char2index_dir: InputDirectory(type='AnyDirectory') = None):
    print('=====================================================')
    print(f'fasttext_model: {Path(fasttext_model).resolve()}')
    print(f'char2index_dir: {Path(char2index_dir).resolve()}')
    print(f'input_sentence: {input_sentence}')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    max_len_ = 38
    # the raw sentence is passed directly as the data source here
    test_samples = load_dataset(file_path=input_sentence,
                                max_len=max_len_,
                                char2index_dir=char2index_dir)
    test_iter = DataIter(test_samples, batch_size=1)
    path = os.path.join(fasttext_model, 'BestModel')
    model = torch.load(f=path)
    res = predict(model, test_iter, device)
    print('the category of "%s" is %s' % (input_sentence, res))
    print('=====================================================')

def compare_two_models(the_better_model: OutputDirectory(),
                       first_trained_model: InputDirectory() = None,
                       first_trained_result: InputDirectory() = None,
                       second_trained_model: InputDirectory() = None,
                       second_trained_result: InputDirectory() = None):
    print('=====================================================')
    print(f'first_trained_model: {Path(first_trained_model).resolve()}')
    print(f'first_trained_result: {Path(first_trained_result).resolve()}')
    print(f'second_trained_model: {Path(second_trained_model).resolve()}')
    print(f'second_trained_result: {Path(second_trained_result).resolve()}')
    # for metrics
    run = Run.get_context()
    path = os.path.join(first_trained_result, 'result.json')
    with open(path, 'r', encoding='utf-8') as f:
        result_first = json.load(f)['acc']
    path = os.path.join(second_trained_result, 'result.json')
    with open(path, 'r', encoding='utf-8') as f:
        result_second = json.load(f)['acc']
    dst = the_better_model
    if result_first >= result_second:
        print('choose the first model')
        run.log(name='which one', value='first')
        src = os.path.join(first_trained_model, 'BestModel')
        shutil.copy(src=src, dst=dst)
    else:
        print('choose the second model')
        run.log(name='which one', value='second')
        src = os.path.join(second_trained_model, 'BestModel')
        shutil.copy(src=src, dst=dst)
    # both models carry identical vocabulary/label files, so copy from the first
    path_word_to_index = os.path.join(first_trained_model, 'word_to_index.json')
    path_label = os.path.join(first_trained_model, 'label.txt')
    path_shared_params = os.path.join(first_trained_model, 'shared_params.json')
    shutil.copy(src=path_word_to_index, dst=dst)
    shutil.copy(src=path_label, dst=dst)
    shutil.copy(src=path_shared_params, dst=dst)
    print('=====================================================')

def merge(
        cleaned_yellow_data: InputDirectory(
            description="cleaned yellow data, needs to be read as pandas dataframe"),
        cleaned_green_data: InputDirectory(
            description="cleaned green data, needs to be read as pandas dataframe"),
        merged_output: OutputDirectory(description="output data after merge"),
):
    green_df = pd.read_csv(cleaned_green_data)
    yellow_df = pd.read_csv(cleaned_yellow_data)
    print("Argument (output merge taxi data path): %s" % merged_output)
    merge_df = green_df.append(yellow_df, ignore_index=True)
    merge_df.reset_index(inplace=True, drop=True)
    if merged_output is not None:
        os.makedirs(merged_output, exist_ok=True)
        print("merge output folder %s created" % merged_output)
        path = os.path.join(merged_output, "merged.csv")
        merge_df.to_csv(path)

def sample_module(
        # The input/output ports are defined using the following 4 annotations.
        # Note that you need to register the data types using
        # DataType.create_data_type(ws, 'MyDirectory', description=description, is_directory=True)
        # DataType.create_data_type(ws, 'MyFile', description=description, is_directory=False)
        # See https://docs.microsoft.com/en-us/python/api/azureml-pipeline-core/azureml.pipeline.core.graph.datatype?view=azure-ml-py#create-data-type-workspace--name--description--is-directory--parent-datatypes-none-
        output_dir: OutputDirectory(type='MyDirectory'),
        output_file: OutputFile(type='MyFile'),
        input_dir: InputDirectory(type='MyDirectory') = None,
        input_file: InputFile(type='MyFile') = None,
        # A parameter with a default value is treated as annotated with that value's type.
        # The following 5 types are supported: str, int, float, bool, enum.
        str_param='abc',
        int_param=1,
        float_param=0.1,
        bool_param=False,
        enum_param=MyEnum.Enum0,
        # If the default value is None without annotation, it will be treated as str.
        none_param=None,
):
    """A sample module using different parameter types and customized input/output ports."""
    print(f"Arg 'input_dir' = '{input_dir}', type='{type(input_dir)}'")
    if input_dir:
        print("Contents of input directory:")
        print('\n'.join(f.name for f in Path(input_dir).iterdir()))
    print(f"Arg 'input_file' = {input_file}, type='{type(input_file)}'")
    print(f"Arg 'output_dir' = {output_dir}, type='{type(output_dir)}'")
    print(f"Arg 'output_file' = {output_file}, type='{type(output_file)}'")
    print(f"Arg 'str_param' = {str_param}, type='{type(str_param)}'")
    print(f"Arg 'int_param' = {int_param}, type='{type(int_param)}'")
    print(f"Arg 'float_param' = {float_param}, type='{type(float_param)}'")
    print(f"Arg 'bool_param' = {bool_param}, type='{type(bool_param)}'")
    print(f"Arg 'enum_param' = {enum_param}, type='{type(enum_param)}'")
    print(f"Arg 'none_param' = {none_param}, type='{type(none_param)}'")
    data = str_param
    if input_file:
        with open(input_file, 'r') as fin:
            data = fin.read()
        print("Content of input file:", data)
    if input_dir:
        shutil.copytree(input_dir, output_dir)
    else:
        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, "test.txt"), 'w') as fout:
            fout.write(data)
    with open(output_file, 'w') as fout:
        fout.write(data)

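# A minimal sketch of the one-time data-type registration that the comments in
# sample_module refer to, assuming an already-configured workspace; the
# create_data_type signature follows the docs page linked above.
from azureml.core import Workspace
from azureml.pipeline.core.graph import DataType

ws = Workspace.from_config()
DataType.create_data_type(ws, 'MyDirectory', description='custom directory type', is_directory=True)
DataType.create_data_type(ws, 'MyFile', description='custom file type', is_directory=False)
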
def tokenizer(
        input_file_path: InputDirectory(description="Input text file path"),
        output_dir_path: OutputDirectory(description="Output file directory path"),
        output_to_file: IntParameter(
            description="whether to interpret output_dir_path as a file to write to, "
                        "or a folder containing the file to write to") = 0,
        input_is_tsv: IntParameter(
            description="bool determining whether to use tsv related options") = 0,
        delimiter: StringParameter(
            description="optional, delimiter to use if parsing a tsv type file") = None,
        ignore_cols: IntParameter(
            description="indices of columns to ignore if parsing a tsv") = None,
        mode: EnumParameter(
            enum=EnumMode,
            description="Tokenizer to use [train, inference, spacy]") = EnumMode.train,
        type: EnumParameter(
            enum=EnumType,
            description="Whether to use word tokenizer or sentence tokenizer") = EnumType.word,
):
    sys.argv = [
        'tokenizer.py',
        '-i', str(input_file_path),
        '-o', str(output_dir_path),
        '--output_to_file', str(output_to_file),
        '--input_is_tsv', str(input_is_tsv),
        '-m', mode.value,
        '-t', type.value,
    ]
    if delimiter is not None:
        sys.argv += ['--delimiter', str(delimiter)]
    if ignore_cols is not None:
        sys.argv += ['--ignore_cols', str(ignore_cols)]
    print(' '.join(sys.argv))
    runpy.run_path('tokenizer.py', run_name='__main__')

def prepare_data(
        output_data: OutputDirectory(),
        input_data: InputDirectory() = None,
        str_param: str = None,
        int_param: int = 0,
        enum_param: EnumEnumParam = None,
):
    sys.argv = [
        'prepare_data.py',
        '--input_data', str(input_data),
        '--output_data', str(output_data),
        '--int_param', str(int_param),
    ]
    if str_param is not None:
        sys.argv += ['--str_param', str(str_param)]
    if enum_param is not None:
        sys.argv += ['--enum_param', enum_param.value]
    print(' '.join(sys.argv))
    runpy.run_path('prepare_data.py', run_name='__main__')

def copy_files(
        output_dir: OutputDirectory(),
        input_dir: InputDirectory() = '.',
        str_param='some_string',
):
    input_dir = Path(input_dir)
    print(f'input_dir: {input_dir.resolve()}')
    print(f'str_param: {str_param}')
    files = []
    if input_dir.is_dir():
        files = [str(f) for f in input_dir.iterdir()]
    if len(files) == 0:
        raise ValueError('input_dir should be a directory with files')
    output_dir = Path(output_dir)
    with open(output_dir / "output.txt", 'w') as fout:
        fout.write(str_param)

def style_transform_parallel(
        model_dir: InputDirectory(
            description="saved torch model to be used for stylizing the image."),
        output_path: OutputDirectory(
            description="directory holding the output images"),
        style: StringParameter(description="style name") = None,
):
    print(f'output path: {output_path}')
    print(f'Cuda available? {torch.cuda.is_available()}')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with torch.no_grad():
        style_model = TransformerNet()
        state_dict = torch.load(os.path.join(model_dir, style + ".pth"))
        # remove saved deprecated running_* keys in InstanceNorm from the checkpoint
        for k in list(state_dict.keys()):
            if re.search(r'in\d+\.running_(mean|var)$', k):
                del state_dict[k]
        style_model.load_state_dict(state_dict)
        style_model.to(device)
    print(f'Model loaded successfully. Path: {model_dir}')

    def run(mini_batch):
        result = []
        for image_file_path in mini_batch:
            img = load_image(image_file_path)
            print(f'load image from: {image_file_path}')
            with torch.no_grad():
                content_transform = transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Lambda(lambda x: x.mul(255))
                ])
                content_image = content_transform(img)
                content_image = content_image.unsqueeze(0).to(device)
                output = style_model(content_image).cpu()
            output_file_path = os.path.join(output_path,
                                            os.path.basename(image_file_path))
            save_image(output_file_path, output[0])
            result.append(output_file_path)
            print(f'transferred image saved in: {output_file_path}')
        return result

    return run

def mpi_module(
        output_dir: OutputDirectory(),
        input_dir: InputDirectory() = '.',
        param0: str = 'abc',
        param1: int = 10,
):
    from mpi4py import MPI
    for k, v in locals().items():
        print(f"{k}: {v}")
    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    rank = comm.Get_rank()
    print(f"This is an MPI module, I'm rank {rank}/{size}.")
    if rank == 0:
        print("I will write data.")
        output_dir = Path(output_dir)
        with open(output_dir / "output.txt", 'w') as fout:
            fout.write(param0)
            fout.write(str(param1))
    else:
        print("I don't return data.")

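# The module above writes output only from rank 0. Azure ML normally
# provisions the MPI processes for MPI job types; a hypothetical local launch
# with mpi4py installed would look like:
#   mpirun -n 4 python mpi_module.py <module args>
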
def slice_video(
        input_video: InputDirectory(
            description="input directory of video file") = './data/input/video',
        output_audio: OutputDirectory(
            description="output directory of audio from video"
        ) = './data/output/audio',
        output_images: OutputDirectory(
            description="output directory of images sliced from video"
        ) = './data/output/images',
):
    # this module takes the input video, extracts its audio track and slices it into images with ffmpeg
    subprocess.run("ffmpeg -i {} {}/video.aac".format(input_video,
                                                      output_audio),
                   shell=True,
                   check=True)
    subprocess.run("ffmpeg -i {} {}/%05d_video.jpg -hide_banner".format(
        input_video, output_images),
                   shell=True,
                   check=True)

def save_image(filename, data):
    img = data.clone().clamp(0, 255).numpy()
    img = img.transpose(1, 2, 0).astype("uint8")
    img = Image.fromarray(img)
    img.save(filename)


# For a parallel-run module, the parallel input needs to be declared in
# dsl.module(parallel_inputs=...) as below.
@dsl.module(
    job_type='parallel',
    description='use pretrained pytorch model to do image style transfer',
    name='Style Transfer Parallel',
    parallel_inputs=[
        InputDirectory(name='Content Dir', description="directory of input images")
    ])
def style_transform_parallel(
        model_dir: InputDirectory(
            description="saved torch model to be used for stylizing the image."),
        output_path: OutputDirectory(
            description="directory holding the output images"),
        style: StringParameter(description="style name") = None,
):
    print(f'output path: {output_path}')
    print(f'Cuda available? {torch.cuda.is_available()}')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with torch.no_grad():
        style_model = TransformerNet()
        state_dict = torch.load(os.path.join(model_dir, style + ".pth"))
        # remove saved deprecated running_* keys in InstanceNorm from the checkpoint

import os
from uuid import uuid4
from pathlib import Path

import torch

from azureml.pipeline.wrapper import dsl
from azureml.pipeline.wrapper.dsl.module import ModuleExecutor, InputDirectory, OutputDirectory
from common.utils import DataIter, load_dataset, predict_parallel, get_vocab, get_id_label


@dsl.module(
    name="FastText Score",
    version='0.0.23',
    description='Predict the categories of the input sentences',
    job_type='parallel',
    parallel_inputs=[InputDirectory(name='Texts to score')],
    base_image='mcr.microsoft.com/azureml/intelmpi2018.3-cuda10.0-cudnn7-ubuntu16.04'
)
def fasttext_score(
        scored_data_output_dir: OutputDirectory(),
        fasttext_model_dir: InputDirectory() = '.'
):
    print('=====================================================')
    print(f'fasttext_model: {Path(fasttext_model_dir).resolve()}')
    print(f'scored_data_output_dir: {scored_data_output_dir}')
    path_word_to_index = os.path.join(fasttext_model_dir, 'word_to_index.json')
    word_to_index = get_vocab(path_word_to_index)
    path_label = os.path.join(fasttext_model_dir, 'label.txt')
    map_id_label, map_label_id = get_id_label(path_label)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)

from pathlib import Path

import torch
from torchvision import transforms

from azureml.pipeline.wrapper import dsl
from azureml.pipeline.wrapper.dsl.module import InputDirectory, OutputDirectory, ModuleExecutor

transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.1307, ), (0.3081, ))])


@dsl.module(
    name='Parallel Score Images',
    version='0.0.1',
    job_type='parallel',
    parallel_inputs=[
        InputDirectory(name='Images to score'),
        InputDirectory(name='Optional images to score', optional=True)
    ],
)
def parallel_score_images(
        scored_dataset: OutputDirectory(),
        trained_model: InputDirectory() = None,
):
    # Use the path of a prepared model if trained_model is None
    if trained_model is None:
        trained_model = str(
            Path(__file__).parent /
            'tests/parallel_score_images/inputs/trained_model/')
    print("Scored dataset:", scored_dataset)
    print("Trained model:", trained_model)
    map_location = 'cpu' if not torch.cuda.is_available() else None

def fasttext_train(trained_model_dir: OutputDirectory(type='ModelDirectory'),
                   training_data_dir: InputDirectory() = None,
                   validation_data_dir: InputDirectory() = None,
                   epochs=1,
                   batch_size=64,
                   max_len=32,
                   embed_dim=300,
                   hidden_size=256,
                   ngram_size=200000,
                   dropout=0.5,
                   learning_rate=0.001):
    print('============================================')
    print('training_data_dir:', training_data_dir)
    print('validation_data_dir:', validation_data_dir)
    path_word_to_index = os.path.join(training_data_dir, 'word_to_index.json')
    word_to_index = get_vocab(path_word_to_index)
    path_label = os.path.join(training_data_dir, 'label.txt')
    map_id_label, map_label_id = get_id_label(path_label)
    class_num = len(map_id_label)
    vocab_size = len(word_to_index)
    stop_patience = 5
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)
    # load training dataset
    path = os.path.join(training_data_dir, 'data.txt')
    train_samples = load_dataset(file_path=path,
                                 word_to_index=word_to_index,
                                 map_label_id=map_label_id,
                                 max_len=max_len,
                                 ngram_size=ngram_size)
    train_iter = DataIter(samples=train_samples,
                          batch_size=batch_size,
                          shuffle=True,
                          device=device)
    # load validation dataset
    path = os.path.join(validation_data_dir, 'data.txt')
    dev_samples = load_dataset(file_path=path,
                               word_to_index=word_to_index,
                               map_label_id=map_label_id,
                               max_len=max_len,
                               ngram_size=ngram_size)
    dev_iter = DataIter(samples=dev_samples,
                        batch_size=batch_size,
                        shuffle=True,
                        device=device)
    model = FastText(vocab_size=vocab_size,
                     class_num=class_num,
                     dropout=dropout,
                     embed_dim=embed_dim,
                     hidden_size=hidden_size,
                     ngram_size=ngram_size)
    # watch parameters
    print(model.parameters)
    # copy word_to_index.json and label.txt for later scoring.
    shutil.copy(src=path_word_to_index, dst=trained_model_dir)
    shutil.copy(src=path_label, dst=trained_model_dir)
    # shared parameters for loading dataset
    shared_params = {'max_len': max_len, 'ngram_size': ngram_size}
    path = os.path.join(trained_model_dir, 'shared_params.json')
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(shared_params, f)
    start = time.time()
    train(model,
          trained_model_dir,
          train_iter=train_iter,
          dev_iter=dev_iter,
          epochs=epochs,
          learning_rate=learning_rate,
          stop_patience=stop_patience,
          device=device)
    end = time.time()
    print('\nduration of training process: %.2f sec' % (end - start))
    print('============================================')

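# A sketch of the usual entry point for these dsl.module samples, assuming
# fasttext_train is decorated with @dsl.module and ModuleExecutor is imported
# from azureml.pipeline.wrapper.dsl.module:
import sys

if __name__ == '__main__':
    ModuleExecutor(fasttext_train).execute(sys.argv)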