Example #1
0
 def test_module_with_execute(self):
     """Run fasttext_score end-to-end and check one scored row exists per input file."""
     # Start from a clean result directory so counts from earlier runs don't leak in.
     result_dir = self.prepare_outputs()['scored_data_output_dir']
     os.makedirs(result_dir, exist_ok=True)
     for name in os.listdir(result_dir):
         os.remove(os.path.join(result_dir, name))
     # Simulate a parallel run driven purely by command-line arguments.
     ModuleExecutor(fasttext_score).execute(self.prepare_argv())
     data_dir = self.prepare_inputs()['texts_to_score']
     os.makedirs(data_dir, exist_ok=True)
     expected_rows = len(os.listdir(data_dir))
     # Each parquet result file may hold several rows; total them all.
     actual_rows = sum(
         pd.read_parquet(os.path.join(result_dir, name)).shape[0]
         for name in os.listdir(result_dir)
     )
     self.assertEqual(expected_rows, actual_rows)
Example #2
0
    # for metrics
    run.log(name='num of total data', value=len(data))
    run.log(name='num of training data', value=len(train))
    run.log(name='num of validation data', value=len(dev))
    run.log(name='num of test_data', value=len(test))
    path_label = os.path.join(input_dir, 'label.txt')
    path_word_to_index = os.path.join(input_dir, 'word_to_index.json')

    shutil.copy(src=path_label, dst=training_data_output)
    shutil.copy(src=path_word_to_index, dst=training_data_output)
    path = os.path.join(training_data_output, "data.txt")
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(train)

    shutil.copy(src=path_label, dst=validation_data_output)
    shutil.copy(src=path_word_to_index, dst=validation_data_output)
    path = os.path.join(validation_data_output, "data.txt")
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(dev)

    shutil.copy(src=path_label, dst=test_data_output)
    shutil.copy(src=path_word_to_index, dst=test_data_output)
    path = os.path.join(test_data_output, "data.txt")
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(test)
    print('============================================')


if __name__ == '__main__':
    ModuleExecutor(split_data_txt).execute(sys.argv)
Example #3
0
import shutil
import sys
from pathlib import Path

from azureml.pipeline.wrapper.dsl.module import ModuleExecutor, InputDirectory, OutputDirectory
from azureml.pipeline.wrapper import dsl


@dsl.module(name="copy_files")
def copy_files(
    output_dir: OutputDirectory(),
    input_dir: InputDirectory() = '.',
    str_param='some_string',
):
    """Validate that input_dir is a non-empty directory, then write str_param to output_dir/output.txt.

    Raises:
        ValueError: if input_dir is not a directory or contains no files.
    """
    input_dir = Path(input_dir)
    print(f'input_dir: {input_dir.resolve()}')
    print(f'str_param: {str_param}')

    # A missing directory and an empty directory are both treated as errors.
    files = [str(f) for f in input_dir.iterdir()] if input_dir.is_dir() else []
    if not files:
        # Fix: the message was a placeholder-free f-string with a grammar slip.
        raise ValueError('input_dir should be a directory with files')

    output_dir = Path(output_dir)
    with open(output_dir / 'output.txt', 'w') as fout:
        fout.write(str_param)


if __name__ == '__main__':
    ModuleExecutor(copy_files).execute(sys.argv)
Example #4
0
            if torch.cuda.is_available():
                tensor = tensor.cuda()

            with torch.no_grad():
                output = model(tensor)
                softmax = nn.Softmax(dim=1)
                pred_probs = softmax(output).cpu().numpy()[0]
                index = torch.argmax(output, 1)[0].cpu().item()
                result = {
                    'Filename': Path(f).name,
                    'Class': MNIST.classes[index]
                }
                for c, prob in zip(MNIST.classes, pred_probs):
                    result[f"Prob of {c}"] = prob
            results.append(result)
        columns = sorted(list(results[0].keys()))
        df = pd.DataFrame(results, columns=columns)
        print("Result:")
        print(df)
        output_file = os.path.join(scored_dataset, f"{uuid4().hex}.parquet")
        df.to_parquet(output_file, index=False)
        return results

    return run


# This main code is only used for local debugging, will never be reached in AzureML when it is a parallel module.
# See https://docs.microsoft.com/en-us/azure/machine-learning/how-to-use-parallel-run-step#write-your-inference-script
if __name__ == '__main__':
    ModuleExecutor(parallel_score_images).execute(sys.argv)
Example #5
0
    description='merge two datasets',
    name='nyc taxi merge',
)
def merge(
        cleaned_yellow_data: InputDirectory(
            description=
            "cleaned yellow data, needs to be read as pandas dataframe"),
        cleaned_green_data: InputDirectory(
            description=
            "cleaned green data, needs to be read as pandas dataframe"),
        merged_output: OutputDirectory(description="output data after merge"),
):
    """Concatenate the cleaned green and yellow taxi CSVs and write merged.csv.

    Both inputs are read with pandas.read_csv; the merged frame is written to
    <merged_output>/merged.csv when merged_output is provided.
    """
    green_df = pd.read_csv(cleaned_green_data)
    yellow_df = pd.read_csv(cleaned_yellow_data)

    print("Argument (output merge taxi data path): %s" % merged_output)

    # Fix: DataFrame.append was deprecated and removed in pandas 2.0 —
    # pd.concat is the supported equivalent.
    merge_df = pd.concat([green_df, yellow_df], ignore_index=True)
    merge_df.reset_index(inplace=True, drop=True)

    if merged_output is not None:
        os.makedirs(merged_output, exist_ok=True)
        print("merge output folder %s created" % merged_output)
        # to_csv returns None; the original bound it to an unused variable.
        merge_df.to_csv(os.path.join(merged_output, "merged.csv"))


if __name__ == '__main__':
    ModuleExecutor(merge).execute(sys.argv)
Example #6
0
    # for metrics
    run.log(name='num of total data', value=len(data))
    run.log(name='num of training data', value=len(train))
    run.log(name='num of validation data', value=len(dev))
    run.log(name='num of test_data', value=len(test))

    os.makedirs(training_data_output, exist_ok=True)
    path = os.path.join(training_data_output, "train.txt")
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(train)
    print(path)
    print(os.listdir(training_data_output))

    os.makedirs(validation_data_output, exist_ok=True)
    path = os.path.join(validation_data_output, "dev.txt")
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(dev)
    print(path)
    print(os.listdir(validation_data_output))

    os.makedirs(test_data_output, exist_ok=True)
    for i, t in enumerate(test):
        path_new = os.path.join(test_data_output, str(i))
        with open(path_new, 'w', encoding='utf-8') as f:
            f.write(t)
    print('============================================')


if __name__ == '__main__':
    ModuleExecutor(split_data_txt_parallel).execute(sys.argv)
Example #7
0
def fasttext_evaluation(
        model_testing_result: OutputDirectory(type='AnyDirectory'),
        trained_model_dir: InputDirectory(type='AnyDirectory') = None,
        test_data_dir: InputDirectory(type='AnyDirectory') = None,
        char2index_dir: InputDirectory(type='AnyDirectory') = None):
    """Evaluate a trained model on test.txt and write {"acc": <accuracy>} to result.json.

    Loads the serialized model from <trained_model_dir>/BestModel and the test
    set from <test_data_dir>/test.txt (vocabulary from char2index_dir).
    """
    print('=====================================================')
    print(f'trained_model_dir: {Path(trained_model_dir).resolve()}')
    print(f'test_data_dir: {Path(test_data_dir).resolve()}')
    print(f'char2index_dir: {Path(char2index_dir).resolve()}')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    max_len_ = 38  # NOTE(review): fixed sequence length — presumably matches training; confirm.
    path = os.path.join(test_data_dir, 'test.txt')
    test_samples = load_dataset(file_path=path,
                                max_len=max_len_,
                                char2index_dir=char2index_dir)

    test_iter = DataIter(test_samples)

    path = os.path.join(trained_model_dir, 'BestModel')
    model = torch.load(f=path)

    # Make sure the output directory exists before writing into it.
    os.makedirs(model_testing_result, exist_ok=True)
    path = os.path.join(model_testing_result, 'result.json')
    acc_ = test(model, test_iter, device)
    # Fix: the original passed a bare open() to json.dump and never closed the
    # handle; the context manager guarantees flush + close.
    with open(path, 'w') as f:
        json.dump({"acc": acc_}, f)
    print('\n============================================')


if __name__ == '__main__':
    ModuleExecutor(fasttext_evaluation).execute(sys.argv)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)
    path = os.path.join(fasttext_model_dir, 'shared_params.json')
    with open(path, 'r', encoding='utf-8') as f:
        shared_params = json.load(f)
    path = os.path.join(fasttext_model_dir, 'BestModel')
    model = torch.load(f=path, map_location=device)

    def run(files):
        """Parallel-run entry point: score a mini-batch of text files.

        Returns the list of predicted class labels; also writes a parquet file
        mapping Filename -> Class into scored_data_output_dir. Relies on
        closure variables (model, shared_params, word_to_index, map_label_id,
        map_id_label, device, scored_data_output_dir) from the enclosing scope.
        """
        if len(files) == 0:
            return []
        with torch.no_grad():
            test_samples = load_dataset(file_path=files, max_len=shared_params['max_len'],
                                        ngram_size=shared_params['ngram_size'], word_to_index=word_to_index,
                                        map_label_id=map_label_id)
            test_iter = DataIter(samples=test_samples, batch_size=1, shuffle=False, device=device)
            results = predict_parallel(model, test_iter, map_id_label)
            dict_ = {'Filename': files, 'Class': results}
            df = pd.DataFrame(data=dict_)
            # A random file name avoids collisions between parallel workers.
            output_file = os.path.join(scored_data_output_dir, f"{uuid4().hex}.parquet")
            df.to_parquet(output_file, index=False)
        return results

    return run


# This main code is only used for local debugging, will never be reached in AzureML when it is a parallel module.
# See https://docs.microsoft.com/en-us/azure/machine-learning/how-to-use-parallel-run-step#write-your-inference-script
if __name__ == '__main__':
    ModuleExecutor(fasttext_score).execute(sys.argv)
Example #9
0
@dsl.module(
    description='slice input video into images and audio',
    name='slice video',
)
def slice_video(
    input_video: InputDirectory(
        description="input directory of video file") = './data/input/video',
    output_audio: OutputDirectory(
        description="output directory of audio from video"
    ) = '/data/output/video',
    output_images: OutputDirectory(
        description="output directory of images slice from video"
    ) = '/data/output/images',
):
    """Split the input video into an audio track and a sequence of JPEG frames.

    Fix: the commands were built as shell strings via str.format with
    shell=True, which breaks on paths with spaces and is injection-prone;
    argument lists avoid the shell entirely.
    """
    # NOTE(review): the output defaults are absolute ('/data/...') while the
    # input default is relative ('./data/...') — possibly a missing dot; verify.

    # Extract the audio track.
    subprocess.run(
        ['ffmpeg', '-i', input_video, f'{output_audio}/video.aac'],
        check=True)

    # Slice the video into numbered JPEG frames.
    subprocess.run(
        ['ffmpeg', '-i', input_video, f'{output_images}/%05d_video.jpg',
         '-hide_banner'],
        check=True)


if __name__ == '__main__':
    ModuleExecutor(slice_video).execute(sys.argv)
    type: EnumParameter(
        enum=EnumType,
        description="Whether to use word tokenizer or sentence tokenizer"
    ) = EnumType.word,
):
    sys.argv = [
        'tokenizer.py',
        '-i',
        str(input_file_path),
        '-o',
        str(output_dir_path),
        '--output_to_file',
        str(output_to_file),
        '--input_is_tsv',
        str(input_is_tsv),
        '-m',
        mode.value,
        '-t',
        type.value,
    ]
    if delimiter is not None:
        sys.argv += ['--delimiter', str(delimiter)]
    if ignore_cols is not None:
        sys.argv += ['--ignore_cols', str(ignore_cols)]
    print(' '.join(sys.argv))
    runpy.run_path('tokenizer.py', run_name='__main__')


if __name__ == '__main__':
    ModuleExecutor(tokenizer).execute(sys.argv)
Example #11
0
    if input_dir:
        print(f"Contents of input directory:")
        print('\n'.join(f.name for f in Path(input_dir).iterdir()))
    print(f"Arg 'input_file' = {input_file}, type='{type(input_file)}'")
    print(f"Arg 'output_dir' = {output_dir}, type='{type(output_dir)}'")
    print(f"Arg 'output_file' = {output_file}, type='{type(output_file)}'")
    print(f"Arg 'str_param' = {str_param}, type='{type(str_param)}'")
    print(f"Arg 'int_param' = {int_param}, type='{type(int_param)}'")
    print(f"Arg 'float_param' = {float_param}, type='{type(float_param)}'")
    print(f"Arg 'bool_param' = {bool_param}, type='{type(bool_param)}'")
    print(f"Arg 'enum_param' = {enum_param}, type='{type(enum_param)}'")
    print(f"Arg 'none_param' = {none_param}, type='{type(none_param)}'")

    data = str_param
    if input_file:
        with open(input_file, 'r') as fin:
            data = fin.read()
        print("Content of input file:", data)
    if input_dir:
        shutil.copytree(input_dir, output_dir)
    else:
        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, "test.txt"), 'w') as fout:
            fout.write(data)
    with open(output_file, 'w') as fout:
        fout.write(data)


if __name__ == '__main__':
    ModuleExecutor(sample_module).execute(sys.argv)
    val2 = 'val2'


@dsl.module(
    description='A sample module which shows the input data.',
    name='Prepare data',
)
def prepare_data(
    output_data: OutputDirectory(),
    input_data: InputDirectory() = None,
    str_param: str = None,
    int_param: int = 0,
    enum_param: EnumEnumParam = None,
):
    """Forward this module's arguments to prepare_data.py and run it as a script."""
    # Build the argv the wrapped script expects; required args always go in.
    argv = [
        'prepare_data.py',
        '--input_data', str(input_data),
        '--output_data', str(output_data),
        '--int_param', str(int_param),
    ]
    # Optional arguments are only appended when a value was supplied.
    if str_param is not None:
        argv.extend(['--str_param', str(str_param)])
    if enum_param is not None:
        argv.extend(['--enum_param', enum_param.value])
    sys.argv = argv
    print(' '.join(sys.argv))
    runpy.run_path('prepare_data.py', run_name='__main__')


if __name__ == '__main__':
    ModuleExecutor(prepare_data).execute(sys.argv)
Example #13
0
    print(f'char2index_dir: {Path(char2index_dir).resolve()}')
    print(f'scored_dataset: {scored_dataset}')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    max_len_ = 38
    path = os.path.join(fasttext_model, 'BestModel')
    model = torch.load(f=path)

    def run(files):
        """Parallel-run entry point: classify a mini-batch of text files.

        Returns the predicted class labels and writes a Filename -> Class
        parquet into scored_dataset. Uses closure variables (model, max_len_,
        char2index_dir, device, scored_dataset) from the enclosing scope.
        """
        if len(files) == 0:
            return []
        print(f"Ready to process {len(files)} texts.")
        print('\n'.join(files))

        with torch.no_grad():
            test_samples = load_dataset_parallel(files=files, max_len=max_len_, char2index_dir=char2index_dir)
            test_iter = DataIter_Parallel(test_samples, shuffle=False)
            results = predict_parallel(model, test_iter, device)
            dict_ = {'Filename': files, 'Class': results}
            df = pd.DataFrame(data=dict_)
            print("Result:")
            print(df)
            # Random name avoids collisions between parallel workers.
            output_file = os.path.join(scored_dataset, f"{uuid4().hex}.parquet")
            df.to_parquet(output_file, index=False)
        return results

    return run


if __name__ == '__main__':
    ModuleExecutor(fasttext_score_parallel).execute(sys.argv)
Example #14
0
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
import sys

from azureml.pipeline.wrapper.dsl.module import ModuleExecutor
from azureml.pipeline.wrapper import dsl

from package1.foo import bar


@dsl.module(name="Basic Module")
def basic_module(
    string_parameter: str,
    int_parameter: int,
    boolean_parameter: bool,
    string_parameter_with_default='abc',
):
    """Minimal dsl.module example: declares basic parameter types and delegates to bar().

    The parameters are unused here; they exist to demonstrate how parameters
    of each primitive type are declared on a module.
    """
    bar()


if __name__ == '__main__':
    ModuleExecutor(basic_module).execute(sys.argv)
Example #15
0
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

import sys
from azureml.pipeline.wrapper import dsl
from azureml.pipeline.wrapper.dsl.module import ModuleExecutor, OutputFile

from my_module import *
import my_module


@dsl.module()
def add(
        output_file: OutputFile(),
        a=1,
        b=2,
        c=3,
):
    """Delegate to my_module.add with this module's parameters.

    Fix: forwarding **locals() is fragile — any local variable introduced
    before the call would silently leak into the delegate's kwargs. Explicit
    keyword arguments keep the mapping stable and readable.
    """
    return my_module.add(output_file=output_file, a=a, b=b, c=c)


if __name__ == '__main__':
    ModuleExecutor(add).execute(sys.argv)
Example #16
0
import gdal

from azureml.pipeline.wrapper.dsl.module import ModuleExecutor, InputDirectory, OutputDirectory
from azureml.studio.core.io.data_frame_directory import load_data_frame_from_directory
from azureml.pipeline.wrapper import dsl

gdal_version_num = int(gdal.VersionInfo('VERSION_NUM'))
print(f'gdal version number is {gdal_version_num}.')


@dsl.module(name="gdal_sample")
def gdal_sample(
        # Module interface: two output directories and two input directories.
        output_dir1: OutputDirectory(),
        output_dir2: OutputDirectory(),
        input_dir1: InputDirectory(),
        input_dir2: InputDirectory()):
    """Sample module: load a DataFrameDirectory from input_dir1 and print its head."""
    print('I am in module definition')
    print(f'input_dir1: {Path(input_dir1).resolve()}')
    print(f'input_dir2: {Path(input_dir2).resolve()}')

    # Read the tabular payload written by an upstream module and preview it.
    frame = load_data_frame_from_directory(input_dir1).data
    print(frame.head(10))


if __name__ == '__main__':
    ModuleExecutor(gdal_sample).execute(sys.argv)
Example #17
0
import sys
from pathlib import Path
from tempfile import mkdtemp

from azureml.pipeline.wrapper.dsl.module import ModuleExecutor, InputDirectory, OutputDirectory
from azureml.pipeline.wrapper import dsl
from azureml.core.run import Run

@dsl.module(
    name='enter_num_manually',
    description='Put a number in parameter and this module will convert it to a file',
    job_type='basic')
def enter_num_manually(output: OutputDirectory(), num='0'):
    """Write the string parameter num into <output>/data, creating the directory if needed."""
    out_dir = Path(output).resolve()
    out_dir.mkdir(parents=True, exist_ok=True)
    # Fix: 'data' was a placeholder-free f-string; plain literal + write_text
    # is the idiomatic pathlib form.
    (out_dir / 'data').write_text(num)

if __name__ == "__main__":
    ModuleExecutor(enter_num_manually).execute(sys.argv)
import sys
from pathlib import Path
from tempfile import mkdtemp

from azureml.pipeline.wrapper.dsl.module import ModuleExecutor
from azureml.pipeline.wrapper import dsl
from azureml.core.run import Run


@dsl.module(job_type="basic", name="multiply")
def multiply(left: Path = Path('.'), right: Path = Path('.')) -> Path:
    """Multiply the numbers stored in <left>/data and <right>/data.

    Logs the product as the 'result' metric on the current run and returns a
    fresh temp directory containing the product in a file named 'data'.
    """
    # Fix: the originals used the ambiguous names l/r (E741), wrapped an
    # already-Path expression in Path(...), and used placeholder-free f-strings.
    left_value = float((left.resolve() / 'data').read_text().strip())
    right_value = float((right.resolve() / 'data').read_text().strip())
    print('left = ', left_value)
    print('right = ', right_value)

    result = left_value * right_value
    run = Run.get_context()
    run.log('result', result)
    run.flush()

    # Downstream modules expect the value in a file named 'data'.
    output_path = Path(mkdtemp())
    (output_path / 'data').write_text(str(result))
    return output_path


if __name__ == "__main__":
    ModuleExecutor(multiply).execute(sys.argv)
):
    print('=====================================================')
    print(f'input_dir: {Path(first_trained_model).resolve()}')
    print(f'input_dir: {Path(first_trained_result).resolve()}')
    print(f'input_dir: {Path(second_trained_model).resolve()}')
    print(f'input_dir: {Path(second_trained_result).resolve()}')
    # for logging
    run = Run.get_context()
    path = os.path.join(first_trained_result, 'result.json')
    result_first = json.load(open(path, 'r'))['acc']

    path = os.path.join(second_trained_result, 'result.json')
    second_first = json.load(open(path, 'r'))['acc']

    dst = os.path.join(the_better_model, 'BestModel')
    if result_first >= second_first:
        print('choose the first model')
        run.log(name='which one', value='first')
        src = os.path.join(first_trained_model, 'BestModel')
        shutil.copy(src=src, dst=dst)
    else:
        print('choose the second model')
        run.log(name='which one', value='second')
        src = os.path.join(second_trained_model, 'BestModel')
        shutil.copy(src=src, dst=dst)
    print('=====================================================')


if __name__ == '__main__':
    ModuleExecutor(compare_two_models).execute(sys.argv)
Example #20
0
@dsl.module(
    description='stitch images and audio back to video',
    name='stitch video',
)
def stitch_video(
        input_images: InputDirectory(description="input directory of images"),
        input_audio: InputDirectory(description="input directory of audio"),
        output_video: OutputDirectory(
            description="output directory of stitched video file")):
    """Re-encode the frame sequence into a video, then mux the audio track back in.

    Bug fix: the original referenced an undefined `args` namespace
    (args.images_dir / args.output_dir / args.input_audio) left over from an
    argparse version, which raised NameError at call time; the function
    parameters are used instead.
    """
    # Step 1: numbered JPEG frames -> silent H.264 video in the output directory.
    # NOTE(review): shell=True with formatted paths breaks on spaces; kept for
    # minimal change, consider an argument list.
    subprocess.run(
        "ffmpeg -framerate 30 -i {}/%05d_video.jpg -c:v libx264 -profile:v high -crf 20 -pix_fmt yuv420p "
        "-y {}/video_without_audio.mp4".format(input_images,
                                               output_video),
        shell=True,
        check=True)

    # Step 2: mux the silent video with the extracted audio track.
    subprocess.run(
        "ffmpeg -i {}/video_without_audio.mp4 -i {}/video.aac -map 0:0 -map 1:0 -vcodec "
        "copy -acodec copy -y {}/video_with_audio.mp4".format(
            output_video, input_audio, output_video),
        shell=True,
        check=True)


if __name__ == '__main__':
    ModuleExecutor(stitch_video).execute(sys.argv)
    def split_samples(self, sub_samples):
        """Unzip (x, y) sample pairs into two parallel numpy arrays."""
        features = [pair[0] for pair in sub_samples]
        labels = [pair[1] for pair in sub_samples]
        return np.array(features), np.array(labels)

    def __next__(self):
        """Return the next (features, labels) batch; raise StopIteration when done.

        When residue is set, one extra partial batch is emitted after the
        n_batches full batches.
        """
        # Trailing partial batch: everything after the last full batch.
        if (self.index == self.n_batches) and (self.residue is True):
            tail = self.samples[self.index * self.batch_size: len(self.samples)]
            self.index += 1
            return self.split_samples(tail)
        # Exhausted: reset the cursor so the iterator is reusable, then stop.
        if self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        # Regular full batch.
        start = self.index * self.batch_size
        batch = self.samples[start: start + self.batch_size]
        self.index += 1
        return self.split_samples(batch)

    def __iter__(self):
        # The object is its own iterator; iteration state lives in self.index.
        return self

    def __len__(self):
        """Batch count, including the trailing partial batch when residue is set."""
        extra_batch = 1 if self.residue else 0
        return self.n_batches + extra_batch


if __name__ == '__main__':
    ModuleExecutor(fasttext_test).execute(sys.argv)
Example #22
0
from azureml.pipeline.wrapper import dsl
from azureml.pipeline.wrapper.dsl.module import ModuleExecutor, InputDirectory, OutputDirectory


@dsl.module(name="MPI Module", job_type='mpi')
def mpi_module(
    output_dir: OutputDirectory(),
    input_dir: InputDirectory() = '.',
    param0: str = 'abc',
    param1: int = 10,
):
    """Demonstration MPI module: every rank logs its arguments; rank 0 writes output.txt."""
    from mpi4py import MPI
    # Dump the current arguments for debugging on every rank.
    for name, value in locals().items():
        print(f"{name}: {value}")
    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    rank = comm.Get_rank()
    print(f"This is an MPI module, I'm rank {rank}/{size}.")
    # Only rank 0 produces output; the other ranks just report and exit.
    if rank != 0:
        print("I don't return data.")
        return
    print("I will write data.")
    with open(Path(output_dir) / "output.txt", 'w') as fout:
        fout.write(param0)
        fout.write(str(param1))


if __name__ == '__main__':
    ModuleExecutor(mpi_module).execute(sys.argv)
 def test_module_with_execute(self):
     # This test simulates a parallel run from cmd line arguments to call parallel_sample.
     # ModuleExecutor maps the prepared argv onto the module's declared interface;
     # success is simply that execution completes without raising.
     ModuleExecutor(parallel_score_images).execute(self.prepare_argv())
Example #24
0
    max_len_ = 38
    n_class_ = len(class_)
    vocab_size_ = len(c2i)
    stop_patience = 5
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    path = os.path.join(training_data_dir, 'train.txt')
    train_samples = load_dataset(file_path=path, max_len=max_len_, char2index_dir=char2index_dir)
    path = os.path.join(validation_data_dir, 'dev.txt')
    dev_samples = load_dataset(file_path=path, max_len=max_len_, char2index_dir=char2index_dir)

    train_iter = DataIter(train_samples, batch_size)
    dev_iter = DataIter(dev_samples, batch_size)

    model = FastText(vocab_size=vocab_size_, n_class=n_class_, embed_dim=embedding_dim)
    start = time.time()
    train(model,
          trained_model_dir,
          train_iter,
          dev_iter=dev_iter,
          epochs=epochs,
          learning_rate=learning_rate,
          stop_patience=stop_patience,
          device=device)
    end = time.time()
    print('\nspent time: %.2f sec' % (end - start))
    print('============================================')


if __name__ == '__main__':
    ModuleExecutor(fasttext_train).execute(sys.argv)
Example #25
0
        style_model.load_state_dict(state_dict)
        style_model.to(device)
    print(f'Model loaded successfully. Path: {model_dir}')

    def run(mini_batch):
        """Parallel-run entry point: style-transfer each image path in mini_batch.

        Returns the list of output file paths written. Relies on closure
        variables from the enclosing scope: style_model, device, output_path.
        """
        result = []
        for image_file_path in mini_batch:
            img = load_image(image_file_path)
            print(f'load image from: {image_file_path}')
            with torch.no_grad():
                # Preprocessing: to tensor, then scale to the 0-255 range the
                # style model expects.
                content_transform = transforms.Compose([
                    transforms.ToTensor(),
                    transforms.Lambda(lambda x: x.mul(255))
                ])
                content_image = content_transform(img)
                content_image = content_image.unsqueeze(0).to(device)
                output = style_model(content_image).cpu()
                # Keep the original file name under the output directory.
                output_file_path = os.path.join(
                    output_path, os.path.basename(image_file_path))
                save_image(output_file_path, output[0])
                result.append(output_file_path)
                print(f'transferred image saved in: {output_file_path}')
        return result

    return run


# This main code is only used for local debugging, will never be reached in AzureML when it is a parallel module.
if __name__ == '__main__':
    ModuleExecutor(style_transform_parallel).execute(sys.argv)