def compare_two_models(
    the_better_model: OutputDirectory(),
    first_trained_model: InputDirectory(type='AnyDirectory') = None,
    first_trained_result: InputDirectory(type='AnyDirectory') = None,
    second_trained_model: InputDirectory(type='AnyDirectory') = None,
    second_trained_result: InputDirectory(type='AnyDirectory') = None,
):
    """Pick the model with the higher accuracy and copy it to the output.

    Each ``*_trained_result`` directory must contain ``result.json`` with an
    ``'acc'`` entry; each ``*_trained_model`` directory must contain a
    ``BestModel`` file.  The winning ``BestModel`` is copied into
    ``the_better_model`` and the choice is logged as a run metric.
    """
    print('=====================================================')
    print(f'input_dir: {Path(first_trained_model).resolve()}')
    print(f'input_dir: {Path(first_trained_result).resolve()}')
    print(f'input_dir: {Path(second_trained_model).resolve()}')
    print(f'input_dir: {Path(second_trained_result).resolve()}')
    # for logging
    run = Run.get_context()

    def _read_acc(result_dir):
        # 'with' closes the handle deterministically; the original
        # json.load(open(...)) leaked the file object.
        with open(os.path.join(result_dir, 'result.json'), 'r') as f:
            return json.load(f)['acc']

    acc_first = _read_acc(first_trained_result)
    acc_second = _read_acc(second_trained_result)

    dst = os.path.join(the_better_model, 'BestModel')
    # Ties go to the first model, matching the original '>=' comparison.
    if acc_first >= acc_second:
        print('choose the first model')
        run.log(name='which one', value='first')
        src = os.path.join(first_trained_model, 'BestModel')
    else:
        print('choose the second model')
        run.log(name='which one', value='second')
        src = os.path.join(second_trained_model, 'BestModel')
    shutil.copy(src=src, dst=dst)
    print('=====================================================')
# Example 2
# 0
def fasttext_evaluation(model_testing_result: OutputDirectory(),
                        trained_model_dir: InputDirectory() = None,
                        test_data_dir: InputDirectory() = None):
    """Evaluate the saved FastText model on the test set and dump accuracy.

    Loads vocabulary, label map and shared hyper-parameters from the input
    directories, runs 'BestModel' over <test_data_dir>/data.txt and writes
    {"acc": <accuracy>} to <model_testing_result>/result.json.
    """
    print('=====================================================')
    print(f'trained_model_dir: {Path(trained_model_dir).resolve()}')
    print(f'test_data_dir: {Path(test_data_dir).resolve()}')
    word_to_index = get_vocab(os.path.join(test_data_dir, 'word_to_index.json'))
    map_id_label, map_label_id = get_id_label(
        os.path.join(test_data_dir, 'label.txt'))
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)
    with open(os.path.join(trained_model_dir, 'shared_params.json'), 'r',
              encoding='utf-8') as fp:
        shared_params = json.load(fp)
    test_samples = load_dataset(
        file_path=os.path.join(test_data_dir, 'data.txt'),
        max_len=shared_params['max_len'],
        ngram_size=shared_params['ngram_size'],
        word_to_index=word_to_index,
        map_label_id=map_label_id,
    )
    test_iter = DataIter(samples=test_samples, shuffle=False, device=device)
    model = torch.load(f=os.path.join(trained_model_dir, 'BestModel'),
                       map_location=device)
    acc_ = test(model, test_iter)
    with open(os.path.join(model_testing_result, 'result.json'), 'w',
              encoding='utf-8') as fp:
        json.dump({"acc": acc_}, fp)
    print('\n============================================')
# Example 3
# 0
def fasttext_score_parallel(
        scored_dataset: OutputDirectory(type='AnyDirectory'),
        fasttext_model: InputDirectory(type='AnyDirectory') = '.',
        char2index_dir: InputDirectory(type='AnyDirectory') = None
):
    """Parallel-run scorer: load the saved model once, then hand back a
    ``run`` callback that classifies each mini-batch of text files and
    stores the predictions as a parquet file under ``scored_dataset``."""
    print('=====================================================')
    print(f'fasttext_model: {Path(fasttext_model).resolve()}')
    print(f'char2index_dir: {Path(char2index_dir).resolve()}')
    print(f'scored_dataset: {scored_dataset}')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    max_len_ = 38
    model = torch.load(f=os.path.join(fasttext_model, 'BestModel'))

    def run(files):
        # Invoked once per mini-batch; nothing to do for an empty batch.
        if not files:
            return []
        print(f"Ready to process {len(files)} texts.")
        print('\n'.join(files))
        with torch.no_grad():
            samples = load_dataset_parallel(files=files, max_len=max_len_,
                                            char2index_dir=char2index_dir)
            batch_iter = DataIter_Parallel(samples, shuffle=False)
            results = predict_parallel(model, batch_iter, device)
            frame = pd.DataFrame(data={'Filename': files, 'Class': results})
            print("Result:")
            print(frame)
            # A uuid filename avoids collisions between parallel workers.
            frame.to_parquet(
                os.path.join(scored_dataset, f"{uuid4().hex}.parquet"),
                index=False)
        return results

    return run
# Example 4
# 0
def fasttext_evaluation(
        model_testing_result: OutputDirectory(type='AnyDirectory'),
        trained_model_dir: InputDirectory(type='AnyDirectory') = None,
        test_data_dir: InputDirectory(type='AnyDirectory') = None,
        char2index_dir: InputDirectory(type='AnyDirectory') = None):
    """Evaluate the saved 'BestModel' on <test_data_dir>/test.txt and write
    the accuracy to <model_testing_result>/result.json.

    Fixes:
    - the checkpoint is loaded with ``map_location=device`` so a GPU-trained
      model can be evaluated on a CPU-only machine;
    - result.json is written through a ``with`` block so the handle is
      flushed and closed (``json.dump(..., open(path, 'w'))`` leaked it).
    """
    print('=====================================================')
    print(f'trained_model_dir: {Path(trained_model_dir).resolve()}')
    print(f'test_data_dir: {Path(test_data_dir).resolve()}')
    print(f'char2index_dir: {Path(char2index_dir).resolve()}')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    max_len_ = 38
    path = os.path.join(test_data_dir, 'test.txt')
    test_samples = load_dataset(file_path=path,
                                max_len=max_len_,
                                char2index_dir=char2index_dir)

    test_iter = DataIter(test_samples)

    path = os.path.join(trained_model_dir, 'BestModel')
    model = torch.load(f=path, map_location=device)

    path = os.path.join(model_testing_result, 'result.json')
    acc_ = test(model, test_iter, device)
    with open(path, 'w', encoding='utf-8') as f:
        json.dump({"acc": acc_}, f)
    print('\n============================================')
# Example 5
# 0
def gdal_sample(
        ## interface (inputs, outputs, parameters) of the module
        output_dir1: OutputDirectory(),
        output_dir2: OutputDirectory(),
        input_dir1: InputDirectory(),
        input_dir2: InputDirectory()):
    """Sample module: load a dataframe from the first input directory and
    print its first rows.  The output ports are declared but not written."""
    print('I am in module definition')
    print(f'input_dir1: {Path(input_dir1).resolve()}')
    print(f'input_dir2: {Path(input_dir2).resolve()}')

    ## custom logic: peek at the data frame stored in input_dir1
    frame = load_data_frame_from_directory(input_dir1).data
    print(frame.head(10))
def fasttext_score(
        scored_data_output_dir: OutputDirectory(),
        fasttext_model_dir: InputDirectory() = '.'
):
    """Parallel-run scorer: load model, vocabulary and label map once, then
    return a ``run`` callback that predicts a class for each file in a
    mini-batch and saves the results as parquet."""
    print('=====================================================')
    print(f'fasttext_model: {Path(fasttext_model_dir).resolve()}')
    print(f'scored_data_output_dir: {scored_data_output_dir}')
    word_to_index = get_vocab(
        os.path.join(fasttext_model_dir, 'word_to_index.json'))
    map_id_label, map_label_id = get_id_label(
        os.path.join(fasttext_model_dir, 'label.txt'))
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)
    with open(os.path.join(fasttext_model_dir, 'shared_params.json'), 'r',
              encoding='utf-8') as fp:
        shared_params = json.load(fp)
    model = torch.load(f=os.path.join(fasttext_model_dir, 'BestModel'),
                       map_location=device)

    def run(files):
        if not files:
            return []
        with torch.no_grad():
            samples = load_dataset(file_path=files,
                                   max_len=shared_params['max_len'],
                                   ngram_size=shared_params['ngram_size'],
                                   word_to_index=word_to_index,
                                   map_label_id=map_label_id)
            batch_iter = DataIter(samples=samples, batch_size=1,
                                  shuffle=False, device=device)
            results = predict_parallel(model, batch_iter, map_id_label)
            frame = pd.DataFrame(data={'Filename': files, 'Class': results})
            frame.to_parquet(
                os.path.join(scored_data_output_dir, f"{uuid4().hex}.parquet"),
                index=False)
        return results

    return run
def fasttext_train(
        trained_model_dir: OutputDirectory(type='AnyDirectory'),
        training_data_dir: InputDirectory(type='AnyDirectory') = None,
        validation_data_dir: InputDirectory(type='AnyDirectory') = None,
        char2index_dir: InputDirectory(type='AnyDirectory') = None,
        epochs=2,
        batch_size=32,
        learning_rate=0.0005,
        embedding_dim=128):
    """Train a FastText classifier on train.txt / dev.txt and save the best
    model into ``trained_model_dir``."""
    print('============================================')
    print('training_data_dir:', training_data_dir)
    print('validation_data_dir:', validation_data_dir)
    c2i = get_vocab(char2index_dir)
    class_ = get_classs()
    max_len_ = 38
    n_class_ = len(class_)
    vocab_size_ = len(c2i)
    stop_patience = 5
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    train_samples = load_dataset(
        file_path=os.path.join(training_data_dir, 'train.txt'),
        max_len=max_len_,
        char2index_dir=char2index_dir)
    dev_samples = load_dataset(
        file_path=os.path.join(validation_data_dir, 'dev.txt'),
        max_len=max_len_,
        char2index_dir=char2index_dir)

    train_iter = DataIter(train_samples, batch_size)
    dev_iter = DataIter(dev_samples, batch_size)

    model = FastText(vocab_size=vocab_size_,
                     n_class=n_class_,
                     embed_dim=embedding_dim)
    started_at = time.time()
    train(model,
          trained_model_dir,
          train_iter,
          dev_iter=dev_iter,
          epochs=epochs,
          learning_rate=learning_rate,
          stop_patience=stop_patience,
          device=device)
    print('\nspent time: %.2f sec' % (time.time() - started_at))
    print('============================================')
# Example 8
# 0
def add(left: InputDirectory(), right: InputDirectory(),
        output: OutputDirectory()):
    """Read one float from <left>/data and one from <right>/data, log the
    operands and their sum as run metrics, and write the sum to
    <output>/data.
    """
    # The needless f-string prefixes on 'data' were removed: they had no
    # placeholders and were plain strings in disguise.
    l = float((Path(left).resolve() / 'data').read_text().strip())
    r = float((Path(right).resolve() / 'data').read_text().strip())
    print('left = ', l)
    print('right = ', r)

    m = l + r
    # for metrics
    run = Run.get_context()
    run.log('result', m)
    run.log('left', l)
    run.log('right', r)
    run.flush()

    # The output directory may not exist yet on the compute target.
    Path(output).absolute().mkdir(parents=True, exist_ok=True)
    with open(Path(output).resolve() / 'data', 'w') as fout:
        fout.write(str(m))
# Example 9
# 0
def split_data_txt(
        training_data_output: OutputDirectory(type='AnyDirectory'),
        validation_data_output: OutputDirectory(type='AnyDirectory'),
        test_data_output: OutputDirectory(type='AnyDirectory'),
        input_dir: InputDirectory(type='AnyDirectory') = None,
        training_data_ratio=0.7,
        validation_data_ratio=0.1,
        random_split=False,
        seed=0):
    """Shuffle the lines of ``input_dir`` (opened as a text file) and split
    them into train/dev/test files according to the given ratios.

    When ``random_split`` is false a fixed seed (0) is used, so the split is
    reproducible; otherwise ``seed`` drives the shuffle.
    """
    print('============================================')
    print(
        f"value of input_dir:'{input_dir}', type of input_dir:'{type(input_dir)}'"
    )
    with open(input_dir, 'r', encoding='utf-8') as f:
        data = f.readlines()
    random.seed(seed if random_split else 0)
    # list shuffle
    random.shuffle(data)

    total = len(data)
    n_train = int(total * training_data_ratio)
    n_dev = int(total * validation_data_ratio)
    train = data[:n_train]
    dev = data[n_train:n_train + n_dev]
    test = data[n_train + n_dev:]

    counts = (('num of total data', len(data)),
              ('num of training data', len(train)),
              ('num of validation data', len(dev)),
              ('num of test_data', len(test)))
    for label, count in counts:
        print(f'{label}:', count)
    # for metrics
    run = Run.get_context()
    for label, count in counts:
        run.log(name=label, value=count)

    def _write_split(out_dir, filename, lines):
        # Each split goes to its own output directory, created on demand.
        os.makedirs(out_dir, exist_ok=True)
        dest = os.path.join(out_dir, filename)
        with open(dest, 'w', encoding='utf-8') as f:
            f.writelines(lines)
        print(dest)
        print(os.listdir(out_dir))

    _write_split(training_data_output, "train.txt", train)
    _write_split(validation_data_output, "dev.txt", dev)
    _write_split(test_data_output, "test.txt", test)
    print('============================================')
# Example 10
# 0
def basic_module(
    output_dir: OutputDirectory(),
    input_dir: InputDirectory() = '.',
    str_param='some_string',
):
    """Write ``str_param`` to <output_dir>/output.txt.

    The output directory is created if missing, so the module also works on
    compute targets that do not pre-create output mounts (the original
    would raise FileNotFoundError there).
    """
    print(f'input_dir: {Path(input_dir).resolve()}')
    print(f'str_param: {str_param}')
    out_path = Path(output_dir)
    out_path.mkdir(parents=True, exist_ok=True)
    with open(out_path / "output.txt", 'w') as fout:
        fout.write(str_param)
# Example 11
# 0
def split_data_txt(training_data_output: OutputDirectory(),
                   validation_data_output: OutputDirectory(),
                   test_data_output: OutputDirectory(),
                   input_dir: InputDirectory() = None,
                   training_data_ratio=0.7,
                   validation_data_ratio=0.1,
                   random_split=False,
                   seed=0):
    """Split <input_dir>/data.txt into train/dev/test sets and copy the
    label map and vocabulary next to each split."""
    print('============================================')
    print(
        f"value of input_dir:'{input_dir}', type of input_dir:'{type(input_dir)}'"
    )
    with open(os.path.join(input_dir, 'data.txt'), 'r',
              encoding='utf-8') as f:
        data = f.readlines()
    # A fixed seed keeps the split reproducible unless random_split is set.
    random.seed(seed if random_split else 0)
    random.shuffle(data)

    total = len(data)
    n_train = int(total * training_data_ratio)
    n_dev = int(total * validation_data_ratio)
    train = data[:n_train]
    dev = data[n_train:n_train + n_dev]
    test = data[n_train + n_dev:]
    print('num of total data:', len(data))
    print('num of training data:', len(train))
    print('num of validation data:', len(dev))
    print('num of test_data:', len(test))
    # for metrics
    run = Run.get_context()
    run.log(name='num of total data', value=len(data))
    run.log(name='num of training data', value=len(train))
    run.log(name='num of validation data', value=len(dev))
    run.log(name='num of test_data', value=len(test))

    path_label = os.path.join(input_dir, 'label.txt')
    path_word_to_index = os.path.join(input_dir, 'word_to_index.json')

    def _emit(out_dir, lines):
        # Every split directory gets the label map, the vocabulary and its
        # own data.txt.
        shutil.copy(src=path_label, dst=out_dir)
        shutil.copy(src=path_word_to_index, dst=out_dir)
        with open(os.path.join(out_dir, "data.txt"), 'w',
                  encoding='utf-8') as f:
            f.writelines(lines)

    _emit(training_data_output, train)
    _emit(validation_data_output, dev)
    _emit(test_data_output, test)
    print('============================================')
# Example 12
# 0
def parallel_score_images(
    scored_dataset: OutputDirectory(),
    trained_model: InputDirectory() = None,
):
    """Parallel-run scorer for MNIST images.

    Loads the model once, then returns a ``run(files)`` callback that
    classifies each image in a mini-batch and writes the predictions (class
    plus per-class probabilities) to a parquet file in ``scored_dataset``.
    """
    # Use the path of a prepared model if trained_model is None
    if trained_model is None:
        trained_model = str(
            Path(__file__).parent /
            'tests/parallel_score_images/inputs/trained_model/')
    print("Scored dataset:", scored_dataset)
    print("Trained model:", trained_model)
    map_location = 'cpu' if not torch.cuda.is_available() else None
    model = torch.load(os.path.join(trained_model, 'model.pt'),
                       map_location=map_location)
    os.makedirs(scored_dataset, exist_ok=True)
    print("Model is loaded:", model)

    def run(files):
        if len(files) == 0:
            return []
        results = []
        nthreads = min(2 * cpu_count(), len(files))

        print(f"Ready to process {len(files)} images.")
        print('\n'.join(files))
        # Decode the images concurrently in a thread pool.
        with ThreadPool(nthreads) as pool:
            imgs = pool.map(Image.open, files)

        for f, img in zip(files, imgs):
            # Bug fix: use the image already loaded by the thread pool;
            # the original re-opened every file a second time, making the
            # pooled loads dead work.
            tensor = transform(img).unsqueeze(0)
            if torch.cuda.is_available():
                tensor = tensor.cuda()

            with torch.no_grad():
                output = model(tensor)
                softmax = nn.Softmax(dim=1)
                pred_probs = softmax(output).cpu().numpy()[0]
                index = torch.argmax(output, 1)[0].cpu().item()
                result = {
                    'Filename': Path(f).name,
                    'Class': MNIST.classes[index]
                }
                for c, prob in zip(MNIST.classes, pred_probs):
                    result[f"Prob of {c}"] = prob
            results.append(result)
        # Sort the columns so the parquet schema is deterministic.
        columns = sorted(list(results[0].keys()))
        df = pd.DataFrame(results, columns=columns)
        print("Result:")
        print(df)
        output_file = os.path.join(scored_dataset, f"{uuid4().hex}.parquet")
        df.to_parquet(output_file, index=False)
        return results

    return run
def fasttext_test(
        trained_model_dir: InputDirectory(type='AnyDirectory') = None,
        test_data_dir: InputDirectory(type='AnyDirectory') = None,
        char2index_dir: InputDirectory(type='AnyDirectory') = None
):
    """Run the saved 'BestModel' over <test_data_dir>/test.txt via the
    project's ``test`` helper.

    Fix: the checkpoint is loaded with ``map_location=device`` so a
    GPU-trained model can still be tested on a CPU-only machine.
    """
    print('============================================')
    print('test_data_dir:', test_data_dir)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    max_len_ = 38
    path = os.path.join(test_data_dir, 'test.txt')
    test_samples = load_dataset(file_path=path, max_len=max_len_, char2index_dir=char2index_dir)

    test_iter = DataIter(test_samples)

    path = os.path.join(trained_model_dir, 'BestModel')
    model = torch.load(f=path, map_location=device)

    test(model, test_iter, device)
    print('\n============================================')
# Example 14
# 0
def stitch_video(
        input_images: InputDirectory(description="input directory of images"),
        input_audio: InputDirectory(description="input directory of audio"),
        output_video: OutputDirectory(
            description="output directory of stitched video file")):
    """Stitch numbered jpg frames into an H.264 video and mux in the audio.

    Bug fix: the original referenced an undefined ``args`` namespace
    (``args.images_dir`` etc.), which raised NameError at runtime; the
    ffmpeg commands now use the function's own parameters.
    """
    # Encode the frame sequence (<input_images>/00001_video.jpg, ...) into
    # a silent video inside the output directory.
    subprocess.run(
        "ffmpeg -framerate 30 -i {}/%05d_video.jpg -c:v libx264 -profile:v high -crf 20 -pix_fmt yuv420p "
        "-y {}/video_without_audio.mp4".format(input_images, output_video),
        shell=True,
        check=True)

    # Mux the audio stream (<input_audio>/video.aac) into the final file.
    # NOTE(review): shell=True with interpolated paths is injection-prone;
    # the paths come from the pipeline runtime here, but an argument list
    # with shell=False would be safer.
    subprocess.run(
        "ffmpeg -i {}/video_without_audio.mp4 -i {}/video.aac -map 0:0 -map 1:0 -vcodec "
        "copy -acodec copy -y {}/video_with_audio.mp4".format(
            output_video, input_audio, output_video),
        shell=True,
        check=True)
# Example 15
# 0
def fasttext_score(fasttext_model: InputDirectory(type='AnyDirectory') = '.',
                   input_sentence='I like playing football very much',
                   char2index_dir: InputDirectory(type='AnyDirectory') = None):
    """Classify a single sentence with the saved FastText model and print
    the predicted category."""
    print('=====================================================')
    print(f'fasttext_model: {Path(fasttext_model).resolve()}')
    print(f'char2index_dir: {Path(char2index_dir).resolve()}')
    print(f'input_sentence: {input_sentence}')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    max_len_ = 38
    # load_dataset accepts the raw sentence in place of a file path here.
    samples = load_dataset(file_path=input_sentence,
                           max_len=max_len_,
                           char2index_dir=char2index_dir)
    sample_iter = DataIter(samples, batch_size=1)
    model = torch.load(f=os.path.join(fasttext_model, 'BestModel'))
    res = predict(model, sample_iter, device)
    print('the category of "%s" is %s' % (input_sentence, res))
    print('=====================================================')
def compare_two_models(the_better_model: OutputDirectory(),
                       first_trained_model: InputDirectory() = None,
                       first_trained_result: InputDirectory() = None,
                       second_trained_model: InputDirectory() = None,
                       second_trained_result: InputDirectory() = None):
    """Copy the more accurate of two trained models — plus the vocabulary,
    label map and shared parameters — into ``the_better_model``."""
    print('=====================================================')
    print(f'input_dir: {Path(first_trained_model).resolve()}')
    print(f'input_dir: {Path(first_trained_result).resolve()}')
    print(f'input_dir: {Path(second_trained_model).resolve()}')
    print(f'input_dir: {Path(second_trained_result).resolve()}')
    # for metrics
    run = Run.get_context()

    def _load_acc(result_dir):
        # Accuracy is stored as {'acc': <float>} in result.json.
        with open(os.path.join(result_dir, 'result.json'), 'r',
                  encoding='utf-8') as f:
            return json.load(f)['acc']

    acc_first = _load_acc(first_trained_result)
    acc_second = _load_acc(second_trained_result)
    dst = the_better_model
    # Ties go to the first model ('>=' comparison).
    if acc_first >= acc_second:
        print('choose the first model')
        run.log(name='which one', value='first')
        winner = first_trained_model
    else:
        print('choose the second model')
        run.log(name='which one', value='second')
        winner = second_trained_model
    shutil.copy(src=os.path.join(winner, 'BestModel'), dst=dst)
    # The auxiliary files are always taken from the first model's
    # directory, matching the original behavior.
    for aux in ('word_to_index.json', 'label.txt', 'shared_params.json'):
        shutil.copy(src=os.path.join(first_trained_model, aux), dst=dst)
    print('=====================================================')
# Example 17
# 0
def merge(
        cleaned_yellow_data: InputDirectory(
            description=
            "cleaned yellow data, needs to be read as pandas dataframe"),
        cleaned_green_data: InputDirectory(
            description=
            "cleaned green data, needs to be read as pandas dataframe"),
        merged_output: OutputDirectory(description="output data after merge"),
):
    """Concatenate the cleaned green and yellow taxi data and write the
    result to <merged_output>/merged.csv.

    Uses ``pd.concat`` because ``DataFrame.append`` was deprecated in
    pandas 1.4 and removed in pandas 2.0.
    """
    green_df = pd.read_csv(cleaned_green_data)
    yellow_df = pd.read_csv(cleaned_yellow_data)

    print("Argument (output merge taxi data path): %s" % merged_output)

    merge_df = pd.concat([green_df, yellow_df], ignore_index=True)
    merge_df.reset_index(inplace=True, drop=True)

    if merged_output is not None:
        os.makedirs(merged_output, exist_ok=True)
        print("merge output folder %s created" % merged_output)
        path = os.path.join(merged_output, "merged.csv")
        merge_df.to_csv(path)
# Example 18
# 0
def sample_module(
    # The input/output port are defined using the following 4 annotations.
    # Note that you need to register data type using
    # DataType.create_data_type(ws, 'MyDirectory', description=description, is_directory=True)
    # DataType.create_data_type(ws, 'MyFile', description=description, is_directory=False)
    # See https://docs.microsoft.com/en-us/python/api/azureml-pipeline-core/azureml.pipeline.core.graph.datatype?view=azure-ml-py#create-data-type-workspace--name--description--is-directory--parent-datatypes-none-
    output_dir: OutputDirectory(type='MyDirectory'),
    output_file: OutputFile(type='MyFile'),
    input_dir: InputDirectory(type='MyDirectory') = None,
    input_file: InputFile(type='MyFile') = None,
    # The parameter with default values will be considered as annotated with such type,
    # Now we support the following 5 types: str, int, float, bool, enum
    str_param='abc',
    int_param=1,
    float_param=0.1,
    bool_param=False,
    enum_param=MyEnum.Enum0,
    # If the default value is None without annotation, it will be treated as str.
    none_param=None,
):
    """A sample module use different parameter types and customized input/output ports."""
    print(f"Arg 'input_dir' = '{input_dir}', type='{type(input_dir)}'")
    if input_dir:
        print(f"Contents of input directory:")
        print('\n'.join(entry.name for entry in Path(input_dir).iterdir()))
    # Echo the remaining arguments with identical formatting to the
    # original per-argument print statements.
    for arg_name, arg_value in (('input_file', input_file),
                                ('output_dir', output_dir),
                                ('output_file', output_file),
                                ('str_param', str_param),
                                ('int_param', int_param),
                                ('float_param', float_param),
                                ('bool_param', bool_param),
                                ('enum_param', enum_param),
                                ('none_param', none_param)):
        print(f"Arg '{arg_name}' = {arg_value}, type='{type(arg_value)}'")

    # The payload defaults to str_param but is replaced by the file content
    # when an input file is supplied.
    data = str_param
    if input_file:
        with open(input_file, 'r') as fin:
            data = fin.read()
        print("Content of input file:", data)
    # Mirror the whole input directory when one is given; otherwise emit a
    # single test.txt holding the payload.
    if input_dir:
        shutil.copytree(input_dir, output_dir)
    else:
        os.makedirs(output_dir, exist_ok=True)
        with open(os.path.join(output_dir, "test.txt"), 'w') as fout:
            fout.write(data)
    with open(output_file, 'w') as fout:
        fout.write(data)
def tokenizer(
    input_file_path: InputDirectory(description="Input text file path"),
    output_dir_path: OutputDirectory(description="Output file directory path"),
    output_to_file: IntParameter(
        description=
        "whether to interpret output_dir_path as file to write to, or folder containing file to write to"
    ) = 0,
    input_is_tsv: IntParameter(
        description="bool determining whether to use tsv related options") = 0,
    delimiter: StringParameter(
        description="optional, delimiter to use if parsing a tsv type file"
    ) = None,
    ignore_cols: IntParameter(
        description="indices of columns to ignore if parsing a tsv") = None,
    mode: EnumParameter(
        enum=EnumMode,
        description="Tokenizer to use [train, inference, spacy]"
    ) = EnumMode.train,
    type: EnumParameter(
        enum=EnumType,
        description="Whether to use word tokenizer or sentence tokenizer"
    ) = EnumType.word,
):
    """Thin CLI adapter: translate the module parameters into argv for
    tokenizer.py and execute that script as __main__."""
    argv = [
        'tokenizer.py',
        '-i', str(input_file_path),
        '-o', str(output_dir_path),
        '--output_to_file', str(output_to_file),
        '--input_is_tsv', str(input_is_tsv),
        '-m', mode.value,
        '-t', type.value,
    ]
    # Optional tsv settings are forwarded only when explicitly supplied.
    if delimiter is not None:
        argv.extend(['--delimiter', str(delimiter)])
    if ignore_cols is not None:
        argv.extend(['--ignore_cols', str(ignore_cols)])
    sys.argv = argv
    print(' '.join(sys.argv))
    runpy.run_path('tokenizer.py', run_name='__main__')
# Example 20
# 0
def prepare_data(
    output_data: OutputDirectory(),
    input_data: InputDirectory() = None,
    str_param: str = None,
    int_param: int = 0,
    enum_param: EnumEnumParam = None,
):
    """Forward the module parameters to prepare_data.py via sys.argv and
    run that script as __main__."""
    argv = [
        'prepare_data.py',
        '--input_data', str(input_data),
        '--output_data', str(output_data),
        '--int_param', str(int_param),
    ]
    # Optional arguments are only appended when explicitly provided.
    if str_param is not None:
        argv.extend(['--str_param', str(str_param)])
    if enum_param is not None:
        argv.extend(['--enum_param', enum_param.value])
    sys.argv = argv
    print(' '.join(sys.argv))
    runpy.run_path('prepare_data.py', run_name='__main__')
# Example 21
# 0
def copy_files(
    output_dir: OutputDirectory(),
    input_dir: InputDirectory() = '.',
    str_param='some_string',
):
    """Validate that ``input_dir`` is a non-empty directory, then write
    ``str_param`` to <output_dir>/output.txt.

    Raises:
        ValueError: if ``input_dir`` is not a directory or contains no files.
    """
    input_dir = Path(input_dir)
    print(f'input_dir: {input_dir.resolve()}')
    print(f'str_param: {str_param}')

    files = []
    if input_dir.is_dir():
        files = [str(entry) for entry in input_dir.iterdir()]

    if not files:
        # Fixed grammar of the error message ("an directory" -> "a directory")
        # and dropped the placeholder-less f-string prefix.
        raise ValueError('input_dir should be a directory with files')

    # Create the output directory so the write cannot fail on a fresh mount.
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    with open(output_dir / "output.txt", 'w') as fout:
        fout.write(str_param)
# Example 22
# 0
def style_transform_parallel(
    model_dir: InputDirectory(
        description="saved torch model to be used for stylizing the image."),
    output_path: OutputDirectory(
        description="directory holding the output images"),
    style: StringParameter(description="style name") = None,
):
    """Load the requested style-transfer checkpoint once and return a
    ``run(mini_batch)`` callback that stylizes every input image and saves
    the result under ``output_path``."""
    print(f'output path: {output_path}')
    print(f'Cuda available? {torch.cuda.is_available()}')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with torch.no_grad():
        style_model = TransformerNet()
        state_dict = torch.load(os.path.join(model_dir, style + ".pth"))
        # remove saved deprecated running_* keys in InstanceNorm from the checkpoint
        stale_keys = [k for k in state_dict
                      if re.search(r'in\d+\.running_(mean|var)$', k)]
        for k in stale_keys:
            del state_dict[k]
        style_model.load_state_dict(state_dict)
        style_model.to(device)
    print(f'Model loaded successfully. Path: {model_dir}')

    def run(mini_batch):
        stylized_paths = []
        # The transform is deterministic, so it is built once per batch
        # instead of once per image.
        to_tensor = transforms.Compose([
            transforms.ToTensor(),
            transforms.Lambda(lambda x: x.mul(255))
        ])
        for image_file_path in mini_batch:
            img = load_image(image_file_path)
            print(f'load image from: {image_file_path}')
            with torch.no_grad():
                content_image = to_tensor(img).unsqueeze(0).to(device)
                output = style_model(content_image).cpu()
                output_file_path = os.path.join(
                    output_path, os.path.basename(image_file_path))
                save_image(output_file_path, output[0])
                stylized_paths.append(output_file_path)
                print(f'transferred image saved in: {output_file_path}')
        return stylized_paths

    return run
# Example 23
# 0
def mpi_module(
    output_dir: OutputDirectory(),
    input_dir: InputDirectory() = '.',
    param0: str = 'abc',
    param1: int = 10,
):
    """MPI demo: every rank reports itself; only rank 0 writes output.txt."""
    from mpi4py import MPI
    # Dump the arguments (plus the freshly imported MPI module) for
    # debugging — done before any other local is created.
    for k, v in locals().items():
        print(f"{k}: {v}")
    comm = MPI.COMM_WORLD
    size = comm.Get_size()
    rank = comm.Get_rank()
    print(f"This is an MPI module, I'm rank {rank}/{size}.")
    if rank != 0:
        print("I don't return data.")
        return
    print("I will write data.")
    with open(Path(output_dir) / "output.txt", 'w') as fout:
        fout.write(param0)
        fout.write(str(param1))
# Example 24
# 0
def slice_video(
    input_video: InputDirectory(
        description="input directory of video file") = './data/input/video',
    output_audio: OutputDirectory(
        description="output directory of audio from video"
    ) = '/data/output/video',
    output_images: OutputDirectory(
        description="output directory of images slice from video"
    ) = '/data/output/images',
):
    """Extract the audio track and the individual frames from a video file
    with ffmpeg."""
    # Pull the audio stream out as <output_audio>/video.aac.
    extract_audio_cmd = "ffmpeg -i {} {}/video.aac".format(input_video,
                                                           output_audio)
    subprocess.run(extract_audio_cmd, shell=True, check=True)

    # Dump each frame as <output_images>/00001_video.jpg, 00002_..., etc.
    slice_frames_cmd = "ffmpeg -i {} {}/%05d_video.jpg -hide_banner".format(
        input_video, output_images)
    subprocess.run(slice_frames_cmd, shell=True, check=True)
# Example 25
# 0

def save_image(filename, data):
    """Persist a CHW tensor (expected value range ~[0, 255]) as an image file.

    The tensor is clamped to the displayable range, rearranged to HWC, cast
    to 8-bit, and written via PIL.
    """
    # clone() so the caller's tensor is never mutated by clamp.
    pixels = data.clone().clamp(0, 255).numpy()
    pixels = pixels.transpose(1, 2, 0).astype("uint8")
    Image.fromarray(pixels).save(filename)


#for parallel run module, the input and output need to be defined in dsl.module(parallel_inputs = ***) like below
@dsl.module(
    job_type='parallel',
    description='use pretrained pythorch model to do image style transfer',
    name='Style Transfer Parallel',
    parallel_inputs=[
        InputDirectory(name='Content Dir',
                       description="directory of input images")
    ])
def style_transform_parallel(
    model_dir: InputDirectory(
        description="saved torch model to be used for stylizing the image."),
    output_path: OutputDirectory(
        description="directory holding the output images"),
    style: StringParameter(description="style name") = None,
):
    print(f'output path: {output_path}')
    print(f'Cuda available? {torch.cuda.is_available()}')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    with torch.no_grad():
        style_model = TransformerNet()
        state_dict = torch.load(os.path.join(model_dir, style + ".pth"))
        # remove saved deprecated running_* keys in InstanceNorm from the checkpoint
from uuid import uuid4

import torch
from pathlib import Path
from azureml.pipeline.wrapper.dsl.module import ModuleExecutor, InputDirectory, OutputDirectory
from azureml.pipeline.wrapper import dsl

from common.utils import DataIter, load_dataset, predict_parallel, get_vocab, get_id_label


@dsl.module(
    name="FastText Score",
    version='0.0.23',
    description='Predict the categories of the input sentences',
    job_type='parallel',
    parallel_inputs=[InputDirectory(name='Texts to score')],
    base_image='mcr.microsoft.com/azureml/intelmpi2018.3-cuda10.0-cudnn7-ubuntu16.04'
)
def fasttext_score(
        scored_data_output_dir: OutputDirectory(),
        fasttext_model_dir: InputDirectory() = '.'
):
    """Parallel-run scoring module: load the vocabulary and label mapping
    saved alongside the trained FastText model, then score input texts.

    NOTE(review): this snippet appears truncated -- only device/vocab setup
    is visible; the actual scoring presumably follows.
    """
    print('=====================================================')
    print(f'fasttext_model: {Path(fasttext_model_dir).resolve()}')
    print(f'scored_data_output_dir: {scored_data_output_dir}')
    # word_to_index.json and label.txt are copied next to the model at
    # training time (see fasttext_train), so they are read from the model dir.
    path_word_to_index = os.path.join(fasttext_model_dir, 'word_to_index.json')
    word_to_index = get_vocab(path_word_to_index)
    path_label = os.path.join(fasttext_model_dir, 'label.txt')
    map_id_label, map_label_id = get_id_label(path_label)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)
Esempio n. 27
0
from pathlib import Path

from azureml.pipeline.wrapper import dsl
from azureml.pipeline.wrapper.dsl.module import InputDirectory, OutputDirectory, ModuleExecutor

# Standard MNIST preprocessing pipeline: convert to tensor, then normalize
# with the canonical MNIST mean/std (0.1307 / 0.3081).
# NOTE(review): `transforms` (torchvision) is not imported in this snippet --
# the original file presumably imports it above; verify.
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.1307, ), (0.3081, ))])


@dsl.module(
    name='Parallel Score Images',
    version='0.0.1',
    job_type='parallel',
    parallel_inputs=[
        InputDirectory(name='Images to score'),
        InputDirectory(name='Optional images to score', optional=True)
    ],
)
def parallel_score_images(
    scored_dataset: OutputDirectory(),
    trained_model: InputDirectory() = None,
):
    """Parallel-run module that scores images with a trained torch model.

    NOTE(review): this snippet appears truncated -- only the model-path
    fallback and device selection are visible.
    """
    # Use the path of a prepared model if trained_model is None
    if trained_model is None:
        trained_model = str(
            Path(__file__).parent /
            'tests/parallel_score_images/inputs/trained_model/')
    print("Scored dataset:", scored_dataset)
    print("Trained model:", trained_model)
    # Force CPU deserialization when no GPU is present; None keeps the
    # tensors on the device they were saved from.
    map_location = 'cpu' if not torch.cuda.is_available() else None
Esempio n. 28
0
def fasttext_train(trained_model_dir: OutputDirectory(type='ModelDirectory'),
                   training_data_dir: InputDirectory() = None,
                   validation_data_dir: InputDirectory() = None,
                   epochs=1,
                   batch_size=64,
                   max_len=32,
                   embed_dim=300,
                   hidden_size=256,
                   ngram_size=200000,
                   dropout=0.5,
                   learning_rate=0.001):
    """Train a FastText text classifier and save it to *trained_model_dir*.

    Loads 'data.txt' from the training and validation directories, builds the
    model from the vocabulary/label files found next to the training data,
    copies those files into the model directory for later scoring, persists
    the preprocessing parameters, and runs the training loop.
    """
    print('============================================')
    print('training_data_dir:', training_data_dir)
    print('validation_data_dir:', validation_data_dir)

    # Vocabulary and label mapping live next to the training data.
    vocab_path = os.path.join(training_data_dir, 'word_to_index.json')
    word_to_index = get_vocab(vocab_path)
    label_path = os.path.join(training_data_dir, 'label.txt')
    map_id_label, map_label_id = get_id_label(label_path)
    n_classes = len(map_id_label)
    n_vocab = len(word_to_index)
    stop_patience = 5

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print('device:', device)

    def _build_iter(data_dir):
        # Load 'data.txt' under *data_dir* and wrap it in a shuffled DataIter.
        samples = load_dataset(file_path=os.path.join(data_dir, 'data.txt'),
                               word_to_index=word_to_index,
                               map_label_id=map_label_id,
                               max_len=max_len,
                               ngram_size=ngram_size)
        return DataIter(samples=samples,
                        batch_size=batch_size,
                        shuffle=True,
                        device=device)

    train_iter = _build_iter(training_data_dir)
    dev_iter = _build_iter(validation_data_dir)

    model = FastText(vocab_size=n_vocab,
                     class_num=n_classes,
                     dropout=dropout,
                     embed_dim=embed_dim,
                     hidden_size=hidden_size,
                     ngram_size=ngram_size)
    # watch parameters
    print(model.parameters)

    # Ship vocab/label files with the model so scoring can rebuild its inputs.
    shutil.copy(src=vocab_path, dst=trained_model_dir)
    shutil.copy(src=label_path, dst=trained_model_dir)

    # Persist the preprocessing parameters scoring must reuse.
    shared_params_path = os.path.join(trained_model_dir, 'shared_params.json')
    with open(shared_params_path, 'w', encoding='utf-8') as f:
        json.dump({'max_len': max_len, 'ngram_size': ngram_size}, f)

    start = time.time()
    train(model,
          trained_model_dir,
          train_iter=train_iter,
          dev_iter=dev_iter,
          epochs=epochs,
          learning_rate=learning_rate,
          stop_patience=stop_patience,
          device=device)
    end = time.time()
    print('\nduration of training process: %.2f sec' % (end - start))
    print('============================================')