Example #1
def main(args):
    '''
        Module entry point function
    '''

    seq_col = args.sequence_column
    id_col = args.identifier_column

    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'model input dir {args.model_input_dir}')
    logger.debug(f'sequence-column {seq_col}')
    logger.debug(f'identifier-column {id_col}')
    logger.debug(f'output-dir {args.output_dir}')

    sgt = load_model_from_directory(args.model_input_dir,
                                    model_loader=joblib_loader).data
    input_df = load_data_frame_from_directory(args.input_dir).data

    if input_df[seq_col].isnull().sum().sum() > 0:
        print(f'column {seq_col} contains missing values')
        sys.exit(1)

    embedding_df = score(input_df, sgt, seq_col, id_col)
    print(f'embedding shape {embedding_df.shape}')
    print(embedding_df.head())

    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=embedding_df,
        schema=DataFrameSchema.data_frame_to_dict(embedding_df))
Example #2
def predict(args):

    # Load data that needs to be scored
    df = load_data_frame_from_directory(args.input_data).data

    # Connect to workspace
    ws = automl_helper.get_workspace()

    # Get AutoML run details
    automl_run = automl_helper.get_automl_run(ws, args.experiment, args.run_id)
    properties = automl_run.properties

    # Load AutoML model
    model = automl_helper.load_automl_model(automl_run)

    # Score data
    print("Using model to score input data...")

    isForecasting = isinstance(
        model, azureml.automl.runtime.shared.model_wrappers.
        ForecastingPipelineWrapper)
    if isForecasting:
        y_query = None
        if 'y_query' in df.columns:
            y_query = df.pop('y_query').values
        results = model.forecast(df, y_query)
        results = results[0]
    else:
        results = model.predict(df)

    results_df = pd.DataFrame(results, columns=['Predictions'])
    print(f"This is how your prediction data looks like:\n{results_df.head()}")

    # Write results back
    automl_helper.write_prediction_dataframe(args.predictions_data, results_df)
Example #3
def main(args):
    '''
    Module entry function

    Args:
        args: list, user parameters
    '''

    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'model input dir {args.model_input_dir}')

    logger.debug(f'output-dir {args.output_dir}')

    input_df = load_data_frame_from_directory(args.input_dir).data
    logger.debug(f'{input_df.describe()}\nshape {input_df.shape}')

    pca_module = load_model_from_directory(args.model_input_dir,
                                           model_loader=pcamodule_loader).data

    logger.debug(pca_module.pca_instance)

    output_df = score(pca_module, input_df)

    logger.debug(f'output shape {output_df.shape}')
    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=output_df,
        schema=DataFrameSchema.data_frame_to_dict(output_df))
Example #4
def process_data(args, file_name):
    """

    :return: word2id: map word to id
             id2word: map id to word
             label2id: map label to id
             id2label: map id to label
             max_len: max length of text
    """
    label2id, id2label, word2id, id2word, max_len = {}, {}, {}, {}, 0
    label_set, word_set = set(), set()
    df = load_data_frame_from_directory(file_name).data

    for index, row in df.iterrows():
        label_set.add(row[args.label_column])
        sentence = row[args.text_column]
        words = word_tokenize(sentence)
        if len(words) > max_len:
            max_len = len(words)
        word_set |= set(words)

    id2word[0] = '<UNK>'  # unknown
    word2id['<UNK>'] = 0
    id2word[1] = '<EOS>'  # ending
    word2id['<EOS>'] = 1
    for i, word in enumerate(word_set):
        word2id[word] = i + 2
        id2word[i + 2] = word
    for i, label in enumerate(label_set):
        label2id[label] = i
        id2label[i] = label
    return word2id, id2word, label2id, id2label, max_len
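Since the mappings returned above are plain dictionaries, downstream code can encode a tokenized sentence by looking each token up and falling back to the '<UNK>' index. A minimal, self-contained sketch; the toy vocabulary and helper below are illustrative, not part of the original module:

def sentence_to_ids(tokens, word2id):
    # Fall back to the '<UNK>' index (0) for out-of-vocabulary tokens,
    # matching the convention set up in process_data above.
    return [word2id.get(token, word2id['<UNK>']) for token in tokens]

# Toy vocabulary for illustration only.
toy_word2id = {'<UNK>': 0, '<EOS>': 1, 'green': 2, 'taxi': 3}
print(sentence_to_ids(['green', 'taxi', 'rocket'], toy_word2id))  # -> [2, 3, 0]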
Example #5
def main(args):
    '''
    Module entry function

    Args:
        args: list, transformer parameters requested by the user
    '''

    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'output-dir {args.output_dir}')
    logger.debug(f'model output dir {args.model_output_dir}')

    input_df = load_data_frame_from_directory(args.input_dir).data
    logger.debug(f'{input_df.describe()}\nshape {input_df.shape}')

    pca_module = PCAModule(args)
    logger.debug(pca_module.pca_instance)

    output_df = pca_module.fit_transform(input_df)
    pca_module.log_metrics(input_df.columns)

    logger.debug(f'output shape {output_df.shape}')
    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=output_df,
        schema=DataFrameSchema.data_frame_to_dict(output_df))

    save_model_to_directory(save_to=args.model_output_dir,
                            model_dumper=pca_module_dumper(data=pca_module))
Example #6
    @classmethod
    def load(cls, load_from: str):
        if isinstance(load_from, str):
            dfd = load_data_frame_from_directory(load_from_dir=load_from)
            return cls(df=dfd.data,
                       column_attributes=dfd.schema_instance.column_attributes)
        elif isinstance(load_from, DataFrameDirectory):
            return cls(
                df=load_from.data,
                column_attributes=load_from.schema_instance.column_attributes)
        else:
            raise NotImplementedError(
                f"Cannot load data from {load_from} of type {type(load_from)}")
Example #7
def invoke(input_path, detect_mode, timestamp_column, value_column, batch_size, threshold, sensitivity,
           appendMode, compute_stats_in_visualization, output_path):
    data_frame_directory = load_data_frame_from_directory(input_path)

    logger.debug(f"Shape of loaded DataFrame: {data_frame_directory.data.shape}")

    if data_frame_directory.data.shape[0] < MIN_POINTS:
        raise UserError(NotEnoughPoints.format(MIN_POINTS))

    if 0 < batch_size < MIN_POINTS:
        raise UserError(InvalidBatchSize.format(MIN_POINTS))

    query_string = unquote(timestamp_column)
    timestamp_column_selector = ColumnSelection(query_string)
    timestamp = timestamp_column_selector.select_dataframe_directory(data_frame_directory).data

    timestamps = pd.to_datetime(timestamp.iloc[:, 0].values)

    if np.any(np.isnat(timestamps)):
        raise UserError(InvalidTimestamps)

    res = is_timestamp_ascending(timestamps)
    if res == -1:
        raise UserError(InvalidSeriesOrder)
    elif res == -2:
        raise UserError(DuplicateSeriesTimestamp)


    query_string = unquote(value_column)
    data_column_selector = ColumnSelection(query_string)
    data_columns = data_column_selector.select_dataframe_directory(data_frame_directory).data

    for col in data_columns.columns:
        try:
            float_data = data_columns[col].apply(float)
        except Exception as e:
            raise UserError(InvalidValueFormat.format(col)) from e

        if not np.all(np.isfinite(float_data)):
            raise UserError(InvalidSeriesValue.format(col))

        if np.any(np.less(float_data, VALUE_LOWER_BOUND)) or np.any(np.greater(float_data, VALUE_UPPER_BOUND)):
            raise UserError(ValueOverflow.format(col))

        data_columns[col] = float_data

    result = sr_detector.detect(timestamps, data_columns, detect_mode=detect_mode,
                                batch_size=batch_size, threshold=threshold, sensitivity=sensitivity)

    if appendMode is True:
        result = pd.merge(data_frame_directory.data, result, left_index=True, right_index=True)

    save_data_frame_to_directory(output_path, result, compute_stats_in_visualization=compute_stats_in_visualization)
Example #8
def gdal_sample(
        ## define interface (input, output, parameters) of the module here
        output_dir1: OutputDirectory(),
        output_dir2: OutputDirectory(),
        input_dir1: InputDirectory(),
        input_dir2: InputDirectory()):
    print('I am in module definition')
    print(f'input_dir1: {Path(input_dir1).resolve()}')
    print(f'input_dir2: {Path(input_dir2).resolve()}')

    ## add custom logic here

    dfd1 = load_data_frame_from_directory(input_dir1)
    data_frame1 = dfd1.data
    print(data_frame1.head(10))
Example #9
    def testAnomalyAndMargin(self):
        df = pd.DataFrame()
        df['timestamp'] = pd.date_range(start='2020-01-01', periods=200, freq='1D')
        df['value'] = np.sin(np.linspace(1, 20, 200))
        save_data_frame_to_directory(self.__input_path, df)
        invoker.invoke(self.__input_path, "AnomalyAndMargin", self.__timestamp_column, self.__value_column,
                       self.__batch_size, self.__threshold, self.__sensitivity, self.__append_mode,
                       self.compute_stats_in_visualization, self.__output_path)
        result = load_data_frame_from_directory(self.__output_path).data
        self.assertEqual(result.shape[0], 200)
        self.assertTrue('value' in result.columns)
        self.assertTrue('isAnomaly' in result.columns)
        self.assertTrue('score' in result.columns)
        self.assertTrue('expectedValue' in result.columns)
        self.assertTrue('upperBoundary' in result.columns)
        self.assertTrue('lowerBoundary' in result.columns)
Example #10
    def __init__(self,
                 file,
                 word2id,
                 label2id,
                 args,
                 transform=sentence2idlist,
                 max_len=-1):
        self.data = []
        self.transform = transform
        self.max_len = max_len
        df = load_data_frame_from_directory(file).data

        for index, row in df.iterrows():
            self.data.append(
                (np.array(self.transform(row[args.text_column], word2id)),
                 label2id[row[args.label_column]]))

        self.len = len(self.data)
Example #11
def main(args=None):
    '''
        Module entry function
    '''
    input_dir = args.input_dir
    corr_type = args.correlation_method

    logger.debug(f'input-dir {input_dir}')
    logger.debug(f'correlation-method {corr_type}')
    logger.debug(f'output-dir {args.output_dir}')
    input_df = load_data_frame_from_directory(args.input_dir).data

    corr_df = ComputeCorrelationModule(corr_type).compute(input_df)
    logger.debug(f'correlation matrix shape {corr_df.shape}')

    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=corr_df,
        schema=DataFrameSchema.data_frame_to_dict(corr_df))
Example #12
def main(args):
    '''
        Module entry function
    '''

    transformer = SUPPORTED_TRANSFORMERS[args.transformer]

    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'column {args.column_name}')
    logger.debug(f'distance {args.distance}')
    logger.debug(f'transformer {transformer}')
    logger.debug(f'sim-dir {args.sim_dir}')

    input_df = load_data_frame_from_directory(args.input_dir).data

    if input_df[args.column_name].isnull().sum().sum() > 0:
        logger.debug(f'column {args.column_name} contains missing values')
        sys.exit(1)

    sts = TextualSimilarity(transformer=transformer,
                            distance_func=args.distance)
    embedding_df, sim_df = sts.fit_transform(input_df[args.column_name].values)

    sim_df.insert(0, args.column_name, input_df[args.column_name])

    logger.debug(f'similarity matrix shape {sim_df.shape}')
    logger.debug(f'embedding shape {embedding_df.shape}')

    save_data_frame_to_directory(
        save_to=args.sim_dir,
        data=sim_df,
        schema=DataFrameSchema.data_frame_to_dict(sim_df))

    save_data_frame_to_directory(
        save_to=args.embedding_dir,
        data=embedding_df,
        schema=DataFrameSchema.data_frame_to_dict(embedding_df))
Example #13
def main(args=None):
    '''
      Module entry point function
    '''

    seq_col = args.sequence_column
    id_col = args.identifier_column
    length_sensitive = args.length_sensitive
    kappa = args.kappa

    logger.debug(f'input-dir {args.input_dir}')
    logger.debug(f'sequence-column {seq_col}')
    logger.debug(f'identifier-column {id_col}')
    logger.debug(f'length-sensitive {length_sensitive}')
    logger.debug(f'kappa {args.kappa}')
    logger.debug(f'output-dir {args.output_dir}')
    logger.debug(f'model output dir {args.model_output_dir}')

    input_df = load_data_frame_from_directory(args.input_dir).data

    if input_df[seq_col].isnull().sum().sum() > 0:
        logger.debug(f'column {seq_col} contains missing values ')
        sys.exit(1)

    embedding_df, sgt = compute_embeddings(input_df, seq_col, kappa,
                                           length_sensitive, id_col)

    logger.debug(f'embedding shape {embedding_df.shape}')

    save_data_frame_to_directory(
        save_to=args.output_dir,
        data=embedding_df,
        schema=DataFrameSchema.data_frame_to_dict(embedding_df))

    save_model_to_directory(save_to=args.model_output_dir,
                            model_dumper=sgt_dumper(data=sgt))
Example #14
import argparse
import os
import pandas as pd
from azureml.studio.core.io.data_frame_directory import load_data_frame_from_directory, save_data_frame_to_directory

print(
    "Replace undefined values with relevant values and rename columns to meaningful names"
)

parser = argparse.ArgumentParser("normalize")
parser.add_argument("--filtered_data", type=str, help="filtered taxi data")
parser.add_argument("--output_normalize",
                    type=str,
                    help="replaced undefined values and renamed columns")

args = parser.parse_args()
combined_converted_df = load_data_frame_from_directory(args.filtered_data).data
print("Argument (output normalized taxi data path): %s" %
      args.output_normalize)

# These functions replace undefined values and rename to use meaningful names.
replaced_stfor_vals_df = (combined_converted_df.replace({
    "store_forward": "0"
}, {
    "store_forward": "N"
}).fillna({"store_forward": "N"}))

replaced_distance_vals_df = (replaced_stfor_vals_df.replace({
    "distance": ".00"
}, {
    "distance": 0
}).fillna({"distance": 0}))
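The snippet above stops after the `distance` cleanup, before any column renaming or output is written. A hedged sketch of the save step only, reusing the `save_data_frame_to_directory` pattern from the other examples; writing `replaced_distance_vals_df` directly is an assumption:

# Assumption: persist the last intermediate frame to the declared output
# directory, mirroring the save pattern used elsewhere on this page.
if args.output_normalize is not None:
    os.makedirs(args.output_normalize, exist_ok=True)
    save_data_frame_to_directory(args.output_normalize, replaced_distance_vals_df)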
Example #15
import pickle
import re

import nltk
import pandas as pd
from nltk.tokenize import word_tokenize

from azureml.studio.core.io.data_frame_directory import load_data_frame_from_directory, save_data_frame_to_directory
from textclscnn.args_util import preprocess_args

nltk.download('punkt')


class DataPreprocessor(object):
    def __init__(self, vocab_path, text_column):
        self.vocab_path = vocab_path
        self.text_column = text_column
        self.rule = re.compile(r"[^\u4e00-\u9fa5]")
        self.cut = word_tokenize
        with open(self.vocab_path + '/' + 'word2id.pkl', 'rb') as f:
            self.word2id = pickle.load(f)

    def process(self, data_frame: pd.DataFrame):
        out_df = data_frame.copy()
        out_df['text_id'] = data_frame[self.text_column].apply(lambda text: [
            self.word2id[word] if word != '\x00' and word in self.word2id else
            0 for word in word_tokenize(text)
        ])
        print(f'first 5 lines of processed df: {out_df.head()}')
        return out_df


if __name__ == '__main__':
    args = preprocess_args()
    processor = DataPreprocessor(args.input_vocab, args.text_column)
    data_frame = load_data_frame_from_directory(args.input_data).data
    save_data_frame_to_directory(args.output_data,
                                 data=processor.process(data_frame))
Example #16
        plt.ylim([0, 1.1])
        plt.ylabel('score')
        plt.title('Scores')

        return f2_plt

    def evaluation(self, df_true, df_predict, df_prob, output_eval_dir):
        run = Run.get_context()

        f1_plt = self.prcurve(df_true, df_predict, df_prob)
        run.log_image("precision/recall curve", plot=f1_plt)
        f1_plt.savefig(os.path.join(output_eval_dir, 'precision_recall.png'))

        f2_plt = self.scores(df_true, df_predict)
        run.log_image("scores", plot=f2_plt)
        f2_plt.savefig(os.path.join(output_eval_dir, 'scores.png'))


if __name__ == '__main__':
    args = predict_args()
    predictor = Predictor(args.trained_model)
    df = load_data_frame_from_directory(args.predict_path).data
    out_df = predictor.predict(df)
    save_data_frame_to_directory(args.predict_result_path, data=out_df)
    label_column = predictor.label_column
    print(f'label column {label_column}')
    if label_column in df.columns:
        print(f"Got actual label column {label_column}, evaluating:")
        predictor.evaluation(df[label_column], out_df['Scored Label'],
                             out_df['Scored Prob'], args.predict_result_path)
Example #17
        '--remove-seen-items',
        type=str,
        help='Remove items seen in training from recommendation')
    parser.add_argument('--score-result', help='Ratings or items to output')

    args, _ = parser.parse_known_args()

    logger.info(f"Arguments: {args}")
    sort_top_k = strtobool(args.sort_top_k) if args.sort_top_k else None
    remove_seen_items = strtobool(
        args.remove_seen_items) if args.remove_seen_items else None
    normalize = strtobool(args.normalize) if args.normalize else None

    sar_model = load_model_from_directory(args.trained_model,
                                          model_loader=joblib_loader).data
    dataset_to_score = load_data_frame_from_directory(
        args.dataset_to_score).data
    logger.debug(f"Shape of loaded DataFrame: {dataset_to_score.shape}")

    score_sar_module = ScoreSARModule(model=sar_model,
                                      input_data=dataset_to_score)

    score_type = ScoreType(args.score_type)
    if score_type == ScoreType.ITEM_RECOMMENDATION:
        score_result = score_sar_module.recommend_items(
            ranking_metric=RankingMetric(args.ranking_metric),
            top_k=args.top_k,
            sort_top_k=sort_top_k,
            remove_seen=remove_seen_items,
            normalize=normalize)
    elif score_type == ScoreType.RATING_PREDICTION:
        score_result = score_sar_module.predict_ratings(
Example #18
def invoke(input_path, detect_mode, timestamp_column, value_column, batch_size,
           threshold, sensitivity, appendMode, compute_stats_in_visualization,
           output_path):
    df = load_data_frame_from_directory(input_path).data
    logging.info(f"Shape of loaded DataFrame: {df.shape}")

    if df.shape[0] < MIN_POINTS:
        raise Exception(NotEnoughPoints.format(MIN_POINTS))

    if 0 < batch_size < MIN_POINTS:
        raise Exception(InvalidBatchSize.format(MIN_POINTS))

    if timestamp_column not in list(df.columns):
        raise Exception(ColumnNotFoundError.format(timestamp_column))

    if value_column not in list(df.columns):
        raise Exception(ColumnNotFoundError.format(value_column))

    timestamp = pd.DataFrame(df, columns=[timestamp_column])
    timestamps = pd.to_datetime(timestamp.iloc[:, 0].values)

    if np.any(np.isnat(timestamps)):
        raise Exception(InvalidTimestamps)

    res = is_timestamp_ascending(timestamps)

    if res == -1:
        raise Exception(InvalidSeriesOrder)
    elif res == -2:
        raise Exception(DuplicateSeriesTimestamp)

    data_columns = pd.DataFrame(df, columns=[value_column])

    for col in data_columns:
        try:
            float_data = data_columns[col].apply(float)
        except Exception as e:
            raise Exception(InvalidValueFormat.format(col)) from e

        if not np.all(np.isfinite(float_data)):
            raise Exception(InvalidSeriesValue.format(col))

        if np.any(np.less(float_data, VALUE_LOWER_BOUND)) or np.any(
                np.greater(float_data, VALUE_UPPER_BOUND)):
            raise Exception(ValueOverflow.format(col))

        data_columns[col] = float_data

    result = sr_detector.detect(timestamps,
                                data_columns,
                                detect_mode=detect_mode,
                                batch_size=batch_size,
                                threshold=threshold,
                                sensitivity=sensitivity)

    if appendMode is True:
        result = pd.merge(df, result, left_index=True, right_index=True)

    save_data_frame_to_directory(
        output_path,
        result,
        compute_stats_in_visualization=compute_stats_in_visualization)
Example #19
parser = argparse.ArgumentParser("XGBRegressorEvaluation")
parser.add_argument("--Evaluation_Data", type=str, help="Evaluation dataset.")
parser.add_argument("--Lable_Col",
                    type=str,
                    help="Lable column in the evaluation dataset.")
parser.add_argument("--Model_Path",
                    type=str,
                    help="Path where contains model file.")
parser.add_argument("--Model_FileName",
                    type=str,
                    help="Name of the model file.")
parser.add_argument("--Evaluation_Output", type=str, help="Evaluation result")
args = parser.parse_args()

## Load data from DataFrameDirectory to Pandas DataFrame
evaluation_df = load_data_frame_from_directory(args.Evaluation_Data).data

## Prepare evaluation data
evaluation_df_features = evaluation_df[[
    c for c in evaluation_df.columns if c != args.Lable_Col
]]
evaluation_df_label = evaluation_df[args.Lable_Col]

## Load model
xg_reg = xgb.XGBRegressor()
xg_reg.load_model(args.Model_Path + "/" + args.Model_FileName)

## Evaluation
preds = xg_reg.predict(evaluation_df_features)
rmse = np.sqrt(mean_squared_error(evaluation_df_label, preds))
print("RMSE: %f" % (rmse))
Example #20
    def inference(self, data_path, save_path):
        os.makedirs(save_path, exist_ok=True)
        input_df = load_data_frame_from_directory(data_path).data
        df = self.run(input_df)
        save_data_frame_to_directory(save_path, data=df)
Example #21
    int_param = args.int_parameter
    bool_param = args.boolean_parameter
    enum_param = args.enum_parameter

    logger.debug(f"Received parameters:")
    logger.debug(f"    {str_param}")
    logger.debug(f"    {int_param}")
    logger.debug(f"    {bool_param}")
    logger.debug(f"    {enum_param}")

    if rank > 0:
        logger.debug(f"I'm rank {rank}/{size}, wait for data.")
        data = comm.recv(source=0, tag=rank)
        logger.debug(f"Received shape of loaded DataFrame: {data} ")
    else:
        logger.debug(f"I'm rank 0/{size}, load and dump.")

        logger.debug(f"Input path: {args.input_path}")
        data_frame_directory = load_data_frame_from_directory(args.input_path)

        logger.debug(f"Shape of loaded DataFrame: {data_frame_directory.data.shape}")

        logger.debug(f"Output path: {args.output_path}")
        save_data_frame_to_directory(args.output_path, data_frame_directory.data)

        for i in range(1, size):
            data = data_frame_directory.data.shape
            logger.debug(f"Send shape to rank {i}")
            comm.send(data, dest=i, tag=i)

parser.add_argument("--Learning_rate",
                    type=float,
                    help="Boosting learning rate.")
parser.add_argument("--Max_depth",
                    type=int,
                    help="Maximum tree depth for base learners.")
parser.add_argument("--Model_FileName",
                    type=str,
                    help="Name of the model file.")
parser.add_argument("--Model_Path",
                    type=str,
                    help="Path to store XGBoost model file in Json format.")
args = parser.parse_args()

## Load data from DataFrameDirectory to Pandas DataFrame
training_df = load_data_frame_from_directory(args.Training_Data).data

## Prepare training data
training_df_features = training_df[[
    c for c in training_df.columns if c != args.Lable_Col
]]
training_df_lable = training_df[args.Lable_Col]

## Training
xg_reg = xgb.XGBRegressor(objective='reg:linear',
                          colsample_bytree=0.3,
                          alpha=10,
                          n_estimators=10,
                          learning_rate=args.Learning_rate,
                          max_depth=args.Max_depth)
Example #23
    parser.add_argument(
        "--relevancy-method",
        type=str,
        help="method for determining relevancy ['top_k', 'by_threshold'].",
    )
    parser.add_argument("--k",
                        type=int,
                        help="number of top k items per user.")
    parser.add_argument("--threshold",
                        type=float,
                        help="threshold of top items per user.")
    parser.add_argument("--score-result", help="Result of the computation.")

    args, _ = parser.parse_known_args()

    rating_true = load_data_frame_from_directory(args.rating_true).data
    rating_pred = load_data_frame_from_directory(args.rating_pred).data

    col_user = args.col_user
    col_item = args.col_item
    col_rating = args.col_rating
    col_prediction = args.col_prediction
    relevancy_method = args.relevancy_method
    k = args.k
    threshold = args.threshold

    logger.debug(f"Received parameters:")
    logger.debug(f"User:       {col_user}")
    logger.debug(f"Item:       {col_item}")
    logger.debug(f"Rating:     {col_rating}")
    logger.debug(f"Prediction: {col_prediction}")
Example #24
import argparse
import os

import pandas as pd
from azureml.studio.core.io.data_frame_directory import load_data_frame_from_directory, save_data_frame_to_directory

print("Merge Green and Yellow taxi data")

parser = argparse.ArgumentParser("merge")
parser.add_argument("--cleansed_green_data",
                    type=str,
                    help="cleansed green data")
parser.add_argument("--cleansed_yellow_data",
                    type=str,
                    help="cleansed yellow data")
parser.add_argument("--output_merge",
                    type=str,
                    help="green and yellow taxi data merged")

args = parser.parse_args()
green_df = load_data_frame_from_directory(args.cleansed_green_data).data
yellow_df = load_data_frame_from_directory(args.cleansed_yellow_data).data
print("Argument (output merge taxi data path): %s" % args.output_merge)

# Appending yellow data to green data
combined_df = pd.concat([green_df, yellow_df], ignore_index=True)
combined_df.reset_index(inplace=True, drop=True)

if args.output_merge is not None:
    os.makedirs(args.output_merge, exist_ok=True)
    print("%s created" % args.output_merge)
    save_data_frame_to_directory(args.output_merge, combined_df)
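Taken together, the examples above follow one load/transform/save pattern. Below is a minimal, self-contained sketch of that round trip under assumed local paths; the `DataFrameSchema` import path is an assumption based on the class name used in the examples, and the `dropna` step is only a stand-in transformation:

from azureml.studio.core.data_frame_schema import DataFrameSchema  # assumed module path
from azureml.studio.core.io.data_frame_directory import (
    load_data_frame_from_directory, save_data_frame_to_directory)

# Assumed input/output locations for illustration.
input_df = load_data_frame_from_directory('/tmp/input_dfd').data
output_df = input_df.dropna()  # any transformation goes here

save_data_frame_to_directory(
    save_to='/tmp/output_dfd',
    data=output_df,
    schema=DataFrameSchema.data_frame_to_dict(output_df))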