Example #1
    def getUpdateInstance_byFlask(self):
        print("Into ACR Update Instance by Flask")
        parameter = load_json_config("./parameter.json")
        list_args = parameter["acr_preprocess"]
        #    acr_path = parameter["DATA_DIR"] + "/pickles/acr_articles_metadata_embeddings_predict/acr_articles_metadata_embeddings_predict.pickle"
        #    acr_path = parameter["DATA_DIR"] + "/pickles/acr_articles_metadata_embeddings/acr_articles_metadata_embeddings.pickle"
        self.model_nar_dir = parameter["DATA_DIR"] + parameter["model_dir_nar"]

        (acr_label_encoders, articles_metadata_df, content_article_embeddings) = \
            deserialize(self.acr_path)
        self.acr_label_encoders = acr_label_encoders
        self.articles_metadata_df = articles_metadata_df
        # self.reverse_acr_article_id = {}
        self.reverse_acr_article_id = {
            v: k
            for k, v in acr_label_encoders['article_id'].items()
        }
        self.content_article_embeddings_matrix = content_article_embeddings

        # get list_id
        list_id = get_list_id()
        encoded_list_id = []

        for id in list_id:
            if (int(id) in acr_label_encoders['article_id']):
                encoded_list_id.append(self.get_article_id_encoded(int(id)))
        list_id_week_encoded = list(articles_metadata_df['article_id'])[-600:]
        encoded_list_id = list_id_week_encoded + encoded_list_id
        list_id_week = list(acr_label_encoders['article_id'].keys())[-600:]
        self.list_id = list(list_id) + list_id_week
        self.encoded_list_id = encoded_list_id

        # print("Loading ACR singleton")
        ACR_Pickle_Singleton.__instance = self
        print("Done Update SingleTon Flask Init Done")
Example #2
 def get_training_files(self, training_hour):
     parameter = load_json_config("./parameter.json")
     training_dir = parameter["DATA_DIR"] + parameter["nar_preprocess_2"][
         "output_sessions_tfrecords_path"]
     train_files = resolve_files(training_dir)[-training_hour:]
     # print("TrainFile")
     # print(train_files)
     return list(chunks(train_files, training_hour))
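
`resolve_files` and `chunks` are project helpers that are not shown in these snippets. A minimal sketch of what they plausibly do, given how they are called above; this is an assumption, not the project's actual implementation.

import glob

# Assumed behaviour of the helpers used above (illustrative only).
def resolve_files(path_pattern):
    # Expand a glob pattern for the hourly TFRecord files into a sorted list of paths.
    return sorted(glob.glob(path_pattern))

def chunks(items, chunk_size):
    # Yield successive slices of at most chunk_size elements.
    for i in range(0, len(items), chunk_size):
        yield items[i:i + chunk_size]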
Example #3
 def __init__(self):
     parameter = load_json_config("./parameter.json")
     list_args = parameter["acr_preprocess"]
     nar_path = parameter[
         "DATA_DIR"] + "/pickles/nar_preprocessing_resources/nar_preprocessing_resources.pickle"
     # rebind `self` to the loaded resources: the deserialized object itself becomes the singleton instance
     self = load_nar_module_preprocessing_resources(nar_path)
     # print("Loading NAR singleton")
     NAR_Pickle_Singleton.__instance = self
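
Both `ACR_Pickle_Singleton` and `NAR_Pickle_Singleton` set a private class attribute `__instance` from inside `__init__`, but the accessor is not shown in these snippets. A hedged sketch of the getInstance pattern this implies, assumed rather than taken from the source:

# Assumed singleton accessor matching the pattern used above (not the project's actual code).
class NAR_Pickle_Singleton:
    __instance = None

    @staticmethod
    def getInstance():
        # Build the singleton on first use; afterwards return the cached object
        # (here, the deserialized NAR preprocessing resources stored by __init__).
        if NAR_Pickle_Singleton.__instance is None:
            NAR_Pickle_Singleton()
        return NAR_Pickle_Singleton.__instance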
Example #4
def main_nar_preprocess_2():
    #def main():
    # parser = create_args_parser()
    # args = parser.parse_args()

    print("<=== STARTING NAR PREPROCESS 2 ===> ")

    # parameter = load_json_config("./parameter.json")
    parameter = load_json_config("./parameter.json")
    list_args = parameter["nar_preprocess_2"]

    DATA_DIR = parameter["DATA_DIR"]
    num_day = list_args["num_day"]
    input_sessions_json_folder_path = DATA_DIR + list_args[
        "input_sessions_json_folder_path"]
    input_acr_metadata_embeddings_path = DATA_DIR + list_args[
        "input_acr_metadata_embeddings_path"]
    input_nar_encoders_dict_path = DATA_DIR + list_args[
        "input_nar_encoders_dict_path"]
    number_hours_to_preprocess = list_args["number_hours_to_preprocess"]
    output_nar_preprocessing_resources_path = DATA_DIR + list_args[
        "output_nar_preprocessing_resources_path"]
    output_sessions_tfrecords_path = DATA_DIR + list_args[
        "output_sessions_tfrecords_path"]

    if not path.exists(output_nar_preprocessing_resources_path):
        import os
        os.makedirs(output_nar_preprocessing_resources_path)

    print('Loading resources generated by the ACR module (articles metadata)')
    # pass the ACR metadata/embeddings pickle file
    get_article_text_length_fn, get_article_id_encoded_fn = load_acr_module_resources(
        get_all_file(input_acr_metadata_embeddings_path)[0])
    #get_article_text_length_fn = None

    # # debug
    # print(get_article_text_length_fn)

    print(
        'Loading resources generated by the first step of NAR preprocessing (cat. features dict encoders)'
    )
    nar_encoders_dict = load_nar_module_resources(
        get_all_file(input_nar_encoders_dict_path)[0])

    print('Loading sessions from folder: {}'.format(
        input_sessions_json_folder_path))
    print('Exporting TFRecords to: {}'.format(output_sessions_tfrecords_path))

    # delete file .part*
    # from subprocess import Popen
    # var1 = DATA_DIR+input_sessions_json_folder_path+"session_hour=*/.*"
    # Process = Popen(['./nar_module/scripts/remove_hiden_file.sh %s' % str(var1)],  shell=True)

    import os
    var1 = 'rm -rf ' + input_sessions_json_folder_path + "/session_hour=*/.*"
    print(var1)
    myCmd = var1
    if os.system(myCmd) == 0:
        print("Deleted hidden session files")
    else:
        print("Failed to delete hidden session files")

    # split path output_sessions_tfrecords_path
    path_tf = DATA_DIR + '/' + list_args[
        "output_sessions_tfrecords_path"].split('/')[1]
    if not path.exists(path_tf):
        import os
        os.makedirs(path_tf)

    clicks_by_articles_counters = []
    #a = preprocess_for_predict("2265891616712405988", get_article_text_length_fn, get_article_id_encoded_fn)
    for (hour_index, sessions_hour
         ) in load_sessions_hours(input_sessions_json_folder_path):
        # if the TFRecords folder is not empty, continue hour numbering from the last exported hour
        if len(os.listdir(DATA_DIR + "/sessions_tfrecords_by_hour/")) != 0:
            hour_index = split_string(DATA_DIR +
                                      "/sessions_tfrecords_by_hour/") + 1
        print('Processing hour {}'.format(hour_index))

        ####compute_global_metrics(sessions_hour)

        sessions_hour_df, hour_stats, hour_clicks_by_articles_counter = process_session_clicks_features(
            sessions_hour, get_article_text_length_fn)
        #sessions_hour_df.to_csv('hour-{}-to-debug.csv'.format(hour_index))

        hour_stats['_hour_index'] = hour_index
        #stats.append(hour_stats)

        clicks_by_articles_counters.append(hour_clicks_by_articles_counter)

        # sessions_hour_df.to_csv(DATA_DIR+"/sessions_tfrecords_by_hour/sessions_hour_df.csv", index=False)
        export_sessions_hour_to_tf_records(
            hour_index,
            sessions_hour_df,
            output_path=output_sessions_tfrecords_path)
        # print('')

        # if number_hours_to_preprocess >= 0 and hour_index == number_hours_to_preprocess:
        #     break

    print()

    print(
        'Exporting Categorical Feature encoders and Numeric scalers dicts: {}'.
        format(output_nar_preprocessing_resources_path))

    save_nar_preprocessing_resources(
        output_nar_preprocessing_resources_path +
        "nar_preprocessing_resources.pickle", nar_encoders_dict,
        numeric_scalers)

    # delete old TFRecords to keep only the most recent 24 * num_day hours

    # after exporting TFRecords for training, delete all files in input_sessions_json_folder_path
    if len(os.listdir(DATA_DIR +
                      "/sessions_tfrecords_by_hour/")) > 24 * num_day:
        delete_file_keep_in_two_week(DATA_DIR + "/sessions_tfrecords_by_hour/",
                                     24 * num_day)

    # delete_all_file_in_path(input_sessions_json_folder_path)

    print("<=== END NAR PREPROCESS 2 ===> ")
Example #5
def main_nar_preprocess_1():
    spark = spark_inital()
    sc = spark.sparkContext
    sqlContext = SQLContext(sc)

    # get parameters
    parameter = load_json_config("./parameter.json")
    list_args = parameter["nar_preprocess_1"]

    DATA_DIR = parameter["DATA_DIR"]
    input_path_data_log_click = list_args["input_path_data_log_click"]
    date_start = list_args["date_start"]
    date_end = list_args["date_end"]
    input_path_proprcessed_cafebiz_articale_csv_from_acr = DATA_DIR + list_args[
        "input_path_proprcessed_cafebiz_articale_csv_from_acr"]
    nar_encoders_cafebiz = DATA_DIR + list_args["nar_encoders_cafebiz"]
    input_articles_csv_path_original = DATA_DIR + list_args[
        "input_articles_csv_path_original"]
    mysql_host = list_args["mysql_host"]
    mysql_user = list_args["mysql_user"]
    mysql_passwd = list_args["mysql_passwd"]
    mysql_database = list_args["mysql_database"]
    mysql_table = list_args["mysql_table"]
    domain = list_args["domain"]
    num_hour_trainning = list_args["n_hour_train_continue"]

    list_args2 = parameter["nar_preprocess_2"]
    spark_pre_json_path = DATA_DIR + list_args2[
        'input_sessions_json_folder_path']
    session_tfrecord_path = DATA_DIR + list_args2[
        'output_sessions_tfrecords_path']

    # Delete the session JSON folder before running NAR preprocessing
    if path.exists(spark_pre_json_path):
        import shutil
        shutil.rmtree(spark_pre_json_path)

    # delete_all_file_in_path(split_string_train_path( session_tfrecord_path))

    print("STARTNIG NAR PREPROCESSING ....")
    # database news
    print(input_articles_csv_path_original)
    df_news = handle_database_news_load_from_acr(
        input_articles_csv_path_original, spark)
    current_time = get_date_dt_data_log()
    flag = 0
    if path.exists(DATA_DIR + "/sessions_tfrecords_by_hour/"):
        # subsequent runs:
        # read recent click logs from MySQL for the last n_hour_train_continue hours
        df_log = handle_database_log_mysql(num_hour_trainning, domain, spark,
                                           mysql_host, mysql_user,
                                           mysql_passwd, mysql_database,
                                           mysql_table, current_time)
        flag = 2  # run second times

    else:
        # first run:
        # read click logs from date_start to date_end
        df_log = handle_database_log(domain, spark, input_path_data_log_click,
                                     date_start, date_end)
        flag = 1  # first time

    # join database news and database log
    df = df_log.join(df_news, 'full_url', 'inner').drop(df_news.domain)
    # print(df.printSchema())

    print("<=== STARTING NAR PREPROCESSING 1 ===>")

    articles_original_df = load_cafebiz_article_from_acr(
        get_all_file(input_path_proprcessed_cafebiz_articale_csv_from_acr)[0])

    # build a dict mapping article url -> encoded article id
    valid_articles_urls_to_ids_dict = dict(articles_original_df[[
        'url', 'id_encoded'
    ]].apply(lambda x: (x['url'], x['id_encoded']), axis=1).values)

    valid_articles_urls_to_ids_dict_broadcast = spark.sparkContext.broadcast(
        valid_articles_urls_to_ids_dict)

    ### Loading user interactions
    ### TEST data_test
    # df = spark.read.parquet("file:///data1/ngocvb/Ngoc_COV/chameleon/nar_preprocess/nar_data/data_test/")

    #     df = spark.read.parquet("file:///data/tungtv/jupytercode/data-log-news-parquet-thang45/")
    # df_news = handle_database_news_load_from_acr(
    #     "/data/tungtv/Code/dataset/dataset_cafebiz_acr_nar_1/original_cafebiz_articles_csv/cafebiz_articles_original.csv",
    #     spark)
    # df_log = handle_database_log(spark, "hdfs://10.5.37.76:8020/Data/Logging/pvTosFull/pc/", "2019-07-01", "2019-07-05")
    # df = df_log.join(df_news, 'full_url', 'inner').drop(df_news.domain)

    interactions_df = df.select("full_url", "dt", "os_code", "loc_id", "path" \
                                , "guid", "category0", "id", "content", "created_at_ts" \
                                , "teaser", "title", "keywords", "time", "url", "top")
    # .alias("full_url"), "id", "content", "created_at_ts", "teaser", "domain", "keywords",
    # "title", \
    # "url", "category0", "persons", "locations", "text_highlights")
    # tructField("dt", StringType(), True) \
    #     , StructField("loc_id", IntegerType(), True) \
    #     , StructField("os_code", IntegerType(), True) \
    #     , StructField("top", IntegerType(), True) \
    #     , StructField("guid", StringType(), True) \
    #     , StructField("news_id", LongType(), True) \
    #     , StructField("domain", StringType(), True) \
    #     , StructField("path", StringType(), True)])

    interactions_df = interactions_df.withColumn(
        "id", interactions_df["id"].cast(LongType()))
    interactions_df = interactions_df.withColumn(
        "created_at_ts", interactions_df["created_at_ts"].cast(LongType()))

    get_article_id_encoded_from_url_udf = F.udf(
        lambda url: get_article_id_encoded_from_url(
            url, valid_articles_urls_to_ids_dict_broadcast),
        pyspark.sql.types.IntegerType())

    # Filtering only interactions whose url/id is available in the articles table
    # tungtv
    interactions_article_id_encoded_df = interactions_df.withColumn(
        'article_id',
        get_article_id_encoded_from_url_udf(interactions_df['url']))
    #     interactions_article_id_encoded_df = interactions_df.withColumn('article_id', interactions_df['id'])

    interactions_filtered_df = interactions_article_id_encoded_df.filter(
        interactions_article_id_encoded_df['article_id'].isNull() == False)

    # print(interactions_filtered_df.printSchema())
    # print(interactions_filtered_df.select("article_id"))
    first_timestamp_ts = interactions_filtered_df.select('time').agg(
        F.min('time')).collect()[0][0] * 1000

    # Analyzing elapsed time since publishing
    interactions_filtered_df = add_column_dataframe(
        interactions_filtered_df, "publish_ts",
        F.to_timestamp(
            interactions_filtered_df.created_at_ts.cast(
                dataType=t.TimestampType())))

    interactions_filtered_df = add_column_dataframe(
        interactions_filtered_df, "publish_ts",
        F.unix_timestamp(col("publish_ts"), 'yyyy-MM-dd HH:mm:ss'))

    get_timestamp_from_date_str_udf = F.udf(get_timestamp_from_date_str,
                                            pyspark.sql.types.IntegerType())

    interactions_filtered_with_publish_ts_df = add_column_dataframe(interactions_filtered_df,
                                                                    "elapsed_min_since_published", \
                                                                    ((F.col('time') - F.col('publish_ts')) / 60).cast(
                                                                        pyspark.sql.types.IntegerType()))

    interactions_filtered_with_publish_ts_df.approxQuantile(
        "elapsed_min_since_published", [0.10, 0.25, 0.50, 0.75, 0.90], 0.01)

    elapsed_min_since_published_df = interactions_filtered_with_publish_ts_df.select(
        'elapsed_min_since_published').toPandas()
    """PAD_TOKEN = '<PAD>'
    UNFREQ_TOKEN = '<UNF>'"""

    # Analyzing clicks by article distribution
    ## Processing categorical features
    if flag == 1:
        cities_df = get_categ_features_counts_dataframe(
            interactions_filtered_df, 'loc_id')
        cities_encoder_dict = get_categ_features_encoder_dict(cities_df)

        os_df = get_categ_features_counts_dataframe(interactions_filtered_df,
                                                    'os_code')
        os_encoder_dict = get_categ_features_encoder_dict(os_df)
    else:
        # subsequent runs: reuse the categorical encoders from the first run
        # read the nar_encoders pickle
        nar_encoder_dict = deserialize(get_all_file(nar_encoders_cafebiz)[0])
        cities_df = get_categ_features_counts_dataframe(
            interactions_filtered_df, 'loc_id')
        cities_encoder_dict = get_categ_features_encoder_dict_second_time(
            cities_df, nar_encoder_dict['city'])

        os_df = get_categ_features_counts_dataframe(interactions_filtered_df,
                                                    'os_code')
        os_encoder_dict = get_categ_features_encoder_dict_second_time(
            os_df, nar_encoder_dict['os'])

    encoders_dict = {
        'city': cities_encoder_dict,
        #     'region': regions_encoder_dict,
        #     'country': countries_encoder_dict,
        'os': os_encoder_dict,
        #     'device': devices_encoder_dict,
        #     'referrer_class': referrer_class_encoder_dict
    }

    # Processing numeric features

    active_time_quantiles = interactions_filtered_df.approxQuantile(
        "top", [0.10, 0.25, 0.50, 0.75, 0.90], 0.01)

    active_time_stats_df = interactions_filtered_df.describe('top').toPandas()

    active_time_mean = float(active_time_stats_df[
        active_time_stats_df['summary'] == 'mean']['top'].values[0])
    active_time_stddev = float(active_time_stats_df[
        active_time_stats_df['summary'] == 'stddev']['top'].values[0])

    interactions_filtered_df = interactions_filtered_df.withColumnRenamed(
        "guid", "user_id")
    interactions_filtered_df = interactions_filtered_df.orderBy("time")

    ### Splitting sessions
    MAX_SESSION_IDLE_TIME_MS = 30 * 60 * 1000  # 30 min
    # test_df = interactions_filtered_df.limit(1000).rdd.map(lambda x: (x['user_id'], x))
    # print(test_df.take(10))
    sessions_rdd = interactions_filtered_df.rdd.map(lambda x: (x['user_id'], x)).groupByKey() \
        .flatMap(lambda row: split_sessions(row, encoders_dict, MAX_SESSION_IDLE_TIME_MS, first_timestamp_ts)) \
        .sortByKey() \
        .map(lambda x: x[1])

    #### Exporting sessions to JSON lines
    sessions_sdf = sessions_rdd.toDF()

    sessions_sdf.write.partitionBy("session_hour").json(
        os.path.join(DATA_DIR, "sessions_processed_by_spark/"))

    if not path.exists(nar_encoders_cafebiz):
        os.makedirs(nar_encoders_cafebiz)

    if flag == 1:  # first time
        NAR_ENCODERS_PATH = 'nar_encoders_cafebiz.pickle'
        serialize(nar_encoders_cafebiz + NAR_ENCODERS_PATH, encoders_dict)

    print(" <=== END NAR PREPROCESSING 1 ===>")
Example #6
def main(unused_argv):
    try:
        print("<=== STARTING NAR TRAINING ===> ")
        # load file parameter
        parameter = load_json_config("./parameter.json")
        data_dir = parameter['DATA_DIR']
        list_args = parameter["nar_preprocess_2"]
        path_session_spark = data_dir + list_args[
            "input_sessions_json_folder_path"] + "/"

        # get the number of hours for which training continues (n_hour_train_continue)
        list_args_train = parameter["nar_preprocess_1"]
        hour_train_continue = list_args_train["n_hour_train_continue"]
        print(hour_train_continue)

        # Capture whether it will be a single training job or a hyper parameter tuning job.
        tf_config_env = json.loads(os.environ.get('TF_CONFIG', '{}'))
        task_data = tf_config_env.get('task') or {'type': 'master', 'index': 0}
        trial = task_data.get('trial')

        running_on_mlengine = (len(tf_config_env) > 0)
        print('Running {}'.format('on Google ML Engine' if running_on_mlengine
                                  else 'on a server/machine'))

        #Disabling duplicate logs on console when running locally
        logging.getLogger('tensorflow').propagate = running_on_mlengine

        tf.logging.info('Starting training job')

        gcs_model_output_dir = data_dir + FLAGS.model_dir

        # create folder model_dir
        if not path.exists(gcs_model_output_dir):
            os.makedirs(gcs_model_output_dir)

        #If model output must be persisted and loaded from a local cache (to speed up ML Engine)
        if FLAGS.use_local_cache_model_dir:
            model_output_dir = tempfile.mkdtemp()
            tf.logging.info(
                'Created local temp folder for models output: {}'.format(
                    model_output_dir))
        else:
            model_output_dir = gcs_model_output_dir

        if trial is not None:
            model_output_dir = os.path.join(model_output_dir, trial)
            gcs_model_output_dir = os.path.join(gcs_model_output_dir, trial)
            tf.logging.info(
                "Hyperparameter Tuning - Trial {} - model_dir = {} - gcs_model_output_dir = {} "
                .format(trial, model_output_dir, gcs_model_output_dir))

        tf.logging.info(
            'Will save temporary model outputs to {}'.format(model_output_dir))

        #If training should warm-start from a previously trained model
        if FLAGS.warmup_model_dir is not None:
            tf.logging.info(
                'Copying model outputs from previous job ({}) for warm start'.
                format(FLAGS.warmup_model_dir))
            dowload_model_output_from_gcs(
                model_output_dir,
                gcs_model_dir=FLAGS.warmup_model_dir,
                files_pattern=['graph.pb', 'model.ckpt-', 'checkpoint'])

            local_files_after_download_to_debug = list(
                glob.iglob("{}/**/*".format(model_output_dir), recursive=True))
            tf.logging.info(
                'Files copied from GCS to warm start training: {}'.format(
                    local_files_after_download_to_debug))

        print("STARTING ...")
        tf.logging.info('Loading ACR module assets')
        acr_label_encoders, articles_metadata_df, content_article_embeddings_matrix = \
                load_acr_module_resources(get_all_file(data_dir + FLAGS.acr_module_resources_path)[0])

        #Min-max scaling of the ACR embedding for a compatible range with other input features for NAR module
        # content_article_embeddings_matrix = min_max_scale(content_article_embeddings_matrix, min_max_range=(-0.1,0.1))

        #Apply l2-norm by sample
        l2_normalizer_by_sample = Normalizer(norm='l2')
        content_article_embeddings_matrix = l2_normalizer_by_sample.fit_transform(
            content_article_embeddings_matrix)

        #Rescaling content features
        content_article_embeddings_matrix = content_article_embeddings_matrix * FLAGS.content_embedding_scale_factor

        articles_features_config = get_articles_features_config(
            acr_label_encoders)
        articles_metadata = process_articles_metadata(
            articles_metadata_df, articles_features_config)

        tf.logging.info('Loading NAR module preprocessing assets')
        nar_label_encoders = load_nar_module_preprocessing_resources(
            get_all_file(data_dir +
                         FLAGS.nar_module_preprocessing_resources_path)[0])

        session_features_config = get_session_features_config(
            nar_label_encoders)

        tf.logging.info('Building NAR model')
        global eval_sessions_metrics_log, clicked_items_state, sessions_negative_items_log, sessions_chameleon_recommendations_log, global_eval_hour_id
        eval_sessions_metrics_log = []
        clicked_items_state = ClickedItemsState(
            FLAGS.recent_clicks_buffer_hours,
            FLAGS.recent_clicks_buffer_max_size,
            FLAGS.recent_clicks_for_normalization,
            content_article_embeddings_matrix.shape[0])

        tf.logging.info('Getting training file names')
        train_files = resolve_files(data_dir + FLAGS.train_set_path_regex)
        # get the number of files in the train folder to determine train_files_from and train_files_up_to
        train_files_from = 0
        train_files_up_to = get_train_file_max(
            split_string_train_path(data_dir + FLAGS.train_set_path_regex))
        # train_files_up_to = 15
        if train_files_from > train_files_up_to:
            raise Exception(
                'Final training file cannot be lower than Starting training file'
            )
        train_files = train_files[train_files_from:train_files_up_to]
        print("============================>")
        print("Len file train: {}".format(len(train_files)))
        tf.logging.info('{} files on which the network will be trained and evaluated, from {} to {}' \
                        .format(len(train_files), train_files[0], train_files[-1]))

        if len(train_files) <= hour_train_continue:
            print("run_config for second time")
            run_config = tf.estimator.RunConfig(
                tf_random_seed=RANDOM_SEED,
                keep_checkpoint_max=1,
                # save_checkpoints_secs=1200,
                save_checkpoints_steps=1,
                save_summary_steps=100,
                log_step_count_steps=100,
                # session_config=config_proto
            )
        else:
            print("run_config for frist time")
            run_config = tf.estimator.RunConfig(
                tf_random_seed=RANDOM_SEED,
                keep_checkpoint_max=1,
                save_checkpoints_secs=1200,
                # save_checkpoints_steps=1,
                save_summary_steps=100,
                log_step_count_steps=100,
                # session_config=config_proto
            )

        model = build_estimator(run_config, model_output_dir,
                                content_article_embeddings_matrix,
                                articles_metadata, articles_features_config,
                                session_features_config)

        start_train = time()
        tf.logging.info("Starting Training Loop")

        training_files_chunks = list(
            chunks(train_files, FLAGS.training_hours_for_each_eval))
        for chunk_id in range(0, len(training_files_chunks)):

            training_files_chunk = training_files_chunks[chunk_id]
            tf.logging.info('Training files from {} to {}'.format(
                training_files_chunk[0], training_files_chunk[-1]))
            model.train(input_fn=lambda: prepare_dataset_iterator(
                training_files_chunk,
                session_features_config,
                batch_size=FLAGS.batch_size,
                truncate_session_length=FLAGS.truncate_session_length))

            if chunk_id < len(training_files_chunks) - 1:
                #Using the first hour of the next training chunk as eval
                eval_file = training_files_chunks[chunk_id + 1][0]
                tf.logging.info('Evaluating file {}'.format(eval_file))
                model.evaluate(input_fn=lambda: prepare_dataset_iterator(
                    eval_file,
                    session_features_config,
                    batch_size=FLAGS.batch_size,
                    truncate_session_length=FLAGS.truncate_session_length))

            #After each save_results_each_n_evals train/eval loops
            if chunk_id % FLAGS.save_results_each_n_evals == 0:
                tf.logging.info('Saving eval metrics')
                global_eval_hour_id += 1
                save_eval_benchmark_metrics_csv(
                    eval_sessions_metrics_log,
                    model_output_dir,
                    training_hours_for_each_eval=FLAGS.
                    training_hours_for_each_eval)

                if FLAGS.save_eval_sessions_negative_samples:
                    #Flushing to disk the negative samples used to evaluate each session,
                    #so that benchmark metrics outside the framework (e.g. Matrix Factorization) are comparable
                    save_sessions_negative_items(model_output_dir,
                                                 sessions_negative_items_log)
                    sessions_negative_items_log = []

                if FLAGS.save_eval_sessions_recommendations:
                    #Flushing to disk the recommended items to test re-ranking approaches (e.g. MMR)
                    save_sessions_chameleon_recommendations_log(
                        model_output_dir,
                        sessions_chameleon_recommendations_log,
                        global_eval_hour_id)
                    sessions_chameleon_recommendations_log = []

                    #Incrementing the eval hour id
                    global_eval_hour_id += 1

                #If model output must be persisted and loaded from a local cache (to speed up ML Engine)
                if FLAGS.use_local_cache_model_dir:
                    tf.logging.info('Uploading cached results to GCS')
                    upload_model_output_to_gcs(
                        model_output_dir,
                        gcs_model_dir=gcs_model_output_dir,
                        #files_pattern=None)
                        files_pattern=[  #'events.out.tfevents.', 
                            '.csv', '.json'
                        ])

        tf.logging.info('Finalized Training')

        save_eval_benchmark_metrics_csv(
            eval_sessions_metrics_log,
            model_output_dir,
            training_hours_for_each_eval=FLAGS.training_hours_for_each_eval)

        if FLAGS.save_eval_sessions_negative_samples:
            #Flushing to disk the negative samples used to evaluate each session,
            #so that benchmark metrics outside the framework (e.g. Matrix Factorization) are comparable
            save_sessions_negative_items(model_output_dir,
                                         sessions_negative_items_log)

        if FLAGS.save_eval_sessions_recommendations:
            #Flushing to disk the recommended items to test re-ranking approaches (e.g. MMR)
            save_sessions_chameleon_recommendations_log(
                model_output_dir, sessions_chameleon_recommendations_log,
                global_eval_hour_id)

        tf.logging.info('Saved eval metrics')

        #If model output must be persisted and loaded from a local cache (to speed up ML Engine)
        if FLAGS.use_local_cache_model_dir:
            #Uploads all files to GCS
            upload_model_output_to_gcs(model_output_dir,
                                       gcs_model_dir=gcs_model_output_dir,
                                       files_pattern=None)

        # after training, delete all files in the train path and remove the sessions_processed_by_spark dir
        delete_all_file_in_path(
            split_string_train_path(data_dir + FLAGS.train_set_path_regex))
        import shutil
        shutil.rmtree(path_session_spark)
        # os.rmdir(path_session_spark)

        # delete all TensorFlow event files in the NAR model dir
        for filename in glob.glob(model_output_dir + "/events.out.tfevents*"):
            os.remove(filename)

        log_elapsed_time(start_train, 'Finalized TRAINING Loop')
        print("<=== END NAR TRAINING ===> ")

    except Exception as ex:
        tf.logging.error('ERROR: {}'.format(ex))
        raise
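
The `main(unused_argv)` signature and the `FLAGS` object follow the TensorFlow 1.x `tf.app` convention, so the script is presumably launched along these lines; a sketch under that assumption, not necessarily the project's actual launcher.

import tensorflow as tf

if __name__ == '__main__':
    tf.logging.set_verbosity(tf.logging.INFO)
    # tf.app.run parses the defined flags and then calls main(argv) with the remaining arguments.
    tf.app.run(main=main)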
Example #7
def main_acr_train():
    # def main(unused_argv):
    # try:
    print("<=== STARTING ARC TRAINING ===>")

    parameter = load_json_config("./parameter.json")
    list_args = parameter["acr_training"]

    DATA_DIR = parameter["DATA_DIR"]
    model_dir = DATA_DIR + list_args["model_dir"]
    train_set_path_regex = DATA_DIR + list_args["train_set_path_regex"]
    input_word_vocab_embeddings_path = DATA_DIR + list_args[
        "input_word_vocab_embeddings_path"]
    input_label_encoders_path = DATA_DIR + list_args[
        "input_label_encoders_path"]
    output_acr_metadata_embeddings_path = DATA_DIR + list_args[
        "output_acr_metadata_embeddings_path"]
    batch_size = list_args["batch_size"]
    truncate_tokens_length = list_args["truncate_tokens_length"]
    training_epochs = list_args["training_epochs"]
    learning_rate = list_args["learning_rate"]
    dropout_keep_prob = list_args["dropout_keep_prob"]
    l2_reg_lambda = list_args["l2_reg_lambda"]
    text_feature_extractor = list_args["text_feature_extractor"]
    training_task = list_args["training_task"]
    cnn_filter_sizes = list_args["cnn_filter_sizes"]
    cnn_num_filters = list_args["cnn_num_filters"]
    rnn_units = list_args["rnn_units"]
    rnn_layers = list_args["rnn_layers"]
    rnn_direction = list_args["rnn_direction"]
    acr_embeddings_size = list_args["acr_embeddings_size"]
    # mysql_host = list_args["mysql_host"]
    # mysql_user = list_args["mysql_user"]
    # mysql_passwd = list_args["mysql_passwd"]
    # mysql_database = list_args["mysql_database"]

    # Capture whether it will be a single training job or a hyper parameter tuning job.
    tf_config_env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    task_data = tf_config_env.get('task') or {'type': 'master', 'index': 0}
    trial = task_data.get('trial')

    running_on_mlengine = (len(tf_config_env) > 0)
    tf.logging.info(
        'Running {}'.format('on Google ML Engine'
                            if running_on_mlengine else 'on a server/machine'))

    #Disabling duplicate logs on console when running locally
    logging.getLogger('tensorflow').propagate = running_on_mlengine

    start_train = time()
    tf.logging.info('Starting training job')

    model_output_dir = model_dir

    if trial is not None:
        model_output_dir = os.path.join(model_output_dir, trial)
        tf.logging.info(
            "Hyperparameter Tuning - Trial {}. model_dir = {}".format(
                trial, model_output_dir))
    else:
        tf.logging.info('Saving model outputs to {}'.format(model_output_dir))

    tf.logging.info('Loading ACR preprocessing assets')

    # create output dirs if they do not exist
    if not path.exists(model_output_dir):
        os.makedirs(model_output_dir)

    if not path.exists(output_acr_metadata_embeddings_path):
        os.makedirs(output_acr_metadata_embeddings_path)

    print("Loading ACR preprocessing assets....")

    print(input_label_encoders_path)
    print(output_acr_metadata_embeddings_path)

    file_lable_encode = get_all_file(input_label_encoders_path)[0]
    file_word_embedding = get_all_file(input_word_vocab_embeddings_path)[0]

    # current_time = split_date(file_lable_encode)
    # print(current_time)

    # load file with max date
    acr_label_encoders, labels_class_weights, word_embeddings_matrix = \
        load_acr_preprocessing_assets(file_lable_encode,file_word_embedding)

    features_config = get_session_features_config(acr_label_encoders)

    #input_tfrecords = os.path.join(FLAGS.data_dir, FLAGS.train_set_path_regex)
    input_tfrecords = train_set_path_regex
    tf.logging.info(
        'Defining input data (TFRecords): {}'.format(input_tfrecords))

    #Creating an embedding for a special token to initiate decoding of the RNN autoencoder
    special_token_embedding_vector = np.random.uniform(
        low=-0.04, high=0.04, size=[1, word_embeddings_matrix.shape[1]])

    # train_files = get_listmax_date(get_all_file(train_set_path_regex))
    train_files = get_all_file(train_set_path_regex)
    print(train_files)

    if len(os.listdir(model_dir)) == 0:  # no ACR model exists yet: train from scratch
        print("No ACR model found")
        acr_model = build_acr_estimator(model_output_dir,
                                        word_embeddings_matrix,
                                        features_config, labels_class_weights,
                                        special_token_embedding_vector,
                                        list_args)
        tf.logging.info('Training model')
        acr_model.train(
            input_fn=lambda: prepare_dataset(files=train_files,
                                             features_config=features_config,
                                             batch_size=batch_size,
                                             epochs=training_epochs,
                                             shuffle_dataset=True,
                                             shuffle_buffer_size=10000))
    else:  # an ACR model already exists: rebuild the estimator from it
        print("Existing ACR model found")
        acr_model = build_acr_estimator(model_output_dir,
                                        word_embeddings_matrix,
                                        features_config, labels_class_weights,
                                        special_token_embedding_vector,
                                        list_args)

    #The objective is to overfit this network, so that the ACR embeddings represent the article content well
    tf.logging.info('Evaluating model - TRAIN SET')
    print("Evaluating model - TRAIN SET")
    eval_results = acr_model.evaluate(
        input_fn=lambda: prepare_dataset(files=train_files,
                                         features_config=features_config,
                                         batch_size=batch_size,
                                         epochs=1,
                                         shuffle_dataset=False))
    tf.logging.info(
        'Evaluation results with TRAIN SET (objective is to overfit): {}'.
        format(eval_results))
    '''
        tf.logging.info('Evaluating model - TEST SET')
        eval_results = acr_model.evaluate(input_fn=lambda: prepare_dataset(files=test_files,
                                                    features_config=features_config,
                                                    batch_size=FLAGS.batch_size, 
                                                    epochs=1, 
                                                    shuffle_dataset=False))
        tf.logging.info('Evaluation results with TEST SET: {}'.format(eval_results))
        '''

    tf.logging.info('Predicting ACR embeddings')
    print("Predicting ACR embeddings")
    article_metadata_with_pred_embeddings = acr_model.predict(
        input_fn=lambda: prepare_dataset(files=train_files,
                                         features_config=features_config,
                                         batch_size=batch_size,
                                         epochs=1,
                                         shuffle_dataset=False))

    articles_metadata_df, content_article_embeddings = get_articles_metadata_embeddings(
        article_metadata_with_pred_embeddings)
    tf.logging.info('Generated ACR embeddings: {}'.format(
        content_article_embeddings.shape))

    # read csv preprocessed by acr preprocessing
    list_args2 = parameter["acr_preprocess"]
    path_csv = DATA_DIR + list_args2['output_articles_csv_path_preprocessed']
    df = pd.read_csv(get_all_file(path_csv)[0])
    print(len(df['id']))

    export_acr_metadata_embeddings_with_datetime(
        df, acr_label_encoders, articles_metadata_df,
        content_article_embeddings, output_acr_metadata_embeddings_path)

    print("Export done, Call load acr auto ...")

    print("Remove acr embedding")
    remove_acr_pickle(get_all_file(output_acr_metadata_embeddings_path)[0])

    # TODO: call the service that loads acr_label_encoders, articles_metadata_df, content_article_embeddings into the singleton variable
    # import requests
    # resp = requests.get('http://0.0.0.0:8082/loadacr')
    # if resp.status_code == 200:
    #     print("Called load acr_pickle")
    # else:
    #     print("Not Yet call load acr_pickle")

    # save_to_mysql_database( mysql_host,  mysql_user,  mysql_passwd, mysql_database, acr_label_encoders,articles_metadata_df , content_article_embeddings)

    # after training, delete all TFRecord files
    delete_all_file_in_path(train_set_path_regex)
    log_elapsed_time(start_train, 'Finalized TRAINING')
    print("<=== END ARC TRAINING ===>")
Example #8
    a = dt_object.strftime('%Y-%m-%d')

    import datetime
    import dateutil.relativedelta

    d = datetime.datetime.strptime(a, "%Y-%m-%d")
    d2 = d - dateutil.relativedelta.relativedelta(days=7)
    #     print(d2)

    from datetime import datetime
    timestamp = datetime.timestamp(d2)
    return int(timestamp)


# REMOVE PICKLE
parameter = load_json_config("./parameter.json")


def remove_acr_pickle(path_file):
    acr_label_encoders, articles_metadata_df, content_article_embeddings_matrix = deserialize(
        path_file)

    def serialize(filename, obj):
        # with open(filename, 'wb') as handle:
        with tf.gfile.Open(filename, 'wb') as handle:
            pickle.dump(obj, handle)

    def merge_two_dicts(x, y):
        return {**x, **y}

    # articles_metadata_df = articles_metadata_df[