def coalesce_file(input_file_path, output_file_path, append=False):
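    """Concatenate all part files (p*) under input_file_path into
    output_file_path via `cat`, appending when append is True, then delete
    input_file_path."""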

    if append:
        append_string = ">>"
    else:
        append_string = ">"

    os.system("cat " + input_file_path + "/p* " + append_string +
              output_file_path)
    const.delete_file(input_file_path)


def coalesce_folder(output_file_path_with_prefix, append=False):
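    """For each partition sub-folder ("_1=<key>") produced by a partitioned
    Spark CSV write under output_file_path_with_prefix, concatenate its part
    files into "<output_file_path_with_prefix><key>.txt", delete the partition
    folder, and finally delete the write directory itself."""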

    for folder in os.listdir(output_file_path_with_prefix):

        if folder.startswith("_1"):

            folder_split = folder.split("=")

            if len(folder_split) != 2:
                raise ValueError("Folder label is not in proper format!")

            key = folder_split[1]

            if append:
                append_string = ">>"
            else:
                const.delete_file(output_file_path_with_prefix + str(key) +
                                  ".txt")
                append_string = ">"
            command = "cat " + output_file_path_with_prefix + "/" + folder + \
                      "/p* " + append_string + output_file_path_with_prefix + str(key) + ".txt"

            os.system(command)
            const.delete_file(output_file_path_with_prefix + "/" + folder)

    const.delete_file(output_file_path_with_prefix)


def run_cqi_predict(arg_value):
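    """Run CQI prediction over the .avro files in the input directory with a
    multiprocessing pool. With arg_value == "resume", only inputs without an
    existing output file are processed; otherwise the output directory is
    recreated first. A _SUCCESS marker is touched once every file is done."""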

    logger.info("cqi predict started...")
    # create a directory to store the result

    if arg_value != "resume":
        const.delete_file(const.get_cqi_output_file_path())
        os.makedirs(const.get_cqi_output_file_path(), exist_ok=True)

    # select n avro files
    input_directory = os.fsencode(const.get_cqi_input_file_path())
    input_file_names = [os.fsdecode(s) for s in os.listdir(input_directory)]
    input_file_names = list(
        filter(lambda x: x.endswith(".avro"), input_file_names))

    output_directory = os.fsencode(const.get_cqi_output_file_path())
    output_file_names = [os.fsdecode(s) for s in os.listdir(output_directory)]
    output_file_names = list(
        filter(lambda x: x.endswith(".avro"), output_file_names))

    set_input_file_name = set(input_file_names)
    set_output_file_name = set(output_file_names)

    if arg_value == "resume":
        final_file_name = set_input_file_name - set_output_file_name
    else:
        final_file_name = set_input_file_name

    # multi-processing
    thread_num = 10
    pool = Pool(processes=thread_num, maxtasksperchild=1)
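    # zip() wraps each file name in a 1-tuple so that starmap() passes it as
    # the single positional argument of create_cqi_output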
    pool.starmap(create_cqi_output, zip(final_file_name))

    pool.close()
    pool.join()

    os.system('touch ' + const.get_cqi_output_file_path() + '_SUCCESS')

    logger.info("cqi predict complete!")


def train(model_path, level=1, loss_function='softmax', resume="resume"):
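    """Train one fastText classifier per category label at the given level.
    Epochs are swept from a level-dependent starting point up to 300, keeping
    the model with the best validation precision and stopping early once
    patience runs out. In resume mode, labels that already have a saved model
    are skipped."""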
    # model = FastText()

    logger.info('Training LEVEL' + str(level) + ' model started')

    category_dict_train = dict()
    category_dict_test = dict()
    summary_logs = list()

    if resume != "resume":
        logger.info('Non-resume training...delete model directory')
        const.delete_file(
            const.get_model_file_path(model_path) + "lv" + str(level) + "/")
    else:
        logger.info('Resume training enabled')

    if level == 1:
        category_dict_train['ROOT'] = 1
        category_dict_test['ROOT'] = 1
    else:
        label_index = 2 * level - 2

        category_dict_train = get_category_dict(
            const.get_raw_train_file_name(), label_index, category_dict_train)
        category_dict_test = get_category_dict(
            const.get_raw_validate_file_name(), label_index,
            category_dict_test)

    for label in category_dict_train:

        if resume == "resume":
            file_exist = const.get_model_file_exist(model_path,
                                                    level=level,
                                                    cate=label)
            if file_exist:
                logger.info("There is already a model for LV" + str(level) +
                            " " + str(label) + ", skipping training...")
                continue

        word_n_gram = 3
        learning_rate = 0.1
        current_best_name = ''
        current_best_score = 0.0
        patience = 0
        lf = loss_function
        max_patience = const.TRAINING_MAX_PATIENCE
        epoch_start = 50

        if label == const.CATE_INT_BOOK or label == '34405' or label == '93185':  # domestic/foreign books
            bucket = 3000000
            epoch_start = 50
            max_patience = 0
            lf = 'hs'
        else:

            if level == 1:
                bucket = 2000000
                epoch_start = 50
                learning_rate = 0.5
            elif level == 2:
                bucket = 1000000
                epoch_start = 200
                learning_rate = 0.1
            else:
                epoch_start = 300
                learning_rate = 0.1
                train_file_size = os.path.getsize(
                    const.get_processed_train_sub_file_name(label, level))
                if train_file_size > 100000000:  # 100MB
                    bucket = 3000000
                elif train_file_size > 40000000:  # 40MB
                    bucket = 2000000
                elif train_file_size > 1000000:  # 1MB
                    bucket = 100000
                else:
                    bucket = 10000

        for epoch in range(epoch_start, 301):

            start_date = const.get_time()

            model_file_name = const.get_model_file_name(model_path,
                                                        level=level,
                                                        epoch=epoch,
                                                        lr=learning_rate,
                                                        cate=label)

            classifier = fasttext.supervised(
                input_file=const.get_processed_train_sub_file_name(
                    label, level),
                output=model_file_name,
                lr=learning_rate,
                epoch=epoch,
                loss=lf,
                word_ngrams=word_n_gram,
                thread=const.TRAINING_THREAD,
                silent=0,
                encoding='utf-8',
                ws=5,
                dim=50,
                bucket=bucket)

            if label not in category_dict_test:
                logger.warning("There is no testing data for level" +
                               str(level) + " catecode:" + label)
                break

            result = classifier.test(
                const.get_processed_test_sub_file_name(label, level))
            end_date = const.get_time()

            result_log = ("LV" + str(level) + " " + str(label) +
                          ': precision:' + str(round(result.precision, 4)) +
                          '(size:' + str(result.nexamples) + ', labels:' +
                          str(len(classifier.labels)) + ') ep:' + str(epoch) +
                          ', lr:' + str(learning_rate) + ', n-gram:' +
                          str(word_n_gram) + ', duration:' +
                          str(end_date - start_date))

            if current_best_score < result.precision:
                current_best_score = result.precision
                logger.info(result_log + ' Model improved!!!!')
                if current_best_name != '':
                    os.remove(current_best_name)
                current_best_name = model_file_name + '.bin'
                patience = 0
            else:
                logger.info(result_log)
                os.remove(model_file_name + '.bin')
                patience = patience + 1

            # early stopping: stop this category once precision has not
            # improved for max_patience consecutive epochs
            if patience >= max_patience:
                summary_log = "LV" + str(level) + " " + str(label) + ' Patience exceeded ' + \
                              str(max_patience) + ', best score is:' + \
                              str(round(current_best_score, 4)) + '(size:' + str(result.nexamples) + \
                              ', labels:' + str(len(classifier.labels)) + ')'

                logger.info(summary_log)
                summary_logs.append(summary_log)

                break

            sys.stdout.flush()

    logger.info('Training LEVEL' + str(level) + ' model completed!')
    logger.info('=============================')
    for log in summary_logs:
        logger.info(log)
    logger.info('=============================')


def pre_process(input_file, output_file_path, output_file_prefix):
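    """Split the raw tab-separated input into fastText training data for
    levels 1-3 with PySpark: each level is written as a CSV partitioned by its
    parent category key, and every partition's part files are then coalesced
    into a single text file per key."""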

    logger.info('Pre-processing started for file:' + input_file)

    findspark.init()
    sc = None
    try:
        sc = pyspark.SparkContext(appName=__name__)
        spark = SparkSession(sc)

        const.delete_file(output_file_path)

        raw_data = sc.textFile(input_file)
        processed_data = raw_data.map(format_data_line)
        standard_data = processed_data.map(lambda row: row.split("\t"))
        standard_data.persist(pyspark.StorageLevel.DISK_ONLY)
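        # each row is [lv1 code, lv2 code, lv3 code, text]; the label for a
        # level concatenates the category codes down to that level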

        lv1_data = standard_data.map(lambda row: ('ROOT', (
            const.MODEL_LABEL_KEY + row[0] + ' ' + row[3])))

        lv1_data.persist(pyspark.StorageLevel.DISK_ONLY)
        lv1_df = lv1_data.toDF()
        lv1_output_file_prefix = output_file_path + "lv1/" + output_file_prefix + "lv1_"
        lv1_df.write.partitionBy("_1").csv(lv1_output_file_prefix)
        util.coalesce_folder(lv1_output_file_prefix, append=False)
        """
        if dropout:
            lv1_dropout_data = lv1_data.map(lambda row: (row[0], feature.dropout(row[1])))
            lv1_dropout_df = lv1_dropout_data.toDF()
            lv1_dropout_df.write.partitionBy("_1").csv(lv1_output_file_prefix)
            coalesce_file(lv1_output_file_prefix, append=True)
        """
        lv1_data.unpersist()

        lv2_data = standard_data.map(lambda row: (row[0], (
            const.MODEL_LABEL_KEY + row[0] + '_' + row[1] + ' ' + row[3])))

        lv2_data.persist(pyspark.StorageLevel.DISK_ONLY)
        lv2_df = lv2_data.toDF()
        lv2_output_file_prefix = output_file_path + "lv2/" + output_file_prefix + "lv2_"
        lv2_df.write.partitionBy("_1").csv(lv2_output_file_prefix)
        util.coalesce_folder(lv2_output_file_prefix, append=False)
        """
        if dropout:
            lv2_dropout_data = lv2_data.map(lambda row: (row[0], feature.dropout(row[1])))
            lv2_dropout_df = lv2_dropout_data.toDF()
            lv2_dropout_df.write.partitionBy("_1").csv(lv2_output_file_prefix)
            coalesce_file(lv2_output_file_prefix, append=True)
        """
        lv2_data.unpersist()

        lv3_data = standard_data.map(
            lambda row: (row[1], (const.MODEL_LABEL_KEY + row[0] + '_' + row[1]
                                  + '_' + row[2] + ' ' + row[3])))

        lv3_data.persist(pyspark.StorageLevel.DISK_ONLY)
        lv3_df = lv3_data.toDF()
        lv3_output_file_prefix = output_file_path + "lv3/" + output_file_prefix + "lv3_"
        lv3_df.write.partitionBy("_1").csv(lv3_output_file_prefix)
        util.coalesce_folder(lv3_output_file_prefix, append=False)
        """
        if dropout:
            lv3_dropout_data = lv3_data.map(lambda row: (row[0], feature.dropout(row[1])))
            lv3_dropout_df = lv3_dropout_data.toDF()
            lv3_dropout_df.write.partitionBy("_1").csv(lv3_output_file_prefix)
            coalesce_file(lv3_output_file_prefix, append=True)
        """
        lv3_data.unpersist()

        standard_data.unpersist()
        spark.stop()

    finally:
        if sc is not None:
            sc.stop()

    logger.info('Pre-processing completed for file:' + input_file)