Example #1
def test_set_data():
    """
    ensure that the data is properly extracted, and the basic feature names have been extracted
    Returns:

    """
    data_path: str = "data/sample_train/sample_data.tsv"
    all_manga = pd.read_csv(data_path, sep="\t", index_col=0)
    all_manga = all_manga.drop(columns=["level_0"])
    pre_bi = process_bilingual_data.Preprocess_Bilingual()
    pre_bi.set_data(all_manga)
    print(all_manga.columns)
    print(all_manga.shape)

    # expected original size
    assert all_manga.shape == (313, 22)
    assert pre_bi._data_to_process.shape == (157, 43)

    pre_bi_columns = pre_bi._data_to_process.columns.tolist()

    # ensure the added meta features are there
    assert "nn1" in pre_bi_columns, "nearest neighbor feature name does not exist, meta_features may have been modified"

    assert "link" in pre_bi_columns, "link feature name does not exist, data format may have changed"
    # ensure basic features were added
    assert "text_jp" in pre_bi_columns, "text japanese feature not defined after formatting"
    assert "text_en" in pre_bi_columns, "text english feature not defined after formatting"
Example #2
def get_default_object():
    """
    create a process object that is used for handling data for training and prediction
    Returns:
        process_bilingual_data.Preprocess_Bilingual
    """
    data_path: str = "data/sample_train/sample_data.tsv"
    all_manga = pd.read_csv(data_path, sep="\t", index_col=0)
    all_manga = all_manga.drop(columns=["level_0"])
    pre_bi = process_bilingual_data.Preprocess_Bilingual()
    pre_bi.set_data(all_manga)
    return pre_bi
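A hedged usage sketch of the fixture above: it can back the same kind of assertions as Example 1, since both read the same sample file (the expected shape is taken from Example 1):

def test_default_object_shape():
    # build the shared fixture and confirm set_data populated it
    pre_bi = get_default_object()
    assert pre_bi._data_to_process is not None
    # same expectation as Example 1, which reads the same sample_data.tsv
    assert pre_bi._data_to_process.shape == (157, 43)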
Example #3
def main(dir_path: str, save_path: str, max_run_time: int,
         max_run_per_model: int):
    """
    runs through automl sklearn for hyperparameter and model search
    Args:
        data_dir: path to read tsv files
        save_path: path to save output files of automl
        max_run_time: how long to run it overmodel for
        max_run_per_model: max time per model

    Returns:
        None
    """

    import autosklearn.regression
    files: list = os.listdir(dir_path)
    # keep only the extracted selenium tsv files
    file_names: list = [i for i in files if ("selenium.tsv" in i)]
    full_path: list = [os.path.join(dir_path, name) for name in file_names]

    mangas = process_bilingual_data.read_tsv_files(full_path)
    process = process_bilingual_data.Preprocess_Bilingual()
    process.set_data(mangas[0:2000])
    print("GREAT")
    x_pd, y_pd = process.output_all_features_iou()
    x = x_pd.values
    y = y_pd.values
    x_train, x_test, y_train, y_test = sklearn.model_selection.train_test_split(
        x, y, random_state=1)

    automl = autosklearn.regression.AutoSklearnRegressor(
        time_left_for_this_task=max_run_time,
        per_run_time_limit=max_run_per_model,
        tmp_folder=save_path + "_temp",
        output_folder=save_path,
    )
    final_model = automl.fit(x_train,
                             y_train,
                             dataset_name="tsv of manga for iou prediction")

    y_test_pred = final_model.predict(x_test)
    total_score = 0
    for y_pred_el, y_test_el in zip(y_test_pred, y_test):
        total_score += iou_prediction.get_iou_lists(y_pred_el, y_test_el)

    print(total_score / len(y_test))
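A hedged invocation sketch for the script above (the paths are placeholders; both time limits are in seconds, which is what auto-sklearn's time_left_for_this_task and per_run_time_limit expect):

if __name__ == "__main__":
    # placeholder paths; point these at the local tsv directory and output dir
    main(dir_path="data/selenium_tsvs",
         save_path="automl_runs/iou_search",
         max_run_time=3600,       # one hour for the overall search
         max_run_per_model=360)   # six minutes per candidate model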
def main(bucket_dir_path: str, img_directory_path: str, tsv_directory_path: str,
         destination_dir_path: str, destination_file_path: str):
    """
    script to format data for processing for google's automl image detection

    Args:
        bucket_dir_path: the path where the files will be finally stored
        img_directory_path: directory where all images are originally stored
        tsv_directory_path: directory where all tsv describing data is stored
        destination_dir_path: directory to save the newly assigned images
        destination_file_path: file to save formatted csv for automl

    Returns:
        None
    """

    files: list = os.listdir(tsv_directory_path)

    # the selenium files that have been extracted
    file_names: list = [i for i in files if ("selenium.tsv" in i)]

    full_path: list = [os.path.join(tsv_directory_path, name) for name in file_names]

    mangas = process_bilingual_data.read_tsv_files(full_path)
    # bin font sizes into quantiles; not used directly, but necessary for the model
    cuts = pd.qcut(mangas["font-size"], 13, labels=False)
    mangas["font-size"] = cuts

    subgroup = mangas.groupby("manga")

    manga_names = mangas["manga"].unique()
    unique_img_id = 0

    all_pds = []
    type_of_analysis = ["TRAIN", "TRAIN", "VALIDATE",
                        "TEST"]  # 50% train, 25% validate, 25% test
    for manga_name in manga_names:

        relevant_group = subgroup.get_group(manga_name)
        page_numbers = relevant_group["id"].unique()
        unique_pages = relevant_group.groupby("id")
        for page_id in page_numbers:
            path_to_manga_dir = os.path.join(img_directory_path, manga_name)
            path_to_file = os.path.join(path_to_manga_dir,
                                        "{}_jp.png".format(page_id))

            if (os.path.isfile(path_to_file)):
                process = process_bilingual_data.Preprocess_Bilingual()
                current_page_pd = unique_pages.get_group(page_id)
                process.set_data(current_page_pd)

                coords = process.to_box_coords(True)
                fonts = process.extract_font_size()

                if (pd.DataFrame(fonts).shape[0] < 18):  # automl has a cap of 20
                    full_unique_img_id = "{}_jp.png".format(unique_img_id)
                    final_data = process.aggregate_to_pandas((coords, fonts))
                    final_data["id"] = unique_img_id

                    outside_params = final_data[(final_data.x1_jp > 1) |
                                                (final_data.x2_jp > 1) |
                                                (final_data.y1_jp > 1) |
                                                (final_data.y2_jp > 1)]

                    final_data["full_id"] = 'gs://{}/{}'.format(
                        bucket_dir_path, full_unique_img_id)
                    final_data["original_path"] = path_to_file
                    final_data["eval_type"] = random.choice(type_of_analysis)

                    unique_img_id += 1
                    if len(outside_params) == 0:  # no errors
                        all_pds.append(final_data)
                    destination_file = os.path.join(destination_dir_path,
                                                    full_unique_img_id)
                    copyfile(path_to_file, destination_file)  # copy the original page image to its renamed destination

    ordered_data_for_image_processing = pd.concat(all_pds, sort=False)
    print("finished")

    ordered_data_for_image_processing["gap1"] = ""
    ordered_data_for_image_processing["gap2"] = ""
    ordered_data_for_image_processing["gap3"] = ""
    ordered_data_for_image_processing["gap4"] = ""
    ordered_data_for_image_processing[
        "font-size_en"] = ordered_data_for_image_processing[
            "font-size_en"].astype(int)

    formatted_data = ordered_data_for_image_processing[[
        "eval_type", "full_id", "font-size_en", "x1_en", "y1_en", "gap1",
        "gap2", "x2_en", "y2_en", "gap3", "gap4"
    ]]
    formatted_data.to_csv(destination_file_path,
                          index=False,
                          header=False,
                          index_label=False)
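For orientation, the column order written above follows the CSV layout Google AutoML Vision object detection accepts (set, gs:// image URI, label, x_min, y_min, two blank vertex fields, x_max, y_max, two blank vertex fields). One illustrative row, with made-up values, as it would land in destination_file_path:

# illustrative only: one exported row; 7 is the binned font size used as the label
example_row = "TRAIN,gs://my-bucket/0_jp.png,7,0.12,0.08,,,0.34,0.21,,"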
def main(model_type: str,
         dir_path,
         save_path: str = "temp.pkl",
         save_model: bool = False,
         run_ablation: bool = False):
    """
    runs basic model training, and a simple ablation when using non-linear models
    Args:
        model_type: whether the model predicts bounds ("bound") or font size ("font")
        dir_path: the directory containing the underlying training data
        save_path: where the model will be saved
        save_model: whether to save the fitted model
        run_ablation: whether to run an ablation over each feature

    Returns:
        None
    """

    files: list = os.listdir(dir_path)
    # keep only the extracted selenium tsv files
    file_names: list = [i for i in files if ("selenium.tsv" in i)]
    full_path: list = [os.path.join(dir_path, name) for name in file_names]
    print("preparing data")
    mangas = process_bilingual_data.read_tsv_files(full_path)
    process = process_bilingual_data.Preprocess_Bilingual()
    process.set_data(mangas)
    all_data = process.output_all_features()

    print("training")

    if model_type == "bound":
        x_pd, y_pd = process.output_all_features_iou()
        x_names = x_pd.columns.values
        y_names = y_pd.columns.values

        all_data = process.output_all_features()
        prediction_wrapper = iou_prediction.PredictionBoundingTraditional()
        prediction_wrapper.set_data(all_data)
        prediction_wrapper.set_features(x_names, y_names)

    elif model_type == "font":
        x_pd, y_pd = process.output_all_features_font_size()
        x_names = x_pd.columns.values
        y_names = y_pd.columns.values

        prediction_wrapper = traditional_feature_prediction.FeaturePredictionTraditional(
        )
        prediction_wrapper.set_data(all_data)
        prediction_wrapper.set_features(x_names, y_names)

    else:
        raise Exception("no known model type specified")

    # MultiOutputRegressor(GradientBoostingRegressor())
    prediction_wrapper.set_model(LinearRegression())
    print(prediction_wrapper.score_cv())

    if run_ablation:
        for feature in range(len(x_names)):
            print("{} feature removed".format(x_names[feature]))
            temp_x_names = x_names.copy().tolist()
            del temp_x_names[feature]
            prediction_wrapper.set_features(temp_x_names)
            print(prediction_wrapper.score_cv())

    if save_model:

        prediction_wrapper.fit(prediction_wrapper._x,
                               prediction_wrapper._y,
                               preprocess=True)
        traditional_feature_prediction.save(prediction_wrapper, save_path)
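The comment above the set_model call mentions a non-linear alternative; a hedged sketch of swapping it in, assuming set_model accepts any sklearn-style regressor just as it accepts LinearRegression():

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.multioutput import MultiOutputRegressor

# non-linear alternative referenced in the comment; per the docstring,
# the ablation loop is mainly meaningful with models like this one
prediction_wrapper.set_model(MultiOutputRegressor(GradientBoostingRegressor()))
print(prediction_wrapper.score_cv())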