Example #1
0
def pct_helper(data, sample_type, i, decimals, verbose):
    """
    Load or compute the abusive-content percentages for one dataset, then print + export them.

    If a CSV for this sample/index already exists under `output/stats/percent_abusive`
    it is read back with pandas; otherwise `calc_pct_abusive` is called. In both cases
    the resulting table is printed and passed to `export_df`.

    data:               dataset forwarded to `calc_pct_abusive` when no CSV exists yet
    sample_type (str):  sample type name; lower-cased when building the file name
    i:                  index of the dataset within its sample type (part of the file name)
    decimals:           forwarded to `calc_pct_abusive`
    verbose (bool):     toggles the progress messages (the table itself is always printed)
    """
    out_dir = "output/stats/percent_abusive"
    cached_csv = path.join(out_dir, f"percent.{sample_type.lower()}{i}.CSV")

    if not path.exists(cached_csv):
        # nothing stored yet -> calculate from scratch
        if verbose:
            print(f"\nCalculating {sample_type}-sample abusive content percentages...")
        pct = calc_pct_abusive(data, decimals, verbose)
        if verbose:
            print(f"Percentages calculated!")
    else:
        # reuse the previously exported result
        if verbose:
            print(f"\nImporting {sample_type}-sample abusive content percentages...")
        pct = pd.read_csv(cached_csv)
        if verbose:
            print(f"Percentages Imported!")

    print(f"{pct}")
    export_df(pct, sample_type.lower(), i, folder=out_dir, prefix="percent", index=False)
Example #2
0
def bin_data(data_with_preds, sample_type, i, analyzer, ngram_range):
    """
    Report classifier performance separately for the explicit and implicit bins.

    The stored predictions for this sample/index are read from `output_toxic/pred`
    and written into a new "pred" column of `data_with_preds` (the frame passed in
    is modified). `bin_data_helper` then splits the rows into two frames, and a
    classification report is built, printed and exported for each of them.

    data_with_preds:    dataframe with a "class" column; receives a "pred" column
    sample_type (str):  sample type name, used in file names and report headers
    i:                  index of the dataset within its sample type
    analyzer:           only used in the printed report header
    ngram_range:        only used in the printed report header
    """
    # add preds as new column
    pred_csv = path.join("output_toxic/pred", f"pred.{sample_type.lower()}{i}.CSV")
    data_with_preds["pred"] = pd.read_csv(pred_csv)

    # split data into explicitly abusive and implicitly abusive
    explicit_data, implicit_data = bin_data_helper(data_with_preds)

    # store data as (label, truth, prediction) vectors
    bins = [
        ("explicit", explicit_data["class"], explicit_data["pred"]),
        ("implicit", implicit_data["class"], implicit_data["pred"]),
    ]

    # build every report before anything is printed or exported
    reports = []
    for label, y_true, y_pred in bins:
        scores = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
        reports.append((label, pd.DataFrame(scores).transpose()))

    # print + export
    for label, report in reports:
        print(
            f"\nClassification Report[{sample_type}.{label}, {analyzer}, ngram_range{ngram_range}]:\n{report}\n"
        )
        export_df(report,
                  sample_type,
                  f"{i}.{label}",
                  folder="output_toxic/report/binning",
                  prefix="report")
Example #3
0
def fit_data(rebuild, samples, analyzer, ngram_range, manual_boost, per_sample, verbose, sample_size, calc_pct, decimals):
    """
    Fit + evaluate a CountVectorizer/SVM pipeline on every dataset of the selected sample types.

    For each dataset: predictions are obtained through `pred_helper`, the abusive-content
    percentages are optionally computed, a classification report is printed + exported, and
    the explicit/implicit binning report is produced via `bin_data`. Once all datasets of a
    sample type are processed, their reports are averaged and the average is exported too.

    rebuild (bool):     if TRUE, rebuild the sampled datasets and the lexicons before fitting
    samples (str):      three modes: "random", "boosted", or "all"
    analyzer (str):     either "word" or "char". for CountVectorizer
    ngram_range ((int,int)):    tuple containing lower and upper ngram bounds for CountVectorizer
    manual_boost ([str]):       use given list of strings for filtering instead of built-in wordbanks. Or pass `None`
    per_sample (int):           controls the number of datasets built per sample type (if `rebuild` is TRUE)
    verbose (boolean):  toggles print statements
    sample_size (int):  size of sampled datasets. If set too high, the smaller size will be used
    calc_pct (bool):    if TRUE, calculate percentage of abusive words in each sample
    decimals (int):     number of decimals to round percentages to
    """

    # rebuild datasets
    if rebuild:
        build_datasets(samples, manual_boost, per_sample, sample_size, verbose)
        build_lexicons()

    # struct example: [([random1, random2, ..., random_n], "random"), ...]
    all_data = [(import_data(name, per_sample), name) for name in ("random", "topic", "wordbank")]

    # choose one or the other sample type if desired.
    # Strings are compared with `==` (identity via `is` is not guaranteed for equal strings),
    # and slices are used so `all_data` stays a list of (datasets, name) tuples.
    if samples == "random":
        all_data = all_data[:1]
    elif samples == "boosted":
        # NOTE(review): this keeps only "topic"; if "boosted" is meant to cover
        # "wordbank" as well, the slice should be [1:] -- confirm intended behavior.
        all_data = all_data[1:2]

    # cross-validation folds (same for every dataset)
    k = 5

    for datasets, sample_name in all_data:  # for each sample type...
        sample_type = sample_name.lower()  # e.g. "random", "topic", "wordbank"
        reports_to_avg = []  # list of reports to soon be averaged

        for i, dataset in enumerate(datasets, start=1):  # for each set...
            data = pd.DataFrame(dataset)
            if verbose:
                print(f"===== {sample_type.capitalize()}-sample: pass {i} =====")

            # store data as vectors
            X = data["comment_text"]
            y = data["class"]

            # model pipeline
            if verbose:
                print("Instantiating model pipeline (CV & SVM)...")
            # honor the caller's analyzer instead of always tokenizing by word
            vec = CountVectorizer(analyzer=analyzer, ngram_range=ngram_range)
            svc = SVC(C=1000, kernel="rbf", gamma=0.001)  # GridSearch best params
            clf = Pipeline([('vect', vec), ('svm', svc)])

            # calculate + export predictions
            y_pred = pred_helper(X, y, clf, k, sample_type, i, verbose)

            # calculate % abusive
            if calc_pct:
                pct_helper(data, sample_type, i, decimals, verbose)

            # report results + export
            report = pd.DataFrame(classification_report(y, y_pred, output_dict=True)).transpose()
            # report = round_report_df(report_to_percentage(report), decimals)  # convert precision + recall columns to percentages + round

            print(f"\nClassification Report[{sample_type}, {analyzer}, ngram_range{ngram_range}]:\n{report}\n")
            export_df(report, sample_type, i, folder="output/report", prefix="report")
            reports_to_avg.append(report)

            bin_data(data, sample_type, i, analyzer, ngram_range)

        # pd.concat raises on an empty list, so there is nothing to average without reports
        if not reports_to_avg:
            if verbose:
                print(f"===== {sample_type}-sample: no datasets found, skipping average =====")
            continue

        # average all reports of the same sample type (e.g. random1, random2, random3)
        if verbose:
            print(f"===== {sample_type}-sample: Average of {len(reports_to_avg)} =====")
        averaged = pd.concat(reports_to_avg).groupby(level=0).mean()  # given a list of dataframes, average their values
        # averaged = round_report_df(averaged, decimals)  # convert precision + recall columns to percentages + round
        export_df(averaged, sample_type, i=".avg", folder="output/report", prefix="report")  # export the averaged report
        print(f"\nClassification Report[{sample_type}, {analyzer}, ngram_range{ngram_range}]:\n{averaged}\n")