def embed():
    print("Embedding transcripts")
    data = fm.get_df("0_parsed")
    # Average fastText word vectors into one fixed-length vector per parsed line
    sentence_embedding = sister.MeanEmbedding(lang="en")
    embedded = data["parsed"]["line"].apply(sentence_embedding)
    # Expand each embedding vector into its own columns under the "embedded" top-level key
    d = {"embedded": pd.DataFrame.from_records(embedded, index=embedded.index)}
    embedded = data.join(pd.concat(d, axis=1))

    fm.write_df(embedded, "1_embedded_fasttext")
    return embedded
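A minimal sketch of what the sister embedder above returns, assuming the sister package and its English fastText model are available; each call yields one fixed-length numpy vector, which is why from_records produces one column per embedding dimension:

import sister

# MeanEmbedding averages the fastText word vectors of a sentence into a single vector.
embedder = sister.MeanEmbedding(lang="en")
vector = embedder("this is a single line of dialogue")
print(vector.shape)  # e.g. (300,) for the pretrained English fastText vectors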
Example #2
def parse(correct_spelling=False,
          stemming=False,
          remove_stopwords=False,
          expand_contractions=True):
    print("Parsing episodes")

    episode_array = fm.get_transcripts()

    lines = []
    for episode in episode_array:
        lines.extend(
            parse_episode(episode, correct_spelling, stemming,
                          remove_stopwords, expand_contractions))

    df = pd.DataFrame(
        lines, columns=["character", "line", "wordcount", "stopwordcount"])
    # Keep only the first occurrence of each ["character", "line"] combination
    # Then drop every ["line"] that is still duplicated, since that would mean
    # multiple characters say the same sentence
    df = df[~df.duplicated(subset=["character", "line"])]
    df = df[~df.duplicated(subset=["line"], keep=False)]
    ml = {'parsed': df}
    ml_df = pd.concat(ml, axis=1).reindex()
    fm.write_df(ml_df, "0_parsed")
    return ml_df
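A toy illustration of the two-step de-duplication above, using hypothetical data (the names and lines are made up for the example):

import pandas as pd

toy = pd.DataFrame({
    "character": ["Alice", "Alice", "Bob"],
    "line": ["hello there", "hello there", "hello there"],
})
# Step 1: keep the first occurrence of each (character, line) pair -> the second Alice row is dropped.
step1 = toy[~toy.duplicated(subset=["character", "line"])]
# Step 2: drop every line still said by more than one character -> both remaining rows go.
step2 = step1[~step1.duplicated(subset=["line"], keep=False)]
print(step2)  # empty, because "hello there" is shared by Alice and Bob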
Example #3
def benchmark_change_data(train_or_test="test",
                          random=False,
                          grid=False,
                          min=2,
                          max=30):
    print("Benchmarking using new method from " + str(min) + " to " + str(max))

    if not grid:
        dictionary = {
            "accuracy": [],
            "cross_entropy": [],
            "predict_proba_predicted_character": [],
        }
    else:
        dictionary = {
            "accuracy": [],
            "cross_entropy": [],
            "predict_proba_predicted_character": [],
            "C": [],
            "max_iter": []
        }
    fasttext_dict = deepcopy(dictionary)
    tfidf_dict = deepcopy(dictionary)

    data = src.file_manager.get_df("1_embedded_fasttext")

    train, test = train_test_split(data, random_state=1515, train_size=0.8)

    # Count how many test/train rows exceed each wordcount threshold, so the random
    # baseline below can sample subsets of matching size
    test_count = {}
    train_count = {}
    for i in range(min, max):
        test_count.update({
            i:
            test[test["parsed"]["wordcount"] > i].count()["parsed"]
            ["wordcount"]
        })
        train_count.update({
            i:
            train[train["parsed"]["wordcount"] > i].count()["parsed"]
            ["wordcount"]
        })

    # If the training data shrinks we must refit each iteration; if only the test
    # set shrinks, the data needs to be classified just once up front
    if train_or_test == "test":
        classified_data, params = src.classify(technique="fasttext",
                                               train_data=train,
                                               test_data=test,
                                               unique=False,
                                               C=10.0,
                                               max_iter=200,
                                               write=False)
    for min_wordcount in range(min, max):
        print(min_wordcount)
        if random:
            if train_or_test == "test":
                classified_data = classified_data.sample(
                    n=test_count.get(min_wordcount))
            else:
                train = train.sample(n=train_count.get(min_wordcount))
                classified_data, params = src.classify(technique="fasttext",
                                                       train_data=train,
                                                       test_data=test,
                                                       unique=False,
                                                       grid=grid,
                                                       write=False)
        else:
            if train_or_test == "test":
                classified_data = classified_data[
                    classified_data["parsed"]["wordcount"] >= min_wordcount]
            else:
                train = train[train["parsed"]["wordcount"] >= min_wordcount]
                classified_data, params = src.classify(technique="fasttext",
                                                       train_data=train,
                                                       test_data=test,
                                                       unique=False,
                                                       grid=grid,
                                                       write=False)
        labels = classified_data["predict_proba_"].columns
        fasttext_dict.get("accuracy").append(
            accuracy_score(classified_data["parsed"]["character"],
                           classified_data["classified"]["character"]))
        fasttext_dict.get("cross_entropy").append(
            log_loss(classified_data["parsed"]["character"],
                     classified_data["predict_proba_"],
                     labels=labels))
        fasttext_dict.get("predict_proba_predicted_character").append(
            classified_data["predict_proba_specific"]
            ["predicted_character"].mean())
        if grid:
            fasttext_dict.get("C").append(params.get("C"))
            fasttext_dict.get("max_iter").append(params.get("max_iter"))
        print(fasttext_dict)

    data = src.file_manager.get_df("0_parsed")

    train, test = train_test_split(data, random_state=1515, train_size=0.8)

    wordcount_range = range(min, max)
    if train_or_test == "test":
        classified_data, params = src.classify(technique="tfidf",
                                               train_data=train,
                                               test_data=test,
                                               unique=False,
                                               C=1.0,
                                               max_iter=500,
                                               write=False)
    for min_wordcount in wordcount_range:
        print(min_wordcount)
        if random:
            if train_or_test == "test":
                classified_data = classified_data.sample(
                    n=test_count.get(min_wordcount))
            else:
                train = train.sample(n=train_count.get(min_wordcount))
                classified_data, params = src.classify(technique="tfidf",
                                                       train_data=train,
                                                       test_data=test,
                                                       unique=False,
                                                       grid=grid,
                                                       write=False)
        else:
            if train_or_test == "test":
                classified_data = classified_data[
                    classified_data["parsed"]["wordcount"] >= min_wordcount]
            else:
                train = train[train["parsed"]["wordcount"] >= min_wordcount]
                classified_data, params = src.classify(technique="tfidf",
                                                       train_data=train,
                                                       test_data=test,
                                                       unique=False,
                                                       grid=grid,
                                                       write=False)
        labels = classified_data["predict_proba_"].columns
        tfidf_dict.get("accuracy").append(
            accuracy_score(classified_data["parsed"]["character"],
                           classified_data["classified"]["character"]))
        tfidf_dict.get("cross_entropy").append(
            log_loss(classified_data["parsed"]["character"],
                     classified_data["predict_proba_"],
                     labels=labels))
        tfidf_dict.get("predict_proba_predicted_character").append(
            classified_data["predict_proba_specific"]
            ["predicted_character"].mean())
        if grid:
            tfidf_dict.get("C").append(params.get("C"))
            tfidf_dict.get("max_iter").append(params.get("max_iter"))
        print(tfidf_dict)

    print(wordcount_range)
    print()

    tfidf_df = pd.concat(
        [pd.Series(v, index=wordcount_range) for k, v in tfidf_dict.items()],
        keys=[k for k, v in tfidf_dict.items()],
        axis=1)
    fasttext_df = pd.concat(
        [pd.Series(v, index=wordcount_range) for k, v in fasttext_dict.items()],
        keys=[k for k, v in fasttext_dict.items()],
        axis=1)
    d = {"tfidf": tfidf_df, "fasttext": fasttext_df}

    df = pd.concat(d, axis=1)
    fm.write_df(
        df, "4_benchmark_change_testing_data_" + train_or_test +
        ("_random" if random else ""))
    return df
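A hedged usage sketch for the benchmark above (the call signature is taken from the function itself; the comparison at the end is only an illustration):

# Run the benchmark on the test split and compare the two techniques per wordcount threshold.
df = benchmark_change_data(train_or_test="test", random=False, min=2, max=30)
# Columns form a (technique, metric) MultiIndex, indexed by the minimum wordcount.
print(df["fasttext"]["accuracy"] - df["tfidf"]["accuracy"])
Example #4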
def classify(ngrams=None,
             technique="tfidf",
             multi_class="multinomial",
             train_data=None,
             test_data=None,
             grid=False,
             C=None,
             max_iter=None,
             cv=None,
             min_wordcount=None,
             verbose=0,
             unique=False,
             write=True):
    if technique not in sim_types:
        raise ValueError("Invalid classification type " + technique +
                         ". Expected one of: %s" % sim_types)
    print("Classifying lines using " + technique)
    if ngrams is None:
        if technique == "tfidf":
            print("TF-IDF: data obtained from 0_parsed")
            data = fm.get_df("0_parsed", unique=unique)
        else:
            print("fastText: data obtained from 1_embedded_fasttext")
            data = fm.get_df("1_embedded_" + technique, unique=unique)
            if data is None:
                data = embed.embed_transcripts(type=technique)
    else:
        data = fm.get_df("0_parsed_n_grams")

    if train_data is not None and test_data is not None:
        print("Test and train data provided, using those instead")
        train = train_data
        test = test_data
    else:
        train, test = train_test_split(data, random_state=1515, train_size=0.8)

    y_train = train["parsed"]["character"]
    y_test = test["parsed"]["character"]

    if technique == "tfidf":
        tfidf = TfidfVectorizer()
        if ngrams is None:
            x_train = tfidf.fit_transform(train["parsed"]["line"])
            x_test = tfidf.transform(test["parsed"]["line"])
        else:
            x_train = tfidf.fit_transform(train["ngrams"][str(ngrams)])
            x_test = tfidf.transform(test["ngrams"][str(ngrams)])
    else:
        x_train = train["embedded"]
        x_test = test["embedded"]

    if grid:
        params = {
            "C": np.logspace(-5, 5, 11),
            'max_iter': [250, 500, 750, 1000]
        }
        if C:
            params["C"] = C if isinstance(C, list) else [C]
        if max_iter:
            params["max_iter"] = max_iter if isinstance(max_iter, list) else [max_iter]
        lg = GridSearchCV(LogisticRegression(),
                          params,
                          verbose=verbose,
                          n_jobs=-1,
                          cv=cv)
    else:
        lg = LogisticRegression(C=C if C else 1,
                                max_iter=max_iter if max_iter else 500)

    lg.fit(x_train, y_train)

    if grid:
        best = lg.best_params_
        print("Best parameters: ", best)
    else:
        best = None

    predict_proba_df = pd.DataFrame(lg.predict_proba(x_test),
                                    columns=lg.classes_,
                                    index=y_test.index)
    decision_function_df = pd.DataFrame(lg.decision_function(x_test),
                                        columns=lg.classes_,
                                        index=y_test.index)

    predict_df = pd.DataFrame(lg.predict(x_test),
                              columns=["character"],
                              index=y_test.index)

    # Probability the model assigned to the character who actually said the line
    predict_proba_character_series = pd.concat([y_test, predict_proba_df], axis=1)\
        .apply(lambda x: x[x["character"]], axis=1)
    # Probability the model assigned to the character it predicted
    predict_proba_predicted_series = pd.concat([predict_df, predict_proba_df], axis=1)\
        .apply(lambda x: x[x["character"]], axis=1)
    predict_proba_specific_df = pd.concat(
        [predict_proba_character_series, predict_proba_predicted_series],
        keys=["actual_character", "predicted_character"],
        axis=1)

    decision_function_character_series = pd.concat([y_test, decision_function_df], axis=1)\
        .apply(lambda x: x[x["character"]], axis=1)
    decision_function_predicted_series = pd.concat([predict_df, decision_function_df], axis=1) \
        .apply(lambda x: x[x["character"]], axis=1)
    decision_function_specific_df = pd.concat(
        [decision_function_character_series, decision_function_predicted_series],
        keys=["actual_character", "predicted_character"],
        axis=1)

    is_correct_df = predict_df["character"].eq(y_test).to_frame(
        name="is_correct")

    # confidence = pd.DataFrame(lg.decision_function(x_test), columns=["confidence"],index=y_test.index)

    d = {
        "parsed": test["parsed"],
        "classified": pd.concat([predict_df, is_correct_df], axis=1),
        "predict_proba_specific": predict_proba_specific_df,
        "decision_function_specific": decision_function_specific_df,
        "predict_proba_": predict_proba_df,
        "decision_function": decision_function_df
    }
    data = pd.concat(d, axis=1)
    if write:
        if ngrams is None:
            fm.write_df(data, "2_classified_" + technique)
        else:
            fm.write_df(data,
                        "2_classified_" + technique + "_ngrams_" + str(ngrams))
    return data, best
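A hedged usage sketch for classify() above; the column keys come from the dictionary d that builds the returned frame:

from sklearn.metrics import accuracy_score

# classify() returns the classified test frame plus the best grid parameters (None when grid=False).
data, best = classify(technique="tfidf", C=1.0, max_iter=500, write=False)
print(accuracy_score(data["parsed"]["character"], data["classified"]["character"]))
print(data["predict_proba_specific"]["predicted_character"].mean())  # mean confidence in the predicted character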
Example #5
def classify(technique="tfidf",
             train_data=None,
             test_data=None,
             C=None,
             max_iter=None,
             write=False):
    if technique not in sim_types:
        raise ValueError("Invalid classification type " + technique +
                         ". Expected one of: %s" % sim_types)
    print("Classifying lines using " + technique)
    if technique == "tfidf":
        data = fm.get_df("0_parsed")
    else:
        data = fm.get_df("1_embedded_fasttext")

    if train_data is not None and test_data is not None:
        print("Test and train data provided, using those instead")
        print("Test size: ", test_data.shape)
        print("Train size: ", train_data.shape)
        train = train_data
        test = test_data
    else:
        train, test = train_test_split(data, random_state=1515, train_size=0.8)

    y_train = train["parsed"]["character"]
    y_test = test["parsed"]["character"]

    if technique == "tfidf":
        tfidf = TfidfVectorizer()
        x_train = tfidf.fit_transform(train["parsed"]["line"])
        x_test = tfidf.transform(test["parsed"]["line"])
    else:
        x_train = train["embedded"]
        x_test = test["embedded"]

    lg = LogisticRegression(C=C, max_iter=max_iter)

    lg.fit(x_train, y_train)

    predict_proba_df = pd.DataFrame(lg.predict_proba(x_test),
                                    columns=lg.classes_,
                                    index=y_test.index)
    predict_df = pd.DataFrame(lg.predict(x_test),
                              columns=["character"],
                              index=y_test.index)

    predict_proba_character_series = pd.concat([y_test, predict_proba_df], axis=1)\
        .apply(lambda x: x[x["character"]], axis=1)
    predict_proba_predicted_series = pd.concat([predict_df, predict_proba_df], axis=1)\
        .apply(lambda x: x[x["character"]], axis=1)
    predict_proba_specific_df = pd.concat(
        [predict_proba_character_series, predict_proba_predicted_series],
        keys=["actual_character", "predicted_character"],
        axis=1)

    is_correct_df = predict_df["character"].eq(y_test).to_frame(
        name="is_correct")

    d = {
        "parsed": test["parsed"],
        "classified": pd.concat([predict_df, is_correct_df], axis=1),
        "predict_proba_specific": predict_proba_specific_df,
        "predict_proba_": predict_proba_df
    }
    data = pd.concat(d, axis=1)

    if write:
        fm.write_df(data, "2_classified_" + technique)

    return data, lg
Example #6
def benchmark(train_or_test="test", random=False, min=2, max=30, folds=5):
    print("Benchmarking " + train_or_test + " data from " + str(min) + " to " +
          str(max))

    dictionary = {
        "accuracy": [],
        "accuracy_std": [],
        "predict_proba": [],
        "predict_proba_std": [],
        "cross_entropy_loss": [],
        "cross_entropy_loss_std": []
    }
    techniques = {
        'fasttext': deepcopy(dictionary),
        'tfidf': deepcopy(dictionary)
    }

    data = src.file_manager.get_df("1_embedded_fasttext")
    wordcount = src.file_manager.get_df("details_min_wordcount")
    hyperparams = src.file_manager.get_df(
        "4_benchmark_change_testing_data_train")

    for cur_technique in techniques.keys():
        train, test = train_test_split(data, random_state=1515, train_size=0.8)
        for min_wordcount in range(min, max):
            print("Min wordcount: ", min_wordcount)
            data_size = wordcount["wordcount"][train_or_test].get(
                min_wordcount)

            if train_or_test == "test":
                test = test[test["parsed"]["wordcount"] >= min_wordcount]
                # The training data is unchanged in this mode, so reuse the
                # hyperparameters found for the base threshold
                C = hyperparams[cur_technique]["C"].get(min)
                max_iter = hyperparams[cur_technique]["max_iter"].get(min)
            if train_or_test == "train":
                train = train[train["parsed"]["wordcount"] >= min_wordcount]
                C = hyperparams[cur_technique]["C"].get(min_wordcount)
                max_iter = hyperparams[cur_technique]["max_iter"].get(
                    min_wordcount)

            accuracies = []
            predict_probas = []
            cross_entropy_losses = []

            for cur_fold in range(0, folds):

                classified, lg = src.classify_std.classify(
                    technique=cur_technique,
                    train_data=train,
                    test_data=test,
                    C=C,
                    max_iter=max_iter)

                accuracies.append(
                    accuracy_score(classified["parsed"]["character"],
                                   classified["classified"]["character"]))
                cross_entropy_losses.append(
                    log_loss(classified["parsed"]["character"],
                             classified["predict_proba_"],
                             labels=classified["predict_proba_"].columns))
                predict_probas.append(classified["predict_proba_specific"]
                                      ["predicted_character"].mean())

            cur_details = techniques.get(cur_technique)
            cur_details.get("accuracy").append(np.mean(accuracies))
            cur_details.get("accuracy_std").append(np.std(accuracies))
            cur_details.get("predict_proba").append(np.mean(predict_probas))
            cur_details.get("predict_proba_std").append(np.std(predict_probas))
            cur_details.get("cross_entropy_loss").append(
                np.mean(cross_entropy_losses))
            cur_details.get("cross_entropy_loss_std").append(
                np.std(cross_entropy_losses))

            techniques.update({cur_technique: cur_details})
            print(techniques)
    print(techniques)
    fm.write_df(pd.DataFrame.from_dict(techniques), "STD")
    return techniques
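The nested techniques dict stores one list per metric and technique; a hedged sketch of how the result written as "STD" could be expanded into per-threshold rows (the helper name and the 2..30 threshold layout are assumptions, mirroring benchmark_change_data above):

import pandas as pd

def tidy_benchmark(techniques, min_wordcount=2, max_wordcount=30):
    # One (threshold x metric) frame per technique, concatenated into a
    # (technique, metric) column MultiIndex like the other benchmark outputs.
    frames = {
        name: pd.DataFrame(metrics, index=range(min_wordcount, max_wordcount))
        for name, metrics in techniques.items()
    }
    return pd.concat(frames, axis=1)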