Example #1
def create_clusters(wdir, freq_table_df, methods = ["KMeans"], min_MFF = 0,
                    max_MFFs = [5000], text_representations = ["rel-zscores"],
                    ns_clusters = [2], sampling_times = 10):
    i = 0

    # Two dictionaries for the results are initialized empty
    clustering_results_dict = {}
    parameters_results_dict = {}

    # Iterate over each representation or transformation of the data
    for text_representation in text_representations:
        document_data_model_df = text2features.choose_features(freq_table_df, text_representation)
        # Iterate over amount of MFFs
        for MFW in max_MFFs:
            print(MFW)
            document_data_model_cut_df = load_data.cut_corpus(document_data_model_df, min_MFF = min_MFF, max_MFF = MFW)
            print(document_data_model_cut_df.head())
            # Iterate over clustering algorithms            
            for method in methods:
                print(method)
                    
                # Only some algorithms need the number of clusters set in advance;
                # the rest get the "-" placeholder
                if method not in ["KMeans","SpectralClustering","AgglomerativeClustering"]:
                    actual_ns_clusters = ["-"]
                else:
                    actual_ns_clusters = ns_clusters

                # Iterate over the number of clusters (only relevant for algorithms that must be initialized with a number of subclusters; the rest decide the number themselves)
                for n_clusters in actual_ns_clusters:
                    print(n_clusters)

                    # Iterate over sampling times:
                    for j in range(sampling_times):

                        try:
                            # Fit the clustering model and keep the labels:
                            labels_lt = choose_cluster_algorithm(method, n_clusters = n_clusters).fit(document_data_model_cut_df).labels_

                            # Keep the real number of subclusters found in a new
                            # variable so the loop variable is not overwritten
                            real_n_clusters = len(set(labels_lt))

                            clustering_results_dict["cluster_"+str(i)] = labels_lt
                            parameters_results_dict["cluster_"+str(i)] = [text_representation, MFW, method, real_n_clusters, j]

                        except Exception as e:
                            print("problem with ", text_representation, method, n_clusters, ":", e)
                        i += 1
    clustering_results_df =  pd.DataFrame.from_dict(clustering_results_dict)
    parameters_results_df =  pd.DataFrame.from_dict(parameters_results_dict)
    
    print(clustering_results_df.shape)
    print(freq_table_df.shape)
    clustering_results_df.index = freq_table_df.index
    
    return clustering_results_df, parameters_results_df
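
A minimal call sketch for the function above, assuming that pandas (as pd) and the surrounding module (text2features, load_data, choose_cluster_algorithm) are importable; the toy frequency table and every argument value below are hypothetical, purely for illustration.

import pandas as pd

# Hypothetical toy frequency table: 6 documents x 20 "words" (illustration only)
toy_freq_table_df = pd.DataFrame(
    [[(i + 1) * (j + 1) % 7 + 1 for j in range(20)] for i in range(6)],
    index=["doc" + str(i) for i in range(6)],
    columns=["w" + str(j) for j in range(20)],
)

clusters_df, params_df = create_clusters(
    "/tmp/",                  # wdir is not used in the snippet above
    toy_freq_table_df,
    methods=["KMeans"],
    max_MFFs=[20],
    text_representations=["rel-zscores"],
    ns_clusters=[2, 3],
    sampling_times=2,
)
print(params_df)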
Example #2
def get_coef(wdir, wsdir="corpus/", freq_table=[], metadata="metadata.csv", sep="\t",
             class_="class", verbose=True, method="SVC", max_MFF=5000,
             text_representation="zscores", problematic_class_values=["n.av."],
             minimal_value_samples=2, make_relative=True, under_sample_method="None",
             maximum_cases=5000, sampling_times=1):
    if isinstance(freq_table, list) and isinstance(metadata, str):
        cut_raw_features, metadata = load_data.load_corpus_metadata(wdir, wsdir, sep, verbose, 0, max_MFF, freq_table,
                                                                    metadata)
    else:
        cut_raw_features = freq_table

    if make_relative:
        cut_raw_features = text2features.calculate_relative_frequencies(cut_raw_features)

    filtered_raw_features, labels = cull_data.cull_data(cut_raw_features, metadata, class_, verbose,
                                                        problematic_class_values=problematic_class_values,
                                                        minimal_value_samples=minimal_value_samples)

    document_data_model_cut = load_data.cut_corpus(filtered_raw_features, min_MFF=0, max_MFF=max_MFF)

    document_data_model = text2features.choose_features(document_data_model_cut, text_representation)

    coef_df = pd.DataFrame(columns=document_data_model.columns.tolist())

    intercept_lt = []

    print("The ten first MFWs: ", document_data_model.columns.tolist()[0:10])
    print("The ten first MFWs: ", document_data_model.columns.tolist()[-10:])
    # Meter sampling loop

    for sampling_i in range(sampling_times):
        sampled_labels, sampled_document_data_model = classify.under_sample(labels, document_data_model, under_sample_method,
                                                                   maximum_cases)

        classifier = classify.choose_classifier(method=method)

        model = classifier.fit(sampled_document_data_model, sampled_labels)
        #print(model.coef_)
        print(model.coef_.shape)
        sampled_coef_df = pd.DataFrame(data=model.coef_.tolist(), columns=sampled_document_data_model.columns.tolist())
        print(model.intercept_)
        # intercept_ is an array; take its single element explicitly
        intercept_lt.append(float(model.intercept_[0]))

        coef_df = pd.concat([coef_df, sampled_coef_df])

    # reindex_axis was removed in recent pandas; reindex the columns instead,
    # sorted by their mean coefficient
    coef_df = coef_df.reindex(coef_df.mean().sort_values().index, axis=1)

    print(coef_df.shape)
    print(intercept_lt)

    return coef_df, intercept_lt
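
A sketch of how get_coef might be called with an in-memory frequency table, assuming choose_classifier("SVC") returns a linear model exposing coef_ and intercept_ and that the helper modules (load_data, text2features, cull_data, classify) are importable; the toy frames are hypothetical.

import pandas as pd

# Hypothetical toy inputs (illustration only): a small frequency table and a
# metadata frame with a binary class column, sharing the same index.
toy_freqs = pd.DataFrame(
    [[1, 2, 3, 4], [2, 1, 4, 3], [4, 3, 2, 1], [3, 4, 1, 2]],
    index=["a", "b", "c", "d"],
    columns=["el", "la", "de", "que"],
)
toy_metadata = pd.DataFrame({"class": ["novel", "novel", "essay", "essay"]},
                            index=["a", "b", "c", "d"])

coef_df, intercepts = get_coef(
    "/tmp/",
    freq_table=toy_freqs,      # a DataFrame, so the loading branch is skipped
    metadata=toy_metadata,
    class_="class",
    method="SVC",
    max_MFF=4,
    sampling_times=2,
)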
Example #3
def regressing(wdir, features, outputs, classes, methods_lt, max_MFFs, text_representations, make_relative=False, cv=10):
    results_lt = []
    if features.shape[0] != outputs.shape[0]:
        print("Features and outputs do not have the same number of rows!")
        return
    print("features ", features.head())
    if make_relative:
        features = text2features.calculate_relative_frequencies(features)
    for class_ in classes:
        print("\n\nanalysed class:\t", class_)
        
        for text_representation in text_representations:
            transformed_features = text2features.choose_features(features, text_representation)
            for MFW in max_MFFs:
                print("MFW", MFW)
                transformed_features_cut = load_data.cut_corpus(transformed_features, min_MFF = 0, max_MFF = MFW)
                for method_st in methods_lt:
                    try:
                        regression_algorithm = choose_regression_algorithm(method = method_st)

                        # Pass the cv argument through instead of hard-coding 10
                        results_dc = cross_validate(regression_algorithm, transformed_features_cut,
                                       outputs[class_], cv=cv)
                        mean_results_fl = results_dc["test_score"].mean().round(3)
                        print(mean_results_fl)
                        results_lt.append([class_, text_representation, MFW, method_st, mean_results_fl, "R2"])
                    except Exception as e:
                        print("problems with ", method_st, ":", e)
    results_df = pd.DataFrame(results_lt, columns = ["class", "text_representation", "MFW", "method", "mean_results", "scoring"])
    results_df.sort_values(by="mean_results", ascending=False, inplace=True)

    return results_df
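
A call sketch under the assumption that choose_regression_algorithm accepts a name such as "LinearRegression" (hypothetical here) and that text2features and load_data are importable; note that cv must not exceed the number of samples.

import pandas as pd

# Hypothetical toy inputs (illustration only): word frequencies and one
# numeric target column to regress on, sharing the same index.
toy_features = pd.DataFrame(
    [[5, 1, 2], [4, 2, 1], [1, 5, 4], [2, 4, 5]],
    index=["a", "b", "c", "d"],
    columns=["de", "la", "el"],
)
toy_outputs = pd.DataFrame({"year": [1890, 1900, 1950, 1960]},
                           index=["a", "b", "c", "d"])

results_df = regressing(
    "/tmp/", toy_features, toy_outputs,
    classes=["year"],
    methods_lt=["LinearRegression"],
    max_MFFs=[3],
    text_representations=["zscores"],
    cv=2,
)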
Example #4
def evaluate_cluster(wdir, freq_table_df, metadata_df, ground_truths = ["author.name","decade","subgenre.cligs.important"],
            methods = ["KMeans"], min_MFF = 0, max_MFFs = [5000], text_representations = ["rel-zscores"],
            ns_clusters = [30], under_sample_method = "author.name", sampling_times = 10, method_evaluation = "ARI"):
    # A list for the results is initialized empty
    results_lt = []

    # Iterate over each representation or transformation of the data
    for text_representation in text_representations:
        document_data_model_df = text2features.choose_features(freq_table_df, text_representation)
        # Iterate over amount of MFFs
        for MFW in max_MFFs:
            print(MFW)
            document_data_model_cut_df = load_data.cut_corpus(document_data_model_df, min_MFF = min_MFF, max_MFF = MFW)
            # List slicing never raises, so no try/except is needed here
            print("first columns ", document_data_model_cut_df.columns.tolist()[0:5])
            print("last columns ", document_data_model_cut_df.columns.tolist()[-5:])

            # Iterate over clustering algorithms            
            for method in methods:
                print(method)
                    
                # Only some algorithms need the number of clusters set in advance;
                # the rest get the "-" placeholder
                if method not in ["KMeans","SpectralClustering","AgglomerativeClustering"]:
                    actual_ns_clusters = ["-"]
                else:
                    actual_ns_clusters = ns_clusters

                # Iterate over the number of clusters (only relevant for algorithms that must be initialized with a number of subclusters; the rest decide the number themselves)
                for n_clusters in actual_ns_clusters:
                    print(n_clusters)

                    # Iterate over sampling times:
                    for i in range(sampling_times):

                        # Possibility of undersampling taking only one text per author (or any other class):
                        if under_sample_method in ["author.name","authorial"]:
                            sampled_data_df, sampled_metadata_df = sample_unique_text_by_class(document_data_model_cut_df, metadata_df, class_ = "author.name")
                        else:
                            sampled_data_df, sampled_metadata_df = document_data_model_cut_df, metadata_df
                        
                        try:
                            # Fit the clustering model and keep the labels:
                            labels = choose_cluster_algorithm(method, n_clusters = n_clusters).fit(sampled_data_df).labels_

                            # Keep the real number of subclusters found in a new
                            # variable so the loop variable is not overwritten
                            real_n_clusters = len(set(labels))

                            # Evaluate the clustering against each ground truth
                            for ground_truth in ground_truths:

                                evaluation = evalute_clustering(sampled_metadata_df[ground_truth], labels, method = method_evaluation)

                                # Add everything to the list of the results
                                results_lt.append([ground_truth, evaluation, text_representation, method, real_n_clusters, MFW, method_evaluation, sampled_data_df.shape[0]])

                        except Exception as e:
                            print("problem with ", text_representation, method, ground_truths, n_clusters, ":", e)


    # Convert the list into a dataframe, sort, clean...                            
    results_df = pd.DataFrame(results_lt, columns=["ground_truth", "evaluation", "text_representation","method", "n_clusters", "MFW", "method_evaluation","sample_size"])
    results_df = results_df.sample(frac=1).sort_values(by=["evaluation"], ascending=[False])

    # Save the results 
    results_file = "results"+"_"+ "-".join(ground_truths)+"_"+ "-".join(methods)+"_"+ "-".join(str(x) for x in max_MFFs)+"_" +"-".join(text_representations)
    if len(results_file) > 100:
        results_file = "results_" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    print(results_file)
    results_df.to_csv(wdir + results_file+".csv", sep = "\t")

    return results_df
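
Note that the function writes a results CSV into wdir as a side effect. A hypothetical call, assuming the helper modules (text2features, load_data, choose_cluster_algorithm, evalute_clustering) are importable; all values are toy data for illustration.

import pandas as pd

# Hypothetical toy inputs (illustration only): a frequency table plus metadata
# holding the ground-truth columns, sharing the same index.
toy_freqs = pd.DataFrame(
    [[3, 1, 2], [2, 3, 1], [1, 2, 3], [3, 2, 1]],
    index=["a", "b", "c", "d"],
    columns=["de", "la", "el"],
)
toy_metadata = pd.DataFrame(
    {"author.name": ["x", "x", "y", "y"],
     "decade": [1890, 1890, 1950, 1950],
     "subgenre.cligs.important": ["novel", "novel", "essay", "essay"]},
    index=["a", "b", "c", "d"],
)

results_df = evaluate_cluster(
    "/tmp/", toy_freqs, toy_metadata,
    ground_truths=["author.name"],
    methods=["KMeans"],
    max_MFFs=[3],
    ns_clusters=[2],
    under_sample_method="None",   # skip the one-text-per-author sampling
    sampling_times=1,
)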
Example #5
def predict(wdir,
            entire_raw_features,
            metadata,
            class_="class",
            predict_class_values=["?"],
            verbose=True,
            method="SVC",
            min_MFF=0,
            max_MFF=5000,
            text_representation="relative",
            make_relative=True,
            iterations=1,
            do_scores=False,
            type_classes="binary"):
    if make_relative:
        entire_raw_features = text2features.calculate_relative_frequencies(
            entire_raw_features)

    #print(entire_raw_features.columns.tolist()[0:10])
    entire_raw_features = load_data.cut_corpus(entire_raw_features,
                                               min_MFF=min_MFF,
                                               max_MFF=max_MFF)
    print(entire_raw_features.columns.tolist()[0:10])

    print("corpus and metadata are coherent"
          ) if entire_raw_features.index.tolist() == metadata.index.tolist(
          ) else "corpus and metadata are NOT coherent"

    train_class_values = [
        set_label for set_label in list(set(metadata[class_]))
        if set_label not in predict_class_values
    ]
    print("train classes", train_class_values)

    smallest_class = Counter(metadata.loc[metadata[class_].isin(
        train_class_values)][class_]).most_common()[-1]
    print("smallest class", smallest_class)

    document_data_model = text2features.choose_features(
        entire_raw_features, text_representation)

    metadata_predict = metadata.loc[metadata[class_].isin(
        predict_class_values)].copy()  #.sort_index()

    metadata_predict_iterations = pd.DataFrame(
        index=metadata_predict.index, columns=[i for i in range(iterations)])
    if type_classes == "binary":
        metadata_predict["sum_prediction_" + class_] = 0

    document_data_model_predict = document_data_model.loc[
        metadata_predict.index.tolist()]  #.sort_index()
    #print("document data model to predict\n", document_data_model_predict.head(3))
    print(
        "metadata and data to predict coherent?",
        metadata_predict.index.tolist() ==
        document_data_model_predict.index.tolist())

    for i in range(iterations):
        metadata_sample = pd.concat([
            metadata.loc[(~metadata[class_].isin(predict_class_values))
                         & (metadata[class_] != smallest_class[0])],
            metadata.loc[metadata[class_] == smallest_class[0]].sample(
                n=smallest_class[1])
        ]).sample(frac=1)
        document_data_model_sample = document_data_model.loc[
            metadata_sample.index.tolist()]
        #print("document_data_model_sample\n", document_data_model_sample.head(3))
        print("metadata and texts coherent") if metadata_sample.index.tolist(
        ) == document_data_model_sample.index.tolist() else print(
            "metadata and corpus are not coherent")
        print("metadata's shape", metadata_sample.shape)
        classifier = choose_classifier(method=method)
        #print(set(metadata_sample[class_]))
        #print(document_data_model_sample.head())
        classifier.fit(document_data_model_sample,
                       metadata_sample[class_].astype(str))

        if do_scores:
            scores = classify_cross(document_data_model_sample,
                                    metadata_sample[class_].astype(str),
                                    classifier,
                                    cv=10,
                                    scoring="f1")

            print("scores", scores)

        print(document_data_model_predict.index.tolist())
        results = classifier.predict(document_data_model_predict)
        print(
            i,
            metadata_sample.index[0:3],
            results,
        )

        metadata_predict_iterations[i] = results

        if type_classes == "binary":
            metadata_predict["sum_prediction_" + class_] = np.array(
                results).astype(int) + metadata_predict["sum_prediction_" +
                                                        class_]

            metadata_predict_iterations[i] = metadata_predict_iterations[
                i].astype(int)

    if type_classes == "binary":
        metadata_predict["sum_prediction_" +
                         class_] = metadata_predict["sum_prediction_" +
                                                    class_] / iterations

        print(metadata_predict["sum_prediction_" + class_])

    return metadata_predict, results, metadata_predict_iterations
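
A call sketch with hypothetical toy data; the string labels "0"/"1" are chosen deliberately because the binary branch casts the predictions to int, and the helper modules (text2features, load_data, choose_classifier) are assumed to be importable.

import pandas as pd

# Hypothetical toy inputs (illustration only): two labelled texts per class
# plus one unlabelled text marked "?" whose class should be predicted.
toy_freqs = pd.DataFrame(
    [[5, 1, 2], [4, 2, 1], [1, 5, 4], [2, 4, 5], [3, 3, 3]],
    index=["a", "b", "c", "d", "e"],
    columns=["de", "la", "el"],
)
toy_metadata = pd.DataFrame({"class": ["0", "0", "1", "1", "?"]},
                            index=["a", "b", "c", "d", "e"])

metadata_predict, results, iterations_df = predict(
    "/tmp/", toy_freqs, toy_metadata,
    class_="class",
    predict_class_values=["?"],
    max_MFF=3,
    iterations=2,
    type_classes="binary",   # enables the sum_prediction_ column
)
print(metadata_predict["sum_prediction_class"])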
Example #6
def classify(wdir,
             wsdir="corpus/",
             freq_table=[],
             metadata="metadata.csv",
             sep="\t",
             classes=["class"],
             verbose=True,
             methods=["SVC"],
             min_MFF=0,
             max_MFFs=[5000],
             text_representations=["zscores"],
             typographies=[True],
             sampling_mode="cross",
             problematic_class_values=[
                 "n.av.", "other", "mixed", "?", "unknown", "none",
                 "second-person"
             ],
             minimal_value_samples=2,
             make_relative=True,
             under_sample_method="None",
             maximum_cases=5000,
             sampling_times=1,
             outdir_results="",
             sort_by="median"):
    """
     *  wdir
     *  wsdir = "corpus/"
     *  freq_table  = []
     *  metadata = "metadata.csv"
     *  sep = "\t"
     *  classes = ["class"]
     *  verbose = True
     *  method = ["SVC"]
     *  min_MFF = 0
     *  max_MFFs = [5000]
     *  text_representations = ["zscores"]
     *  typographies = [True,False]
     *  sampling_mode = "cross"
     *  problematic_class_values = ["n.av.", "other", "mixed", "?", "unknown","none", "second-person"]
     *  minimal_value_samples = 2
     *  make_relative = True
     *  scoring = "f1"
     *  under_sample_method = "None"
     *  maximum_cases = 5000,
     *  sampling_times = 1

    """
    cut_raw_features = freq_table

    print("cut_raw_features ", cut_raw_features.head())
    print("in classify, cut_raw_features, ", cut_raw_features.shape)
    if make_relative:
        cut_raw_features = text2features.calculate_relative_frequencies(
            cut_raw_features)
        print("cut_raw_features after relative normalization",
              cut_raw_features.head())

    results = []

    for class_ in classes:
        print("\n\nanalysed class:\t", class_)
        # This step deletes too small classes
        filtered_raw_features, labels = cull_data.cull_data(
            cut_raw_features,
            metadata,
            class_,
            verbose,
            problematic_class_values=problematic_class_values,
            minimal_value_samples=minimal_value_samples)

        print("size after culling data:", filtered_raw_features.shape,
              labels.shape)

        for typography in typographies:
            filtered_raw_features_typo = cull_data.cull_typography(
                filtered_raw_features, keep_typography=typography)
            print("typography ", typography)

            for text_representation in text_representations:
                # The corpus is modeled somehow (raw, relative frequencies, tf-idf, z-scores...)
                document_data_model = text2features.choose_features(
                    filtered_raw_features_typo, text_representation)

                if verbose:
                    print(document_data_model.shape)
                for MFW in max_MFFs:
                    print("MFW", MFW)
                    document_data_model_cut = load_data.cut_corpus(
                        document_data_model,
                        min_MFF=min_MFF,
                        max_MFF=MFW,
                        sort_by=sort_by)
                    print("The three first MFWs: ",
                          document_data_model_cut.columns.tolist()[0:3])
                    print("The three last MFWs: ",
                          document_data_model_cut.columns.tolist()[-3:])
                    if len(set(labels.values.tolist())) < 2:
                        print(
                            "After culling, the class", class_,
                            "can't be divided into two groups. This category is going to be ignored"
                        )
                    else:
                        for method in methods:
                            classifier = choose_classifier(method=method)

                            f1s_over_sampling = np.array([])
                            scores_over_sampling_df = pd.DataFrame(
                                columns=["f1", "rec", "prec"])

                            for sampling_i in range(sampling_times):
                                print(labels.shape)
                                print(document_data_model_cut.shape)

                                sampled_labels, sampled_document_data_model_cut = sampling.under_sample(
                                    labels, document_data_model_cut,
                                    under_sample_method, maximum_cases)
                                baseline = cull_data.calculate_baseline(
                                    sampled_labels)

                                least_frequent_class_value = Counter(
                                    sampled_labels).most_common()[-1][1]

                                if sampling_mode == "standard":
                                    print("standard sampling, bug coming!")

                                    results = standard_classification(
                                        wdir, least_frequent_class_value,
                                        document_data_model_cut,
                                        sampled_labels, verbose, classifier,
                                        class_)
                                    return results

                                elif sampling_mode == "cross":
                                    cv = cull_data.calculate_cv(
                                        least_frequent_class_value)
                                    print("cross validation sampling of ",
                                          class_)

                                    scores_df = classify_cross(
                                        sampled_document_data_model_cut,
                                        sampled_labels,
                                        classifier,
                                        cv=cv)

                                    f1s_over_sampling = np.append(
                                        f1s_over_sampling, scores_df["f1"])

                                    # Accumulate the scores of every sampling
                                    # round instead of overwriting them
                                    scores_over_sampling_df = pd.concat(
                                        [scores_over_sampling_df, scores_df],
                                        axis=0)

                            #print(scoring + ": %0.2f (+/- %0.2f)" % (evaluation_over_sampling.mean(), evaluation_over_sampling.std() * 2))
                            test_result_param, test_result_pvalue = test_ttest_cross_results_baseline(
                                f1s_over_sampling, baseline)

                            # I think the reporting here should move one level down in the loops
                            print("Class: \t", class_)
                            print("Scores:\n \t",
                                  scores_over_sampling_df.mean().round(3))
                            print("p-value: ", round(test_result_pvalue, 4))
                            print("Baseline: \t\t", round(baseline, 2))
                            print(method)

                            f1_baseline = scores_over_sampling_df.mean()["f1"].round(3) - baseline
                            print(f1_baseline)
                            results.append([
                                class_,
                                scores_over_sampling_df.mean()["f1"].round(3),
                                scores_over_sampling_df.mean()["rec"].round(3),
                                scores_over_sampling_df.mean()["prec"].round(
                                    3),
                                scores_over_sampling_df.mean()
                                ["f1_macro"].round(3),
                                scores_over_sampling_df.mean()
                                ["f1_micro"].round(3), baseline, f1_baseline,
                                method, text_representation, MFW, typography,
                                f1s_over_sampling.round(2), test_result_pvalue,
                                sampled_labels, sampled_labels.shape[0], cv,
                                sampling_times, classifier
                            ])
    results_df = pd.DataFrame(
        results,
        columns=[
            "class", 'mean_f1', 'mean_rec', "mean_prec", "f1_macro",
            "f1_micro", 'baseline', "f1-baseline", 'classifier_name',
            'text_representation', 'MFW', 'typography', "f1s",
            'test_result_pvalue', 'labels', "sample_size", "cv",
            "sampling_times", 'classifier'
        ])
    print(results_df.head())

    results_df = results_df.sample(frac=1)
    results_df.sort_values(by=["f1-baseline", "MFW"],
                           ascending=[False, True],
                           inplace=True)
    if outdir_results == "":
        outdir_results = wdir
    results_file = "results" + "_" + "-".join(
        classes) + "_" + "-".join(methods) + "_" + "-".join(
            str(x) for x in max_MFFs) + "_" + "-".join(text_representations)
    if len(results_file) > 100:
        results_file = "results_" + datetime.datetime.now().strftime("%Y%m%d%H%M%S")

    results_df.to_csv(outdir_results + results_file + ".csv", sep="\t")
    print("done!")
    return results_df
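
Finally, a hypothetical call for classify, assuming cull_data, text2features, sampling, choose_classifier and classify_cross are importable and that classify_cross returns the f1/rec/prec (plus f1_macro/f1_micro) columns aggregated above; a results CSV is written to outdir_results.

import pandas as pd

# Hypothetical toy inputs (illustration only): four texts, a binary class,
# and the frequency table passed in directly rather than read from disk.
toy_freqs = pd.DataFrame(
    [[5, 1, 2], [4, 2, 1], [1, 5, 4], [2, 4, 5]],
    index=["a", "b", "c", "d"],
    columns=["de", "la", "el"],
)
toy_metadata = pd.DataFrame({"class": ["novel", "novel", "essay", "essay"]},
                            index=["a", "b", "c", "d"])

results_df = classify(
    "/tmp/",
    freq_table=toy_freqs,
    metadata=toy_metadata,          # the frame itself, not a file name
    classes=["class"],
    methods=["SVC"],
    max_MFFs=[3],
    text_representations=["zscores"],
    sampling_mode="cross",
    sampling_times=1,
    outdir_results="/tmp/",
)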