def extract_features():
    """Run four feature extractors over CASIS25 and export each result as CSV.

    The extractors are tried in a fixed order: ``Unigram``, ``Stylomerty``,
    ``BagOfWords`` and a ``CharacterGram`` built with ``gram=3`` and
    ``limit=1000``. For every extractor the function:

    1. starts the extractor and prints its lookup table (unless the table
       is ``False``),
    2. loads the ``.txt`` file named after ``extractor.out_file`` through
       ``Data_Utils.get_dataset`` and writes it back out as a ``.csv`` with
       a leading ``Label`` column,
    3. reads the ``casis25_bow`` dataset descriptor and prints its authors
       and writing samples.

    ``data_dir`` and ``feature_set_dir`` are not parameters; they are
    resolved from the surrounding scope when the function runs.
    """

    def _flatten(entry):
        # One lookup-table entry rendered as a single-line string.
        return str("".join(entry)).replace("\n", " ")

    # Factories instead of instances: each extractor is only constructed
    # when its turn in the loop comes, never up front.
    extractor_factories = (
        lambda: Unigram(data_dir + "CASIS25/", "casis25"),
        lambda: Stylomerty(data_dir + "CASIS25/", "casis25"),
        lambda: BagOfWords(data_dir + "CASIS25/", "casis25"),
        lambda: CharacterGram(data_dir + "CASIS25/",
                              "casis25",
                              gram=3,
                              limit=1000),
    )

    for make_extractor in extractor_factories:
        extractor = make_extractor()
        extractor.start()
        table = extractor.lookup_table
        print("Generated Lookup Table:")

        # ``None`` lets pandas fall back to its default column labels,
        # which is what happens when no lookup table is available.
        column_names = None
        if table is not False:
            print("'" + "', '".join([_flatten(entry) for entry in table]) + "'")
            # Column headers keep the surrounding single quotes.
            column_names = ["'" + _flatten(entry) + "'" for entry in table]

        txt_path = feature_set_dir + extractor.out_file + ".txt"
        csv_path = feature_set_dir + extractor.out_file + ".csv"
        data, labels = Data_Utils.get_dataset(txt_path)
        frame = pd.DataFrame(data, columns=column_names)
        frame.insert(0, "Label", labels, allow_duplicates=True)
        frame.to_csv(csv_path)

        # Get dataset information
        info = DatasetInfo("casis25_bow")
        info.read()
        authors = info.authors
        samples = info.instances

        print("\n\nAuthors in the dataset:")
        print(authors)

        print("\n\nWriting samples of an author 1000")
        print(authors["1000"])

        print("\n\nAll writing samples in the dataset")
        print(samples)

        print("\n\nThe author of the writing sample 1000_1")
        print(samples["1000_1"])

    print("Done")
Esempio n. 2
0
def getUnigramsFromTextFiles(data_dir="./textfiles/",
                             feature_set_dir="./datasets/"):
    """Extract unigram features from ``data_dir`` and print dataset details.

    Starts a ``Unigram`` extractor tagged ``"casis25"``, prints its lookup
    table (unless the table is ``False``), prints the authors and writing
    samples recorded in the ``casis25_bow`` dataset descriptor, and finally
    loads the generated ``.txt`` feature file through
    ``Data_Utils.get_dataset``.

    Args:
        data_dir: Directory handed to the extractor as its input location.
        feature_set_dir: Directory prefix of the generated feature file.

    Returns:
        None. The loaded data and labels are not handed back to the caller.
    """
    unigram_extractor = Unigram(data_dir + "", "casis25")
    unigram_extractor.start()

    table = unigram_extractor.lookup_table
    print("Generated Lookup Table:")
    if table is not False:
        flattened = [str("".join(entry)).replace("\n", " ")
                     for entry in table]
        print("'" + "', '".join(flattened) + "'")

    # Get dataset information
    info = DatasetInfo("casis25_bow")
    info.read()
    authors = info.authors
    samples = info.instances

    print("\n\nAuthors in the dataset:")
    print(authors)

    print("\n\nWriting samples of an author advText")
    print(authors["advText01"])

    print("\n\nAll writing samples in the dataset")
    print(samples)

    print("\n\nThe author of the writing sample advText01")
    print(samples["advText01"])

    # Loading the file is kept for its effect (it fails loudly when the
    # feature file is missing or is not a two-part result).
    feature_file = feature_set_dir + unigram_extractor.out_file + ".txt"
    _data, _labels = Data_Utils.get_dataset(feature_file)
def _get_dataset(filename, info=None):
    """Pick the loader function that matches a dataset's storage format.

    Args:
        filename: Dataset file; only used to derive the dataset name (via
            ``get_dataset_name``) when ``info`` is not supplied.
        info: Either ``None``, a dataset name given as a string, or an
            already-read descriptor exposing ``get_feature_prop``.

    Returns:
        ``get_numpy_dataset`` when the descriptor's ``"is_numpy"`` feature
        property is truthy, otherwise ``get_text_dataset``. The function
        object itself is returned; it is not called here.
    """

    def _read_descriptor(name):
        return DatasetInfo(name, descriptor="auto").read()

    if info is None:
        info = _read_descriptor(get_dataset_name(filename))

    # A string (passed in, or produced by the step above) is treated as a
    # dataset name and resolved to a descriptor.
    if isinstance(info, str):
        info = _read_descriptor(info)

    stored_as_numpy = info.get_feature_prop("is_numpy", False)
    return get_numpy_dataset if stored_as_numpy else get_text_dataset
Esempio n. 4
0
            generated_file = feature_set_dir + extractor.out_file + ".txt"
            generated_csv_file = feature_set_dir + extractor.out_file + ".csv"
            data, labels = Data_Utils.get_dataset(generated_file)
            df = pd.DataFrame(data, columns=col)
            df.insert(0, "Label", labels, True)
            df.to_csv(generated_csv_file)
        else:
            generated_file = feature_set_dir + extractor.out_file + ".txt"
            generated_csv_file = feature_set_dir + extractor.out_file + ".csv"
            data, labels = Data_Utils.get_dataset(generated_file)
            df = pd.DataFrame(data)
            df.insert(0, "Label", labels, True)
            df.to_csv(generated_csv_file)

        # Get dataset information
        dataset_info = DatasetInfo("casis25_bow")
        dataset_info.read()
        authors = dataset_info.authors
        writing_samples = dataset_info.instances
        print("\n\nAuthors in the dataset:")
        print(authors)

        print("\n\nWriting samples of an author 1000")
        print(authors["1000"])

        print("\n\nAll writing samples in the dataset")
        print(writing_samples)

        print("\n\nThe author of the writing sample 1000_1")
        print(writing_samples["1000_1"])
Esempio n. 5
0
    #  extractor = Stylomerty(data_dir + "CASIS25/", "casis25")
    #  extractor = BagOfWords(data_dir + "CASIS25/", "casis25")
    extractor = CharacterGram(data_dir + data_set + "/",
                              data_set,
                              gram=3,
                              limit=1000)
    extractor.start()
    lookup_table = extractor.lookup_table
    print("Generated Lookup Table:")
    print(lookup_table)
    if lookup_table is not False:
        print("'" + "', '".join(
            [str("".join(x)).replace("\n", " ") for x in lookup_table]) + "'")

    # Get dataset information
    dataset_info = DatasetInfo(data_set + "_bow")
    dataset_info.read()
    authors = dataset_info.authors
    writing_samples = dataset_info.instances
    print("\n\nAuthors in the dataset:")
    print(authors)

    print("\n\nWriting samples of an author 1000")
    print(authors["1000"])

    print("\n\nAll writing samples in the dataset")
    print(writing_samples)

    print("\n\nThe author of the writing sample 1000_1")
    print(writing_samples["1000_1"])