def extract_features():
    """Run all four CASIS-25 feature extractors and dump each feature set to CSV.

    For each extractor (Unigram, Stylomerty, BagOfWords, CharacterGram) this:
      1. runs the extraction over ``data_dir + "CASIS25/"``,
      2. prints the generated lookup table (when the extractor produced one),
      3. loads the generated ``.txt`` feature file via ``Data_Utils.get_dataset``
         and writes it out as a labelled ``.csv``.

    Finally it prints summary information for the ``casis25_bow`` dataset.

    NOTE(review): relies on module-level ``data_dir`` and ``feature_set_dir``
    globals — confirm they are defined at import time.
    """
    for i in range(4):
        # Select the extractor for this pass. "Stylomerty" is the class's
        # actual (misspelled) name in the project — do not "correct" it here.
        if i == 0:
            extractor = Unigram(data_dir + "CASIS25/", "casis25")
        elif i == 1:
            extractor = Stylomerty(data_dir + "CASIS25/", "casis25")
        elif i == 2:
            extractor = BagOfWords(data_dir + "CASIS25/", "casis25")
        else:
            extractor = CharacterGram(data_dir + "CASIS25/", "casis25",
                                      gram=3, limit=1000)

        extractor.start()
        lookup_table = extractor.lookup_table
        print("Generated Lookup Table:")

        # Extractors that expose no lookup table report it as the literal
        # ``False``; in that case the CSV is written without named columns
        # (``columns=None`` is pandas' default, i.e. 0..N-1 integer columns).
        col = None
        if lookup_table is not False:
            # One printable token per table entry, newlines flattened so the
            # single-line print stays readable.
            printable = [str("".join(x)).replace("\n", " ")
                         for x in lookup_table]
            print("'" + "', '".join(printable) + "'")
            # Each column header is the quoted token, e.g. ``'abc'``.
            col = ["'" + entry + "'" for entry in printable]

        generated_file = feature_set_dir + extractor.out_file + ".txt"
        generated_csv_file = feature_set_dir + extractor.out_file + ".csv"
        data, labels = Data_Utils.get_dataset(generated_file)
        df = pd.DataFrame(data, columns=col)
        df.insert(0, "Label", labels, True)
        df.to_csv(generated_csv_file)

    # Get dataset information (loop-invariant: always the casis25_bow
    # descriptor, so report it once after all extractors have run).
    dataset_info = DatasetInfo("casis25_bow")
    dataset_info.read()
    authors = dataset_info.authors
    writing_samples = dataset_info.instances
    print("\n\nAuthors in the dataset:")
    print(authors)
    print("\n\nWriting samples of an author 1000")
    print(authors["1000"])
    print("\n\nAll writing samples in the dataset")
    print(writing_samples)
    print("\n\nThe author of the writing sample 1000_1")
    print(writing_samples["1000_1"])
    # print(labels[0], data[0])
    print("Done")
def getUnigramsFromTextFiles(data_dir="./textfiles/", feature_set_dir="./datasets/"):
    """Extract unigram features from the text files under *data_dir*.

    Runs the Unigram extractor, prints its lookup table (when one exists),
    prints author/instance information for the ``casis25_bow`` dataset, and
    finally loads the generated feature file from *feature_set_dir*.
    """
    uni = Unigram(data_dir + "", "casis25")
    uni.start()

    table = uni.lookup_table
    print("Generated Lookup Table:")
    # print(lookup_table)
    if table is not False:
        tokens = [str("".join(entry)).replace("\n", " ") for entry in table]
        print("'" + "', '".join(tokens) + "'")

    # Get dataset information
    info = DatasetInfo("casis25_bow")
    info.read()
    authors = info.authors
    writing_samples = info.instances

    print("\n\nAuthors in the dataset:")
    print(authors)
    print("\n\nWriting samples of an author advText")
    print(authors["advText01"])
    print("\n\nAll writing samples in the dataset")
    print(writing_samples)
    print("\n\nThe author of the writing sample advText01")
    print(writing_samples["advText01"])

    # Load the freshly generated feature file (read for its side effects /
    # validation; the values are not used further here).
    generated_file = feature_set_dir + uni.out_file + ".txt"
    data, labels = Data_Utils.get_dataset(generated_file)
def _get_dataset(filename, info=None):
    """Pick the loader function appropriate for *filename*'s dataset.

    *info* may be omitted (it is derived from *filename*) or given as a
    dataset name string; either way it is resolved to a read DatasetInfo.
    Returns the loader *function itself* (``get_numpy_dataset`` or
    ``get_text_dataset``), not the loaded data — the caller applies it.
    """
    if info is None:
        info = DatasetInfo(get_dataset_name(filename), descriptor="auto").read()
    if isinstance(info, str):
        info = DatasetInfo(info, descriptor="auto").read()

    numpy_backed = info.get_feature_prop("is_numpy", False)
    return get_numpy_dataset if numpy_backed else get_text_dataset
generated_file = feature_set_dir + extractor.out_file + ".txt" generated_csv_file = feature_set_dir + extractor.out_file + ".csv" data, labels = Data_Utils.get_dataset(generated_file) df = pd.DataFrame(data, columns=col) df.insert(0, "Label", labels, True) df.to_csv(generated_csv_file) else: generated_file = feature_set_dir + extractor.out_file + ".txt" generated_csv_file = feature_set_dir + extractor.out_file + ".csv" data, labels = Data_Utils.get_dataset(generated_file) df = pd.DataFrame(data) df.insert(0, "Label", labels, True) df.to_csv(generated_csv_file) # Get dataset information dataset_info = DatasetInfo("casis25_bow") dataset_info.read() authors = dataset_info.authors writing_samples = dataset_info.instances print("\n\nAuthors in the dataset:") print(authors) print("\n\nWriting samples of an author 1000") print(authors["1000"]) print("\n\nAll writing samples in the dataset") print(writing_samples) print("\n\nThe author of the writing sample 1000_1") print(writing_samples["1000_1"])
# extractor = Stylomerty(data_dir + "CASIS25/", "casis25") # extractor = BagOfWords(data_dir + "CASIS25/", "casis25") extractor = CharacterGram(data_dir + data_set + "/", data_set, gram=3, limit=1000) extractor.start() lookup_table = extractor.lookup_table print("Generated Lookup Table:") print(lookup_table) if lookup_table is not False: print("'" + "', '".join( [str("".join(x)).replace("\n", " ") for x in lookup_table]) + "'") # Get dataset information dataset_info = DatasetInfo(data_set + "_bow") dataset_info.read() authors = dataset_info.authors writing_samples = dataset_info.instances print("\n\nAuthors in the dataset:") print(authors) print("\n\nWriting samples of an author 1000") print(authors["1000"]) print("\n\nAll writing samples in the dataset") print(writing_samples) print("\n\nThe author of the writing sample 1000_1") print(writing_samples["1000_1"])