def classify(docs):
    """Tag every document in *docs* with a label predicted by a random forest.

    The phase-2 training CSV is read, preprocessed and indexed, the index is
    reduced with ``slice_index(index=index, n=200)``, and an ``RFClassifier``
    is trained on a random sample of the training documents. The predictions
    for *docs* are then stored on each document's ``tag`` attribute.

    Args:
        docs: Mapping of document id to document object. Every document must
            expose an ``id`` attribute that is also its key in *docs*.
            NOTE(review): *docs* is not preprocessed here; presumably the
            caller has already filled in whatever ``RFClassifier.classify``
            reads - confirm against the caller.

    Returns:
        None. *docs* is modified in place.
    """
    print("Classifying...")
    print("Preprocessing Train Data...")
    train_docs = read_docs('../data/phase2_train.csv')
    preprocessor = EnglishPreprocessor(train_docs)
    for doc in train_docs.values():
        doc.words = preprocessor.preprocess(doc.text)

    print("Indexing Train Data...")
    index = PositionalIndexer(train_docs, 1).index
    sliced_index = slice_index(index=index, n=200)

    # random.sample() requires a real sequence: handing it a dict keys view is
    # deprecated since Python 3.9 and raises TypeError from 3.11 onwards.
    # The size is clamped so a training set smaller than 500 documents does
    # not raise ValueError.
    sample_size = min(500, len(train_docs))
    sampled_ids = random.sample(list(train_docs), sample_size)
    sampled = {doc_id: train_docs[doc_id] for doc_id in sampled_ids}

    classifier = RFClassifier(sampled, sliced_index, len(train_docs))
    classifier.train()
    y_pred = classifier.classify(docs)

    # Predictions are matched to documents by position, following the
    # iteration order of docs.values().
    doc_ids = [doc.id for doc in docs.values()]
    for position, doc_id in enumerate(doc_ids):
        docs[doc_id].tag = y_pred[position]
print(str(current_param) + ":\t" + str(accuracy)) if maximum_accuracy < accuracy: arg_max_param = current_param maximum_accuracy = accuracy return arg_max_param class RFClassifier(SKLearnClassifier): def __init__(self, train_docs, train_index, index_doc_count): super().__init__(train_docs, train_index, index_doc_count) self.clf = RandomForestClassifier(n_estimators=100) if __name__ == "__main__": train_docs = read_docs('../data/phase2_train.csv') test_docs = read_docs('../data/phase2_test.csv') preprocessor = EnglishPreprocessor(train_docs) for doc in train_docs.values(): doc.words = preprocessor.preprocess(doc.text) for doc in test_docs.values(): doc.words = preprocessor.preprocess(doc.text) print("Preprocess is done!") index = PositionalIndexer(train_docs, 1).index print("Index Created Successfully!") index_doc_count = len(train_docs)
tokens = [t for t in tokens if re.search('[a-zA-Z-]', t) is None] return tokens def __init__(self, docs): super().__init__(docs, Stemmer()) if __name__ == "__main__": # I'm reading this loud to this kids. These self-identifying kids nowadays read more than I ever did. # print("Stop Words:", find_stop_words("../data/English.csv")) # s = input() # ts = preprocess(s) # print(ts) # print(frequency_table(ts, 3)) task = input("Select task: 1. Preprocess a text 2. Show frequent words") language = input("Select language: 1. English 2. Persian") if language == "1": docs = read_docs('../data/English.csv') preprocessor = EnglishPreprocessor(docs) else: docs = read_docs('../data/Persian.xml') preprocessor = PersianPreprocessor(docs) if task == "1": preprocessor.preprocess(input("Enter text:"), True) elif task == "2": preprocessor.print_high_freq_tokens() # print(preprocessor.preprocess(""" # Parmalat to sue auditors PARMALAT, the bankrupt Italian food company, is suing outside auditors Grant Thornton and Deloitte amp; Touche, seeking 5.5 billion in damages. # """))