Ejemplo n.º 1
0
def create_vector_space(indexer=None,
                        table_file_saved=True,
                        vs_address=None,
                        should_write=True,
                        write_address=None):
    if indexer is None:
        indexer = Indexer(MODE)
    if table_file_saved:
        index_table = indexer.read_index_table()
        print(index_table)
    else:
        index_table = indexer.posting_list
    # exit(1)
    vs = VectorSpace(index_table, len(indexer.parser.get_docids()))
    if vs_address is None:
        i = 1
        print(len(index_table))
        # exit(0)
        for id in indexer.parser.get_docids():
            print(id, ':', i, '/', len(indexer.parser.get_docids()))
            words = set(indexer.index_single_doc(id).keys())
            vs.add_doc_vec(id, words)
            i += 1
        if should_write:
            vs.write_vec_to_file(MODE, write_address)
        print(vs.doc_dict)
    else:
        vs.doc_dict = VectorSpace.read_vector_space_model(MODE, vs_address)
    return vs
Ejemplo n.º 2
0
 def __init__(self, class_name, **kwargs):
     self.class_name = class_name
     self.cluster_count = 4
     self.tags_list = []
     self.indexer = Indexer("english",
                            is_data_tagged=True,
                            preload_corpus=False,
                            **kwargs)
Ejemplo n.º 3
0
 def read_data_from_file(self, file_name):
     self.indexer = Indexer("english",
                            is_data_tagged=True,
                            preload_corpus=False)
     self.indexer.set_adresses(
         index_address="DataSet/phase2/index_train",
         bigram_addres="DataSet/phase2/bigram_train.csv")
     self.indexer.parser.read_english_documents(filename=file_name,
                                                set_docs=True)
     labels = [int(label) for label in self.indexer.parser.first_row]
     return self._calculate_all_words(self.indexer), labels
Ejemplo n.º 4
0
 def customised_index_creator(self, label):
     indexer = Indexer("english",
                       preload_corpus=False,
                       is_data_tagged=False)
     indexer.parser.read_english_documents(
         self.get_corpus_address() + str(label) + ".csv", True)
     indexer.bigram_index_filename = self.get_corpus_address() + str(
         label) + "_bigram"
     indexer.index_filename = self.get_corpus_address() + str(
         label) + "_index"
     return indexer
Ejemplo n.º 5
0
def create_index_tables(mode):
    indexer = Indexer(mode)

    selection = input("""
Which table do you want to create?
\t1. Index table
\t2. bi-gram table
""")

    if selection == "1":
        indexer.index()
    elif selection == "2":
        indexer.create_bigram_index()
Ejemplo n.º 6
0
2. Farsi
""")
        print_line()
        if mode == "1":
            mode = "english"
            break
        elif mode == "2":
            mode = "persian"
            break
        else:
            invalid()
    if mode == "english":
        parser = EnglishParser()
    else:
        parser = PersianParser()
    indexer = Indexer(mode)
    index_table = indexer.read_index_table()
    bigram_table = indexer.read_bigram()
    vector_space_model = VectorSpace.read_vector_space_model(mode)
    vector_space = VectorSpace(index_table, model=vector_space_model)
    query_corrector = QueryCorrector(bigram_table)
    searcher = Searcher(query_corrector, index_table, parser, vector_space)
    while True:
        selection = input("""
Which part do you want to test?
\t1. Pre-processing (Parsing)
\t2. Indexing
\t3. Query correction
\t4. Search

\t0. Exit
Ejemplo n.º 7
0
class Classifier:
    def __init__(self, class_name, **kwargs):
        self.class_name = class_name
        self.cluster_count = 4
        self.tags_list = []
        self.indexer = Indexer("english",
                               is_data_tagged=True,
                               preload_corpus=False,
                               **kwargs)

    @staticmethod
    def _calculate_all_words(indexer):
        print("Calculating data set words.")
        result = []
        line_count = 0
        for docid in indexer.parser.get_docids():
            line_count += 1
            if line_count % 1000 == 0:
                print(f'\tProcessed {line_count} doc.')

            result.append(" ".join(
                term for term in list(indexer.create_all_terms(docid))))
        return result

    def train(self, train_data):
        return NotImplementedError

    def test(self, test_data):
        return NotImplementedError

    def read_data_from_file(self, file_name):
        self.indexer = Indexer("english",
                               is_data_tagged=True,
                               preload_corpus=False)
        self.indexer.set_adresses(
            index_address="DataSet/phase2/index_train",
            bigram_addres="DataSet/phase2/bigram_train.csv")
        self.indexer.parser.read_english_documents(filename=file_name,
                                                   set_docs=True)
        labels = [int(label) for label in self.indexer.parser.first_row]
        return self._calculate_all_words(self.indexer), labels

    def print_information(self, confusion_matrices):
        for tag in self.tags_list:
            cm = confusion_matrices[tag]
            precision = cm[0] / (cm[0] + cm[1])
            recall = cm[0] / (cm[0] + cm[2])
            accuracy = (cm[0] + cm[3]) / (sum(cm))
            f1 = 2 * precision * recall / (precision + recall)
            print(f"\tFor class {tag}:")
            print(f"\t\tAccuracy: {accuracy}")
            print(f"\t\tPrecision: {precision}")
            print(f"\t\tRecall: {recall}")
            print(f"\t\tF1: {f1}")
            print("-------------------------------------------")

    def print_metrics(self, predictions, correct_tags):
        print("Confusion Matrix:\n",
              metrics.confusion_matrix(correct_tags, predictions))
        # TODO here we must do something with our tags_list and generic confusion matrix format
        # self.print_information(metrics.confusion_matrix(correct_tags, predictions))
        print(metrics.classification_report(correct_tags, predictions))
        print(metrics.accuracy_score(correct_tags, predictions))
        print('Mean Absolute Error:',
              metrics.mean_absolute_error(correct_tags, predictions))
        print('Mean Squared Error:',
              metrics.mean_squared_error(correct_tags, predictions))
        print('Root Mean Squared Error:',
              np.sqrt(metrics.mean_squared_error(correct_tags, predictions)))

    def _predict_and_calculate_confusion_matrix(self,
                                                predict_dataset,
                                                has_header=True):
        # TP FP FN TN
        confusion_matrices = {}
        if not has_header:
            dataset, labels = predict_dataset
            predict_dataset = zip(labels, dataset)
        for tag in self.tags_list:
            confusion_matrices[tag] = [0, 0, 0, 0]
        for data in predict_dataset:
            correct_tag = data[0]
            if has_header:
                predicted_tag = self.predict_single_input(
                    f'{data[1]} {data[2]}')
            else:
                predicted_tag = self.predict_single_input(data[1])

            for tag in self.tags_list:
                if tag == correct_tag:
                    if tag == predicted_tag:
                        confusion_matrices[tag][0] += 1
                    else:
                        confusion_matrices[tag][2] += 1
                else:
                    if tag == predicted_tag:
                        confusion_matrices[tag][1] += 1
                    else:
                        confusion_matrices[tag][3] += 1
        return confusion_matrices

    def predict_single_input(self, input_str):
        return NotImplementedError

    def get_corpus_address(self):
        return FILE_ADDR_PREFIX + "/" + self.class_name + "/phase1_labeled_"

    def rewrite_csv_with_label(self, source_file, labels):
        documents = {1: [], 2: [], 3: [], 4: []}
        with open(source_file, encoding="utf8") as csv_file:
            csv_reader = csv.reader(csv_file, delimiter=',')
            line_count = 0
            for row in csv_reader:
                if line_count != 0:
                    documents[labels[line_count - 1]].append([row[0], row[1]])
                line_count += 1
            print(f'Processed {line_count} lines.')

        for label in list(documents.keys()):
            with open(self.get_corpus_address() + str(label) + ".csv",
                      mode='w') as csv_file:
                csv_writer = csv.writer(csv_file,
                                        delimiter=',',
                                        quotechar='"',
                                        quoting=csv.QUOTE_MINIMAL)
                csv_writer.writerow(["Title", "Text"])
                for doc in documents[label]:
                    csv_writer.writerow(doc)

    @staticmethod
    def show_prediction_result(y_pred, y_real):

        print("Confusion Matrix:\n", metrics.confusion_matrix(y_real, y_pred))
        # TODO here we must do something with our tags_list and generic confusion matrix format
        # self.print_information(metrics.confusion_matrix(self.test_set_labels, y_pred))
        print(metrics.classification_report(y_real, y_pred))
        print(metrics.accuracy_score(y_real, y_pred))
        print('Mean Absolute Error:',
              metrics.mean_absolute_error(y_real, y_pred))
        print('Mean Squared Error:',
              metrics.mean_squared_error(y_real, y_pred))
        print('Root Mean Squared Error:',
              np.sqrt(metrics.mean_squared_error(y_real, y_pred)))

    def customised_index_creator(self, label):
        indexer = Indexer("english",
                          preload_corpus=False,
                          is_data_tagged=False)
        indexer.parser.read_english_documents(
            self.get_corpus_address() + str(label) + ".csv", True)
        indexer.bigram_index_filename = self.get_corpus_address() + str(
            label) + "_bigram"
        indexer.index_filename = self.get_corpus_address() + str(
            label) + "_index"
        return indexer

    def create_index_and_bigram(self):
        for i in range(1, 5):
            indexer = self.customised_index_creator(i)
            indexer.index()
            indexer.create_bigram_index()

    def create_vector_space(self):
        for i in range(1, 5):
            indexer = self.customised_index_creator(i)
            create_vector_space(indexer=indexer,
                                write_address=self.get_corpus_address() +
                                str(i) + "_vs",
                                should_write=True)

    def read_vector_space_model(self, label):
        print(self.get_corpus_address() + str(label) + "_vs")
        return VectorSpace.read_vector_space_model(
            "english",
            self.get_corpus_address() + str(label) + "_vs")