예제 #1
0
    def index(self):
        """Index the not-yet-processed documents found under ``self.file_path``.

        Calls ``self.loadPickle()`` first (presumably restoring a previous
        run's state -- confirm against its definition), then treats
        ``self.corpus_count`` as the number of documents already indexed and
        skips that many entries of the directory listing.

        For each remaining file the method:
          * parses it with ``self.parse_JSON`` into ``docID, text, title, url``,
          * builds term-frequency data for the title and the text via
            ``Utilities.parser`` / ``Utilities.frequency_counter``,
          * feeds both into ``self.parse_corpus`` (the title call passes
            ``True`` as the third argument),
          * records ``docID -> url`` in ``self.docID2URL``,
          * calls ``self.sanitize()``.

        Once all files are handled it calls ``self.score_data()`` and
        ``self.dump_pickle()``. Progress is reported on stdout.
        """
        self.loadPickle()

        # os.listdir() returns entries in arbitrary order, so the listing is
        # sorted to make the resume offset meaningful across runs.
        # ".DS_Store" is removed BEFORE slicing: it never increments
        # corpus_count, so leaving it in the list would shift the offset and
        # cause an already-indexed document to be processed again on resume.
        # NOTE(review): a checkpoint written while the listing was unsorted
        # may not line up with this order -- rebuild the index if in doubt.
        filenames = sorted(
            name for name in os.listdir(self.file_path) if name != ".DS_Store"
        )

        for filename in filenames[self.corpus_count:]:
            print(self.corpus_count)
            self.corpus_count += 1

            file_path = os.path.join(self.file_path, filename)
            docID, text, title, url = self.parse_JSON(file_path)

            title_frequency = Utilities.frequency_counter(Utilities.parser(title))
            text_frequency = Utilities.frequency_counter(Utilities.parser(text))

            # Third positional argument is True only for the title pass.
            self.parse_corpus(docID, title_frequency, True)
            self.parse_corpus(docID, text_frequency)
            self.docID2URL[docID] = url

            # Debugging aids, disabled by default:
            # Logger.print_log(docID, file_path, text, text_frequency, title, title_frequency)
            # Logger.print_dicts(self.term2termID, self.term2DocIDFreq, self.term2TFIDF, self.corpus_count)
            self.sanitize()
        print("index complete")
        self.score_data()
        print("scoring complete")
        self.dump_pickle()
        print("done")