コード例 #1
0
ファイル: mead_summary_generator.py プロジェクト: zoew2/CGJZ
    def get_idf_array(self):
        """
        Use external corpus to get IDF scores
        for cluster centroid calculations.

        The `brown` corpus object is used unless self.args.corpus == 'R',
        in which case `reuters` is used. For each word index, the number of
        corpus documents containing a word mapped to that index is counted,
        and the score is log10(n / (count + 1)) with n the number of
        documents in the corpus. The result is stored on self.idf_array.

        :return: numpy array of idf values, of length Vectors().num_unique_words
        """
        corpus = reuters if self.args.corpus == 'R' else brown
        num_words = Vectors().num_unique_words
        file_ids = corpus.fileids()
        n = len(file_ids)  # number of documents in corpus

        # Document frequency per word index. Counting directly gives the same
        # totals as summing an n x num_words 0/1 matrix over its rows, without
        # allocating that matrix.
        docs_per_word = np.zeros(num_words)
        for doc_id in file_ids:
            words_in_doc = set()
            for s in corpus.sents(doc_id):
                proc_s = Preprocessor.get_processed_tokens(
                    Preprocessor.get_processed_sentence(' '.join(s)))
                if proc_s:
                    words_in_doc.update(proc_s)

            # Collect distinct indices first so each index is counted at most
            # once per document.
            word_ids = set()
            for word in words_in_doc:
                word_idx = WordMap.id_of(word)
                # Compare against None rather than truthiness: index 0 is a
                # valid column and must not be skipped.
                # NOTE(review): assumes id_of returns None for unknown words -
                # confirm against WordMap.
                if word_idx is not None:
                    word_ids.add(word_idx)
            if word_ids:
                docs_per_word[list(word_ids)] += 1

        # add one to avoid divide by zero error
        self.idf_array = np.log10(np.divide(n, docs_per_word + 1))

        return self.idf_array
コード例 #2
0
ファイル: class_sentence.py プロジェクト: zoew2/CGJZ
    def __init__(self, raw_sentence, sent_pos, doc_id=None):
        """
        Build a Sentence from raw text, keeping its cleaned raw form, its
        processed form, its position in the document and the document id.
        :param raw_sentence: sentence text; runs of whitespace are collapsed
            to single spaces before further cleaning
        :param sent_pos: position of the sentence in its document
            (converted with int())
        :param doc_id: id of the document the sentence belongs to (default None)
        """
        # normalise whitespace, then let the preprocessor strip the beginning
        collapsed = ' '.join(raw_sentence.rstrip().split())
        self.raw_sentence = Preprocessor.strip_beginning(collapsed)

        # tokens must exist before the tokenizer runs
        # (presumably __tokenize_sentence fills self.tokens - confirm)
        self.tokens = []
        self.processed = Preprocessor.get_processed_sentence(self.raw_sentence)
        self.__tokenize_sentence(self.processed)

        position = int(sent_pos)
        self.sent_pos = position  # position of sentence in document
        self.doc_id = doc_id
        self.vector = []  # placeholder
        self.order_by = position

        # score attributes start out unset
        for score_name in ('c_score', 'p_score', 'f_score',
                           'mead_score', 'lda_scores', 'melda_scores'):
            setattr(self, score_name, None)
        self.compressed = self.raw_sentence

        # update global mapping of words to indices
        # NOTE: the original author flagged self.tokens here as worth
        # double-checking
        WordMap.add_words(self.tokens)