# NOTE: These functions assume module-level imports along the following lines
# (the exact package layout is an assumption):
#
#     import math
#     from collections import defaultdict
#
#     import numpy as np
#     from nltk.stem import snowball
#
#     from . import base, consts, relevance, utils
#     from . import logging  # package-local helper providing progress()/clear()


def keyphrases_table(keyphrases, texts, similarity_measure=None, synonimizer=None,
                     language=consts.Language.ENGLISH):
    """
    Constructs the keyphrases table, containing their matching scores in a set of texts.

    The resulting table is stored as a dictionary of dictionaries, where the entry
    table["keyphrase"]["text"] corresponds to the matching score (0 <= score <= 1)
    of keyphrase "keyphrase" in the text named "text".

    :param keyphrases: list of strings
    :param texts: dictionary of the form {text_name: text}
    :param similarity_measure: similarity measure to use
    :param synonimizer: SynonymExtractor object to be used
    :param language: language of the text collection / keyphrases

    :returns: dictionary of dictionaries, having keyphrases on its first level
              and text names on the second level.
    """
    similarity_measure = similarity_measure or relevance.ASTRelevanceMeasure()

    text_titles = list(texts.keys())
    text_collection = list(texts.values())

    similarity_measure.set_text_collection(text_collection, language)

    # Drop empty keyphrases up front so that the progress total stays accurate.
    keyphrases = [keyphrase for keyphrase in keyphrases if keyphrase]
    keyphrases_prepared = {keyphrase: utils.prepare_text(keyphrase)
                           for keyphrase in keyphrases}

    total_scores = len(text_collection) * len(keyphrases)

    i = 0
    res = {}
    for keyphrase in keyphrases:
        res[keyphrase] = {}
        for j in range(len(text_collection)):
            i += 1
            logging.progress("Calculating matching scores", i, total_scores)
            res[keyphrase][text_titles[j]] = similarity_measure.relevance(
                                                keyphrases_prepared[keyphrase],
                                                text=j, synonimizer=synonimizer)
    logging.clear()

    return res
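# Usage sketch (illustrative only; the text names and keyphrases below are made up,
# and it is assumed that keyphrases_table() is importable from this module):
#
#     texts = {
#         "doc_a": "The annotated suffix tree method matches keyphrases approximately.",
#         "doc_b": "TF-IDF weighs terms by their frequency and rarity.",
#     }
#     table = keyphrases_table(["suffix tree", "tf-idf"], texts)
#     score = table["suffix tree"]["doc_a"]   # float in [0, 1]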
def set_text_collection(self, texts, language=consts.Language.ENGLISH):
    self.texts = texts
    self.language = language
    self.asts = []
    total_texts = len(texts)
    for i in range(total_texts):
        # NOTE(mikhaildubov): utils.text_to_strings_collection()
        #                     does utils.prepare_text() as well.
        self.asts.append(base.AST.get_ast(utils.text_to_strings_collection(texts[i]),
                                          self.ast_algorithm))
        logging.progress("Indexing texts with ASTs", i + 1, total_texts)
    logging.clear()
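# A minimal sketch of how this indexing step is driven, mirroring the calls made in
# keyphrases_table() above (the texts and the query string are made up):
#
#     measure = relevance.ASTRelevanceMeasure()
#     measure.set_text_collection(["first text", "second text"])
#     score = measure.relevance(utils.prepare_text("first"), text=0)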
def _preprocess_tokens(self, tokens_in_texts):
    if self.vector_space == consts.VectorSpace.WORDS:
        return tokens_in_texts
    elif self.vector_space == consts.VectorSpace.STEMS:
        # TODO(mikhaildubov): If the user does not specify the language, can we do some
        #                     auto language detection here?
        stemmed_tokens = []
        total_texts = len(tokens_in_texts)
        for i in range(total_texts):
            stemmed_tokens.append([self.stemmer.stem(token)
                                   for token in tokens_in_texts[i]])
            logging.progress("Stemming tokens in texts", i + 1, total_texts)
        logging.clear()
        return stemmed_tokens
    elif self.vector_space == consts.VectorSpace.LEMMATA:
        # TODO(mikhaildubov): Implement this (what lemmatizer to use here?)
        raise NotImplementedError()
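# For reference, the STEMS branch relies on NLTK's Snowball stemmer to reduce
# inflected forms to a common stem (real NLTK API; the example words are arbitrary):
#
#     >>> from nltk.stem import snowball
#     >>> snowball.SnowballStemmer("english").stem("running")
#     'run'
#     >>> snowball.SnowballStemmer("english").stem("matches")
#     'match'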
def set_text_collection(self, texts, language=consts.Language.ENGLISH):
    self.language = language
    if self.vector_space == consts.VectorSpace.STEMS:
        self.stemmer = snowball.SnowballStemmer(self.language)
    raw_tokens = []
    total_texts = len(texts)
    for i in range(total_texts):
        raw_tokens.append(utils.tokenize_and_filter(utils.prepare_text(texts[i])))
        logging.progress("Preparing texts", i + 1, total_texts)
    logging.clear()
    # Convert to stems or lemmata, depending on the vector space type.
    preprocessed_tokens = self._preprocess_tokens(raw_tokens)
    # Terms define the vector space (they can be words, stems or lemmata). They should be
    # defined once here because they will be reused when we compute tf-idf for queries.
    self.terms = list(set(utils.flatten(preprocessed_tokens)))
    self.tf, self.idf = self._tf_idf(preprocessed_tokens)
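# Hypothetical walk-through of what this method computes in the STEMS vector space
# (made-up texts; assumes tokenize_and_filter() keeps every token of these texts):
#
#     texts:   ["Cats drink milk.", "Dogs drink water."]
#     terms:   ["cat", "drink", "milk", "dog", "water"]   # order is arbitrary
#     tf[0] ~= [1/3, 1/3, 1/3, 0, 0]
#     tf[1] ~= [0, 1/3, 0, 1/3, 1/3]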
def _tf_idf(self, tokens_in_texts):
    # Calculate the inverted term index to facilitate further calculations.
    # This is a mapping from a term to its position in the vector.
    term_index = {term: i for i, term in enumerate(self.terms)}

    total_texts = len(tokens_in_texts)
    terms_count = len(self.terms)

    # Calculate TF and IDF
    tf = [np.zeros(terms_count) for _ in range(total_texts)]
    idf_per_term = defaultdict(int)
    for i in range(total_texts):
        logging.progress("Processing texts for TF-IDF", i + 1, total_texts)
        # NOTE(mikhaildubov): For TF, we want to count each term as many times
        #                     as it appears.
        for term in tokens_in_texts[i]:
            if term in term_index:
                tf[i][term_index[term]] += 1
        # NOTE(mikhaildubov): For IDF, we want to count each document once for each term.
        if self.term_weighting == consts.TermWeighting.TF_IDF:
            for term in set(tokens_in_texts[i]):
                if term in term_index:
                    idf_per_term[term] += 1
        # TF normalization
        tf[i] /= max(len(tokens_in_texts[i]), 1)

    # Actual IDF metric calculation
    if self.term_weighting == consts.TermWeighting.TF_IDF:
        idf = np.zeros(terms_count)
        for term in idf_per_term:
            idf[term_index[term]] = 1 + math.log(float(total_texts) / idf_per_term[term])
    else:
        idf = None
    logging.clear()
    return tf, idf
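# The IDF variant computed above is the smoothed form
#
#     idf(t) = 1 + ln(N / df(t)),
#
# where N is the number of texts and df(t) is the number of texts containing term t,
# so a term occurring in every text still gets weight 1 rather than 0. Continuing
# the walk-through above (N = 2): idf("drink") = 1 + ln(2/2) = 1, while
# idf("cat") = 1 + ln(2/1) ~= 1.69.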