def _results(self, items, **kwargs):
    """Build a Results object from ``items`` plus this object's invariants.

    :param items: a list of ``(score, docnum)`` tuples.
    :param kwargs: extra keyword arguments forwarded unchanged to
        ``Results``.
    :return: the new ``Results`` instance, with ``runtime`` copied from
        ``self.runtime`` and ``collector`` set to ``self``.
    """
    results = Results(self.top_searcher, self.q, items, **kwargs)
    # Attach the information that does not depend on the given items.
    results.runtime = self.runtime
    results.collector = self
    return results
def text_term_frequency(docs: Results) -> list:
    """Return the most frequent terms of the given documents.

    Only hits whose score is at least the score of the hit located at the
    middle position of ``docs`` are considered. For each of them the
    ``"paragraph_content"`` field is split on single spaces and every word
    that passes the filter (not the literal ``"None"``, not in
    ``STOP_WORDS``, longer than 5 characters) is counted.

    The terms are then ranked by descending count (ties keep first-seen
    order) and the slice between the top 0.25% and the top 20% of that
    ranking is returned, i.e. the very most frequent terms are dropped.

    :param docs: documents to analyse, as a Results object
    :return: list of ``(term, count)`` tuples, most frequent first
    """
    from collections import Counter

    def is_candidate(word: str) -> bool:
        # Rejects the "None" placeholder, stop words and words of 5 or
        # fewer characters (this also drops the empty strings produced by
        # consecutive spaces).
        return word != "None" and word not in STOP_WORDS and len(word) > 5

    counts = Counter()
    ranking_threshold = None
    for doc in docs:
        if ranking_threshold is None:
            # Loop-invariant, so computed only once. It is resolved lazily
            # on the first hit so that an empty result set never calls
            # docs.score().
            # NOTE(review): if len(docs) can exceed the number of scored
            # hits, this index may be out of range -- confirm against the
            # Results implementation in use.
            ranking_threshold = docs.score(int(len(docs) * 0.50))
        if doc.score < ranking_threshold:
            break
        # Count only words that pass the filter. Previously rejected words
        # were tallied under the key None and leaked into the ranking.
        counts.update(
            word
            for word in doc["paragraph_content"].split(" ")
            if is_candidate(word)
        )

    # most_common() sorts by count, descending, and is stable for ties.
    ranked = counts.most_common()
    start = int(len(ranked) * 0.0025)
    stop = int(len(ranked) * 0.20)
    return ranked[start:stop]
def _results(self, q, docnums, docset, runtime):
    """Wrap plain document numbers in a Results object.

    Each document number is turned into a ``(None, docnum)`` tuple, and the
    resulting list is passed to ``Results`` together with ``self.searcher``,
    ``q``, ``docset`` and ``runtime``.

    :param q: passed through to ``Results`` as its second argument.
    :param docnums: iterable of document numbers.
    :param docset: passed through to ``Results`` as its fourth argument.
    :param runtime: passed through to ``Results`` as the ``runtime`` keyword.
    :return: the new ``Results`` instance.
    """
    pairs = []
    for number in docnums:
        pairs.append((None, number))
    return Results(self.searcher, q, pairs, docset, runtime=runtime)