def accept(self, consumer_input: PipedInput):
    """Add this page's outgoing links as edges in the link graph.

    The page's own URL is registered as a base node; each URL listed in
    meta["leadsTo"] is resolved against it and connected, skipping
    self-loops.
    """
    meta = consumer_input.get_meta()
    page_url = meta["url"]
    page_node = self.index_of(page_url)
    self.bases.add(page_node)
    for link in meta["leadsTo"]:
        target_node = self.index_of(absoluteURL(page_url, link))
        # Avoid self-loops: a page linking to itself adds no edge.
        if target_node != page_node:
            self.graph.add_edge(page_node, target_node)
def accept(self, consumer_input: PipedInput):
    """Compute text-match features for every query paired with this document.

    For each query id mapped to this doc, fill the query:doc feature dict
    with body-match, title-match, and shortest-window scores.
    """
    doc_id = consumer_input.get_doc_id()
    doc_title = consumer_input.get_meta()["title"]
    doc_body = consumer_input.get_text()
    for query_id in self.docs_to_queries.get(doc_id, []):
        query_text = self.queries[query_id]
        pair_features = self.features["{}:{}".format(query_id, doc_id)]
        pair_features["match_body"] = fraction_of_words(query_text, doc_body)
        pair_features["match_title"] = fraction_of_words(query_text, doc_title)
        pair_features["window"] = shortest_window(query_text, doc_body)
def accept(self, consumer_input: PipedInput):
    """Deserialize the JSON-encoded meta string back into a dict."""
    raw_meta = consumer_input.get_meta()
    return consumer_input.new(meta=json.loads(raw_meta))
def accept(self, consumer_input: PipedInput):
    """Prepend the title to the body text and serialize meta to JSON."""
    meta = consumer_input.get_meta()
    combined_text = meta['title'] + " . " + consumer_input.get_text()
    return consumer_input.new(text=combined_text, meta=json.dumps(meta))
def accept(self, consumer_input: PipedInput):
    """Strip stopwords from both the body text and the meta title.

    Meta is shallow-copied so the incoming input's dict is not mutated.
    """
    cleaned_meta = copy(consumer_input.get_meta())
    cleaned_meta["title"] = self.filter_stopwords(cleaned_meta["title"])
    cleaned_text = self.filter_stopwords(consumer_input.get_text())
    return consumer_input.new(text=cleaned_text, meta=cleaned_meta)
def accept(self, consumer_input: PipedInput):
    """Lemmatize both the body text and the meta title.

    Meta is shallow-copied so the incoming input's dict is not mutated.
    """
    lemmatized_meta = copy(consumer_input.get_meta())
    lemmatized_meta["title"] = self.lemmatize(lemmatized_meta["title"])
    lemmatized_text = self.lemmatize(consumer_input.get_text())
    return consumer_input.new(text=lemmatized_text, meta=lemmatized_meta)
def accept(self, consumer_input: PipedInput):
    """Parse the serialized meta and promote its URL to the document id."""
    parsed_meta = json.loads(consumer_input.get_meta())
    return consumer_input.new(doc_id=parsed_meta["url"], meta=parsed_meta)
def accept(self, consumer_input: PipedInput):
    """Record simple per-document features: body length and URL length."""
    doc_id = consumer_input.get_doc_id()
    feature_entry = self.doc_features[doc_id]
    feature_entry["text_len"] = len(consumer_input.get_text())
    feature_entry["url_len"] = len(consumer_input.get_meta()["url"])