def accept(self, consumer_input: PipedInput):
        """Register the input's URL as a base node and link it to its targets.

        Reads ``url`` and ``leadsTo`` from the input's metadata. The URL's
        index is added to ``self.bases``; every ``leadsTo`` entry is combined
        with the URL via ``absoluteURL`` and, unless it maps to the same index
        as the URL itself, an edge from the URL's index to it is added to
        ``self.graph``.
        """
        page_meta = consumer_input.get_meta()
        source_url = page_meta["url"]
        source_idx = self.index_of(source_url)
        self.bases.add(source_idx)

        for link in page_meta["leadsTo"]:
            target_idx = self.index_of(absoluteURL(source_url, link))
            # Skip links that resolve back to the page itself (no self-loops).
            if target_idx == source_idx:
                continue
            self.graph.add_edge(source_idx, target_idx)
    def accept(self, consumer_input: PipedInput):
        """Fill in text-match features for every query paired with this document.

        For each query id listed under the document's id in
        ``self.docs_to_queries`` (none if the id is absent), the feature
        mapping stored under the key ``"<qid>:<doc_id>"`` in ``self.features``
        receives ``match_body``, ``match_title`` and ``window`` values computed
        from the query text and the document's body/title.
        """
        doc_id = consumer_input.get_doc_id()
        doc_title = consumer_input.get_meta()["title"]
        doc_body = consumer_input.get_text()

        related_queries = self.docs_to_queries.get(doc_id, [])
        for query_id in related_queries:
            pair_features = self.features["{}:{}".format(query_id, doc_id)]
            query_text = self.queries[query_id]

            pair_features["match_body"] = fraction_of_words(query_text, doc_body)
            pair_features["match_title"] = fraction_of_words(query_text, doc_title)
            pair_features["window"] = shortest_window(query_text, doc_body)
 def accept(self, consumer_input: PipedInput):
     """Return a new input whose metadata is the JSON-decoded form of the current one."""
     decoded_meta = json.loads(consumer_input.get_meta())
     return consumer_input.new(meta=decoded_meta)
 def accept(self, consumer_input: PipedInput):
     """Return a new input with the title prepended to the text and JSON-encoded metadata.

     The new text is ``<title> . <text>``; the new metadata is the current
     metadata serialised with ``json.dumps``.
     """
     title_prefix = consumer_input.get_meta()['title'] + " . "
     combined_text = title_prefix + consumer_input.get_text()
     encoded_meta = json.dumps(consumer_input.get_meta())
     return consumer_input.new(text=combined_text, meta=encoded_meta)
Exemple #5
0
 def accept(self, consumer_input: PipedInput):
     """Return a new input with ``self.filter_stopwords`` applied to title and text.

     The title is replaced on a copy of the metadata, so the object returned
     by ``get_meta()`` does not have its ``title`` entry reassigned.
     """
     meta_copy = copy(consumer_input.get_meta())
     meta_copy["title"] = self.filter_stopwords(meta_copy["title"])
     filtered_text = self.filter_stopwords(consumer_input.get_text())
     return consumer_input.new(text=filtered_text, meta=meta_copy)
Exemple #6
0
 def accept(self, consumer_input: PipedInput):
     """Return a new input with ``self.lemmatize`` applied to title and text.

     The title is replaced on a copy of the metadata, so the object returned
     by ``get_meta()`` does not have its ``title`` entry reassigned.
     """
     meta_copy = copy(consumer_input.get_meta())
     meta_copy["title"] = self.lemmatize(meta_copy["title"])
     lemmatized_text = self.lemmatize(consumer_input.get_text())
     return consumer_input.new(text=lemmatized_text, meta=meta_copy)
Exemple #7
0
 def accept(self, consumer_input: PipedInput):
     """Return a new input with JSON-decoded metadata, using its ``url`` entry as the doc id."""
     decoded_meta = json.loads(consumer_input.get_meta())
     url = decoded_meta["url"]
     return consumer_input.new(doc_id=url, meta=decoded_meta)
 def accept(self, consumer_input: PipedInput):
     """Record the length of the document's text and of its URL.

     Stores ``text_len`` and ``url_len`` in the feature mapping kept under the
     document's id in ``self.doc_features``.
     """
     doc_id = consumer_input.get_doc_id()
     body = consumer_input.get_text()
     doc_meta = consumer_input.get_meta()

     per_doc = self.doc_features[doc_id]
     per_doc["text_len"] = len(body)
     per_doc["url_len"] = len(doc_meta["url"])