def accept(self, consumer_input: PipedInput):
    # Lemmatize the lowercased text and keep only Russian, Belarusian
    # and English tokens.
    tokens = self.mystem.lemmatize(consumer_input.get_text().lower())
    result = []
    for token in tokens:
        token = token.strip()
        if is_russian(token) or is_belarusian(token) or is_english(token):
            result.append(token)
    return consumer_input.new(text=" ".join(result))
def accept(self, consumer_input: PipedInput):
    # Tokenize the lowercased text and stem each token with the
    # language-appropriate stemmer.
    text = consumer_input.get_text().lower()
    token_words = word_tokenize(text)
    result = []
    for token in token_words:
        token = token.strip()
        if is_russian(token) or is_belarusian(token):
            result.append(self.russian_stemmer.stem(token))
        if is_english(token):
            result.append(self.english_stemmer.stem(token))
    return consumer_input.new(text=" ".join(result))
def accept(self, consumer_input: PipedInput):
    # For every query this document is judged against, record how well the
    # query matches the body and the title, and the shortest window of the
    # body that covers the query words.
    doc_id = consumer_input.get_doc_id()
    title = consumer_input.get_meta()["title"]
    body = consumer_input.get_text()
    for qid in self.docs_to_queries.get(doc_id, []):
        pair_key = "{}:{}".format(qid, doc_id)
        features = self.features[pair_key]
        text = self.queries[qid]
        features["match_body"] = fraction_of_words(text, body)
        features["match_title"] = fraction_of_words(text, title)
        features["window"] = shortest_window(text, body)
def accept(self, consumer_input: PipedInput):
    # Accumulate the collection statistics needed for BM25: document
    # lengths, document frequencies and per-document term frequencies
    # for the tracked query words.
    text = consumer_input.get_text()
    doc_id = consumer_input.get_doc_id()
    self.doc_lengths[doc_id] = len(text)
    for word in self.df.keys():
        if word in text:
            self.df[word] += 1
            if doc_id in self.tf[word]:
                self.tf[word][doc_id] += 1
            else:
                self.tf[word][doc_id] = 1
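# A minimal sketch, not part of the original stage, of how the collected
# statistics could be combined into a BM25 value for one (query, document)
# pair. The method name `score` and the constants k1 and b are assumptions.
def score(self, query_words, doc_id, k1=1.2, b=0.75):
    import math
    documents_count = len(self.doc_lengths)
    avg_len = sum(self.doc_lengths.values()) / max(documents_count, 1)
    result = 0.0
    for word in query_words:
        tf = self.tf.get(word, {}).get(doc_id, 0)
        df = self.df.get(word, 0)
        if tf == 0 or df == 0:
            continue
        idf = math.log((documents_count - df + 0.5) / (df + 0.5) + 1)
        norm = tf + k1 * (1 - b + b * self.doc_lengths[doc_id] / avg_len)
        result += idf * tf * (k1 + 1) / norm
    return result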
def accept(self, consumer_input: PipedInput):
    # Add the document URL and all of its outgoing links to the link graph.
    meta = consumer_input.get_meta()
    base = meta["url"]
    base_i = self.index_of(base)
    self.bases.add(base_i)
    doc_id = consumer_input.get_doc_id()
    self.doc_ids[base_i] = doc_id
    for link in meta["leadsTo"]:
        full = absoluteURL(base, link)
        i = self.index_of(full)
        if i != base_i:
            self.graph.add_edge(base_i, i)
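# A minimal sketch, not part of the original stage, of how the accumulated
# link graph could be turned into per-document ranks. It assumes self.graph
# is a networkx.DiGraph; the method name `compute_ranks` is hypothetical.
def compute_ranks(self):
    import networkx as nx
    ranks = nx.pagerank(self.graph, alpha=0.85)
    # Keep only the nodes that correspond to crawled documents.
    return {self.doc_ids[i]: ranks.get(i, 0.0) for i in self.bases}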
def main(): parser = argparse.ArgumentParser() parser.add_argument('-i', "--input", type=str, help="XML to convert reuests", default="web2008_adhoc.xml") parser.add_argument('-m', "--mode", type=str, help="How to convert requests", default="stemmas") args = parser.parse_args() text = "" with open(args.input, "r", encoding="cp1251") as file: text = "\n".join(file.readlines()) if args.mode == "lemmas": formatter = TextLemmatizerStage() elif args.mode == "stemmas": formatter = TextStemmerStage() else: raise dom = minidom.parseString(text) for token in dom.getElementsByTagName("task"): token.firstChild.firstChild.replaceWholeText( formatter.accept( PipedInput(token.firstChild.firstChild.nodeValue, None, None)).get_text()) with open("output_{}.xml".format(args.mode), 'w', encoding="utf-8") as file: file.write(dom.toxml())
def accept(self, consumer_input: PipedInput):
    # Create empty feature dictionaries for the document and for every
    # (query, document) pair it participates in.
    doc_id = consumer_input.get_doc_id()
    self.doc_features[doc_id] = {}
    for qid in self.docs_to_queries.get(doc_id, []):
        pair_key = "{}:{}".format(qid, doc_id)
        self.features[pair_key] = {}
def accept(self, consumer_input: PipedInput):
    # Count English and non-English words and their total lengths.
    # The text is expected to be a whitespace-separated string of words.
    for word in consumer_input.get_text().split():
        if is_english(word):
            self.english_words += 1
            self.english_lengths_sum += len(word)
        else:
            self.not_english_words += 1
            self.not_english_lengths_sum += len(word)
def accept(self, consumer_input: PipedInput):
    # Count Russian and English words and how many of them are stopwords.
    # The text is expected to be a whitespace-separated string of words.
    for word in consumer_input.get_text().split():
        if is_russian(word):
            self.russian_words += 1
            if word in self.__russian_stopwords:
                self.russian_stopwords += 1
        if is_english(word):
            self.english_words += 1
            if word in self.__english_stopwords:
                self.english_stopwords += 1
def accept(self, consumer_input: PipedInput):
    # Accumulate document frequencies (for idf), raw term frequencies and
    # the number of processed documents. The text is expected to be a
    # whitespace-separated string of words.
    words = consumer_input.get_text().split()
    for word in set(words):
        self.idf[word] = self.idf.get(word, 0) + 1
    for word in words:
        self.tf[word] = self.tf.get(word, 0) + 1
    self.documents_count += 1
def accept(self, consumer_input: PipedInput): new_meta = copy(consumer_input.get_meta()) new_meta["title"] = self.filter_stopwords(new_meta["title"]) return consumer_input.new(text=self.filter_stopwords( consumer_input.get_text()), meta=new_meta)
def main():
    timestamp = time.time()
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--documents", type=int,
                        help="Number of documents to process", default=100)
    parser.add_argument('-i', "--input", type=str,
                        help="Directory to read extracted documents from",
                        default="extracted")
    parser.add_argument('-e', '--encoding', type=str,
                        help="Documents encoding", default='utf8')
    args = parser.parse_args()
    print("Ready to process {} files in {} directory.".format(
        args.documents, args.input))

    lemm_holder = StringHolder()
    stem_holder = StringHolder()
    header_holder = StringHolder()
    stages = [
        # Turns file lines from the input into a list of normalized words.
        PipelineDumpingStage(TextLemmatizerStage(), lemm_holder),
        PipelineDumpingStage(TextStemmerStage(), stem_holder),
        JsonUnpackerStage(),
        PipelineImmutableStage(PageRankStage()),
        PipelineDumpingStage(TextWithHeaderStage(), header_holder),
    ]  # Register your pipeline stage here.

    try:
        os.makedirs("logs")
        os.makedirs("results")
        os.makedirs("lemmatized")
        os.makedirs("stemmed")
        os.makedirs("headered")
        print("Created directories", file=sys.stderr)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    with open(os.path.join('logs', '{}.txt'.format(timestamp)), 'w') as logs:
        sys.stdout = logs
        doc_id = 0
        documents_counter = 0
        while documents_counter < args.documents:
            text_path = os.path.join(args.input, "{:>07d}.txt".format(doc_id))
            meta_path = os.path.join(args.input, "{:>07d}.json".format(doc_id))
            if os.path.exists(text_path) and os.path.exists(meta_path):
                lemm_holder.s = os.path.join("lemmatized", "{:>07d}".format(doc_id))
                stem_holder.s = os.path.join("stemmed", "{:>07d}".format(doc_id))
                header_holder.s = os.path.join("headered", "{:>07d}".format(doc_id))
                print("Found files {} and {}. {}/{} processed.".format(
                    text_path, meta_path, documents_counter, args.documents),
                    file=sys.stderr)
                with open(text_path, 'r', encoding=args.encoding) as text_file:
                    with open(meta_path, 'r', encoding=args.encoding) as meta_file:
                        text = "\n".join(text_file.readlines())
                        meta = "\n".join(meta_file.readlines())
                        stage_input = PipedInput(text, meta, doc_id)
                        for consumer in stages:
                            stage_input = consumer.accept(stage_input)
                documents_counter += 1
            doc_id += 1
        for stage in stages:
            stage.dump()
def accept(self, consumer_input: PipedInput):
    # Parse the JSON metadata and use the document URL as its identifier.
    meta = json.loads(consumer_input.get_meta())
    return consumer_input.new(doc_id=meta["url"], meta=meta)
def accept(self, consumer_input: PipedInput):
    # Record simple per-document features: text length and URL length.
    doc_id = consumer_input.get_doc_id()
    text = consumer_input.get_text()
    meta = consumer_input.get_meta()
    self.doc_features[doc_id]["text_len"] = len(text)
    self.doc_features[doc_id]["url_len"] = len(meta["url"])
def main(): parser = argparse.ArgumentParser() parser.add_argument("-d", "--documents", type=int, help="Number of documents to process", default=100) parser.add_argument('-i', "--input", type=str, help="Directory to read extracted documents from", default="extracted") parser.add_argument('-e', '--encoding', type=str, help="Documents encoding", default='utf8') parser.add_argument('-r', '--relevant', type=str, help="Relevance table for queries path", default="or_relevant-minus_table.xml") parser.add_argument('-q', '--queries', type=str, help="Queries path", default="web2008_adhoc.xml") args = parser.parse_args() relevant, irrelevant = load_relevant_docs(args.relevant) queries = load_queries(args.queries, relevant, irrelevant) docs_to_queries = load_docs_to_queries(args.relevant, queries) features = {} query_features = {} doc_features = {} CreateFeatureDumper = lambda stage: PipelineFeaturesDumper( stage, features, query_features, doc_features) stages = [ JsonUnpackerStage(), TextProcessorStage(), StopwordsFilter(), PipelineImmutableStage( CreateFeatureDumper(InitFeaturesStage(queries, docs_to_queries))), PipelineImmutableStage(CreateFeatureDumper(LengthCounterStage())), PipelineImmutableStage( CreateFeatureDumper(FieldMatchStage(queries, docs_to_queries))), PipelineImmutableStage(CreateFeatureDumper(PageRankStage())), PipelineImmutableStage(CreateFeatureDumper(QueryLengthStage(queries))), PipelineImmutableStage( CreateFeatureDumper(BM25Stage(queries, docs_to_queries))), ] # Register your pipeline stage here. print("Ready to process {} files in {} directory.".format( args.documents, args.input)) try: os.makedirs("logs") os.makedirs("results") print("Created directories", file=sys.stderr) except OSError as e: if e.errno != errno.EEXIST: raise timestamp = time.time() with open(os.path.join('logs', '{}.txt'.format(timestamp)), 'w') as logs: sys.stdout = logs doc_id = 0 documents_counter = 0 while documents_counter < args.documents: text_path = os.path.join(args.input, "{:>07d}.txt".format(doc_id)) meta_path = os.path.join(args.input, "{:>07d}.json".format(doc_id)) if os.path.exists(text_path) and os.path.exists(meta_path): print("Found files {} and {}. {}/{} processed.".format( text_path, meta_path, documents_counter, args.documents), file=sys.stderr) with open(text_path, 'r', encoding=args.encoding) as text_file: with open(meta_path, 'r', encoding=args.encoding) as meta_file: text = "\n".join(text_file.readlines()) meta = "\n".join(meta_file.readlines()) stage_input = PipedInput(text, meta, doc_id) for consumer in stages: stage_input = consumer.accept(stage_input) documents_counter += 1 doc_id += 1 for stage in stages: stage.dump() result = {} fid = { "target": "", "qid": "qid:", "text_len": "1:", "url_len": "2:", "pagerank": "3:", "query_len": "4:", "query_words_len": "5:", "match_body": "6:", "match_title": "7:", "window": "8:", "BM25": "9:", } new_qid = 1 qids = {} for doc, queries in docs_to_queries.items(): if doc not in doc_features: continue df = doc_features[doc] for q in queries: print("Recording features for pair doc,qid: ", doc, q, file=sys.stderr) qf = query_features[q] if q not in qids: qids[q] = new_qid new_qid += 1 fs = {"target": int(doc in relevant[q]), "qid": qids[q]} fs.update(df) fs.update(qf) pair = "{}:{}".format(q, doc) fs.update(features[pair]) result[pair] = fs with open("dataset", 'w') as file: for pair, fs in result.items(): for key, value in fs.items(): file.write("{}{} ".format(fid[key], value)) file.write("\n")
def accept(self, consumer_input: PipedInput):
    # Prepend the title to the text and re-serialize the metadata.
    text = consumer_input.get_meta()['title'] + " . " + consumer_input.get_text()
    return consumer_input.new(text=text,
                              meta=json.dumps(consumer_input.get_meta()))
def accept(self, consumer_input: PipedInput):
    # Parse the JSON metadata into a dictionary.
    return consumer_input.new(meta=json.loads(consumer_input.get_meta()))
def accept(self, consumer_input: PipedInput): new_meta = copy(consumer_input.get_meta()) new_meta["title"] = self.lemmatize(new_meta["title"]) return consumer_input.new(text=self.lemmatize( consumer_input.get_text()), meta=new_meta)