def accept(self, consumer_input: PipedInput):
     tokens = self.mystem.lemmatize(consumer_input.get_text().lower())
     result = []
     for token in tokens:
         token = token.strip()
         if is_russian(token) or is_belarusian(token) or is_english(token):
             result.append(token)
     return consumer_input.new(text=" ".join(result))
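The filter above relies on is_russian, is_belarusian and is_english predicates defined elsewhere in the project. A minimal sketch of what such checks could look like, assuming simple alphabet tests on already lower-cased tokens (the real helpers may be more involved):

import re

# Hypothetical alphabet checks; the project's actual predicates are not shown.
_RUSSIAN = re.compile(r"^[а-яё]+$")
_BELARUSIAN = re.compile(r"^[абвгдеёжзійклмнопрстуўфхцчшыьэюя'-]+$")
_ENGLISH = re.compile(r"^[a-z]+$")


def is_russian(token: str) -> bool:
    return bool(_RUSSIAN.match(token))


def is_belarusian(token: str) -> bool:
    return bool(_BELARUSIAN.match(token))


def is_english(token: str) -> bool:
    return bool(_ENGLISH.match(token))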
 def accept(self, consumer_input: PipedInput):
     text = consumer_input.get_text().lower()
     token_words = word_tokenize(text)
     result = []
     for token in token_words:
         token = token.strip()
         if is_russian(token) or is_belarusian(token):
             result.append(self.russian_stemmer.stem(token))
         if is_english(token):
             result.append(self.english_stemmer.stem(token))
     return consumer_input.new(text=" ".join(result))
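Both stages assume pre-built analyzers (self.mystem, self.russian_stemmer, self.english_stemmer). A plausible setup, assuming the first accept() belongs to TextLemmatizerStage and the second to TextStemmerStage, using pymystem3 and NLTK's Snowball stemmers (a sketch, not the project's actual constructors):

from nltk.stem.snowball import SnowballStemmer
from pymystem3 import Mystem


class TextLemmatizerStage:
    def __init__(self):
        # Mystem handles Russian lemmatization; English and Belarusian
        # tokens pass through largely unchanged.
        self.mystem = Mystem()


class TextStemmerStage:
    def __init__(self):
        # Snowball has no Belarusian stemmer, so Belarusian tokens are fed
        # to the Russian stemmer in accept() above.
        self.russian_stemmer = SnowballStemmer("russian")
        self.english_stemmer = SnowballStemmer("english")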
    def accept(self, consumer_input: PipedInput):
        doc_id = consumer_input.get_doc_id()
        title = consumer_input.get_meta()["title"]
        body = consumer_input.get_text()

        for qid in self.docs_to_queries.get(doc_id, []):
            pair_key = "{}:{}".format(qid, doc_id)
            features = self.features[pair_key]

            text = self.queries[qid]
            features["match_body"] = fraction_of_words(text, body)
            features["match_title"] = fraction_of_words(text, title)
            features["window"] = shortest_window(text, body)
Example #4
    def accept(self, consumer_input: PipedInput):
        text = consumer_input.get_text()
        doc_id = consumer_input.get_doc_id()

        self.doc_lengths[doc_id] = len(text)

        # df/tf are pre-seeded with the query vocabulary; build a token set so
        # membership tests match whole words rather than substrings.
        doc_words = set(text.split()) if isinstance(text, str) else set(text)
        for word in self.df.keys():
            if word in doc_words:
                self.df[word] += 1
                if doc_id in self.tf[word]:
                    self.tf[word][doc_id] += 1
                else:
                    self.tf[word][doc_id] = 1
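With df (document frequencies), tf (per-document term counts) and doc_lengths accumulated as above, a BM25 feature per query/document pair could be derived roughly as follows (a sketch only; the project's BM25Stage is not shown, and k1/b are the usual defaults):

import math


def bm25_score(query_words, doc_id, df, tf, doc_lengths, k1=1.2, b=0.75):
    # Classic Okapi BM25 over the accumulated statistics.
    n_docs = len(doc_lengths)
    if n_docs == 0:
        return 0.0
    avg_len = sum(doc_lengths.values()) / n_docs
    score = 0.0
    for word in query_words:
        freq = tf.get(word, {}).get(doc_id, 0)
        n_word = df.get(word, 0)
        if freq == 0 or n_word == 0:
            continue
        idf = math.log((n_docs - n_word + 0.5) / (n_word + 0.5) + 1.0)
        denom = freq + k1 * (1.0 - b + b * doc_lengths[doc_id] / avg_len)
        score += idf * freq * (k1 + 1.0) / denom
    return score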
Example #5
    def accept(self, consumer_input: PipedInput):
        meta = consumer_input.get_meta()
        base = meta["url"]

        base_i = self.index_of(base)
        self.bases.add(base_i)

        doc_id = consumer_input.get_doc_id()
        self.doc_ids[base_i] = doc_id

        for link in meta["leadsTo"]:
            full = absoluteURL(base, link)
            i = self.index_of(full)
            if i != base_i:
                self.graph.add_edge(base_i, i)
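The stage above only builds the link graph; the PageRank values themselves can be computed when the stage dumps its results. Assuming self.graph is a networkx.DiGraph (an assumption, the constructor is not shown), that step could look like:

import networkx as nx


def pagerank_per_document(graph, bases, doc_ids):
    # Map each crawled page index (in `bases`) to its PageRank score,
    # keyed by the document id recorded for that page.
    ranks = nx.pagerank(graph)
    return {doc_ids[i]: ranks.get(i, 0.0) for i in bases}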
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i',
                        "--input",
                        type=str,
                        help="XML to convert reuests",
                        default="web2008_adhoc.xml")
    parser.add_argument('-m',
                        "--mode",
                        type=str,
                        help="How to convert requests",
                        default="stemmas")

    args = parser.parse_args()
    text = ""
    with open(args.input, "r", encoding="cp1251") as file:
        text = "\n".join(file.readlines())

    if args.mode == "lemmas":
        formatter = TextLemmatizerStage()
    elif args.mode == "stemmas":
        formatter = TextStemmerStage()
    else:
        raise ValueError("Unknown mode: {}".format(args.mode))

    dom = minidom.parseString(text)
    for token in dom.getElementsByTagName("task"):
        token.firstChild.firstChild.replaceWholeText(
            formatter.accept(
                PipedInput(token.firstChild.firstChild.nodeValue, None,
                           None)).get_text())

    with open("output_{}.xml".format(args.mode), 'w',
              encoding="utf-8") as file:
        file.write(dom.toxml())
    def accept(self, consumer_input: PipedInput):
        doc_id = consumer_input.get_doc_id()

        self.doc_features[doc_id] = {}
        for qid in self.docs_to_queries.get(doc_id, []):
            pair_key = "{}:{}".format(qid, doc_id)
            self.features[pair_key] = {}
Example #8
 def accept(self, consumer_input: PipedInput):
     for word in consumer_input.get_text():
         if is_english(word):
             self.english_words += 1
             self.english_lengths_sum += len(word)
         else:
             self.not_english_words += 1
             self.not_english_lengths_sum += len(word)
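These running totals are typically reduced to average token lengths when the stage reports its results; a small zero-safe helper for that could be:

def average_token_lengths(english_words, english_lengths_sum,
                          not_english_words, not_english_lengths_sum):
    # Average token length for English and non-English tokens.
    eng = english_lengths_sum / english_words if english_words else 0.0
    rest = not_english_lengths_sum / not_english_words if not_english_words else 0.0
    return eng, rest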
Example #9
 def accept(self, consumer_input: PipedInput):
     for word in consumer_input.get_text():
         if is_russian(word):
             self.russian_words += 1
             if word in self.__russian_stopwords:
                 self.russian_stopwords += 1
         if is_english(word):
             self.english_words += 1
             if word in self.__english_stopwords:
                 self.english_stopwords += 1
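The private __russian_stopwords / __english_stopwords sets have to be provided by the constructor; a sketch assuming NLTK's bundled stopword lists (the class name here is hypothetical):

from nltk.corpus import stopwords  # requires the nltk 'stopwords' corpus


class StopwordsCounterStage:
    def __init__(self):
        # NLTK ships Russian and English lists; Belarusian is not covered,
        # which matches the two-language counting in accept() above.
        self.__russian_stopwords = set(stopwords.words("russian"))
        self.__english_stopwords = set(stopwords.words("english"))
        self.russian_words = 0
        self.russian_stopwords = 0
        self.english_words = 0
        self.english_stopwords = 0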
Example #10
 def accept(self, consumer_input: PipedInput):
     for word in set(consumer_input.get_text()):
         self.idf[word] = self.idf.get(word, 0) + 1
     for word in consumer_input.get_text():
         self.tf[word] = self.tf.get(word, 0) + 1
     self.documents_count += 1
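Here self.idf actually accumulates document frequencies (one increment per document containing the word) while self.tf accumulates corpus-wide term counts. Turning the document frequencies into idf weights at dump time could look like this (a sketch using a common smoothed formula):

import math


def idf_weights(document_frequencies, documents_count):
    # Smoothed inverse document frequency: rare words get large weights,
    # words present in almost every document get weights near zero.
    return {
        word: math.log(documents_count / (1.0 + df))
        for word, df in document_frequencies.items()
    }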
Example #11
 def accept(self, consumer_input: PipedInput):
     new_meta = copy(consumer_input.get_meta())
     new_meta["title"] = self.filter_stopwords(new_meta["title"])
     return consumer_input.new(text=self.filter_stopwords(
         consumer_input.get_text()),
                               meta=new_meta)
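filter_stopwords itself is not shown; a minimal version consistent with the whitespace-joined text produced by the earlier stages (written here as a free function with an explicit stopword-set argument, which is an assumption):

def filter_stopwords(text: str, stopword_set) -> str:
    # Drop every token found in the stopword set; assumes the text is
    # already lower-cased and whitespace-tokenized.
    return " ".join(word for word in text.split() if word not in stopword_set)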
def main():
    timestamp = time.time()

    parser = argparse.ArgumentParser()

    parser.add_argument("-d", "--documents", type=int, help="Number of documents to process", default=100)
    parser.add_argument('-i', "--input", type=str, help="Directory to read extracted documents from",
                        default="extracted")
    parser.add_argument('-e', '--encoding', type=str, help="Documents encoding", default='utf8')

    args = parser.parse_args()

    print("Ready to process {} files in {} directory.".format(args.documents, args.input))

    lemm_holder = StringHolder()
    stem_holder = StringHolder()
    header_holder = StringHolder()

    stages = [
        # Turns file lines from the input into the list of normalized words.
        PipelineDumpingStage(TextLemmatizerStage(), lemm_holder),
        PipelineDumpingStage(TextStemmerStage(), stem_holder),
        JsonUnpackerStage(),
        PipelineImmutableStage(PageRankStage()),
        PipelineDumpingStage(TextWithHeaderStage(), header_holder),
    ]
    # Register your pipeline stage here.

    try:
        os.makedirs("logs")
        os.makedirs("results")
        os.makedirs("lemmatized")
        os.makedirs("stemmed")
        os.makedirs("headered")
        print("Created directories", file=sys.stderr)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    with open(os.path.join('logs', '{}.txt'.format(timestamp)), 'w') as logs:
        # Route ordinary print() output from the stages into the per-run log
        # file; progress messages below are written to stderr explicitly.
        sys.stdout = logs

        doc_id = 0
        documents_counter = 0
        while documents_counter < args.documents:
            text_path = os.path.join(args.input, "{:>07d}.txt".format(doc_id))
            meta_path = os.path.join(args.input, "{:>07d}.json".format(doc_id))

            if os.path.exists(text_path) and os.path.exists(meta_path):
                lemm_holder.s = os.path.join("lemmatized", "{:>07d}".format(doc_id))
                stem_holder.s = os.path.join("stemmed", "{:>07d}".format(doc_id))
                header_holder.s = os.path.join("headered", "{:>07d}".format(doc_id))

                print("Found files {} and {}. {}/{} processed.".format(text_path, meta_path, documents_counter,
                                                                       args.documents), file=sys.stderr)
                with open(text_path, 'r', encoding=args.encoding) as text_file:
                    with open(meta_path, 'r', encoding=args.encoding) as meta_file:
                        text = "\n".join(text_file.readlines())
                        meta = "\n".join(meta_file.readlines())

                        stage_input = PipedInput(text, meta, doc_id)

                        for consumer in stages:
                            stage_input = consumer.accept(stage_input)
                documents_counter += 1
            doc_id += 1

        for stage in stages:
            stage.dump()
Example #13
 def accept(self, consumer_input: PipedInput):
     meta = json.loads(consumer_input.get_meta())
     return consumer_input.new(doc_id=meta["url"], meta=meta)
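All of these stages pass a PipedInput object down the pipeline. Its real definition is not shown; a minimal sketch consistent with how it is used here (constructor order text, meta, doc_id as in main(), read-only getters, and new() returning a modified copy) would be:

class PipedInput:
    def __init__(self, text, meta, doc_id):
        self._text = text
        self._meta = meta
        self._doc_id = doc_id

    def get_text(self):
        return self._text

    def get_meta(self):
        return self._meta

    def get_doc_id(self):
        return self._doc_id

    def new(self, text=None, meta=None, doc_id=None):
        # Copy-on-write style update: unspecified fields keep their old values.
        return PipedInput(
            self._text if text is None else text,
            self._meta if meta is None else meta,
            self._doc_id if doc_id is None else doc_id,
        )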
 def accept(self, consumer_input: PipedInput):
     doc_id = consumer_input.get_doc_id()
     text = consumer_input.get_text()
     meta = consumer_input.get_meta()
     self.doc_features[doc_id]["text_len"] = len(text)
     self.doc_features[doc_id]["url_len"] = len(meta["url"])
def main():

    parser = argparse.ArgumentParser()

    parser.add_argument("-d",
                        "--documents",
                        type=int,
                        help="Number of documents to process",
                        default=100)
    parser.add_argument('-i',
                        "--input",
                        type=str,
                        help="Directory to read extracted documents from",
                        default="extracted")
    parser.add_argument('-e',
                        '--encoding',
                        type=str,
                        help="Documents encoding",
                        default='utf8')

    parser.add_argument('-r',
                        '--relevant',
                        type=str,
                        help="Relevance table for queries path",
                        default="or_relevant-minus_table.xml")
    parser.add_argument('-q',
                        '--queries',
                        type=str,
                        help="Queries path",
                        default="web2008_adhoc.xml")

    args = parser.parse_args()

    relevant, irrelevant = load_relevant_docs(args.relevant)
    queries = load_queries(args.queries, relevant, irrelevant)
    docs_to_queries = load_docs_to_queries(args.relevant, queries)

    features = {}
    query_features = {}
    doc_features = {}
    def CreateFeatureDumper(stage):
        # Wrap a stage so the features it computes land in the shared
        # features/query_features/doc_features dictionaries above.
        return PipelineFeaturesDumper(stage, features, query_features,
                                      doc_features)

    stages = [
        JsonUnpackerStage(),
        TextProcessorStage(),
        StopwordsFilter(),
        PipelineImmutableStage(
            CreateFeatureDumper(InitFeaturesStage(queries, docs_to_queries))),
        PipelineImmutableStage(CreateFeatureDumper(LengthCounterStage())),
        PipelineImmutableStage(
            CreateFeatureDumper(FieldMatchStage(queries, docs_to_queries))),
        PipelineImmutableStage(CreateFeatureDumper(PageRankStage())),
        PipelineImmutableStage(CreateFeatureDumper(QueryLengthStage(queries))),
        PipelineImmutableStage(
            CreateFeatureDumper(BM25Stage(queries, docs_to_queries))),
    ]
    # Register your pipeline stage here.

    print("Ready to process {} files in {} directory.".format(
        args.documents, args.input))

    try:
        os.makedirs("logs")
        os.makedirs("results")
        print("Created directories", file=sys.stderr)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    timestamp = time.time()
    with open(os.path.join('logs', '{}.txt'.format(timestamp)), 'w') as logs:
        sys.stdout = logs

        doc_id = 0
        documents_counter = 0
        while documents_counter < args.documents:
            text_path = os.path.join(args.input, "{:>07d}.txt".format(doc_id))
            meta_path = os.path.join(args.input, "{:>07d}.json".format(doc_id))

            if os.path.exists(text_path) and os.path.exists(meta_path):
                print("Found files {} and {}. {}/{} processed.".format(
                    text_path, meta_path, documents_counter, args.documents),
                      file=sys.stderr)
                with open(text_path, 'r', encoding=args.encoding) as text_file:
                    with open(meta_path, 'r',
                              encoding=args.encoding) as meta_file:
                        text = "\n".join(text_file.readlines())
                        meta = "\n".join(meta_file.readlines())

                        stage_input = PipedInput(text, meta, doc_id)

                        for consumer in stages:
                            stage_input = consumer.accept(stage_input)
                documents_counter += 1
            doc_id += 1
        for stage in stages:
            stage.dump()

    result = {}

    # Feature name -> column prefix in the output file; each line ends up in
    # the SVMrank-style form "<target> qid:<q> 1:<v> 2:<v> ... 9:<v>".
    fid = {
        "target": "",
        "qid": "qid:",
        "text_len": "1:",
        "url_len": "2:",
        "pagerank": "3:",
        "query_len": "4:",
        "query_words_len": "5:",
        "match_body": "6:",
        "match_title": "7:",
        "window": "8:",
        "BM25": "9:",
    }

    new_qid = 1
    qids = {}

    for doc, queries in docs_to_queries.items():
        if doc not in doc_features:
            continue
        df = doc_features[doc]
        for q in queries:
            print("Recording features for pair doc,qid: ",
                  doc,
                  q,
                  file=sys.stderr)

            qf = query_features[q]
            if q not in qids:
                qids[q] = new_qid
                new_qid += 1
            fs = {"target": int(doc in relevant[q]), "qid": qids[q]}
            fs.update(df)
            fs.update(qf)
            pair = "{}:{}".format(q, doc)
            fs.update(features[pair])

            result[pair] = fs

    with open("dataset", 'w') as file:
        for pair, fs in result.items():
            for key, value in fs.items():
                file.write("{}{} ".format(fid[key], value))
            file.write("\n")
 def accept(self, consumer_input: PipedInput):
     meta = consumer_input.get_meta()
     text = meta['title'] + " . " + consumer_input.get_text()
     return consumer_input.new(text=text, meta=json.dumps(meta))
 def accept(self, consumer_input: PipedInput):
     return consumer_input.new(meta=json.loads(consumer_input.get_meta()))
Example #18
 def accept(self, consumer_input: PipedInput):
     new_meta = copy(consumer_input.get_meta())
     new_meta["title"] = self.lemmatize(new_meta["title"])
     return consumer_input.new(text=self.lemmatize(
         consumer_input.get_text()),
                               meta=new_meta)
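The last two stages call self.lemmatize; a sketch of such a helper built on pymystem3 (consistent with the self.mystem attribute in the first example, though the actual method is not shown):

from pymystem3 import Mystem


def lemmatize(mystem: Mystem, text: str) -> str:
    # Mystem returns lemmas interleaved with whitespace and punctuation;
    # keep the non-empty pieces and join them back into one string.
    lemmas = mystem.lemmatize(text.lower())
    return " ".join(piece.strip() for piece in lemmas if piece.strip())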