def iter_over_dump_file(dump_file,
                        min_length_of_article=50,
                        ignore_namespaces=None):
    """
    Iterator over wiki_dump_file.
    Returns title and tokens for next article in dump file.
    Ignores short articles.
    Ignores meta articles, throug given namespaces.
    Default namespaces are 'Wikipedia', 'Category', 'File', 'Portal', 'Template', 'MediaWiki', 'User', 'Help', 'Book', 'Draft'
    :param dump_file: the dump file
    :param min_length_of_article: the min number of words in the next article. Default = 50
    :param ignore_namespaces: list of namespaces which should be ignored.
    :return: title, tokens
    """
    if ignore_namespaces is None:
        ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        text = filter_wiki(text)
        tokens = tokenize(text)
        if len(tokens) < min_length_of_article or any(
                title.startswith(namespace + ':')
                for namespace in ignore_namespaces):
            continue  # ignore short articles and various meta-articles
        yield title, tokens
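
The snippets on this page all lean on the same gensim helpers. A minimal import block they assume might look like the sketch below; exact module paths depend on the gensim version (older releases expose smart_open in gensim.utils, newer code imports it from the standalone smart_open package).

from gensim.corpora.wikicorpus import _extract_pages, filter_wiki, tokenize
from gensim.utils import smart_open  # or: from smart_open import open as smart_open
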
def iter_wiki(dump_file):  # build a wiki token stream
    """Yield the tokens of each sufficiently long article from the Wikipedia dump."""
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        text = filter_wiki(text)
        tokens = tokenize(text)
        if len(tokens) < 50 or any(title.startswith(ns + ':') for ns in ignore_namespaces):
            continue  # ignore short articles and various meta-articles
        yield tokens
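
Since this variant yields bare token lists, its output can be streamed straight into a gensim Dictionary. A usage sketch; the dump filename is a placeholder, not taken from the original snippet:

from gensim.corpora import Dictionary

# Single pass over the dump; only the growing vocabulary is kept in memory.
id2word = Dictionary(iter_wiki('enwiki-latest-pages-articles.xml.bz2'))
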
Example #3
    def iter_wiki(self):
        """Yield each article from the Wikipedia dump, as a `(title, tokens)` 2-tuple."""
        ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
        for title, text, pageid in _extract_pages(smart_open(self.dump_file)):
            text = filter_wiki(text)
            tokens = [token for token in simple_preprocess(text) if token not in STOPWORDS]
            if len(tokens) < 50 or any(title.startswith(ns + ':') for ns in ignore_namespaces):
                continue  # ignore short articles and various meta-articles
            yield title, tokens
Example #4
def iter_wiki(dump_file):
    """Yield each article from the Wikipedia dump, as a `(title, tokens)` 2-tuple."""
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        text = filter_wiki(text)
        tokens = tokenize(text)
        if any(title.startswith(ns + ':') for ns in ignore_namespaces):
            continue  # ignore various meta-articles (this variant has no minimum-length filter)
        yield title, tokens
def iter_wiki(dump_file):
    ignore_namespaces = "Wikipedia Category File Portal Template MediaWiki User Help Book Draft".split(
    )
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        text = filter_wiki(text)
        tokens = tokenize(text)
        if len(tokens) < 50 or any(
                title.startswith(ns + ':') for ns in ignore_namespaces):
            continue
        yield title, tokens
def iter_wiki(dump_file, n=-1):
    """Yield (title, tokens) for at most `n` pages of the dump (n == -1 means no limit)."""
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    counter = 0
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        if counter == n:  # stop once n pages have been processed; never triggers for n == -1
            break
        counter += 1
        text = filter_wiki(text)
        tokens = tokenize(text)
        if len(tokens) < 50 or any(title.startswith(ns + ':') for ns in ignore_namespaces):
            continue
        yield title, tokens
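
An alternative to the counter bookkeeping above is to cap the plain generator with itertools.islice. A small sketch; the dump filename is a placeholder:

from itertools import islice

# Take at most 100 (title, tokens) pairs from the stream.
for title, tokens in islice(iter_wiki('enwiki-latest-pages-articles.xml.bz2'), 100):
    print(title, len(tokens))
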
def parse_wiki_dump(infile,
                    min_words,
                    process_function,
                    processes=multiprocessing.cpu_count() - 2):
    """
  Yield articles from a bz2 Wikipedia dump `infile` as (title, tokens) 2-tuples.

  Only articles of sufficient length are returned (short articles & redirects
  etc are ignored).

  Uses multiple processes to speed up the parsing in parallel.
  
  Args:
    infile (str) : path to bz2 Wikipedia dump
    min_words (int) : skip article if it has less than this many words
    process_function (function) : preprocessing function
    processes (int) : number of cores to be used

  """

    logger.info("Start processing Wikipedia dump `{}`".format(infile))
    articles, articles_all = 0, 0

    pool = multiprocessing.Pool(processes)
    # process the corpus in smaller chunks of docs, because multiprocessing.Pool
    # is dumb and would try to load the entire dump into RAM...
    texts = wikicorpus._extract_pages(bz2.BZ2File(infile))  # generator
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    for group in chunkize(texts, chunksize=10 * processes):
        for title, tokens in pool.imap(process_function, group):
            if articles_all % 10000 == 0:
                logger.info(
                    "PROGRESS: at article #{} accepted {} articles".format(
                        articles_all, articles))
            articles_all += 1

            # article redirects and short stubs are pruned here
            if any(
                    title.startswith(ignore + ':') for ignore in
                    ignore_namespaces) or len(tokens) < min_words:
                continue

            # all good: use this article
            articles += 1
            yield title, tokens
    pool.terminate()

    logger.info(
        "finished iterating over the Wikipedia dump: accepted {} articles out of {} total".format(
            articles, articles_all))
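
The exact shape of process_function is not shown here. Judging from how pool.imap is used above (each input is a (title, text, pageid) tuple from _extract_pages, and each result is unpacked as (title, tokens)), a minimal sketch could be:

from gensim.corpora.wikicorpus import filter_wiki, tokenize

def process_function(page):
    # Must be a module-level function so multiprocessing can pickle it.
    title, text, pageid = page
    return title, tokenize(filter_wiki(text))  # strip wiki markup, then tokenize
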
Example #8
    def iter_wiki(self, dump_file):
        """Yield each article from the Wikipedia dump, as a `(title, tokens)` 2-tuple."""
        logger.info("preprocessing dump {0}".format(dump_file))
        ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
        index = 0
        for title, text, pageid in _extract_pages(smart_open(dump_file)):
            if index > 160_000:
                break
            text = filter_wiki(text)
            tokens = self.preprocess_doc(text)
            if len(tokens) < 50 or any(title.startswith(ns + ':') for ns in ignore_namespaces):
                continue  # ignore short articles and various meta-articles
            index += 1
            yield title, tokens
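
The preprocess_doc method referenced above is not part of this snippet. A hypothetical sketch, mirroring the simple_preprocess plus STOPWORDS filtering of Example #3:

    def preprocess_doc(self, text):
        # Assumed behaviour, not taken from the original project.
        from gensim.parsing.preprocessing import STOPWORDS
        from gensim.utils import simple_preprocess
        return [token for token in simple_preprocess(text) if token not in STOPWORDS]
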
Example #9
    def get_texts(self):
        """
        Iterate over the Wikipedia dump and the HN articles returning text
        """
        wiki_articles, hn_articles, articles_all = 0, 0, 0
        positions, positions_all = 0, 0

        # ************ Wikipedia ************
        texts = ((text, self.lemmatize) for _, text in wikicorpus._extract_pages(bz2.BZ2File(self.wiki_file)))
        pool = multiprocessing.Pool(self.processes)
        for group in utils.chunkize(texts, chunksize=10 * pool._processes, maxsize=1): # otherwise imap puts all the corpus into memory
            for tokens in pool.imap(wikicorpus.process_article, group):
                articles_all += 1
                positions_all += len(tokens)
                if len(tokens) > WIKI_ARTICLE_MIN_WORDS:
                    wiki_articles += 1
                    positions += len(tokens)
                    yield tokens
        pool.terminate()

        print (">>> finished iterating over Wikipedia corpus of %i documents with %i positions (total %i articles, %i positions before pruning articles shorter than %i words)" % (wiki_articles, positions, articles_all, positions_all, WIKI_ARTICLE_MIN_WORDS))

        # ************ HN articles ************
        positions_after_wiki = positions
        fnamelist = []
        for g in glob.iglob(self.hn_folder + '/*.txt'):
            fnamelist.append(g)
        for fileno, fname in enumerate(fnamelist): # TODO parallelize as Wiki
            hn_text = open(fname).read()
            if self.lemmatize:
                result = utils.lemmatize(hn_text) # text into lemmas here
            else:
                result = tokenize(hn_text) # text into tokens here
            articles_all += 1
            positions_all += len(result)
            if len(result) > HN_ARTICLE_MIN_WORDS:
                hn_articles += 1
                positions += len(result)
                yield result

        print (">>> finished iterating over HN corpus of %i documents with %i positions" % (hn_articles, positions - positions_after_wiki))
        # ************ /HN articles ************

        self.length = wiki_articles + hn_articles # cache corpus length
Example #10
    def get_texts(self):
        """
        Iterate over the dump, returning text version of each article as a list
        of tokens.

        Only articles of sufficient length are returned (short articles & redirects
        etc are ignored).

        Note that this iterates over the **texts**; if you want vectors, just use
        the standard corpus interface instead of this function::

        >>> for vec in wiki_corpus:
        >>>     print(vec)
        """
        articles, articles_all = 0, 0
        positions, positions_all = 0, 0
        texts = ((text, self.lemmatize, title, pageid) for title, text, pageid in _extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces))
        #pool = multiprocessing.Pool(self.processes)
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would load the entire input into RAM at once...
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            #for tokens, title, pageid in pool.imap(process_article, group): # chunksize=10):
            for tokens, title, pageid in map(process_article, group):  # serial replacement for the pool.imap call above
                articles_all += 1
                positions_all += len(tokens)
                # Check if the article is long enough and has tokens in our set of interest
                if len(tokens) > ARTICLE_MIN_WORDS and set(tokens) & self.terms:
                    articles += 1
                    positions += len(tokens)
                    if self.metadata:
                        yield (tokens, (pageid, title))
                    else:
                        yield tokens
        #pool.terminate()
        
        logger.info("finished iterating over Wikipedia corpus of %i documents with %i positions"
            " (total %i articles, %i positions before pruning articles shorter than %i words)" %
            (articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS))
        self.length = articles # cache corpus length
Example #11
def iter_wiki(dump_file):
    """Yield each article from the Wikipedia dump as a (title, text) 2-tuple of markup-free text."""
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        text = filter_wiki(text)
        yield title, text
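
Because this variant yields raw, markup-free text rather than tokens, tokenization is left to the caller. A small usage sketch; the dump filename is a placeholder:

from itertools import islice
from gensim.utils import simple_preprocess

# Peek at the first three articles of the raw-text stream.
for title, text in islice(iter_wiki('enwiki-latest-pages-articles.xml.bz2'), 3):
    print(title, simple_preprocess(text)[:10])
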
Example #12
File: wiki.py Project: hans/deepBLE
    def get_texts(self):
        """
        Iterate over the corpus data, yielding sentences of the text
        version of each article (each sentence represented as a list of
        tokens).

        See the `WikiCorpus` class for more.
        """

        # Unfortunately due to the OOP-unfriendly implementation of
        # `WikiCorpus` we have to copy-and-paste some code. This code is
        # based on `WikiCorpus#get_texts`.

        n_articles, n_articles_all = 0, 0
        n_sentences, n_sentences_all = 0, 0

        pages = _extract_pages(self.open_corpus_file(), self.filter_namespaces)
        texts = ((text, self.lemmatize, title, pageid)
                 for title, text, pageid in pages)

        pool = multiprocessing.Pool(self.processes)

        # process the corpus in smaller chunks of docs, because
        # multiprocessing.Pool is dumb and would load the entire input
        # into RAM at once...
        chunks = utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1)

        for group in chunks:
            for sentences, title, pageid in pool.imap(process_article, group):
                n_articles_all += 1
                n_sentences_all += len(sentences)

                num_tokens = sum(len(sentence) for sentence in sentences)

                # article redirects and short stubs are pruned here
                if num_tokens > ARTICLE_MIN_WORDS:
                    n_articles += 1
                    n_sentences += len(sentences)

                    for sentence in sentences:
                        if self.metadata:
                            yield (sentence, (pageid, title))
                        else:
                            yield sentence

                        if self.sentences_out is not None:
                            self.sentences_out.write(' '.join(sentence))
                            self.sentences_out.write('\n')

        pool.terminate()

        LOGGER.info("finished iterating over Wikipedia corpus of %i "
                    "articles with %i sentences (%i articles / %i "
                    "sentences retained)" %
                    (n_articles_all, n_sentences_all, n_articles, n_sentences))

        # cache corpus length
        self.length = n_sentences

        # Close sentences file if we were writing one
        if self.sentences_out is not None:
            self.sentences_out.close()
            self.sentences_out = None
Example #13
    c = options.min_count

    if not options.model_file:
        model_fn = os.path.splitext(os.path.basename(fn))[0] + "-%d.w2v-gensim" % n
    else:
        model_fn = options.model_file

    if os.path.exists(model_fn):
        logging.error("File already exists, %s" % model_fn)
        exit(1)

    logging.info("Generating word vectors size %d from %s" % (n, fn))

    sent_gen = None

    if options.file_type == 'bz-wiki':
        sent_gen = (process_article((text, None))
                    for title, text in _extract_pages(bz2.BZ2File(fn)))
    elif options.file_type == 'mahoney':
        sent_gen = Text8Corpus(fn)
    else:
        raise ValueError

    model = gensim.models.Word2Vec(sent_gen, workers=n_jobs, window=w, size=n, min_count=c)

    if options.accuracy:
        print(accuracy(model, options.accuracy, DEFAULT_ACCURACY_CUTOFF))

    logging.info("Writing model to %s" % model_fn)
    model.save(model_fn)