Example #1
    def get_texts(self):
        """
        Iterate over the dump, returning text version of each article as a list
        of tokens.

        Only articles of sufficient length are returned (short articles & redirects
        etc are ignored).

        Note that this iterates over the **texts**; if you want vectors, just use
        the standard corpus interface instead of this function::

        >>> for vec in wiki_corpus:
        >>>     print(vec)
        """
        articles, articles_all = 0, 0
        positions, positions_all = 0, 0
        texts = ((text, self.lemmatize) for _, text in _extract_pages(bz2.BZ2File(self.fname)))
        pool = multiprocessing.Pool(self.processes)
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would load the entire input into RAM at once...
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for tokens in pool.imap(process_article, group): # chunksize=10):
                articles_all += 1
                positions_all += len(tokens)
                if len(tokens) > ARTICLE_MIN_WORDS: # article redirects are pruned here
                    articles += 1
                    positions += len(tokens)
                    yield tokens
        pool.terminate()

        logger.info("finished iterating over Wikipedia corpus of %i documents with %i positions"
            " (total %i articles, %i positions before pruning articles shorter than %i words)" %
            (articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS))
        self.length = articles # cache corpus length
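All of the examples in this section share the same streaming pattern: `utils.chunkize` prefetches a small, bounded number of document chunks from a lazy generator (`maxsize=1` keeps at most one chunk queued), and `Pool.imap` tokenizes each chunk in parallel, so the full corpus never has to fit in RAM at once. Below is a minimal, self-contained sketch of that pattern; `tokenize_doc` is a hypothetical stand-in for workers like `process_article`, not code taken from any of the examples.

import multiprocessing

from gensim import utils


def tokenize_doc(text):
    # stand-in worker; the real examples call process_article / process_review
    return text.lower().split()


def stream_tokens(raw_texts, processes=2):
    pool = multiprocessing.Pool(processes)
    try:
        # chunkize feeds small groups of documents to the pool instead of
        # letting the whole generator be drained into RAM at once
        for group in utils.chunkize(raw_texts, chunksize=10 * processes, maxsize=1):
            for tokens in pool.imap(tokenize_doc, group):
                yield tokens
    finally:
        pool.terminate()


if __name__ == "__main__":
    docs = ("document number %d about wikipedia" % i for i in range(100))
    print(sum(1 for _ in stream_tokens(docs)))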
Example #2
    def get_texts(self):
        """
        Iterate over the dump, returning text version of each article as a list
        of tokens.
        Only articles of sufficient length are returned (short articles & redirects
        etc are ignored).
        Note that this iterates over the **texts**; if you want vectors, just use
        the standard corpus interface instead of this function::
        >>> for vec in wiki_corpus:
        >>>     print(vec)
        """
        articles, articles_all = 0, 0
        texts = ((text, title, pageid)
                 for title, text, pageid in extract_pages(
                     bz2.BZ2File(self.fname), self.filter_namespaces))
        pool = multiprocessing.Pool(self.processes)
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would load the entire input into RAM at once...
        ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
        for group in utils.chunkize(texts,
                                    chunksize=10 * self.processes,
                                    maxsize=1):
            for text, title, pageid in pool.imap(process_article,
                                                 group):  # chunksize=10):
                articles_all += 1
                # article redirects and short stubs are pruned here
                articles += 1
                yield text, title, pageid
        pool.terminate()

        # logger.info("finished iterating over Wikipedia corpus of %i documents (all : %i)" % (articles, articles_all))
        self.length = articles  # cache corpus length
Example #3
    def get_claims(self):
        """
        Iterate over the dump, creating a pseudo-XML file called "output" containing claims that are marked
        with the "citation needed" template
        """
        articles, articles_all = 0, 0
        positions, positions_all = 0, 0
        texts = ((text, self.lemmatize, title, pageid, self.set_citation,
                  self.quote_identifiers)
                 for title, text, pageid in extract_pages(
                     bz2.BZ2File(self.fname), self.filter_namespaces))
        pool = multiprocessing.Pool(self.processes)
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would load the entire input into RAM at once...
        claim_list = []
        for group in utils.chunkize(texts,
                                    chunksize=10 * self.processes,
                                    maxsize=1):
            for claims in pool.imap(get_article_claims,
                                    group):  # chunksize=10):
                claim_list.append(claims)
        pool.terminate()

        #with open("output.finder", "w") as outfile:
        #    for claim in retList:
        #        outfile.write(claim)
        for x in claim_list:
            for c in x.claims:
                c.get_query(self.dictionary, self.articlecount)

        return claim_list
    def get_texts(self):
        articles, articles_all = 0, 0
        positions, positions_all = 0, 0

        texts = ((text, self.lemmatize, title, pageid) for pageid, title, text in self.pages_gen())

        pool = multiprocessing.Pool(self.processes)
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would load the entire input into RAM at once...
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for tokens, title, pageid in pool.imap(process_article, group):  # chunksize=10):
                articles_all += 1
                positions_all += len(tokens)
                # article redirects and short stubs are pruned here
                if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                    continue
                articles += 1
                positions += len(tokens)
                if self.metadata:
                    yield (tokens, (pageid, title))
                else:
                    yield tokens
        pool.terminate()

        logger.info(
            "finished iterating over Wikipedia corpus of %i documents with %i positions"
            " (total %i articles, %i positions before pruning articles shorter than %i words)",
            articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS)
        self.length = articles  # cache corpus length
Example #5
 def __iter__(self):
     pool = multiprocessing.Pool(pool_size)        
     for file_chunk in utils.chunkize(self.file_path_iter(), chunksize=1000, maxsize=20):
         docs = pool.imap(tokenized_from_file, file_chunk)
         for doc_tokenized in docs:
             yield self.dictionary.doc2bow(doc_tokenized)
     pool.terminate()     
Example #6
 def load_corpus(self):
     pool = multiprocessing.Pool(pool_size)
     for file_chunk in utils.chunkize(self.file_path_iter(), chunksize=1000, maxsize=20):
         results = pool.imap(tokenized_from_file, file_chunk)
         self.dictionary.add_documents(results, prune_at=200000)
         self.document_file_names += [file_path for file_path in file_chunk]
     pool.terminate()            
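Examples #5 and #6 are the two halves of one workflow: `load_corpus` makes a first pass over the files to build the `Dictionary` (pruning it at 200k entries), and `__iter__` makes a second pass that converts each document into a bag-of-words vector. The following is a hedged sketch of the same two-pass flow using gensim's `Dictionary` directly; the token lists stand in for the output of `tokenized_from_file`.

from gensim.corpora import Dictionary

# stand-ins for the token lists returned by tokenized_from_file above
tokenized_docs = [
    ["gensim", "streams", "documents", "lazily"],
    ["the", "dictionary", "maps", "tokens", "to", "integer", "ids"],
]

# pass 1 (load_corpus): grow the vocabulary, pruning it at 200k entries
dictionary = Dictionary()
dictionary.add_documents(tokenized_docs, prune_at=200000)

# pass 2 (__iter__): convert each document into a bag-of-words vector
bows = [dictionary.doc2bow(doc) for doc in tokenized_docs]
print(bows)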
Example #7
def vanilla_chunk(unigrams, n):
    chunks = []
    for doc in unigrams:
        clen = len(doc) // n  # integer chunk size (chunkize expects an int)
        for chunk in chunkize(doc, clen):
            chunks.append(chunk)
    return chunks
def map_wikidocs2lda(num_topics):

    logger = logging.getLogger('Wiki2LDA')
    pool = Pool(cpu_count())

    logger.info('Loading LDA model...')
    lda = gensim.models.ldamodel.LdaState.load(LDA_MODEL_FILENAME(num_topics))

    # Mapping between doc-id -> topics
    logger.info('Mapping Wikipedia documents to LDA model...')
    doc_topics = os.path.join(LDA_MODEL_DIR(num_topics), 'topical_documents.gz')
    with smart_open(doc_topics, 'wb') as f:
        n = 0

        for wiki_docs in utils.chunkize(wiki_document_generator(),
                                        100):  #, 10000):

            lda_wiki_docs = pool.map(wiki2LDA, [(lda, wiki_doc)
                                                for wiki_doc in wiki_docs])

            # Writes to file as 'wiki_id \t index:value \n' format.
            for wiki_id, lda_text in lda_wiki_docs:
                f.write(str(wiki_id) + '\t')

                embedding = [
                    '{0}:{1:.10f}'.format(index, value)
                    for index, value in lda_text
                ]
                f.write('\t'.join(embedding) + '\n')

            n += len(wiki_docs)
            logger.info('{0} documents mapped to LDA embedding.'.format(n))
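The embedding written above is a tab-separated list of `topic_index:probability` pairs per document. The sketch below shows roughly what `wiki2LDA` would be expected to produce for a single bag-of-words document, assuming a standard gensim `LdaModel`; the tiny corpus and model are placeholders, not the Wikipedia model loaded above.

from gensim.corpora import Dictionary
from gensim.models import LdaModel

docs = [["apple", "banana", "apple"], ["car", "engine", "car"]]
dictionary = Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2, passes=2)

doc_topics = lda[corpus[0]]  # list of (topic_id, probability) pairs
line = "\t".join("{0}:{1:.10f}".format(index, value) for index, value in doc_topics)
print(line)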
Example #9
    def get_texts_sent_split(self):
        """
        Iterate over the dump, returning text version of each article as a list
        of tokens.
        Only articles of sufficient length are returned (short articles & redirects
        etc are ignored).
        Note that this iterates over the **texts**; if you want vectors, just use
        the standard corpus interface instead of this function::
        """

        texts = ((text, self.lemmatize, title, pageid) for title, text, pageid in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces))
        pool = multiprocessing.Pool(self.processes)
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would load the entire input into RAM at once...
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for categories, tokens, title, pageid in pool.imap(process_article, group):  # chunksize=10):
                if categories is None:
                    continue
                # flatten the per-section lists of sentences into a single token list
                tokens_all = [token for sentences in tokens.values() for sentence in sentences for token in sentence]
                # article redirects and short stubs are pruned here
                if len(tokens_all) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                    continue
                if self.metadata:
                    for k, v in tokens.items():
                        yield (v, (pageid, k))
                else:
                    for k, v in tokens.items():
                        yield v
        pool.terminate()
    def get_texts(self):
        articles, articles_all = 0, 0
        positions, positions_all = 0, 0
        texts = ((text, self.lemmatize, title, pageid)
                 for title, text, pageid in extract_pages(
                     bz2.BZ2File(self.fname), self.filter_namespaces))
        pool = multiprocessing.Pool(self.processes)
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would load the entire input into RAM at once...
        for group in utils.chunkize(texts,
                                    chunksize=10 * self.processes,
                                    maxsize=1):
            for tokens, title, pageid in pool.imap(process_article,
                                                   group):  # chunksize=10):
                articles_all += 1
                positions_all += len(tokens)
                # article redirects and short stubs are pruned here
                if len(tokens) < ARTICLE_MIN_WORDS or any(
                        title.startswith(ignore + ':')
                        for ignore in IGNORED_NAMESPACES):
                    continue
                articles += 1
                positions += len(tokens)
                if self.metadata:
                    yield (tokens, (pageid, title))
                else:
                    yield tokens
        pool.terminate()

        logger.info(
            "finished iterating over Wikipedia corpus of %i documents with %i positions"
            " (total %i articles, %i positions before pruning articles shorter than %i words)",
            articles, positions, articles_all, positions_all,
            ARTICLE_MIN_WORDS)
        self.length = articles  # cache corpus length
Example #11
    def get_texts(self):
        """
        Iterate over the dump, returning text version of each article as a list
        of tokens.
        """

        reviews = 0
        positions = 0
        texts = [text for text in _extract_reviews(self.fname)]
        pool = multiprocessing.Pool(self.processes)
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would load the entire input into RAM at once...
        #for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
        for group in utils.chunkize(texts,
                                    chunksize=10 * self.processes,
                                    maxsize=1):
            for tokens in pool.imap(process_review, group):  # chunksize=10):
                reviews += 1
                positions += len(tokens)
                yield tokens
        pool.terminate()

        logger.info(
            "finished iterating over the generated Yelp corpus of %i documents with %i positions"
            " (total %i articles, %i positions before pruning articles shorter than %i words)"
            % (reviews, positions, reviews, positions, 10000))
        self.length = reviews  # cache corpus length
Example #12
    def get_texts(self):
        """
        Iterate over the dump, returning text version of each article as a list
        of tokens.

        Only articles of sufficient length are returned (short articles &
        redirects etc are ignored).

        Note that this iterates over the **texts**; if you want vectors,
        just use the standard corpus interface instead of this function::

        >>> for vec in wiki_corpus:
        >>>     print(vec)
        """
        articles, articles_all = 0, 0
        positions, positions_all = 0, 0

        texts = ((text, title, pageid)
                 for title, text, pageid in extract_pages(
                     bz2.BZ2File(self.fname), self.filter_namespaces))

        batch_idx = 0
        pool = multiprocessing.Pool(self.processes)
        # Process the corpus in smaller chunks of docs,
        # because multiprocessing.Pool is dumb and would load the entire input
        # into RAM at once...
        for group in chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for tokens, title, pageid in pool.imap(process_article, group):
                articles_all += 1
                positions_all += len(tokens)

                # article redirects and short stubs are pruned here
                to_ignored = any(
                    title.startswith(ignore + ':')
                    for ignore in IGNORED_NAMESPACES)
                if len(tokens) < ARTICLE_MIN_WORDS or to_ignored:
                    continue

                articles += 1
                positions += len(tokens)

                if self.metadata:
                    yield title, tokens
                else:
                    yield tokens

            batch_idx += 1
            if self.max_batch and batch_idx == self.max_batch:
                break

        pool.terminate()

        logger.info(
            "Finished iterating over Wikipedia corpus of %i documents with "
            "%i positions (total %i articles, %i positions before pruning "
            "articles shorter than %i words)", articles, positions,
            articles_all, positions_all, ARTICLE_MIN_WORDS)

        self.length = articles
Example #13
    def get_texts(self):
        """
        Iterate over the dump, returning text version of each article as a list
        of tokens.

        Only articles of sufficient length are returned (short articles & redirects
        etc are ignored). This is controlled by `article_min_tokens` on the class instance.

        Note that this iterates over the **texts**; if you want vectors, just use
        the standard corpus interface instead of this function::

        >>> for vec in wiki_corpus:
        >>>     print(vec)
        """

        articles, articles_all = 0, 0
        positions, positions_all = 0, 0

        tokenization_params = (self.tokenizer_func, self.token_min_len, self.token_max_len, self.lower)
        texts = \
            ((text, self.lemmatize, title, pageid, tokenization_params)
             for title, text, pageid
             in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces))
        pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt)

        try:
            # process the corpus in smaller chunks of docs, because multiprocessing.Pool
            # is dumb and would load the entire input into RAM at once...
            for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
                for tokens, title, pageid in pool.imap(_process_article, group):
                    articles_all += 1
                    positions_all += len(tokens)
                    # article redirects and short stubs are pruned here
                    if len(tokens) < self.article_min_tokens or \
                            any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                        continue
                    articles += 1
                    positions += len(tokens)
                    if self.metadata:
                        yield (tokens, (pageid, title))
                    else:
                        yield tokens
        except KeyboardInterrupt:
            logger.warn(
                "user terminated iteration over Wikipedia corpus after %i documents with %i positions "
                "(total %i articles, %i positions before pruning articles shorter than %i words)",
                articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS
            )
        else:
            logger.info(
                "finished iterating over Wikipedia corpus of %i documents with %i positions "
                "(total %i articles, %i positions before pruning articles shorter than %i words)",
                articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS
            )
            self.length = articles  # cache corpus length
        finally:
            pool.terminate()
Example #14
    def get_texts(self):
        """
        Iterate over the dump, returning text version of each article as a list
        of tokens.

        Only articles of sufficient length are returned (short articles & redirects
        etc are ignored). This is controlled by `article_min_tokens` on the class instance.

        Note that this iterates over the **texts**; if you want vectors, just use
        the standard corpus interface instead of this function::

        >>> for vec in wiki_corpus:
        >>>     print(vec)
        """

        articles, articles_all = 0, 0
        positions, positions_all = 0, 0

        tokenization_params = (self.tokenizer_func, self.token_min_len, self.token_max_len, self.lower)
        texts = \
            ((text, self.lemmatize, title, pageid, tokenization_params)
             for title, text, pageid
             in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces))
        pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt)

        try:
            # process the corpus in smaller chunks of docs, because multiprocessing.Pool
            # is dumb and would load the entire input into RAM at once...
            for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
                for tokens, title, pageid in pool.imap(_process_article, group):
                    articles_all += 1
                    positions_all += len(tokens)
                    # article redirects and short stubs are pruned here
                    if len(tokens) < self.article_min_tokens or \
                            any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                        continue
                    articles += 1
                    positions += len(tokens)
                    if self.metadata:
                        yield (tokens, (pageid, title))
                    else:
                        yield tokens
        except KeyboardInterrupt:
            logger.warn(
                "user terminated iteration over Wikipedia corpus after %i documents with %i positions "
                "(total %i articles, %i positions before pruning articles shorter than %i words)",
                articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS
            )
        else:
            logger.info(
                "finished iterating over Wikipedia corpus of %i documents with %i positions "
                "(total %i articles, %i positions before pruning articles shorter than %i words)",
                articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS
            )
            self.length = articles  # cache corpus length
        finally:
            pool.terminate()
def parse_wiki_dump(infile,
                    min_words,
                    process_function,
                    processes=multiprocessing.cpu_count() - 2):
    """
  Yield articles from a bz2 Wikipedia dump `infile` as (title, tokens) 2-tuples.

  Only articles of sufficient length are returned (short articles & redirects
  etc are ignored).

  Uses multiple processes to speed up the parsing in parallel.
  
  Args:
    infile (str) : path to bz2 Wikipedia dump
    min_words (int) : skip an article if it has fewer than this many words
    process_function (function) : preprocessing function
    processes (int) : number of cores to be used

  """

    logger.info("Start processing Wikipedia dump `{}`".format(infile))
    articles, articles_all = 0, 0

    pool = multiprocessing.Pool(processes)
    # process the corpus in smaller chunks of docs, because multiprocessing.Pool
    # is dumb and would try to load the entire dump into RAM...
    texts = wikicorpus._extract_pages(bz2.BZ2File(infile))  # generator
    ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    for group in chunkize(texts, chunksize=10 * processes):
        for title, tokens in pool.imap(process_function, group):
            if articles_all % 10000 == 0:
                logger.info(
                    "PROGRESS: at article #{} accepted {} articles".format(
                        articles_all, articles))
            articles_all += 1

            # article redirects and short stubs are pruned here
            if any(
                    title.startswith(ignore + ':') for ignore in
                    ignore_namespaces) or len(tokens) < min_words:
                continue

            # all good: use this article
            articles += 1
            yield title, tokens
    pool.terminate()

    logger.info(
        "finished iterating over Wikipedia corpus of {} documents with total {} articles"
        .format(articles, articles_all))
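A possible way to call `parse_wiki_dump` is sketched below; the dump path is a placeholder, and `my_process` assumes each page arrives as a `(title, text, ...)` tuple, which may differ between gensim versions of `_extract_pages`.

from gensim.corpora import wikicorpus


def my_process(page):
    title, text = page[0], page[1]
    tokens = wikicorpus.tokenize(wikicorpus.filter_wiki(text))
    return title, tokens


if __name__ == "__main__":
    for title, tokens in parse_wiki_dump("enwiki-latest-pages-articles.xml.bz2",
                                         min_words=50,
                                         process_function=my_process,
                                         processes=4):
        pass  # e.g. feed (title, tokens) into a Dictionary or write them to disk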
Example #16
    def get_texts(self):
        """
        Iterate over the dump, returning text version of each article as a list
        of tokens.

        Only articles of sufficient length are returned (short articles & redirects
        etc are ignored).

        Note that this iterates over the **texts**; if you want vectors, just use
        the standard corpus interface instead of this function::

        >>> for vec in wiki_corpus:
        >>>     print(vec)
        """
        articles, articles_all = 0, 0
        positions, positions_all = 0, 0
        texts = ((text, self.lemmatize, title, pageid)
                 for title, text, pageid in extract_pages(
                     bz2.BZ2File(self.fname), self.filter_namespaces))
        pool = multiprocessing.Pool(self.processes)
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would load the entire input into RAM at once...
        ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
        for group in utils.chunkize(texts,
                                    chunksize=10 * self.processes,
                                    maxsize=1):
            for tokens, title, pageid in pool.imap(process_article,
                                                   group):  # chunksize=10):
                articles_all += 1
                positions_all += len(tokens)
                # article redirects and short stubs are pruned here
                if len(tokens) < ARTICLE_MIN_WORDS or any(
                        title.startswith(ignore + ':')
                        for ignore in ignore_namespaces):
                    continue
                articles += 1
                positions += len(tokens)
                if self.metadata:
                    yield (tokens, (pageid, title))
                else:
                    yield tokens
        pool.terminate()

        logger.info(
            "finished iterating over Wikipedia corpus of %i documents with %i positions"
            " (total %i articles, %i positions before pruning articles shorter than %i words)"
            % (articles, positions, articles_all, positions_all,
               ARTICLE_MIN_WORDS))
        self.length = articles  # cache corpus length
Example #17
    def __iter__(self):
        files = iter_files(self.directory)
        posts = 0
        pool = multiprocessing.Pool(self.n_workers)
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would load the entire input into RAM at once...
        for group in utils.chunkize(files, chunksize=self.job_size * self.n_workers, maxsize=1):
            for result in pool.imap(process_file, zip(group, itertools.repeat(self.out_dir))):
                posts += 1
                yield result
        pool.terminate()

        logger.info("finished iterating over corpus of %i documents", posts)
        self.length = posts  # cache corpus length
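The call above passes a constant second argument (`self.out_dir`) to every worker invocation by zipping the file chunk with `itertools.repeat`. A minimal stand-alone sketch of that trick follows (the file names and output directory are stand-ins):

import itertools
import multiprocessing


def process_file(args):
    path, out_dir = args  # every job receives (file_path, shared output dir)
    return path, out_dir


if __name__ == "__main__":
    files = ["a.txt", "b.txt", "c.txt"]  # placeholder file list
    with multiprocessing.Pool(2) as pool:
        for result in pool.imap(process_file, zip(files, itertools.repeat("/tmp/out"))):
            print(result)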
Example #18
    def get_texts(self):
        """
        Iterate over the dump, returning text version of each article as a list
        of tokens.

        Only articles of sufficient length are returned (short articles & redirects
        etc are ignored).

        Note that this iterates over the **texts**; if you want vectors, just use
        the standard corpus interface instead of this function::

        >>> for vec in wiki_corpus:
        >>>     print(vec)
        """
        articles, articles_all = 0, 0
        positions, positions_all = 0, 0
        texts = (
            (text, self.lemmatize, title, pageid)
            for title, text, pageid in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces)
        )
        pool = multiprocessing.Pool(self.processes)
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would load the entire input into RAM at once...
        ignore_namespaces = "Wikipedia Category File Portal Template MediaWiki User Help Book Draft".split()
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for tokens, title, pageid in pool.imap(process_article, group):  # chunksize=10):
                articles_all += 1
                positions_all += len(tokens)
                # article redirects and short stubs are pruned here
                if len(tokens) < ARTICLE_MIN_WORDS or any(
                    title.startswith(ignore + ":") for ignore in ignore_namespaces
                ):
                    continue
                articles += 1
                positions += len(tokens)
                if self.metadata:
                    yield (tokens, (pageid, title))
                else:
                    yield tokens
        pool.terminate()

        logger.info(
            "finished iterating over Wikipedia corpus of %i documents with %i positions"
            " (total %i articles, %i positions before pruning articles shorter than %i words)"
            % (articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS)
        )
        self.length = articles  # cache corpus length
Example #19
    def populate_database(self):
        articles, articles_all = 0, 0
        positions, positions_all = 0, 0

        texts = ((doc["text"], doc["title"], doc["id"])
                 for doc in self.extract_wiki_pages(self.dbconfig.dataset_dir))

        try:
            # process the corpus in smaller chunks of docs, because multiprocessing.Pool
            # is dumb and would load the entire input into RAM at once...
            for group in utils.chunkize(texts, chunksize=1000, maxsize=1):
                for g in group:
                    for text, tokens_ids, title, pageid in self.process_article(
                            g):
                        articles_all += 1
                        positions_all += len(tokens_ids)
                        # article redirects and short stubs are pruned here
                        if len(tokens_ids) < ARTICLE_MIN_WORDS or any(
                                title.startswith(ignore + ':')
                                for ignore in IGNORED_NAMESPACES):
                            continue
                        articles += 1
                        positions += len(tokens_ids)
                        document = {
                            "article": articles,
                            "title": title,
                            "text": text,
                            "token_ids": tokens_ids,
                            "pageid": pageid,
                        }
                        self.db[self.dbconfig.collection_name].insert_one(
                            document)

        except KeyboardInterrupt:
            print(
                "user terminated iteration over Wikipedia corpus after %i documents with %i positions "
                "(total %i articles, %i positions before pruning articles shorter than %i words)" %
                (articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS))
        else:
            print(
                "finished iterating over Wikipedia corpus of %i documents with %i positions "
                "(total %i articles, %i positions before pruning articles shorter than %i words)" %
                (articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS))
            length = articles  # cache corpus length
    def get_texts(self):
        articles, articles_all = 0, 0
        positions, positions_all = 0, 0
        tokenization_params = (self.tokenizer_func, self.token_min_len,
                               self.token_max_len, self.lower)
        texts = \
            ((text, self.lemmatize, title, pageid, tokenization_params)
             for title, text, pageid
             #in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces))
             in extract_pages_without_namespaces(bz2.BZ2File(self.fname), self.filter_namespaces))
        pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt)

        try:
            for group in utils.chunkize(texts,
                                        chunksize=10 * self.processes,
                                        maxsize=1):
                for tokens, title, pageid in pool.imap(_process_article,
                                                       group):
                    articles_all += 1
                    positions_all += len(tokens)
                    if len(tokens) < self.article_min_tokens or \
                            any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                        continue
                    articles += 1
                    positions += len(tokens)
                    if self.metadata:
                        yield (tokens, (pageid, title))
                    else:
                        yield tokens

        except KeyboardInterrupt:
            logger.warn(
                "user terminated iteration over Wikipedia corpus after %i documents with %i positions "
                "(total %i articles, %i positions before pruning articles shorter than %i words)",
                articles, positions, articles_all, positions_all,
                ARTICLE_MIN_WORDS)
        else:
            logger.info(
                "finished iterating over Wikipedia corpus of %i documents with %i positions "
                "(total %i articles, %i positions before pruning articles shorter than %i words)",
                articles, positions, articles_all, positions_all,
                ARTICLE_MIN_WORDS)
            self.length = articles  # cache corpus length
        finally:
            pool.terminate()
Example #21
 def get_texts(self):
     texts = ((content, self.lemmatize, subject, pageid) for subject, content, pageid in get_messages(self.conn))
     pool = multiprocessing.Pool(self.processes)
     
     posts, token_count = 0, 0
     for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
         for tokens, title, pageid in pool.imap(process_post, group):
             if len(tokens) < ARTICLE_MIN_WORDS:
                 continue
             if self.metadata:
                 yield (tokens, (repr(pageid), title))
             else:
                 yield tokens
             posts += 1
             token_count += len(tokens) 
     pool.terminate()
     
     log.info("Processed %d posts with %d tokens", posts, token_count)
Example #22
    def get_texts(self):
        """
        Iterate over the Wikipedia dump and the HN articles returning text
        """
        wiki_articles, hn_articles, articles_all = 0, 0, 0
        positions, positions_all = 0, 0

        # ************ Wikipedia ************
        texts = ((text, self.lemmatize) for _, text in wikicorpus._extract_pages(bz2.BZ2File(self.wiki_file)))
        pool = multiprocessing.Pool(self.processes)
        for group in utils.chunkize(texts, chunksize=10 * pool._processes, maxsize=1): # otherwise imap puts all the corpus into memory
            for tokens in pool.imap(wikicorpus.process_article, group):
                articles_all += 1
                positions_all += len(tokens)
                if len(tokens) > WIKI_ARTICLE_MIN_WORDS:
                    wiki_articles += 1
                    positions += len(tokens)
                    yield tokens
        pool.terminate()

        print (">>> finished iterating over Wikipedia corpus of %i documents with %i positions (total %i articles, %i positions before pruning articles shorter than %i words)" % (wiki_articles, positions, articles_all, positions_all, WIKI_ARTICLE_MIN_WORDS))

        # ************ HN articles ************
        positions_after_wiki = positions
        fnamelist = []
        for g in glob.iglob(self.hn_folder + '/*.txt'):
            fnamelist.append(g)
        for fileno, fname in enumerate(fnamelist): # TODO parallelize as Wiki
            hn_text = open(fname).read()
            if self.lemmatize:
                result = utils.lemmatize(hn_text) # text into lemmas here
            else:
                result = tokenize(hn_text) # text into tokens here
            articles_all += 1
            positions_all += len(result)
            if len(result) > HN_ARTICLE_MIN_WORDS:
                hn_articles += 1
                positions += len(result)
                yield result

        print (">>> finished iterating over HN corpus of %i documents with %i positions" % (hn_articles, positions - positions_after_wiki))
        # ************ /HN articles ************

        self.length = wiki_articles + hn_articles # cache corpus length
Example #23
def process_wiki_dump(source, target, processes=None):
    if processes is None:
        processes = max(1, multiprocessing.cpu_count() - 1)
    print(processes)

    with open(source, 'r', encoding='utf-8') as dump_file, \
         open(target, 'w', encoding='utf-8') as out_file:

        page_generator = extract_pages(dump_file, filter_namespaces=set(['0']))

        #for title, text, pageid in page_generator:
        #    sentences, title, pageid = process_page(title, text, pageid)
        #    for sentence in sentences:
        #        out_file.write(sentence + '\n')

        with multiprocessing.Pool(processes) as pool:
            for group in utils.chunkize(page_generator, chunksize=10 * processes, maxsize=1):
                for sentences, title, pageid in pool.imap(process_page, group):
                    for sentence in sentences:
                        out_file.write(sentence + '\n')
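A hypothetical invocation follows (file names are placeholders). Note that this variant opens the dump in text mode, so it expects an *uncompressed* XML dump rather than the `.bz2` files used in the other examples:

if __name__ == "__main__":
    process_wiki_dump("enwiki-latest-pages-articles.xml", "wiki_sentences.txt")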
Example #24
    def get_texts(self):
        """
        Iterate over the dump, returning text version of each article as a list
        of tokens.

        Only articles of sufficient length are returned (short articles & redirects
        etc are ignored).

        Note that this iterates over the **texts**; if you want vectors, just use
        the standard corpus interface instead of this function::

        >>> for vec in wiki_corpus:
        >>>     print(vec)
        """
        articles, articles_all = 0, 0
        positions, positions_all = 0, 0
        texts = ((text, self.lemmatize)
                 for _, text in _extract_pages(bz2.BZ2File(self.fname)))
        pool = multiprocessing.Pool(self.processes)
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would load the entire input into RAM at once...
        for group in utils.chunkize(texts,
                                    chunksize=10 * self.processes,
                                    maxsize=1):
            for tokens in pool.imap(process_article, group):  # chunksize=10):
                articles_all += 1
                positions_all += len(tokens)
                if len(tokens) > ARTICLE_MIN_WORDS:  # article redirects and short stubs are pruned here
                    articles += 1
                    positions += len(tokens)
                    yield tokens
        pool.terminate()

        logger.info(
            "finished iterating over Wikipedia corpus of %i documents with %i positions"
            " (total %i articles, %i positions before pruning articles shorter than %i words)"
            % (articles, positions, articles_all, positions_all,
               ARTICLE_MIN_WORDS))
        self.length = articles  # cache corpus length
Example #25
    def get_texts(self):
        texts = ((content, self.lemmatize, subject, pageid)
                 for subject, content, pageid in get_messages(self.conn))
        pool = multiprocessing.Pool(self.processes)

        posts, token_count = 0, 0
        for group in utils.chunkize(texts,
                                    chunksize=10 * self.processes,
                                    maxsize=1):
            for tokens, title, pageid in pool.imap(process_post, group):
                if len(tokens) < ARTICLE_MIN_WORDS:
                    continue
                if self.metadata:
                    yield (tokens, (repr(pageid), title))
                else:
                    yield tokens
                posts += 1
                token_count += len(tokens)
        pool.terminate()

        log.info("Processed %d posts with %d tokens", posts, token_count)
Example #26
    def get_texts(self):
        """
        Iterate over the dump, returning text version of each article as a list
        of tokens.
        """

        reviews = 0
        positions = 0
        texts = [text for text in _extract_reviews(self.fname)]
        pool = multiprocessing.Pool(self.processes)
        # process the corpus in smaller chunks of docs, because multiprocessing.Pool
        # is dumb and would load the entire input into RAM at once...
        #for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
        for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
            for tokens in pool.imap(process_review, group): # chunksize=10):
                reviews += 1
                positions += len(tokens)
                yield tokens
        pool.terminate()

        logger.info("finished iterating over the generated Yelp corpus of %i documents with %i positions"
                " (total %i articles, %i positions before pruning articles shorter than %i words)" %
                (reviews, positions, reviews, positions, 10000))
        self.length = reviews # cache corpus length
Example #27
    def addDocuments(self, corpus, chunks=None, decay=None):
        """
        Update singular value decomposition to take into account a new
        corpus of documents.

        Training proceeds in chunks of `chunks` documents at a time. The size of
        `chunks` is a tradeoff between increased speed (bigger `chunks`)
        vs. lower memory footprint (smaller `chunks`). If the distributed mode
        is on, each chunk is sent to a different worker/computer.

        Setting `decay` < 1.0 causes re-orientation towards new data trends in the
        input document stream, by giving less emphasis to old observations. This allows
        LSA to gradually "forget" old observations (documents) and give more
        preference to new ones.
        """
        logger.info("updating SVD with new documents")

        # get computation parameters; if not specified, use the ones from constructor
        if chunks is None:
            chunks = self.chunks
        if decay is None:
            decay = self.decay

        if not scipy.sparse.issparse(corpus):
            if not self.onepass:
                # we are allowed multiple passes over the input => use a faster, randomized two-pass algo
                update = Projection(self.numTerms, self.numTopics, None)
                update.u, update.s = stochasticSvd(
                    corpus,
                    self.numTopics,
                    num_terms=self.numTerms,
                    chunks=chunks,
                    extra_dims=self.extra_samples,
                    power_iters=self.power_iters)
                self.projection.merge(update, decay=decay)
            else:
                # the one-pass algo

                doc_no = 0
                # the corpus will be processed in chunks of `chunks` of documents.
                # keep preparing new chunks in a separate thread, so that we don't
                # waste time waiting for chunks to be read from disk. instead, fill
                # a (relatively short) chunk queue asynchronously in utils.chunkize,
                # and pop already-ready chunks from it as needed.
                for chunk_no, chunk in enumerate(
                        utils.chunkize(corpus, chunks, self.numworkers)):
                    # construct the job as a sparse matrix, to minimize memory overhead
                    # definitely avoid materializing it as a dense matrix!
                    job = matutils.corpus2csc(chunk, num_terms=self.numTerms)
                    del chunk
                    doc_no += job.shape[1]
                    if self.dispatcher:
                        # distributed version: add this job to the job queue, so workers can work on it
                        logger.debug("creating job #%i" % chunk_no)
                        self.dispatcher.putjob(
                            job
                        )  # put job into queue; this will eventually block, because the queue has a small finite size
                        del job
                        logger.info("dispatched documents up to #%s" % doc_no)
                    else:
                        # serial version, there is only one "worker" (myself) => process the job directly
                        update = Projection(self.numTerms, self.numTopics, job)
                        del job
                        self.projection.merge(update, decay=decay)
                        del update
                        logger.info("processed documents up to #%s" % doc_no)
                        self.printTopics(5)  # TODO see if printDebug works and remove one of these..

                # wait for all workers to finish (distributed version only)
                if self.dispatcher:
                    logger.info(
                        "reached the end of input; now waiting for all remaining jobs to finish"
                    )
                    self.projection = self.dispatcher.getstate()
#            logging.info("top topics after adding %i documents" % doc_no)
#            self.printDebug(10)
        else:
            assert not self.dispatcher, "must be in serial mode to receive jobs"
            assert self.onepass, "distributed two-pass algo not supported yet"
            update = Projection(self.numTerms, self.numTopics, corpus.tocsc())
            self.projection.merge(update, decay=decay)
            logger.info("processed sparse job of %i documents" %
                        (corpus.shape[1]))
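In the one-pass branch above, each chunk produced by `utils.chunkize` is turned into a sparse column matrix with `matutils.corpus2csc` before being merged into the projection, so no dense document-term matrix is ever built. A small sketch of just that chunk-to-sparse-matrix step (the toy bag-of-words corpus and `num_terms` are stand-ins):

from gensim import matutils, utils

# toy bag-of-words corpus: three documents over a 3-term vocabulary
bow_corpus = [[(0, 1.0), (2, 3.0)], [(1, 2.0)], [(0, 1.0), (1, 1.0)]]
num_terms = 3

for chunk_no, chunk in enumerate(utils.chunkize(bow_corpus, chunksize=2, maxsize=0)):
    # build the job as a scipy.sparse CSC matrix (terms x docs); never
    # materialize a dense document-term matrix
    job = matutils.corpus2csc(chunk, num_terms=num_terms)
    print(chunk_no, job.shape)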
Example #28
    def update(self, corpus, chunks=None, decay=None, passes=None, update_every=None):
        """
        Train the model with new documents, by EM-iterating over `corpus` until
        the topics converge (or until the maximum number of allowed iterations 
        is reached).
        
        In distributed mode, the E step is distributed over a cluster of machines.
        
        This update also supports updating an already trained model (`self`) 
        with new documents from `corpus`; the two models are then merged in 
        proportion to the number of old vs. new documents. This feature is still 
        experimental for non-stationary input streams. 
        
        For stationary input (no topic drift in new documents), on the other hand, 
        this equals the online update of Hoffman et al. and is guaranteed to 
        converge for any `decay` in (0.5, 1.0].
        """
        if chunks is None:
            chunks = self.chunks
        if decay is None:
            decay = self.decay
        if passes is None:
            passes = self.passes
        if update_every is None:
            update_every = self.update_every
        if not passes:
            # if the number of whole-corpus iterations was not specified explicitly,
            # assume iterating over the corpus until convergence (or until self.MAXITER 
            # iterations, whichever happens first)
            passes = self.MAXITER
        
        # rho is the "speed" of updating; TODO try other fncs
        rho = lambda: pow(1.0 + self.num_updates, -decay)
        
        try:
            lencorpus = len(corpus)
        except:
            logger.warning("input corpus stream has no len(); counting documents")
            lencorpus = sum(1 for _ in corpus)
        if lencorpus == 0:
            logger.warning("LdaModel.update() called with an empty corpus")
            return
        self.state.numdocs += lencorpus
        
        if update_every > 0:
            updatetype = "online"
            updateafter = min(lencorpus, update_every * self.numworkers * chunks)
        else:
            updatetype = "batch"
            updateafter = lencorpus
        
        updates_per_pass = max(1, lencorpus / updateafter)
        logger.info("running %s LDA training, %s topics, %i passes over "
                    "the supplied corpus of %i documets, updating model once "
                    "every %i documents" %
                    (updatetype, self.numTopics, passes, lencorpus, updateafter))
        if updates_per_pass * passes < 10:
            logger.warning("too few updates, training might not converge; consider "
                           "increasing the number of passes to improve accuracy")

        for iteration in xrange(passes):
            if self.dispatcher:
                logger.info('initializing workers')
                self.dispatcher.reset(self.state)
            else:
                other = LdaState(self.state.sstats)
            dirty = False
            
            # the corpus will be processed in chunks of `chunks` of documents. 
            # keep preparing new chunks in a separate thread, so that we don't 
            # waste time waiting for chunks to be read from disk. instead, fill 
            # a (relatively short) chunk queue asynchronously in utils.chunkize, 
            # and pop already-ready chunks from it as needed.
            for chunk_no, chunk in enumerate(utils.chunkize(corpus, chunks, self.numworkers)):
                if self.dispatcher:
                    # add the chunk to dispatcher's job queue, so workers can munch on it
                    logger.info('PROGRESS: iteration %i, dispatching documents up to #%i/%i' % 
                                (iteration, chunk_no * chunks + len(chunk), lencorpus))
                    # this will eventually block until some jobs finish, because the queue has a small finite length
                    # convert each document to a 2d numpy array (~6x faster when transmitting 
                    # list data over the wire, in Pyro)
                    self.dispatcher.putjob(chunk)
                else:
                    logger.info('PROGRESS: iteration %i, at document #%i/%i' %
                                (iteration, chunk_no * chunks + len(chunk), lencorpus))
                    self.doEstep(chunk, other)
                dirty = True
                
                if update_every and (chunk_no + 1) % (update_every * self.numworkers) == 0:
                    if self.dispatcher:
                        # distributed mode: wait for all workers to finish
                        logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                        other = self.dispatcher.getstate()
                    
                    diff = self.doMstep(rho(), other)
                    del other # free up some mem
                    
                    if self.dispatcher:
                        logger.info('initializing workers')
                        self.dispatcher.reset(self.state)
                    else:
                        other = LdaState(self.state.sstats)
                    dirty = False
            #endfor corpus iteration
            
            if dirty:
                # finish any remaining updates
                if self.dispatcher:
                    # distributed mode: wait for all workers to finish
                    logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                    other = self.dispatcher.getstate()
                self.doMstep(rho(), other)
                dirty = False
Example #29
    def get_texts(self):
        """Iterate over the dump, yielding a list of tokens for each article that passed
        the length and namespace filtering.

        Uses multiprocessing internally to parallelize the work and process the dump more quickly.

        Notes
        -----
        This iterates over the **texts**. If you want vectors, just use the standard corpus interface
        instead of this method:

        Examples
        --------
        >>> from gensim.test.utils import datapath
        >>> from gensim.corpora import WikiCorpus
        >>>
        >>> path_to_wiki_dump = datapath("enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2")
        >>>
        >>> for vec in WikiCorpus(path_to_wiki_dump):
        ...     pass

        Yields
        ------
        list of str
            If `metadata` is False, yield only list of token extracted from the article.
        (list of str, (int, str))
            List of tokens (extracted from the article), page id and article title otherwise.

        """
        articles, articles_all = 0, 0
        positions, positions_all = 0, 0

        tokenization_params = (self.tokenizer_func, self.token_min_len,
                               self.token_max_len, self.lower)
        texts = \
            ((text, self.lemmatize, title, pageid, tokenization_params)
             for title, text, pageid
             in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces))
        pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt)

        try:
            # process the corpus in smaller chunks of docs, because multiprocessing.Pool
            # is dumb and would load the entire input into RAM at once...
            for group in utils.chunkize(texts,
                                        chunksize=10 * self.processes,
                                        maxsize=1):
                for tokens, title, pageid in pool.imap(_process_article,
                                                       group):
                    articles_all += 1
                    positions_all += len(tokens)
                    # article redirects and short stubs are pruned here
                    if len(tokens) < self.article_min_tokens or \
                            any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                        continue
                    articles += 1
                    positions += len(tokens)
                    if self.metadata:
                        yield (tokens, (pageid, title))
                    else:
                        yield tokens

        except KeyboardInterrupt:
            logger.warn(
                "user terminated iteration over Wikipedia corpus after %i documents with %i positions "
                "(total %i articles, %i positions before pruning articles shorter than %i words)",
                articles, positions, articles_all, positions_all,
                ARTICLE_MIN_WORDS)
        else:
            logger.info(
                "finished iterating over Wikipedia corpus of %i documents with %i positions "
                "(total %i articles, %i positions before pruning articles shorter than %i words)",
                articles, positions, articles_all, positions_all,
                ARTICLE_MIN_WORDS)
            self.length = articles  # cache corpus length
        finally:
            pool.terminate()
Example #30
    def update(self, corpus, chunks=None, decay=None, passes=None, update_every=None):
        """
        Train the model with new documents, by EM-iterating over `corpus` until
        the topics converge (or until the maximum number of allowed iterations
        is reached).

        In distributed mode, the E step is distributed over a cluster of machines.

        This update also supports updating an already trained model (`self`)
        with new documents from `corpus`; the two models are then merged in
        proportion to the number of old vs. new documents. This feature is still
        experimental for non-stationary input streams.

        For stationary input (no topic drift in new documents), on the other hand,
        this equals the online update of Hoffman et al. and is guaranteed to
        converge for any `decay` in (0.5, 1.0].
        """
        if chunks is None:
            chunks = self.chunks
        if decay is None:
            decay = self.decay
        if passes is None:
            passes = self.passes
        if update_every is None:
            update_every = self.update_every
        if not passes:
            # if the number of whole-corpus iterations was not specified explicitly,
            # assume iterating over the corpus until convergence (or until self.MAXITER
            # iterations, whichever happens first)
            passes = self.MAXITER

        # rho is the "speed" of updating; TODO try other fncs
        rho = lambda: pow(1.0 + self.num_updates, -decay)

        try:
            lencorpus = len(corpus)
        except:
            logger.warning("input corpus stream has no len(); counting documents")
            lencorpus = sum(1 for _ in corpus)
        if lencorpus == 0:
            logger.warning("LdaModel.update() called with an empty corpus")
            return
        self.state.numdocs += lencorpus

        if update_every > 0:
            updatetype = "online"
            updateafter = min(lencorpus, update_every * self.numworkers * chunks)
        else:
            updatetype = "batch"
            updateafter = lencorpus

        updates_per_pass = max(1, lencorpus / updateafter)
        logger.info("running %s LDA training, %s topics, %i passes over "
                    "the supplied corpus of %i documents, updating model once "
                    "every %i documents" %
                    (updatetype, self.numTopics, passes, lencorpus, updateafter))
        if updates_per_pass * passes < 10:
            logger.warning("too few updates, training might not converge; consider "
                           "increasing the number of passes to improve accuracy")

        for iteration in xrange(passes):
            if self.dispatcher:
                logger.info('initializing %s workers' % self.numworkers)
                self.dispatcher.reset(self.state)
            else:
                other = LdaState(self.state.sstats)
            dirty = False

            # the corpus will be processed in chunks of `chunks` of documents.
            # keep preparing new chunks in a separate thread, so that we don't
            # waste time waiting for chunks to be read from disk. instead, fill
            # a (relatively short) chunk queue asynchronously in utils.chunkize,
            # and pop already-ready chunks from it as needed.
            for chunk_no, chunk in enumerate(utils.chunkize(corpus, chunks, self.numworkers)):
                if self.dispatcher:
                    # add the chunk to dispatcher's job queue, so workers can munch on it
                    logger.info('PROGRESS: iteration %i, dispatching documents up to #%i/%i' %
                                (iteration, chunk_no * chunks + len(chunk), lencorpus))
                    # this will eventually block until some jobs finish, because the queue has a small finite length
                    # convert each document to a 2d numpy array (~6x faster when transmitting
                    # list data over the wire, in Pyro)
                    self.dispatcher.putjob(chunk)
                else:
                    logger.info('PROGRESS: iteration %i, at document #%i/%i' %
                                (iteration, chunk_no * chunks + len(chunk), lencorpus))
                    self.doEstep(chunk, other)
                dirty = True

                if update_every and (chunk_no + 1) % (update_every * self.numworkers) == 0:
                    if self.dispatcher:
                        # distributed mode: wait for all workers to finish
                        logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                        other = self.dispatcher.getstate()

                    diff = self.doMstep(rho(), other)
                    del other # free up some mem

                    if self.dispatcher:
                        logger.info('initializing workers')
                        self.dispatcher.reset(self.state)
                    else:
                        other = LdaState(self.state.sstats)
                    dirty = False
            #endfor corpus iteration

            if dirty:
                # finish any remaining updates
                if self.dispatcher:
                    # distributed mode: wait for all workers to finish
                    logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                    other = self.dispatcher.getstate()
                self.doMstep(rho(), other)
                dirty = False
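A minimal usage sketch for the `update()` method above, assuming the older camelCase gensim LDA API this snippet comes from (recent gensim renames `numTopics`/`chunks` to `num_topics`/`chunksize`); `my_dictionary`, `first_batch` and `new_batch` are hypothetical placeholders:

from gensim.models.ldamodel import LdaModel

lda = LdaModel(corpus=first_batch, id2word=my_dictionary, numTopics=100)  # initial training
lda.update(new_batch)            # fold in new documents using the defaults set in the constructor
lda.update(new_batch, passes=2)  # or force two full passes over the new documents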
Example #31
0
def preprocess_data(
    train_infile,
    test_infile,
    output_dir,
    train_prefix,
    test_prefix,
    min_doc_count=0,
    max_doc_freq=1.0,
    ngram_range=(1, 1),
    vocab_size=None,
    stopwords=None,
    keep_num=False,
    keep_alphanum=False,
    strip_html=False,
    lower=True,
    min_word_length=3,
    max_doc_length=5000,
    label_fields=None,
    workers=4,
    proc_multiplier=500,
):

    if stopwords == "mallet":
        print("Using Mallet stopwords")
        stopword_list = fh.read_text(
            os.path.join("stopwords", "mallet_stopwords.txt"))
    elif stopwords == "snowball":
        print("Using snowball stopwords")
        stopword_list = fh.read_text(
            os.path.join("stopwords", "snowball_stopwords.txt"))
    elif stopwords is not None:
        print("Using custom stopwords")
        stopword_list = fh.read_text(
            os.path.join("stopwords", stopwords + "_stopwords.txt"))
    else:
        stopword_list = []
    stopword_set = {s.strip() for s in stopword_list}

    print("Reading data files")
    train_items = fh.LazyJsonlistReader(train_infile)
    n_train = len(train_items)
    print("Found {:d} training documents".format(n_train))

    if test_infile is not None:
        test_items = fh.LazyJsonlistReader(test_infile)
        n_test = len(test_items)
        print("Found {:d} test documents".format(n_test))
    else:
        test_items = []
        n_test = 0

    n_items = n_train + n_test

    # always define label_lists and normalize label_fields into a list, so the later
    # calls that use them do not hit a NameError when no label fields are given
    label_lists = {}
    if label_fields:
        if "," in label_fields:
            label_fields = label_fields.split(",")
        else:
            label_fields = [label_fields]
    else:
        label_fields = []

    # make vocabulary
    train_ids, train_parsed, train_labels = [], [], []
    test_ids, test_parsed, test_labels = [], [], []

    print("Parsing documents")
    word_counts = Counter()
    doc_counts = Counter()

    vocab = None

    # process in blocks
    pool = multiprocessing.Pool(workers)
    chunksize = proc_multiplier * workers

    kwargs = {
        "strip_html": strip_html,
        "lower": lower,
        "keep_numbers": keep_num,
        "keep_alphanum": keep_alphanum,
        "min_length": min_word_length,
        "stopwords": stopword_set,
        "ngram_range": ngram_range,
        "vocab": vocab,
        "label_fields": label_fields,
    }

    # these two loops below do the majority of the preprocessing. unfortunately, without
    # a major refactor they cannot be turned into generators, so the results of
    # tokenization must be appended to a list, which implies a large memory footprint
    for i, group in enumerate(chunkize(iter(train_items),
                                       chunksize=chunksize)):
        print(f"On training chunk {i} of {len(train_items) // chunksize}",
              end="\r")
        for ids, tokens, labels in pool.imap(partial(_process_item, **kwargs),
                                             group):
            # store the parsed documents
            if ids is not None:
                train_ids.append(ids)
            if labels is not None:
                train_labels.append(labels)
            tokens = tokens[:max_doc_length]

            # keep track of the number of documents with each word
            word_counts.update(tokens)
            doc_counts.update(set(tokens))
            train_parsed.append(" ".join(tokens))  # more efficient storage

    print("Train set processing complete")

    for i, group in enumerate(chunkize(iter(test_items), chunksize=chunksize)):
        print(f"On testing chunk {i} of {len(test_items) // chunksize}",
              end="\r")
        for ids, tokens, labels in pool.imap(partial(_process_item, **kwargs),
                                             group):
            # store the parsed documents
            if ids is not None:
                test_ids.append(ids)
            if labels is not None:
                test_labels.append(labels)
            tokens = tokens[:max_doc_length]

            # keep track of the number of documents with each word
            word_counts.update(tokens)
            doc_counts.update(set(tokens))
            test_parsed.append(" ".join(tokens))  # more efficient storage

    print("Test set processing complete")
    pool.terminate()

    print("Size of full vocabulary=%d" % len(word_counts))

    # store possible label values
    if label_fields:
        labels_df = pd.DataFrame.from_records(train_labels + test_labels)
    for label_name in label_fields:
        label_list = sorted(labels_df[label_name].unique().tolist())
        n_labels = len(label_list)
        print("Found label %s with %d classes" % (label_name, n_labels))
        label_lists[label_name] = label_list

    print("Selecting the vocabulary")
    most_common = doc_counts.most_common()
    words, doc_counts = zip(*most_common)
    doc_freqs = np.array(doc_counts) / float(n_items)
    vocab = [
        word for i, word in enumerate(words)
        if doc_counts[i] >= min_doc_count and doc_freqs[i] <= max_doc_freq
    ]
    most_common = [
        word for i, word in enumerate(words) if doc_freqs[i] > max_doc_freq
    ]
    if max_doc_freq < 1.0:
        print(
            "Excluding words with frequency > {:0.2f}:".format(max_doc_freq),
            most_common,
        )

    print("Vocab size after filtering = %d" % len(vocab))
    if vocab_size is not None:
        if len(vocab) > int(vocab_size):
            vocab = vocab[:int(vocab_size)]

    vocab_size = len(vocab)
    print("Final vocab size = %d" % vocab_size)

    print("Most common words remaining:", " ".join(vocab[:10]))
    vocab.sort()

    fh.write_to_json(vocab,
                     os.path.join(output_dir, train_prefix + ".vocab.json"))

    # np.int was removed in recent NumPy releases; use an explicit integer dtype instead
    count_dtype = np.uint16 if max_doc_length < np.iinfo(np.uint16).max else np.int64

    train_X_sage, tr_aspect, tr_no_aspect, tr_widx, vocab_for_sage = process_subset(
        train_items,
        train_ids,
        train_parsed,
        train_labels,
        label_fields,
        label_lists,
        vocab,
        output_dir,
        train_prefix,
        count_dtype=count_dtype,
    )
    if n_test > 0:
        test_X_sage, te_aspect, te_no_aspect, _, _ = process_subset(
            test_items,
            test_ids,
            test_parsed,
            test_labels,
            label_fields,
            label_lists,
            vocab,
            output_dir,
            test_prefix,
            count_dtype=count_dtype,
        )

    train_sum = np.array(train_X_sage.sum(axis=0))
    print("%d words missing from training data" % np.sum(train_sum == 0))

    if n_test > 0:
        test_sum = np.array(test_X_sage.sum(axis=0))
        print("%d words missing from test data" % np.sum(test_sum == 0))

    sage_output = {
        "tr_data": train_X_sage,
        "tr_aspect": tr_aspect,
        "widx": tr_widx,
        "vocab": vocab_for_sage,
    }
    if n_test > 0:
        sage_output["te_data"] = test_X_sage
        sage_output["te_aspect"] = te_aspect
    savemat(os.path.join(output_dir, "sage_labeled.mat"), sage_output)
    sage_output["tr_aspect"] = tr_no_aspect
    if n_test > 0:
        sage_output["te_aspect"] = te_no_aspect
    savemat(os.path.join(output_dir, "sage_unlabeled.mat"), sage_output)

    print("Done!")
Example #32
0
File: wiki.py Project: hans/deepBLE
    def get_texts(self):
        """
        Iterate over the corpus data, yielding sentences of the text
        version of each article (each sentence represented as a list of
        tokens).

        See the `WikiCorpus` class for more.
        """

        # Unfortunately due to the OOP-unfriendly implementation of
        # `WikiCorpus` we have to copy-and-paste some code. This code is
        # based on `WikiCorpus#get_texts`.

        n_articles, n_articles_all = 0, 0
        n_sentences, n_sentences_all = 0, 0

        pages = _extract_pages(self.open_corpus_file(), self.filter_namespaces)
        texts = ((text, self.lemmatize, title, pageid)
                 for title, text, pageid in pages)

        pool = multiprocessing.Pool(self.processes)

        # process the corpus in smaller chunks of docs, because
        # multiprocessing.Pool is dumb and would load the entire input
        # into RAM at once...
        chunks = utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1)

        for group in chunks:
            for sentences, title, pageid in pool.imap(process_article, group):
                n_articles_all += 1
                n_sentences_all += len(sentences)

                num_tokens = sum(len(sentence) for sentence in sentences)

                # article redirects and short stubs are pruned here
                if num_tokens > ARTICLE_MIN_WORDS:
                    n_articles += 1
                    n_sentences += len(sentences)

                    for sentence in sentences:
                        if self.metadata:
                            yield (sentence, (pageid, title))
                        else:
                            yield sentence

                        if self.sentences_out is not None:
                            self.sentences_out.write(' '.join(sentence))
                            self.sentences_out.write('\n')

        pool.terminate()

        LOGGER.info("finished iterating over Wikipedia corpus of %i "
                    "articles with %i sentences (%i articles / %i "
                    "sentences retained)" %
                    (n_articles_all, n_sentences_all, n_articles, n_sentences))

        # cache corpus length
        self.length = n_sentences

        # Close sentences file if we were writing one
        if self.sentences_out is not None:
            self.sentences_out.close()
            self.sentences_out = None
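A hedged sketch of how such a sentence-level corpus might be consumed, for example to train word vectors; `corpus` stands for an instance of this project's WikiCorpus subclass and is an assumption, not code from the source:

from gensim.models import Word2Vec

sentences = list(corpus.get_texts())   # each yielded item is one tokenized sentence (metadata off)
model = Word2Vec(sentences)            # train word vectors with gensim's default hyperparameters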
Example #33
0
    def get_texts(self):
        """Iterate over the dump, yielding a list of tokens for each article that passed
        the length and namespace filtering.

        Uses multiprocessing internally to parallelize the work and process the dump more quickly.

        Notes
        -----
        This iterates over the **texts**. If you want vectors, just use the standard corpus interface
        instead of this method:

        Examples
        --------
        .. sourcecode:: pycon

            >>> from gensim.test.utils import datapath
            >>> from gensim.corpora import WikiCorpus
            >>>
            >>> path_to_wiki_dump = datapath("enwiki-latest-pages-articles1.xml-p000000010p000030302-shortened.bz2")
            >>>
            >>> for vec in WikiCorpus(path_to_wiki_dump):
            ...     pass

        Yields
        ------
        list of str
            If `metadata` is False, yield only list of token extracted from the article.
        (list of str, (int, str))
            List of tokens (extracted from the article), page id and article title otherwise.

        """
        articles, articles_all = 0, 0
        positions, positions_all = 0, 0

        tokenization_params = (self.tokenizer_func, self.token_min_len, self.token_max_len, self.lower)
        texts = \
            ((text, self.lemmatize, title, pageid, tokenization_params)
             for title, text, pageid
             in extract_pages(bz2.BZ2File(self.fname), self.filter_namespaces, self.filter_articles))
        pool = multiprocessing.Pool(self.processes, init_to_ignore_interrupt)

        try:
            # process the corpus in smaller chunks of docs, because multiprocessing.Pool
            # is dumb and would load the entire input into RAM at once...
            for group in utils.chunkize(texts, chunksize=10 * self.processes, maxsize=1):
                for tokens, title, pageid in pool.imap(_process_article, group):
                    articles_all += 1
                    positions_all += len(tokens)
                    # article redirects and short stubs are pruned here
                    if len(tokens) < self.article_min_tokens or \
                            any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
                        continue
                    articles += 1
                    positions += len(tokens)
                    if self.metadata:
                        yield (tokens, (pageid, title))
                    else:
                        yield tokens

        except KeyboardInterrupt:
            logger.warning(
                "user terminated iteration over Wikipedia corpus after %i documents with %i positions "
                "(total %i articles, %i positions before pruning articles shorter than %i words)",
                articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS
            )
        except PicklingError as exc:
            raise_from(PicklingError('Can not send filtering function {} to multiprocessing, '
                'make sure the function can be pickled.'.format(self.filter_articles)), exc)
        else:
            logger.info(
                "finished iterating over Wikipedia corpus of %i documents with %i positions "
                "(total %i articles, %i positions before pruning articles shorter than %i words)",
                articles, positions, articles_all, positions_all, ARTICLE_MIN_WORDS
            )
            self.length = articles  # cache corpus length
        finally:
            pool.terminate()
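Beyond the docstring example above, a common pattern is to stream the tokens straight into a gensim Dictionary; a minimal sketch, assuming a locally downloaded dump (the file name is illustrative):

from gensim.corpora import Dictionary, WikiCorpus

wiki = WikiCorpus("enwiki-latest-pages-articles.xml.bz2", dictionary={})  # empty dict skips the built-in vocabulary pass
dictionary = Dictionary(wiki.get_texts())  # single streamed pass over the dump, with the pruning shown above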
Example #34
0
       contents.append(regex.sub('', text))
os.chdir(wd)
# import stopword list
filename = 'stopwords_eng.txt'
with io.open(filename, 'r', encoding='utf8') as f:
    text = f.read()
stoplist = set(text.split())
# tokenize and case fold
contents_tok = [[w for w in doc.lower().split() if w not in stoplist] for doc in contents]
# chunk each document into n chunks
n = 100
from gensim.utils import chunkize
contents_chunk = []
for doc in contents_tok:
    clen = max(1, len(doc) // n)  # chunkize expects a positive integer chunk size
    for c in chunkize(doc, clen):
        contents_chunk.append(c)
# extract raw frequencies
from gensim import corpora, models
from collections import defaultdict
import numpy as np
# compute word freq
frequency = defaultdict(int)
for chunk in contents_chunk:
    for token in chunk:
        frequency[token] += 1
freq = list(frequency.values())
# prune bottom (mn) and top (mx)
mn = 1
mx = np.percentile(freq, 98)
contents_chunk = [[token for token in chunk if frequency[token] > mn and frequency[token] <= mx] for chunk in contents_chunk]
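For reference, a quick illustration of what the chunkize step above produces (maxsize=0, the default, keeps chunkize single-threaded):

from gensim.utils import chunkize

tokens = ["a", "b", "c", "d", "e", "f", "g"]
for c in chunkize(tokens, chunksize=3, maxsize=0):
    print(c)
# prints ['a', 'b', 'c'], then ['d', 'e', 'f'], then the leftover ['g']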
Example #35
0
    def addDocuments(self, corpus, chunks=None, decay=None):
        """
        Update singular value decomposition to take into account a new
        corpus of documents.

        Training proceeds in chunks of `chunks` documents at a time. The size of
        `chunks` is a tradeoff between increased speed (bigger `chunks`)
        vs. lower memory footprint (smaller `chunks`). If the distributed mode
        is on, each chunk is sent to a different worker/computer.

        Setting `decay` < 1.0 causes re-orientation towards new data trends in the
        input document stream, by giving less emphasis to old observations. This allows
        LSA to gradually "forget" old observations (documents) and give more
        preference to new ones.
        """
        logger.info("updating SVD with new documents")

        # get computation parameters; if not specified, use the ones from constructor
        if chunks is None:
            chunks = self.chunks
        if decay is None:
            decay = self.decay

        if not scipy.sparse.issparse(corpus):
            if not self.onepass:
                # we are allowed multiple passes over the input => use a faster, randomized two-pass algo
                update = Projection(self.numTerms, self.numTopics, None)
                update.u, update.s = stochasticSvd(corpus, self.numTopics,
                    num_terms=self.numTerms, chunks=chunks,
                    extra_dims=self.extra_samples, power_iters=self.power_iters)
                self.projection.merge(update, decay=decay)
            else:
                # the one-pass algo

                doc_no = 0
                # the corpus will be processed in chunks of `chunks` of documents.
                # keep preparing new chunks in a separate thread, so that we don't
                # waste time waiting for chunks to be read from disk. instead, fill
                # a (relatively short) chunk queue asynchronously in utils.chunkize,
                # and pop already-ready chunks from it as needed.
                for chunk_no, chunk in enumerate(utils.chunkize(corpus, chunks, self.numworkers)):
                    # construct the job as a sparse matrix, to minimize memory overhead
                    # definitely avoid materializing it as a dense matrix!
                    job = matutils.corpus2csc(chunk, num_terms=self.numTerms)
                    del chunk
                    doc_no += job.shape[1]
                    if self.dispatcher:
                        # distributed version: add this job to the job queue, so workers can work on it
                        logger.debug("creating job #%i" % chunk_no)
                        self.dispatcher.putjob(job) # put job into queue; this will eventually block, because the queue has a small finite size
                        del job
                        logger.info("dispatched documents up to #%s" % doc_no)
                    else:
                        # serial version, there is only one "worker" (myself) => process the job directly
                        update = Projection(self.numTerms, self.numTopics, job)
                        del job
                        self.projection.merge(update, decay=decay)
                        del update
                        logger.info("processed documents up to #%s" % doc_no)
                        self.printTopics(5) # TODO see if printDebug works and remove one of these..

                # wait for all workers to finish (distributed version only)
                if self.dispatcher:
                    logger.info("reached the end of input; now waiting for all remaining jobs to finish")
                    self.projection = self.dispatcher.getstate()
#            logging.info("top topics after adding %i documents" % doc_no)
#            self.printDebug(10)
        else:
            assert not self.dispatcher, "must be in serial mode to receive jobs"
            assert self.onepass, "distributed two-pass algo not supported yet"
            update = Projection(self.numTerms, self.numTopics, corpus.tocsc())
            self.projection.merge(update, decay=decay)
            logger.info("processed sparse job of %i documents" % (corpus.shape[1]))