class WikiCorpus(interfaces.CorpusABC):
    """
    Treat a Wikipedia articles dump (*articles.xml.bz2) as a (read-only) corpus.
    
    The documents are extracted on-the-fly, so that the whole (massive) dump
    can stay compressed on disk.
    
    >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2') # create word->word_id mapping, takes almost 7h
    >>> wiki.saveAsText('wiki_en_vocab200k') # another 7.5h, creates a file in MatrixMarket format plus file with id->word
    
    """
    def __init__(self, fname, noBelow=20, keep_words=200000, dictionary=None):
        """
        Initialize the corpus. This scans the corpus once, to determine its
        vocabulary: only the `keep_words` most frequent words that appear in at
        least `noBelow` documents are kept; words that appear in more than 10% of
        all documents are discarded as well (see the `noAbove=0.1` call below).
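
        A minimal sketch of reusing a previously built vocabulary, which skips the
        expensive initial scan (`other_dictionary` is a hypothetical, already
        constructed `Dictionary` instance):

        >>> wiki = WikiCorpus('enwiki-20100622-pages-articles.xml.bz2', dictionary=other_dictionary)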
        """
        self.fname = fname
        if dictionary is None:
            self.dictionary = Dictionary(self.getArticles())
            self.dictionary.filterExtremes(noBelow=noBelow, noAbove=0.1, keepN=keep_words)
        else:
            self.dictionary = dictionary

    
    def __len__(self):
        return self.numDocs # only set after a full pass over the dump (see `getArticles`)


    def __iter__(self):
        """
        The function that defines a corpus -- iterating over the corpus yields 
        vectors, one for each document.
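
        A sketch of typical iteration (assumes `wiki` was built as in the class
        docstring); each yielded `bow` is a sparse vector, i.e. a list of
        `(wordId, count)` 2-tuples:

        >>> for bow in wiki:
        ...     pass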
        """
        for docNo, text in enumerate(self.getArticles()):
            yield self.dictionary.doc2bow(text, allowUpdate=False)

        
    def saveDictionary(self, fname):
        """
        Store id->word mapping to a file, in format `id[TAB]word_utf8[TAB]document frequency[NEWLINE]`.
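
        A sketch of typical usage (the file name is only an example); a hypothetical
        output line for the token "semantic" with id 42 and document frequency 117
        would read `42<TAB>semantic<TAB>117`:

        >>> wiki.saveDictionary('wiki_en_vocab200k_wordids.txt')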
        """
        logger.info("saving dictionary mapping to %s" % fname)
        fout = open(fname, 'w')
        for token, tokenId in sorted(self.dictionary.token2id.iteritems()):
            fout.write("%i\t%s\t%i\n" % (tokenId, token, self.dictionary.docFreq[tokenId]))
        fout.close()
    
    
    @staticmethod
    def loadDictionary(fname):
        """
        Load previously stored mapping between words and their ids.
        
        The result can be used as the `id2word` parameter for input to transformations.
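
        For example, a minimal sketch (assumes an LSA transformation along the lines
        of gensim's `lsimodel.LsiModel`; the exact class and parameter names may
        differ between versions):

        >>> id2word = WikiCorpus.loadDictionary('wiki_en_vocab200k_wordids.txt')
        >>> lsa = LsiModel(corpus=wiki, id2word=id2word, numTopics=200)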
        """
        result = {}
        for lineNo, line in enumerate(open(fname)):
            cols = line[:-1].split('\t')
            if len(cols) == 2:
                wordId, word = cols
            elif len(cols) == 3:
                wordId, word, docFreq = cols
            else:
                continue
            result[int(wordId)] = word # docFreq not used
        return result
    
    
    def saveAsText(self, fname):
        """
        Store the corpus to disk, in a human-readable text format.
        
        This actually saves two files:
        
        1. Document-term co-occurrence frequency counts (bag-of-words), as 
           a Matrix Market file `fname_bow.mm`.
        2. Token to integer mapping, as a text file `fname_wordids.txt`.
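
        The bag-of-words file can later be streamed back without re-parsing the dump,
        e.g. (a sketch; `MmCorpus` refers to this package's Matrix Market reader):

        >>> mm = MmCorpus('wiki_en_vocab200k_bow.mm')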
        
        """
        self.saveDictionary(fname + '_wordids.txt')
        matutils.MmWriter.writeCorpus(fname + '_bow.mm', self, progressCnt=10000)
        
    
    def getArticles(self):
        """
        Iterate over the dump, returning text version of each article.
        
        Only articles of sufficient length are returned (short articles, redirects
        etc. are ignored).
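
        A sketch of direct use (assumes `wiki` was constructed as in the class
        docstring; each yielded item is the list of tokens of one article):

        >>> for tokens in wiki.getArticles():
        ...     pass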
        """
        articles, intext = 0, False
        for lineno, line in enumerate(bz2.BZ2File(self.fname)):
            if line.startswith('      <text'): # relies on the dump's fixed indentation of <text> elements
                intext = True
                line = line[line.find('>') + 1:] # drop the opening <text ...> tag itself
                lines = [line]
            elif intext:
                lines.append(line)
            pos = line.find('</text>') # can be on the same line as <text>
            if pos >= 0:
                intext = False
                if not lines:
                    continue
                lines[-1] = line[:pos]
                text = filterWiki(''.join(lines))
                if len(text) > ARTICLE_MIN_CHARS: # article redirects are pruned here
                    articles += 1
                    yield tokenize(text) # split text into tokens
        
        self.numDocs = articles # cache corpus length