Example #1
    def __init__(self,
                 fname,
                 processes=None,
                 lemmatize=utils.has_pattern(),
                 dictionary=None,
                 filter_namespaces=('0', )):
        """
        Initialize the corpus. Unless a dictionary is provided, this scans the
        corpus once, to determine its vocabulary.

        If `pattern` package is installed, use fancier shallow parsing to get
        token lemmas. Otherwise, use simple regexp tokenization. You can override
        this automatic logic by forcing the `lemmatize` parameter explicitly.

        """
        self.fname = fname
        self.filter_namespaces = filter_namespaces
        self.metadata = False
        if processes is None:
            processes = max(1, multiprocessing.cpu_count() - 1)
        self.processes = processes
        self.lemmatize = lemmatize
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
        else:
            self.dictionary = dictionary
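A minimal usage sketch for the constructor above, assuming it belongs to gensim's WikiCorpus (as the signature suggests); the dump path is hypothetical and the `lemmatize` keyword matches the older gensim API shown here:

from gensim.corpora.wikicorpus import WikiCorpus

# Build the corpus; the vocabulary scan happens here because no dictionary is passed.
wiki = WikiCorpus('enwiki-latest-pages-articles.xml.bz2', lemmatize=False)
# Persist the learned vocabulary for later runs.
wiki.dictionary.save_as_text('wiki_wordids.txt.bz2')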
def lemmatize2(content,
               allowed_tags=re.compile(r'(NN|VB|JJ|RB)'),
               light=False,
               stopwords=frozenset(),
               min_length=2,
               max_length=15):
    """Use the English lemmatizer from `pattern <https://github.com/clips/pattern>`_ to extract UTF8-encoded tokens in
    their base form aka lemma, e.g. "are, is, being" becomes "be" etc.
    This is a smarter version of stemming, taking word context into account.
    MODIFIED to return only words, not concatenated tags!
    """
    if not has_pattern():
        raise ImportError(
            "Pattern library is not installed. Pattern library is needed in order to use lemmatize function"
        )
    from pattern.en import parse
    if light:
        import warnings
        warnings.warn("The light flag is no longer supported by pattern.")
    # tokenization in `pattern` is weird; it gets thrown off by non-letters,
    # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
    # FIXME this throws away all fancy parsing cues, including sentence structure,
    # abbreviations etc.
    content = u' '.join(tokenize(content, lower=True, errors='ignore'))
    parsed = parse(content, lemmata=True, collapse=False)
    result = []
    for sentence in parsed:
        for token, tag, _, _, lemma in sentence:
            if min_length <= len(lemma) <= max_length and not lemma.startswith(
                    '_') and lemma not in stopwords:
                if allowed_tags.match(tag):
                    #lemma += "/" + tag[:2]
                    result.append(lemma.encode('utf8'))
    return result
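A small usage sketch for lemmatize2 above; it assumes the `pattern` package is installed (otherwise the ImportError branch fires):

# Sketch (requires `pattern`): lemmas come back as UTF8-encoded byte strings.
tokens = lemmatize2(u"The quick brown foxes were jumping over the lazy dogs")
print(tokens)  # e.g. something like [b'quick', b'brown', b'fox', b'be', b'jump', b'lazy', b'dog']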
    def __init__(self,
                 fname,
                 processes=None,
                 lemmatize=utils.has_pattern(),
                 dictionary=None,
                 filter_namespaces=('0', )):
        WikiCorpus.__init__(self, fname, processes, lemmatize, dictionary,
                            filter_namespaces)
Example #4
    def __init__(self,
                 fname,
                 processes=None,
                 lemmatize=utils.has_pattern(),
                 dictionary=None,
                 filter_namespaces=('0', ),
                 tokenizer_func=tokenize,
                 article_min_tokens=ARTICLE_MIN_WORDS,
                 token_min_len=TOKEN_MIN_LEN,
                 token_max_len=TOKEN_MAX_LEN,
                 lower=True):
        """Initialize the corpus.

        Unless a dictionary is provided, this scans the corpus once,
        to determine its vocabulary.

        Parameters
        ----------
        fname : str
            Path to the Wikipedia dump file.
        processes : int, optional
            Number of processes to run, defaults to **number of CPUs - 1**.
        lemmatize : bool
            Whether to use lemmatization instead of simple regexp tokenization.
            Defaults to `True` if the *pattern* package is installed.
        dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Dictionary. If not provided, the corpus is scanned once to determine its vocabulary
            (this can take a **really long time**).
        filter_namespaces : tuple of str
            Namespaces to consider.
        tokenizer_func : function, optional
            Function that will be used for tokenization. By default, use :func:`~gensim.corpora.wikicorpus.tokenize`.
            Must support the interface:
            tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str.
        article_min_tokens : int, optional
            Minimum number of tokens in an article; articles with fewer tokens are ignored.
        token_min_len : int, optional
            Minimal token length.
        token_max_len : int, optional
            Maximal token length.
        lower : bool, optional
            If True, convert all text to lowercase.

        """
        self.fname = fname
        self.filter_namespaces = filter_namespaces
        self.metadata = False
        if processes is None:
            processes = max(1, multiprocessing.cpu_count() - 1)
        self.processes = processes
        self.lemmatize = lemmatize
        self.tokenizer_func = tokenizer_func
        self.article_min_tokens = article_min_tokens
        self.token_min_len = token_min_len
        self.token_max_len = token_max_len
        self.lower = lower
        #print "setting the dictionary to {}"
        self.dictionary = {}
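Because this variant leaves `self.dictionary` as an empty dict, the vocabulary has to be built in a separate step; a hedged sketch, where `MyWikiCorpus` stands in for whatever the class above is called and the dump path is hypothetical:

from gensim.corpora import Dictionary

corpus = MyWikiCorpus('enwiki-latest-pages-articles.xml.bz2')  # hypothetical class name and path
# Build the vocabulary explicitly, then prune it before any bag-of-words conversion.
corpus.dictionary = Dictionary(corpus.get_texts())
corpus.dictionary.filter_extremes(no_below=20, no_above=0.1, keep_n=100000)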
Example #5
    def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
                 filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
                 token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None):
        """Initialize the corpus.
        Unless a dictionary is provided, this scans the corpus once,
        to determine its vocabulary.
        Parameters
        ----------
        fname : str
            Path to the Wikipedia dump file.
        processes : int, optional
            Number of processes to run, defaults to `max(1, number of cpu - 1)`.
        lemmatize : bool
            Use lemmatization instead of simple regexp tokenization.
            Defaults to `True` if you have the `pattern <https://github.com/clips/pattern>`_ package installed.
        dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Dictionary. If not provided, the corpus is scanned once to determine its vocabulary
            **IMPORTANT: this can take a really long time**.
        filter_namespaces : tuple of str, optional
            Namespaces to consider.
        tokenizer_func : function, optional
            Function that will be used for tokenization. By default, use :func:`~gensim.corpora.wikicorpus.tokenize`.
            If you inject your own tokenizer, it must conform to this interface:
            `tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str`
        article_min_tokens : int, optional
            Minimum number of tokens in an article; articles with fewer tokens are ignored.
        token_min_len : int, optional
            Minimal token length.
        token_max_len : int, optional
            Maximal token length.
        lower : bool, optional
            If True, convert all text to lowercase.
        filter_articles : callable or None, optional
            If set, each XML article element is passed to this callable before being processed. Only articles
            for which the callable returns an XML element are processed; returning None filters out
            articles based on customized rules.

        Warnings
        --------
        Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary.
        """
        self.fname = fname
        self.filter_namespaces = filter_namespaces
        self.filter_articles = filter_articles
        self.metadata = False
        if processes is None:
            processes = max(1, multiprocessing.cpu_count() - 1)
        self.processes = processes
        self.lemmatize = lemmatize
        self.tokenizer_func = tokenizer_func
        self.article_min_tokens = article_min_tokens
        self.token_min_len = token_min_len
        self.token_max_len = token_max_len
        self.lower = lower

        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
        else:
            self.dictionary = dictionary
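A sketch of a `filter_articles` callable matching the contract documented above: it receives the article's XML element and returns the element to keep the article or None to drop it (the disambiguation heuristic and the dump path are illustrative):

def skip_disambiguation(elem, *args, **kwargs):
    # Keep the article unless its wikitext flags it as a disambiguation page.
    wikitext = ''.join(elem.itertext()).lower()
    return None if '{{disambiguation}}' in wikitext else elem

corpus = WikiCorpus('enwiki-latest-pages-articles.xml.bz2',
                    filter_articles=skip_disambiguation)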
    def __init__(self, pages_gen, processes=None, lemmatize=utils.has_pattern(), dictionary=None):
        self.pages_gen = pages_gen
        self.metadata = False
        if processes is None:
            processes = max(1, multiprocessing.cpu_count() - 1)
        self.processes = processes
        self.lemmatize = lemmatize
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
        else:
            self.dictionary = dictionary
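The pages_gen variant above accepts an arbitrary iterable of pages; since its get_texts is not shown, the item format here is an assumption. A purely hypothetical generator that streams raw page text, one page per file:

import os

def pages_from_dir(dirname):
    # Hypothetical pages_gen: yields raw page text; adapt to whatever get_texts() expects.
    for name in sorted(os.listdir(dirname)):
        with open(os.path.join(dirname, name), encoding='utf-8') as fh:
            yield fh.read()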
Example #7
    def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
                 filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
                 token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
        """

        Parameters
        ----------
        fname : str
            Path to the Wikipedia dump file.
        processes : int, optional
            Number of processes to run, defaults to `max(1, number of cpu - 1)`.
        lemmatize : bool
            Use lemmatization instead of simple regexp tokenization.
            Defaults to `True` if you have the `pattern <https://github.com/clips/pattern>`_ package installed.
        dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Dictionary. If not provided, the corpus is scanned once to determine its vocabulary
            **IMPORTANT: this can take a really long time**.
        filter_namespaces : tuple of str, optional
            Namespaces to consider.
        tokenizer_func : function, optional
            Function that will be used for tokenization. By default, use :func:`~gensim.corpora.wikicorpus.tokenize`.
            If you inject your own tokenizer, it must conform to this interface:
            `tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str`
        article_min_tokens : int, optional
            Minimum number of tokens in an article; articles with fewer tokens are ignored.
        token_min_len : int, optional
            Minimal token length.
        token_max_len : int, optional
            Maximal token length.
        lower : bool, optional
            If True, convert all text to lowercase.

        Warnings
        --------
        Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary.

        """
        self.fname = fname
        self.filter_namespaces = filter_namespaces
        self.metadata = False
        if processes is None:
            processes = max(1, multiprocessing.cpu_count() - 1)
        self.processes = processes
        self.lemmatize = lemmatize
        self.tokenizer_func = tokenizer_func
        self.article_min_tokens = article_min_tokens
        self.token_min_len = token_min_len
        self.token_max_len = token_max_len
        self.lower = lower

        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
        else:
            self.dictionary = dictionary
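The tokenizer_func interface documented above, (text, token_min_len, token_max_len, lower) -> list of str, can be satisfied by a small custom tokenizer; a sketch (the regexp and the digit filter are arbitrary choices, and the dump path in the commented line is hypothetical):

import re

def simple_tokenizer(text, token_min_len, token_max_len, lower):
    # Conforms to the documented interface: length-filtered word tokens, optionally lowercased.
    if lower:
        text = text.lower()
    return [token for token in re.findall(r'\w+', text, re.UNICODE)
            if token_min_len <= len(token) <= token_max_len and not token.isdigit()]

# corpus = WikiCorpus('enwiki-latest-pages-articles.xml.bz2', tokenizer_func=simple_tokenizer)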
    def __init__(self,
                 fname,
                 processes=None,
                 lemmatize=utils.has_pattern(),
                 dictionary=None,
                 filter_namespaces=('0', ),
                 tokenizer_func=tokenize,
                 article_min_tokens=ARTICLE_MIN_WORDS,
                 token_min_len=TOKEN_MIN_LEN,
                 token_max_len=TOKEN_MAX_LEN,
                 lower=True):
        super().__init__(fname, processes, lemmatize, dictionary,
                         filter_namespaces, tokenizer_func, article_min_tokens,
                         token_min_len, token_max_len, lower)
def iterate_wiki(input_path):
    lemmatize = utils.has_pattern()
    filter_namespaces = ('0',)
    texts = ((text, lemmatize, title, pageid) for title, text, pageid in
             extract_pages(bz2.BZ2File(input_path), filter_namespaces))
    for article in texts:
        text, lemmatize, title, pageid = article
        text = utils.to_unicode(text, 'utf8', errors='ignore')
        text = utils.decode_htmlentities(text)  # '&amp;nbsp;' --> '\xa0'
        text = remove_markup(text)
        tokens = get_all_words(text)
        if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
            continue
        yield title, tokens
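A usage sketch for iterate_wiki above; the dump path is hypothetical, and the helpers it calls (extract_pages, remove_markup, get_all_words, ...) must be importable in the same module:

# Stream (title, tokens) pairs and report article lengths.
for title, tokens in iterate_wiki('enwiki-latest-pages-articles.xml.bz2'):
    print(title, len(tokens))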
Example #10
    def __init__(self,
                 fname,
                 processes=None,
                 lemmatize=utils.has_pattern(),
                 dictionary=None,
                 filter_namespaces=('0', ),
                 tokenizer_func=tokenize,
                 article_min_tokens=ARTICLE_MIN_WORDS,
                 token_min_len=TOKEN_MIN_LEN,
                 token_max_len=TOKEN_MAX_LEN,
                 lower=True):
        """
        Initialize the corpus. Unless a dictionary is provided, this scans the
        corpus once, to determine its vocabulary.

        If `pattern` package is installed, use fancier shallow parsing to get
        token lemmas. Otherwise, use simple regexp tokenization. You can override
        this automatic logic by forcing the `lemmatize` parameter explicitly.

        If `self.metadata` is set to True, `serialize` will also write out article titles to a pickle file.

        Set `article_min_tokens` as a min threshold for article token count (defaults to 50). Any article below this is
        ignored.

        Set `tokenizer_func` (defaults to `tokenize`) to a custom function reference to control tokenization; otherwise
        the default regexp tokenization is used. Set this parameter for languages like Japanese or Thai to get better
        tokenization. The `tokenizer_func` must take 4 parameters: (text, token_min_len, token_max_len, lower). The
        parameter values default to those configured on the class instance.

        Set `lower` to control if everything should be converted to lowercase or not (default True).

        Set `token_min_len`, `token_max_len` as thresholds for token lengths that are returned (default to 2 and 15).

        """
        self.fname = fname
        self.filter_namespaces = filter_namespaces
        self.metadata = False
        if processes is None:
            processes = max(1, multiprocessing.cpu_count() - 1)
        self.processes = processes
        self.lemmatize = lemmatize
        self.tokenizer_func = tokenizer_func
        self.article_min_tokens = article_min_tokens
        self.token_min_len = token_min_len
        self.token_max_len = token_max_len
        self.lower = lower

        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
        else:
            self.dictionary = dictionary
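The docstring above mentions `self.metadata`; assuming the class behaves like gensim's WikiCorpus, this is the usual pattern for keeping (pageid, title) metadata next to the serialized bag-of-words corpus (`corpus` is an instance of the class above, paths are illustrative):

from gensim.corpora import MmCorpus

corpus.metadata = True  # get_texts() now also yields (pageid, title) per article
MmCorpus.serialize('wiki_bow.mm', corpus, metadata=True)  # also writes a .metadata.cpickle file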
Example #11
    def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
                 filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
                 token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
        """Initialize the corpus.

        Unless a dictionary is provided, this scans the corpus once,
        to determine its vocabulary.

        Parameters
        ----------
        fname : str
            Path to the Wikipedia dump file.
        processes : int, optional
            Number of processes to run, defaults to **number of CPUs - 1**.
        lemmatize : bool
            Whether to use lemmatization instead of simple regexp tokenization.
            Defaults to `True` if the *pattern* package is installed.
        dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
            Dictionary. If not provided, the corpus is scanned once to determine its vocabulary
            (this can take a **really long time**).
        filter_namespaces : tuple of str
            Namespaces to consider.
        tokenizer_func : function, optional
            Function that will be used for tokenization. By default, use :func:`~gensim.corpora.wikicorpus.tokenize`.
            Must support the interface:
            tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str.
        article_min_tokens : int, optional
            Minimum number of tokens in an article; articles with fewer tokens are ignored.
        token_min_len : int, optional
            Minimal token length.
        token_max_len : int, optional
            Maximal token length.
        lower : bool, optional
            If True, convert all text to lowercase.

        """
        self.fname = fname
        self.filter_namespaces = filter_namespaces
        self.metadata = False
        if processes is None:
            processes = max(1, multiprocessing.cpu_count() - 1)
        self.processes = processes
        self.lemmatize = lemmatize
        self.tokenizer_func = tokenizer_func
        self.article_min_tokens = article_min_tokens
        self.token_min_len = token_min_len
        self.token_max_len = token_max_len
        self.lower = lower
        self.dictionary = dictionary or Dictionary(self.get_texts())
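One caveat about the `dictionary or Dictionary(self.get_texts())` shortcut above: an empty Dictionary is falsy, so deliberately passing an empty one still triggers the expensive vocabulary scan. A quick illustration:

from gensim.corpora import Dictionary

print(bool(Dictionary()))                      # False: no tokens yet, so `dictionary or ...` falls through
print(bool(Dictionary([['some', 'tokens']])))  # True once it contains entries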
Example #12
    def __init__(self,
                 pages_gen,
                 processes=None,
                 lemmatize=utils.has_pattern(),
                 dictionary=None):
        self.pages_gen = pages_gen
        self.metadata = False
        if processes is None:
            processes = max(1, multiprocessing.cpu_count() - 1)
        self.processes = processes
        self.lemmatize = lemmatize
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
        else:
            self.dictionary = dictionary
Example #13
    def __init__(self,
                 fname,
                 dictionary,
                 article_count,
                 set_citation,
                 quote_identifiers,
                 processes=None,
                 lemmatize=utils.has_pattern(),
                 filter_namespaces=('0', )):
        WikiCorpus.__init__(self, fname, processes, False, dictionary,
                            filter_namespaces)

        self.set_citation = set_citation
        self.articlecount = article_count
        self.quote_identifiers = quote_identifiers

        self.base_url = self._get_base_wikipedia_url(bz2.BZ2File(self.fname),
                                                     filter_namespaces)
Example #14
    def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
                 filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
                 token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
        """
        Initialize the corpus. Unless a dictionary is provided, this scans the
        corpus once, to determine its vocabulary.

        If `pattern` package is installed, use fancier shallow parsing to get
        token lemmas. Otherwise, use simple regexp tokenization. You can override
        this automatic logic by forcing the `lemmatize` parameter explicitly.

        If `self.metadata` is set to True, `serialize` will also write out article titles to a pickle file.

        Set `article_min_tokens` as a min threshold for article token count (defaults to 50). Any article below this is
        ignored.

        Set `tokenizer_func` (defaults to `tokenize`) to a custom function reference to control tokenization; otherwise
        the default regexp tokenization is used. Set this parameter for languages like Japanese or Thai to get better
        tokenization. The `tokenizer_func` must take 4 parameters: (text, token_min_len, token_max_len, lower). The
        parameter values default to those configured on the class instance.

        Set `lower` to control if everything should be converted to lowercase or not (default True).

        Set `token_min_len`, `token_max_len` as thresholds for token lengths that are returned (default to 2 and 15).

        """
        self.fname = fname
        self.filter_namespaces = filter_namespaces
        self.metadata = False
        if processes is None:
            processes = max(1, multiprocessing.cpu_count() - 1)
        self.processes = processes
        self.lemmatize = lemmatize
        self.tokenizer_func = tokenizer_func
        self.article_min_tokens = article_min_tokens
        self.token_min_len = token_min_len
        self.token_max_len = token_max_len
        self.lower = lower

        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
        else:
            self.dictionary = dictionary
Example #15
    def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None, filter_namespaces=('0',)):
        """
        Initialize the corpus. Unless a dictionary is provided, this scans the
        corpus once, to determine its vocabulary.

        If `pattern` package is installed, use fancier shallow parsing to get
        token lemmas. Otherwise, use simple regexp tokenization. You can override
        this automatic logic by forcing the `lemmatize` parameter explicitly.

        """
        self.fname = fname
        self.filter_namespaces = filter_namespaces
        self.metadata = False
        if processes is None:
            processes = max(1, multiprocessing.cpu_count() - 1)
        self.processes = processes
        self.lemmatize = lemmatize
        if dictionary is None:
            self.dictionary = Dictionary(self.get_texts())
        else:
            self.dictionary = dictionary
Example #16
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""This module contains functions and processors used for processing text,
extracting sentences from text, working with acronyms and abbreviations.

"""

# Standard-library imports; argparse, bz2, codecs, os and sys are needed by the
# extraction script under __main__ below.
import argparse
import bz2
import codecs
import logging
import os
import re
import sys

from gensim import utils  # used in __main__ for has_pattern, to_unicode, decode_htmlentities
from gensim.summarization.syntactic_unit import SyntacticUnit
from gensim.parsing.preprocessing import preprocess_documents
from gensim.utils import tokenize, has_pattern

logger = logging.getLogger(__name__)

HAS_PATTERN = has_pattern()
if HAS_PATTERN:
    from pattern.en import tag

# Special separator used in abbreviations.
SEPARATOR = r'@'

# Pattern to split text to sentences.
RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)',
                         re.UNICODE)

# Pattern for detecting abbreviations (example: Sgt. Pepper).
AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE)

# Pattern for detecting acronyms.
AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)\s(\w)', re.UNICODE)
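A quick sketch exercising RE_SENTENCE defined above; without first protecting abbreviations (which is what AB_SENIOR, AB_ACRONYM and SEPARATOR are for), 'Sgt.' gets split off as its own sentence:

text = "Sgt. Pepper taught the band to play. It took twenty years!"
print([match.group() for match in RE_SENTENCE.finditer(text)])
# -> ['Sgt.', 'Pepper taught the band to play.', 'It took twenty years!']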

def config_argparser():
    argparser = argparse.ArgumentParser(description='Wikipedia Dump Extractor')
    argparser.add_argument('-input_path', type=str, required=True, help='Path to the raw Wikipedia dump')
    argparser.add_argument('-output_path', type=str, required=True, help='Write path for extracted text content')
    return argparser.parse_args()


if __name__ == '__main__':
    arguments = config_argparser()
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    lemmatize = utils.has_pattern()
    filter_namespaces = ('0',)
    texts = ((text, lemmatize, title, pageid) for title, text, pageid in
             extract_pages(bz2.BZ2File(arguments.input_path), filter_namespaces))
    parsed_article_counter = 0
    space = u' '
    output = codecs.open(arguments.output_path, 'w', 'utf-8')
    for article in texts:
        text, lemmatize, title, pageid = article
        text = utils.to_unicode(text, 'utf8', errors='ignore')
        text = utils.decode_htmlentities(text)  # '&amp;nbsp;' --> '\xa0'
        text = remove_markup(text)
        tokens = get_all_words(text)
        if len(tokens) < ARTICLE_MIN_WORDS or any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
            continue
        output.write("{}\n".format(space.join(tokens)))
        parsed_article_counter += 1
    output.close()
    logger.info("Finished: extracted %d articles", parsed_article_counter)