def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
             filter_namespaces=('0', )):
    """
    Initialize the corpus. Unless a dictionary is provided, this scans the corpus
    once, to determine its vocabulary.

    If `pattern` package is installed, use fancier shallow parsing to get token
    lemmas. Otherwise, use simple regexp tokenization. You can override this
    automatic logic by forcing the `lemmatize` parameter explicitly.
    """
    self.fname = fname
    self.filter_namespaces = filter_namespaces
    self.metadata = False
    if processes is None:
        processes = max(1, multiprocessing.cpu_count() - 1)
    self.processes = processes
    self.lemmatize = lemmatize
    if dictionary is None:
        self.dictionary = Dictionary(self.get_texts())
    else:
        self.dictionary = dictionary
def lemmatize2(content, allowed_tags=re.compile(r'(NN|VB|JJ|RB)'), light=False,
               stopwords=frozenset(), min_length=2, max_length=15):
    """Use the English lemmatizer from `pattern <https://github.com/clips/pattern>`_
    to extract UTF8-encoded tokens in their base form aka lemma, e.g. "are, is, being"
    becomes "be" etc.

    This is a smarter version of stemming, taking word context into account.

    MODIFIED to return only words, and not concatenated tags!
    """
    if not has_pattern():
        raise ImportError(
            "Pattern library is not installed. Pattern library is needed in order to use lemmatize function"
        )
    from pattern.en import parse

    if light:
        import warnings
        warnings.warn("The light flag is no longer supported by pattern.")

    # tokenization in `pattern` is weird; it gets thrown off by non-letters,
    # producing '==relate/VBN' or '**/NN'... try to preprocess the text a little
    # FIXME this throws away all fancy parsing cues, including sentence structure,
    # abbreviations etc.
    content = u' '.join(tokenize(content, lower=True, errors='ignore'))
    parsed = parse(content, lemmata=True, collapse=False)
    result = []
    for sentence in parsed:
        for token, tag, _, _, lemma in sentence:
            if min_length <= len(lemma) <= max_length and not lemma.startswith('_') and lemma not in stopwords:
                if allowed_tags.match(tag):
                    # lemma += "/" + tag[:2]
                    result.append(lemma.encode('utf8'))
    return result
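# A hypothetical usage sketch for lemmatize2 above (requires the optional `pattern`
# package; the example sentence and the exact lemmas returned are assumptions, since
# they depend on pattern's tagger). Unlike gensim's stock utils.lemmatize, no "/POS"
# tag is appended to each returned token.
if has_pattern():
    print(lemmatize2(u"The quick brown foxes are jumping over the lazy dogs"))
    # roughly: [b'quick', b'brown', b'fox', b'be', b'jump', b'lazy', b'dog']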
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
             filter_namespaces=('0', )):
    WikiCorpus.__init__(self, fname, processes, lemmatize, dictionary, filter_namespaces)
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
             filter_namespaces=('0', ), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
             token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
    """Initialize the corpus.

    Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary.

    Parameters
    ----------
    fname : str
        Path to the Wikipedia dump file.
    processes : int, optional
        Number of processes to run, defaults to **number of cpu - 1**.
    lemmatize : bool
        Whether to use lemmatization instead of simple regexp tokenization.
        Defaults to `True` if the *pattern* package is installed.
    dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
        Dictionary, if not provided, this scans the corpus once, to determine its
        vocabulary (this needs a **really long time**).
    filter_namespaces : tuple of str
        Namespaces to consider.
    tokenizer_func : function, optional
        Function that will be used for tokenization. By default, use
        :func:`~gensim.corpora.wikicorpus.tokenize`. Needs to support the interface:
        tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str.
    article_min_tokens : int, optional
        Minimum tokens in article. Article will be ignored if number of tokens is less.
    token_min_len : int, optional
        Minimal token length.
    token_max_len : int, optional
        Maximal token length.
    lower : bool, optional
        If True - convert all text to lower case.

    """
    self.fname = fname
    self.filter_namespaces = filter_namespaces
    self.metadata = False
    if processes is None:
        processes = max(1, multiprocessing.cpu_count() - 1)
    self.processes = processes
    self.lemmatize = lemmatize
    self.tokenizer_func = tokenizer_func
    self.article_min_tokens = article_min_tokens
    self.token_min_len = token_min_len
    self.token_max_len = token_max_len
    self.lower = lower
    # Unlike the other variants, this one does not build a Dictionary here;
    # it just sets an empty placeholder.
    self.dictionary = {}
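# A minimal sketch of a custom `tokenizer_func`, following only the interface documented
# above: tokenizer_func(text, token_min_len, token_max_len, lower) -> list of str.
# The whitespace split is an illustrative assumption; for languages such as Japanese or
# Thai you would plug in a language-specific segmenter instead.
def whitespace_tokenizer(text, token_min_len, token_max_len, lower):
    if lower:
        text = text.lower()
    return [token for token in text.split() if token_min_len <= len(token) <= token_max_len]

# Hypothetical usage (the dump path is a placeholder):
# corpus = WikiCorpus('enwiki-latest-pages-articles.xml.bz2', tokenizer_func=whitespace_tokenizer)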
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
             filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
             token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True, filter_articles=None):
    """Initialize the corpus.

    Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary.

    Parameters
    ----------
    fname : str
        Path to the Wikipedia dump file.
    processes : int, optional
        Number of processes to run, defaults to `max(1, number of cpu - 1)`.
    lemmatize : bool
        Use lemmatization instead of simple regexp tokenization.
        Defaults to `True` if you have the `pattern <https://github.com/clips/pattern>`_ package installed.
    dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
        Dictionary, if not provided, this scans the corpus once, to determine its vocabulary
        **IMPORTANT: this needs a really long time**.
    filter_namespaces : tuple of str, optional
        Namespaces to consider.
    tokenizer_func : function, optional
        Function that will be used for tokenization. By default, use
        :func:`~gensim.corpora.wikicorpus.tokenize`. If you inject your own tokenizer,
        it must conform to this interface:
        `tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str`
    article_min_tokens : int, optional
        Minimum tokens in article. Article will be ignored if number of tokens is less.
    token_min_len : int, optional
        Minimal token length.
    token_max_len : int, optional
        Maximal token length.
    lower : bool, optional
        If True - convert all text to lower case.
    filter_articles : callable or None, optional
        If set, each XML article element will be passed to this callable before being processed.
        Only articles for which the callable returns an XML element are processed; returning None
        filters out the article based on your customised rules.

    Warnings
    --------
    Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary.

    """
    self.fname = fname
    self.filter_namespaces = filter_namespaces
    self.filter_articles = filter_articles
    self.metadata = False
    if processes is None:
        processes = max(1, multiprocessing.cpu_count() - 1)
    self.processes = processes
    self.lemmatize = lemmatize
    self.tokenizer_func = tokenizer_func
    self.article_min_tokens = article_min_tokens
    self.token_min_len = token_min_len
    self.token_max_len = token_max_len
    self.lower = lower
    if dictionary is None:
        self.dictionary = Dictionary(self.get_texts())
    else:
        self.dictionary = dictionary
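# A minimal sketch of a `filter_articles` callable, based only on the contract stated in
# the docstring above: it receives the raw XML article element (any extra arguments the
# parser passes are absorbed by *args/**kwargs) and must return an XML element to keep
# the article, or None to drop it. Reading the title from a `title` keyword argument is
# an assumption made for illustration and may differ between versions.
def drop_list_articles(elem, *args, **kwargs):
    title = kwargs.get('title') or ''
    return None if title.startswith('List of') else elem

# Hypothetical usage (the dump path is a placeholder):
# corpus = WikiCorpus('enwiki-latest-pages-articles.xml.bz2', filter_articles=drop_list_articles)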
def __init__(self, pages_gen, processes=None, lemmatize=utils.has_pattern(), dictionary=None):
    self.pages_gen = pages_gen
    self.metadata = False
    if processes is None:
        processes = max(1, multiprocessing.cpu_count() - 1)
    self.processes = processes
    self.lemmatize = lemmatize
    if dictionary is None:
        self.dictionary = Dictionary(self.get_texts())
    else:
        self.dictionary = dictionary
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
             filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
             token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
    """
    Parameters
    ----------
    fname : str
        Path to the Wikipedia dump file.
    processes : int, optional
        Number of processes to run, defaults to `max(1, number of cpu - 1)`.
    lemmatize : bool
        Use lemmatization instead of simple regexp tokenization.
        Defaults to `True` if you have the `pattern <https://github.com/clips/pattern>`_ package installed.
    dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
        Dictionary, if not provided, this scans the corpus once, to determine its vocabulary
        **IMPORTANT: this needs a really long time**.
    filter_namespaces : tuple of str, optional
        Namespaces to consider.
    tokenizer_func : function, optional
        Function that will be used for tokenization. By default, use
        :func:`~gensim.corpora.wikicorpus.tokenize`. If you inject your own tokenizer,
        it must conform to this interface:
        `tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str`
    article_min_tokens : int, optional
        Minimum tokens in article. Article will be ignored if number of tokens is less.
    token_min_len : int, optional
        Minimal token length.
    token_max_len : int, optional
        Maximal token length.
    lower : bool, optional
        Convert all text to lower case?

    Warnings
    --------
    Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary.

    """
    self.fname = fname
    self.filter_namespaces = filter_namespaces
    self.metadata = False
    if processes is None:
        processes = max(1, multiprocessing.cpu_count() - 1)
    self.processes = processes
    self.lemmatize = lemmatize
    self.tokenizer_func = tokenizer_func
    self.article_min_tokens = article_min_tokens
    self.token_min_len = token_min_len
    self.token_max_len = token_max_len
    self.lower = lower
    if dictionary is None:
        self.dictionary = Dictionary(self.get_texts())
    else:
        self.dictionary = dictionary
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
             filter_namespaces=('0', ), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
             token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
    super().__init__(fname, processes, lemmatize, dictionary, filter_namespaces,
                     tokenizer_func, article_min_tokens, token_min_len, token_max_len, lower)
def iterate_wiki(input_path):
    lemmatize = utils.has_pattern()
    filter_namespaces = ('0',)
    texts = ((text, lemmatize, title, pageid)
             for title, text, pageid
             in extract_pages(bz2.BZ2File(input_path), filter_namespaces))
    for article in texts:
        text, lemmatize, title, pageid = article
        text = utils.to_unicode(text, 'utf8', errors='ignore')
        text = utils.decode_htmlentities(text)  # '&nbsp;' --> '\xa0'
        text = remove_markup(text)
        tokens = get_all_words(text)
        if len(tokens) < ARTICLE_MIN_WORDS or \
                any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
            continue
        yield title, tokens
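# Hypothetical usage of iterate_wiki above (the dump path is a placeholder): the
# generator lazily yields (title, tokens) pairs for articles that pass the
# ARTICLE_MIN_WORDS and namespace checks.
# for title, tokens in iterate_wiki('enwiki-latest-pages-articles.xml.bz2'):
#     print(title, len(tokens))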
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
             filter_namespaces=('0', ), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
             token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
    """
    Initialize the corpus. Unless a dictionary is provided, this scans the corpus
    once, to determine its vocabulary.

    If `pattern` package is installed, use fancier shallow parsing to get token
    lemmas. Otherwise, use simple regexp tokenization. You can override this
    automatic logic by forcing the `lemmatize` parameter explicitly.

    If self.metadata is set to True, serialize will also write out article titles
    to a pickle file.

    Set `article_min_tokens` as a minimum threshold for article token count
    (defaults to 50). Any article below this is ignored.

    Set `tokenizer_func` (defaults to `tokenize`) to a custom function reference
    to control tokenization; otherwise the default regexp tokenization is used.
    Set this parameter for languages like Japanese or Thai to get better
    tokenization. The `tokenizer_func` needs to take 4 parameters:
    (text, token_min_len, token_max_len, lower). The parameter values are as
    configured on the class instance by default.

    Set `lower` to control whether everything should be converted to lowercase
    or not (default True).

    Set `token_min_len`, `token_max_len` as thresholds for token lengths that
    are returned (default to 2 and 15).
    """
    self.fname = fname
    self.filter_namespaces = filter_namespaces
    self.metadata = False
    if processes is None:
        processes = max(1, multiprocessing.cpu_count() - 1)
    self.processes = processes
    self.lemmatize = lemmatize
    self.tokenizer_func = tokenizer_func
    self.article_min_tokens = article_min_tokens
    self.token_min_len = token_min_len
    self.token_max_len = token_max_len
    self.lower = lower
    if dictionary is None:
        self.dictionary = Dictionary(self.get_texts())
    else:
        self.dictionary = dictionary
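# A hedged sketch of the metadata behaviour described in the docstring above (file
# names are placeholders): with corpus.metadata set to True, serializing the corpus
# with metadata=True also writes out per-article (pageid, title) metadata alongside
# the bag-of-words vectors.
# from gensim.corpora import MmCorpus
# wiki = WikiCorpus('enwiki-latest-pages-articles.xml.bz2')
# wiki.metadata = True
# MmCorpus.serialize('wiki_bow.mm', wiki, metadata=True)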
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
             filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
             token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
    """Initialize the corpus.

    Unless a dictionary is provided, this scans the corpus once, to determine its vocabulary.

    Parameters
    ----------
    fname : str
        Path to the Wikipedia dump file.
    processes : int, optional
        Number of processes to run, defaults to **number of cpu - 1**.
    lemmatize : bool
        Whether to use lemmatization instead of simple regexp tokenization.
        Defaults to `True` if the *pattern* package is installed.
    dictionary : :class:`~gensim.corpora.dictionary.Dictionary`, optional
        Dictionary, if not provided, this scans the corpus once, to determine its
        vocabulary (this needs a **really long time**).
    filter_namespaces : tuple of str
        Namespaces to consider.
    tokenizer_func : function, optional
        Function that will be used for tokenization. By default, use
        :func:`~gensim.corpora.wikicorpus.tokenize`. Needs to support the interface:
        tokenizer_func(text: str, token_min_len: int, token_max_len: int, lower: bool) -> list of str.
    article_min_tokens : int, optional
        Minimum tokens in article. Article will be ignored if number of tokens is less.
    token_min_len : int, optional
        Minimal token length.
    token_max_len : int, optional
        Maximal token length.
    lower : bool, optional
        If True - convert all text to lower case.

    """
    self.fname = fname
    self.filter_namespaces = filter_namespaces
    self.metadata = False
    if processes is None:
        processes = max(1, multiprocessing.cpu_count() - 1)
    self.processes = processes
    self.lemmatize = lemmatize
    self.tokenizer_func = tokenizer_func
    self.article_min_tokens = article_min_tokens
    self.token_min_len = token_min_len
    self.token_max_len = token_max_len
    self.lower = lower
    self.dictionary = dictionary or Dictionary(self.get_texts())
def __init__(self, pages_gen, processes=None, lemmatize=utils.has_pattern(), dictionary=None):
    self.pages_gen = pages_gen
    self.metadata = False
    if processes is None:
        processes = max(1, multiprocessing.cpu_count() - 1)
    self.processes = processes
    self.lemmatize = lemmatize
    if dictionary is None:
        self.dictionary = Dictionary(self.get_texts())
    else:
        self.dictionary = dictionary
def __init__(self, fname, dictionary, article_count, set_citation, quote_identifiers,
             processes=None, lemmatize=utils.has_pattern(), filter_namespaces=('0', )):
    # Note: lemmatization is forced off here (False), regardless of the `lemmatize` argument.
    WikiCorpus.__init__(self, fname, processes, False, dictionary, filter_namespaces)
    self.set_citation = set_citation
    self.articlecount = article_count
    self.quote_identifiers = quote_identifiers
    self.base_url = self._get_base_wikipedia_url(bz2.BZ2File(self.fname), filter_namespaces)
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
             filter_namespaces=('0',), tokenizer_func=tokenize, article_min_tokens=ARTICLE_MIN_WORDS,
             token_min_len=TOKEN_MIN_LEN, token_max_len=TOKEN_MAX_LEN, lower=True):
    """
    Initialize the corpus. Unless a dictionary is provided, this scans the corpus
    once, to determine its vocabulary.

    If `pattern` package is installed, use fancier shallow parsing to get token
    lemmas. Otherwise, use simple regexp tokenization. You can override this
    automatic logic by forcing the `lemmatize` parameter explicitly.

    If self.metadata is set to True, serialize will also write out article titles
    to a pickle file.

    Set `article_min_tokens` as a minimum threshold for article token count
    (defaults to 50). Any article below this is ignored.

    Set `tokenizer_func` (defaults to `tokenize`) to a custom function reference
    to control tokenization; otherwise the default regexp tokenization is used.
    Set this parameter for languages like Japanese or Thai to get better
    tokenization. The `tokenizer_func` needs to take 4 parameters:
    (text, token_min_len, token_max_len, lower). The parameter values are as
    configured on the class instance by default.

    Set `lower` to control whether everything should be converted to lowercase
    or not (default True).

    Set `token_min_len`, `token_max_len` as thresholds for token lengths that
    are returned (default to 2 and 15).
    """
    self.fname = fname
    self.filter_namespaces = filter_namespaces
    self.metadata = False
    if processes is None:
        processes = max(1, multiprocessing.cpu_count() - 1)
    self.processes = processes
    self.lemmatize = lemmatize
    self.tokenizer_func = tokenizer_func
    self.article_min_tokens = article_min_tokens
    self.token_min_len = token_min_len
    self.token_max_len = token_max_len
    self.lower = lower
    if dictionary is None:
        self.dictionary = Dictionary(self.get_texts())
    else:
        self.dictionary = dictionary
def __init__(self, fname, processes=None, lemmatize=utils.has_pattern(), dictionary=None,
             filter_namespaces=('0',)):
    """
    Initialize the corpus. Unless a dictionary is provided, this scans the corpus
    once, to determine its vocabulary.

    If `pattern` package is installed, use fancier shallow parsing to get token
    lemmas. Otherwise, use simple regexp tokenization. You can override this
    automatic logic by forcing the `lemmatize` parameter explicitly.
    """
    self.fname = fname
    self.filter_namespaces = filter_namespaces
    self.metadata = False
    if processes is None:
        processes = max(1, multiprocessing.cpu_count() - 1)
    self.processes = processes
    self.lemmatize = lemmatize
    if dictionary is None:
        self.dictionary = Dictionary(self.get_texts())
    else:
        self.dictionary = dictionary
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

"""This module contains functions and processors used for processing text:
extracting sentences, and working with acronyms and abbreviations.
"""

import re
import logging

from gensim.summarization.syntactic_unit import SyntacticUnit
from gensim.parsing.preprocessing import preprocess_documents
from gensim.utils import tokenize, has_pattern

logger = logging.getLogger(__name__)

HAS_PATTERN = has_pattern()
if HAS_PATTERN:
    from pattern.en import tag

# Special separator used in abbreviations.
SEPARATOR = r'@'

# Pattern to split text to sentences.
RE_SENTENCE = re.compile(r'(\S.+?[.!?])(?=\s+|$)|(\S.+?)(?=[\n]|$)', re.UNICODE)

# Pattern for detecting abbreviations (example: Sgt. Pepper).
AB_SENIOR = re.compile(r'([A-Z][a-z]{1,2}\.)\s(\w)', re.UNICODE)

# Pattern for detecting acronyms.
AB_ACRONYM = re.compile(r'(\.[a-zA-Z]\.)\s(\w)', re.UNICODE)
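# A small illustration (not part of the original module) of how these patterns are
# meant to cooperate: splicing SEPARATOR into abbreviations such as "Sgt. Pepper"
# prevents RE_SENTENCE from splitting on the abbreviation's period. In the real
# pipeline the separator would be removed again after splitting.
def _demo_split_sentences(text):
    protected = AB_SENIOR.sub(r'\1' + SEPARATOR + r'\2', text)
    protected = AB_ACRONYM.sub(r'\1' + SEPARATOR + r'\2', protected)
    return [match.group() for match in RE_SENTENCE.finditer(protected)]

# _demo_split_sentences("Sgt. Pepper taught the band to play. It was twenty years ago.")
# -> ['Sgt.@Pepper taught the band to play.', 'It was twenty years ago.']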
def config_argparser():
    argparser = argparse.ArgumentParser(description='Wikipedia Dump Extractor')
    argparser.add_argument('-input_path', type=str, required=True, help='Path to the raw Wikipedia dump')
    argparser.add_argument('-output_path', type=str, required=True, help='Write path for extracted text content')
    return argparser.parse_args()


if __name__ == '__main__':
    arguments = config_argparser()
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')
    logging.root.setLevel(level=logging.INFO)
    lemmatize = utils.has_pattern()
    filter_namespaces = ('0',)
    texts = ((text, lemmatize, title, pageid)
             for title, text, pageid
             in extract_pages(bz2.BZ2File(arguments.input_path), filter_namespaces))
    parsed_article_counter = 0
    space = u' '
    output = codecs.open(arguments.output_path, 'w', 'utf-8')
    for article in texts:
        text, lemmatize, title, pageid = article
        text = utils.to_unicode(text, 'utf8', errors='ignore')
        text = utils.decode_htmlentities(text)  # '&nbsp;' --> '\xa0'
        text = remove_markup(text)
        tokens = get_all_words(text)
        if len(tokens) < ARTICLE_MIN_WORDS or \
                any(title.startswith(ignore + ':') for ignore in IGNORED_NAMESPACES):
            continue
        # write the article's tokens followed by a blank line, as in the original
        output.write("{}\n\n".format(space.join(tokens)))
        parsed_article_counter += 1
    output.close()
    logger.info("Finished: parsed %d articles", parsed_article_counter)
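# Hypothetical invocation matching the argparser above (the script name and both
# paths are placeholders):
#   python wiki_extractor.py -input_path enwiki-latest-pages-articles.xml.bz2 -output_path wiki_tokens.txt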