Example #1
    def __init__(self, filepath):
        self.filepath = filepath
        with codecs.open(self.filepath, 'r', 'utf-8') as f:
            corpus = f.read()
        # Treat line breaks as sentence boundaries before running Punkt
        corpus = re.sub(r'\n', '. ', corpus)
        corpus_sent = LazyLoader('tokenizers/punkt/spanish.pickle').tokenize(corpus)
        # Strip HTML numeric character references such as &#8217;
        self.corpus_clean = [re.sub(r'&#\d+;', '', sent) for sent in corpus_sent]
Example #2
    def __init__(self,
                 word_tokenizer=TreebankWordTokenizer(),
                 sent_tokenizer=LazyLoader('tokenizers/punkt/english.pickle'),
                 **kwargs):
        self._seq = MongoDBLazySequence(**kwargs)
        self._word_tokenize = word_tokenizer.tokenize
        self._sent_tokenize = sent_tokenizer.tokenize
Example #3
    def __init__(self, path='/home/jmutal/dataMining/lavoztextodump.txt',
                 word_tokenizer=regex,
                 sent_tokenizer=LazyLoader('tokenizers/punkt/spanish.pickle')):
        self.path = path
        self.__word_tokenizer = word_tokenizer
        self.__sent_tokenizer = sent_tokenizer

        # TODO: optimize for big files (currently reads the whole corpus into memory)
        with codecs.open(self.path, 'r', 'utf-8') as f:
            corpus = f.read()

        corpus_sent = self.__sent_tokenizer.tokenize(corpus)
        # Strip HTML numeric character references such as &#8217;
        self.clean_corpus = [re.sub(r'&#\d+;', '', sent) for sent in corpus_sent]
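
Both of the Spanish-corpus readers above lean on nltk.data.LazyLoader, which defers unpickling the Punkt model until the first call on the object. A minimal standalone sketch of that behaviour, assuming the NLTK punkt data has already been downloaded (nltk.download('punkt')):

from nltk.data import LazyLoader

# Nothing is read from disk here; the pickle is resolved lazily on first use
sent_tokenizer = LazyLoader('tokenizers/punkt/spanish.pickle')

texto = u"Hola. ¿Cómo estás? Estas son tres oraciones."
# The first call triggers the actual load, then splits the text into sentences
print(sent_tokenizer.tokenize(texto))
# Expected (roughly): ['Hola.', '¿Cómo estás?', 'Estas son tres oraciones.']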
Example #4
Created on 28/09/11
by flavio
"""
__author__ = 'flavio'
__docformat__ = "restructuredtext en"

import zmq
from base import PushPullWorker
from nltk import pos_tag, word_tokenize
from nltk.data import LazyLoader
from nltk.tag import tuple2str

context = zmq.Context()

eng_sent_tokenizer = LazyLoader('tokenizers/punkt/english.pickle')
port_sent_tokenizer = LazyLoader('tokenizers/punkt/portuguese.pickle')
# TODO: allow the use of PaLavras when tagging Portuguese texts


class POSTaggerWorker(PushPullWorker):
    """
    Worker to tag words in texts according to their morphological type
    Expects to receive a JSON message with the following structure
    {"text":"...","lang":"<language iso code>"} where text is a raw text string.
    To be used together with the MongoUpdateSink class.
    """
    def process(self, msg):
        """
        Does the POS tagging
        """
Example #5
    def __init__(self, collection):
        word_tokenizer = TreebankWordTokenizer()
        sent_tokenizer = LazyLoader('tokenizers/punkt/PY3/english.pickle')
        self._seq = MongoDBLazySequence('localhost', 27017, 'test', collection, 'text')
        self._word_tokenize = word_tokenizer.tokenize
        self._sent_tokenize = sent_tokenizer.tokenize
Example #6
from model.grammar import Grammar, Production
from model.symbol import Symbol
import pickle

if __name__ == '__main__':
    opts = docopt(__doc__)
    print("TRAIN GRAMMAR: ")
    # Get the corpus
    corpus = opts['-c']
    location = opts['-d']

    print("getting corpus from: " + corpus)
    model = PlaintextCorpusReader(
        corpus,
        r'.*\.txt',
        sent_tokenizer=LazyLoader('tokenizers/punkt/spanish.pickle'),
        encoding="utf8")

    # Create grammar
    terminals = set()
    epsilon = Symbol("ε", True)
    terminals.add(epsilon)  # the epsilon (empty-string) terminal
    non_terminals = set()
    s = Symbol("S", False)  # Starting non terminal
    non_terminals.add(s)
    grammar = Grammar(non_terminals, terminals, s)
    # Progress tracking: report how far through the corpus files we are
    count = 0.0
    len_fileids = len(model.fileids())

    # Get the tokenized corpus
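
Example #6 is truncated right after this comment. A sketch of how the loop could continue, under the assumption that it simply walks the corpus file by file and reports progress with count and len_fileids; the grammar-construction step itself is project-specific and left as a placeholder:

    for fileid in model.fileids():
        count += 1
        print("processing {} ({:.0%} done)".format(fileid, count / len_fileids))
        for sent in model.sents(fileid):
            # Project-specific step (not shown in the listing): build Symbol and
            # Production objects from the tokenized sentence and add them to
            # `grammar`, extending `terminals` / `non_terminals` as needed.
            pass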
Example #7
            print(sent)
            linkages = p.parse_sent(sent)
            for linkage in linkages[0:1]:
                print(linkage.num_of_links, linkage.constituent_phrases_nested)
                pass
        pass

    if False:
        from floraparser.fltoken import FlTokenizer
        from nltk.stem import WordNetLemmatizer
        from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters, PunktLanguageVars

        fltk = FlTokenizer()
        wnl = WordNetLemmatizer()
        NUMBERS = re.compile(r'^[0-9–—.·()-]+$')
        sent_tokenizer = LazyLoader(r'..\resources\FloraPunkt.pickle')
        PunktLanguageVars.sent_end_chars = ('.',)
        wordset = set()
        with open('../resources/AllTaxa.txt') as at:
            for desc in at:
                sents = sent_tokenizer.tokenize(desc)
                for sent in sents:
                    tl = fltk.tokenize(sent)
                    if tl[-1].endswith('.'):
                        tl[-1] = tl[-1][:-1]

                    wl = [wnl.lemmatize(word.lower()) for word in tl
                          if not NUMBERS.match(word) and '.' not in word]
                    wordset.update(wl)
        with open('../resources/AllTaxa.words', 'w', encoding='utf-8') as wf:
            for w in sorted(wordset):
                print(w, file=wf)
Example #8
  train_model.py -h | --help

Options:
  -c <file>     Corpus location
  -d <file>     Directory for trained data
  -h --help     Show this screen.
"""
from docopt import docopt
from nltk.corpus import PlaintextCorpusReader
from nltk.data import LazyLoader
import pickle

from model.ngram import NGram, InterpolatedNGram, AddOneNGram

if __name__ == '__main__':
    opts = docopt(__doc__)
    corpus_directory = opts['-c']
    location = opts['-d']

    corpus = PlaintextCorpusReader(
        corpus_directory,
        r'.*\.txt',
        sent_tokenizer=LazyLoader('tokenizers/punkt/spanish.pickle'),
        encoding="utf8")
    sents = corpus.sents()

    model = AddOneNGram(3, sents)

    # Save the trained model
    filename = location + "model"
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
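
To reuse the trained model later it has to be unpickled with the model package importable, otherwise pickle cannot resolve the AddOneNGram class. A minimal sketch; the path is hypothetical and stands in for the location + "model" file saved above:

import pickle

# The `model` package must be on the path so pickle can resolve AddOneNGram
with open('path/to/model', 'rb') as f:  # hypothetical path to the saved file
    ngram_model = pickle.load(f)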