Example #1
    def __init__(self, filepath):
        self.filepath = filepath
        with codecs.open(self.filepath, 'r', 'utf-8') as f:
            corpus = f.read()
        # Treat line breaks as sentence boundaries before running Punkt
        corpus = re.sub(r'\n', '. ', corpus)
        corpus_sent = LazyLoader('tokenizers/punkt/spanish.pickle').tokenize(corpus)
        # Strip HTML numeric character references such as &#8217;
        self.corpus_clean = [re.sub(r'&#\d+;', '', sent) for sent in corpus_sent]
Example #2
    def __init__(self,
                 word_tokenizer=TreebankWordTokenizer(),
                 sent_tokenizer=LazyLoader('tokenizers/punkt/english.pickle'),
                 **kwargs):
        self._seq = MongoDBLazySequence(**kwargs)
        self._word_tokenize = word_tokenizer.tokenize
        self._sent_tokenize = sent_tokenizer.tokenize
Example #3
    def __init__(self, path='/home/jmutal/dataMining/lavoztextodump.txt',
                 word_tokenizer=regex,
                 sent_tokenizer=LazyLoader('tokenizers/punkt/spanish.pickle')):
        self.path = path
        self.__word_tokenizer = word_tokenizer
        self.__sent_tokenizer = sent_tokenizer

        # TODO: optimize for big files (currently reads the whole corpus into memory)
        with codecs.open(self.path, 'r', 'utf-8') as f:
            corpus = f.read()

        corpus_sent = self.__sent_tokenizer.tokenize(corpus)
        # Strip HTML numeric character references such as &#8217;
        self.clean_corpus = [re.sub(r'&#\d+;', '', sent) for sent in corpus_sent]
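
Both of the Spanish-corpus readers above lean on nltk.data.LazyLoader, which defers unpickling the Punkt model until the first call on the object. A minimal standalone sketch of that behaviour, assuming the NLTK punkt data has already been downloaded (nltk.download('punkt')):

from nltk.data import LazyLoader

# Nothing is read from disk here; the pickle is resolved lazily on first use
sent_tokenizer = LazyLoader('tokenizers/punkt/spanish.pickle')

texto = u"Hola. ¿Cómo estás? Estas son tres oraciones."
# The first call triggers the actual load, then splits the text into sentences
print(sent_tokenizer.tokenize(texto))
# Expected (roughly): ['Hola.', '¿Cómo estás?', 'Estas son tres oraciones.']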
Example #4
Created on 28/09/11
by flavio
"""
__author__ = 'flavio'
__docformat__ = "restructuredtext en"

import zmq
from base import PushPullWorker
from nltk import pos_tag, word_tokenize
from nltk.data import LazyLoader
from nltk.tag import tuple2str

context = zmq.Context()

eng_sent_tokenizer = LazyLoader('tokenizers/punkt/english.pickle')
port_sent_tokenizer = LazyLoader('tokenizers/punkt/portuguese.pickle')
# TODO: allow the use of PaLavras when tagging Portuguese texts


class POSTaggerWorker(PushPullWorker):
    """
    Worker to tag words in texts according to their morphological type
    Expects to receive a JSON message with the following structure
    {"text":"...","lang":"<language iso code>"} where text is a raw text string.
    To be used together with the MongoUpdateSink class.
    """
    def process(self, msg):
        """
        Does the POS tagging
        """
Example #5
    def __init__(self, collection):
        word_tokenizer = TreebankWordTokenizer()
        sent_tokenizer = LazyLoader('tokenizers/punkt/PY3/english.pickle')
        self._seq = MongoDBLazySequence('localhost', 27017, 'test', collection, 'text')
        self._word_tokenize = word_tokenizer.tokenize
        self._sent_tokenize = sent_tokenizer.tokenize
Example #6
from model.grammar import Grammar, Production
from model.symbol import Symbol
import pickle

if __name__ == '__main__':
    opts = docopt(__doc__)
    print("TRAIN GRAMMAR: ")
    # Get the corpus
    corpus = opts['-c']
    location = opts['-d']

    print("getting corpus from: " + corpus)
    model = PlaintextCorpusReader(
        corpus,
        r'.*\.txt',
        sent_tokenizer=LazyLoader('tokenizers/punkt/spanish.pickle'),
        encoding="utf8")

    # Create grammar
    terminals = set()
    epsilon = Symbol("ε", True)
    terminals.add(epsilon)  # the epsilon (empty-string) terminal
    non_terminals = set()
    s = Symbol("S", False)  # Starting non terminal
    non_terminals.add(s)
    grammar = Grammar(non_terminals, terminals, s)
    # Progress tracking: report how far through the corpus files we are
    count = 0.0
    len_fileids = len(model.fileids())

    # Get the tokenized corpus
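
Example #6 is truncated right after this comment. A sketch of how the loop could continue, under the assumption that it simply walks the corpus file by file and reports progress with count and len_fileids; the grammar-construction step itself is project-specific and left as a placeholder:

    for fileid in model.fileids():
        count += 1
        print("processing {} ({:.0%} done)".format(fileid, count / len_fileids))
        for sent in model.sents(fileid):
            # Project-specific step (not shown in the listing): build Symbol and
            # Production objects from the tokenized sentence and add them to
            # `grammar`, extending `terminals` / `non_terminals` as needed.
            pass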
Example #7
            print(sent)
            linkages = p.parse_sent(sent)
            for linkage in linkages[0:1]:
                print(linkage.num_of_links, linkage.constituent_phrases_nested)
                pass
        pass

    if False:
        from floraparser.fltoken import FlTokenizer
        from nltk.stem import WordNetLemmatizer
        from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters, PunktLanguageVars

        fltk = FlTokenizer()
        wnl = WordNetLemmatizer()
        NUMBERS = re.compile(r'^[0-9–—.·()-]+$')
        sent_tokenizer = LazyLoader(r'..\resources\FloraPunkt.pickle')
        PunktLanguageVars.sent_end_chars = ('.',)
        wordset = set()
        with open('../resources/AllTaxa.txt') as at:
            for desc in at:
                sents = sent_tokenizer.tokenize(desc)
                for sent in sents:
                    tl = fltk.tokenize(sent)
                    if tl[-1].endswith('.'):
                        tl[-1] = tl[-1][:-1]

                    wl = [wnl.lemmatize(word.lower()) for word in tl
                          if not NUMBERS.match(word) and '.' not in word]
                    wordset.update(wl)
        with open('../resources/AllTaxa.words', 'w', encoding='utf-8') as wf:
            for w in sorted(wordset):
                print(w, file=wf)
Example #8
  train_model.py -h | --help

Options:
  -c <file>     Corpus location
  -d <file>     Directory for trained data
  -h --help     Show this screen.
"""
from docopt import docopt
from nltk.corpus import PlaintextCorpusReader
from nltk.data import LazyLoader
import pickle

from model.ngram import NGram, InterpolatedNGram, AddOneNGram

if __name__ == '__main__':
    opts = docopt(__doc__)
    corpus_directory = opts['-c']
    location = opts['-d']

    corpus = PlaintextCorpusReader(
        corpus_directory,
        r'.*\.txt',
        sent_tokenizer=LazyLoader('tokenizers/punkt/spanish.pickle'),
        encoding="utf8")
    sents = corpus.sents()

    model = AddOneNGram(3, sents)

    # Save the trained model
    filename = location + "model"
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
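
To reuse the trained model later it has to be unpickled with the model package importable, otherwise pickle cannot resolve the AddOneNGram class. A minimal sketch; the path is hypothetical and stands in for the location + "model" file saved above:

import pickle

# The `model` package must be on the path so pickle can resolve AddOneNGram
with open('path/to/model', 'rb') as f:  # hypothetical path to the saved file
    ngram_model = pickle.load(f)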