def __init__(self, filepath):
    self.filepath = filepath
    # Read the whole corpus and treat line breaks as sentence boundaries
    with codecs.open(self.filepath, 'r', 'utf-8') as f:
        corpus = f.read()
    corpus = re.sub('\n', '. ', corpus)
    # Split into sentences with the Spanish Punkt model
    corpus_sent = LazyLoader('tokenizers/punkt/spanish.pickle').tokenize(corpus)
    # Strip HTML numeric character references such as &#8220;
    self.corpus_clean = [re.sub(r'&#\d+', '', sent) for sent in corpus_sent]
def __init__(self, word_tokenizer=TreebankWordTokenizer(),
             sent_tokenizer=LazyLoader('tokenizers/punkt/english.pickle'),
             **kwargs):
    # Lazy sequence over MongoDB documents; connection details are passed through **kwargs
    self._seq = MongoDBLazySequence(**kwargs)
    self._word_tokenize = word_tokenizer.tokenize
    self._sent_tokenize = sent_tokenizer.tokenize
def __init__(self, path='/home/jmutal/dataMining/lavoztextodump.txt',
             word_tokenizer=regex,
             sent_tokenizer=LazyLoader('tokenizers/punkt/spanish.pickle')):
    self.path = path
    self.__word_tokenizer = word_tokenizer
    self.__sent_tokenizer = sent_tokenizer
    # TODO: should be optimized for big files (the whole corpus is read into memory)
    with codecs.open(self.path, 'r', 'utf-8') as f:
        corpus = f.read()
    corpus_sent = self.__sent_tokenizer.tokenize(corpus)
    # Strip HTML numeric character references such as &#8220;
    self.clean_corpus = [re.sub(r'&#\d+;', '', sent) for sent in corpus_sent]
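The regex default above is defined elsewhere in that module. A plausible stand-in, assuming it is an NLTK RegexpTokenizer that keeps words and single punctuation marks (the exact pattern is an assumption, not the original definition):

from nltk.tokenize import RegexpTokenizer

# Hypothetical word tokenizer: runs of word characters, or single punctuation marks
regex = RegexpTokenizer(r'\w+|[^\w\s]')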
Created on 28/09/11 by flavio
"""
__author__ = 'flavio'
__docformat__ = "restructuredtext en"

import zmq

from base import PushPullWorker
from nltk import pos_tag, word_tokenize
from nltk.data import LazyLoader
from nltk.tag import tuple2str

context = zmq.Context()
eng_sent_tokenizer = LazyLoader('tokenizers/punkt/english.pickle')
port_sent_tokenizer = LazyLoader('tokenizers/punkt/portuguese.pickle')

# TODO: allow the usage of PaLavras when tagging Portuguese texts


class POSTaggerWorker(PushPullWorker):
    """
    Worker to tag words in texts according to their morphological type.

    Expects to receive a JSON message with the following structure:
    {"text": "...", "lang": "<language iso code>"}
    where text is a raw text string.

    To be used together with the MongoUpdateSink class.
    """

    def process(self, msg):
        """
        Does the POS tagging
        """
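The process method above is truncated. A minimal sketch of the tagging logic its docstring implies, assuming the message arrives as a JSON string and leaving out the ZeroMQ plumbing of PushPullWorker (the helper name tag_message and the word/TAG output format are assumptions, not the project's actual implementation):

import json

def tag_message(msg):
    # Parse the incoming message: {"text": "...", "lang": "<language iso code>"}
    doc = json.loads(msg)
    text, lang = doc['text'], doc['lang']

    # Pick the sentence tokenizer by language (English and Portuguese models are loaded above)
    sent_tokenizer = port_sent_tokenizer if lang == 'pt' else eng_sent_tokenizer

    tagged_sents = []
    for sentence in sent_tokenizer.tokenize(text):
        # NLTK's default pos_tag model is trained on English, so Portuguese
        # tagging is only approximate here (hence the PaLavras TODO above)
        tagged = pos_tag(word_tokenize(sentence))
        tagged_sents.append(' '.join(tuple2str(t) for t in tagged))
    return tagged_sents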
def __init__(self, collection):
    word_tokenizer = TreebankWordTokenizer()
    sent_tokenizer = LazyLoader('tokenizers/punkt/PY3/english.pickle')
    # Lazy sequence over the given collection on a local MongoDB instance
    self._seq = MongoDBLazySequence('localhost', 27017, 'test', collection, 'text')
    self._word_tokenize = word_tokenizer.tokenize
    self._sent_tokenize = sent_tokenizer.tokenize
from docopt import docopt
from nltk.corpus import PlaintextCorpusReader
from nltk.data import LazyLoader

from model.grammar import Grammar, Production
from model.symbol import Symbol
import pickle

if __name__ == '__main__':
    opts = docopt(__doc__)
    print("TRAIN GRAMMAR: ")

    # Get the corpus
    corpus = opts['-c']
    location = opts['-d']
    print("getting corpus from: " + corpus)
    model = PlaintextCorpusReader(
        corpus, r'.*\.txt',
        sent_tokenizer=LazyLoader('tokenizers/punkt/spanish.pickle'),
        encoding="utf8")

    # Create the grammar
    terminals = set()
    epsilon = Symbol("ε", True)
    terminals.add(epsilon)  # epsilon terminal
    non_terminals = set()
    s = Symbol("S", False)  # starting non-terminal
    non_terminals.add(s)
    grammar = Grammar(non_terminals, terminals, s)

    # Progress counters, used only to report how far along the process is
    count = 0.0
    len_fileids = len(model.fileids())

    # Get the tokenized corpus
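A minimal sketch of how the truncated loop over the tokenized corpus might continue, reusing the progress counters set up above; the production-building step is only a placeholder, since the Grammar/Production API is project-specific, and the output filename (and the assumption that location ends with a path separator) is hypothetical:

    for fileid in model.fileids():
        count += 1.0
        print("processing %s (%.0f%% done)" % (fileid, 100 * count / len_fileids))
        for sent in model.sents(fileids=fileid):
            # derive Production objects from `sent` and add them to `grammar` here
            pass

    # Persist the trained grammar alongside the rest of the model data
    with open(location + "grammar.pickle", 'wb') as f:
        pickle.dump(grammar, f)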
    print(sent)
    linkages = p.parse_sent(sent)
    for linkage in linkages[0:1]:
        print(linkage.num_of_links, linkage.constituent_phrases_nested)

if False:
    from floraparser.fltoken import FlTokenizer
    from nltk.stem import WordNetLemmatizer
    from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters, PunktLanguageVars

    fltk = FlTokenizer()
    wnl = WordNetLemmatizer()
    NUMBERS = re.compile(r'^[0-9–—.·()-]+$')
    sent_tokenizer = LazyLoader(r'..\resources\FloraPunkt.pickle')
    PunktLanguageVars.sent_end_chars = ('.',)

    wordset = set()
    with open('../resources/AllTaxa.txt') as at:
        for desc in at:
            sents = sent_tokenizer.tokenize(desc)
            for sent in sents:
                tl = fltk.tokenize(sent)
                if tl[-1].endswith('.'):
                    tl[-1] = tl[-1][:-1]
                wl = [wnl.lemmatize(word.lower()) for word in tl
                      if not NUMBERS.match(word) and '.' not in word]
                wordset.update(wl)

    with open('../resources/AllTaxa.words', 'w', encoding='utf-8') as wf:
        for w in sorted(wordset):
            print(w, file=wf)
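FloraPunkt.pickle, loaded above through LazyLoader, is a custom Punkt sentence tokenizer. A minimal sketch of how such a pickle can be produced with NLTK's PunktTrainer, assuming a hypothetical file of domain training text (both paths below are assumptions):

import pickle
from nltk.tokenize.punkt import PunktTrainer, PunktSentenceTokenizer

# Train a Punkt model on raw domain text (path is hypothetical)
with open('../resources/flora_training_text.txt', encoding='utf-8') as f:
    trainer = PunktTrainer()
    trainer.INCLUDE_ALL_COLLOCS = True  # treat every word pair ending in a period as a potential collocation
    trainer.train(f.read())

tokenizer = PunktSentenceTokenizer(trainer.get_params())

# Persist the tokenizer so LazyLoader / nltk.data.load can find it later
with open('../resources/FloraPunkt.pickle', 'wb') as out:
    pickle.dump(tokenizer, out)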
  train_model.py -h | --help

Options:
  -c <file>   Corpus location
  -d <file>   Directory for trained data
  -h --help   Show this screen.
"""
from docopt import docopt
from nltk.corpus import PlaintextCorpusReader
from nltk.data import LazyLoader
import pickle

from model.ngram import NGram, InterpolatedNGram, AddOneNGram

if __name__ == '__main__':
    opts = docopt(__doc__)
    corpus_directory = opts['-c']
    location = opts['-d']

    # Read the plain-text corpus, splitting sentences with the Spanish Punkt model
    corpus = PlaintextCorpusReader(
        corpus_directory, r'.*\.txt',
        sent_tokenizer=LazyLoader('tokenizers/punkt/spanish.pickle'),
        encoding="utf8")
    sents = corpus.sents()

    # Train a trigram model with add-one smoothing
    model = AddOneNGram(3, sents)

    # Save it (location is expected to end with a path separator)
    filename = location + "model"
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
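Loading the pickled model back is the mirror image of the save step; a minimal sketch, where location stands for the same -d directory used when training and what the loaded object can do depends on the project's model.ngram API:

import pickle

# Load the trained trigram model back from disk
with open(location + "model", 'rb') as f:
    model = pickle.load(f)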