def process(filename):
    """Count n-gram frequencies in one FoLiA document and return the FrequencyList.

    Reads module-level ``settings`` (n, casesensitive, sentencemarkers,
    autooutput, extension) and optionally writes a ``.freqlist`` file.
    """
    print("Processing " + filename, file=sys.stderr)  # was Python-2 `print >>` syntax
    doc = folia.Document(file=filename)
    freqlist = FrequencyList()
    if settings.n == 1:
        for word in doc.words():
            text = word.toktext()
            if not settings.casesensitive:  # fold case only when counting case-INsensitively (was inverted)
                text = text.lower()
            freqlist.count(text)
    elif settings.sentencemarkers:
        for sentence in doc.sentences():
            for ngram in Windower(sentence.words(), settings.n):
                # ngram is a tuple of word elements; join their token text
                # (was: ngram.toktext(), which a tuple does not provide)
                text = ' '.join([x.toktext() for x in ngram])
                if not settings.casesensitive:
                    text = text.lower()
                freqlist.count(text)
    else:
        # was: iterated `Windower(sentence.words(), ...)` with `sentence` unbound
        for sentence in doc.sentences():
            for ngram in Windower(sentence.words(), settings.n, None, None):
                text = ' '.join([x.toktext() for x in ngram])
                if not settings.casesensitive:
                    text = text.lower()
                freqlist.count(text)
    if settings.autooutput:
        if filename[-len(settings.extension) - 1:].lower() == '.' + settings.extension:
            outfilename = filename[:-len(settings.extension) - 1] + '.freqlist'
        else:
            outfilename = filename + '.freqlist'  # was `outfilename +=` on an unbound name (NameError)
        freqlist.save(outfilename, True)
    return freqlist
def append(self, sentence):
    """Add one sentence to the language model counts.

    `sentence` may be a plain string (split on single spaces) or an already
    tokenised sequence. Updates both the n-gram and the (n-1)-gram
    frequency lists, padded with the configured begin/end markers.
    """
    if isinstance(sentence, str):  # `unicode` no longer exists in Python 3; `str` suffices
        sentence = sentence.strip().split(' ')
    self.sentences += 1
    for ngram in Windower(sentence, self.n, self.beginmarker, self.endmarker):
        self.freqlistN.count(ngram)
    for ngram in Windower(sentence, self.n - 1, self.beginmarker, self.endmarker):
        self.freqlistNm1.count(ngram)
def test_trigrams_word(self):
    """Windower (trigrams) (on single word)"""
    # note: removed a stray `global text` declaration — this test uses a
    # literal one-word input and never touched the global
    result = list(iter(Windower(["hi"], 3)))
    self.assertEqual(result, [('<begin>', '<begin>', 'hi'),
                              ('<begin>', 'hi', '<end>'),
                              ('hi', '<end>', '<end>')])
def test_bigrams(self):
    """Windower (bigrams)"""
    global text
    expected = [("<begin>", "This"), ("This", "is"), ("is", "a"),
                ("a", "test"), ("test", "."), (".", "<end>")]
    observed = list(iter(Windower(text, 2)))
    self.assertEqual(observed, expected)
def test_freqlist_caseinsens(self):
    """Bigram Frequency List (case insensitive)"""
    global sentences
    # second constructor argument False => case-insensitive counting
    freqlist = FrequencyList(None, False)
    for s in sentences:
        freqlist.append(Windower(s, 2))
    self.assertTrue((freqlist[('is', 'a')] == 2 and freqlist[('this', 'is')] == 1))
def scoresentence(self, sentence, unknownwordprob=-12):
    """Score a sentence under this n-gram model.

    Sums per-n-gram log scores (base 10), substituting `unknownwordprob`
    for n-grams absent from the model, and returns the probability
    (10 raised to the accumulated log score).
    """
    logtotal = 0
    for ngram in Windower(sentence, self.n, "<s>", "</s>"):
        try:
            logtotal += self.logscore(ngram)
        except KeyError:
            # unseen n-gram: apply the fixed log-probability penalty
            logtotal += unknownwordprob
    return 10 ** logtotal
def test_trigrams(self):
    """Windower (trigrams)"""
    global text
    expected = [('<begin>', '<begin>', 'This'), ('<begin>', 'This', 'is'),
                ('This', 'is', 'a'), ('is', 'a', 'test'), ('a', 'test', '.'),
                ('test', '.', '<end>'), ('.', '<end>', '<end>')]
    self.assertEqual(list(iter(Windower(text, 3))), expected)
def process(filename):
    """Count n-gram frequencies in one FoLiA document and return the FrequencyList.

    Returns None when an exception occurred and ``settings.ignoreerrors``
    is set; re-raises otherwise.
    """
    freqlist = None  # keep the name bound even if an ignored error aborts early (was NameError at return)
    try:
        print("Processing " + filename, file=sys.stderr)
        doc = folia.Document(file=filename)
        freqlist = FrequencyList()
        if settings.n == 1:
            for word in doc.words():
                text = word.toktext()
                if not settings.casesensitive:  # fold case only when counting case-INsensitively (was inverted)
                    text = text.lower()
                freqlist.count(text)
        elif settings.sentencemarkers:
            for sentence in doc.sentences():
                for ngram in Windower(sentence.words(), settings.n):
                    # ngram is a tuple of word elements; join their token text
                    # (was: ngram.toktext(), which a tuple does not provide)
                    text = ' '.join([x.toktext() for x in ngram])
                    if not settings.casesensitive:
                        text = text.lower()
                    freqlist.count(text)
        else:
            # was: iterated `Windower(sentence.words(), ...)` with `sentence` unbound
            for sentence in doc.sentences():
                for ngram in Windower(sentence.words(), settings.n, None, None):
                    text = ' '.join([x.toktext() for x in ngram])
                    if not settings.casesensitive:
                        text = text.lower()
                    freqlist.count(text)
        if settings.autooutput:
            if filename[-len(settings.extension) - 1:].lower() == '.' + settings.extension:
                outfilename = filename[:-len(settings.extension) - 1] + '.freqlist'
            else:
                outfilename = filename + '.freqlist'  # was `+=` on an unbound name (NameError)
            freqlist.save(outfilename, True)
    except Exception as e:
        if settings.ignoreerrors:
            print("ERROR: An exception was raised whilst processing " + filename, e, file=sys.stderr)
        else:
            raise
    return freqlist
def main():
    """Command-line entry point: build and print a frequency list over text files.

    Options: -n <order>  n-gram order (default 1)
             -i          case-insensitive counting
             -e <enc>    input encoding (default utf-8)
    """
    try:
        opts, files = getopt.getopt(sys.argv[1:], "hn:ie:", ["help"])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err), file=sys.stderr)
        usage()
        sys.exit(2)

    casesensitive = True
    encoding = 'utf-8'
    n = 1
    for o, a in opts:
        if o == "-n":
            n = int(a)
        elif o == "-i":
            casesensitive = False
        elif o == "-e":
            encoding = a
        else:
            print("ERROR: Unknown option:", o, file=sys.stderr)
            sys.exit(1)

    if not files:
        print("No files specified", file=sys.stderr)  # was Python-2 `print >>` (SyntaxError on Python 3)
        sys.exit(1)

    freqlist = FrequencyList(None, casesensitive)
    for filename in files:
        # `with` guarantees the file is closed even if tokenisation raises
        with codecs.open(filename, 'r', encoding) as f:
            for line in f:
                if n > 1:
                    freqlist.append(Windower(crude_tokenizer(line), n))
                else:
                    freqlist.append(crude_tokenizer(line))

    dist = Distribution(freqlist)
    for word_type, count in freqlist:  # renamed from `type`, which shadowed the builtin
        if isinstance(word_type, (tuple, list)):  # n-grams are stored as sequences: render them as text
            word_type = " ".join(word_type)
        s = word_type + "\t" + str(count) + "\t" + str(dist[word_type]) + "\t" + str(
            dist.information(word_type))
        print(s)

    print("Tokens: ", freqlist.tokens(), file=sys.stderr)
    print("Types: ", len(freqlist), file=sys.stderr)
    print("Type-token ratio: ", freqlist.typetokenratio(), file=sys.stderr)
    print("Entropy: ", dist.entropy(), file=sys.stderr)
def train(self, sourcefile, modelfile, **parameters):
    """Train the Timbl-based confusible classifier from a (possibly compressed) corpus.

    Extracts a fixed-width context window around every occurrence of a
    configured confusible and feeds (left context + right context) -> focus
    instances to the classifier, then trains and saves the model.
    """
    if self.hapaxer:
        self.log("Training hapaxer...")
        self.hapaxer.train()
    l = self.settings['leftcontext']
    r = self.settings['rightcontext']
    n = l + 1 + r  # total window size: left context + focus token + right context
    self.log("Generating training instances...")
    fileprefix = modelfile.replace(".ibase", "")  #has been verified earlier
    classifier = TimblClassifier(fileprefix, self.gettimbloptions())
    # select the I/O module from the source file's compression suffix
    if sourcefile.endswith(".bz2"):
        iomodule = bz2
    elif sourcefile.endswith(".gz"):
        iomodule = gzip
    else:
        iomodule = io
    with iomodule.open(sourcefile, mode='rt', encoding='utf-8', errors='ignore') as f:
        for i, line in enumerate(f):
            if i % 100000 == 0:
                # progress report every 100k lines
                print(
                    datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " +
                    str(i),
                    file=sys.stderr)
            for ngram in Windower(line, n):
                confusible = ngram[l]  # focus token in the middle of the window
                if confusible in self.settings['confusibles']:
                    if self.hapaxer:
                        # presumably replaces rare words by a placeholder — TODO confirm against Hapaxer
                        ngram = self.hapaxer(ngram)
                    leftcontext = tuple(ngram[:l])
                    rightcontext = tuple(ngram[l + 1:])
                    classifier.append(leftcontext + rightcontext, confusible)
    self.log("Training classifier...")
    classifier.train()
    self.log("Saving model " + modelfile)
    classifier.save()
def test_unigrams(self): """Windower (unigrams)""" global text result = list(iter(Windower(text, 1))) self.assertEqual(result, [("This", ), ("is", ), ("a", ), ("test", ), (".", )])
def train(self, sourcefile, modelfile, **parameters):
    """Train the suffix-confusible module.

    Two phases, selected by `modelfile`:
    - confusible file: mine the corpus (via colibri-core frequency lists) for
      words whose suffix can be swapped for another configured suffix such
      that all variants occur in the model; writes one confusible per line.
    - classifier model: build Timbl training instances around each confusible
      occurrence and train/save the classifier.
    """
    if modelfile == self.confusiblefile:
        #Build frequency list
        self.log(
            "Preparing to generate lexicon for suffix confusible module")
        classfile = stripsourceextensions(sourcefile) + ".cls"
        corpusfile = stripsourceextensions(sourcefile) + ".dat"
        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder(
                "", self.settings['minlength'],
                self.settings['maxlength'])  #character length constraints
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(
                classfile, self.settings['minlength'],
                self.settings['maxlength'])
        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile, corpusfile)
        self.log("Generating frequency list")
        options = colibricore.PatternModelOptions(
            mintokens=self.settings['freqthreshold'], minlength=1,
            maxlength=1)  #unigrams only
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)
        self.log("Finding confusible pairs")
        classdecoder = colibricore.ClassDecoder(classfile)
        self.confusibles = []  #pylint: disable=attribute-defined-outside-init
        for pattern in model:
            try:
                pattern_s = pattern.tostring(classdecoder)
            except UnicodeDecodeError:
                self.log(
                    "WARNING: Unable to decode a pattern in the model!!! Invalid utf-8!"
                )
                continue  # FIX: skip this pattern — previously fell through with pattern_s unbound/stale
            for suffix in self.suffixes:
                if pattern_s.endswith(suffix) and not pattern_s in self.confusibles:
                    found = []
                    for othersuffix in self.suffixes:
                        if othersuffix != suffix:
                            otherpattern_s = pattern_s[:-len(suffix)] + othersuffix
                            try:
                                otherpattern = classencoder.buildpattern(
                                    otherpattern_s, False, False)
                            except KeyError:
                                # variant word not in the class encoding: reject the whole group
                                if found:
                                    found = []
                                break
                            if not otherpattern in model:
                                # variant not frequent enough: reject the whole group
                                if found:
                                    found = []
                                break
                            if self.settings['maxratio'] != 0:
                                freqs = (model.occurrencecount(pattern),
                                         model.occurrencecount(otherpattern))
                                ratio = max(freqs) / min(freqs)
                                if ratio < self.settings['maxratio']:
                                    if found:
                                        found = []
                                    break
                            found.append(otherpattern_s)
                    if found:
                        self.confusibles.append(pattern_s)
                        for s in found:
                            self.confusibles.append(s)
        self.log("Writing confusible list")
        with open(modelfile, 'w', encoding='utf-8') as f:
            for confusible in self.confusibles:
                f.write(confusible + "\n")
    elif modelfile == self.modelfile:
        try:
            self.confusibles
        except AttributeError:
            # confusible list not built in this run: load it from file
            self.confusibles = []
            self.log("Loading confusiblefile")
            with open(self.confusiblefile, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if line:
                        self.confusibles.append(line)
        if self.hapaxer:
            self.log("Training hapaxer...")
            self.hapaxer.train()
        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r  # window: left context + focus + right context
        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "")  #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile, mode='rt', encoding='utf-8', errors='ignore') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    # FIX: progress report hoisted out of the per-ngram loop
                    # (was printed once per ngram of every 100000th line);
                    # now consistent with the sibling train() implementation
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") +
                          " - " + str(i),
                          file=sys.stderr)
                for ngram in Windower(line, n):
                    confusible = ngram[l]
                    if confusible in self.confusibles:
                        if self.hapaxer:
                            ngram = self.hapaxer(ngram)
                        leftcontext = tuple(ngram[:l])
                        rightcontext = tuple(ngram[l + 1:])
                        suffix, normalized = self.getsuffix(confusible)
                        if suffix is not None:
                            classifier.append(
                                leftcontext + (normalized, ) + rightcontext,
                                suffix)
        self.log("Training classifier...")
        classifier.train()
        self.log("Saving model " + modelfile)
        classifier.save()
def run(self, inputdata):
    """This method gets called by the module's server and handles a message by the client. The return value (str) is returned to the client"""
    # NOTE(review): inputdata appears to be a sequence of (word_id, word_text)
    # pairs — confirm against the server protocol.
    words = [word_text for word_id, word_text in inputdata]  #pylint: disable=unused-variable
    word_ids = [word_id for word_id, word_text in inputdata]  #pylint: disable=unused-variable
    actions = [None] * len(
        words
    )  #array of actions to be taken for each token, actions are (None,freq) for deletions or (punct,freq) for insertions

    #find possible deletions
    for i, trigram in enumerate(Windower(words, 3)):
        if trigram[0] != "<begin>" and trigram[-1] != "<end>":
            if trigram[1] in self.PUNCTUATION and trigram[
                    0] not in self.PUNCTUATION and trigram[
                        -1] not in self.PUNCTUATION:
                #trigram pattern (X p Y) focussing on a punctuation token
                trigram_pattern = self.classencoder.buildpattern(
                    " ".join(trigram))
                trigram_oc = self.trigram_model.occurrencecount(
                    trigram_pattern)
                if trigram_oc >= self.settings['deletioncutoff']:
                    if self.debug:
                        self.log(
                            " (Trigram '" + " ".join(trigram) +
                            "' too frequent to consider for deletion (" +
                            str(trigram_oc) + ")")
                else:
                    #bigram version without the punctuation token
                    if trigram[1] in self.EOSMARKERS and trigram[-1].isalpha(
                    ) and trigram[-1][0] == trigram[-1][0].upper():
                        #deletion candidate is an eos marker, remove casing
                        bigram = (trigram[0], trigram[-1].lower())
                    else:
                        bigram = (trigram[0], trigram[-1])
                    bigram_pattern = self.classencoder.buildpattern(
                        " ".join(bigram))
                    if not bigram_pattern.unknown():
                        #get occurrences
                        bigram_oc = self.bigram_model.occurrencecount(
                            bigram_pattern)
                        if bigram_oc >= self.settings['deletionthreshold']:
                            #bigram (X Y) is prevalent enough to warrant as a deletion solution
                            if self.debug:
                                self.log(
                                    " (Bigram candidate without punctuation prevalent enough to warrant as a deletion solution: '"
                                    + " ".join(bigram) + "')")
                            #but first check if bigrams X p and p Y don't reach the cut-off threshold
                            bigram_trailpunct = trigram_pattern[0:2]
                            if self.bigram_model.occurrencecount(
                                    bigram_trailpunct
                            ) >= self.settings['deletioncutoff']:
                                if self.debug:
                                    self.log(
                                        " (Bigram with trailing punctuation exceeds cut-off threshold, no deletion)"
                                    )
                            else:
                                bigram_initialpunct = trigram_pattern[1:3]
                                if self.bigram_model.occurrencecount(
                                        bigram_initialpunct
                                ) >= self.settings['deletioncutoff']:
                                    if self.debug:
                                        self.log(
                                            " (Bigram with initial punctuation does not reach cut-off threshold, no deletion)"
                                        )
                                else:
                                    if self.debug:
                                        self.log(
                                            " (Punctuation deletion candidate: "
                                            + " ".join(bigram) + " (" +
                                            str(bigram_oc) + ") vs " +
                                            " ".join(trigram) + " (" +
                                            str(trigram_oc) + ")")
                                    # i-1 indexes the punctuation token itself
                                    # (Windower pads, so window i is centered ahead of token i)
                                    # — TODO confirm offset convention
                                    actions[i - 1] = ('delete', trigram[1],
                                                      bigram_oc)
        if i > 0 and len(actions) > i - 1 and actions[i - 1] is None:
            #Recasing
            #given a bigram x y (from trigram x y z)
            #check if x Y is more frequent than x y
            recase = False
            bigram_left = trigram[:-1]
            firstchar = bigram_left[-1][0]
            if firstchar.isalpha():
                # toggle the case of the first character of the middle word
                if firstchar == firstchar.lower():
                    firstchar = firstchar.upper()
                else:
                    firstchar = firstchar.lower()
                word = bigram_left[1]
                word_recased = firstchar + bigram_left[1][1:]
                word_pattern = self.classencoder.buildpattern(word)
                word_pattern_recased = self.classencoder.buildpattern(
                    word_recased)
                if not word_pattern_recased.unknown():
                    word_pattern_recased_oc = self.unigram_model.occurrencecount(
                        word_pattern_recased)
                    if word_pattern_recased_oc >= self.settings[
                            'recasethreshold']:
                        word_pattern_oc = self.unigram_model.occurrencecount(
                            word_pattern)
                        if word_pattern_recased_oc >= word_pattern_oc * self.settings[
                                'recasefactor'] or (
                                    word_pattern_oc == 0
                                    and word_pattern_recased_oc >=
                                    self.settings['recasefactor']):
                            #contextless approach
                            recase = True
                        if not recase:
                            #context-based approach
                            if bigram_left[0] == "<begin>":
                                #first word
                                if word_pattern_recased_oc >= word_pattern_oc and firstchar == firstchar.upper(
                                ):
                                    recase = True
                            else:
                                bigram_left_recased = (bigram_left[0],
                                                       firstchar +
                                                       bigram_left[1][1:])
                                bigram_left_recased_pattern = self.classencoder.buildpattern(
                                    " ".join(bigram_left_recased))
                                if not bigram_left_recased_pattern.unknown(
                                ):
                                    #if self.debug >= 3: self.log("    (Considering recasing " + bigram_left[1] + " -> " + bigram_left_recased[1] + ")")
                                    bigram_left_recased_oc = self.bigram_model.occurrencecount(
                                        bigram_left_recased_pattern)
                                    bigram_left_pattern = self.classencoder.buildpattern(
                                        " ".join(bigram_left))
                                    bigram_left_oc = self.bigram_model.occurrencecount(
                                        bigram_left_pattern)
                                    if bigram_left_recased_oc >= self.settings[
                                            'recasethreshold2'] and bigram_left_recased_oc > self.bigram_model.occurrencecount(
                                                self.classencoder.
                                                buildpattern(" ".join(
                                                    bigram_left))):
                                        if self.debug:
                                            self.log(
                                                " (left bigram suggests recasing '"
                                                + " ".join(bigram_left) +
                                                "' (" + str(bigram_left_oc) +
                                                ") -> '" + " ".join(
                                                    bigram_left_recased) +
                                                "' (" +
                                                str(bigram_left_recased_oc) +
                                                ")")
                                        recase = True
                #bigram_right = trigram[1:]
                #bigram_right_pattern = self.classencoder.buildpattern(" ".join(bigram_right))
                #bigram_right_recased = (firstchar + bigram_right[0][1:], bigram_right[1])
                #bigram_right_recased_pattern = self.classencoder.buildpattern(" ".join(bigram_right_recased))
                #bigram_right_oc = self.bigram_model.occurrencecount(bigram_right_pattern)
                #if not bigram_right_recased_pattern.unknown():
                #    bigram_right_recased_oc = self.bigram_model.occurrencecount(bigram_right_recased_pattern)
                #    if bigram_right_oc == 0 or bigram_right_recased_oc > bigram_right_oc:
                #        #checks pass, recase:
                #        recase = True
                #    else:
                #        if self.debug: self.log(" (right bigram refutes recasing '" + " ".join(bigram_right) + "' (" + str(bigram_right_oc) + ") -> '" + " ".join(bigram_right_recased) + "' (" + str(bigram_right_recased_oc) + ")")
                #elif bigram_right_oc == 0:
                #    recase = True
                #else:
                #    if self.debug: self.log(" (right bigram refutes recasing '" + " ".join(bigram_right) + "' (" + str(bigram_right_oc) + ") -> '" + " ".join(bigram_right_recased) + "' (not found)")
                if recase:
                    if self.debug:
                        self.log(" (Recasing: '" + word + "' -> '" +
                                 word_recased + "' in " + " ".join(trigram))
                    actions[i - 1] = ('recase', word_recased, 1)

    #find possible insertions
    for i, bigram in enumerate(Windower(words, 2, None, None)):
        if bigram[0] not in self.PUNCTUATION and bigram[
                1] not in self.PUNCTUATION:
            bigram_pattern = self.classencoder.buildpattern(
                " ".join(bigram))
            bigram_oc = self.bigram_model.occurrencecount(bigram_pattern)
            if bigram_oc >= self.settings['insertioncutoff']:
                continue  #bigram too prevalent to consider for insertion
            for punct in self.PUNCTUATION:
                if punct in self.EOSMARKERS and bigram[-1].isalpha(
                ) and bigram[-1][0] == bigram[-1][0].lower():
                    trigram = (
                        bigram[0], punct,
                        bigram[-1][0].upper() + bigram[-1][1:]
                    )  #insertion candidate is an eos marker, do recasing to initial capital
                else:
                    trigram = (bigram[0], punct, bigram[-1])
                trigram_pattern = self.classencoder.buildpattern(
                    " ".join(trigram))
                if trigram_pattern.unknown():
                    continue
                trigram_oc = self.trigram_model.occurrencecount(
                    trigram_pattern)
                if trigram_oc >= bigram_oc and trigram_oc >= self.settings[
                        'insertionthreshold']:
                    if self.debug:
                        self.log(" (Punctuation insertion candidate: " +
                                 " ".join(trigram) + " (" + str(trigram_oc) +
                                 ") vs " + " ".join(bigram) + " (" +
                                 str(bigram_oc) + ")")
                    actions[i] = ('insert', punct, trigram_oc)

    #Consolidate all the actions through a simple survival of the fittest mechanism
    #making sure no adjacent deletions/insertion occur
    recaseactions = [None] * len(words)
    for i, (prevaction, action) in enumerate(Windower(actions, 2)):
        i = i - 1  # compensate for the Windower begin-padding
        if action is not None and action[0] != 'recase':
            if prevaction is not None and prevaction != "<begin>" and prevaction[
                    0] != 'recase':
                if self.debug:
                    self.log(
                        "(Consolidating punc/recase actions, removing conflict)"
                    )
                if action[2] > prevaction[2]:
                    #highest frequency wins
                    actions[i - 1] = None
                else:
                    actions[i] = None

    #Add recasing actions after insertion/deletion of EOS markers
    for i, action in enumerate(actions):
        if action is not None:
            if action[
                    1] in self.EOSMARKERS:  #Do we have have action on an EOS marker?
                if action[0] == 'insert':  #Is it an insertion?
                    if len(words) > i + 1 and words[i + 1].isalpha(
                    ) and words[i + 1] == words[
                            i + 1].lower():  #Is the next word lowercase?
                        if self.debug:
                            self.log(" (Recasing after EOS insertion)")
                        recaseactions[i + 1] = words[i + 1][0].upper() + words[
                            i + 1][1:]  #yes, recase it
                elif action[0] == 'delete':  #Is it an deletion?
                    if len(words) > i + 1 and words[i + 1].isalpha(
                    ) and words[i + 1][0] == words[i + 1][0].lower(
                    ):  #Does the next word start with a capital?
                        if self.debug:
                            self.log(" (Recasing after EOS deletion)")
                        recaseactions[i + 1] = words[
                            i + 1].lower()  #yes, lowercase it

    # recase actions override whatever was planned for those positions
    for i, recaseaction in enumerate(recaseactions):
        if recaseaction is not None:
            actions[i] = ('recase', recaseaction, 1)

    if self.settings['enforcefinalperiod']:
        #enforce final period
        if words[-1] not in self.EOSMARKERS and actions[-1] is None:
            if self.debug:
                self.log(" (Enforcing final period)")
            actions[-1] = ('insert', '.', 1)  # action, punc

    # emit only positions with a planned action, keyed by the original word id
    return [(word_id, (action[0], action[1]))
            for word_id, action in zip(word_ids, actions)
            if action is not None]
def train(self, sourcefile, modelfile, **parameters):
    """Train this module's resources, selected by the model file extension.

    - ``*.ibase``: build Timbl training instances (context window around each
      focus word, hapax-filtered) and train/save the classifier.
    - ``*.patternmodel``: build a colibri-core unigram frequency model over
      the class-encoded corpus and save it.
    """
    if self.hapaxer:
        self.log("Training hapaxer...")
        self.hapaxer.train()
    if modelfile.endswith('.ibase'):
        l = self.settings['leftcontext']
        r = self.settings['rightcontext']
        n = l + 1 + r  # window: left context + focus + right context
        self.log("Generating training instances...")
        fileprefix = modelfile.replace(".ibase", "")  #has been verified earlier
        classifier = TimblClassifier(fileprefix, self.gettimbloptions())
        # select the I/O module from the source file's compression suffix
        if sourcefile.endswith(".bz2"):
            iomodule = bz2
        elif sourcefile.endswith(".gz"):
            iomodule = gzip
        else:
            iomodule = io
        with iomodule.open(sourcefile, mode='rt', encoding='utf-8') as f:
            for i, line in enumerate(f):
                if i % 100000 == 0:
                    # progress report every 100k lines
                    print(datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") + " - " + str(i), file=sys.stderr)
                for ngram in Windower(line, n):
                    if self.hapaxer:
                        ngram = self.hapaxer(ngram)
                    focus = ngram[l]
                    if self.hapaxer and focus == self.hapaxer.placeholder:
                        continue  # never train on the hapax placeholder as focus
                    leftcontext = tuple(ngram[:l])
                    rightcontext = tuple(ngram[l + 1:])
                    classifier.append(leftcontext + rightcontext, focus)
        self.log("Training classifier...")
        classifier.train()
        self.log("Saving model " + modelfile)
        classifier.save()
    elif modelfile.endswith('.patternmodel'):
        self.log("Preparing to generate lexicon for Language Model")
        classfile = stripsourceextensions(sourcefile) + ".cls"
        corpusfile = stripsourceextensions(sourcefile) + ".dat"
        if not os.path.exists(classfile):
            self.log("Building class file")
            classencoder = colibricore.ClassEncoder()
            classencoder.build(sourcefile)
            classencoder.save(classfile)
        else:
            classencoder = colibricore.ClassEncoder(classfile)
        if not os.path.exists(modelfile + '.cls'):
            #make symlink to class file, using model name instead of source name
            os.symlink(classfile, modelfile + '.cls')
        if not os.path.exists(corpusfile):
            self.log("Encoding corpus")
            classencoder.encodefile(sourcefile, corpusfile)
        # (a second, identical symlink check that appeared here was removed:
        # the link is already guaranteed to exist by the check above)
        self.log("Generating pattern model")
        options = colibricore.PatternModelOptions(mintokens=self.settings['freqthreshold'], minlength=1, maxlength=1)
        model = colibricore.UnindexedPatternModel()
        model.train(corpusfile, options)
        self.log("Saving model " + modelfile)
        model.write(modelfile)
def main():
    """Benchmark driver: argv[1] selects test number(s) ('x' prefix skips
    pretests, 'a-b' is a range), argv[2] is the input text file, argv[3] an
    optional temp dir. Each test times one n-gram counting strategy."""
    dopretests = True
    try:
        tests = sys.argv[1]
        if tests[0] == 'x':
            # 'x' prefix: skip the pretests (used for the recursive per-test invocations below)
            dopretests = False
            tests = tests[1:]
        if '-' in tests:
            begintest = int(tests.split('-')[0])
            endtest = int(tests.split('-')[1])
        else:
            begintest = endtest = int(tests)
    except:
        print(
            "Specify a text file (plain text, UTF-8, one sentence per line, preferably tokenised) to use as a basis",
            file=sys.stderr)
        sys.exit(2)
    try:
        textfile = sys.argv[2]
    except:
        print(
            "Specify a text file (plain text, UTF-8, one sentence per line, preferably tokenised) to use as a basis",
            file=sys.stderr)
        sys.exit(2)
    try:
        tmpdir = sys.argv[3]
    except:
        tmpdir = "/tmp/"
    # derived work files in the temp dir
    classfile = tmpdir + "/" + os.path.basename(textfile) + '.colibri.cls'
    datafile = tmpdir + "/" + os.path.basename(textfile) + '.colibri.dat'
    modelfile = tmpdir + "/" + os.path.basename(
        textfile) + '.colibri.patternmodel'
    if not os.path.exists(textfile):
        print("File does not exist", file=sys.stderr)
        sys.exit(2)

    if dopretests:
        linecount = 0
        print("PRETEST #1 - Reading text file (Python)")
        b = begin()
        with open(textfile, 'r', encoding='utf-8') as f:
            for line in f:
                linecount += 1
        end(b)
        print("\t(Read " + str(linecount) + " lines)")
        print("PRETEST #2 - Building class encoder")
        encoder = colibricore.ClassEncoder()
        b = begin()
        encoder.build(textfile)
        end(b)
        print("PRETEST #3 - Saving class encoder")
        b = begin()
        encoder.save(classfile)
        end(b)
        print("PRETEST #4 - Class encoding corpus")
        b = begin()
        encoder.encodefile(textfile, datafile)
        end(b)
        print("PRETEST #5 - Unloading encoder")
        b = begin()
        del encoder
        gc.collect()
        end(b)

    if begintest < endtest:
        # range of tests: run each in a fresh subprocess so memory is isolated
        print("Running tests ", begintest, " to ", endtest)
        # NOTE(review): min(endtest + 1, 10) caps the loop at test 9 even
        # though tests 10 and 11 exist below — confirm whether intentional
        for testnum in range(begintest, min(endtest + 1, 10)):
            os.system("python3 " + sys.argv[0] + " x" + str(testnum) + " " +
                      textfile + " " + tmpdir)
    else:
        testnum = begintest
        print("-------------------- " + colorf('bold', 'TEST') + " #" +
              str(testnum) + " ----------------------")
        if testnum == 1:
            linecount = 0
            print(
                "Extracting and counting n-grams (up to 8-grams,threshold=1) naively (Python defaultdict + Pynlpl MultiWindower)"
            )
            ngrams = defaultdict(int)
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    for ngram in MultiWindower(line, 1, 8):
                        ngrams[ngram] += 1
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")
        elif testnum == 2:
            print(
                "Extracting and counting n-grams (up to 8-grams,threshold=1) naively with NLTK (nltk.FreqDist + nltk.util.ngrams)"
            )
            # local imports: NLTK is only needed for this test
            from nltk.probability import FreqDist
            from nltk.util import ngrams
            fd = FreqDist()
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    tokens = line.split(' ')
                    for n in range(1, 9):
                        for ngram in ngrams(tokens, n):
                            fd[ngram] += 1
            end(b)
            print("\t(Done)")
        elif testnum == 3:
            print(
                "Extracting and counting ALL n-grams (up to 8-grams, threshold=1) with UnindexedPatternModel"
            )
            model = colibricore.UnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=1,
                                                      maxlength=8,
                                                      doreverseindex=False)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
            del model
        # NOTE(review): plain `if` below breaks the elif chain (harmless here
        # since testnum is fixed, but inconsistent with the other branches)
        if testnum == 4:
            linecount = 0
            print(
                "Extracting and counting n-grams (up to 8-grams, threshold=2, with look-back) (Python defaultdict + Pynlpl Windower)"
            )
            ngrams = defaultdict(int)
            b = begin()
            for n in range(1, 9):
                with open(textfile, 'r', encoding='utf-8') as f:
                    for line in f:
                        for ngram in Windower(line, n):
                            docount = True
                            if n > 1:
                                # look-back: only count if all (n-1)-sub-ngrams were counted
                                for subngram in Windower(ngram, n - 1):
                                    if not subngram in ngrams:
                                        docount = False
                                        break
                            if docount:
                                ngrams[ngram] += 1
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")
        if testnum == 5:
            linecount = 0
            print(
                "Extracting and counting n-grams (up to 8-grams, threshold=2, without look-back) (Python defaultdict + Pynlpl Windower)"
            )
            ngrams = defaultdict(int)
            b = begin()
            with open(textfile, 'r', encoding='utf-8') as f:
                for line in f:
                    for ngram in MultiWindower(line, 1, 8):
                        ngrams[ngram] += 1
            # prune everything below the threshold afterwards
            for ngram in list(ngrams.keys()):
                if ngrams[ngram] < 2:
                    del ngrams[ngram]
            gc.collect()
            end(b)
            print("\t(Found " + str(len(ngrams)) + " ngrams)")
        elif testnum == 6:
            print(
                "Extracting and counting ALL n-grams (up to 8-grams, threshold=2) with UnindexedPatternModel"
            )
            model = colibricore.UnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=2,
                                                      maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
        elif testnum == 7:
            print(
                "Extracting and counting ALL n-grams (up to 8-grams,threshold=1) with UnindexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.UnindexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=1,
                                                      maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
        elif testnum == 8:
            print(
                "Extracting and counting ALL n-grams (up to 8-grams,threshold=1) with IndexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=1,
                                                      maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
            del model
        elif testnum == 9:
            print(
                "Extracting and counting n-grams with treshold 2 (up to 8-grams) with IndexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=2,
                                                      maxlength=8)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
        elif testnum == 10:
            print(
                "Extracting and counting n-grams and skipgrams with treshold 2 (up to 8-grams) with IndexedPatternModel (with preloaded corpus)"
            )
            corpus = colibricore.IndexedCorpus(datafile)
            model = colibricore.IndexedPatternModel(reverseindex=corpus)
            options = colibricore.PatternModelOptions(mintokens=2,
                                                      maxlength=8,
                                                      doskipgrams=True)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
        elif testnum == 11:
            print(
                "Extracting and counting ALL n-grams (up to 8-grams, threshold=1) with OrderedUnindexedPatternModel"
            )
            model = colibricore.OrderedUnindexedPatternModel()
            options = colibricore.PatternModelOptions(mintokens=1,
                                                      maxlength=8,
                                                      doreverseindex=False)
            b = begin()
            model.train(datafile, options)
            end(b)
            savemodel(model, modelfile)
            del model
        else:
            print("No such test", file=sys.stderr)
    print()
def scoresentence(self, sentence):
    """Return the sentence probability: the product of the model's scores
    for every n-gram in the marker-padded window over the sentence."""
    ngram_scores = [
        self[ngram]
        for ngram in Windower(sentence, self.n, self.beginmarker, self.endmarker)
    ]
    return product(ngram_scores)
elif o == "-e": encoding = a else: print("ERROR: Unknown option:", o, file=sys.stderr) sys.exit(1) if not files: print >> sys.stderr, "No files specified" sys.exit(1) freqlist = FrequencyList(None, casesensitive) for filename in files: f = codecs.open(filename, 'r', encoding) for line in f: if n > 1: freqlist.append(Windower(crude_tokenizer(line), n)) else: freqlist.append(crude_tokenizer(line)) f.close() dist = Distribution(freqlist) for type, count in freqlist: if isinstance(type, tuple) or isinstance(type, list): type = " ".join(type) s = type + "\t" + str(count) + "\t" + str(dist[type]) + "\t" + str( dist.information(type)) print(s) print("Tokens: ", freqlist.tokens(), file=sys.stderr) print("Types: ", len(freqlist), file=sys.stderr)
#Presuming that each token will be on one line, make a mapping from lines to IDs idmap = [w.id for w in doc.words()] ########## Extract data for modules ############## if not standalone: clam.common.status.write(statusfile, "Extracting data for modules", 3) f = open(outputdir + 'input.tok.txt', 'w') for currentword in doc.words(): f.write(str(currentword).replace('’', '\'') + ' ') f.close() f = open(outputdir + 'agreement_checker.test.inst', 'w') for prevword3, prevword2, prevword, currentword, nextword, nextword2, nextword3 in Windower( doc.words(), 7): f.write( str(prevword3) + ' ' + str(prevword2) + ' ' + str(prevword) + ' ' + str(currentword) + ' ' + str(nextword) + ' ' + str(nextword2) + ' ' + str(nextword3) + ' ' + str(currentword) + '\n') f.close() ###### BEGIN CALL MODULES (USING PARALLEL POOL) ###### # (nothing to edit here) errout("Calling modules") if not standalone: clam.common.status.write(statusfile, "Calling Modules", 4) def processor():