def __init__(self, n, alpha):
    """Build a pair of directional n-gram models.

    n: model order; must coerce to an integer >= 1.
    alpha: smoothing constant passed through to the underlying ngram models.
    """
    import sys
    self.n = int(n)
    if self.n < 1:
        # quit() comes from the site module and is meant only for the
        # interactive interpreter; sys.exit(1) raises the same SystemExit(1).
        print("Error in ngram constructor: n must be an integer greater than 0")
        sys.exit(1)
    # "lr" and "rl" models; the third ngram arg presumably selects the
    # reversed (right-to-left) direction — confirm against the ngram class.
    self.models = {"lr": ngram(n, alpha), "rl": ngram(n, alpha, True)}
    # n-1 start-of-sentence symbols used to pad the initial context.
    self.filler = ["<s>" for i in range(self.n - 1)]
    # Interpolation weights combining the two directional models.
    self.lambda1 = .5
    self.lambda2 = .5
def getNgramDistance(f1, f2, n):
    """Edit distance between the n-gram sets of two mnemonic sequences.

    f1, f2: dicts each carrying a 'mnemonics' sequence.
    n: requested n-gram order; capped at the shorter sequence's length.
    """
    seq_a = f1['mnemonics']
    seq_b = f2['mnemonics']
    # Compare only up to the shorter sequence's length.
    common = min(len(seq_a), len(seq_b))
    seq_a, seq_b = seq_a[:common], seq_b[:common]
    # The order can never exceed the sequences being modelled.
    order = min(n, common)
    model_a = ngram(seq_a, order)
    model_b = ngram(seq_b, order)
    return ngramset_edit_distance(model_a.ngramSet, model_b.ngramSet)
def getNgramDistance(f1, f2, n):
    """Return (n-gram-set edit distance, wall-clock seconds spent).

    Sequences are truncated to the shorter length; the order n is capped
    at that length as well.
    """
    start = time.time()
    seq_a, seq_b = f1['mnemonics'], f2['mnemonics']
    limit = min(len(seq_a), len(seq_b))
    seq_a, seq_b = seq_a[:limit], seq_b[:limit]
    order = n if limit >= n else limit
    dist = ngramset_edit_distance(ngram(seq_a, order).ngramSet,
                                  ngram(seq_b, order).ngramSet)
    return dist, time.time() - start
def getNgramDistance(f1, f2, n):
    """Return (distance, variance, sim2, sim3, indexes, elapsed seconds)
    for the n-gram sets of two mnemonic sequences.

    Sequences are truncated to the shorter length and the order n is
    capped at that length.
    """
    # time.clock() was deprecated in Python 3.3 and removed in 3.8;
    # perf_counter() is its documented replacement for interval timing.
    start = time.perf_counter()
    mnemonics1, mnemonics2 = f1['mnemonics'], f2['mnemonics']
    length = min(len(mnemonics1), len(mnemonics2))
    mnemonics1, mnemonics2 = mnemonics1[:length], mnemonics2[:length]
    if length < n:
        n = length
    ngram1 = ngram(mnemonics1, n)
    ngram2 = ngram(mnemonics2, n)
    ngram_distance, ngram_var, sim2, sim3, indexes = ngramset_edit_distance(
        ngram1.ngramSet, ngram2.ngramSet)
    return ngram_distance, ngram_var, sim2, sim3, indexes, time.perf_counter() - start
def nbest_sblm_main(lm='/home/nlg-02/pust/v8.1zh/pysblm.sblm/sblm.pcfg.5gram.lwlm',
                    #lm='nbest.pcfg.srilm',
                    nbest='nbest.txt',
                    strip=True,
                    flatten=True,
                    num2at=True,
                    sblm_terminals=0,
                    sblm_pword=1,
                    output_nbest='',
                    maxwords=999999,
                    logp_unk=0.0,
                    closed=True,
                    greedy=True,
                    usage_=usage
                    # rest_=None
                    ):
    """Score NBEST lines from `nbest` with the language model `lm`,
    logging how many lines pass check_nbest.

    lm: path to the LM, or '' to run without one.
    output_nbest: path for the filtered output, or '' to disable.
    """
    lm = None if lm == '' else ngram(lm=lm, closed=closed)
    # Bug fix: the original called lm.set_logp_unk() unconditionally, which
    # raised AttributeError whenever lm=='' (lm is None in that case).
    if lm is not None:
        lm.set_logp_unk(logp_unk)
    output_nbest = None if output_nbest == '' else open(output_nbest, 'w')
    n = 0
    ng = 0
    for l in open(nbest):
        if l.startswith("NBEST sent="):
            n += 1
            if check_nbest(l, lm, sblm_terminals, sblm_pword, strip, flatten,
                           num2at, output_nbest, maxwords, n, greedy):
                ng += 1
    # NOTE(review): output_nbest is never closed here; confirm whether
    # check_nbest/info_summary rely on it staying open before adding close().
    info_summary()
    log("%s good out of %s NBEST lines" % (ng, n))
def produceExample(exampleStr):
    """Print LaTeX fragments for the n-grams of exampleStr, plus a
    frequency-of-frequency table (Good-Turing style r / N_r counts).
    """
    tg = ngram([exampleStr])
    # Avoid shadowing the builtin `list` (the original bound it to the grams).
    (grams, gram_list) = tg.ngramify([exampleStr])
    printList = []
    for item in gram_list:
        stri = "\ngr{" + item + "}"
        # Bug fix: str.replace returns a new string; the original discarded
        # both results and then appended the builtin `str` instead of `stri`.
        stri = stri.replace(" ", "\\_")
        stri = stri.replace("'", "")
        printList.append(stri)
    print(printList)
    # Frequency of each gram, then frequency of those frequencies
    # (dict.get replaces the Python-2-only has_key() branches).
    freqDict = {}
    for item in gram_list:
        freqDict[item] = freqDict.get(item, 0) + 1
    freqfreqDict = {}
    for value in freqDict.values():
        freqfreqDict[value] = freqfreqDict.get(value, 0) + 1
    print("\\begin{tabular}{|cc|}")
    print("\\hline")
    print("Frequency & Frequency of frequency \\\\")
    print("\\hline")
    print("r & N_r \\\\")
    for key in freqfreqDict.keys():
        print(str(key) + " & " + str(freqfreqDict[key]) + "\\\\")
    print("\\hline")
    print("\\end{tabular}")
def __init__(self, order=2, parent=False, digit2at=False, parent_alpha=0.99,
             cond_parent=False, parent_start=False, skip_bar=True, unsplit=True,
             logp_unk=0.0, witten_bo=0.1):
    """Parent-conditioned sblm n-gram model over child label sequences.

    parent: use parent_alpha*p(children|parent)+(1-parent_alpha)*p(children).
    parent_start: make the <s> symbol <s:NP> — not needed with cond_parent
        or parent.
    cond_parent: score ngram c[0]...c[i-1] PARENT c[i] for each i, backing
        off to c[i]|PARENT and c[i]; overrides parent (both aim at the
        same purpose).
    """
    # Flags controlling how the parent label enters the model.
    self.parent_start = parent_start
    self.cond_parent = cond_parent
    # Distinct ngrams per parent, backing off to the indistinct model.
    self.parent = parent
    self.digit2at = digit2at
    self.order = order
    # Parent-indistinct ngram model; per-parent models collect in self.png.
    self.ng = ngram(order, digit2at=False, logp_unk=logp_unk)
    self.png = dict()
    # tag_word_unigram is functionally equivalent to a bigram lm for
    # terminals; could be simplified to use an ngram.
    self.terminals = tag_word_unigram(bo=witten_bo, digit2at=digit2at,
                                      logp_unk=logp_unk)
    self.unsplit = unsplit
    self.skip_bar = skip_bar
    self.unsplit_map = strip_subcat if unsplit else identity
    self.skip_map = no_bar if skip_bar else identity
    # Late-bound composition: later changes to the two maps take effect.
    self.label_map = lambda lab: self.skip_map(self.unsplit_map(lab))
    self.set_parent_alpha(parent_alpha)
def nbest_sblm_main(
        lm='/home/nlg-02/pust/v8.1zh/pysblm.sblm/sblm.pcfg.5gram.lwlm',
        #lm='nbest.pcfg.srilm',
        nbest='nbest.txt',
        strip=True,
        flatten=True,
        num2at=True,
        sblm_terminals=0,
        sblm_pword=1,
        output_nbest='',
        maxwords=999999,
        logp_unk=0.0,
        closed=True,
        greedy=True,
        usage_=usage
        # rest_=None
):
    """Score NBEST lines from `nbest` with the language model `lm`,
    logging how many lines pass check_nbest.

    lm: path to the LM, or '' to run without one.
    output_nbest: path for the filtered output, or '' to disable.
    """
    lm = None if lm == '' else ngram(lm=lm, closed=closed)
    # Bug fix: the original called lm.set_logp_unk() unconditionally, which
    # raised AttributeError whenever lm=='' (lm is None in that case).
    if lm is not None:
        lm.set_logp_unk(logp_unk)
    output_nbest = None if output_nbest == '' else open(output_nbest, 'w')
    n = 0
    ng = 0
    for l in open(nbest):
        if l.startswith("NBEST sent="):
            n += 1
            if check_nbest(l, lm, sblm_terminals, sblm_pword, strip, flatten,
                           num2at, output_nbest, maxwords, n, greedy):
                ng += 1
    # NOTE(review): output_nbest is never closed here; confirm whether
    # check_nbest/info_summary rely on it staying open before adding close().
    info_summary()
    log("%s good out of %s NBEST lines" % (ng, n))
def read_radu(self, infile):
    """Count PCFG events from a Radu-format treebank.

    infile: a path string (opened here) or an already-open file; one tree
    per line.  Returns the total number of tree nodes processed.
    """
    if isinstance(infile, str):
        infile = open(infile)
    n = 0
    for line in infile:
        t = self.tree_from_line(line)
        if t is not None:
            n += t.size()
            for e in gen_pcfg_events_radu(t, terminals=False, digit2at=self.digit2at):
                if isinstance(e, tuple):
                    # Terminal event.  NOTE(review): tuple(e[0]) keeps only
                    # e's first element — presumably the (tag, word) pair;
                    # confirm against gen_pcfg_events_radu.
                    e = tuple(e[0])
                    # warn("sblm_ngram train terminal",e,max=10)
                    self.terminals.count_tw(e)
                else:
                    if len(e) == 0:
                        continue
                    # Nonterminal event: parent label followed by children.
                    p = e[0]
                    sent = self.sent_for_event(p, e[1:])
                    if self.cond_parent:
                        # Walk the parent marker right-to-left through sent,
                        # counting c[0]..c[i-1] PARENT c[i] for every i.
                        i = len(sent)
                        sent.append(None)
                        while i > 1:
                            sent[i] = sent[i - 1]
                            sent[i - 1] = p
                            # warn("read_radu cond_parent","%s => %s i=%s"%(p,sent[:i+1],i))
                            self.ng.count_word(sent, i)
                            i -= 1
                    else:
                        self.ng.count_text(sent, i=1)
                        if self.parent:
                            # Also count into the per-parent ngram model,
                            # created lazily on first sight of this parent.
                            pngs = self.png
                            if p not in pngs:
                                pn = ngram(self.order, digit2at=False)
                                pngs[p] = pn
                            else:
                                pn = pngs[p]
                            pn.count_text(sent, i=1)
    return n
def __init__(self,order=2,parent=False,digit2at=False,parent_alpha=0.99,cond_parent=False,parent_start=False,skip_bar=True,unsplit=True,logp_unk=0.0,witten_bo=0.1):
    """Parent-conditioned sblm n-gram model over child label sequences.

    parent: use parent_alpha*p(children|parent)+(1-parent_alpha)*p(children)

    parent_start: make the <s> symbol <s:NP> - not needed if you use cond_parent or parent

    cond_parent: ngram c[0]...c[i-1] PARENT c[i] for each i you score. backs off to c[i]|PARENT and c[i]. overrides parent. (this is fine; they aim toward the same purpose)
    """
    self.parent_start=parent_start
    self.cond_parent=cond_parent
    self.parent=parent #distinct ngrams for each parent; backoff to indistinct
    self.digit2at=digit2at
    self.order=order
    # parent-indistinct ngram model; per-parent models collect in self.png
    self.ng=ngram(order,digit2at=False,logp_unk=logp_unk)
    # per-parent ngram models, filled lazily (see read_radu) when self.parent
    self.png=dict()
    self.terminals=tag_word_unigram(bo=witten_bo,digit2at=digit2at,logp_unk=logp_unk) #simplify: use an ngram for terminals. tag_word_unigram is functionally equiv to bigram lm anyway
    self.unsplit=unsplit
    self.skip_bar=skip_bar
    self.unsplit_map=strip_subcat if unsplit else identity
    self.skip_map=no_bar if skip_bar else identity
    # late-bound composition: later changes to the two maps take effect
    self.label_map=lambda l: self.skip_map(self.unsplit_map(l))
    self.set_parent_alpha(parent_alpha)
def read_radu(self,infile):
    """Count PCFG events from a Radu-format treebank.

    infile: a path string (opened here) or an already-open file; one tree
    per line.  Returns the total number of tree nodes processed.
    """
    if isinstance(infile, str):
        infile=open(infile)
    n=0
    for line in infile:
        t=self.tree_from_line(line)
        if t is not None:
            n+=t.size()
            for e in gen_pcfg_events_radu(t,terminals=False,digit2at=self.digit2at):
                if isinstance(e, tuple):
                    # Terminal event.  NOTE(review): tuple(e[0]) keeps only
                    # e's first element — presumably the (tag, word) pair;
                    # confirm against gen_pcfg_events_radu.
                    e=tuple(e[0])
                    # warn("sblm_ngram train terminal",e,max=10)
                    self.terminals.count_tw(e)
                else:
                    if len(e)==0: continue
                    # Nonterminal event: parent label followed by children.
                    p=e[0]
                    sent=self.sent_for_event(p,e[1:])
                    if self.cond_parent:
                        # Walk the parent marker right-to-left through sent,
                        # counting c[0]..c[i-1] PARENT c[i] for every i.
                        i=len(sent)
                        sent.append(None)
                        while i>1:
                            sent[i]=sent[i-1]
                            sent[i-1]=p
                            # warn("read_radu cond_parent","%s => %s i=%s"%(p,sent[:i+1],i))
                            self.ng.count_word(sent,i)
                            i-=1
                    else:
                        self.ng.count_text(sent,i=1)
                        if self.parent:
                            # Also count into the per-parent ngram model,
                            # created lazily on first sight of this parent.
                            pngs=self.png
                            if p not in pngs:
                                pn=ngram(self.order,digit2at=False)
                                pngs[p]=pn
                            else:
                                pn=pngs[p]
                            pn.count_text(sent,i=1)
    return n
#Date: 5/13/12
# Train 1- to 4-gram language models on WSJ data (directory 14) and report
# perplexity on both the training set and a held-out testing set (directory 15).
import sys
from ngram import *
from ngram_util import *

#-----------------------------------------------------------------------START - Load the training set
sys.stdout.write("Loading training set...")
trainingSentences = getWSJDirectories(14, "c:/wsj/")
print("complete!")
#-----------------------------------------------------------------------END - Load the training set

#-----------------------------------------------------------------------START - Load the testing set
sys.stdout.write("Loading testing set...")
testingSentences = getWSJDirectories(15, "c:/wsj/", 1)
print("complete!")
#-----------------------------------------------------------------------END - Load the testing set

#-----------------------------------------------------------------------START - Evaluate the 1-4-gram models
print("\n------------------------N-grams:\n")
for n in [1, 2, 3, 4]:
    model = ngram(n, 0.0001)
    # str()/repr() replace the backtick repr syntax, which Python 3 removed.
    sys.stdout.write("Training " + str(n) + "-gram language model...")
    model.train(trainingSentences)
    print("complete!")
    sys.stdout.write("Evaluating model on training set...")
    print("Perplexity: " + repr(model.evaluate(trainingSentences)))
    sys.stdout.write("Evaluating model on testing set...")
    print("Perplexity: " + repr(model.evaluate(testingSentences)))
    print()
#-----------------------------------------------------------------------END - Evaluate the 1-4-gram models