def __init__(self, n, alpha):
    """Build a pair of directional n-gram models.

    n: model order; must coerce to an integer >= 1.
    alpha: smoothing constant passed through to the underlying ngram models.
    """
    import sys
    self.n = int(n)
    if self.n < 1:
        # quit() comes from the site module and is meant only for the
        # interactive interpreter; sys.exit(1) raises the same SystemExit(1).
        print("Error in ngram constructor: n must be an integer greater than 0")
        sys.exit(1)
    # "lr" and "rl" models; the third ngram arg presumably selects the
    # reversed (right-to-left) direction — confirm against the ngram class.
    self.models = {"lr": ngram(n, alpha), "rl": ngram(n, alpha, True)}
    # n-1 start-of-sentence symbols used to pad the initial context.
    self.filler = ["<s>" for i in range(self.n - 1)]
    # Interpolation weights combining the two directional models.
    self.lambda1 = .5
    self.lambda2 = .5
def getNgramDistance(f1, f2, n):
    """Edit distance between the n-gram sets of two mnemonic sequences.

    f1, f2: dicts each carrying a 'mnemonics' sequence.
    n: requested n-gram order; capped at the shorter sequence's length.
    """
    seq_a = f1['mnemonics']
    seq_b = f2['mnemonics']
    # Compare only up to the shorter sequence's length.
    common = min(len(seq_a), len(seq_b))
    seq_a, seq_b = seq_a[:common], seq_b[:common]
    # The order can never exceed the sequences being modelled.
    order = min(n, common)
    model_a = ngram(seq_a, order)
    model_b = ngram(seq_b, order)
    return ngramset_edit_distance(model_a.ngramSet, model_b.ngramSet)
def getNgramDistance(f1, f2, n):
    """Return (n-gram-set edit distance, wall-clock seconds spent).

    Sequences are truncated to the shorter length; the order n is capped
    at that length as well.
    """
    start = time.time()
    seq_a, seq_b = f1['mnemonics'], f2['mnemonics']
    limit = min(len(seq_a), len(seq_b))
    seq_a, seq_b = seq_a[:limit], seq_b[:limit]
    order = n if limit >= n else limit
    dist = ngramset_edit_distance(ngram(seq_a, order).ngramSet,
                                  ngram(seq_b, order).ngramSet)
    return dist, time.time() - start
def getNgramDistance(f1, f2, n):
    """Return (distance, variance, sim2, sim3, indexes, elapsed seconds)
    for the n-gram sets of two mnemonic sequences.

    Sequences are truncated to the shorter length and the order n is
    capped at that length.
    """
    # time.clock() was deprecated in Python 3.3 and removed in 3.8;
    # perf_counter() is its documented replacement for interval timing.
    start = time.perf_counter()
    mnemonics1, mnemonics2 = f1['mnemonics'], f2['mnemonics']
    length = min(len(mnemonics1), len(mnemonics2))
    mnemonics1, mnemonics2 = mnemonics1[:length], mnemonics2[:length]
    if length < n:
        n = length
    ngram1 = ngram(mnemonics1, n)
    ngram2 = ngram(mnemonics2, n)
    ngram_distance, ngram_var, sim2, sim3, indexes = ngramset_edit_distance(
        ngram1.ngramSet, ngram2.ngramSet)
    return ngram_distance, ngram_var, sim2, sim3, indexes, time.perf_counter() - start
def nbest_sblm_main(lm='/home/nlg-02/pust/v8.1zh/pysblm.sblm/sblm.pcfg.5gram.lwlm',
                    #lm='nbest.pcfg.srilm',
                    nbest='nbest.txt',
                    strip=True,
                    flatten=True,
                    num2at=True,
                    sblm_terminals=0,
                    sblm_pword=1,
                    output_nbest='',
                    maxwords=999999,
                    logp_unk=0.0,
                    closed=True,
                    greedy=True,
                    usage_=usage
                    # rest_=None
                    ):
    """Score NBEST lines from `nbest` with the language model `lm`,
    logging how many lines pass check_nbest.

    lm: path to the LM, or '' to run without one.
    output_nbest: path for the filtered output, or '' to disable.
    """
    lm = None if lm == '' else ngram(lm=lm, closed=closed)
    # Bug fix: the original called lm.set_logp_unk() unconditionally, which
    # raised AttributeError whenever lm=='' (lm is None in that case).
    if lm is not None:
        lm.set_logp_unk(logp_unk)
    output_nbest = None if output_nbest == '' else open(output_nbest, 'w')
    n = 0
    ng = 0
    for l in open(nbest):
        if l.startswith("NBEST sent="):
            n += 1
            if check_nbest(l, lm, sblm_terminals, sblm_pword, strip, flatten,
                           num2at, output_nbest, maxwords, n, greedy):
                ng += 1
    # NOTE(review): output_nbest is never closed here; confirm whether
    # check_nbest/info_summary rely on it staying open before adding close().
    info_summary()
    log("%s good out of %s NBEST lines" % (ng, n))
def produceExample(exampleStr):
    """Print LaTeX fragments for the n-grams of exampleStr, plus a
    frequency-of-frequency table (Good-Turing style r / N_r counts).
    """
    tg = ngram([exampleStr])
    # Avoid shadowing the builtin `list` (the original bound it to the grams).
    (grams, gram_list) = tg.ngramify([exampleStr])
    printList = []
    for item in gram_list:
        stri = "\ngr{" + item + "}"
        # Bug fix: str.replace returns a new string; the original discarded
        # both results and then appended the builtin `str` instead of `stri`.
        stri = stri.replace(" ", "\\_")
        stri = stri.replace("'", "")
        printList.append(stri)
    print(printList)
    # Frequency of each gram, then frequency of those frequencies
    # (dict.get replaces the Python-2-only has_key() branches).
    freqDict = {}
    for item in gram_list:
        freqDict[item] = freqDict.get(item, 0) + 1
    freqfreqDict = {}
    for value in freqDict.values():
        freqfreqDict[value] = freqfreqDict.get(value, 0) + 1
    print("\\begin{tabular}{|cc|}")
    print("\\hline")
    print("Frequency & Frequency of frequency \\\\")
    print("\\hline")
    print("r & N_r \\\\")
    for key in freqfreqDict.keys():
        print(str(key) + " & " + str(freqfreqDict[key]) + "\\\\")
    print("\\hline")
    print("\\end{tabular}")
def __init__(self, order=2, parent=False, digit2at=False, parent_alpha=0.99,
             cond_parent=False, parent_start=False, skip_bar=True, unsplit=True,
             logp_unk=0.0, witten_bo=0.1):
    """Parent-conditioned sblm n-gram model over child label sequences.

    parent: use parent_alpha*p(children|parent)+(1-parent_alpha)*p(children).
    parent_start: make the <s> symbol <s:NP> — not needed with cond_parent
        or parent.
    cond_parent: score ngram c[0]...c[i-1] PARENT c[i] for each i, backing
        off to c[i]|PARENT and c[i]; overrides parent (both aim at the
        same purpose).
    """
    # Flags controlling how the parent label enters the model.
    self.parent_start = parent_start
    self.cond_parent = cond_parent
    # Distinct ngrams per parent, backing off to the indistinct model.
    self.parent = parent
    self.digit2at = digit2at
    self.order = order
    # Parent-indistinct ngram model; per-parent models collect in self.png.
    self.ng = ngram(order, digit2at=False, logp_unk=logp_unk)
    self.png = dict()
    # tag_word_unigram is functionally equivalent to a bigram lm for
    # terminals; could be simplified to use an ngram.
    self.terminals = tag_word_unigram(bo=witten_bo, digit2at=digit2at,
                                      logp_unk=logp_unk)
    self.unsplit = unsplit
    self.skip_bar = skip_bar
    self.unsplit_map = strip_subcat if unsplit else identity
    self.skip_map = no_bar if skip_bar else identity
    # Late-bound composition: later changes to the two maps take effect.
    self.label_map = lambda lab: self.skip_map(self.unsplit_map(lab))
    self.set_parent_alpha(parent_alpha)
def nbest_sblm_main(
        lm='/home/nlg-02/pust/v8.1zh/pysblm.sblm/sblm.pcfg.5gram.lwlm',
        #lm='nbest.pcfg.srilm',
        nbest='nbest.txt',
        strip=True,
        flatten=True,
        num2at=True,
        sblm_terminals=0,
        sblm_pword=1,
        output_nbest='',
        maxwords=999999,
        logp_unk=0.0,
        closed=True,
        greedy=True,
        usage_=usage
        # rest_=None
):
    """Score NBEST lines from `nbest` with the language model `lm`,
    logging how many lines pass check_nbest.

    lm: path to the LM, or '' to run without one.
    output_nbest: path for the filtered output, or '' to disable.
    """
    lm = None if lm == '' else ngram(lm=lm, closed=closed)
    # Bug fix: the original called lm.set_logp_unk() unconditionally, which
    # raised AttributeError whenever lm=='' (lm is None in that case).
    if lm is not None:
        lm.set_logp_unk(logp_unk)
    output_nbest = None if output_nbest == '' else open(output_nbest, 'w')
    n = 0
    ng = 0
    for l in open(nbest):
        if l.startswith("NBEST sent="):
            n += 1
            if check_nbest(l, lm, sblm_terminals, sblm_pword, strip, flatten,
                           num2at, output_nbest, maxwords, n, greedy):
                ng += 1
    # NOTE(review): output_nbest is never closed here; confirm whether
    # check_nbest/info_summary rely on it staying open before adding close().
    info_summary()
    log("%s good out of %s NBEST lines" % (ng, n))
def read_radu(self, infile):
    """Count PCFG events from a Radu-format treebank.

    infile: a path string (opened here) or an already-open file; one tree
    per line.  Returns the total number of tree nodes processed.
    """
    if isinstance(infile, str):
        infile = open(infile)
    n = 0
    for line in infile:
        t = self.tree_from_line(line)
        if t is not None:
            n += t.size()
            for e in gen_pcfg_events_radu(t, terminals=False, digit2at=self.digit2at):
                if isinstance(e, tuple):
                    # Terminal event.  NOTE(review): tuple(e[0]) keeps only
                    # e's first element — presumably the (tag, word) pair;
                    # confirm against gen_pcfg_events_radu.
                    e = tuple(e[0])
                    # warn("sblm_ngram train terminal",e,max=10)
                    self.terminals.count_tw(e)
                else:
                    if len(e) == 0:
                        continue
                    # Nonterminal event: parent label followed by children.
                    p = e[0]
                    sent = self.sent_for_event(p, e[1:])
                    if self.cond_parent:
                        # Walk the parent marker right-to-left through sent,
                        # counting c[0]..c[i-1] PARENT c[i] for every i.
                        i = len(sent)
                        sent.append(None)
                        while i > 1:
                            sent[i] = sent[i - 1]
                            sent[i - 1] = p
                            # warn("read_radu cond_parent","%s => %s i=%s"%(p,sent[:i+1],i))
                            self.ng.count_word(sent, i)
                            i -= 1
                    else:
                        self.ng.count_text(sent, i=1)
                        if self.parent:
                            # Also count into the per-parent ngram model,
                            # created lazily on first sight of this parent.
                            pngs = self.png
                            if p not in pngs:
                                pn = ngram(self.order, digit2at=False)
                                pngs[p] = pn
                            else:
                                pn = pngs[p]
                            pn.count_text(sent, i=1)
    return n
def __init__(self,order=2,parent=False,digit2at=False,parent_alpha=0.99,cond_parent=False,parent_start=False,skip_bar=True,unsplit=True,logp_unk=0.0,witten_bo=0.1):
    """Parent-conditioned sblm n-gram model over child label sequences.

    parent: use parent_alpha*p(children|parent)+(1-parent_alpha)*p(children)

    parent_start: make the <s> symbol <s:NP> - not needed if you use cond_parent or parent

    cond_parent: ngram c[0]...c[i-1] PARENT c[i] for each i you score. backs off to c[i]|PARENT and c[i]. overrides parent. (this is fine; they aim toward the same purpose)
    """
    self.parent_start=parent_start
    self.cond_parent=cond_parent
    self.parent=parent #distinct ngrams for each parent; backoff to indistinct
    self.digit2at=digit2at
    self.order=order
    # parent-indistinct ngram model; per-parent models collect in self.png
    self.ng=ngram(order,digit2at=False,logp_unk=logp_unk)
    # per-parent ngram models, filled lazily (see read_radu) when self.parent
    self.png=dict()
    self.terminals=tag_word_unigram(bo=witten_bo,digit2at=digit2at,logp_unk=logp_unk) #simplify: use an ngram for terminals. tag_word_unigram is functionally equiv to bigram lm anyway
    self.unsplit=unsplit
    self.skip_bar=skip_bar
    self.unsplit_map=strip_subcat if unsplit else identity
    self.skip_map=no_bar if skip_bar else identity
    # late-bound composition: later changes to the two maps take effect
    self.label_map=lambda l: self.skip_map(self.unsplit_map(l))
    self.set_parent_alpha(parent_alpha)
def read_radu(self,infile):
    """Count PCFG events from a Radu-format treebank.

    infile: a path string (opened here) or an already-open file; one tree
    per line.  Returns the total number of tree nodes processed.
    """
    if isinstance(infile, str):
        infile=open(infile)
    n=0
    for line in infile:
        t=self.tree_from_line(line)
        if t is not None:
            n+=t.size()
            for e in gen_pcfg_events_radu(t,terminals=False,digit2at=self.digit2at):
                if isinstance(e, tuple):
                    # Terminal event.  NOTE(review): tuple(e[0]) keeps only
                    # e's first element — presumably the (tag, word) pair;
                    # confirm against gen_pcfg_events_radu.
                    e=tuple(e[0])
                    # warn("sblm_ngram train terminal",e,max=10)
                    self.terminals.count_tw(e)
                else:
                    if len(e)==0: continue
                    # Nonterminal event: parent label followed by children.
                    p=e[0]
                    sent=self.sent_for_event(p,e[1:])
                    if self.cond_parent:
                        # Walk the parent marker right-to-left through sent,
                        # counting c[0]..c[i-1] PARENT c[i] for every i.
                        i=len(sent)
                        sent.append(None)
                        while i>1:
                            sent[i]=sent[i-1]
                            sent[i-1]=p
                            # warn("read_radu cond_parent","%s => %s i=%s"%(p,sent[:i+1],i))
                            self.ng.count_word(sent,i)
                            i-=1
                    else:
                        self.ng.count_text(sent,i=1)
                        if self.parent:
                            # Also count into the per-parent ngram model,
                            # created lazily on first sight of this parent.
                            pngs=self.png
                            if p not in pngs:
                                pn=ngram(self.order,digit2at=False)
                                pngs[p]=pn
                            else:
                                pn=pngs[p]
                            pn.count_text(sent,i=1)
    return n
#Date: 5/13/12
# Train 1- to 4-gram language models on WSJ data (directory 14) and report
# perplexity on both the training set and a held-out testing set (directory 15).
import sys
from ngram import *
from ngram_util import *

#-----------------------------------------------------------------------START - Load the training set
sys.stdout.write("Loading training set...")
trainingSentences = getWSJDirectories(14, "c:/wsj/")
print("complete!")
#-----------------------------------------------------------------------END - Load the training set

#-----------------------------------------------------------------------START - Load the testing set
sys.stdout.write("Loading testing set...")
testingSentences = getWSJDirectories(15, "c:/wsj/", 1)
print("complete!")
#-----------------------------------------------------------------------END - Load the testing set

#-----------------------------------------------------------------------START - Evaluate the 1-4-gram models
print("\n------------------------N-grams:\n")
for n in [1, 2, 3, 4]:
    model = ngram(n, 0.0001)
    # str()/repr() replace the backtick repr syntax, which Python 3 removed.
    sys.stdout.write("Training " + str(n) + "-gram language model...")
    model.train(trainingSentences)
    print("complete!")
    sys.stdout.write("Evaluating model on training set...")
    print("Perplexity: " + repr(model.evaluate(trainingSentences)))
    sys.stdout.write("Evaluating model on testing set...")
    print("Perplexity: " + repr(model.evaluate(testingSentences)))
    print()
#-----------------------------------------------------------------------END - Evaluate the 1-4-gram models