def process_train_data(v: int, n: int, delta: float, vocab_size: int,
                       train_file: str) -> Ngram:
    """
    Wrapper for training-data processing: either loads a previously serialized
    Ngram model or generates a new one from the provided training data.
    :param v: Vocabulary choice
    :param n: ngram choice
    :param delta: Smoothing choice
    :param vocab_size: The size of the vocabulary
    :param train_file: Path to training data
    :return: Ngram
    """
    ngrams = Ngram(n)
    if ds.data_ser_exists(v, n, delta):
        print("Model with parameters already stored. Retrieving")
        ngrams = ds.data_ser_load(v, n, delta)
    else:
        print(
            "Model with parameters not stored. Generating model from provided training data"
        )
        train_data = pd.read_csv(train_file,
                                 delimiter='\t',
                                 names=[
                                     DF_COLUMN_ID, DF_COLUMN_NAME,
                                     DF_COLUMN_LANG, DF_COLUMN_TWEET
                                 ])
        transform_to_vocab(train_data, v)
        print("Shape of Training Data (Rows, Columns) => {}".format(
            train_data.shape))
        ngrams.generate(train_data, delta, vocab_size)
        ds.data_ser_save(ngrams, v, n, delta)
    return ngrams
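A minimal usage sketch for the wrapper above, under the assumption that the surrounding module (the `ds` serialization helpers and the `DF_COLUMN_*` constants) is importable as in the original project; the argument values and file name are illustrative only.

# Hypothetical call to process_train_data; v, n, delta, vocab_size and the
# training file path are made-up example values, not project defaults.
if __name__ == '__main__':
    model = process_train_data(v=1, n=3, delta=0.5, vocab_size=26,
                               train_file='training-tweets.txt')
    print(model)  # the trained (or deserialized) Ngram instance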
Example #2
    def getNgramsByWord(self, word, ngramSize):
        if not ngramSize:
            return []

        term = Ngram(word, ngramSize)
        if term.deriveNgrams():
            return term.getNgrams()
        else:
            return []
Example #3
 def __init__(self, maxNgramcount, content):
     self.maxNgramcount = maxNgramcount
     self.corpus = content
     print("Hece gramları oluşturuluyor")
     self.ngrams_hece = [(Ngram(i + 1, self.corpus, "hece", self))
                         for i in range(self.maxNgramcount)]
     print("---------------------------")
     print("Harf gramları oluşturuluyor")
     self.ngrams_harf = [(Ngram(i + 1, self.corpus, "harf", self))
                         for i in range(self.maxNgramcount)]
Example #4
    def exec_second(self, parole):

        e = EditDistance()
        a = Ngram()

        tempi = []
        n_vicine_trovate = []

        for parola in parole:
            with open('60000_parole_italiane.txt', 'r') as f:
                # print 'parola --> ', parola

                # edit distance
                # print '----- EDIT DISTANCE'
                e_results = []
                start = timer()
                for line in f:
                    p = line.rstrip()
                    _, op = e.edit_distance(parola, p)
                    costo = e.op_sequence(op, len(parola) - 1, len(p) - 1, [])
                    if costo < self.sogliaCosto:
                        e_results.append((p, costo))
                end = timer()

                time_edit = end - start
                n_edit = len(e_results)

                # print 'risultati (%s)' % n_edit,  '-->', sorted(e_results, key=get(1))
                # print 'tempo -->', time_edit

                # ngrams
                # print '----- NGRAMS'
                g_results = []
                b = a.ngram(parola, self.numberOfGrams)
                with open("%s_grams.txt" % self.numberOfGrams, 'r') as r:
                    start = timer()
                    for line in r:
                        s = line.split(' -> ')
                        p, g = s[0], s[1]
                        sim = a.jaccard(b, g)  # avoid rebinding the outer file handle `f`
                        if sim > self.sogliaJaccard:
                            g_results.append((p, sim))
                    end = timer()

                time_gram = end - start
                n_gram = len(g_results)

                # print 'risultati (%s)' % n_gram, '-->', sorted(g_results, key=get(1), reverse=True)
                # print 'tempo -->', time_gram
                # print '\n'

                tempi.append([time_edit, time_gram])
                n_vicine_trovate.append([n_edit, n_gram])

        return [tempi, n_vicine_trovate]
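exec_second above relies on Ngram.ngram to produce a character n-gram representation and Ngram.jaccard to score two of them; those methods are not shown here. The self-contained sketch below illustrates the usual definitions they are assumed to follow (it is not the project's Ngram class).

# Assumed behaviour of the n-gram/Jaccard pair used above, for illustration only.
def char_ngrams(word, n):
    # set of character n-grams of `word`
    return {word[i:i + n] for i in range(len(word) - n + 1)}

def jaccard(set_a, set_b):
    # |intersection| / |union|, defined as 0 for two empty sets
    union = set_a | set_b
    return len(set_a & set_b) / float(len(union)) if union else 0.0

print(jaccard(char_ngrams('parola', 2), char_ngrams('parole', 2)))  # ~0.67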
def readFile():
    uni_tag = Ngram()
    bi_tag = Ngram()
    tri_tag = Ngram()
    for s in stdin:
        tags = []
        words = []
        if s != '' and s != '\n':
            s = '<s>/BOS <s>/BOS '+s.rstrip()+' </s>/EOS'
            lst = s.split()
            for wt in lst:  #parse words and tags
                t = wt.split('/')[-1]
                tags.append(t)
                idx = wt.rfind(t)
                w = wt[:idx-1]
                words.append(w)
            for i in range(len(tags)):#go through the training data,add counts of tag ngrams and tag-word co-occurences
                t = tags[i]
                w = words[i]
                uni_tag.addEntry(t,w)
                if i <= len(tags)-2:
                    seq = ' '.join(tags[i:i+2])
                    bi_tag.addCount(seq)
                    if i <= len(tags)-3:
                        seq2 = ' '.join(tags[i:i+3])
                        tri_tag.addCount(seq2)
    return uni_tag,bi_tag,tri_tag
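The word/tag split in readFile takes the tag to be everything after the last '/', so words that themselves contain slashes survive intact. A tiny illustration with a made-up tagged token:

wt = '3/4/CD'              # hypothetical word/tag token
t = wt.split('/')[-1]      # 'CD'
w = wt[:wt.rfind(t) - 1]   # '3/4'
print(w, t)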
Example #6
    def setUp(self):
        ngram_after_1 = self.mock_ngram('bar', 1, 2, 3)
        ngram_after_2 = self.mock_ngram('baz', 7, 8, 9)

        self.ngram = Ngram('foo', 1, 1)

        self.ngram.after[0][ngram_after_1.string] = ngram_after_1
        self.ngram.after[0][ngram_after_2.string] = ngram_after_2

        self.ngram.before[0][ngram_after_1.string] = ngram_after_1
        self.ngram.before[0][ngram_after_2.string] = ngram_after_2
Example #7
 def parse(self, text):
     tokens = re.split('\s+', text)
     for wnum in xrange(0, len(tokens)):
         for ng_ord in xrange(1, self.max_order + 1):
             if wnum + ng_ord < len(tokens):
                 words_tuple = tuple(tokens[wnum:wnum + ng_ord])
                 ngram = self.storage_.get_n_gram(words_tuple)
                 if ngram is None:
                     ngram = Ngram(1)
                 else:
                     ngram.count = ngram.count + 1
                 self.storage_.set_n_gram(words_tuple, ngram)
Example #9
def test_hash_fn():
    ngram1 = Ngram('a-rose-is')
    ngram2 = Ngram('rose-is-a')
    assert ngram1.__hash__() != ngram2.__hash__(), 'the two hashes should not be the same'
    print 'Ngrams with different string values give different hashes... ok'

    ngram2.value = 'a-rose-is'
    assert ngram1.__hash__() == ngram2.__hash__(), 'the two hashes should be the same'
    print 'Ngrams with the same string values give the same hash... ok'
def data_ser_load(v: int, n: int, delta: float):
    """
    Loads the Ngram object, initializing a DataFrame for each language from its pickled file.
    :param v: Vocabulary for the model
    :param n: Ngram size for the model
    :param delta: Delta value for the model
    :return ngrams: Ngram object.
    """
    ngrams = Ngram(n)
    for lang in LANGUAGES:
        ngrams.ngrams[lang] = pd.read_pickle(
            TRAINING_FILE_TEMPLATE.format(lang, v, n, delta))
    return ngrams
Example #11
    def exec_fifth(self):

        e = EditDistance()
        a = Ngram()

        originale = raw_input("**** Enter a word --> ")
        parola = self.storpia(originale)
        print '**** Mangled word -->', parola

        # edit distance
        print '----- EDIT DISTANCE'
        # costs: 1, 2, 3, 4, 5
        for c in range(1, 6):
            with open('60000_parole_italiane.txt', 'r') as f:
                e_results = []
                for line in f:
                    p = line.rstrip()
                    _, op = e.edit_distance(parola, p)
                    costo = e.op_sequence(op, len(parola) - 1, len(p) - 1, [])
                    if costo < c:
                        e_results.append((p, costo))
                if any(originale in res for res in e_results):
                    w = 'original word found!'
                else:
                    w = 'original word not found!'
                print w, '(cost threshold %s, %s results)' % (
                    c, len(e_results)), '-->', sorted(e_results, key=get(1))

        # ngram
        print '----- NGRAM'
        b = a.ngram(parola, self.numberOfGrams)
        # coefficients: 0.5, 0.6, 0.7, 0.8, 0.9
        for j in np.arange(0.5, 1.0, 0.1):
            with open("%s_grams.txt" % self.numberOfGrams, 'r') as f:
                g_results = []
                for line in f:
                    s = line.split(' -> ')
                    p, g = s[0], s[1]
                    sim = a.jaccard(b, g)  # avoid shadowing the file handle `f`
                    if sim > j:
                        g_results.append((p, sim))
                if any(originale in res for res in g_results):
                    w = 'original word found!'
                else:
                    w = 'original word not found!'
                print w, '(jaccard %s, %s results)' % (
                    j, len(g_results)), '-->', sorted(g_results,
                                                      key=get(1),
                                                      reverse=True)
Example #12
    def getNgramsByLine(self, ngramSize):
        if not ngramSize:
            return []

        occurency = []

        # split the given text into single lines
        lines = self.splitParagraph()
        for line in lines:
            term = Ngram(line, ngramSize)
            if term.deriveNgrams():
                occurency.append(term.getNgrams())
            else:
                occurency.append([])
        return occurency
Example #13
	def ngramStemmer (self, wordList, size, equality):
		"reduces wordList according to the n-gram stemming method"
		
		# use return_list and stop_list for the terms to be removed, later
		returnList = []
		stopList = []
		ngramAdvas = Advas("","")

		# calculate length and range
		listLength = len(wordList)
		outerListRange = range(0, listLength)
		
		for i in outerListRange:
			term1 = wordList[i]
			innerListRange = range (0, i)
			
			# define basic n-gram object
			term1Ngram = Ngram(term1, 2)
			term1Ngram.deriveNgrams()
			term1NgramList = term1Ngram.getNgrams()

			for j in innerListRange:
				term2 = wordList[j]
				term2Ngram = Ngram(term2, 2)
				term2Ngram.deriveNgrams()
				term2NgramList = term2Ngram.getNgrams()
				
				# calculate n-gram value
				ngramSimilarity = ngramAdvas.compareNgramLists (term1NgramList, term2NgramList)
	
				# compare
				degree = ngramSimilarity - equality
				if (degree>0):
					# ... these terms are so similar that they can be conflated
					# remove the longer term, keep the shorter one
					if (len(term2)>len(term1)):
						stopList.append(term2)
					else:
						stopList.append(term1)
					# end if
				# end if
			# end for
		# end for

		# conflate the matrix
		# remove all the items which appear in stopList
		return list(set(wordList) - set(stopList))
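ngramStemmer above delegates the similarity computation to Advas.compareNgramLists, which is not shown. A common measure for this kind of bigram comparison is the Dice coefficient; the stand-in below is an assumption for illustration, not the Advas implementation.

# Assumed stand-in for compareNgramLists: Dice coefficient over two n-gram lists.
def dice_similarity(ngrams1, ngrams2):
    set1, set2 = set(ngrams1), set(ngrams2)
    if not set1 and not set2:
        return 0.0
    return 2.0 * len(set1 & set2) / (len(set1) + len(set2))

print(dice_similarity(['wo', 'or', 'rd'], ['wo', 'or', 'rd', 'ds']))  # 0.857...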
		 
Example #14
def do(n):
    with open('60000_parole_italiane.txt', 'r') as f:
        with open("%s_grams.txt" % n, 'w+') as r:
            for line in f:
                p = line.rstrip()
                g = Ngram().ngram(p, n)
                r.write("%s -> %s\n" % (p, g))
Example #15
def main():
  from ngram import Ngram
  from model import Model
  from forest import Forest
  
  flags.DEFINE_integer("beam", 100, "beam size", short_name="b")
  flags.DEFINE_integer("debuglevel", 0, "debug level")
  flags.DEFINE_boolean("mert", True, "output mert-friendly info (<hyp><cost>)")
  flags.DEFINE_boolean("cube", True, "using cube pruning to speedup")
  flags.DEFINE_integer("kbest", 1, "kbest output", short_name="k")
  flags.DEFINE_integer("ratio", 3, "the maximum items (pop from PQ): ratio*b", short_name="r")
  

  argv = FLAGS(sys.argv)

  weights = Model.cmdline_model()
  lm = Ngram.cmdline_ngram()
  
  false_decoder = CYKDecoder(weights, lm)
  
  def non_local_scorer(cedge, ders):
    (lmsc, alltrans, sig) = false_decoder.deltLMScore(cedge.lhsstr, ders)
    fv = Vector()
    fv["lm"] = lmsc
    return ((weights.dot(fv), fv), alltrans, sig)
  cube_prune = CubePruning(FeatureScorer(weights), non_local_scorer, FLAGS.k, FLAGS.ratio)

  for i, forest in enumerate(Forest.load("-", is_tforest=True, lm=lm), 1):
    a = false_decoder.beam_search(forest, b = FLAGS.beam)
    b = cube_prune.run(forest.root)

    assert a[0] == b[0].score[0]  # beam-search and cube-pruning scores should agree
    assert a[1] == b[0].score[1]
    print a
    print b[0]
Example #16
    def add_ngram(self, ngram, tree=None):
        tree = self.tree if tree is None else tree

        if ngram in tree:
            tree[ngram].count += 1
        else:
            tree[ngram] = Ngram(ngram, self.hindsight, self.foresight)
Example #17
    def eat_token_string(self, s, max_reach=2, max_ngram_size=2):
        for ngram_size in range(1, max_ngram_size + 1):
            for i in range(len(s)):
                start = i
                end = i + ngram_size
                if start >= 0 and end < len(s) + 1:
                    before, current, after = s[:start], s[start:end], s[end:]

                    if len(current) == 1:
                        self.wordcount += 1

                    ngram = " ".join(current)

                    if ngram in self.tree:
                        self.tree[ngram].count += 1
                    else:
                        self.tree[ngram] = Ngram(ngram, 1, max_reach)

                    for reach in range(1, max_reach + 1):

                        # update dictionary to reflect all words occurring after this ngram
                        try:
                            word = after[reach - 1]
                            #print 'after "%s" is "%s" with reach %s' % (ngram, word, reach)
                            self.tree[ngram].add_after(word, reach, 1)
                        except IndexError:
                            pass
                        """
Example #18
    def add_ngram(self, ngram, tree=None):
        """Adds an ngram to a given tree"""
        tree = self.tree if tree is None else tree

        if ngram in tree:
            tree[ngram].count += 1
        else:
            tree[ngram] = Ngram(ngram, self.hindsight, self.foresight)
Example #19
    def exec_third(self):
        e = EditDistance()
        a = Ngram()

        costi = []
        coefficienti = []
        risultati_edit = []
        risultati_gram = []

        parola = raw_input("**** Enter a word --> ")

        # edit distance
        # print '----- EDIT DISTANCE'
        # costs: 1, 2, 3, 4, 5
        for c in range(1, 6):
            costi.append(c)
            with open('60000_parole_italiane.txt', 'r') as f:
                e_results = []
                for line in f:
                    p = line.rstrip()
                    _, op = e.edit_distance(parola, p)
                    costo = e.op_sequence(op, len(parola) - 1, len(p) - 1, [])
                    if costo < c:
                        e_results.append((p, costo))
                risultati_edit.append(len(e_results))
                # print 'ho trovato %s risultati per soglia costo %s' % (len(e_results), c), '-->', sorted(e_results, key=get(1))

        # ngram
        # print '----- NGRAM'
        b = a.ngram(parola, self.numberOfGrams)
        # coefficients: 0.5, 0.6, 0.7, 0.8, 0.9
        for j in np.arange(0.5, 1.0, 0.1):
            coefficienti.append(j)
            with open("%s_grams.txt" % self.numberOfGrams, 'r') as f:
                g_results = []
                for line in f:
                    s = line.split(' -> ')
                    p, g = s[0], s[1]
                    sim = a.jaccard(b, g)  # avoid shadowing the file handle `f`
                    if sim > j:
                        g_results.append((p, sim))
                risultati_gram.append(len(g_results))
                # print 'ho trovato %s risultati per jaccard maggiore di %s' % (len(g_results), j), '-->', sorted(g_results, key=get(1), reverse=True)

        return [costi, coefficienti, risultati_edit, risultati_gram]
Example #20
 def calculate_ngrams(self, doc, length=3):
     num_terms = len(doc)
     ngrams = []
     for t in xrange(num_terms):
         if num_terms <= t + length - 1:
             break  # n-2 ngrams!
         ngram_tokens = doc[t:t + length]
         ngram_value = "-".join(ngram_tokens)
         ngram = Ngram(ngram_value)
         ngrams.append(ngram)
     return ngrams
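The break condition in calculate_ngrams stops once fewer than `length` terms remain, so a document of num_terms tokens yields num_terms - length + 1 n-grams (the "n-2 ngrams" comment, for length 3). A quick check with made-up tokens:

doc = ['a', 'rose', 'is', 'a', 'rose']
print(["-".join(doc[t:t + 3]) for t in range(len(doc) - 3 + 1)])
# ['a-rose-is', 'rose-is-a', 'is-a-rose']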
Example #21
    def test_simple(self):
        #        tc = unittest.TestCase()
        words1 = ("hello", "<punc>", "world")
        words2 = ("hello", "<punc>", "underworld")
        words3 = ("hello", "<punc>", "John")
        words4 = ("goodbye", "<punc>", "John")
        ng = NgramStorage(3)
        ng.set_n_gram(words1[0], Ngram(12, 0.1))
        ng.set_n_gram(words1[0:1], Ngram(10, 0.08))
        ng.set_n_gram(words1, Ngram(4, 0.02))
        ng.set_n_gram(words2, Ngram(4, 0.02))
        ng.set_n_gram(words3, Ngram(2, 0.01))
        ng.set_n_gram(words4, Ngram(8, 0.05))

        epsilon = 0.00001

        self.assertEqual(8, ng.get_n_gram(words4).count)
        self.assertLess(abs(0.05 - ng.get_n_gram(words4).prob), epsilon)
        self.assertEqual(4, ng.get_n_gram(words1).count)
        self.assertLess(abs(0.02 - ng.get_n_gram(words1).prob), epsilon)
        self.assertIsNone(ng.get_n_gram(words4[0]))
        self.assertIsNone(ng.get_n_gram(words4[0:1]))

        self.assertEqual(4, len(ng.get_n_grams(3)))

        self.assertEqual(3, ng.max_order())
Example #22
    def exec_first(self):
        with open('60000_parole_italiane.txt', 'r') as f:

            e = EditDistance()
            a = Ngram()

            lines = f.readlines()
            rand = random.randint(0, len(lines) - 1)
            word = lines[rand].rstrip()
            print 'random word -->', word

            # test edit distance
            start = timer()
            for line in lines:
                p = line.rstrip()
                if p == word:
                    break
                _, op = e.edit_distance(word, p)
                _ = e.op_sequence(op, len(word) - 1, len(p) - 1, [])
            end = timer()
            time_edit = end - start
            # print 'tempo trascorso edit distance -->', time_edit

            # test ngrams
            b = a.ngram(word, self.numberOfGrams)
            with open("%s_grams.txt" % self.numberOfGrams, 'r') as r:
                start = timer()
                for line in r:
                    s = line.split(' -> ')
                    p, g = s[0], s[1]
                    if p == word:
                        break
                    _ = a.jaccard(b, g)
                end = timer()
            time_ngram = end - start
            # print 'tempo trascorso ngrams -->', time_ngram

            return [word, time_edit, time_ngram]
Example #23
    def calcSuccVariety(self):

        # derive two-letter combinations
        ngramObject = Ngram(self.term, 2)
        ngramObject.deriveNgrams()
        ngramSet = set(ngramObject.getNgrams())

        # count appearances of the second letter
        varietyList = {}
        for entry in ngramSet:
            letter1 = entry[0]
            letter2 = entry[1]
            if letter1 in varietyList:
                items = varietyList[letter1]
                if not letter2 in items:
                    # extend the existing one
                    items.append(letter2)
                    varietyList[letter1] = items
            else:
                # create a new one
                varietyList[letter1] = [letter2]

        return varietyList
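calcSuccVariety maps each first letter of a bigram to the distinct letters that follow it. The standalone sketch below reproduces that mapping for an assumed term, with the bigrams taken as plain sliding-window character pairs:

term = 'banana'  # illustrative term; stands in for self.term above
variety = {}
for entry in {term[i:i + 2] for i in range(len(term) - 1)}:
    first, second = entry[0], entry[1]
    variety.setdefault(first, [])
    if second not in variety[first]:
        variety[first].append(second)
print(variety)  # {'b': ['a'], 'a': ['n'], 'n': ['a']} (key order may vary)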
Example #24
def main(args):
    print(f'Loading corpus from `{args.data}`...')
    corpus = Corpus(args.data,
                    order=args.order,
                    lower=args.lower,
                    max_lines=args.max_lines)
    model = Ngram(order=args.order)
    name = f'{args.name}.{args.order}gram'

    print('Example data:')
    print('Train:', corpus.train[:20])
    print('Valid:', corpus.valid[:20])

    print('Training model...')
    model.train(corpus.train,
                add_k=args.add_k,
                interpolate=args.interpolate,
                backoff=args.backoff)
    print(f'Vocab size: {len(model.vocab):,}')

    if args.save_arpa:
        print(f'Saving model to `{name}`...')
        model.save_arpa(name)

    assert model.sum_to_one(n=10)

    print('Generating text...')
    text = model.generate(100)
    text = ' '.join(text)
    path = os.path.join(args.out, f'generated.{name}.txt')
    print(text)
    with open(path, 'w') as f:
        print(text, file=f)

    if model.is_smoothed:
        print('\nPredicting test set NLL...')
        logprob = model(corpus.test)
        nll = -logprob / len(corpus.test)
        print(f'Test NLL: {nll:.2f} | Perplexity {exp(nll):.2f}')
        path = os.path.join(args.out, f'result.{name}.txt')
        with open(path, 'w') as f:
            print(f'Test NLL: {nll:.2f} | Perplexity {exp(nll):.2f}', file=f)
    else:
        exit(
            'No evaluation with unsmoothed model: probability is probably 0 anyways.'
        )
Example #25
    def exec_fourth(self):
        a = Ngram()

        risultati = []

        # parola = raw_input("**** Inserisci parola --> ")

        parole = []
        with open('60000_parole_italiane.txt', 'r') as f:
            lines = f.readlines()
            for i in range(3):
                rand = random.randint(0, len(lines) - 1)
                word = lines[rand].rstrip()
                while len(word) < 5:
                    rand = random.randint(0, len(lines) - 1)
                    word = lines[rand].rstrip()
                parole.append(word)

        # grams: 2, 3, 4
        for parola in parole:
            subarray = []
            for n in range(2, 5):
                with open("%s_grams.txt" % n, 'r') as f:
                    b = a.ngram(parola, n)
                    g_results = []
                    for line in f:
                        s = line.split(' -> ')
                        p, g = s[0], s[1]
                        sim = a.jaccard(b, g)  # avoid rebinding the file handle `f` mid-iteration
                        if sim > self.sogliaJaccard:
                            g_results.append((p, sim))
                    # print 'ho trovato %s risultati per %s grammi' % (len(g_results), n), '-->', sorted(g_results, key=get(1), reverse=True)
                    subarray.append(len(g_results))
            risultati.append(subarray)

        return [parole, risultati]
Example #26
class NgramTests(unittest.TestCase):

    def mock_ngram(self, string, count, frequency, sig_score):
        ngram = Ngram(string)
        ngram.count = count
        ngram.frequency = frequency
        ngram.sig_score = sig_score
        return ngram

    def setUp(self):
        ngram_after_1 = self.mock_ngram('bar', 1, 2, 3)
        ngram_after_2 = self.mock_ngram('baz', 7, 8, 9)

        self.ngram = Ngram('foo', 1, 1)

        self.ngram.after[0][ngram_after_1.string] = ngram_after_1
        self.ngram.after[0][ngram_after_2.string] = ngram_after_2

        self.ngram.before[0][ngram_after_1.string] = ngram_after_1
        self.ngram.before[0][ngram_after_2.string] = ngram_after_2

    def test_get_after__sort_attribute_count(self):
        self.assertEqual(
            self.ngram.get_after(sort_attribute="count"),
            [('baz', 7), ('bar', 1)]
        )

    def test_get_after__sort_attribute_frequency(self):
        self.assertEqual(
            self.ngram.get_after(sort_attribute="frequency"),
            [('baz', 8), ('bar', 2)]
        )

    def test_get_after__sort_attribute_sig_score(self):
        self.assertEqual(
            self.ngram.get_after(sort_attribute="sig_score"),
            [('baz', 9), ('bar', 3)]
        )

    def test_get_before__sort_attribute_count(self):
        self.assertEqual(
            self.ngram.get_before(sort_attribute="count"),
            [('baz', 7), ('bar', 1)]
        )

    def test_get_before__sort_attribute_frequency(self):
        self.assertEqual(
            self.ngram.get_before(sort_attribute="frequency"),
            [('baz', 8), ('bar', 2)]
        )

    def test_get_before__sort_attribute_sig_score(self):
        self.assertEqual(
            self.ngram.get_before(sort_attribute="sig_score"),
            [('baz', 9), ('bar', 3)]
        )
Example #27
def test():
    ng = Ngram()

    # Your n-gram model is trained with a text file
    # ng.train('data/wiki-ja-train.word')
    ng.train(args.train_file)

    # You can save your trained model as text. Currently, we do not support loading a trained model.
    # ng.dump('trained/wiki_ja_train_trained_model', n=1)
    ng.dump('{}-{}gram'.format(args.dump_file, args.N), n=args.N)
Example #28
def make_ngram(ngrams, splited, n, n_doc):
    tmp = []
    for i in range(0, len(splited)):
        for j in range(0, n):
            if i + j < len(splited):
                tmp.append(splited[i + j])
        if len(tmp) == n:
            key = ' '.join(tmp)
            if key in ngrams:
                ngrams[key].occu_tot += 1
                ngrams[key].docs.add(n_doc)
            else:
                ngram = Ngram(tmp, n_doc)
                ngrams[key] = ngram
        tmp = []
    return ngrams
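make_ngram slides a window of n tokens over the split text and keys each window by its space-joined string. Stripped of the document bookkeeping, the window itself looks like this (tokens are made up):

splited = ['the', 'red', 'rose']
print([' '.join(splited[i:i + 2]) for i in range(len(splited) - 2 + 1)])  # n = 2
# ['the red', 'red rose']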
Example #29
    def enter_sequence(self, ngram, count, tree):
        components = ngram.split(' ')
        head = " ".join(components[:-1])
        tail = components[-1]

        if head in tree:
            tree[head].count += count
        else:
            tree[head] = Ngram(ngram, count, 1, 0)

        self.wordcount += count * len(components)

        branch = tree[head].after[0]
        if tail in branch:
            branch[tail] += count
        else:
            branch[tail] = count
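enter_sequence stores the last word of each sequence under its preceding words, so counts accumulate on the head while the tail goes into the after-branch. The split itself, on a made-up 3-gram:

components = 'the red rose'.split(' ')
head, tail = " ".join(components[:-1]), components[-1]
print(head, '|', tail)  # the red | rose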
Example #30
def main():
  from ngram import Ngram
  from model import Model
  from forest import Forest
  
  flags.DEFINE_integer("beam", 100, "beam size", short_name="b")
  flags.DEFINE_integer("debuglevel", 0, "debug level")
  flags.DEFINE_boolean("mert", True, "output mert-friendly info (<hyp><cost>)")
  flags.DEFINE_boolean("cube", True, "using cube pruning to speedup")
  flags.DEFINE_integer("kbest", 1, "kbest output", short_name="k")
  flags.DEFINE_integer("ratio", 3, "the maximum items (pop from PQ): ratio*b", short_name="r")
  

  argv = FLAGS(sys.argv)
  [outfile] = argv[1:]
  weights = Model.cmdline_model()
  lm = Ngram.cmdline_ngram()
  

  false_decoder = CYKDecoder(weights, lm)
  out = utility.getfile(outfile, 1)
  old_bleu = Bleu()
  new_bleu = Bleu()
  
  for i, forest in enumerate(Forest.load("-", is_tforest=True, lm=lm), 1):
    
    oracle_forest, oracle_item = oracle_extracter(forest, weights, false_decoder, 100, 2, extract=100)
    print >>sys.stderr, "processed sent %s " % i
    oracle_forest.dump(out)
    bleu, hyp, fv, edgelist = forest.compute_oracle(weights, 0.0, 1)

    forest.bleu.rescore(hyp)
    old_bleu += forest.bleu
    forest.bleu.rescore(oracle_item[0].full_derivation)
    new_bleu += forest.bleu

    bad_bleu, _, _, _ = oracle_forest.compute_oracle(weights, 0.0, -1)
    #for i in range(min(len(oracle_item), 5)):
     # print >>sys.stderr, "Oracle Trans: %s %s %s" %(oracle_item[i].full_derivation, oracle_item[i].score, str(oracle_item[i].score[2]))
     # print >>sys.stderr, "Oracle BLEU Score: %s"% (forest.bleu.rescore(oracle_item[i].full_derivation))
    print >>sys.stderr, "Oracle BLEU Score: %s"% (forest.bleu.rescore(oracle_item[0].full_derivation))
    print >>sys.stderr, "Worst new Oracle BLEU Score: %s"% (bad_bleu)
    print >>sys.stderr, "Old Oracle BLEU Score: %s"% (bleu)
    
    print >>sys.stderr, "Running Oracle BLEU Score: %s"% (new_bleu.compute_score())
    print >>sys.stderr, "Running Old Oracle BLEU Score: %s"% (old_bleu.compute_score())
Example #31
    def __init__(
        self,
        nnet_fname,
        scaler_fname,
        labels_fname,
        ngram_fname,
        logbase=1,
        loglevel=logging.INFO,
    ):
        self.nnet_fname = nnet_fname
        self.scaler_fname = scaler_fname
        self.labels_fname = labels_fname
        self.ngram_fname = ngram_fname
        self.logbase = logbase
        self.loglevel = loglevel
        self.loglevelname = logging._levelToName[loglevel].lower()

        Bantry.scaler = ScalerFactory(scaler_fname)
        Bantry.classifier = Classifier(nnet_fname,
                                       labels_fname,
                                       logbase=logbase)
        self.ng = Ngram(ngram_fname)
        Bantry.ngram = self.ng
        GramGraph.set_ngram(self.ng)
Example #32
#coding:utf-8
from Dictionary import Dictionary
from ngram import Ngram

    

dict1 = Dictionary("dict.txt")
while(True):
    ngram1 =Ngram(dict1)
    sentence = raw_input("please input a Chinese Sentence:").decode("cp936");

    segmap=ngram1.getSeg(sentence)
    
    for sg in segmap:
        print(sg)
    
    

    #for eachkey in segmap:
               
     #   if(isinstance(segmap[eachkey],tuple)):
      #      print (eachkey+":"+segmap[eachkey][0]+','+segmap[eachkey][1])
       # else:
        #    print (eachkey+":"+segmap[eachkey])
    #printSeg(segmap,sentence)
    
    #print segmap
Example #33
 def mock_ngram(self, string, count, frequency, sig_score):
     ngram = Ngram(string)
     ngram.count = count
     ngram.frequency = frequency
     ngram.sig_score = sig_score
     return ngram
Example #34
def main():
    gc.set_threshold(100000, 10, 10)
    flags.DEFINE_integer("beam", 100, "beam size", short_name="b")
    flags.DEFINE_integer("debuglevel", 0, "debug level")
    flags.DEFINE_boolean("mert", True, "output mert-friendly info (<hyp><cost>)")
    flags.DEFINE_boolean("cube", True, "using cube pruning to speedup")
    flags.DEFINE_integer("kbest", 1, "kbest output", short_name="k")
    flags.DEFINE_integer("ratio", 3, "the maximum items (pop from PQ): ratio*b", short_name="r")
    flags.DEFINE_boolean("dist", False, "ditributed (hadoop) training)")
    flags.DEFINE_string("prefix", "", "prefix for distributed training")
    flags.DEFINE_string("hadoop_weights", "", "hadoop weights (formatted specially)")
    flags.DEFINE_boolean("add_features", False, "add features to training data")
    flags.DEFINE_boolean("prune_train", False, "prune before decoding")
    flags.DEFINE_boolean("no_lm", False, "don't use the unigram language model")
    flags.DEFINE_boolean("pickleinput", False, "assumed input is pickled")
    flags.DEFINE_string("oracle_forests", None, "oracle forests", short_name="o")
    flags.DEFINE_string("feature_map_file", None, "file with the integer to feature mapping (for lbfgs)")
    flags.DEFINE_boolean("cache_input", False, "cache input sentences (only works for pruned input)")
    flags.DEFINE_string("rm_features", None, "list of features to remove")
    flags.DEFINE_boolean("just_basic", False, "remove all features but basic")

    argv = FLAGS(sys.argv)

    if FLAGS.weights:
        weights = Model.cmdline_model()
    else:
        vector = Vector()
        assert glob.glob(FLAGS.hadoop_weights)
        for file in glob.glob(FLAGS.hadoop_weights):
            for l in open(file):
                if not l.strip():
                    continue
                f, v = l.strip().split()
                vector[f] = float(v)
        weights = Model(vector)

    rm_features = set()
    if FLAGS.rm_features:
        for l in open(FLAGS.rm_features):
            rm_features.add(l.strip())

    lm = Ngram.cmdline_ngram()
    if FLAGS.no_lm:
        lm = None

    if argv[1] == "train":
        local_decode = ChiangPerceptronDecoder(weights, lm)
    elif argv[1] == "sgd" or argv[1] == "crf":
        local_decode = MarginalDecoder(weights, lm)
    else:
        local_decode = MarginalDecoder(weights, lm)

    if FLAGS.add_features:
        tdm = local_features.TargetDataManager()
        local_decode.feature_adder = FeatureAdder(tdm)
    local_decode.prune_train = FLAGS.prune_train
    local_decode.use_pickle = FLAGS.pickleinput
    local_decode.cache_input = FLAGS.cache_input
    print >> logs, "Cache input is %s" % FLAGS.cache_input
    if FLAGS.debuglevel > 0:
        print >> logs, "beam size = %d" % FLAGS.beam

    if argv[1] == "train":
        if not FLAGS.dist:
            perc = trainer.Perceptron.cmdline_perc(local_decode)
        else:
            train_files = [FLAGS.prefix + file.strip() for file in sys.stdin]
            perc = distributed_trainer.DistributedPerceptron.cmdline_perc(local_decode)
            perc.set_training(train_files)
        perc.train()
    elif argv[1] == "sgd":
        crf = sgd.BaseCRF.cmdline_crf(local_decode)
        crf.set_oracle_files([FLAGS.oracle_forests])
        crf.train()

    elif argv[1] == "crf":
        if not FLAGS.dist:
            crf = CRF.LBFGSCRF.cmdline_crf(local_decode)
            crf.set_oracle_files([FLAGS.oracle_forests])
            crf.set_feature_mappers(add_features.read_features(FLAGS.feature_map_file))
            crf.rm_features(rm_features)
            if FLAGS.just_basic:
                print "Enforcing Basic"
                crf.enforce_just_basic()
            crf.train()
        else:
            # train_files = [FLAGS.prefix+file.strip() for file in sys.stdin]
            # oracle_files = [file+".oracle" for file in train_files]
            print >> sys.stderr, "DistributedCRF"
            crf = distCRF.DistributedCRF.cmdline_distibuted_crf(local_decode)
            # os.system("~/.python/bin/dumbo rm train_input -hadoop /home/nlg-03/mt-apps/hadoop/0.20.1+169.89/")
            # os.system("~/.python/bin/dumbo put "+crf.trainfiles[0]+" train_input -hadoop /home/nlg-03/mt-apps/hadoop/0.20.1+169.89/")
            crf.set_feature_mappers(add_features.read_features(FLAGS.feature_map_file))
            crf.rm_features(rm_features)
            if FLAGS.just_basic:
                print "Enforcing Basic"
                crf.enforce_just_basic()

            # crf.set_oracle_files(oracle_files)
            crf.train()

    else:
        if not FLAGS.dist:
            print "Evaluating"
            eval = Evaluator(local_decode, [FLAGS.dev])
            eval.tune()
        else:
            dev_files = [FLAGS.prefix + file.strip() for file in sys.stdin]
            eval = Evaluator(local_decode, dev_files)
        print eval.eval(verbose=True).compute_score()
Example #35
    flags.DEFINE_string("rulefilter", None, "filter ruleset")
    flags.DEFINE_integer("max_height", 3, "maximum height of lhs for pattern-matching")


    flags.DEFINE_integer("example_limit", 1e10, "number of examples to use")
    
    flags.DEFINE_float("hope", 0, "hope weight")
    flags.DEFINE_boolean("mert", True, "output mert-friendly info (<hyp><cost)")

    from ngram import Ngram # defines --lm and --order    

    argv = FLAGS(sys.argv)

    weights = Model.cmdline_model()
    lm = Ngram.cmdline_ngram() # if FLAGS.lm is None then returns None
    if lm:
        weights["lm1"] = weights["lm"] * FLAGS.lmratio

    reffiles = [open(f) for f in argv[1:]]

    convert_forest = ((FLAGS.ruleset is not None) or (FLAGS.rulefilter is not None) )
  
    if FLAGS.ruleset is not None:
        ruleset = RuleSet(FLAGS.ruleset)
        
        if FLAGS.phrase is not None:
            ruleset.add_bp(FLAGS.phrase)

        Forest.globalruleid = ruleset.rule_num()
    
Example #36
    from ngram import Ngram
    from model import Model
    from forest import Forest

    flags.DEFINE_integer("beam", 100, "beam size", short_name="b")
    flags.DEFINE_integer("debuglevel", 0, "debug level")
    flags.DEFINE_boolean("mert", True, "output mert-friendly info (<hyp><cost>)")
    flags.DEFINE_boolean("cube", True, "using cube pruning to speedup")
    flags.DEFINE_integer("kbest", 1, "kbest output", short_name="k")
    flags.DEFINE_integer("ratio", 3, "the maximum items (pop from PQ): ratio*b", short_name="r")
  

    argv = FLAGS(sys.argv)

    weights = Model.cmdline_model()
    lm = Ngram.cmdline_ngram()

    decoder = CYKDecoder(weights, lm)

    tot_bleu = Bleu()
    tot_score = 0.
    tot_time = 0.
    tot_len = tot_fnodes = tot_fedges = 0

    tot_lmedges = 0
    tot_lmnodes = 0
    if FLAGS.debuglevel > 0:
        print >>logs, "beam size = %d" % FLAGS.beam

    for i, forest in enumerate(Forest.load("-", is_tforest=True, lm=lm), 1):
Example #37
        
        v=self.mDict.getPvalue(sentence)
        if((v)>maxPvalue and self.mDict.isAWord(sentence)):
            self.valueMap[sentence]=v
            self.segMap[sentence]=sentence
            return v
        else:
            self.valueMap[sentence]=maxPvalue
            self.segMap[sentence]=wordPair
            return maxPvalue
    def getSeg(self):
        return self.segMap

        
if(__name__ =="__main__"):
    ngram1 = Ngram("dict1")
    print ngram1.splitsentence("ABC")


from Dictionary import Dictionary
from ngram import Ngram

def printSeg(segMap,sentence):
    if(segMap.has_key(sentence)):
        pair = segMap[sentence]
        if(isinstance(pair,tuple)):
            printSeg(segMap,pair[0])
            printSeg(segMap,pair[1])
        else:
            if(sentence==pair):
                print sentence
Example #38
	def increment_tree(self, ngram, count, tree, max_ngram_size):
		if ngram in tree:
			tree[ngram].count += count
		else:
			tree[ngram] = Ngram(ngram, count, max_ngram_size)
Example #39
#!/usr/bin/env python3
import math
from util import tokenize_data
from ngram import Ngram
import csv
import os.path

if __name__ == '__main__':
    train_filename = '../data/AllCommitAddLines.txt'
    train_data = tokenize_data(train_filename)
    print(train_data)
    ngram = Ngram(3)
    print("TRAINING STARTED...")
    list_of_bigrams, unigram_counts, bigram_counts, list_of_trigrams, trigram_counts = ngram.train(
        train_data)

    one_gram_prob = ngram.calculate_onegram_prob(unigram_counts)
    bigram_prob = ngram.calculate_bigram_prob(list_of_bigrams, unigram_counts,
                                              bigram_counts)
    trigram_prob = ngram.calculate_trigram_prob(list_of_trigrams,
                                                bigram_counts, trigram_counts)

    with open("input_csv_file_path.csv", 'r') as csvinput:
        with open("output_csv_file_path.csv", 'w') as csvoutput:
            writer = csv.writer(csvoutput, lineterminator='\n')
            reader = csv.reader(csvinput)
            all = []
            row = next(reader)
            # print(row[0])
            row.append('NGLP')
            all.append(row)