def __init__(self):
    """Create the unigram, bigram and trigram sub-models and their lambdas."""
    # One sub-model per n-gram order, built in unigram -> trigram order.
    self.unigram_model = Unigram()
    self.bigram_model = Bigram()
    self.trigram_model = Trigram()

    # Per-model lambda coefficients (0.25 + 0.25 + 0.5 = 1.0).
    (self.unigram_lambda,
     self.bigram_lambda,
     self.trigram_lambda) = 0.25, 0.25, 0.5
def guess_language(entry):
    """Set the ``xml:lang`` attribute of *entry* to ``u'en'`` or ``u'fr'``.

    The unicode text of the entry's ``atom:title``, ``atom:summary`` and
    ``atom:content`` nodes is concatenated and parsed into a ``Trigram``.
    That trigram is subtracted from ``tri('fr')`` and from ``tri('en')``,
    and the two differences are compared to choose the language.

    Returns None; the result is written onto *entry*.
    """
    nodes = entry.xml_xpath(u'atom:title|atom:summary|atom:content')
    # Every piece is prefixed with one space, which is exactly what the
    # repeated ``text + u' ' + piece`` concatenation produces.
    text = u''.join(u' ' + node.__unicode__() for node in nodes)

    sample = Trigram()
    sample.parseString(text)

    # Evaluate the French difference first, then the English one.
    french_delta = tri('fr') - sample
    english_delta = tri('en') - sample

    # English is selected when the 'fr' difference is the larger of the two.
    lang = u'en' if french_delta > english_delta else u'fr'

    entry.xml_set_attribute((u'xml:lang', XML_NS), lang)
def best_match(query, items, min_score=0.5):
    """Return ``(index, score)`` of the item in *items* that best matches *query*.

    Each truthy item is scored with ``Trigram(query).score(item)``; falsy
    items score 0.

    * A score of exactly 1.0 ends the search at once with ``(index, 1.0)``.
    * Otherwise the winner is the highest score that is strictly greater
      than *min_score* (the first such item wins ties).
    * ``(None, 0)`` is returned when *query* is falsy or nothing qualifies.
    """
    best_index, best_score = None, 0

    # A falsy query cannot be scored against anything.
    if not query:
        return (best_index, best_score)

    matcher = Trigram(query)
    for index, candidate in enumerate(items):
        current = matcher.score(candidate) if candidate else 0

        # Perfect match: no later item can do better.
        if current == 1.0:
            return (index, 1.0)

        if current > best_score and current > min_score:
            best_index, best_score = index, current

    return (best_index, best_score)
class Interpolation(LanguageModel):
    """Language model that combines unigram, bigram and trigram sub-models.

    Word probabilities are a weighted sum of the three sub-model
    probabilities, using the fixed ``*_lambda`` weights set in ``__init__``.
    """

    def __init__(self):
        """Create the three sub-models and their (arbitrary) lambdas."""
        self.unigram_model = Unigram()
        self.bigram_model = Bigram()
        self.trigram_model = Trigram()

        # Arbitrary lambdas (0.25 + 0.25 + 0.5 = 1.0).
        (self.unigram_lambda,
         self.bigram_lambda,
         self.trigram_lambda) = 0.25, 0.25, 0.5

    def train(self, trainingSentences):
        """Train every sub-model on the same *trainingSentences*."""
        sub_models = (self.unigram_model, self.bigram_model, self.trigram_model)
        for sub_model in sub_models:
            sub_model.train(trainingSentences)

    def getWordProbability(self, sentence, index):
        """Return the lambda-weighted sum of the three sub-model probabilities."""
        trigram_term = self.trigram_lambda * \
            self.trigram_model.getWordProbability(sentence, index)
        bigram_term = self.bigram_lambda * \
            self.bigram_model.getWordProbability(sentence, index)
        unigram_term = self.unigram_lambda * \
            self.unigram_model.getWordProbability(sentence, index)
        return trigram_term + bigram_term + unigram_term

    def getVocabulary(self, context):
        """Return the vocabulary for *context*, as reported by the trigram model.

        Per the original author's note, it does not matter which sub-model
        is asked because the vocabulary is the same.
        """
        return self.trigram_model.getVocabulary(context)

    def generateSentence(self):
        """Generate a list of at most 20 words.

        For each word one sub-model is picked at random, with the same
        weights used in ``getWordProbability``: trigram when the draw is
        ``<= trigram_lambda``, bigram when it is
        ``<= trigram_lambda + bigram_lambda``, unigram otherwise.
        Generation stops early once ``LanguageModel.STOP`` is produced (the
        STOP word itself is included in the result).
        """
        words = []
        two_back = LanguageModel.START
        # The initial context word is a random key of the trigram model's
        # word_count; it is used as context only and is not added to the result.
        one_back = random.choice(list(self.trigram_model.word_count.keys()))

        bigram_cutoff = self.trigram_lambda + self.bigram_lambda
        for _ in range(20):
            draw = random.random()
            if draw <= self.trigram_lambda:
                word = self.trigram_model.generateWord(two_back, one_back)
            elif draw <= bigram_cutoff:
                # NOTE(review): the other two models expose ``generateWord``;
                # confirm that Bigram really names this method ``generate_word``.
                word = self.bigram_model.generate_word(one_back)
            else:
                word = self.unigram_model.generateWord()

            words.append(word)
            two_back, one_back = one_back, word
            if word == LanguageModel.STOP:
                break

        return words
class TestTrigram(TestCase):
    """Checks the table returned by ``Trigram.findTrigrams`` for a short text."""

    def setUp(self):
        """Build a ``Trigram`` from the fixture sentence."""
        sentence = "I wish I may I wish I might"
        self.trigram = Trigram(sentence)

    def test_find_trigrams(self):
        """Each two-word prefix maps to the words that follow it, in order."""
        actual = self.trigram.findTrigrams()
        expected = {
            "I wish": ["I", "I"],
            "wish I": ["may", "might"],
            "may I": ["wish"],
            "I may": ["I"],
        }
        self.assertEqual(expected, actual)
def on_success(self, data):
    """Count the word trigrams of an incoming message in the ``Trigram`` table.

    Messages without a ``'text'`` key are ignored. Otherwise the text is
    split into lower-cased words, every run of three consecutive words is
    formed, and for each one the matching ``Trigram`` record (keyed by
    ``"w1,w2"`` and ``w3``) has its ``count`` incremented and is saved.
    A record that does not exist yet is created first.
    """
    if 'text' not in data:
        return

    print("analyzing '" + data['text'] + "'")

    # Find words. The pattern also matches the empty end-of-string position,
    # which yields '' and is discarded here.
    # NOTE(review): only words preceded by a space are captured by this
    # pattern -- confirm that skipping an unprefixed first word is intended.
    raw_matches = re.findall(r"$| ([a-zA-Z-']+)", data['text'].encode('utf-8'))
    # Drop the empty matches and turn the rest into lower case.
    words = [found.lower() for found in raw_matches if found != '']

    # Every window of three consecutive words (empty for fewer than 3 words).
    trigrams = [(words[pos], words[pos + 1], words[pos + 2])
                for pos in range(len(words) - 2)]
    print("found trigrams: " + str(trigrams))

    for first, second, third in trigrams:
        pair_key = first + ',' + second
        try:
            record = Trigram.get(pair_key, third)
        except DynamoDBKeyNotFoundError:
            # First sighting of this trigram: start a fresh record.
            record = Trigram()
            record.w12 = pair_key
            record.w3 = third
        record.count += 1
        record.save()
from unigram import Unigram
from bigram import Bigram
from trigram import Trigram

# The first entry returned by read('input.txt') holds five single-space
# separated fields: V, N, smoothing factor, training file, test file.
inputs = read('input.txt')[0].strip().split(" ")
V = int(inputs[0])
N = int(inputs[1])
S_FACTOR = float(inputs[2])
TRAINING_FILE = inputs[3]
TEST_FILE = inputs[4]

OUTPUT_FILE_NAME = f"./results/trace_{V}_{N}_{S_FACTOR}.txt"

# Every model class is constructed with the same five arguments.
MODEL_ARGS = (V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)

t1 = time()

# V == 3 takes precedence over N; otherwise N picks the n-gram order.
# When no branch matches, no model is run.
if V == 3:
    print(f"BYOM: V = {V} n = 3 d = {S_FACTOR}")
    # Rebinds the name BYOM from the class to the instance.
    BYOM = BYOM(*MODEL_ARGS)
    BYOM.execute()
elif N == 1:
    print(f"unigram: V = {V} d = {S_FACTOR}")
    UNIGRAM = Unigram(*MODEL_ARGS)
    UNIGRAM.execute()
elif N == 2:
    print(f"bigram: V = {V} d = {S_FACTOR}")
    BIGRAM = Bigram(*MODEL_ARGS)
    BIGRAM.execute()
elif N == 3:
    print(f"trigram: V = {V} d = {S_FACTOR}")
    TRIGRAM = Trigram(*MODEL_ARGS)
    TRIGRAM.execute()

t2 = time()
print(f"execution time: {t2 - t1}s")
cb = ConnectionBorg() cb.set_region('eu-west-1') cb.set_credentials(settings.aws_access_key_id, settings.aws_secret_access_key) tw = Twython(settings.app_key, settings.app_secret, settings.oauth_token, settings.oauth_token_secret) while True: try: tweet = '' last_word = '' # get all digrams in db ts = Trigram.scan() # count total number of digrams on the db total_count = 0 for t in ts: total_count += t.count # pick a random digram r = randint(1, total_count) print 'r = '+str(r) ts = Trigram.scan() csum = 0 for t in ts: csum += t.count print 'csum = '+str(csum) if r <= csum: tweet += ' '.join(t.w12.split(',')) last_pair = (t.w12.split(',')[1], t.w3)
def setUp(self):
    """Build the ``Trigram`` fixture from a fixed sentence."""
    sample_text = "I wish I may I wish I might"
    self.trigram = Trigram(sample_text)
def main():
    """Build a ``Trigram`` from ``argv[1]`` and dump it to the file ``argv[2]``.

    The output file is opened in text write mode (``'w'``) and is always
    closed, even when ``dump`` raises.
    """
    # Build the model first so a failure here never truncates the output file.
    tri = Trigram(argv[1])
    # The context manager closes the handle on both success and error.
    with open(argv[2], 'w') as out:
        dump(tri, out)
from __future__ import print_function
from bs4 import BeautifulSoup
from nltk.tokenize import word_tokenize
from trigram import Trigram
import random

# Probability of attempting to replace the middle token at each position.
replace_rate = 0.2

if __name__ == '__main__':
    # Read the review file inside a context manager so the handle is closed
    # as soon as the markup has been loaded.
    with open('temp/positive.review') as review_file:
        markup = review_file.read()
    reviews = BeautifulSoup(markup, 'lxml').findAll('review_text')
    documents = [review.text for review in reviews]
    print("Data Loaded")

    model = Trigram()
    model.fit(documents)

    # Keep drawing until a review with fewer than 30 whitespace-separated
    # words comes up.
    while True:
        review = random.choice(documents)
        if len(review.split()) < 30:
            break

    string = review.lower().strip()
    print(string, end='\n\n')

    tokens = word_tokenize(string)
    # The index loop is deliberate: a token replaced at position i + 1
    # becomes part of the key examined on the next iteration.
    for i in range(len(tokens) - 2):
        if random.random() < replace_rate:
            # The key is the pair of neighbours around the middle token.
            key = (tokens[i], tokens[i + 2])
            if key in model.trigram2proba:
                next_word = model.predict(key)
                tokens[i + 1] = next_word
from twitter import Twitter from trigram import Trigram from corpus import create_corpus, reconstruct_corpus if __name__ == "__main__": """Main program""" tw = Twitter() corpus = tw.fecth_tweets(5000, wait=True) create_corpus(corpus, 'tweets') corpus = reconstruct_corpus('tweets.pickle') trigram = Trigram(corpus=corpus) print('以下の単語群から文章を開始することができます') for usage in sorted(list(trigram.usage_)): print(usage, end=' / ') try: while True: word = input('\n開始単語を1語入力してください:') sentence = trigram.generate(word, 0.08) print(sentence) print('============================') tweet = input('ツイートしますか? (Y/n)') if tweet == 'Y': tw.update_tweet(sentence) input('プログラム終了次は`Ctrl+C`を押してください\n\ 続ける場合はそのほかのキーを押してください')
import time from trigram import Trigram app = Flask(__name__, static_url_path='/static') # read dictionary file into a trie word_list = [] with open('word_search.tsv', 'r') as tsv: AoA = [line.strip().split('\t')[0] for line in tsv] # print('Creating a tree') # trie = TrieNode() # for word in AoA: # trie.insert(word) trig = Trigram() for word in AoA: trig.add(word) @app.route('/') def hello_world(): return render_template('index.html') @app.route('/search') def search1(): query = request.args.get('query') print("getting search results") start = time.time() result = trig.search(query)