def generateSentencesAndCheckErrors(infile, K, num_trials=100):
    """Regenerate randomly chosen tweets from their opening words and score them.

    For each trial a tweet is drawn at random, its first ``K - 1`` words are
    used as the starting k-mer, and a sentence is generated with as many
    additional words as the original tweet has beyond that start. The
    original and generated sentences are length-matched and compared; the
    comparison results are collected, printed together with their average,
    and returned.

    Args:
        infile: Passed unchanged to ``getModelAndTweetsFromFile``.
        K: Passed to the model helpers; ``K - 1`` words of each tweet seed
            the generation.
        num_trials: Number of random tweets to try. Defaults to 100, the
            value that was previously hard-coded.

    Returns:
        The list of non-``None`` values returned by ``compareSentences``
        (presumably numeric scores, since they are summed and averaged).
        The list is empty when there are no tweets or every comparison
        returned ``None``.
    """
    model, tweets = getModelAndTweetsFromFile(infile, K)

    # np.random.choice raises ValueError on an empty population, so run no
    # trials at all when there is nothing to sample from.
    trials = num_trials if len(tweets) > 0 else 0

    res = []
    for _ in range(trials):
        random_tweet = np.random.choice(tweets)
        words = random_tweet.split(" ")
        tweet_start = words[:K - 1]
        # May be zero or negative for tweets shorter than K - 1 words; the
        # value is passed through unchanged, as before.
        num_words_wanted = len(words) - (K - 1)

        generated_sentence = generateSentenceFromStartingKmer(
            tweet_start, K, num_words_wanted, model)
        updated_tweet, updated_generated_sentence = match_sentence_lengths(
            random_tweet, generated_sentence)

        result = compareSentences(updated_tweet, updated_generated_sentence)
        if result is None:
            # No usable comparison for this pair; leave it out of the average.
            continue
        res.append(result)
        # NOTE(review): presumably here so that any output written by the
        # helpers shows up promptly; position kept from the original.
        sys.stdout.flush()

    # Single-argument print(...) produces the same output under Python 2's
    # print statement and Python 3's print function.
    print(res)
    if res:
        print("Average: %s" % (sum(res) / float(len(res))))
    else:
        # Previously this path raised ZeroDivisionError.
        print("Average: n/a (no comparable sentences)")
    return res
def generateTweetsFromFile(infile, K):
    """Print up to 100 generated tweets that are not already in the input set.

    A model and the source tweets are loaded from ``infile``; starting
    k-mers of length ``K - 1`` are collected from those tweets. Each attempt
    picks a starting k-mer at random, generates a sentence from it (asking
    the generator for 20 words), and prints the sentence unless
    ``checkIfSentenceInDatabase`` reports it as one of the source tweets.
    At most 1000 attempts are made in total.

    Args:
        infile: Passed unchanged to ``getModelAndTweetsFromFile``.
        K: Passed to the model helpers; starting k-mers use ``K - 1``.

    Returns:
        None. The generated tweets are written to standard output.
    """
    model, tweets = getModelAndTweetsFromFile(infile, K)
    starting_kmers = getStartingKMers(K - 1, tweets)

    printed = 0
    for _attempt in range(1000):
        if printed >= 100:
            break

        # Draw a random index, then look up the corresponding starting k-mer.
        pick = np.random.choice(range(len(starting_kmers)))
        seed_kmer = starting_kmers[pick]
        candidate = generateSentenceFromStartingKmer(seed_kmer, K, 20, model)

        # Only sentences that are not among the source tweets count.
        if not checkIfSentenceInDatabase(candidate, tweets):
            print(candidate)
            printed += 1