import random

import nltk

# gen_input(), gen_dicts() and MARKOV_ORDER are helpers/constants assumed to be
# defined elsewhere in this module.


def gen_mnemonic(corpus_path, input_string):
    first_letters = gen_input(input_string)
    with open(corpus_path, encoding='utf8') as corpus_file:
        corpus = corpus_file.read().lower()
    tokens = nltk.WhitespaceTokenizer().tokenize(corpus)
    tagged = nltk.pos_tag([i for i in tokens if i], tagset='universal')
    markov_dict, tag_dict = gen_dicts(tagged)

    # Initialize the sequence with a random (word, tag) pair whose word starts
    # with the first target letter.
    init_wordpool = []
    for pair in markov_dict:
        if pair[0][0] == first_letters[0]:
            init_wordpool.append(pair)
    first_word = random.choice(init_wordpool)
    mnemonic = [first_word]

    # For each remaining letter, check whether a word starting with that letter
    # is part of the Markov chain of the previous word.
    for i in range(1, len(first_letters)):
        choices = []
        markov_chain = markov_dict.get(mnemonic[i - 1])
        markov_chain = sorted(markov_chain, key=markov_chain.get)
        tag_chain = tag_dict.get(mnemonic[i - 1][1])
        tag_chain = sorted(tag_chain, key=tag_chain.get)
        for pair in markov_chain:
            if pair[0][0] == first_letters[i]:
                choices.append(pair)
        # Otherwise consider any pair whose tag appears in the tag chain of the
        # previous word's tag.
        for pair in markov_dict:
            for k in range(0, MARKOV_ORDER - 1):
                if pair[0][0] == first_letters[i] and pair[1] == tag_chain[k]:
                    choices.append(pair)
        # Last resort: fall back to any pair that starts with the required letter.
        random_flag = 0
        if not choices:
            random_flag += 1
            for pair in markov_dict:
                if pair[0][0] == first_letters[i]:
                    choices.append(pair)
        if random_flag == 0:
            mnemonic.append(choices[0])
        else:
            mnemonic.append(random.choice(choices))

    out = [pair[0] for pair in mnemonic]
    return " ".join(out)
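# Usage sketch (illustrative, not from the original source): "corpus.txt" is a
# hypothetical plain-text corpus path; gen_input, gen_dicts and MARKOV_ORDER
# must already be defined for the call to work.
demo_mnemonic = gen_mnemonic("corpus.txt", "every good boy does fine")
print(demo_mnemonic)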
from nltk.stem import SnowballStemmer


def snowBallStemmer(text):
    # Split on whitespace and return the non-empty Snowball stems of the tokens.
    tokens = nltk.WhitespaceTokenizer().tokenize(text)
    stems = []
    stemmer = SnowballStemmer("english")
    for token in tokens:
        token = stemmer.stem(token)
        if token != "":
            stems.append(token)
    return stems
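# Usage sketch (illustrative): reduces each whitespace-separated token to its
# Snowball stem.
print(snowBallStemmer("the runners were running quickly"))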
def evaluate(taggedfile, reffile):
    # Compare a tagger's word/TAG output against a reference file, line by line.
    totaltokens = 0
    totalKnowns = 0
    totalUnknowns = 0
    unknownCorrect = 0
    knownCorrect = 0
    with open(taggedfile) as tagged_content, open(reffile) as reffile_content:
        for taggedline, refline in zip(tagged_content, reffile_content):
            taggedtoken = nltk.WhitespaceTokenizer().tokenize(taggedline)
            reftoken = nltk.WhitespaceTokenizer().tokenize(refline)
            totaltokens += len(taggedtoken)  # total number of tokens
            for index, token in enumerate(taggedtoken):
                taggedtag = token.split("/")[1]
                if ">>" in taggedtag:
                    # ">>" marks a tag guessed for an unknown word
                    taggedtag = taggedtag.rstrip(">")
                    totalUnknowns += 1
                    if taggedtag == reftoken[index].split("/")[1]:
                        unknownCorrect += 1
                else:
                    totalKnowns += 1
                    if taggedtag == reftoken[index].split("/")[1]:
                        knownCorrect += 1
    print("\n----------Results----------")
    print("Overall Accuracy: " + str((knownCorrect + unknownCorrect) / float(totaltokens)))
    print("Known Accuracy: " + str(knownCorrect / float(totalKnowns)))
    print("Unknown Accuracy: " + str(unknownCorrect / float(totalUnknowns)))
    print("\n")
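# Usage sketch (illustrative): "output.tagged" and "reference.tagged" are
# hypothetical word/TAG files with one sentence per line; tags guessed for
# unknown words are assumed to carry a trailing ">>" marker.
evaluate("output.tagged", "reference.tagged")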
def tokenization(fpath):
    pos = {}
    no_of_tags = 0
    word_tag = {}
    transition = {}
    starttags = ["<s>/<s>"]  # Dummy start symbol
    endtags = ["<e>/<e>"]    # Dummy end symbol
    with open(fpath) as file_content:
        for line in file_content.readlines():
            tokens = starttags + nltk.WhitespaceTokenizer().tokenize(line) + endtags
            for index, token in enumerate(tokens):
                # Increment the total tag count
                no_of_tags += 1
                # Add the <word tag: count> entry to the dictionary
                word = token.split("/")[0]
                tag = token.split("/")[1]
                if word + " " + tag in word_tag:
                    word_tag[word + " " + tag] += 1
                else:
                    word_tag[word + " " + tag] = 1
                # Add the POS occurrence to the dictionary
                if tag in pos:
                    pos[tag] += 1
                else:
                    pos[tag] = 1
                # Count the transition between consecutive tags
                if index < len(tokens) - 1:
                    tag1 = tokens[index].split("/")[1]
                    tag2 = tokens[index + 1].split("/")[1]
                    if (tag1 + " " + tag2) in transition:
                        transition[tag1 + " " + tag2] += 1
                    else:
                        transition[tag1 + " " + tag2] = 1
    # tag counts, transition counts, word/tag counts, total number of tags in the file
    token_results = [pos, transition, word_tag, no_of_tags]
    return token_results
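# Usage sketch (illustrative): "train.tagged" is a hypothetical word/TAG training
# file. One way the returned counts could be used (an assumption, not code from
# the original source) is a maximum-likelihood transition estimate
# P(tag2 | tag1) = count("tag1 tag2") / count(tag1).
tag_counts, transition_counts, word_tag_counts, total_tags = tokenization("train.tagged")
p_nn_given_dt = transition_counts.get("DT NN", 0) / tag_counts.get("DT", 1)
print(total_tags, p_nn_given_dt)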
import re

from nltk.corpus import stopwords


def tokenize(text):
    """
    Tokenizes sequences of text and stems the tokens.

    :param text: String to tokenize
    :return: List with stemmed tokens
    """
    tokens = nltk.WhitespaceTokenizer().tokenize(text)
    tokens = list(set(re.sub(r"[^a-zA-Z']", "", token) for token in tokens))
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    tokens = list(set(re.sub(r"[^a-zA-Z]", "", token) for token in tokens))
    stems = []
    stemmer = SnowballStemmer("english")
    for token in tokens:
        token = stemmer.stem(token)
        if token != "":
            stems.append(token)
    return stems
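# Usage sketch (illustrative): deduplicates tokens, drops English stopwords and
# non-letter characters, and returns Snowball stems (order is not preserved
# because the tokens pass through a set).
print(tokenize("Natural language processing with NLTK is useful"))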
# Assumed module-level setup (the original snippet references lemmatizer and
# stop_words without showing their definitions):
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def pre_process(comment_text):
    # Undo PTB-style contraction splits and bracket tokens
    comment_text = re.sub(" n't", "n't", comment_text)
    comment_text = re.sub(" 's", "", comment_text)
    comment_text = re.sub(" 'd", "'d", comment_text)
    comment_text = re.sub(" 're", "'re", comment_text)
    comment_text = re.sub("-LRB-", "(", comment_text)
    comment_text = re.sub("-RRB-", ")", comment_text)
    # Drop punctuation, collapse whitespace and remove digits
    comment_text = re.sub(r'\W', ' ', comment_text)
    comment_text = re.sub(r'\s+', ' ', comment_text)
    comment_text = re.sub(r'[0-9]+', '', comment_text)
    tokenizer = nltk.WhitespaceTokenizer()
    tokens = tokenizer.tokenize(comment_text)
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    tagged = nltk.tag.pos_tag(tokens)
    # Keep lowercased tokens that are not proper nouns, stopwords or very short
    no_stopwords = []
    for token, tag in tagged:
        token = token.lower()
        if tag != 'NNP' and tag != 'NNPS' and token not in stop_words and len(token) > 2:
            no_stopwords.append(token)
    processed = ' '.join(no_stopwords)
    processed = processed.strip(' ')
    return processed
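# Usage sketch (illustrative): cleans a PTB-tokenized comment and returns the
# remaining lowercased, lemmatized content words as a single string.
print(pre_process("The film -LRB- 1997 -RRB- was n't nearly as good as everyone said"))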