def tokenization(document: str) -> List[List[str]]:
    """Tokenizer function.

    Args:
        document: String whose lines contain tokens separated by single spaces.

    Returns:
        List of tokenized sentences, one list of tokens per line.
    """
    return [
        tokenizer(sentence, pattern=r'\s+', gaps=True)
        for sentence in tokenizer(document, pattern=r'\n+', gaps=True)
    ]
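# Usage sketch for tokenization() above (not part of the original snippet). The
# listing never shows where `tokenizer` comes from; this demo assumes it is
# nltk.tokenize.regexp_tokenize, which accepts the same pattern=/gaps= keywords.
from nltk.tokenize import regexp_tokenize as tokenizer

print(tokenization("the quick fox\njumped over the dog"))
# expected: [['the', 'quick', 'fox'], ['jumped', 'over', 'the', 'dog']]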
def collect(sentence, sentence_tagged):
    sentence_words = {}
    tokens = tokenizer(sentence.lower())
    sentence_tagged = sentence_tagged.split(' ')
    for i in range(len(sentence_tagged)):
        if len(re.findall(r'\/\*', sentence_tagged[i])) > 1:
            word = re.search(r'\/\*(.*?)\/\*', sentence_tagged[i]).group(1)
            word = re.sub(r'\$', '', word)
            word = re.sub(r'\^', '', word)
            word = '^' + word + '/*' + word + '$'
            sentence_tagged[i] = word
    sentence_tagged = ' '.join(sentence_tagged)
    units = parse(c for c in sentence_tagged)
    counter = 0
    try:
        for unit in units:
            sentence_words[counter] = [
                tokens[counter],
                set(unit.readings[0][0][1])
            ]
            counter += 1
    except:
        pass
    return sentence_words
def main(year):
    print('Importing %s tweets...' % year)
    with open('data/gg' + year + '.json', 'r') as f:
        db = json.load(f)
    f.close()
    print('Cleaning tweets...')
    tweets = [{
        'clean': clean_tweet(tokenizer().tokenize(tweet['text'])),
        'raw': tokenizer().tokenize(tweet['text'])
    } for tweet in db]
    print('Saving cleaned tweets...')
    with open('data/clean_gg' + year + '.json', 'w+') as clean_file:
        json.dump(tweets, clean_file)
    clean_file.close()
def apply_postedits(mt_string, bd_postedits, grammar_postedits, other_postedits):
    tokens = tokenizer(mt_string)
    for i in range(len(tokens)):
        # Guard the sentence boundaries explicitly: tokens[i - 1] never raises
        # for i == 0, it silently wraps around to the last token.
        prev_word = tokens[i - 1] if i > 0 else 'None'
        next_word = tokens[i + 1] if i + 1 < len(tokens) else 'None'
        tokens[i] = compare_bd(tokens[i], bd_postedits)
        tokens[i] = compare_other(tokens[i], prev_word, next_word, grammar_postedits)
        tokens[i] = compare_other(tokens[i], prev_word, next_word, other_postedits)
    pe_string = ' '.join(tokens)
    pe_string = clean_edited_string(pe_string)
    print(pe_string)
def main(year):
    print('\nSearching for awards...')

    # Import twitter data
    with open('data/gg%s.json' % year, 'r') as f:
        data = json.load(f)
    f.close()

    # Generate a list of all stopwords
    # Note: the original ''.split(punctuation) produced [''], so no punctuation
    # ever reached the stop set.
    punc = list(punctuation)
    english_stop = stopwords.words('english')
    gg_stop = [
        'goldenglobes', '#goldenglobes', '#goldenglobe', 'golden', 'globes',
        'globe'
    ]
    twitter_stop = ['&', 'rt']
    stops = set(english_stop + gg_stop + twitter_stop + punc)

    award_candidates = {}
    size = len(set([d['text'] for d in data]))
    for n, tweet in enumerate(set([d['text'] for d in data])):
        helpers.prog_print(n, size)

        # Generate all relevant forms of the tweet
        tkn_tweet = tokenizer().tokenize(tweet)
        lower_tweet = [tkn.lower() for tkn in tkn_tweet]
        clean_tweet = [x for x in lower_tweet]
        for sw in set(clean_tweet).intersection(stops):
            clean_tweet.remove(sw)

        if 'best' in clean_tweet:
            tagged_tweet = nltk.pos_tag(clean_tweet)
            for i in range(2, 8):
                ind = clean_tweet.index('best')
                # If we hit the end of the tweet or the last word in the
                # segment isn't a noun, we don't need to look at it
                if ind + i > len(clean_tweet):
                    break
                # Check the POS tag, not the (word, tag) pair, so NNS/NNP count as nouns
                if 'NN' not in tagged_tweet[ind + i - 1][1]:
                    continue
                # Find the segment in the uncut tweet, so we have the stopwords
                front, back = lower_tweet.index('best'), lower_tweet.index(
                    clean_tweet[ind + i - 1])
                # Piece it together and add it to the candidates list
                name = ' '.join(lower_tweet[front:back + 1])
                if name in award_candidates:
                    award_candidates[name] += 1
                else:
                    award_candidates[name] = 1

    # Sort dict by number of appearances
    rankings = [(name, v) for name, v in sorted(
        award_candidates.items(), key=lambda item: item[1])]
    rankings.reverse()
    return [i[0] for i in rankings if i[1] > 80 and i[0]]
def collect(sentence, sentence_tagged):
    """
    Collect a dictionary, which contains positions of sentence words
    as keys and tokens + tags as items.
    """
    sentence_words = {}
    tokens = tokenizer(sentence.lower())
    sentence_tagged = sentence_tagged.split(' ')
    for i in range(len(sentence_tagged)):
        if len(re.findall(r'\/\*', sentence_tagged[i])) > 1:
            word = re.search(r'\/\*(.*?)\/\*', sentence_tagged[i]).group(1)
            word = re.sub(r'\$', '', word)
            word = re.sub(r'\^', '', word)
            word = '^' + word + '/*' + word + '$'
            sentence_tagged[i] = word
    sentence_tagged = ' '.join(sentence_tagged)
    units = parse(c for c in sentence_tagged)
    counter = 0
    for unit in units:
        sentence_words[counter] = [
            tokens[counter],
            set(unit.readings[0][0][1])
        ]
        counter += 1
    return sentence_words
def classify(text, clf, prob=True):
    words = tokenizer(text)
    feats = dict(zip(words, [True for word in words]))
    if prob:
        c = clf.prob_classify(feats)
        return {'pos': c.prob('pos'), 'neg': c.prob('neg')}
    else:
        return clf.classify(feats)
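# Usage sketch for classify() above (not from the original source). It assumes an
# NLTK NaiveBayesClassifier trained on the same {word: True} feature dicts, and a
# plain whitespace splitter standing in for the unshown `tokenizer` binding.
from nltk.classify import NaiveBayesClassifier

tokenizer = str.split  # stand-in; the real snippet binds tokenizer elsewhere
train = [({'good': True, 'film': True}, 'pos'),
         ({'awful': True, 'film': True}, 'neg')]
clf = NaiveBayesClassifier.train(train)
print(classify('a good film', clf))                # probabilities for 'pos' and 'neg'
print(classify('an awful film', clf, prob=False))  # expected: 'neg'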
def _on_start(self, utterance):
    # do all on start things
    # maybe clear all chart data structures
    # maybe clear agenda data structures
    self.agenda.clear()
    tokenized_utterance = tokenizer(utterance)
    self.utter_len = self.settings.utter_len = len(tokenized_utterance)
    self.left_buckets = [set() for _ in xrange(self.utter_len + 1)]
    self.right_buckets = [set() for _ in xrange(self.utter_len + 1)]
    self.initialize_agenda(tokenized_utterance)
def scrub_award(award):
    english_stop = set(stopwords.words('english'))
    gg_stop = ['best', 'performance', 'motion', 'picture', 'made', 'original']
    tknzed = tokenizer().tokenize(award)
    clean_award = []
    for tkn in tknzed:
        if not any([tkn in stop for stop in [english_stop, gg_stop]]) and tkn.isalpha():
            clean_award.append(tkn)
    return clean_award
def prepare_data(source, mt, target, source_tagged, mt_tagged):
    source = source.lower()
    mt = mt.lower()
    target = target.lower()
    source_words = collect(source, source_tagged)
    mt_words = collect(mt, mt_tagged)
    mt_align = align(source_words, mt_words)
    mt_tokens = tokenizer(mt)
    return source, mt, target, mt_align, mt_tokens
def main(year):
    print('Importing %s tweets...' % year)
    with open('data/gg' + year + '.json', 'r') as f:
        db = json.load(f)
    f.close()
    print('Cleaning tweets...')
    tweets = []
    size = len(db)
    for n, tweet in enumerate(db):
        prog_print(n, size, 40)
        tweets.append({
            'clean': clean_tweet(tokenizer().tokenize(tweet['text'])),
            'raw': tokenizer().tokenize(tweet['text'])
        })
    print('Saving cleaned tweets...')
    with open('data/clean_gg' + year + '.json', 'w+') as clean_file:
        json.dump(tweets, clean_file)
    clean_file.close()
def tokenization(document: str) -> List[Sentence]:
    """Tokenizer function.

    Args:
        document: String whose lines contain tokens separated by single spaces.

    Returns:
        List of tokenized sentences.
    """
    return [
        Sentence(sentence_str)
        for sentence_str in tokenizer(document, pattern=r"\n+", gaps=True)
    ]
def find_context(entries, lines):
    context = {}
    for e in entries:
        context[tuple(e)] = [[], []]
        for line in lines:
            try:
                s = tokenizer(line[0])
                mt = tokenizer(line[1])
                t = tokenizer(line[2])
            except:
                continue
            if e[0] in s and e[1] in mt and e[2] in t:
                s_ind = s.index(e[0])
                mt_ind = mt.index(e[1])
                t_ind = t.index(e[2])
                if mt[0:mt_ind] == t[0:t_ind] and mt[mt_ind + 1:] == t[t_ind + 1:]:
                    context[tuple(e)][0].append(
                        [' '.join(s[0:s_ind]), ' '.join(s[s_ind + 1:])])
                    context[tuple(e)][1].append(
                        [' '.join(mt[0:mt_ind]), ' '.join(mt[mt_ind + 1:])])
    # print(context)
    cleaned_context = {}
    for key, value in context.items():
        if value != [[], []]:
            cleaned_context['\t'.join(key)] = value
    # print(cleaned_context)
    return cleaned_context
def tokenize(tweet, method):
    """
    tokenizes a tweet based on certain rules
    :param tweet: a string representing the tweet
    :param method: type of tokenization
    :return: a list of tokens
    """
    if method == 'normal':
        return tweet.split(' ')
    if method == 'twitter':
        from nltk.tokenize import TweetTokenizer as tokenizer
        return tokenizer().tokenize(tweet)
    else:
        raise ValueError(method + ' not available for tokenization.')
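# Usage sketch for tokenize() above (not part of the original snippet).
print(tokenize('so excited for the #GoldenGlobes tonight!', 'twitter'))
# TweetTokenizer keeps the hashtag intact and splits off the '!':
# [..., '#GoldenGlobes', 'tonight', '!']
print(tokenize('so excited for the #GoldenGlobes tonight!', 'normal'))
# plain split on spaces: [..., '#GoldenGlobes', 'tonight!']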
def main(tweets, award, sw, is_person):
    if 'cecil' in award:
        return []
    nominee_candidates = {}
    nominee_sw = [
        'nominee', 'nominated', 'nomination', 'nominate', 'nominees',
        'nominations'
    ]
    if is_person:
        low, high = 2, 3
    else:
        low, high = 1, 3
    tkn_award = tokenizer().tokenize(award)
    for tweet in tweets:
        trash = True
        lower_tweet = [x.lower() for x in tweet['clean']]
        if any([s in lower_tweet for s in nominee_sw]):
            lower_raw = [tkn.lower() for tkn in tweet['raw']]
            clean_tweet = [
                tkn for tkn in lower_tweet if all(
                    [tkn not in stop for stop in [sw, nominee_sw, tkn_award]])
            ]
            for i in range(low, high):
                for phrase in helpers.ngrams(clean_tweet, i):
                    front = lower_raw.index(phrase[0])
                    back = lower_raw.index(phrase[-1]) + 1
                    if is_person and back - front != i:
                        continue
                    name = ' '.join(lower_raw[front:back])
                    if name in award:
                        continue
                    if name in nominee_candidates:
                        nominee_candidates[name] += 1
                    else:
                        nominee_candidates[name] = 1
    rankings = [(name, v) for name, v in sorted(
        nominee_candidates.items(), key=lambda item: item[1])]
    rankings.reverse()
    nominees = [n[0] for n in rankings[:6]]
    return nominees
def parse_ingredients(ingredients):
    sw = set(stopwords.words('english'))
    parsed_ingredients = []
    for i in ingredients:
        tknzd = tokenizer().tokenize(i)
        if '/' in tknzd[1]:
            quantity = tknzd[0] + ' ' + tknzd[1]
        elif hasNumbers(tknzd[0]):
            quantity = tknzd[0]
        else:
            quantity = tknzd[1]
        measurement = get_measurement(tknzd, quantity)
        name = get_name(i, measurement)
        if not name:
            continue
        pos = nltk.pos_tag(name.split())
        nom = []
        for k in pos:
            if k[1] == 'NNS':
                nom.append(k[0])
            if k[1] == 'NN':
                nom.append(k[0])
            if k[0] == 'and':
                nom.append(k[0])
        n = ''
        for j in nom:
            if j not in sw:
                n = n + j + ' '
        pos = get_descriptor(name)
        prep = get_prep(name, i, measurement)
        parsed_ingredients.append({
            'name': n.strip(' '),
            'quantity': str(quantity),
            'measurement': measurement,
            'descriptor': pos,
            'preparation': prep
        })
    return parsed_ingredients
def main(tweets, nominees, award, sw):
    # key words indicating presenters
    presenter_candidates = {}
    presenter_sw = [
        'present', 'presenter', 'presentation', 'presenting', 'presenta',
        'presents', 'introduce', 'introduced', 'introducing', 'hand', 'hands',
        'handing'
    ]
    tkn_award = tokenizer().tokenize(award)
    for tweet in tweets:
        lower_tweet = [x.lower() for x in tweet['clean']]
        if any([s in lower_tweet for s in presenter_sw]):
            lower_raw = [tkn.lower() for tkn in tweet['raw']]
            clean_tweet = [
                tkn for tkn in lower_tweet if all([
                    tkn not in stop for stop in [sw, presenter_sw, tkn_award]
                ])
            ]
            for i in range(2, 3):
                for phrase in helpers.ngrams(clean_tweet, i):
                    front = lower_raw.index(phrase[0])
                    back = lower_raw.index(phrase[-1]) + 1
                    if back - front != i:
                        continue
                    name = ' '.join(lower_raw[front:back])
                    if name in nominees or name in award:
                        continue
                    if name in presenter_candidates:
                        presenter_candidates[name] += 1
                    else:
                        presenter_candidates[name] = 1
    rankings = [
        name for name, v in sorted(
            presenter_candidates.items(), key=lambda item: item[1])
    ]
    rankings.reverse()
    if not len(rankings):
        return ''
    return rankings[0]
def main(year):
    with open('data/gg%s.json' % year, 'r') as f:
        db = json.load(f)
    f.close()
    # Note: the original ''.split(punctuation) produced [''], leaving punctuation
    # out of the stop set.
    punc = list(punctuation)
    english_stop = stopwords.words('english')
    gg_stop = [
        'goldenglobes', '#goldenglobes', '#goldenglobe', 'golden', 'globes',
        'globe'
    ]
    twitter_stop = ['&', 'rt']
    stops = set(english_stop + gg_stop + twitter_stop + punc)
    best_dressed = []
    for tweet in db:
        text = [
            w.lower() for w in tokenizer().tokenize(tweet['text'])
            if w.lower() not in stops and re.fullmatch(r'[a-z]+', w.lower())
        ]
        if all([word in text for word in ['best', 'dressed']]):
            best_dressed.append(list(nltk.bigrams(text)))
    clean_bigrams = []
    # only look at tweets with best & dressed
    word_set = set(['best', 'dressed'])
    for bigram_list in best_dressed:
        for bigram in bigram_list:
            if not set(bigram).intersection(word_set):
                clean_bigrams.append(' '.join(bigram))
    top = Counter(clean_bigrams).most_common(50)
    # make sure top bigrams are proper nouns
    for i in top:
        tagged_name = nltk.pos_tag(
            [word.capitalize() for word in i[0].split()])
        if all([tkn[1] == 'NNP' for tkn in tagged_name]):
            return ' '.join([tkn[0] for tkn in tagged_name])
    return 'Nobody dressed well'
def main(tweets, award, sw, is_person):
    winner_candidates = {}
    winner_sw = [
        'won', 'winner', 'winning', 'win', 'wins', 'recieve', 'recieves',
        'recieving', 'recieved', 'congrats', 'congratulations', 'receives',
        'received', 'receiving', 'honored', 'honoured', 'accepting',
        'accepts', 'accepted', 'speech'
    ]
    tkn_award = [tkn.lower() for tkn in tokenizer().tokenize(award)]
    stops = winner_sw + tkn_award + sw
    if is_person:
        low, high = 2, 3
    else:
        low, high = 1, 4
    for tweet in tweets:
        lower_tweet = [tkn.lower() for tkn in tweet['clean']]
        # `kw` avoids shadowing the `sw` stop-list parameter
        if any([kw in lower_tweet for kw in winner_sw]):
            lower_raw = [tkn.lower() for tkn in tweet['raw']]
            clean_tweet = [tkn for tkn in lower_tweet if tkn not in stops]
            for i in range(low, high):
                for phrase in helpers.ngrams(clean_tweet, i):
                    front = lower_raw.index(phrase[0])
                    back = lower_raw.index(phrase[-1]) + 1
                    # if is_person and back - front != i:
                    #     continue
                    name = ' '.join(lower_raw[front:back])
                    if name in winner_candidates:
                        winner_candidates[name] += 1
                    else:
                        winner_candidates[name] = 1
    rankings = [(name, v) for name, v in sorted(
        winner_candidates.items(), key=lambda item: item[1])]
    rankings.reverse()
    if not len(rankings):
        return ''
    return rankings[0][0]
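# The candidate-mining functions above all rely on helpers.ngrams(tokens, n),
# which is not shown in this listing. They only need it to yield every run of n
# consecutive tokens; a minimal stand-in (hypothetical, not the project's actual
# helper) could look like this:
def ngrams(tokens, n):
    """Return all contiguous n-token windows of `tokens`."""
    return [tokens[i:i + n] for i in range(len(tokens) - n + 1)]

# e.g. ngrams(['jodie', 'foster', 'wins'], 2) -> [['jodie', 'foster'], ['foster', 'wins']]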
def compute_prob(self, sentence):
    tokens = []
    tokens.append('<S>')
    tokens.extend(tokenizer(sentence))
    tokens.append('</S>')
    # Set all variables
    targets = np.zeros([BATCH_SIZE, NUM_TIMESTEPS], np.int32)
    weights = np.ones([BATCH_SIZE, NUM_TIMESTEPS], np.float32)
    inputs = np.zeros([BATCH_SIZE, NUM_TIMESTEPS], np.int32)
    char_ids_inputs = np.zeros(
        [BATCH_SIZE, NUM_TIMESTEPS, self.vocab.max_word_length], np.int32)
    for curr_token_index in range(len(tokens) - 1):
        inputs[0, 0] = self.vocab.word_to_id(tokens[curr_token_index])
        char_ids_inputs[0, 0, :] = self.vocab.word_to_char_ids(
            tokens[curr_token_index])
        softmax = self.session.run(
            self.model['softmax_out'],
            feed_dict={
                self.model['char_inputs_in']: char_ids_inputs,
                self.model['inputs_in']: inputs,
                self.model['targets_in']: targets,
                self.model['target_weights_in']: weights
            })
        next_token_id = self.vocab.word_to_id(tokens[curr_token_index + 1])
        logging.info(
            'Pr %s = %.5f' %
            (tokens[curr_token_index + 1], softmax[0][next_token_id]))
    # Reset model states
    self.session.run(self.model['states_init'])
def main(year, awards):
    print('Sorting Nominees...')
    with open("data/gg%s.json" % year, 'r') as f:
        db = json.load(f)
    f.close()
    twitter_stop = [
        '&', 'rt', 'Golden', 'Globes', 'Best', 'best', 'GoldenGlobes'
    ]
    nominees_sw = [
        'movie', 'tv', 'miniseries', 'win', 'wins', 'goes', 'winner', 'won',
        'lose', 'lost', 'nominated', 'golden', 'globes', '#GoldenGlobes',
        '#RT', '#goldenglobes', 'goldenglobes', 'globe', 'nominee', 'present',
        'nominations', 'nomination', 'nominees'
    ]
    award_sw = [
        'performance', 'motion', 'picture', 'original', 'role', 'award',
        'made', 'mini-series', 'series'
    ]
    stops = list(sw) + twitter_stop + nominees_sw + award_sw + [
        x for x in punctuation
    ]
    clean_person_awards = {}
    clean_gender_awards = {}
    for award in awards:
        clean_award = [(tkn if not tkn == 'television' else 'series')
                       for tkn in tokenizer().tokenize(award)
                       if tkn not in stops]
        if 'actor' in clean_award or 'actress' in clean_award:
            clean_gender_awards[award] = clean_award
        else:
            clean_person_awards[award] = clean_award

    def clean_tweet(tkns):
        no_sw = []
        for i in tkns:
            if not i in sw:
                if not i in twitter_stop:
                    if not i in nominees_sw:
                        if not i in string.punctuation:
                            if not '//t.co/' in i:
                                no_sw.append(i)
        return no_sw

    clean_tweets_noms_13 = []
    for tweet in db:
        text = tweet['text']
        unclean_tokens = tokenizer().tokenize(text)
        clean_tweets_noms_13.append(clean_tweet(unclean_tokens))

    def subset_noms(query, cleaned_tweets):
        noms = []
        for tweet in cleaned_tweets:
            if all([word in tweet for word in query]):
                noms.append(tweet)
        return noms

    def bi_grams(subset):
        bi_grams = []
        for tweet in subset:
            bi_grams.append(list(nltk.bigrams(tweet)))
        flat = [item for sublist in bi_grams for item in sublist]
        return flat

    def propers(flat_list):
        proper = []
        for i in range(0, len(flat_list)):
            pos = nltk.pos_tag(flat_list[i])
            if ((pos[0][0][0]).isupper()) and ((pos[1][0][0]).isupper()):
                if not (pos[0][0].isupper()) and not (pos[1][0].isupper()):
                    proper.append(flat_list[i])
        return proper

    def person_filter(ranked_list):
        updated_person_noms = []
        for i in ranked_list:
            first_name = i[0][0]
            last_name = i[0][1]
            name = first_name + ' ' + last_name
            doc = nlp(name)
            person_test = [(X.text, X.label_) for X in doc.ents]
            if not person_test:
                continue
            if person_test[0][1] == 'PERSON':
                updated_person_noms.append(i)
        return updated_person_noms

    def gender_person_filter(ranked_list, gender):
        updated_person_noms = []
        for i in range(0, len(ranked_list)):
            first_name = top_list[i][0][0]
            last_name = top_list[i][0][1]
            name = first_name + ' ' + last_name
            doc = nlp(name)
            person_test = [(X.text, X.label_) for X in doc.ents]
            if name == 'Christoph Waltz':
                updated_person_noms.append(top_list[i])
            if name == 'Mandy Patinkin':
                updated_person_noms.append(top_list[i])
            if not person_test:
                continue
            if person_test[0][1] == 'PERSON':
                if guess_gender(first_name) == gender:
                    updated_person_noms.append(top_list[i])
        return updated_person_noms

    def guess_gender(name):
        d = gender.Detector()
        return d.get_gender(name)

    def clean_ranked(lst):
        final_bigrams = []
        for n in lst:
            final_bigrams.append(n[0])
        try:
            top_firstname = final_bigrams[0][0]
            top_lastname = final_bigrams[0][1]
        except IndexError:
            return ['']
        no_dups = []
        no_dups.append(final_bigrams[0])
        for b in range(1, len(final_bigrams)):
            if final_bigrams[b][0] != top_firstname:
                if final_bigrams[b][0] != top_lastname:
                    if final_bigrams[b][1] != top_firstname:
                        if final_bigrams[b][1] != top_lastname:
                            no_dups.append(final_bigrams[b])
        full_name = []
        for j in range(0, len(no_dups)):
            full_name.append(no_dups[j][0] + ' ' + no_dups[j][1])
        return full_name

    def clean_ranked_gender(lst):
        final_bigrams = []
        for n in range(0, len(lst)):
            final_bigrams.append(lst[n][0])
        full_name = []
        for j in range(0, len(final_bigrams)):
            full_name.append(final_bigrams[j][0] + ' ' + final_bigrams[j][1])
        return full_name

    nominees = {}
    for key, value in clean_person_awards.items():
        post_query = subset_noms(value, clean_tweets_noms_13)
        top_list = Counter(propers(bi_grams(post_query))).most_common(15)
        nominees[key] = clean_ranked(person_filter(top_list))
    for key, value in clean_gender_awards.items():
        post_query = subset_noms(value, clean_tweets_noms_13)
        top_list = Counter(propers(bi_grams(post_query))).most_common(15)
        if value[0] == 'actor':
            nominees[key] = clean_ranked_gender(
                gender_person_filter(top_list, 'male'))
        else:
            nominees[key] = clean_ranked_gender(
                gender_person_filter(top_list, 'female'))
    return nominees
file = file.read().strip('\n').split('\n')
for i in range(len(file)):
    file[i] = file[i].strip('(\'')
    file[i] = file[i].strip('\')')
    file[i] = file[i].split('\', \'')
    lines.append(file[i])
context = {}
for e in entries:
    context[tuple(e)] = [[], []]
    for line in lines:
        try:
            s = tokenizer(line[0])
            mt = tokenizer(line[1])
            t = tokenizer(line[2])
            if mt[0] == t[0] and mt[2] == t[2] and e[0] in s and e[1] in mt and e[2] in t:
                context[tuple(e)][0].append([s[0], s[2]])
                context[tuple(e)][1].append([mt[0], mt[2]])
        except:
            pass
"""for key, value in context.items():
    value[0] = Counter(value[0])
    del value[0][key[0]]
    del value[0][',']
    del value[0]['.']
    sys.exit(0)

# load labels of entities
labels = {}
entityType = entityTypes[txtType]
filename = 'data/labels_' + entityType + '.lst'
with open(filename) as inFile:
    lines = inFile.read().decode('utf8').strip().split('\n')
    for line in lines:
        label, uri = line.split('\t')
        labels[label] = uri

# load + tokenize + extract entities from text file
source = os.path.basename(txtFilename)
text = open(txtFilename).read().decode('utf8')
tokens = tokenizer().tokenize(text)  # list of tokens from NLTK Tokenizer
pos = list(tokenizer().span_tokenize(text))  # list of positions (start,end)
sep = ' '  # word separator; may be language-dependent
lastpos = 0  # lastpos position
for i, w in enumerate(tokens):
    if len(w) >= 3:  # ignore less than 3 chars
        w3 = sep.join(tokens[i:i + 3])
        if tokens[i + 1:i + 2] == "-":  # check if compound words
            w3 = ''.join(tokens[i:i + 3])
        w2 = sep.join(tokens[i:i + 2])
        w1 = w
        w3 = w3.title() if w3.isupper() else w3  # NEW YORK CITY => New York City
        w2 = w2.title() if w2.isupper() else w2
        w1 = w1.title() if w1.isupper() else w1
        comp = ['', w1, w2, w3]
        if i + 2 < len(tokens) and comp[3] in labels:  # 3 words
def apply_postedits(source, mt, target, source_tagged, mt_tagged,
                    target_tagged, s_lang, t_lang, postedits):
    with open('%s-%s_corrected.txt' % (s_lang, t_lang), 'w',
              encoding='utf-8') as file:
        for i in range(len(source)):
            try:
                source[i] = source[i].lower()
                mt[i] = mt[i].lower()
                target[i] = target[i].lower()
                source_words = collect(source[i], source_tagged[i])
                mt_words = collect(mt[i], mt_tagged[i])
                mt_align = align(source_words, mt_words)
                m = tokenizer(mt[i])
                edits = {}
                for elem in mt_align:
                    elem = tuple(elem)
                    for operation in postedits:
                        if elem[0] == operation[0] and elem[1] == operation[1]:
                            if elem in edits.keys():
                                edits[elem].append(operation)
                            else:
                                edits[elem] = [operation]
                b = product(*list(edits.values()))
                variants = []
                checked = []
                for elem in b:
                    v = []
                    for j in range(len(m)):
                        for k in elem:
                            if m[j] == k[1]:
                                v.append(k[2])
                                checked.append(m[j])
                            else:
                                c = 0
                                for k2 in edits.keys():
                                    if m[j] in k2:
                                        c += 1
                                if c == 0 and (len(v) == 0 or len(v) > 0
                                               and v[-1] != m[j]) and m[j] not in checked:
                                    v.append(m[j])
                    variants.append(v)
                file.write('S\t%s\nMT\t%s\n' % (source[i], mt[i]))
                for v in variants:
                    file.write('ED\t%s\n' % (' '.join(v)))
                file.write('T\t%s\n\n' % (target[i]))
            except:
                pass
def token_and_stem(language="english"):
    stemmer = SnowballStemmer(language)
    tokens = tokenizer()

    def stemmed_tokens(description, row):
        return [stemmer.stem(token) for token in tokens.tokenize(description)]

    return stemmed_tokens
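# Usage sketch for token_and_stem() above (not from the original source). The
# listing never shows what `tokenizer` is; this demo assumes a tokenizer class
# such as nltk.tokenize.TreebankWordTokenizer, which needs no downloaded data.
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TreebankWordTokenizer as tokenizer

stem_fn = token_and_stem()
print(stem_fn("Running dogs were barking loudly", row=None))
# roughly: ['run', 'dog', 'were', 'bark', 'loud']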
def sent2seq(self, sentence):
    sentence = sentence.lower()
    tokens = tokenizer(sentence)
    seq = [self.word_dict[token] if token in self.word_dict else 1
           for token in tokens]
    seq = [w if w < self.options['n_words_src'] else 1 for w in seq]
    return seq