Example #1
def tokenization(document: str) -> List[List[str]]:
    """Tokenizer function.

    Args:
      document: String with lines of tokens separated by single spaces.

    Returns:
      List of tokenized sentences.
    """
    return [
        tokenizer(sentence, pattern=r'\s+', gaps=True)
        for sentence in tokenizer(document, pattern=r'\n+', gaps=True)
    ]
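
A minimal usage sketch (not part of the original snippet), assuming tokenizer is nltk.tokenize.regexp_tokenize imported under that name and List comes from typing:

from typing import List
from nltk.tokenize import regexp_tokenize as tokenizer

doc = 'the cat sat\non the mat'
print(tokenization(doc))
# -> [['the', 'cat', 'sat'], ['on', 'the', 'mat']]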
Example #2
def collect(sentence, sentence_tagged):
    sentence_words = {}
    tokens = tokenizer(sentence.lower())

    sentence_tagged = sentence_tagged.split(' ')

    for i in range(len(sentence_tagged)):
        if len(re.findall(r'/\*', sentence_tagged[i])) > 1:
            word = re.search(r'/\*(.*?)/\*', sentence_tagged[i]).group(1)
            word = re.sub(r'\$', '', word)
            word = re.sub(r'\^', '', word)
            word = '^' + word + '/*' + word + '$'
            sentence_tagged[i] = word

    sentence_tagged = ' '.join(sentence_tagged)

    units = parse(c for c in sentence_tagged)
    counter = 0

    try:
        for unit in units:
            sentence_words[counter] = [
                tokens[counter], set(unit.readings[0][0][1])
            ]
            counter += 1
    except IndexError:
        # The tokenized sentence and the tagged parse can disagree in length;
        # keep whatever aligned pairs were collected so far.
        pass

    return sentence_words
Example #3
def main(year):
    print('Importing %s tweets...' % year)
    with open('data/gg' + year + '.json', 'r') as f:
        db = json.load(f)

    print('Cleaning tweets...')
    tweets = [{
        'clean': clean_tweet(tokenizer().tokenize(tweet['text'])),
        'raw': tokenizer().tokenize(tweet['text'])
    } for tweet in db]

    print('Saving cleaned tweets...')
    with open('data/clean_gg' + year + '.json', 'w+') as clean_file:
        json.dump(tweets, clean_file)
Example #4
def apply_postedits(mt_string, bd_postedits, grammar_postedits,
                    other_postedits):
    tokens = tokenizer(mt_string)

    for i in range(len(tokens)):
        # tokens[i - 1] silently wraps around to the last token when i == 0,
        # so guard the boundaries explicitly instead of relying on exceptions.
        prev_word = tokens[i - 1] if i > 0 else 'None'
        next_word = tokens[i + 1] if i + 1 < len(tokens) else 'None'

        tokens[i] = compare_bd(tokens[i], bd_postedits)
        tokens[i] = compare_other(tokens[i], prev_word, next_word,
                                  grammar_postedits)
        tokens[i] = compare_other(tokens[i], prev_word, next_word,
                                  other_postedits)

    pe_string = ' '.join(tokens)
    pe_string = clean_edited_string(pe_string)

    print(pe_string)
Example #5
def main(year):
    print('\nSearching for awards...')

    # Import twitter data
    with open('data/gg%s.json' % year, 'r') as f:
        data = json.load(f)

    # Generate a list of all stopwords
    punc = list(punctuation)
    english_stop = stopwords.words('english')
    gg_stop = [
        'goldenglobes', '#goldenglobes', '#goldenglobe', 'golden', 'globes',
        'globe'
    ]
    twitter_stop = ['&', 'rt']
    stops = set(english_stop + gg_stop + twitter_stop + punc)

    award_candidates = {}

    unique_tweets = set(d['text'] for d in data)
    size = len(unique_tweets)

    for n, tweet in enumerate(unique_tweets):
        helpers.prog_print(n, size)

        # Generate all relevant forms of the tweet
        tkn_tweet = tokenizer().tokenize(tweet)
        lower_tweet = [tkn.lower() for tkn in tkn_tweet]
        clean_tweet = [x for x in lower_tweet]
        for sw in set(clean_tweet).intersection(stops):
            clean_tweet.remove(sw)

        if 'best' in clean_tweet:
            tagged_tweet = nltk.pos_tag(clean_tweet)
            for i in range(2, 8):
                ind = clean_tweet.index('best')

                # If we hit the end of the tweet or the last word in the segment isn't a noun, we don't need to look at it
                if ind + i > len(clean_tweet):
                    break
                if 'NN' not in tagged_tweet[ind + i - 1]:
                    continue

                # Find the segment in the uncut tweet, so we have the stopwords
                front, back = lower_tweet.index('best'), lower_tweet.index(
                    clean_tweet[ind + i - 1])

                # Piece it together and add it to the candidates list
                name = ' '.join(lower_tweet[front:back + 1])
                if name in award_candidates:
                    award_candidates[name] += 1
                else:
                    award_candidates[name] = 1

    # Sort dict by number of appearances
    rankings = [(name, v) for name, v in sorted(award_candidates.items(),
                                                key=lambda item: item[1])]
    rankings.reverse()

    return [i[0] for i in rankings if i[1] > 80 and i[0]]
Example #6
def collect(sentence, sentence_tagged):
    """
    Collect a dictionary that maps each word's position in the sentence
    to its token and tag set.
    """

    sentence_words = {}
    tokens = tokenizer(sentence.lower())

    sentence_tagged = sentence_tagged.split(' ')

    for i in range(len(sentence_tagged)):
        if len(re.findall(r'/\*', sentence_tagged[i])) > 1:
            word = re.search(r'/\*(.*?)/\*', sentence_tagged[i]).group(1)
            word = re.sub(r'\$', '', word)
            word = re.sub(r'\^', '', word)
            word = '^' + word + '/*' + word + '$'
            sentence_tagged[i] = word

    sentence_tagged = ' '.join(sentence_tagged)

    units = parse(c for c in sentence_tagged)
    counter = 0

    for unit in units:
        sentence_words[counter] = [
            tokens[counter], set(unit.readings[0][0][1])
        ]
        counter += 1

    return sentence_words
Example #7
def classify(text, clf, prob=True):
    words = tokenizer(text)

    feats = {word: True for word in words}

    if prob:
        c = clf.prob_classify(feats)
        return {'pos': c.prob('pos'), 'neg': c.prob('neg')}
    else:
        return clf.classify(feats)
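
A hedged usage sketch, assuming tokenizer is nltk.tokenize.word_tokenize and clf is an NLTK NaiveBayesClassifier trained on the same kind of bag-of-words feature dicts with the labels 'pos' and 'neg':

classify('a wonderful, moving film', clf)               # e.g. {'pos': 0.91, 'neg': 0.09}
classify('a wonderful, moving film', clf, prob=False)   # e.g. 'pos'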
Example #8
    def _on_start(self, utterance):
        # do all on start things
        # maybe clear all chart data structures
        # maybe clear agenda data structures
        self.agenda.clear()
        tokenized_utterance = tokenizer(utterance)
        self.utter_len = self.settings.utter_len = len(tokenized_utterance)
        self.left_buckets = [set() for _ in xrange(self.utter_len + 1)]
        self.right_buckets = [set() for _ in xrange(self.utter_len + 1)]
        self.initialize_agenda(tokenized_utterance)
Example #9
def scrub_award(award):
	english_stop = set(stopwords.words('english'))
	gg_stop = ['best', 'performance', 'motion', 'picture', 'made', 'original']

	tknzed = tokenizer().tokenize(award)
	clean_award = []
	for tkn in tknzed:
		if not any([tkn in stop for stop in [english_stop, gg_stop]]) and tkn.isalpha():
			clean_award.append(tkn)
	return clean_award
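
Roughly what this produces, assuming tokenizer is NLTK's TweetTokenizer and the standard NLTK English stopword list:

scrub_award('best performance by an actress in a motion picture - drama')
# -> ['actress', 'drama']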
Example #12
def prepare_data(source, mt, target, source_tagged, mt_tagged):
    source = source.lower()
    mt = mt.lower()
    target = target.lower()

    source_words = collect(source, source_tagged)
    mt_words = collect(mt, mt_tagged)
    mt_align = align(source_words, mt_words)
    mt_tokens = tokenizer(mt)

    return source, mt, target, mt_align, mt_tokens
Example #13
def main(year):
    print('Importing %s tweets...' % year)
    with open('data/gg' + year + '.json', 'r') as f:
        db = json.load(f)

    print('Cleaning tweets...')
    tweets = []
    size = len(db)

    for n, tweet in enumerate(db):
        prog_print(n, size, 40)
        tweets.append({
            'clean': clean_tweet(tokenizer().tokenize(tweet['text'])),
            'raw': tokenizer().tokenize(tweet['text'])
        })

    print('Saving cleaned tweets...')
    with open('data/clean_gg' + year + '.json', 'w+') as clean_file:
        json.dump(tweets, clean_file)
Example #14
def tokenization(document: str) -> List[Sentence]:
    """Tokenizer function.

    Args:
      document: String with lines of tokens separated by single spaces.

    Returns:
      List of tokenized sentences.
    """
    return [
        Sentence(sentence_str)
        for sentence_str in tokenizer(document, pattern=r"\n+", gaps=True)
    ]
Example #15
def find_context(entries, lines):
    context = {}

    for e in entries:
        context[tuple(e)] = [[], []]

        for line in lines:

            try:
                s = tokenizer(line[0])
                mt = tokenizer(line[1])
                t = tokenizer(line[2])
            except IndexError:
                # Skip lines that do not have source, MT and target fields.
                continue

            if e[0] in s and e[1] in mt and e[2] in t:
                s_ind = s.index(e[0])
                mt_ind = mt.index(e[1])
                t_ind = t.index(e[2])

                if mt[0:mt_ind] == t[0:t_ind] and mt[mt_ind + 1:] == t[t_ind +
                                                                       1:]:
                    context[tuple(e)][0].append(
                        [' '.join(s[0:s_ind]), ' '.join(s[s_ind + 1:])])
                    context[tuple(e)][1].append(
                        [' '.join(mt[0:mt_ind]), ' '.join(mt[mt_ind + 1:])])

                    #print(context)

    cleaned_context = {}

    for key, value in context.items():
        if value != [[], []]:
            cleaned_context['\t'.join(key)] = value

    #print(cleaned_context)

    return cleaned_context
Example #16
def tokenize(tweet, method):
    """
    Tokenizes a tweet based on certain rules.
    :param tweet: a string representing the tweet
    :param method: type of tokenization ('normal' or 'twitter')
    :return: a list of tokens
    """
    if method == 'normal':
        return tweet.split(' ')
    elif method == 'twitter':
        from nltk.tokenize import TweetTokenizer as tokenizer
        return tokenizer().tokenize(tweet)
    else:
        raise ValueError(method + ' not available for tokenization.')
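
A short usage sketch with a made-up tweet; TweetTokenizer keeps hashtags and emoticons intact, while the 'normal' method only splits on spaces:

tokenize('Loving the #GoldenGlobes tonight! :)', 'twitter')
# -> ['Loving', 'the', '#GoldenGlobes', 'tonight', '!', ':)']
tokenize('Loving the #GoldenGlobes tonight! :)', 'normal')
# -> ['Loving', 'the', '#GoldenGlobes', 'tonight!', ':)']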
Example #17
def main(tweets, award, sw, is_person):
    if 'cecil' in award:
        return []

    nominee_candidates = {}
    nominee_sw = [
        'nominee', 'nominated', 'nomination', 'nominate', 'nominees',
        'nominations'
    ]

    if is_person:
        low, high = 2, 3
    else:
        low, high = 1, 3
    tkn_award = tokenizer().tokenize(award)

    for tweet in tweets:
        lower_tweet = [x.lower() for x in tweet['clean']]
        if any([s in lower_tweet for s in nominee_sw]):
            lower_raw = [tkn.lower() for tkn in tweet['raw']]
            clean_tweet = [
                tkn for tkn in lower_tweet if all(
                    [tkn not in stop for stop in [sw, nominee_sw, tkn_award]])
            ]

            for i in range(low, high):
                for phrase in helpers.ngrams(clean_tweet, i):
                    front = lower_raw.index(phrase[0])
                    back = lower_raw.index(phrase[-1]) + 1
                    if is_person and back - front != i:
                        continue

                    name = ' '.join(lower_raw[front:back])

                    if name in award:
                        continue
                    if name in nominee_candidates:
                        nominee_candidates[name] += 1
                    else:
                        nominee_candidates[name] = 1

    rankings = [(name, v) for name, v in sorted(nominee_candidates.items(),
                                                key=lambda item: item[1])]
    rankings.reverse()
    nominees = [n[0] for n in rankings[:6]]

    return nominees
Example #18
def parse_ingredients(ingredients):
	sw = set(stopwords.words('english'))

	parsed_ingredients = []
	for i in ingredients:
		tknzd = tokenizer().tokenize(i)

		if '/' in tknzd[1]:
			quantity = tknzd[0] + ' ' + tknzd[1]

		elif hasNumbers(tknzd[0]):
			quantity = tknzd[0]
		else:
			quantity = tknzd[1]

		measurement = get_measurement(tknzd, quantity)

		name = get_name(i, measurement)
		if not name:
			continue
		pos = nltk.pos_tag(name.split())
		nom = []
		for k in pos:
			if k[1] == 'NNS':
				nom.append(k[0])
			if k[1] == 'NN':
				nom.append(k[0])
			if k[0] == 'and':
				nom.append(k[0])
		n = ''
		for j in nom:
			if j not in sw:
				n = n + j + ' '

		pos = get_descriptor(name)

		prep = get_prep(name, i, measurement)

		parsed_ingredients.append({
			'name': n.strip(' '),
			'quantity': str(quantity),
			'measurement': measurement,
			'descriptor': pos,
			'preparation': prep
		})

	return parsed_ingredients
Example #19
def main(tweets, nominees, award, sw):
    #key words indicating presenters
    presenter_candidates = {}
    presenter_sw = [
        'present', 'presenter', 'presentation', 'presenting', 'presenta',
        'presents', 'introduce', 'introduced', 'introducing', 'hand', 'hands',
        'handing'
    ]
    tkn_award = tokenizer().tokenize(award)

    for tweet in tweets:
        lower_tweet = [x.lower() for x in tweet['clean']]
        if any([s in lower_tweet for s in presenter_sw]):
            lower_raw = [tkn.lower() for tkn in tweet['raw']]
            clean_tweet = [
                tkn for tkn in lower_tweet if all([
                    tkn not in stop for stop in [sw, presenter_sw, tkn_award]
                ])
            ]

            for i in range(2, 3):
                for phrase in helpers.ngrams(clean_tweet, i):
                    front = lower_raw.index(phrase[0])
                    back = lower_raw.index(phrase[-1]) + 1
                    if back - front != i:
                        continue

                    name = ' '.join(lower_raw[front:back])

                    if name in nominees or name in award:
                        continue
                    if name in presenter_candidates:
                        presenter_candidates[name] += 1
                    else:
                        presenter_candidates[name] = 1

    rankings = [
        name for name, v in sorted(presenter_candidates.items(),
                                   key=lambda item: item[1])
    ]
    rankings.reverse()

    if not len(rankings):
        return ''

    return rankings[0]
Example #20
def main(year):
    with open('data/gg%s.json' % year, 'r') as f:
        db = json.load(f)

    punc = list(punctuation)
    english_stop = stopwords.words('english')
    gg_stop = [
        'goldenglobes', '#goldenglobes', '#goldenglobe', 'golden', 'globes',
        'globe'
    ]
    twitter_stop = ['&', 'rt']
    stops = set(english_stop + gg_stop + twitter_stop + punc)

    best_dressed = []

    for tweet in db:
        text = [
            w.lower() for w in tokenizer().tokenize(tweet['text']) if
            w.lower() not in stops and re.fullmatch(r'''[a-z]+''', w.lower())
        ]
        if all([word in text for word in ['best', 'dressed']]):
            best_dressed.append(list(nltk.bigrams(text)))

    clean_bigrams = []

    #only look at tweets with best & dressed
    word_set = set(['best', 'dressed'])

    for bigram_list in best_dressed:
        for bigram in bigram_list:
            if not set(bigram).intersection(word_set):
                clean_bigrams.append(' '.join(bigram))

    top = Counter(clean_bigrams).most_common(50)

    # make sure top bigrams are proper nouns
    for i in top:
        tagged_name = nltk.pos_tag(
            [word.capitalize() for word in i[0].split()])
        if all([tkn[1] == 'NNP' for tkn in tagged_name]):
            return ' '.join([tkn[0] for tkn in tagged_name])

    return 'Nobody dressed well'
Example #21
def main(tweets, award, sw, is_person):
	winner_candidates = {}
	winner_sw = ['won', 'winner', 'winning', 'win', 'wins',
				 'recieve', 'recieves', 'recieving', 'recieved',
				 'congrats', 'congratulations',
				 'receives', 'received', 'receiving',
				 'honored', 'honoured',
				 'accepting', 'accepts', 'accepted',
				 'speech']
	tkn_award = [tkn.lower() for tkn in tokenizer().tokenize(award)]
	stops = winner_sw + tkn_award + sw

	if is_person:
		low, high = 2, 3
	else:
		low, high = 1, 4

	for tweet in tweets:
		lower_tweet = [tkn.lower() for tkn in tweet['clean']]
		if any([sw in lower_tweet for sw in winner_sw]):
			lower_raw = [tkn.lower() for tkn in tweet['raw']]
			clean_tweet = [tkn for tkn in lower_tweet if tkn not in stops]

			for i in range(low, high):
				for phrase in helpers.ngrams(clean_tweet, i):
					front = lower_raw.index(phrase[0])
					back = lower_raw.index(phrase[-1]) + 1
					# if is_person and back - front != i:
					# 	continue

					name = ' '.join(lower_raw[front:back])

					if name in winner_candidates:
						winner_candidates[name] += 1
					else:
						winner_candidates[name] = 1

	rankings = [(name, v) for name, v in sorted(winner_candidates.items(), key=lambda item: item[1])]
	rankings.reverse()
	if not len(rankings):
		return ''

	return rankings[0][0]
Example #22
    def compute_prob(self, sentence):

        tokens = []
        tokens.append('<S>')
        tokens.extend(tokenizer(sentence))
        tokens.append('</S>')

        #Set all variables
        targets = np.zeros([BATCH_SIZE, NUM_TIMESTEPS], np.int32)
        weights = np.ones([BATCH_SIZE, NUM_TIMESTEPS], np.float32)
        inputs = np.zeros([BATCH_SIZE, NUM_TIMESTEPS], np.int32)
        char_ids_inputs = np.zeros(
            [BATCH_SIZE, NUM_TIMESTEPS, self.vocab.max_word_length], np.int32)

        for curr_token_index in range(len(tokens) - 1):
            inputs[0, 0] = self.vocab.word_to_id(tokens[curr_token_index])
            char_ids_inputs[0, 0, :] = self.vocab.word_to_char_ids(
                tokens[curr_token_index])

            softmax = self.session.run(self.model['softmax_out'],
                                       feed_dict={
                                           self.model['char_inputs_in']:
                                           char_ids_inputs,
                                           self.model['inputs_in']: inputs,
                                           self.model['targets_in']: targets,
                                           self.model['target_weights_in']:
                                           weights
                                       })

            next_token_id = self.vocab.word_to_id(tokens[curr_token_index + 1])
            logging.info(
                'Pr %s = %.5f' %
                (tokens[curr_token_index + 1], softmax[0][next_token_id]))

        #Reset model states
        self.session.run(self.model['states_init'])
Example #23
def main(year, awards):
    print('Sorting Nominees...')
    with open("data/gg%s.json" % year, 'r') as f:
        db = json.load(f)
    f.close()

    twitter_stop = [
        '&amp;', 'rt', 'Golden', 'Globes', 'Best', 'best', 'GoldenGlobes'
    ]
    nominees_sw = [
        'movie', 'tv', 'miniseries', 'win', 'wins', 'goes', 'winner', 'won',
        'lose', 'lost', 'nominated', 'golden', 'globes', '#GoldenGlobes',
        '#RT', '#goldenglobes', 'goldenglobes', 'globe', 'nominee', 'present',
        'nominations', 'nomination', 'nominees'
    ]
    award_sw = [
        'performance', 'motion', 'picture', 'original', 'role', 'award',
        'made', 'mini-series', 'series'
    ]
    stops = list(sw) + twitter_stop + nominees_sw + award_sw + [
        x for x in punctuation
    ]

    clean_person_awards = {}
    clean_gender_awards = {}
    for award in awards:
        clean_award = [(tkn if not tkn == 'television' else 'series')
                       for tkn in tokenizer().tokenize(award)
                       if tkn not in stops]
        if 'actor' in clean_award or 'actress' in clean_award:
            clean_gender_awards[award] = clean_award
        else:
            clean_person_awards[award] = clean_award

    def clean_tweet(tkns):
        no_sw = []
        for i in tkns:
            if (i not in sw and i not in twitter_stop and i not in nominees_sw
                    and i not in string.punctuation and '//t.co/' not in i):
                no_sw.append(i)

        return no_sw

    clean_tweets_noms_13 = []

    for tweet in db:
        text = tweet['text']
        unclean_tokens = tokenizer().tokenize(text)
        clean_tweets_noms_13.append(clean_tweet(unclean_tokens))

    def subset_noms(query, cleaned_tweets):
        noms = []
        for tweet in cleaned_tweets:
            if all([word in tweet for word in query]):
                noms.append(tweet)
        return noms

    def bi_grams(subset):
        bi_grams = []
        for tweet in subset:
            bi_grams.append(list(nltk.bigrams(tweet)))
        flat = [item for sublist in bi_grams for item in sublist]
        return flat

    def propers(flat_list):
        proper = []
        for i in range(0, len(flat_list)):
            pos = nltk.pos_tag(flat_list[i])
            if (((pos[0][0][0]).isupper()) and ((pos[1][0][0]).isupper())):
                if (not (pos[0][0].isupper()) and (not (pos[1][0].isupper()))):
                    proper.append(flat_list[i])
        return proper

    def person_filter(ranked_list):
        updated_person_noms = []
        for i in ranked_list:
            first_name = i[0][0]
            last_name = i[0][1]
            name = first_name + ' ' + last_name

            doc = nlp(name)
            person_test = ([(X.text, X.label_) for X in doc.ents])

            if not person_test:
                continue
            if person_test[0][1] == 'PERSON':
                updated_person_noms.append(i)
        return updated_person_noms

    def gender_person_filter(ranked_list, gender):
        updated_person_noms = []
        for i in range(0, len(ranked_list)):
            # Use the argument rather than the outer `top_list`, which this
            # closure happened to capture.
            first_name = ranked_list[i][0][0]
            last_name = ranked_list[i][0][1]
            name = first_name + ' ' + last_name

            doc = nlp(name)
            person_test = ([(X.text, X.label_) for X in doc.ents])

            if name == 'Christoph Waltz':
                updated_person_noms.append(ranked_list[i])
            if name == 'Mandy Patinkin':
                updated_person_noms.append(ranked_list[i])
            if not person_test:
                continue
            if (person_test[0][1] == 'PERSON'):
                if guess_gender(first_name) == gender:
                    updated_person_noms.append(ranked_list[i])
        return updated_person_noms

    def guess_gender(name):
        d = gender.Detector()
        return d.get_gender(name)

    def clean_ranked(lst):
        final_bigrams = []
        for n in lst:
            final_bigrams.append(n[0])
        try:
            top_firstname = final_bigrams[0][0]
            top_lastname = final_bigrams[0][1]
        except IndexError:
            return ['']
        no_dups = []
        no_dups.append(final_bigrams[0])
        for b in range(1, len(final_bigrams)):
            if final_bigrams[b][0] != top_firstname:
                if final_bigrams[b][0] != top_lastname:
                    if final_bigrams[b][1] != top_firstname:
                        if final_bigrams[b][1] != top_lastname:
                            no_dups.append(final_bigrams[b])
        full_name = []
        for j in range(0, len(no_dups)):
            full_name.append(no_dups[j][0] + ' ' + no_dups[j][1])
        return full_name

    def clean_ranked_gender(lst):
        final_bigrams = []
        for n in range(0, len(lst)):
            final_bigrams.append(lst[n][0])
        full_name = []
        for j in range(0, len(final_bigrams)):
            full_name.append(final_bigrams[j][0] + ' ' + final_bigrams[j][1])
        return full_name

    nominees = {}

    for key, value in clean_person_awards.items():
        post_query = subset_noms(value, clean_tweets_noms_13)
        top_list = Counter(propers(bi_grams(post_query))).most_common(15)
        nominees[key] = clean_ranked(person_filter(top_list))

    for key, value in clean_gender_awards.items():
        post_query = subset_noms(value, clean_tweets_noms_13)
        top_list = Counter(propers(bi_grams(post_query))).most_common(15)
        if value[0] == 'actor':
            nominees[key] = clean_ranked_gender(
                gender_person_filter(top_list, 'male'))
        else:
            nominees[key] = clean_ranked_gender(
                gender_person_filter(top_list, 'female'))

    return nominees
Example #24
    file = file.read().strip('\n').split('\n')

for i in range(len(file)):
    file[i] = file[i].strip('(\'')
    file[i] = file[i].strip('\')')
    file[i] = file[i].split('\', \'')

    lines.append(file[i])

context = {}

for e in entries:
    context[tuple(e)] = [[], []]
    for line in lines:
        try:
            s = tokenizer(line[0])
            mt = tokenizer(line[1])
            t = tokenizer(line[2])

            if mt[0] == t[0] and mt[2] == t[2] and e[0] in s and e[
                    1] in mt and e[2] in t:
                context[tuple(e)][0].append([s[0], s[2]])
                context[tuple(e)][1].append([mt[0], mt[2]])

        except IndexError:
            # Skip lines that are too short to provide the expected fields.
            pass
"""for key, value in context.items():
	value[0] = Counter(value[0])
	del value[0][key[0]]
	del value[0][',']
	del value[0]['.']
Example #25
    sys.exit(0)

# load labels of entities
labels = {}
entityType = entityTypes[txtType]
filename = 'data/labels_'+entityType+'.lst'
with open(filename) as inFile:
    lines = inFile.read().decode('utf8').strip().split('\n')
    for line in lines:
        label, uri = line.split('\t')
        labels[label] = uri

# load + tokenize + extract entities from text file
source = os.path.basename(txtFilename)
text = open(txtFilename).read().decode('utf8')
tokens = tokenizer().tokenize(text) # list of tokens from NLTK Tokenizer
pos = list(tokenizer().span_tokenize(text)) # list of positions (start,end)
sep = ' ' # word separator; may be language-dependent
lastpos = 0 # last processed position
for i, w in enumerate(tokens):
    if len(w) >= 3: # ignore words shorter than 3 chars
        w3 = sep.join(tokens[i:i+3])
        if tokens[i+1:i+2] == ["-"]: # compound word check (a slice must be compared against a list, not a string)
            w3 = ''.join(tokens[i:i+3])
        w2 = sep.join(tokens[i:i+2])
        w1 = w
        w3 = w3.title() if w3.isupper() else w3 # NEW YORK CITY => New York City
        w2 = w2.title() if w2.isupper() else w2
        w1 = w1.title() if w1.isupper() else w1
        comp = ['', w1, w2, w3]
        if i+2 < len(tokens) and comp[3] in labels: # 3 words
Example #26
def apply_postedits(source, mt, target, source_tagged, mt_tagged,
                    target_tagged, s_lang, t_lang, postedits):
    with open('%s-%s_corrected.txt' % (s_lang, t_lang), 'w',
              encoding='utf-8') as file:
        for i in range(len(source)):
            try:
                source[i] = source[i].lower()
                mt[i] = mt[i].lower()
                target[i] = target[i].lower()

                source_words = collect(source[i], source_tagged[i])
                mt_words = collect(mt[i], mt_tagged[i])
                mt_align = align(source_words, mt_words)
                m = tokenizer(mt[i])

                edits = {}

                for elem in mt_align:
                    elem = tuple(elem)
                    for operation in postedits:
                        if elem[0] == operation[0] and elem[1] == operation[1]:
                            if elem in edits.keys():
                                edits[elem].append(operation)
                            else:
                                edits[elem] = [operation]

                b = product(*list(edits.values()))
                variants = []
                checked = []

                for elem in b:
                    v = []

                    for j in range(len(m)):
                        for k in elem:
                            if m[j] == k[1]:
                                v.append(k[2])
                                checked.append(m[j])
                            else:
                                c = 0

                                for k in edits.keys():
                                    if m[j] in k:
                                        c += 1

                                if c == 0 and (len(v) == 0
                                               or len(v) > 0 and v[-1] != m[j]
                                               ) and m[j] not in checked:
                                    v.append(m[j])

                    variants.append(v)

                file.write('S\t%s\nMT\t%s\n' % (source[i], mt[i]))

                for v in variants:
                    file.write('ED\t%s\n' % (' '.join(v)))

                file.write('T\t%s\n\n' % (target[i]))

            except Exception:
                # If anything goes wrong for this sentence (tagging, alignment,
                # or indexing), skip it and move on to the next one.
                pass
Example #27
def token_and_stem(language="english"):
    stemmer = SnowballStemmer(language)
    tokens = tokenizer()

    def stemmed_tokens(description, row):
        return [stemmer.stem(token) for token in tokens.tokenize(description)]

    return stemmed_tokens
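
A usage sketch, assuming tokenizer here is a tokenizer class with a .tokenize() method (for instance nltk.tokenize.TreebankWordTokenizer) and SnowballStemmer comes from nltk.stem.snowball:

stem = token_and_stem('english')
stem('running quickly through the fields', row=None)
# -> ['run', 'quick', 'through', 'the', 'field']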
Example #28
    def sent2seq(self, sentence):
        sentence = sentence.lower()
        tokens = tokenizer(sentence)
        seq = [self.word_dict[token] if token in self.word_dict else 1
               for token in tokens]
        seq = [w if w < self.options['n_words_src'] else 1 for w in seq]
        return seq
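
A toy illustration of the id mapping (hypothetical vocabulary; id 1 stands in for out-of-vocabulary words, and tokenizer is assumed to do plain word tokenization):

# With self.word_dict = {'the': 2, 'cat': 3, 'sat': 7} and
# self.options['n_words_src'] = 5:
# sent2seq('The cat sat') -> [2, 3, 1]   ('sat' maps to 7 >= 5, so it is clamped to 1)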