def get_smooth_path(summ_sent, article_sent):
    summ_sent = ['<s>'] + summ_sent + ['</s>']
    article_sent = ['<s>'] + article_sent + ['</s>']
    matches = []
    article_indices = []
    summ_token_to_indices = util.create_token_to_indices(summ_sent)
    article_token_to_indices = util.create_token_to_indices(article_sent)
    # Drop punctuation tokens (except quotation marks) from the article index
    for key in list(article_token_to_indices.keys()):
        if util.is_punctuation(key) and not util.is_quotation_mark(key):
            del article_token_to_indices[key]
    # Collect the article positions of every token shared with the summary
    for token in list(summ_token_to_indices.keys()):
        if token in article_token_to_indices:
            article_indices.extend(article_token_to_indices[token])
            matches.extend([token] * len(summ_token_to_indices[token]))
    article_indices = sorted(article_indices)

    # Add a single word or a pair of words if they are in between two
    # highlighted content words
    new_article_indices = [0]
    for article_idx in article_indices[1:]:
        word = article_sent[article_idx]
        prev_highlighted_word = article_sent[new_article_indices[-1]]
        if article_idx - new_article_indices[-1] <= 3 \
                and ((util.is_content_word(word)
                      and util.is_content_word(prev_highlighted_word))
                     or (len(new_article_indices) >= 2
                         and util.is_content_word(word)
                         and util.is_content_word(
                             article_sent[new_article_indices[-2]]))):
            in_between_indices = list(
                range(new_article_indices[-1] + 1, article_idx))
            are_not_punctuation = [
                not util.is_punctuation(article_sent[in_between_idx])
                for in_between_idx in in_between_indices
            ]
            if all(are_not_punctuation):
                new_article_indices.extend(in_between_indices)
        new_article_indices.append(article_idx)
    new_article_indices = new_article_indices[1:-1]  # remove <s> and </s> from list

    # Remove isolated stopwords
    new_new_article_indices = []
    for idx, article_idx in enumerate(new_article_indices):
        if (not util.is_stopword_punctuation(article_sent[article_idx])) \
                or (idx > 0 and new_article_indices[idx - 1] == article_idx - 1) \
                or (idx < len(new_article_indices) - 1
                    and new_article_indices[idx + 1] == article_idx + 1):
            new_new_article_indices.append(article_idx)
    new_new_article_indices = [
        idx - 1 for idx in new_new_article_indices
    ]  # fix indexing since we don't count <s> and </s>
    return new_new_article_indices
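# The `util` helpers called above are defined elsewhere in the codebase. Below
# is a minimal sketch of plausible implementations so the function above can be
# exercised; the names match the calls, but the bodies are assumptions, not the
# original code (the stopword list here is a toy subset).
import string
from collections import defaultdict

STOPWORDS = {'a', 'an', 'the', 'of', 'to', 'in', 'and', 'is'}  # toy subset

def create_token_to_indices(sent):
    # Map each token to the sorted list of positions where it occurs.
    token_to_indices = defaultdict(list)
    for idx, token in enumerate(sent):
        token_to_indices[token].append(idx)
    return token_to_indices

def is_punctuation(token):
    return len(token) > 0 and all(ch in string.punctuation for ch in token)

def is_quotation_mark(token):
    return token in {'"', "'", '`', '``', "''"}

def is_stopword(token):
    return token.lower() in STOPWORDS

def is_content_word(token):
    return not is_stopword(token) and not is_punctuation(token)

def is_stopword_punctuation(token):
    return is_stopword(token) or is_punctuation(token)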
def get_first_nonpronoun_mention_less_than_5_words(mentions):
    for m in mentions:
        tokens = m['text'].split(' ')
        contains_punctuation = any(
            util.is_punctuation(token) for token in tokens)
        if m['type'] != 'PRONOMINAL' \
                and int(m['endIndex']) - int(m['startIndex']) <= 5 \
                and not contains_punctuation:
            return m
    return None
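# Usage sketch. The mention dicts follow the shape the function expects
# (Stanford CoreNLP coreference output uses similar 'text'/'type'/
# 'startIndex'/'endIndex' fields); this toy input is illustrative, not real
# pipeline output.
mentions = [
    {'text': 'she', 'type': 'PRONOMINAL', 'startIndex': 1, 'endIndex': 2},
    {'text': 'the prime minister', 'type': 'NOMINAL', 'startIndex': 4, 'endIndex': 7},
]
first = get_first_nonpronoun_mention_less_than_5_words(mentions)
# -> the 'the prime minister' mention: non-pronominal, spans 3 tokens, no punctuation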
def removable_words(words):
    return [
        i for i, w in enumerate(words)
        if len(w) > 0 and not (is_punctuation(w) or is_number(w))
    ]
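# Usage sketch with toy stand-ins for is_punctuation/is_number (assumptions;
# the real helpers are defined elsewhere in the codebase):
def is_punctuation(w):
    return all(not ch.isalnum() for ch in w)

def is_number(w):
    return w.replace('-', '', 1).replace('.', '', 1).isdigit()

print(removable_words(['The', 'cat', ',', '3', 'dogs']))  # -> [0, 1, 4]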
# distance of top N words from top location
# features 50-53
row += [abs(x - p.locations[0]) for x in p.locations[1:]]
# fraction of top N words predicted to be at this location
try:
    f = sum(loc == p.locations[0] for loc in p.locations) / float(len(p.locations))
except ZeroDivisionError:
    f = 0
try:
    f2 = float(p.location) / nwords_in_sentence
except ZeroDivisionError:
    f2 = 0
# features 54-58
row += [f, nwords_in_sentence, nwords_in_sentence - p.location, p.location, f2,
        (10.**p.p_anywhere[0]) / nwords_in_sentence]
# predicted word is unknown / POS tag
# feature 59
row.append(p.word in unk)
# amount of punctuation in the sentence (excluding the predicted word itself)
npunct = sum(is_punctuation(w) for i, w in enumerate(g) if i != gi)
# amount of punctuation in a 10-token window around the predicted word
npunct_around_word = sum(is_punctuation(w) for w in g[max(0, gi - 5):gi + 5])
ncommas = sum(w == ',' for w in g)
# features 60-61
row += [npunct, npunct_around_word, ncommas, ncommas % 2]
# features from parse trees
t = Tree.fromstring(parse)
rt = Tree.fromstring(rparse)
row += [t.height(), rt.height(), t.height() - rt.height()]
# probability of sentence based on parse tree productions
tprod = t.productions()
tp = sentence_prob(tprod, sngp)
rtprod = rt.productions()
rtp = sentence_prob(rtprod, sngp)
rtprod = set(map(str, rtprod))
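# `Tree` is presumably nltk.tree.Tree, and `sentence_prob`/`sngp` are defined
# elsewhere in this codebase. One plausible sketch, assuming `sngp` maps
# production strings to log probabilities estimated from a treebank (an
# assumption, not the original implementation):
from nltk import Tree

def sentence_prob(productions, production_logprobs, oov_logprob=-10.0):
    # Sum per-production log-probabilities; unseen productions get a fixed
    # out-of-vocabulary penalty so one novel rule doesn't zero out the score.
    return sum(production_logprobs.get(str(prod), oov_logprob)
               for prod in productions)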
def is_valid(token: Token) -> bool:
    return not is_stopword(token) and not is_punctuation(token)
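# Usage sketch, assuming spaCy-style tokens. The original Token type and the
# is_stopword/is_punctuation helpers are defined elsewhere; backing them with
# token.is_stop and token.is_punct is an assumption, not the original code.
import spacy
from spacy.tokens import Token

def is_stopword(token: Token) -> bool:
    return token.is_stop

def is_punctuation(token: Token) -> bool:
    return token.is_punct

nlp = spacy.blank('en')
doc = nlp('The quick brown fox .')
print([t.text for t in doc if is_valid(t)])  # -> ['quick', 'brown', 'fox']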