Example #1
def get_smooth_path(summ_sent, article_sent):
    summ_sent = ['<s>'] + summ_sent + ['</s>']
    article_sent = ['<s>'] + article_sent + ['</s>']

    matches = []
    article_indices = []
    summ_token_to_indices = util.create_token_to_indices(summ_sent)
    article_token_to_indices = util.create_token_to_indices(article_sent)
    for key in list(article_token_to_indices.keys()):
        if (util.is_punctuation(key) and not util.is_quotation_mark(key)):
            del article_token_to_indices[key]
    for token in list(summ_token_to_indices.keys()):
        if token in article_token_to_indices:
            article_indices.extend(article_token_to_indices[token])
            matches.extend([token] * len(summ_token_to_indices[token]))
    article_indices = sorted(article_indices)

    # Add a single word or a pair of words if they are in between two highlighted content words
    new_article_indices = []
    new_article_indices.append(0)
    for article_idx in article_indices[1:]:
        word = article_sent[article_idx]
        prev_highlighted_word = article_sent[new_article_indices[-1]]
        if article_idx - new_article_indices[-1] <= 3 \
                and ((util.is_content_word(word) and util.is_content_word(prev_highlighted_word)) \
                or (len(new_article_indices) >= 2 and util.is_content_word(word) and util.is_content_word(article_sent[new_article_indices[-2]]))):
            in_between_indices = list(
                range(new_article_indices[-1] + 1, article_idx))
            are_not_punctuation = [
                not util.is_punctuation(article_sent[in_between_idx])
                for in_between_idx in in_between_indices
            ]
            if all(are_not_punctuation):
                new_article_indices.extend(in_between_indices)
        new_article_indices.append(article_idx)
    new_article_indices = new_article_indices[1:-1]  # remove <s> and </s> from list

    # Remove isolated stopwords
    new_new_article_indices = []
    for idx, article_idx in enumerate(new_article_indices):
        keep = (not util.is_stopword_punctuation(article_sent[article_idx])
                or (idx > 0 and new_article_indices[idx - 1] == article_idx - 1)
                or (idx < len(new_article_indices) - 1
                    and new_article_indices[idx + 1] == article_idx + 1))
        if keep:
            new_new_article_indices.append(article_idx)
    # fix indexing since we don't count <s> and </s>
    new_new_article_indices = [idx - 1 for idx in new_new_article_indices]
    return new_new_article_indices

def get_first_nonpronoun_mention_less_than_5_words(mentions):
    for m in mentions:
        tokens = m['text'].split(' ')
        contains_punctuation = any(util.is_punctuation(token) for token in tokens)
        if (m['type'] != 'PRONOMINAL'
                and int(m['endIndex']) - int(m['startIndex']) <= 5
                and not contains_punctuation):
            return m
    return None

def removable_words(words):
    return [
        i for i, w in enumerate(words)
        if len(w) > 0 and not (is_punctuation(w) or is_number(w))
    ]
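All three helpers above lean on a project-local util module that is not shown here. A minimal stand-in (save it as util.py and add import util above the snippet) is sketched below; the stopword set and the punctuation tests are illustrative assumptions, not the project's actual implementation.

# util.py -- hypothetical stand-in for the project's util module
import string
from collections import defaultdict

STOPWORDS = {'the', 'a', 'an', 'and', 'of', 'to', 'in'}  # illustrative subset

def create_token_to_indices(tokens):
    # Map each token to the list of positions where it occurs.
    token_to_indices = defaultdict(list)
    for idx, token in enumerate(tokens):
        token_to_indices[token].append(idx)
    return token_to_indices

def is_punctuation(token):
    return all(ch in string.punctuation for ch in token)

def is_quotation_mark(token):
    return token in ('"', "'", '``', "''")

def is_stopword(token):
    return token.lower() in STOPWORDS

def is_stopword_punctuation(token):
    return is_stopword(token) or is_punctuation(token)

def is_content_word(token):
    return not is_stopword_punctuation(token)

With that stand-in in place, a small usage example:

summ = ['the', 'dog', 'barked', 'loudly']
article = ['yesterday', 'the', 'old', 'dog', 'barked', 'very', 'loudly', '.']
print(get_smooth_path(summ, article))
# with the stand-in helpers above, prints [1, 2, 3, 4, 5, 6],
# i.e. the span "the old dog barked very loudly"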
Example #5
 # distance of top N words from top location
 # features 50-53
 row += [abs(x-p.locations[0]) for x in p.locations[1:]]
 # fraction of top N words predicted to be at this location
 try: f = sum(loc==p.locations[0] for loc in p.locations) / float(len(p.locations))
 except ZeroDivisionError: f = 0
 try: f2 = float(p.location)/nwords_in_sentence
 except ZeroDivisionError: f2 = 0
 # features 54-58
 row += [f, nwords_in_sentence, nwords_in_sentence-p.location, 
         p.location, f2, (10.**p.p_anywhere[0])/nwords_in_sentence]
 # predicted word is unknown / POS tag
 # feature 59
 row.append(p.word in unk)
 # amount of punctuation in the sentence
 npunct = sum(is_punctuation(w) for i, w in enumerate(g) if i != gi)
 # amount of punctuation around the predicted word
 npunct_around_word = sum(is_punctuation(w) for w in g[max(0,i-5):i+5])
 ncommas = sum(w==',' for w in g)
 # feature 60-61
 row += [npunct, npunct_around_word, ncommas, ncommas % 2]
 # features from parse trees
 t = Tree.fromstring(parse)
 rt = Tree.fromstring(rparse)
 row += [t.height(), rt.height(), t.height()-rt.height()]
 # probability of sentence based on parse Tree productions
 tprod = t.productions()
 tp = sentence_prob(tprod, sngp)
 rtprod = rt.productions()
 rtp = sentence_prob(rtprod, sngp)
 rtprod = set(map(str, rtprod))
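The Tree calls above are consistent with NLTK's nltk.Tree, whose fromstring, height, and productions methods match this usage. A self-contained illustration (the bracketed parse string is made up for the example):

from nltk import Tree

parse = '(S (NP (DT the) (NN dog)) (VP (VBD barked)))'
t = Tree.fromstring(parse)
print(t.height())       # 4: root, phrases, preterminals, leaves
print(t.productions())  # [S -> NP VP, NP -> DT NN, DT -> 'the', ...]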
Example #6
 def is_valid(token: Token) -> bool:
     return not is_stopword(token) and not is_punctuation(token)
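The Token type hint suggests a spaCy token; if so, the two predicates plausibly wrap spaCy's built-in flags. A hedged sketch, with hypothetical stand-ins for the is_stopword and is_punctuation imports:

import spacy
from spacy.tokens import Token

# Hypothetical helper definitions (assumptions, not the snippet's source):
def is_stopword(token: Token) -> bool:
    return token.is_stop

def is_punctuation(token: Token) -> bool:
    return token.is_punct

def is_valid(token: Token) -> bool:
    return not is_stopword(token) and not is_punctuation(token)

nlp = spacy.load('en_core_web_sm')
doc = nlp('The cat sat on the mat.')
print([t.text for t in doc if is_valid(t)])  # ['cat', 'sat', 'mat']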
Example #7
 # fraction of top N words predicted to be at this location
 try:
     f = sum(loc == p.locations[0]
             for loc in p.locations) / float(len(p.locations))
 except ZeroDivisionError:
     f = 0
 try:
     f2 = float(p.location) / nwords_in_sentence
 except ZeroDivisionError:
     f2 = 0
 # features 54-58
 row += [
     f, nwords_in_sentence, nwords_in_sentence - p.location, p.location,
     f2, (10.**p.p_anywhere[0]) / nwords_in_sentence
 ]
 # predicted word is unknown / POS tag
 # feature 59
 row.append(p.word in unk)
 # amount of punctuation in the sentence
 npunct = sum(is_punctuation(w) for i, w in enumerate(g) if i != gi)
 # amount of punctuation around the predicted word
 npunct_around_word = sum(
     is_punctuation(w) for w in g[max(0, i - 5):i + 5])
 ncommas = sum(w == ',' for w in g)
 # feature 60-61
 row += [npunct, npunct_around_word, ncommas, ncommas % 2]
 # features from parse trees
 t = Tree.fromstring(parse)
 rt = Tree.fromstring(rparse)
 row += [t.height(), rt.height(), t.height() - rt.height()]
 # probability of sentence based on parse Tree productions
 tprod = t.productions()
 tp = sentence_prob(tprod, sngp)
 rtprod = rt.productions()
 rtp = sentence_prob(rtprod, sngp)
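sentence_prob and sngp are not shown in any of these snippets; read as a PCFG-style score with sngp mapping production strings to log probabilities, the call would reduce to a sum over the tree's productions. A hypothetical reconstruction, not the original code:

def sentence_prob(productions, sngp, unseen_logprob=-10.0):
    # Hypothetical: sum log10 probabilities of the tree's productions,
    # backing off to a floor value for productions absent from sngp.
    return sum(sngp.get(str(prod), unseen_logprob) for prod in productions)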