Example #1
    def prepare_result(cls, paper):
        """Prepare the raw text result for featurization

        Arguments:
            paper {dict} -- A dictionary with the paper fields:
                            'title', 'abstract', 'authors', 'venue', 'year',
                            'n_citations', 'n_key_citations'. Missing fields
                            fall back to sensible defaults.
        Returns:
            out {dict} -- A dictionary where the paper fields have been pre-processed.
        """
        out = {'paper_year': paper.get('year', np.nan)}
        out['n_citations'] = paper.get('n_citations', 0)
        # if n_key_citations isn't available, get a quick estimate of it from n_citations
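        # e.g. a paper with 100 citations gets an estimate of int(-1.4 + np.log1p(100)) = 3 key citations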
        out['n_key_citations'] = paper.get(
            'n_key_citations', int(-1.4 + np.log1p(out['n_citations'])))
        if out['n_key_citations'] < 0:
            out['n_key_citations'] = 0
        out['paper_title_cleaned'] = fix_text(paper.get('title', ''))
        out['paper_abstract_cleaned'] = fix_text(paper.get('abstract', ''))
        out['paper_venue_cleaned'] = fix_text(paper.get('venue', ''))
        out['author_name'] = [
            fix_author_text(i) for i in paper.get('authors', [])
        ]
        return out
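
A minimal usage sketch of prepare_result, assuming the method above lives on a ranker class (Ranker is a placeholder name) and using a purely illustrative paper dict:

# hypothetical input record; any missing field falls back to the defaults in prepare_result
raw_paper = {
    'title': 'A Hypothetical Study of Query Matching',
    'abstract': 'We examine how free-text queries match candidate papers ...',
    'authors': ['Jane Doe', 'John Smith'],
    'venue': 'Example Conference on Information Retrieval',
    'year': 2019,
    'n_citations': 100,
}
cleaned = Ranker.prepare_result(raw_paper)
# 'n_key_citations' was absent, so it is estimated as int(-1.4 + np.log1p(100)) = 3
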
Example #2
def make_features(query, result_paper, max_q_len=128, max_field_len=1024):
    # the three language models were originally passed in as a third argument `lms`;
    # here they are read from the module-level `glms` instead
    # the language models are scored with beginning- and end-of-sentence tokens disabled
    lm_tiab, lm_auth, lm_venu = glms
    lm_dict = {
        'title_abstract': lambda s: lm_tiab.score(s, eos=False, bos=False),
        'author': lambda s: lm_auth.score(s, eos=False, bos=False),
        'venue': lambda s: lm_venu.score(s, eos=False, bos=False)
    }
    # apply the appropriate language model for the given field
    def lm_score(s, which_lm='title'):
       if 'title' in which_lm or 'abstract' in which_lm: 
           return lm_dict['title_abstract'](s)
       elif 'venue' in which_lm: 
           return lm_dict['venue'](s)
       elif 'author' in which_lm: 
           return lm_dict['author'](s)
       elif 'max' in which_lm:
           return np.max([lm_dict['title_abstract'](s), lm_dict['venue'](s), lm_dict['author'](s)])

    try:
       year = int(result_paper['paper_year'])
       year = np.minimum(now.year, year) # papers can't be from the future.
    except (KeyError, TypeError, ValueError):  # missing or non-numeric year
       year = np.nan

    if result_paper['author_name'] is None:
       authors = []
    else:
       authors = result_paper['author_name']

    # fix the text and separate out quoted and unquoted
    query = str(query)
    q = fix_text(query)[:max_q_len]
    q_quoted = [i for i in extract_from_between_quotations(q) if len(i) > 0]
    q_split_on_quotes = [i.strip() for i in q.split('"') if len(i.strip()) > 0]
    q_unquoted = [i.strip() for i in q_split_on_quotes if i not in q_quoted and len(i.strip()) > 0] 

    q_unquoted_split_set = set(' '.join(q_unquoted).split())
    q_quoted_split_set = set(' '.join(q_quoted).split())
    q_split_set = q_unquoted_split_set | q_quoted_split_set
    q_split_set -= STOPWORDS
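    # e.g. q = 'deep learning "sentiment analysis" 2019' gives roughly
    # q_quoted = ['sentiment analysis'], q_unquoted = ['deep learning', '2019'],
    # and q_split_set = {'deep', 'learning', 'sentiment', 'analysis', '2019'} minus stopwords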

    # we will find out how much of a match we have *across* fields
    unquoted_matched_across_fields = []
    quoted_matched_across_fields = []

    # overall features for the paper and query
    q_quoted_len = np.sum([len(i) for i in q_quoted])  # total length of quoted snippets
    q_unquoted_len = np.sum([len(i) for i in q_unquoted])   # total length of non-quoted snippets
    q_len = q_unquoted_len + q_quoted_len  

    # if there's no query left at this point, we return NaNs
    # which the model natively supports
    if q_len == 0:
       return [np.nan] * len(FEATURE_NAMES)

    # testing whether a year is somewhere in the query and making year-based features
    if re.search(r'\d{4}', q):  # if a year appears in the query, the feature is whether the paper's year appears in the query
       year_feat = (str(year) in q_split_set)
    else:  # if year isn't in the query, we don't care about matching
       year_feat = np.nan

    feats = [
       result_paper['paper_abstract_cleaned'] is not None and len(result_paper['paper_abstract_cleaned']) > 1,
       year_feat,  # whether the year appears anywhere in the (split) query
    ]

    # if year is matched, add it to the matched_across_all_fields but remove from query
    # so it doesn't get matched in author/title/venue/abstract later
    if np.any([str(year) in i for i in q_quoted]):
       quoted_matched_across_fields.append(str(year))
    if np.any([str(year) in i for i in q_unquoted]):
       unquoted_matched_across_fields.append(str(year))

    # if year is matched, we don't need to match it again, so removing
    if year_feat is True and len(q_split_set) > 1: 
       q_split_set.remove(str(year))

    # later we will filter some features based on nonsensical unigrams in the query
    # this is the log probability lower-bound for sensible unigrams
    log_prob_nonsense = lm_score('qwertyuiop', 'max')

    # features title, abstract, venue
    title_and_venue_matches = set()
    title_and_abstract_matches = set()
    for field in ['paper_title_cleaned', 'paper_abstract_cleaned', 'paper_venue_cleaned']:
       if result_paper[field] is not None:
           text = result_paper[field][:max_field_len]
       else:
           text = ''
       text_len = len(text)

       # unquoted matches
       unquoted_match_spans, unquoted_match_text, unquoted_longest_starting_ngram = find_query_ngrams_in_text(q_unquoted, text, quotes=False)
       unquoted_matched_across_fields.extend(unquoted_match_text)
       unquoted_match_len = len(unquoted_match_spans)

       # quoted matches
       quoted_match_spans, quoted_match_text, quoted_longest_starting_ngram = find_query_ngrams_in_text(q_quoted, text, quotes=True)
       quoted_matched_across_fields.extend(quoted_match_text)
       quoted_match_len = len(quoted_match_text)

       # now we (a) combine the quoted and unquoted results
       match_spans = unquoted_match_spans + quoted_match_spans
       match_text = unquoted_match_text + quoted_match_text

       # and (b) take the set of the results
       # while excluding sub-ngrams if longer ngrams are found
       # e.g. if we already have 'sentiment analysis', then 'sentiment' is excluded
       match_spans_set = []
       match_text_set = []
       for t, s in sorted(zip(match_text, match_spans), key=lambda pair: len(pair[0]))[::-1]:
           if not any(t in i for i in match_text_set):
               match_spans_set.append(s)
               match_text_set.append(t)

       # filter out venue matches whose words all already matched in title/abstract and which cover too little of the venue text
       if 'venue' in field:
           text_unigram_len = len(text.split(' '))
           match_spans_set_filtered = []
           match_text_set_filtered = []
           for sp, tx in zip(match_spans_set, match_text_set):
               tx_unigrams = set(tx.split(' '))
               # already matched all of these unigrams in title or abstract
               condition_1 = (tx_unigrams.intersection(title_and_abstract_matches) == tx_unigrams)
               # and matched too little of the venue text
               condition_2 = len(tx_unigrams) / text_unigram_len <= 2/3
               if not (condition_1 and condition_2):
                   match_spans_set_filtered.append(sp)
                   match_text_set_filtered.append(tx)

           match_spans_set = match_spans_set_filtered
           match_text_set = match_text_set_filtered

       # match_text_set but unigrams
       matched_text_unigrams = set()
       for i in match_text_set:
           i_split = i.split()
           matched_text_unigrams.update(i_split)
           if 'title' in field or 'venue' in field:
               title_and_venue_matches.update(i_split)
           if 'title' in field or 'abstract' in field:
               title_and_abstract_matches.update(i_split)

       if len(match_text_set) > 0 and text_len > 0:  # if any matches and the text has any length
           # log probabilities of the scores
           if 'venue' in field:
               lm_probs = [lm_score(match, 'venue') for match in match_text_set]
           else:
               lm_probs = [lm_score(match, 'max') for match in match_text_set]

           # match character lengths
           match_lens = [len(i) for i in match_text_set]

           # match word lens
           match_word_lens = [len(i.split()) for i in match_text_set]

           # we have one feature that takes into account repetition of matches
           match_text_counter = Counter(match_text)
           match_spans_len_normed = np.log1p(list(match_text_counter.values())).sum()
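           # e.g. match_text = ['bert', 'bert', 'language'] gives counts [2, 1],
           # so this feature is log1p(2) + log1p(1) ≈ 1.79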

           # remove stopwords from unigrams
           matched_text_unigrams -= STOPWORDS

           feats.extend([
               len(q_split_set.intersection(matched_text_unigrams)) / np.maximum(len(q_split_set), 1),  # total fraction of the query that was matched in text
               np.nanmean(lm_probs),  # average log-prob of the matches
               np.nansum(np.array(lm_probs) * np.array(match_word_lens)),  # sum of log-prob of matches times word-lengths
           ])
       else:
           # if we have no matches, then the features are deterministically 0
           feats.extend([0, 0, 0])


    # features for author field only
    # note: we aren't using citation info
    # because we don't know which author we are matching
    # in the case of multiple authors with the same name
    q_auth = fix_author_text(query)[:max_q_len]
    q_quoted_auth = extract_from_between_quotations(q_auth)
    q_split_on_quotes = [i.strip() for i in q_auth.split('"') if len(i.strip()) > 0]
    q_unquoted_auth = [i for i in q_split_on_quotes if i not in q_quoted_auth]
    # remove any unigrams that we already matched in title or venue
    # but not abstract since citations are included there
    # note: not sure if this makes sense for quoted text, but keeping it for now
    q_quoted_auth = [remove_unigrams(i, title_and_venue_matches) for i in q_quoted_auth]
    q_unquoted_auth = [remove_unigrams(i, title_and_venue_matches) for i in q_unquoted_auth]

    unquoted_match_lens = []  # normalized author matches
    quoted_match_lens = []  # quoted author matches
    match_fracs = []
    for paper_author in authors:
       len_author = len(paper_author)
       if len_author > 0:
            # higher weight for the last name
           paper_author_weights = np.ones(len_author)
           len_last_name = len(paper_author.split(' ')[-1])
           paper_author_weights[-len_last_name:] *= 10  # last name is ten times more important to match
           paper_author_weights /= paper_author_weights.sum()
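           # e.g. for 'jane doe' the weights are [1, 1, 1, 1, 1, 10, 10, 10] / 35,
           # so matching 'doe' contributes far more than matching 'jane'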

           # match the unquoted and quoted query chunks against this author's name
           for quotes_flag, q_loop in zip([False, True], [q_unquoted_auth, q_quoted_auth]):
               matched_spans, match_text, _ = find_query_ngrams_in_text(
                   q_loop, 
                   paper_author, 
                   quotes=quotes_flag, 
                   len_filter=0,
                   remove_stopwords=True,  # only removes entire matches that are stopwords. too bad for people named 'the' or 'less'
                   use_word_boundaries=False  
               )
               if len(matched_spans) > 0:
                   matched_text_joined = ' '.join(match_text)
                   # edge case: single character matches are not good
                   if len(matched_text_joined) == 1:
                       matched_text_joined = ''
                   weight = np.sum([paper_author_weights[i:j].sum() for i, j in matched_spans]) 
                   match_frac = np.minimum((len(matched_text_joined) / q_len), 1)
                   match_fracs.append(match_frac)
                   if quotes_flag:
                       quoted_match_lens.append(match_frac * weight)
                       quoted_matched_across_fields.append(matched_text_joined)
                   else:
                       unquoted_match_lens.append(match_frac * weight)
                       unquoted_matched_across_fields.append(matched_text_joined)
               else:
                   if quotes_flag:
                       quoted_match_lens.append(0)
                   else:
                       unquoted_match_lens.append(0)

    # since we ran this separately (per author) for quoted and unquoted, take the elementwise max to avoid double counting
    match_lens_max = np.maximum(unquoted_match_lens, quoted_match_lens)
    nonzero_inds = np.flatnonzero(match_lens_max)
    # the closest index to the ends of author lists
    if len(nonzero_inds) == 0:
       author_ind_feature = np.nan
    else:
       author_ind_feature = np.minimum(nonzero_inds[0], len(authors) - 1 - nonzero_inds[-1]) 
    feats.extend([
       np.nansum(match_lens_max),  # total amount of (weighted) matched authors 
       nanwrapper(np.nanmax, match_lens_max),  # largest (weighted) author match
       author_ind_feature,  # penalizing matches that are far away from ends of author list
    ])

    # oldness and citations 
    feats.extend([
       now.year - year,  # oldness (could be nan if year is missing)
       result_paper['n_citations'],  # no need for log due to decision trees
       result_paper['n_key_citations'],
       np.nan if np.isnan(year) else result_paper['n_citations'] / (now.year - year + 1)  # citations per year since publication
    ])

    # special features for how much of the unquoted query was matched/unmatched across all fields
    q_unquoted_split_set -= STOPWORDS
    if len(q_unquoted_split_set) > 0:
       matched_split_set = set()
       for i in unquoted_matched_across_fields:
           matched_split_set.update(i.split())
       # making sure stopwords aren't an issue
       matched_split_set -= STOPWORDS
       # fraction of the unquery matched
       numerator = len(q_unquoted_split_set.intersection(matched_split_set))
       feats.append(numerator / np.maximum(len(q_unquoted_split_set), 1))
       # the log-prob of the unmatched unquotes
       unmatched_unquoted = q_unquoted_split_set - matched_split_set
       log_probs_unmatched_unquoted = [lm_score(i, 'max') for i in unmatched_unquoted]
       feats.append(np.nansum([i for i in log_probs_unmatched_unquoted if i > log_prob_nonsense]))
    else:
       feats.extend([np.nan, np.nan])

    # special features for how much of the quoted query was matched/unmatched across all fields
    if len(q_quoted) > 0:
       numerator = len(set(' '.join(quoted_matched_across_fields).split()))
       feats.append(numerator / np.maximum(len(q_quoted_split_set), 1))
       # the log-prob of the unmatched quotes
       unmatched_quoted = set(q_quoted) - set(quoted_matched_across_fields)
       feats.append(np.nansum([lm_score(i, 'max') for i in unmatched_quoted]))
    else:
       feats.extend([np.nan, np.nan])

    return feats
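
A sketch of how the two examples above might fit together, assuming the module-level pieces (glms, FEATURE_NAMES, the text helpers) are loaded as in the surrounding code; Ranker and raw_paper are the same placeholders as in the sketch after Example #1:

cleaned = Ranker.prepare_result(raw_paper)
feats = make_features('doe "query matching" 2019', cleaned)
# feats is positionally aligned with FEATURE_NAMES and can be passed to the ranking model,
# e.g. inspected as dict(zip(FEATURE_NAMES, feats))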