def colorize_alterations(tokens):
  """Rejoin tokens into a string, highlighting altered tokens in cyan.

  A token that has an 'altered' key (whatever its value) is swapped for a
  minimal token holding only its 'before' field and its 'originalText'
  wrapped by colored(..., 'cyan'). All other tokens are passed through
  untouched. The resulting list is handed to corenlp.rejoin.
  """
  def _highlight(tok):
    # Presence of the key is the marker; its value is never inspected.
    if 'altered' not in tok:
      return tok
    return {'originalText': colored(tok['originalText'], 'cyan'),
            'before': tok['before']}

  return corenlp.rejoin([_highlight(tok) for tok in tokens])
Ejemplo n.º 2
0
 def _get_tokens_for_answers(self, answer_objs, corenlp_obj):
   """Get CoreNLP tokens corresponding to a SQuAD answer object."""
   first_a_toks = None
   for i, a_obj in enumerate(answer_objs):
     a_toks = []
     answer_start = a_obj['answer_start']
     answer_end = answer_start + len(a_obj['text'])
     for s in corenlp_obj['sentences']:
       for t in s['tokens']:
         if t['characterOffsetBegin']  >= answer_end: continue
         if t['characterOffsetEnd'] <= answer_start: continue
         a_toks.append(t)
     if corenlp.rejoin(a_toks).strip() == a_obj['text']:
       # Make sure that the tokens reconstruct the answer
       return i, a_toks
     if i == 0: first_a_toks = a_toks
   # None of the extracted token lists reconstruct the answer
   # Default to the first
   return 0, first_a_toks
Ejemplo n.º 3
0
def ans_date(a, tokens, q, **kwargs):
  """Produce a perturbed version of a date answer.

  Args:
    a: Answer object; only a['text'] is read, to reject a no-op result.
    tokens: Token dicts for the answer. Each is read for 'ner', 'pos',
      'word', 'before' and 'originalText'.
    q: Unused here.
    **kwargs: Unused here.

  Returns:
    The rejoined, stripped replacement string, or None when any token is
    not tagged 'DATE' or when the replacement equals a['text'].
  """
  if not all(t['ner'] == 'DATE' for t in tokens):
    return None
  out_toks = []
  for t in tokens:
    word = t['word']
    if t['pos'] == 'CD' or word.isdigit():
      # int() on a string can only fail with ValueError (e.g. "1990s");
      # catching just that keeps KeyboardInterrupt/SystemExit propagating.
      try:
        value = int(word)
      except ValueError:
        value = 10  # fallback
      if value > 50:
        # Treated as a year: shift back by 25.
        new_val = str(value - 25)
      elif value > 15:
        # Treated as a day of month: shift by 11 in whichever direction
        # keeps the result positive.
        new_val = str(value - 11)
      else:
        new_val = str(value + 11)
    else:
      word_lower = word.lower()
      # NOTE(review): assumes MONTHS is a 12-item sequence of lowercase
      # month names defined elsewhere in this module - confirm.
      if word_lower in MONTHS:
        m_ind = MONTHS.index(word_lower)
        new_val = MONTHS[(m_ind + 6) % 12].title()
      else:
        # Give up and keep the token's surface text.
        new_val = t['originalText']
    out_toks.append({'before': t['before'], 'originalText': new_val})
  new_ans = corenlp.rejoin(out_toks).strip()
  if new_ans == a['text']:
    return None
  return new_ans
Ejemplo n.º 4
0
  def alter_question(self, q, tokens, const_parse,
                     strategy='separate'):
    """Alter the question to make it ask something else.

    Possible strategies:
      - separate: Do best alteration for each word separately.
      - best: Generate exactly one best alteration (may over-alter).
      - high-conf: Do all possible high-confidence alterations
      - high-conf-separate: Do best high-confidence alteration for each word separately.
      - all: Do all possible alterations (very conservative)

    NOTE: 'best' matches neither the '*separate' branch nor the
    'high-conf'/'all' branch below, so it currently yields an empty list.

    Args:
      q: Original question string; used only to drop an unchanged result
        in the 'high-conf'/'all' strategies.
      tokens: List of token dicts for the question (each read for 'word').
      const_parse: Parse object passed to
        corenlp.ConstituencyParse.replace_words together with the new words.
      strategy: One of the strategy names above.

    Returns:
      A list of (new_question, new_tokens, new_const_parse, tag) tuples.
      For '*separate' strategies the tag is '<rule>-<token index>-<new word>'
      and there is at most one entry per (token, rule) pair. For
      'high-conf'/'all' there is at most one entry, tagged with the
      strategy name.
    """
    # A set gives constant-time membership tests inside the nested loops.
    used_words = {t['word'].lower() for t in tokens}
    per_word = strategy.endswith('separate')
    combined = strategy in ('high-conf', 'all')
    if strategy.startswith('high-conf'):
      rules = HIGH_CONF_ALTER_RULES
    else:
      rules = ALL_ALTER_RULES
    new_qs = []
    toks_all = []  # Combined strategies: one (possibly altered) token per input token.
    for i, t in enumerate(tokens):
      if t['word'].lower() in DO_NOT_ALTER:
        if combined: toks_all.append(t)
        continue
      found = False
      for rule_name, rule in rules.items():
        new_words = rule(t, nearby_word_dict=self.nearby_word_dict,
                         postag_dict=self.postag_dict)
        if not new_words: continue
        for nw in new_words:
          nw_lower = nw.lower()
          if nw_lower in used_words: continue
          if nw_lower in BAD_ALTERATIONS: continue
          # Match capitalization
          if t['word'] == t['word'].upper():
            nw = nw.upper()
          elif t['word'] == t['word'].title():
            nw = nw.title()
          new_tok = dict(t)
          new_tok['word'] = new_tok['lemma'] = new_tok['originalText'] = nw
          new_tok['altered'] = True
          # NOTE: obviously this is approximate
          if per_word:
            new_tokens = tokens[:i] + [new_tok] + tokens[i+1:]
            new_q = corenlp.rejoin(new_tokens)
            tag = '%s-%d-%s' % (rule_name, i, nw)
            new_const_parse = corenlp.ConstituencyParse.replace_words(
                const_parse, [tok['word'] for tok in new_tokens])
            new_qs.append((new_q, new_tokens, new_const_parse, tag))
            # First acceptable candidate wins for this rule; later rules
            # still get a chance on the same token.
            break
          elif combined:
            toks_all.append(new_tok)
            found = True
            break
        # In combined mode the first successful rule settles this token.
        if found: break
      if combined and not found:
        toks_all.append(t)
    if combined:
      new_q = corenlp.rejoin(toks_all)
      if new_q != q:
        # Build the parse only when the altered question is actually kept.
        new_const_parse = corenlp.ConstituencyParse.replace_words(
            const_parse, [tok['word'] for tok in toks_all])
        new_qs.append((new_q, toks_all, new_const_parse, strategy))
    return new_qs
Ejemplo n.º 5
0
def ans_number(a, tokens, q, **kwargs):
  """Produce a perturbed version of a numeric answer.

  Each token is read for 'ner', 'pos', 'word', 'before', 'originalText'
  and, as a fallback when the word itself does not parse as a number,
  'normalizedNER'. Tokens that yield a number get a different number as
  their text; everything else keeps its original text.

  Args:
    a: Unused here.
    tokens: Token dicts for the answer.
    q: Unused here.
    **kwargs: Unused here.

  Returns:
    The rejoined, stripped replacement string, or None if no token was
    treated as a number.
  """
  out_toks = []
  seen_num = False
  for t in tokens:
    ner = t['ner']
    pos = t['pos']
    out_tok = {'before': t['before']}

    # Split on the first dash: the head is perturbed, the tail re-attached.
    w, _, leftover = t['word'].partition('-')

    # Try to get a number out
    value = None
    if w != '%':
      # Percent sign should just pass through
      try:
        value = float(w.replace(',', ''))
      except ValueError:
        # Best effort: a missing key, a None or empty value, or text that
        # still does not parse all mean "no number here".
        try:
          norm_ner = t['normalizedNER']
          if norm_ner[0] in ('%', '>', '<'):
            norm_ner = norm_ner[1:]
          value = float(norm_ner)
        except (KeyError, IndexError, TypeError, ValueError):
          pass
    # A parsed 0 is falsy, so it is handled like "nothing parsed" here.
    if not value and (
        ner == 'NUMBER' or
        (ner == 'PERCENT' and pos == 'CD')):
      # Force this to be a number anyways
      value = 10
    if value:
      if math.isinf(value) or math.isnan(value): value = 9001
      seen_num = True
      if w in ('thousand', 'million', 'billion', 'trillion'):
        new_val = 'million' if w == 'thousand' else 'thousand'
      elif 1000 < value < 2500:
        shifted = value - 75
        # Print whole numbers without a trailing ".0", in line with the
        # '%d' formatting used by the leading-digit branch below.
        if shifted == int(shifted):
          new_val = '%d' % shifted
        else:
          new_val = str(shifted)
      else:
        # Change leading digit
        if value == int(value):
          val_chars = list('%d' % value)
        else:
          val_chars = list('%g' % value)
        for idx, ch in enumerate(val_chars):
          if '0' <= ch <= '9':
            # Add 5 mod 10, but never produce a 0.
            val_chars[idx] = str(max((int(ch) + 5) % 10, 1))
            break
        new_val = ''.join(val_chars)
      if leftover:
        new_val = '%s-%s' % (new_val, leftover)
      out_tok['originalText'] = new_val
    else:
      out_tok['originalText'] = t['originalText']
    out_toks.append(out_tok)
  if seen_num:
    return corenlp.rejoin(out_toks).strip()
  else:
    return None