def answer_all(answerer, use_chunk=False):
  with open('data/naive_out.txt', 'w') as fout:
    questions = Loader.questions()
    for qno, question in questions.iteritems():
      docs = CoreNLPLoader(qno)  # Load CoreNLP-annotated docs for this question
      print qno
      answer = answerer(question, docs, use_chunk)
      print answer
      if answer is None:
        fout.write("%d top_docs.%d nil\n" % (qno, qno))
      else:
        # Write at most the top five candidate answers
        for ans in answer[:5]:
          fout.write("%d top_docs.%d %s\n" % (qno, qno, ans))
Example 2
# Standard-library and NLTK imports used by this class; project-specific
# helpers and flags (CoreNLPLoader, naive_filter_sentences,
# naive_filter_sentences_phrases, naive_extract_nps,
# extract_nps_without_determiners, liroth_to_corenlp, liroth_to_wordnet,
# PIPE_DEBUG) are assumed to be defined or imported elsewhere in the project.
import re
import itertools
from nltk.corpus import wordnet as wn

class DocFeatures:
  def __init__(self, qno):
    self.docs = CoreNLPLoader(qno)

  # Select candidate sentences based on question features, limiting the search
  # to the top doc_limit docs, then return a ranked list of candidate answers,
  # each a sequence of words (NE or noun-phrase tokens)
  def filter_sentences(self, question_features, doc_limit=20):
    words = []
    
    # Get sentence indices, filtering by keywords, NE coreferences, and exact NP matches
    indices1 = self.filter_by_keyword_count(question_features, doc_limit)
    if PIPE_DEBUG: print "Indices from Keyword Count\n\t", indices1
    indices2 = self.filter_by_ne_corefs(question_features, doc_limit)
    if PIPE_DEBUG: print "Indices from NE Corefs\n\t", indices2
    indices3 = self.filter_by_exact_np_matches(question_features, doc_limit)
    if PIPE_DEBUG: print "Indices from Exact NP Matches\n\t", indices3
    indices = DocFeatures.union_sort(indices1, indices2)
    indices = DocFeatures.union_sort(indices3, indices)
    indices = [ (x,y,z) for w,x,y,z in indices ]
    if PIPE_DEBUG: print "Indices Combined\n\t", indices
    if PIPE_DEBUG:
      print "Index/Sentence Map"
      for index in indices:
        doc_idx,paragraph_idx,sent_idx = index
        paragraphs = self.docs.load_paras(doc_idx)
        paragraph = paragraphs[paragraph_idx]
        print index, "=>", paragraph.sentences()[sent_idx]
    
    # Attempt to find answer types using NEs and WordNet
    # Order NE answer types before WordNet results
    # But if this is definitely a description question, this is not going to help, so ignore
    if not self.is_description(question_features):
      if PIPE_DEBUG: print "Not a naive description question"
      words = self.filter_by_answer_type(question_features, indices)
      if PIPE_DEBUG: print "Words by answer type\n\t", words
      # words2 = self.filter_by_wordnet(question_features, indices) # Doesn't seem to work well :(
      # if PIPE_DEBUG: print "Words by wordnet type\n\t", words2
      # words = DocFeatures.match_prioritize(words, words2)
      # words = DocFeatures.union_order(words, words2)
      # pprint(words)
    
    # Pad results with NPs from sentences
    # words.extend(self.filter_by_nps(question_features, indices)) # Just extract NPs
    words.extend(self.filter_by_nps_nearby(question_features, indices)) # Extract in order of NPs near NEs
    if PIPE_DEBUG: print "Words with nearby NPs\n\t", words

    return words
  
  # Union the lists i1 and i2, deduplicating on the (doc, para, sent) indices and
  # sorting by keyword count (the first element of each tuple), descending
  @staticmethod
  def union_sort(i1, i2):
    i = list(i1)
    i1hash = dict( [ ((x,y,z),True) for c,x,y,z in i1 ] )
    for c,x,y,z in i2: # Add stuff from i2 if it doesn't appear in i1
      if (x,y,z) not in i1hash:
        i1hash[(x,y,z)] = True
        i.append((c,x,y,z))
    i = sorted(i, key = lambda x: -x[0]) # sort by count descending
    # i = [ (x,y,z) for w,x,y,z in i ] # get rid of counts
    return i
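  # A small illustrative check of union_sort with hypothetical tuples: entries
  # from i2 whose (doc, para, sent) indices already appear in i1 are skipped,
  # and the union is sorted by count, descending.
  #
  #   i1 = [(3, 0, 1, 2), (1, 0, 0, 0)]
  #   i2 = [(2, 1, 0, 5), (9, 0, 1, 2)]   # (0, 1, 2) already covered by i1
  #   DocFeatures.union_sort(i1, i2) == [(3, 0, 1, 2), (2, 1, 0, 5), (1, 0, 0, 0)]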
  
  # Union the lists i1 and i2, ensuring that all elements in i1 come before those in i2
  @staticmethod
  def union_order(i1, i2):
    i = list(i1)
    ihash = dict([(x,True) for x in i1])
    for y in i2:
      if y not in ihash:
        ihash[y] = True
        i.append(y)
    return i
  
  # Return a reordered list of i1, where we prioritize elements of i1 that also appear in i2
  @staticmethod
  def match_prioritize(i1, i2):
    highs, lows = [], []
    ihash = dict([(x,True) for x in i2])
    for x in i1:
      if x in ihash:
        highs.append(x)
      else:
        lows.append(x)
    return highs + lows
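  # Illustrative check with hypothetical values: elements of i1 that also
  # appear in i2 move to the front, preserving i1's relative order.
  #
  #   DocFeatures.match_prioritize(['a', 'b', 'c'], ['c', 'a']) == ['a', 'c', 'b']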
      
  # Tries to identify some very specific description questions
  def is_description(self, question_features):
    pos = question_features['pos']
    # Matches the pattern "WP is/was NN(P) ?" (e.g. "Who was Quetzalcoatl?")
    if len(pos) == 4:
      if pos[0][1] == "WP" and (pos[1][0] == "is" or pos[1][0] == "was") and "NN" in pos[2][1]:
        return True
    return False
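  # Illustrative check with hypothetical POS tags, in the (word, tag) pair
  # format assumed above:
  #
  #   [("Who", "WP"), ("was", "VBD"), ("Quetzalcoatl", "NNP"), ("?", ".")]  -> True
  #   [("Where", "WRB"), ("is", "VBZ"), ("Belize", "NNP"), ("?", ".")]      -> False (WRB, not WP)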
  
  # Filters by exact NP matches of NPs in the question to NPs in a sentence. Weighs these results more heavily,
  # since these are essentially phrase matches rather than individual word matches
  def filter_by_exact_np_matches(self, question_features, doc_limit=20):
    global_matches = []
    parse_tree = question_features['parse_tree']
    nps = extract_nps_without_determiners(parse_tree)
    phrases = [[w[0] for w in np] for np in nps]
    phrase_regexes = []
    for phrase in phrases:
      # Build a whitespace-separated regex for the phrase, escaping any regex
      # metacharacters in the individual words
      pre = re.compile(r"\s+".join(re.escape(w) for w in phrase))
      # Weight longer phrases exponentially more heavily than single-word matches
      phrase_regexes.append((pre, 2 ** (2 * (len(phrase) - 1))))
    for doc_idx in range(0, min(doc_limit,len(self.docs.docs))):
      paragraphs = self.docs.load_paras(doc_idx)
      for para_idx, paragraph in enumerate(paragraphs):
        sentences = paragraph.sentences()
        tokenized_sentences = paragraph.tokenized()
        matches = naive_filter_sentences_phrases(phrase_regexes, sentences, tokenized_sentences)
        matches = [ (count,doc_idx,para_idx,sent_idx) for sent_idx,count in matches ]
        global_matches.extend(matches)        
    return global_matches
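  # For a hypothetical question NP ["Golden", "Gate", "Bridge"], the loop above
  # compiles the pattern "Golden\s+Gate\s+Bridge" with weight 2**(2*2) = 16, so
  # a full three-word phrase match counts far more than a single keyword hit.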
  
  # TODO can match more exactly (ex. match only "The Golden Gate Bridge" vs "Directors of the Golden Gate Bridge District")
  # Filters by matching NEs in question to words in coreference clusters in paragraphs,
  # returning all sentences belonging to each matched cluster
  # also returns keyword counts for each sentence 
  def filter_by_ne_corefs(self, question_features, doc_limit=20):
    nes = question_features['nes']
    keywords = question_features["keywords"]
    global_matches = []
    # Loop through each document
    for doc_idx in range(0, min(doc_limit,len(self.docs.docs))):
      paragraphs = self.docs.load_paras(doc_idx)
      for para_idx, paragraph in enumerate(paragraphs):
        # Match clusters in this Paragraph
        clus_matches, sentence_indices = [], []
        clusters = paragraph.coreferences()
        sentences = paragraph.tokenized()
        if clusters is not None:
          for clus_idx, cluster in enumerate(clusters):
            for cluster_pair in cluster:
              for stringy, sentence_index, x, y, z in cluster_pair:
                # Match this cluster if every word of every question NE
                # appears in this mention's string
                ne_match = True
                for ne_words, _ in nes:
                  for ne_word in ne_words:
                    if ne_word not in stringy:
                      ne_match = False
                if ne_match:
                  clus_matches.append(clus_idx)
          clus_matches = set(clus_matches)
          # Add sentence indices for each matched cluster
          for clus_idx in clus_matches:
            cluster = clusters[clus_idx]
            for cluster_pair in cluster:
              for _, sentence_index, x, y, z in cluster_pair:
                sentence_indices.append(sentence_index)
          sentence_indices = set(sentence_indices)
          # Sanity check since CoreNLP might mess up coref sentence indexing
          if len(sentence_indices) > 0 and max(sentence_indices) < len(sentences):
            for sentence_index in sentence_indices:
              # Get keyword count; add 1 to bias coref-matched sentences slightly
              try:
                count = naive_filter_sentences(keywords, [sentences[sentence_index]], filter_zero=False)[0][1] + 1
                global_matches.append((count, doc_idx, para_idx, sentence_index))
              except Exception:
                # Dump debugging context, then re-raise the original error
                print "DOC INDEX", doc_idx
                print "PARA INDEX", para_idx
                print "PARA SENTENCES", paragraph.sentences()
                print "KEYWORDS", keywords
                print "SENTENCE INDEX", sentence_index, "of", len(sentences)
                print "SENTENCES", sentences
                print "CLUSTERS", clusters
                raise
    return global_matches
  
  # Returns sentences that contain keywords from the question, ordered by
  # the number of times question keywords appear in each sentence
  def filter_by_keyword_count(self, question_features, doc_limit=20):
    keywords = question_features["keywords"]
    global_matches = []
    
    # Loop through each document
    for doc_idx in range(0, min(doc_limit, len(self.docs.docs))):
      # paragraphs is a list of CoreNLPFeatures
      paragraphs = self.docs.load_paras(doc_idx)
      for paragraph_idx, paragraph in enumerate(paragraphs):
        # Match against lemmatized sentences rather than raw tokens
        sentences = paragraph.lemmas()
        matches = naive_filter_sentences(keywords, sentences)
        matches = [ (count,doc_idx,paragraph_idx,sent_idx) for sent_idx,count in matches ]
        global_matches.extend(matches)
    
    # sort the matches by counts
    global_matches = sorted( global_matches, key=lambda x: -x[0] )
    return global_matches
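  # Note: naive_filter_sentences is a project helper not shown here. From its
  # uses in this class it appears to take (keywords, sentences) and return a
  # list of (sentence_index, keyword_count) pairs, keeping zero-count sentences
  # when filter_zero=False; this is inferred from usage, not from its definition.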
  
  def filter_by_answer_type(self, question_features, indices):
    question_classification = question_features['classification']
    answer_type = liroth_to_corenlp(question_classification)
    global_matches = []
    
    for doc_idx,paragraph_idx,sent_idx in indices:
      paragraphs = self.docs.load_paras(doc_idx)
      paragraph = paragraphs[paragraph_idx]
      named_entities = paragraph.named_entities()
      nes_in_sentence = named_entities[sent_idx]
      for words,ne_type in nes_in_sentence:
        if ne_type == answer_type: # or answer_type == None:
          global_matches.append( words )
    global_matches = [tuple(x) for x in global_matches]
    set_matches = set()
    filtered_matches = []
    for w in global_matches:
      if w not in set_matches:
        set_matches.add(w)
        filtered_matches.append(w)
    return filtered_matches
  
  # Return NPs of sentences, given sentence indices, filtering out NEs that appear
  # in the question itself
  def filter_by_nps(self, question_features, indices):
    word_filter = list(itertools.chain.from_iterable([w for w,t in question_features['nes']]))
    global_matches = []
    for doc_idx,paragraph_idx,sent_idx in indices:
      paragraphs = self.docs.load_paras(doc_idx)
      paragraph = paragraphs[paragraph_idx]
      sentence_parse_tree = paragraph.parse_trees(flatten=True)[sent_idx]
      nps_in_sentence = naive_extract_nps(sentence_parse_tree, word_filter)
      nps_in_sentence = [ [w for w,p in np] for np in nps_in_sentence]
      global_matches.extend(nps_in_sentence)
    return global_matches
  
  # Return NPs of sentences, given sentence indices, filtering out NEs that appear
  # in the question itself, and prioritize NPs that are adjacent (+/- 1 NP) to
  # NPs containing the question's NEs
  def filter_by_nps_nearby(self, question_features, indices):
    word_filter = list(itertools.chain.from_iterable([w for w,t in question_features['nes']]))
    high_matches, low_matches = [], []
    for doc_idx,paragraph_idx,sent_idx in indices:
      paragraphs = self.docs.load_paras(doc_idx)
      paragraph = paragraphs[paragraph_idx]
      sentence_parse_tree = paragraph.parse_trees(flatten=True)[sent_idx]
      nps_in_sentence = naive_extract_nps(sentence_parse_tree)
      nps_in_sentence = [ tuple([w for w,p in np]) for np in nps_in_sentence]
      # Pick NPs near NEs and prioritize them
      high, low = [], []
      for i, np in enumerate(nps_in_sentence):
        for words, word_type in question_features['nes']:
          # Only treat the NP as containing this NE if every NE word is present
          matched = True
          for word in words:
            if word not in np: matched = False
          if matched:
            if i+1 < len(nps_in_sentence) and nps_in_sentence[i+1] not in high:
              high.append(nps_in_sentence[i+1])
            if nps_in_sentence[i] not in high:
              high.append(nps_in_sentence[i])
            if i-1 >= 0 and nps_in_sentence[i-1] not in high:
              high.append(nps_in_sentence[i-1])
      # Any remaining NPs in the sentence follow the prioritized ones
      for np in nps_in_sentence:
        if np not in high: high.append(np)
      high_matches.extend(high)
      low_matches.extend(low)
    # Drop words that already appear in the question's NEs
    high_matches = [[w for w in np if w not in word_filter] for np in high_matches]
    low_matches = [[w for w in np if w not in word_filter] for np in low_matches]
    return high_matches + low_matches
    
  # Walks up the hypernym hierarchy: returns True if the sense itself, or any
  # of its (transitive) hypernyms, appears in answer_types; returns False once
  # there are no more hypernyms to follow
  def wordnet_hypernym_recursion(self, sense, answer_types):
    if sense in answer_types:
      return True
    else:
      hypernyms = sense.hypernyms()
      for hypernym in hypernyms:
        found = self.wordnet_hypernym_recursion(hypernym, answer_types)
        if found:
          return True
      return False
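  # Illustrative check using NLTK's WordNet API (kept as comments, since the
  # project's answer-type sets are built elsewhere): the hypernym chain of
  # wn.synsets('dog')[0] (dog.n.01) eventually reaches animal.n.01, so
  #   wordnet_hypernym_recursion(wn.synsets('dog')[0], [wn.synset('animal.n.01')])
  # returns True, while an unrelated target like wn.synset('color.n.01')
  # would yield False.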

  def filter_by_wordnet(self, question_features, indices):
    question_classification = question_features['classification']
    coarse_question_class = question_classification.split(':')[0]
    answer_types = liroth_to_wordnet(question_classification)
    global_matches = []
    for doc_idx,paragraph_idx,sent_idx in indices:
      paragraphs = self.docs.load_paras(doc_idx)
      paragraph = paragraphs[paragraph_idx]
      tokenized_sentences = paragraph.tokenized()
      tokens = tokenized_sentences[sent_idx]
      for token in tokens:
        if answer_types is None:
          global_matches.append([token])
        else:
          token_synsets = wn.synsets(token)
          found = False
          for sense in token_synsets:
            if coarse_question_class in ['ENTY','NUM']:
              # this part is for words that are not names, like 'craters', 'dinosaurs'
              # might be applicable for definition and entity type questions
              found = self.wordnet_hypernym_recursion(sense, answer_types)
              if found:
                break
            else:
              # this part is for words that are names, like 'Australia';
              # names require instance_hypernyms() to get what they are
              instance_synsets = sense.instance_hypernyms()
              for instance_sense in instance_synsets:
                found = self.wordnet_hypernym_recursion(instance_sense, answer_types)
                if found:
                  break
              # Also stop scanning further senses once a match is found
              if found:
                break
          if found:
            global_matches.append([token]) 
    global_matches = [tuple(x) for x in global_matches]
    set_matches = set(global_matches)
    filtered_matches = []
    for w in global_matches:
      if w in set_matches:
        set_matches.remove(w)
        filtered_matches.append(w)
    return filtered_matches
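# A minimal, hypothetical usage sketch (not part of the project): the
# question_features dict is assumed to carry the keys read above ('pos',
# 'parse_tree', 'nes', 'keywords', 'classification'), presumably produced by a
# separate question-analysis step.
#
#   doc_features = DocFeatures(qno=42)
#   candidates = doc_features.filter_sentences(question_features, doc_limit=20)
#   # candidates is a ranked list of candidate answer word sequences,
#   # e.g. [('Golden', 'Gate', 'Bridge'), ...]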