Exemple #1
0
    def align_single_concept(self, sent, tokens, cur_var, amr, alignment,
                             unmatched_vars, triples, NEXT=False):
        '''Align one AMR variable (or constant) to a span of the sentence.

        The concept of ``cur_var`` is classified with ``self.concept_patterns``;
        the name of the matching group selects the alignment rule.  Several
        rules recurse into the children of ``cur_var``.

        Parameters:
            sent: whitespace-joined text of the tokens not consumed so far.
            tokens: list of ``(index, word)`` pairs not consumed so far.
            cur_var: AMR variable or constant to align.
            amr: AMR graph; ``amr[var]`` maps relation -> children and
                ``amr.node_to_concepts`` maps variable -> concept.
            alignment: mapping variable -> list of spans; appended to in place.
            unmatched_vars, triples: bookkeeping that is only forwarded to
                ``remove_aligned_concepts`` / ``try_align_as_single_concept``.
            NEXT: when true, skip the "already aligned" shortcut so that the
                variable is searched again in the remaining sentence.

        Returns:
            ``(update, sent, tokens)``.  ``update`` is true when ``cur_var``
            was aligned (or already was); ``sent`` and ``tokens`` are the
            sentence text and token list left after removing the new span.
        '''
        if cur_var in amr.node_to_concepts:
            cur_concept = amr.node_to_concepts[cur_var]
        else:
            cur_concept = cur_var

        # Already aligned: nothing to do unless a re-search was requested or
        # the node is a literal (literals may legitimately occur repeatedly).
        if cur_var in alignment and not NEXT and not isinstance(cur_var, (StrLiteral, Quantity, Polarity)):
            return True, sent, tokens

        match = self.concept_patterns.match(cur_concept)
        if not match:
            # Every caller unpacks three values, so never fall through to an
            # implicit ``None`` when the concept cannot be classified.
            sys.stderr.write('Can not find type of concept %s / %s\n' % (cur_var, cur_concept))
            return False, sent, tokens

        rule_type = match.lastgroup
        span = None
        update = True

        if rule_type == "NameEntity":
            NE_items = [v[0] for k, v in amr[cur_var].items() if isinstance(v[0], StrLiteral)]
            # Second alternative matches on the first four characters of each
            # name part that is longer than three characters.
            nep = r'%s|%s' % (r'\s'.join(NE_items),
                              r'\s'.join(n[:4] if len(n) > 3 else n for n in NE_items))
            NE_pattern = re.compile(nep, re.IGNORECASE)

            start, end = self._search_sent(NE_pattern, sent, tokens)
            assert end - start == len(NE_items)
            span = Span(start, end, Aligner.ENTITY_TAG_TABLE[rule_type], NE_items)
            alignment[cur_var].append(span)
            for k, v in amr[cur_var].items():
                if isinstance(v[0], StrLiteral):
                    self.remove_aligned_concepts(cur_var, k, v[0], unmatched_vars, triples)

        elif rule_type in ["DateEntity", "haveOrgRole91", "RateEntity"]:
            EN_items = []
            EN_spans = []
            for k, v in amr[cur_var].items():
                vconcept = amr.node_to_concepts[v[0]] if v[0] in amr.node_to_concepts else v[0]
                EN_items.append(vconcept)
                success, sent, tokens = self.align_single_concept(
                    sent, tokens, v[0], amr, alignment, unmatched_vars, triples)

                sp = alignment[v[0]][-1]
                sp.set_entity_tag(Aligner.ENTITY_TAG_TABLE[rule_type])
                EN_spans.append(sp)
                self.remove_aligned_concepts(cur_var, k, v[0], unmatched_vars, triples)
            # The entity span runs from the first child span to the last one.
            start = EN_spans[0].start
            end = EN_spans[-1].end
            span = Span(start, end, Aligner.ENTITY_TAG_TABLE[rule_type], EN_items)
            span.set_entity_tag(Aligner.ENTITY_TAG_TABLE[rule_type])
            alignment[cur_var].append(span)

        elif rule_type == "QuantityEntity":
            quantity = ''
            unit = ''
            unit_var = None
            q_success = False
            u_success = False

            for k, v in amr[cur_var].items():
                if k == 'quant':
                    quantity = v[0]
                    q_success, sent, tokens = self.align_single_concept(
                        sent, tokens, quantity, amr, alignment, unmatched_vars, triples)
                elif k == 'unit':
                    unit_var = v[0]
                    unit = amr.node_to_concepts[v[0]]
                    u_success, sent, tokens = self.align_single_concept(
                        sent, tokens, unit_var, amr, alignment, unmatched_vars, triples)

            if q_success and u_success:
                quantity_span = alignment[quantity][-1]
                unit_span = alignment[unit_var][0]
                quantity_first = quantity_span.start < unit_span.end
                start = quantity_span.start if quantity_first else unit_span.start
                end = unit_span.end if quantity_first else quantity_span.end
                # Quantity and unit should be (almost) adjacent; otherwise the
                # quantity matched a different number in the sentence, so drop
                # that match and search again in the updated sentence.
                # NOTE(review): only ``start`` is refreshed inside the loop and
                # an unsuccessful re-search is not checked - confirm intended.
                while not (end - len(quantity_span.words) - len(unit_span.words) - start < 2):
                    alignment[quantity].pop()
                    q_success, sent, tokens = self.align_single_concept(
                        sent, tokens, quantity, amr, alignment, unmatched_vars, triples, NEXT=True)
                    quantity_span = alignment[quantity][-1]
                    start = quantity_span.start
                span = Span(start, end, Aligner.ENTITY_TAG_TABLE[rule_type], [quantity, unit])
                self.remove_aligned_concepts(cur_var, 'quant', quantity, unmatched_vars, triples)
                alignment[cur_var].append(span)
            elif q_success:
                # No unit, or the unit could not be aligned.
                quantity_span = alignment[quantity][0]
                start = quantity_span.start
                end = quantity_span.end
                span = Span(start, end, Aligner.ENTITY_TAG_TABLE[rule_type], [quantity])
                self.remove_aligned_concepts(cur_var, 'quant', quantity, unmatched_vars, triples)
                alignment[cur_var].append(span)
            elif u_success:
                unit_span = alignment[unit_var][0]
                span = Span(unit_span.start, unit_span.end, Aligner.ENTITY_TAG_TABLE[rule_type], [unit])
                self.remove_aligned_concepts(cur_var, 'unit', unit_var, unmatched_vars, triples)
                alignment[cur_var].append(span)
            else:
                # Neither part aligned: fall back to the single-concept rule.
                rule_type = 'SingleConcept'

        elif rule_type == "Number":
            # NOTE(review): the trailing group is ``(\s|&)``; the ``&`` may have
            # been meant as ``$`` (end of sentence) - confirm before changing.
            if re.match('[0-9]+:[0-9]+', cur_concept):
                num = [('time', '(\\s|^)(' + cur_concept + ')(\\s|&)'),
                       ('english', '(\\s|^)(' + to_time(cur_concept) + ')(\\s|&)')]
            else:
                num = [('digit', '(\\s|^)(' + cur_concept + '|' + format_num(cur_concept) + ')(\\s|&)'),
                       ('string', '(\\s|^)(' + english_number(int(cur_concept)) + ')(\\s|&)'),
                       ('order', '(\\s|^)(' + to_order(cur_concept) + ')(\\s|&)'),
                       ('round', '(\\s|^)(' + to_round(int(cur_concept)) + ')(\\s|&)')]
            NUM_pattern = self._compile_regex_rule(num)
            try:
                start, end = self._search_sent(NUM_pattern, sent, tokens)
                span = Span(start, end, Aligner.ENTITY_TAG_TABLE[rule_type],
                            [w for i, w in tokens if start <= i < end])
                alignment[cur_var].append(span)
            except Exception as e:
                # Best effort: a number that cannot be located stays unaligned.
                update = False
                print >> sys.stderr, e

        elif rule_type == 'multiple':
            op1 = amr[cur_var]['op1'][0]

            success, sent, tokens = self.align_single_concept(
                sent, tokens, op1, amr, alignment, unmatched_vars, triples)
            if success:
                span = alignment[op1][0]
                alignment[cur_var].append(span)
                self.remove_aligned_concepts(cur_var, 'op1', op1, unmatched_vars, triples)
            else:
                update = False

        elif rule_type in ["person", "picture", "country", "state", "city", "desert", "organization"]:
            if 'name' in amr[cur_var]:
                # The entity shares the span of its name node.
                k_var = amr[cur_var]['name'][0]
                success, sent, tokens = self.align_single_concept(
                    sent, tokens, k_var, amr, alignment, unmatched_vars, triples)
                span = alignment[k_var][0]
                span.set_entity_tag(Aligner.ENTITY_TAG_TABLE[rule_type + '-name'])
                alignment[cur_var].append(span)
            else:
                ind, span = self.try_align_as_single_concept(
                    cur_var, cur_concept, amr, alignment, tokens, unmatched_vars, triples)
                if ind:
                    pass
                elif 'ARG0-of' in amr[cur_var]:
                    k_var = amr[cur_var]['ARG0-of'][0]
                    success, sent, tokens = self.align_single_concept(
                        sent, tokens, k_var, amr, alignment, unmatched_vars, triples)
                    if success:
                        span = alignment[k_var][0]
                        span.set_entity_tag(Aligner.ENTITY_TAG_TABLE[rule_type])
                        alignment[cur_var].append(span)
                    else:
                        update = False
                else:
                    update = False

        elif rule_type == "NegPolarity":
            # Take the first remaining token that is a known negation word.
            for i, token in tokens:
                if token.lower() in Aligner.neg_polarity:
                    span = Span(i, i + 1, Aligner.ENTITY_TAG_TABLE[rule_type], [token])
                    alignment[cur_var].append(span)
                    break
            else:
                print >> sys.stderr, 'Variable/Concept %s/%s cannot be aligned'%(cur_var,cur_concept)
                update = False

        elif rule_type == "thing":
            if 'ARG1-of' in amr[cur_var]:
                k_var = amr[cur_var]['ARG1-of'][0]
                success, sent, tokens = self.align_single_concept(
                    sent, tokens, k_var, amr, alignment, unmatched_vars, triples)
                if success:
                    span = alignment[k_var][0]
                    span.set_entity_tag(Aligner.ENTITY_TAG_TABLE[rule_type])
                    alignment[cur_var].append(span)
                else:
                    update = False
            else:
                rule_type = 'SingleConcept'

        elif rule_type == 'OrdinalEntity':
            val = amr[cur_var]['value'][0]
            success, sent, tokens = self.align_single_concept(
                sent, tokens, val, amr, alignment, unmatched_vars, triples)
            self.remove_aligned_concepts(cur_var, 'value', val, unmatched_vars, triples)
            span = alignment[val][0]
            span.set_entity_tag(Aligner.ENTITY_TAG_TABLE[rule_type])
            alignment[cur_var].append(span)

        elif rule_type == 'RelativePosition':
            if 'direction' in amr[cur_var] and \
                    amr.node_to_concepts[amr[cur_var]['direction'][0]] == 'away':
                # "away" direction is aligned to the token "from".
                dir_var = amr[cur_var]['direction'][0]
                for i, tok in tokens:
                    if tok.lower() == 'from':
                        span = Span(i, i + 1, Aligner.ENTITY_TAG_TABLE[rule_type], [tok])
                        alignment[cur_var].append(span)
                        alignment[dir_var].append(span)
                        break
                else:
                    print >> sys.stderr, 'Variable/Concept %s/%s cannot be aligned'%(cur_var,cur_concept)
                    update = False
            else:
                rule_type = 'SingleConcept'

        elif self.is_ago(cur_var, cur_concept, amr):
            k_var = amr[cur_var]['op1'][0]
            for i, tok in tokens:
                if tok.lower() == 'ago':
                    span = Span(i, i + 1, Aligner.ENTITY_TAG_TABLE['ago'], [tok])
                    alignment[cur_var].append(span)
                    alignment[k_var].append(span)
                    break
            else:
                print >> sys.stderr, '(%s/%s) :op1 (%s/%s) cannot be aligned'%(cur_var,cur_concept,k_var,amr.node_to_concepts[k_var])
                update = False

        elif self.is_why_question(cur_var, amr):
            arg0_var = amr[cur_var]['ARG0'][0]
            for i, tok in tokens:
                if tok.lower() == 'why':
                    span = Span(i, i + 1, Aligner.ENTITY_TAG_TABLE['cause'], [tok])
                    alignment[cur_var].append(span)
                    alignment[arg0_var].append(span)
                    break
            else:
                print >> sys.stderr, '(%s/%s) :op1 (%s/%s) cannot be aligned'%(cur_var,cur_concept,arg0_var,amr.node_to_concepts[arg0_var])
                update = False

        # Rules above may have downgraded themselves to the generic rule.
        if rule_type == "SingleConcept":
            update, span = self.try_align_as_single_concept(
                cur_var, cur_concept, amr, alignment, tokens, unmatched_vars, triples)
        elif cur_var not in alignment:
            print >> sys.stderr, 'Can not find type of concept %s / %s'%(cur_var,cur_concept)

        if update and span is None:
            # No rule produced a span (e.g. a rule type without a branch), so
            # there is nothing to remove from the sentence; report failure
            # instead of crashing on ``span.start``.
            update = False

        if update:
            previous_tokens = tokens
            tokens = [(i, tk) for i, tk in tokens if not (span.start <= i < span.end)]
            sent = ' '.join(x for i, x in tokens)
            if self.verbose > 2:
                # Only words still unconsumed before this step can be shown;
                # the original full sentence is not available in this method.
                matched_words = ' '.join(tk for i, tk in previous_tokens if span.start <= i < span.end)
                print >> sys.stderr, "Concept '%s' Matched to span '%s' "%(cur_concept,matched_words)
                # Parenthesised single-argument print is identical under Python 2.
                print(sent)
                print(alignment)
        return update, sent, tokens
Exemple #2
0
    def span_align(self, sentence, amr):
        '''Greedily align AMR concepts to sentence spans with hand-written rules.

        Every non-``StrLiteral`` node returned by ``amr.bfs()[0]`` is
        classified with ``self.concept_patterns``; the matching group name
        selects the rule.  Tokens covered by a found span are removed from the
        working sentence so that later concepts cannot reuse them.

        Parameters:
            sentence: the tokenised sentence as a whitespace-separated string.
            amr: AMR graph; ``amr[var]`` maps relation -> children and
                ``amr.node_to_concepts`` maps variable -> concept.

        Returns:
            A ``defaultdict(list)`` mapping variable -> list of spans, plus
            the entry ``'root'`` set to ``0``.

        Raises:
            Exception: when a concept matches none of ``self.concept_patterns``.
        '''
        sent = sentence[:]
        alignment = defaultdict(list)
        alignment['root'] = 0
        # Token positions are 1-based.
        tokens = [(i + 1, x) for i, x in enumerate(sent.split())]

        unmatched_vars = list(set([var for var in amr.bfs()[0] if not isinstance(var, StrLiteral)]))

        while unmatched_vars:
            cur = unmatched_vars.pop(0)
            if cur in amr.node_to_concepts:
                cur_concept = amr.node_to_concepts[cur]
            else:
                cur_concept = cur

            match = self.concept_patterns.match(cur_concept)
            if not match:
                raise Exception('Can not find type of concept %s / %s'%(cur,cur_concept))

            rule_type = match.lastgroup
            span = None
            update = True

            if rule_type == "NameEntity":
                NE_items = [v[0] for k, v in amr[cur].items()]
                NE_pattern = re.compile(r"\s".join(NE_items), re.IGNORECASE)

                start, end = self._search_sent(NE_pattern, sent, tokens)
                assert end - start == len(NE_items)
                span = Span(start, end, Aligner.ENTITY_TAG_TABLE[rule_type], NE_items)
                alignment[cur].append(span)

            elif rule_type == "QuantityEntity":
                quantity = ''
                unit = ''
                unit_var = None
                for k, v in amr[cur].items():
                    if k == 'quant':
                        quantity = v[0]
                    elif k == 'unit':
                        unit_var = v[0]
                        unit = amr.node_to_concepts[v[0]]
                if quantity and unit:
                    # Digits or the spelled-out number, followed by the unit
                    # (optionally plural).
                    QTY_pattern = re.compile(r'(%s|%s)\s+(%s)s?' % (quantity, english_number(int(quantity)), unit),
                                             re.IGNORECASE)
                    QTY_items = [quantity, unit]
                    start, end = self._search_sent(QTY_pattern, sent, tokens)
                    assert end - start == len(QTY_items)
                    span = Span(start, end, Aligner.ENTITY_TAG_TABLE[rule_type], QTY_items)
                    alignment[cur].append(span)

                    self.remove_aligned_concepts(unmatched_vars, amr[cur].items())

            elif rule_type == "NegPolarity":
                # Take the first remaining token that is a known negation word.
                for i, token in tokens:
                    if token.lower() in Aligner.neg_polarity:
                        span = Span(i, i + 1, Aligner.ENTITY_TAG_TABLE[rule_type], [token])
                        alignment[cur].append(span)
                        break
                else:
                    update = False

            elif rule_type == "SingleConcept":
                # Split off a trailing sense suffix such as "-01".
                tmp = cur_concept.rsplit('-', 1)
                sense = None
                if len(tmp) == 2:
                    sense = tmp[1]
                cur_concept = tmp[0].lower()
                for idx, token in tokens:
                    t = token.lower()
                    # Matchers are tried from strictest to loosest; the first
                    # token accepted by any of them wins.
                    if (t == cur_concept
                            or self.fuzzy_match(t, cur_concept, Aligner.fuzzy_max_len)
                            or self.is_neg_form(t, cur_concept)
                            or self.WN_lemma_match(t, cur_concept, sense)
                            or self.is_spec_form(t, cur_concept)):
                        span = Span(idx, idx + 1, Aligner.ENTITY_TAG_TABLE[rule_type], [t])
                        break

                if span:
                    alignment[cur].append(span)
                else:
                    print >> sys.stderr, 'Variable/Concept %s/%s cannot be aligned'%(cur,cur_concept)
                    update = False

            if update and span is None:
                # A rule without a branch here, or a quantity entity lacking
                # quant/unit, yields no span: leave the sentence untouched
                # instead of crashing on ``span.start``.
                update = False

            if update:
                tokens = [(i, tk) for i, tk in tokens if not (span.start <= i < span.end)]
                sent = ' '.join(x for i, x in tokens)
                if self.verbose > 2:
                    matched_words = ' '.join(w for i, w in enumerate(sentence.split())
                                             if span.start <= i + 1 < span.end)
                    print >> sys.stderr, "Concept '%s' Matched to span '%s' "%(cur_concept,matched_words)
                    # Parenthesised single-argument print is identical under Python 2.
                    print(sent)
                    print(alignment)

        return alignment
Exemple #3
0
    def word_align(self, sentence, amr):
        """Greedily align AMR concepts to word positions with hand-written rules.

        Follows the rule-based approach referenced in the original comment
        (Flanigan, 2014 ACL).  Every non-``StrLiteral`` node returned by
        ``amr.bfs()[0]`` is classified with ``self.concept_patterns``; nodes
        whose concept matches no pattern, or whose rule has no branch here,
        are skipped.  Tokens that have been aligned are removed from the
        working sentence.

        Parameters:
            sentence: the tokenised sentence as a whitespace-separated string.
            amr: AMR graph; ``amr[var]`` maps relation -> children and
                ``amr.node_to_concepts`` maps variable -> concept.

        Returns:
            A ``defaultdict(list)`` mapping node -> list of 1-based token
            positions.  Name and quantity entities, and concepts that could
            not be aligned, are mapped to themselves.  ``'root'`` is ``0``.

        Raises:
            Exception: when a quantity entity lacks its quantity or its unit.
        """
        sent = sentence[:]  # copy the sentence
        alignment = defaultdict(list)
        alignment['root'] = 0

        # Token positions are 1-based.
        tokens = [(i + 1, x) for i, x in enumerate(sent.split())]

        # single root graph
        unmatched_variables = list(set([var for var in amr.bfs()[0] if not isinstance(var, StrLiteral)]))

        while unmatched_variables:
            cur = unmatched_variables.pop(0)
            if cur in amr.node_to_concepts:
                cur_concept = amr.node_to_concepts[cur]
            else:  # node has no concept
                cur_concept = cur

            match = self.concept_patterns.match(cur_concept)
            if not match:
                continue

            rule_type = match.lastgroup
            matched_variable = cur
            span = None
            update = True

            if rule_type == 'NameEntity':
                NE_items = [v[0] for k, v in amr[matched_variable].items()]
                NE_pattern = re.compile(r"\s".join(NE_items), re.IGNORECASE)

                span = self._search_sent(NE_pattern, sent, tokens)
                # Each name part is aligned to its own token position.
                for sid, n in zip(range(span[0], span[1]), NE_items):
                    alignment[n].append(sid)
                alignment[matched_variable].append(matched_variable)

            elif rule_type == 'QuantityEntity':
                quantity = ''
                unit = ''
                unit_node = None
                for k, v in amr[matched_variable].items():
                    if k == 'quant':
                        quantity = v[0]
                    elif k == 'unit':
                        unit = amr.node_to_concepts[v[0]]
                        unit_node = v[0]
                    # any other relation is a modifier and is ignored here

                if not (quantity and unit):
                    missing = ''
                    if quantity == '':
                        missing += ' quantity'
                    if unit == '':
                        missing += ' unit'
                    raise Exception('Quantity Entity %s does not contain %s'%(cur_concept,missing))

                # Digits or the spelled-out number, followed by the unit
                # (optionally plural).
                TEMP_pattern = re.compile(r'(%s|%s)\s+(%s)s?' % (quantity, english_number(int(quantity)), unit),
                                          re.IGNORECASE)
                TEMP_items = [quantity, unit_node]

                span = self._search_sent(TEMP_pattern, sent, tokens)
                for sid, n in zip(range(span[0], span[1]), TEMP_items):
                    alignment[n].append(sid)
                alignment[matched_variable].append(matched_variable)

                self.remove_aligned_concepts(unmatched_variables, amr[matched_variable].items())

            elif rule_type == 'NegPolarity':
                # Take the first remaining token that is a known negation word.
                for i, token in tokens:
                    if token.lower() in Aligner.neg_polarity:
                        span = (i, i + 1)
                        alignment[matched_variable].append(i)
                        break
                else:
                    update = False

            elif rule_type == 'SingleConcept':
                # Split off a trailing sense suffix such as "-01".
                tmp = cur_concept.rsplit('-', 1)
                sense = None
                if len(tmp) == 2:
                    sense = tmp[1]
                cur_concept = tmp[0].lower()
                for idx, token in tokens:
                    t = token.lower()
                    # Matchers are tried from strictest to loosest; the first
                    # token accepted by any of them wins.
                    if (t == cur_concept
                            or self.fuzzy_match(t, cur_concept, Aligner.fuzzy_max_len)
                            or self.WN_lemma_match(t, cur_concept, sense)
                            or self.is_spec_form(t, cur_concept)):
                        span = (idx, idx + 1)
                        break

                if span:
                    alignment[matched_variable].append(idx)
                else:
                    print >> sys.stderr, 'WARNING: Variable %s/%s cannot be aligned'%(matched_variable,cur_concept)
                    alignment[matched_variable].append(matched_variable)
                    update = False

            if update and span is None:
                # The concept matched a rule that has no branch above, so no
                # span exists; skip the update instead of crashing on
                # ``span[0]``.
                update = False

            if update:
                tokens = [(i, tk) for i, tk in tokens if not (span[0] <= i < span[1])]
                sent = ' '.join(x for i, x in tokens)
                if self.verbose > 2:
                    matched_words = ' '.join(w for i, w in enumerate(sentence.split())
                                             if span[0] <= i + 1 < span[1])
                    print >> sys.stderr, "Concept '%s' Matched to span '%s' "%(cur_concept,matched_words)
                    # Parenthesised single-argument print is identical under Python 2.
                    print(sent)
                    print(alignment)

        return alignment
Exemple #4
0
    def align_single_concept(self,
                             sent,
                             tokens,
                             cur_var,
                             amr,
                             alignment,
                             unmatched_vars,
                             triples,
                             NEXT=False):
        '''align single concept'''

        if cur_var in amr.node_to_concepts:
            cur_concept = amr.node_to_concepts[cur_var]
        else:
            cur_concept = cur_var

        if cur_var in alignment and not NEXT and not isinstance(
                cur_var, (StrLiteral, Quantity, Polarity)):  # already aligned
            return True, sent, tokens

        match = self.concept_patterns.match(cur_concept)
        if match:
            rule_type = match.lastgroup
            span = None
            update = True
            if rule_type == "NameEntity":
                NE_items = [
                    v[0] for k, v in amr[cur_var].items()
                    if isinstance(v[0], StrLiteral)
                ]
                nep = r'%s|%s' % (r'\s'.join(NE_items), r'\s'.join(
                    n[:4] if len(n) > 3 else n for n in NE_items))
                NE_pattern = re.compile(nep, re.IGNORECASE)

                start, end = self._search_sent(NE_pattern, sent, tokens)
                assert end - start == len(NE_items)
                span = Span(start, end, Aligner.ENTITY_TAG_TABLE[rule_type],
                            NE_items)
                alignment[cur_var].append(span)
                for k, v in amr[cur_var].items():
                    if isinstance(v[0], StrLiteral):
                        self.remove_aligned_concepts(cur_var, k, v[0],
                                                     unmatched_vars, triples)

            elif rule_type in ["DateEntity", "haveOrgRole91", "RateEntity"]:
                EN_items = []
                EN_spans = []
                for k, v in amr[cur_var].items():
                    vconcept = amr.node_to_concepts[
                        v[0]] if v[0] in amr.node_to_concepts else v[0]
                    EN_items.append(vconcept)
                    success, sent, tokens = self.align_single_concept(
                        sent, tokens, v[0], amr, alignment, unmatched_vars,
                        triples)

                    sp = alignment[v[0]][-1]
                    sp.set_entity_tag(Aligner.ENTITY_TAG_TABLE[rule_type])
                    EN_spans.append(sp)
                    self.remove_aligned_concepts(cur_var, k, v[0],
                                                 unmatched_vars, triples)
                #print NE_spans,alignment
                start = EN_spans[0].start
                end = EN_spans[-1].end
                span = Span(start, end, Aligner.ENTITY_TAG_TABLE[rule_type],
                            EN_items)
                span.set_entity_tag(Aligner.ENTITY_TAG_TABLE[rule_type])
                alignment[cur_var].append(span)

            elif rule_type == "QuantityEntity":
                quantity = ''
                unit = ''
                unit_var = None
                q_success = False
                u_success = False

                for k, v in amr[cur_var].items():
                    if k == 'quant':
                        quantity = v[0]
                        q_success, sent, tokens = self.align_single_concept(
                            sent, tokens, quantity, amr, alignment,
                            unmatched_vars, triples)
                    elif k == 'unit':
                        unit_var = v[0]
                        unit = amr.node_to_concepts[v[0]]
                        u_success, sent, tokens = self.align_single_concept(
                            sent, tokens, unit_var, amr, alignment,
                            unmatched_vars, triples)
                    else:
                        pass

                if q_success and u_success:
                    #QTY_pattern = r'(%s|%s)\s+(%s)s?' % (quantity,english_number(int(quantity)),unit)
                    #QTY_items = [quantity,unit]
                    #start,end = self._search_sent(QTY_pattern,QTY_items,sent,tokens)
                    #assert end - start == len(QTY_items)
                    quantity_span = alignment[quantity][-1]
                    unit_span = alignment[unit_var][0]
                    start = quantity_span.start if quantity_span.start < unit_span.end else unit_span.start
                    end = unit_span.end if quantity_span.start < unit_span.end else quantity_span.end
                    while not (
                            end - len(quantity_span.words) -
                            len(unit_span.words) - start < 2
                    ):  # wrong match more than one quantity to map in sentence
                        alignment[quantity].pop()
                        q_success, sent, tokens = self.align_single_concept(
                            sent,
                            tokens,
                            quantity,
                            amr,
                            alignment,
                            unmatched_vars,
                            triples,
                            NEXT=True)  # redo it on updated sentence
                        quantity_span = alignment[quantity][-1]
                        start = quantity_span.start
                    #assert start == end - 2
                    span = Span(start, end,
                                Aligner.ENTITY_TAG_TABLE[rule_type],
                                [quantity, unit])
                    self.remove_aligned_concepts(cur_var, 'quant', quantity,
                                                 unmatched_vars, triples)
                    alignment[cur_var].append(span)
                elif q_success and not u_success:  # does not have unit or unit cannot be aligned
                    quantity_span = alignment[quantity][0]
                    start = quantity_span.start
                    end = quantity_span.end
                    span = Span(start, end,
                                Aligner.ENTITY_TAG_TABLE[rule_type],
                                [quantity])
                    self.remove_aligned_concepts(cur_var, 'quant', quantity,
                                                 unmatched_vars, triples)
                    alignment[cur_var].append(span)
                    #self.remove_aligned_concepts(unmatched_vars,amr[cur_var].items())
                elif not q_success and u_success:
                    unit_span = alignment[unit_var][0]
                    span = Span(unit_span.start, unit_span.end,
                                Aligner.ENTITY_TAG_TABLE[rule_type], [unit])
                    self.remove_aligned_concepts(cur_var, 'unit', unit_var,
                                                 unmatched_vars, triples)
                    alignment[cur_var].append(span)
                else:
                    rule_type = 'SingleConcept'
            elif rule_type == "Number":
                '''
                aligned = False
                num = [cur_var]
                num.extend(english_number(int(cur_var)).split('|'))
                for i,token in tokens:
                    if token.lower() in num:
                        aligned = True
                        break
                if aligned:
                    span = Span(i,i+1,Aligner.ENTITY_TAG_TABLE[rule_type],[token])
                    alignment[cur_var].append(span)
                else:
                    print >> sys.stderr, 'Variable/Concept %s/%s cannot be aligned'%(cur_var,cur_concept)
                    update = False
                '''
                if re.match('[0-9]+:[0-9]+', cur_concept):
                    num = [('time', '(\\s|^)(' + cur_concept + ')(\\s|&)'),
                           ('english',
                            '(\\s|^)(' + to_time(cur_concept) + ')(\\s|&)')]
                else:
                    num = [
                        ('digit', '(\\s|^)(' + cur_concept + '|' +
                         format_num(cur_concept) + ')(\\s|&)'),
                        ('string', '(\\s|^)(' +
                         english_number(int(cur_concept)) + ')(\\s|&)'),
                        ('order',
                         '(\\s|^)(' + to_order(cur_concept) + ')(\\s|&)'),
                        ('round',
                         '(\\s|^)(' + to_round(int(cur_concept)) + ')(\\s|&)')
                    ]
                NUM_pattern = self._compile_regex_rule(num)
                #print NUM_pattern.pattern
                try:
                    start, end = self._search_sent(NUM_pattern, sent, tokens)
                    span = Span(
                        start, end, Aligner.ENTITY_TAG_TABLE[rule_type],
                        [w for i, w in tokens if i in range(start, end)])
                    alignment[cur_var].append(span)
                except Exception as e:
                    update = False
                    print >> sys.stderr, e
                    #raw_input('CONTINUE')

            elif rule_type == 'multiple':
                op1 = amr[cur_var]['op1'][0]

                success, sent, tokens = self.align_single_concept(
                    sent, tokens, op1, amr, alignment, unmatched_vars, triples)
                if success:
                    span = alignment[op1][0]
                    alignment[cur_var].append(span)
                    self.remove_aligned_concepts(cur_var, 'op1', op1,
                                                 unmatched_vars, triples)
                else:
                    update = False

            elif rule_type in [
                    "person", "picture", "country", "state", "city", "desert",
                    "organization"
            ]:
                if 'name' in amr[cur_var]:
                    k_var = amr[cur_var]['name'][0]
                    success, sent, tokens = self.align_single_concept(
                        sent, tokens, k_var, amr, alignment, unmatched_vars,
                        triples)
                    span = alignment[k_var][0]
                    span.set_entity_tag(Aligner.ENTITY_TAG_TABLE[rule_type +
                                                                 '-name'])
                    alignment[cur_var].append(span)
                else:
                    ind, span = self.try_align_as_single_concept(
                        cur_var, cur_concept, amr, alignment, tokens,
                        unmatched_vars, triples)
                    if ind:
                        pass
                    elif 'ARG0-of' in amr[cur_var]:
                        k_var = amr[cur_var]['ARG0-of'][0]
                        success, sent, tokens = self.align_single_concept(
                            sent, tokens, k_var, amr, alignment,
                            unmatched_vars, triples)
                        if success:
                            span = alignment[k_var][0]
                            span.set_entity_tag(
                                Aligner.ENTITY_TAG_TABLE[rule_type])
                            alignment[cur_var].append(span)
                        else:
                            update = False

                    else:
                        update = False

            elif rule_type == "NegPolarity":
                aligned = False
                for i, token in tokens:
                    if token.lower() in Aligner.neg_polarity:
                        aligned = True
                        break
                if aligned:
                    span = Span(i, i + 1, Aligner.ENTITY_TAG_TABLE[rule_type],
                                [token])
                    alignment[cur_var].append(span)
                else:
                    print >> sys.stderr, 'Variable/Concept %s/%s cannot be aligned' % (
                        cur_var, cur_concept)
                    update = False

            elif rule_type == "thing":
                if 'ARG1-of' in amr[cur_var]:
                    k_var = amr[cur_var]['ARG1-of'][0]
                    success, sent, tokens = self.align_single_concept(
                        sent, tokens, k_var, amr, alignment, unmatched_vars,
                        triples)
                    if success:
                        span = alignment[k_var][0]
                        span.set_entity_tag(
                            Aligner.ENTITY_TAG_TABLE[rule_type])
                        alignment[cur_var].append(span)
                    else:
                        update = False
                else:
                    rule_type = 'SingleConcept'

            elif rule_type == 'OrdinalEntity':
                val = amr[cur_var]['value'][0]
                success, sent, tokens = self.align_single_concept(
                    sent, tokens, val, amr, alignment, unmatched_vars, triples)
                self.remove_aligned_concepts(cur_var, 'value', val,
                                             unmatched_vars, triples)
                span = alignment[val][0]
                span.set_entity_tag(Aligner.ENTITY_TAG_TABLE[rule_type])
                alignment[cur_var].append(span)

            elif rule_type == 'RelativePosition':
                if 'direction' in amr[cur_var]:
                    dir_var = amr[cur_var]['direction'][0]
                    if amr.node_to_concepts[dir_var] == 'away':
                        aligned = False
                        for i, tok in tokens:
                            if tok.lower() == 'from':
                                aligned = True
                                break
                        if aligned:
                            span = Span(i, i + 1,
                                        Aligner.ENTITY_TAG_TABLE[rule_type],
                                        [tok])
                            alignment[cur_var].append(span)
                            alignment[dir_var].append(span)
                        else:
                            print >> sys.stderr, 'Variable/Concept %s/%s cannot be aligned' % (
                                cur_var, cur_concept)
                            update = False
                    else:
                        rule_type = 'SingleConcept'
                else:
                    rule_type = 'SingleConcept'

            elif self.is_ago(cur_var, cur_concept, amr):
                k_var = amr[cur_var]['op1'][0]
                aligned = False
                for i, tok in tokens:
                    if tok.lower() == 'ago':
                        aligned = True
                        break
                if aligned:
                    span = Span(i, i + 1, Aligner.ENTITY_TAG_TABLE['ago'],
                                [tok])
                    alignment[cur_var].append(span)
                    alignment[k_var].append(span)
                else:
                    print >> sys.stderr, '(%s/%s) :op1 (%s/%s) cannot be aligned' % (
                        cur_var, cur_concept, k_var,
                        amr.node_to_concepts[k_var])
                    update = False

            elif self.is_why_question(cur_var, amr):
                arg0_var = amr[cur_var]['ARG0'][0]
                aligned = False
                for i, tok in tokens:
                    if tok.lower() == 'why':
                        aligned = True
                        break
                if aligned:
                    span = Span(i, i + 1, Aligner.ENTITY_TAG_TABLE['cause'],
                                [tok])
                    alignment[cur_var].append(span)
                    alignment[arg0_var].append(span)
                else:
                    print >> sys.stderr, '(%s/%s) :op1 (%s/%s) cannot be aligned' % (
                        cur_var, cur_concept, arg0_var,
                        amr.node_to_concepts[arg0_var])
                    update = False
            else:
                pass

            if rule_type == "SingleConcept":
                update, span = self.try_align_as_single_concept(
                    cur_var, cur_concept, amr, alignment, tokens,
                    unmatched_vars, triples)
            elif cur_var in alignment:
                pass
            else:
                print >> sys.stderr, 'Can not find type of concept %s / %s' % (
                    cur_var, cur_concept)

            # update
            #print cur_concept,rule_type
            if update:
                tokens = [(i, tk) for i, tk in tokens
                          if i not in range(span.start, span.end)]
                sent = ' '.join(x for i, x in tokens)
                if self.verbose > 2:
                    print >> sys.stderr, "Concept '%s' Matched to span '%s' " % (
                        cur_concept, ' '.join(
                            w for i, w in enumerate(sentence.split())
                            if i + 1 in range(span[0], span[1])))
                    print(sent)
                    print(alignment)

                    #raw_input('ENTER to continue')
            return update, sent, tokens
Exemple #5
0
    def span_align(self, sentence, amr):
        '''
        Use rules to align AMR concepts to sentence spans.

        Every node in ``amr.bfs()[0]`` that is not a ``StrLiteral`` is matched
        against ``self.concept_patterns``; the name of the matching group
        selects the rule (NameEntity, QuantityEntity, NegPolarity or
        SingleConcept).  Tokens covered by a successful alignment are removed
        from the working sentence so that later concepts cannot reuse them.

        Args:
            sentence: whitespace-separated sentence string.
            amr: AMR graph; must provide ``bfs()``, ``node_to_concepts`` and
                item access by variable.

        Returns:
            A ``defaultdict(list)`` mapping each aligned variable to a list
            of ``Span`` objects.  The key ``'root'`` is set to ``0``.

        Raises:
            Exception: if a concept matches none of ``self.concept_patterns``.
        '''
        sent = sentence[:]
        alignment = defaultdict(list)
        alignment['root'] = 0
        # Tokens keep their 1-based position in the original sentence, so span
        # indices stay valid after aligned tokens are dropped below.
        tokens = [(i + 1, x) for i, x in enumerate(sent.split())]

        unmatched_vars = list(
            set(var for var in amr.bfs()[0]
                if not isinstance(var, StrLiteral)))

        while unmatched_vars:
            cur = unmatched_vars.pop(0)
            if cur in amr.node_to_concepts:
                cur_concept = amr.node_to_concepts[cur]
            else:
                cur_concept = cur

            match = self.concept_patterns.match(cur_concept)
            if not match:
                raise Exception('Can not find type of concept %s / %s' %
                                (cur, cur_concept))

            rule_type = match.lastgroup
            # ``span`` stays None whenever the current variable could not be
            # aligned; the update step below is skipped in that case.
            span = None

            if rule_type == "NameEntity":
                NE_items = [v[0] for k, v in amr[cur].items()]
                NE_pattern = re.compile(r"\s".join(NE_items), re.IGNORECASE)

                start, end = self._search_sent(NE_pattern, sent, tokens)
                assert end - start == len(NE_items)
                span = Span(start, end, Aligner.ENTITY_TAG_TABLE[rule_type],
                            NE_items)
                alignment[cur].append(span)

            elif rule_type == "QuantityEntity":
                quantity = ''
                unit = ''
                for k, v in amr[cur].items():
                    if k == 'quant':
                        quantity = v[0]
                    elif k == 'unit':
                        unit = amr.node_to_concepts[v[0]]

                if quantity and unit:
                    # Matches e.g. "<digits> <unit>" or "<number word> <unit>",
                    # with an optional plural "s" on the unit.
                    QTY_pattern = re.compile(
                        r'(%s|%s)\s+(%s)s?' %
                        (quantity, english_number(int(quantity)), unit),
                        re.IGNORECASE)
                    QTY_items = [quantity, unit]
                    start, end = self._search_sent(QTY_pattern, sent, tokens)
                    assert end - start == len(QTY_items)
                    span = Span(start, end,
                                Aligner.ENTITY_TAG_TABLE[rule_type],
                                QTY_items)
                    alignment[cur].append(span)

                    # NOTE(review): align_single_concept calls this helper
                    # with five arguments -- confirm this two-argument form
                    # still matches the helper's signature.
                    self.remove_aligned_concepts(unmatched_vars,
                                                 amr[cur].items())
                else:
                    # Without both a quantity and a unit there is nothing to
                    # search for; report it instead of failing on a None span.
                    sys.stderr.write(
                        'Variable/Concept %s/%s cannot be aligned\n' %
                        (cur, cur_concept))

            elif rule_type == "NegPolarity":
                # Align to the first remaining token that is a negation word.
                for i, token in tokens:
                    if token.lower() in Aligner.neg_polarity:
                        span = Span(i, i + 1,
                                    Aligner.ENTITY_TAG_TABLE[rule_type],
                                    [token])
                        alignment[cur].append(span)
                        break

            elif rule_type == "SingleConcept":
                # Split an optional trailing "-<sense>" suffix off the concept.
                tmp = cur_concept.rsplit('-', 1)
                sense = tmp[1] if len(tmp) == 2 else None
                cur_concept = tmp[0].lower()

                # The matchers are tried in order (short-circuit), and the
                # first token accepted by any of them wins.
                for idx, token in tokens:
                    t = token.lower()
                    if (t == cur_concept
                            or self.fuzzy_match(t, cur_concept,
                                                Aligner.fuzzy_max_len)
                            or self.is_neg_form(t, cur_concept)
                            or self.WN_lemma_match(t, cur_concept, sense)
                            or self.is_spec_form(t, cur_concept)):
                        span = Span(idx, idx + 1,
                                    Aligner.ENTITY_TAG_TABLE[rule_type], [t])
                        break

                if span is not None:
                    alignment[cur].append(span)
                else:
                    sys.stderr.write(
                        'Variable/Concept %s/%s cannot be aligned\n' %
                        (cur, cur_concept))

            else:
                # Rule types this method has no handler for stay unaligned.
                sys.stderr.write(
                    'Variable/Concept %s/%s cannot be aligned\n' %
                    (cur, cur_concept))

            if span is None:
                continue

            # update: drop the aligned tokens from the working sentence
            start, end = span.start, span.end
            tokens = [(i, tk) for i, tk in tokens if not start <= i < end]
            sent = ' '.join(tk for _, tk in tokens)
            if self.verbose > 2:
                matched = ' '.join(
                    w for i, w in enumerate(sentence.split())
                    if start <= i + 1 < end)
                sys.stderr.write("Concept '%s' Matched to span '%s' \n" %
                                 (cur_concept, matched))
                print(sent)
                print(alignment)

        return alignment
Exemple #6
0
    def word_align(self, sentence, amr):
        """
        Greedily align AMR concepts to words with a set of rules.

        Special concepts (name, date-entity, etc.) stay unaligned.
        Details: Flanigan, 2014 ACL.

        Every node in ``amr.bfs()[0]`` that is not a ``StrLiteral`` is matched
        against ``self.concept_patterns``; the name of the matching group
        selects the rule.  Concepts that match no pattern, or whose rule type
        has no handler here, are skipped.  Tokens consumed by an alignment
        are removed from the working sentence.

        Args:
            sentence: whitespace-separated sentence string.
            amr: AMR graph; must provide ``bfs()``, ``node_to_concepts`` and
                item access by variable.

        Returns:
            A ``defaultdict(list)`` keyed by variable (or by name/quantity
            item).  Values are lists of 1-based token positions; entity
            variables and unalignable single concepts get the variable itself
            appended instead.  The key ``'root'`` is set to ``0``.

        Raises:
            Exception: if a quantity entity lacks a quantity or a unit.
        """

        sent = sentence[:]  # copy the sentence
        alignment = defaultdict(list)
        alignment['root'] = 0

        # Tokens keep their 1-based position in the original sentence, so
        # positions stay valid after aligned tokens are dropped below.
        tokens = [(i + 1, x) for i, x in enumerate(sent.split())]

        # single root graph
        unmatched_variables = list(
            set(var for var in amr.bfs()[0]
                if not isinstance(var, StrLiteral)))

        while unmatched_variables:
            cur = unmatched_variables.pop(0)
            if cur in amr.node_to_concepts:
                cur_concept = amr.node_to_concepts[cur]
            else:  # not have concepts
                cur_concept = cur

            match = self.concept_patterns.match(cur_concept)
            if not match:
                continue

            rule_type = match.lastgroup
            # ``span`` is a (start, end) pair of token positions; it stays
            # None whenever the current variable could not be aligned.
            span = None

            if rule_type == 'NameEntity':
                NE_items = [v[0] for k, v in amr[cur].items()]
                NE_pattern = re.compile(r"\s".join(NE_items), re.IGNORECASE)
                span = self._search_sent(NE_pattern, sent, tokens)
                # Each name item is aligned to its own token position.
                for sid, n in zip(range(span[0], span[1]), NE_items):
                    alignment[n].append(sid)
                alignment[cur].append(cur)

            elif rule_type == 'QuantityEntity':
                quantity = ''
                unit = ''
                unit_node = None
                for k, v in amr[cur].items():
                    if k == 'quant':
                        quantity = v[0]
                    elif k == 'unit':
                        unit = amr.node_to_concepts[v[0]]
                        unit_node = v[0]
                    # any other relation is a modifier and is ignored here

                if not (quantity and unit):
                    missing = ''
                    if quantity == '':
                        missing += ' quantity'
                    if unit == '':
                        missing += ' unit'
                    raise Exception(
                        'Quantity Entity %s does not contain %s' %
                        (cur_concept, missing))

                # Matches e.g. "<digits> <unit>" or "<number word> <unit>",
                # with an optional plural "s" on the unit.
                TEMP_pattern = re.compile(
                    r'(%s|%s)\s+(%s)s?' %
                    (quantity, english_number(int(quantity)), unit),
                    re.IGNORECASE)
                TEMP_items = [quantity, unit_node]

                span = self._search_sent(TEMP_pattern, sent, tokens)
                for sid, n in zip(range(span[0], span[1]), TEMP_items):
                    alignment[n].append(sid)
                alignment[cur].append(cur)

                # NOTE(review): align_single_concept calls this helper with
                # five arguments -- confirm this two-argument form still
                # matches the helper's signature.
                self.remove_aligned_concepts(unmatched_variables,
                                             amr[cur].items())

            elif rule_type == 'NegPolarity':
                # Align to the first remaining token that is a negation word.
                for i, token in tokens:
                    if token.lower() in Aligner.neg_polarity:
                        span = (i, i + 1)
                        alignment[cur].append(i)
                        break

            elif rule_type == 'SingleConcept':
                # Split an optional trailing "-<sense>" suffix off the concept.
                tmp = cur_concept.rsplit('-', 1)
                sense = tmp[1] if len(tmp) == 2 else None
                cur_concept = tmp[0].lower()

                # The matchers are tried in order (short-circuit), and the
                # first token accepted by any of them wins.
                for idx, token in tokens:
                    t = token.lower()
                    if (t == cur_concept
                            or self.fuzzy_match(t, cur_concept,
                                                Aligner.fuzzy_max_len)
                            or self.WN_lemma_match(t, cur_concept, sense)
                            or self.is_spec_form(t, cur_concept)):
                        span = (idx, idx + 1)
                        break

                if span is not None:
                    alignment[cur].append(span[0])
                else:
                    sys.stderr.write(
                        'WARNING: Variable %s/%s cannot be aligned\n' %
                        (cur, cur_concept))
                    alignment[cur].append(cur)

            # Any other rule type has no handler here: ``span`` stays None
            # and the concept is left unaligned, as the docstring promises.

            if span is None:
                continue

            # update: drop the aligned tokens from the working sentence
            start, end = span[0], span[1]
            tokens = [(i, tk) for i, tk in tokens if not start <= i < end]
            sent = ' '.join(tk for _, tk in tokens)
            if self.verbose > 2:
                matched = ' '.join(
                    w for i, w in enumerate(sentence.split())
                    if start <= i + 1 < end)
                sys.stderr.write("Concept '%s' Matched to span '%s' \n" %
                                 (cur_concept, matched))
                print(sent)
                print(alignment)

        return alignment