Beispiel #1
0
def read_training_data(file_name):
    """
    Parse one annotated KAF/NAF file and gather its terms, tokens and
    properties (aspects).

    file_name is appended to PATH_ANNOTATED_DATA to locate the file.

    Returns a 5-tuple:
        terms              -- list of the term objects of the document
        properties         -- list of the property objects of the document
        handled_properties -- first value returned by handle_properties()
        term_dict          -- second value returned by handle_properties()
        tokens_container   -- dict mapping a token key to the attributes of
                              the token's XML node
    """
    naf = KafNafParser(PATH_ANNOTATED_DATA + file_name)
    terms = list(naf.get_terms())

    # Index the raw NAF attributes of every token. The key is the 'wid'
    # attribute with each 'w' turned into 't' (presumably so it lines up
    # with the term ids -- verify against handle_properties).
    tokens_container = {}
    for tok in naf.get_tokens():
        node = tok.node
        key = node.get('wid').replace('w', 't')
        tokens_container[key] = node.attrib

    properties = list(naf.get_properties())
    matched = handle_properties(properties, terms, tokens_container)
    handled_properties, term_dict = matched
    return terms, properties, handled_properties, term_dict, tokens_container
def process_file(this_file,token_freq):
    """
    Extract the opinion expressions, the opinion targets and the lower-cased
    text of one KAF/NAF file.

    Parameters:
        this_file  -- the file handed to KafNafParser.
        token_freq -- token -> count mapping, updated in place. Counts are
                      bumped with ``+= 1``, so the mapping has to supply a
                      default for unseen tokens (e.g. defaultdict(int)).

    Returns a 3-tuple:
        opinion_expressions -- list of (tokens, polarity, lemmas, pos), with
                               the polarity lower-cased. Expressions whose
                               polarity is 'NON-OPINIONATED' are skipped.
        opinion_targets     -- list of (tokens, aspect_label, lemmas, pos);
                               aspect_label is 'unknown' when no property
                               overlaps the target.
        whole_text          -- all lower-cased tokens, each followed by one
                               space, with one leading space.

    tokens / lemmas / pos are space-joined strings in document order. A span
    (same ordered token ids) is reported only once per type.

    Progress and counts are written to sys.stderr.
    """
    xml_obj = KafNafParser(this_file)
    # sys.stderr.write emits the same text as the former Python-2-only
    # "print >>sys.stderr" statement, and also works on Python 3.
    sys.stderr.write('Processing file %s\n' % (this_file,))

    token_for_wid = {}
    order_for_wid = {}
    opinion_expressions = []
    opinion_targets = []
    lowered_tokens = []
    for n, token in enumerate(xml_obj.get_tokens()):
        text = token.get_text().lower()
        token_freq[text] += 1
        wid = token.get_id()
        token_for_wid[wid] = text
        order_for_wid[wid] = n
        lowered_tokens.append(text)
    # Built in one pass instead of repeated string concatenation.
    whole_text = ' ' + ''.join(text + ' ' for text in lowered_tokens)

    wids_for_tid = {}
    lemma_for_wid = {}
    pos_for_wid = {}
    for term in xml_obj.get_terms():
        wids = term.get_span().get_span_ids()
        wids_for_tid[term.get_id()] = wids
        lemma = term.get_lemma()
        pos = term.get_pos()
        for wid in wids:
            lemma_for_wid[wid] = lemma
            pos_for_wid[wid] = pos

    ##Properties!
    aspects = [] ## [(label,term_span)...]
    for prop in xml_obj.get_properties():
        label = prop.get_type()
        for refs in prop.get_references():
            for span in refs:
                aspects.append((label,span.get_span_ids()))

    already_counted = {EXP:set(), TAR:set()}

    for opinion in xml_obj.get_opinions():
        for this_type, opinion_obj in [(EXP,opinion.get_expression()),(TAR,opinion.get_target())]:
            # The None check has to come first: an opinion may lack an
            # expression or a target, and get_polarity() on None would raise.
            if opinion_obj is None:
                continue

            polarity = None
            if this_type == EXP:
                polarity = opinion_obj.get_polarity()
                if polarity == 'NON-OPINIONATED':
                    continue

            span = opinion_obj.get_span()
            if span is None:
                continue

            list_wids = []
            for tid in span.get_span_ids():
                list_wids.extend(wids_for_tid.get(tid,[]))
            list_wids.sort(key=lambda wid: order_for_wid[wid])  ##Sorted according to the order of the tokens

            string_wids = '#'.join(list_wids)
            if string_wids in already_counted[this_type]:
                continue

            opinion_tokens = ' '.join(token_for_wid[wid] for wid in list_wids)
            opinion_lemmas = ' '.join(lemma_for_wid[wid] for wid in list_wids)
            opinion_pos    = ' '.join(pos_for_wid[wid]   for wid in list_wids)

            if this_type == EXP:
                opinion_expressions.append((opinion_tokens,polarity.lower(),opinion_lemmas,opinion_pos))
            else:
                ##Calculate the aspect type
                target_ids = set(span.get_span_ids())
                possible_aspects = []
                for aspect_label, aspect_span in aspects:
                    num_in_common = len(target_ids & set(aspect_span))
                    if num_in_common != 0:
                        possible_aspects.append((aspect_label,num_in_common,len(aspect_span)))

                aspect_for_target = 'unknown'
                if possible_aspects:
                    ##Largest overlap first, longest aspect span second. max()
                    ##returns the first maximal item, which is the same pick
                    ##as a stable descending sort followed by [0].
                    aspect_for_target = max(possible_aspects,key=lambda t: (t[1],t[2]))[0]
                opinion_targets.append((opinion_tokens,aspect_for_target,opinion_lemmas,opinion_pos))
            already_counted[this_type].add(string_wids)

    del xml_obj
    sys.stderr.write('\tNumber of opinion expressions: %d\n' % len(opinion_expressions))
    sys.stderr.write('\tNumber of opinion targets: %d\n' % len(opinion_targets))
    sys.stderr.write('\tNumber of characters of the text: %d\n' % len(whole_text))
    return opinion_expressions, opinion_targets, whole_text
def _best_aspect_label(target_ids, aspects):
    """Return the label of the aspect that best overlaps target_ids.

    aspects is a list of (label, term_ids). The winner is the aspect sharing
    the most ids with target_ids; ties go to the longer aspect span, then to
    the aspect listed first. Returns 'unknown' when nothing overlaps.
    """
    wanted = set(target_ids)
    candidates = []
    for aspect_label, aspect_span in aspects:
        num_in_common = len(wanted & set(aspect_span))
        if num_in_common:
            candidates.append((aspect_label, num_in_common, len(aspect_span)))
    if not candidates:
        return 'unknown'
    # max() keeps the first maximal item, matching a stable descending sort
    # followed by taking element 0.
    return max(candidates, key=lambda t: (t[1], t[2]))[0]


def process_file(this_file, token_freq):
    """
    Read one KAF/NAF file and collect its opinion expressions, its opinion
    targets and its lower-cased running text.

    Parameters:
        this_file  -- the file handed to KafNafParser.
        token_freq -- token -> count mapping, updated in place. It is
                      incremented with ``+= 1``, so it must provide a default
                      for unseen tokens (e.g. defaultdict(int)).

    Returns a 3-tuple:
        opinion_expressions -- list of (tokens, polarity, lemmas, pos); the
                               polarity is lower-cased and expressions with
                               polarity 'NON-OPINIONATED' are left out.
        opinion_targets     -- list of (tokens, aspect_label, lemmas, pos);
                               aspect_label is 'unknown' if no property
                               overlaps the target span.
        whole_text          -- one leading space, then every lower-cased
                               token followed by a space.

    tokens / lemmas / pos are space-joined strings in token order. Each
    distinct span (same ordered token ids) is reported once per type.

    Progress and counts are written to sys.stderr.
    """
    xml_obj = KafNafParser(this_file)
    # sys.stderr.write produces the same output as the Python-2-only
    # "print >> sys.stderr" statement and also runs on Python 3.
    sys.stderr.write('Processing file %s\n' % (this_file,))

    token_for_wid = {}
    order_for_wid = {}
    text_parts = [' ']
    for n, token in enumerate(xml_obj.get_tokens()):
        text = token.get_text().lower()
        token_freq[text] += 1
        wid = token.get_id()
        token_for_wid[wid] = text
        order_for_wid[wid] = n
        text_parts.append(text + ' ')
    # One join instead of growing a string inside the loop.
    whole_text = ''.join(text_parts)

    wids_for_tid = {}
    lemma_for_wid = {}
    pos_for_wid = {}
    for term in xml_obj.get_terms():
        wids = term.get_span().get_span_ids()
        wids_for_tid[term.get_id()] = wids
        lemma = term.get_lemma()
        pos = term.get_pos()
        for wid in wids:
            lemma_for_wid[wid] = lemma
            pos_for_wid[wid] = pos

    ##Properties!
    aspects = []  ## [(label,term_span)...]
    for prop in xml_obj.get_properties():
        label = prop.get_type()
        for refs in prop.get_references():
            for span in refs:
                aspects.append((label, span.get_span_ids()))

    opinion_expressions = []
    opinion_targets = []
    already_counted = {EXP: set(), TAR: set()}

    for opinion in xml_obj.get_opinions():
        for this_type, opinion_obj in [(EXP, opinion.get_expression()),
                                       (TAR, opinion.get_target())]:
            # Check for a missing expression/target before touching it;
            # calling get_polarity() on None would raise AttributeError.
            if opinion_obj is None:
                continue

            is_expression = this_type == EXP
            polarity = opinion_obj.get_polarity() if is_expression else None
            if is_expression and polarity == 'NON-OPINIONATED':
                continue

            span = opinion_obj.get_span()
            if span is None:
                continue

            list_wids = []
            for tid in span.get_span_ids():
                list_wids.extend(wids_for_tid.get(tid, []))
            ##Sorted according to the order of the tokens
            list_wids.sort(key=lambda wid: order_for_wid[wid])

            string_wids = '#'.join(list_wids)
            if string_wids in already_counted[this_type]:
                continue

            opinion_tokens = ' '.join(token_for_wid[wid] for wid in list_wids)
            opinion_lemmas = ' '.join(lemma_for_wid[wid] for wid in list_wids)
            opinion_pos = ' '.join(pos_for_wid[wid] for wid in list_wids)

            if is_expression:
                opinion_expressions.append(
                    (opinion_tokens, polarity.lower(), opinion_lemmas,
                     opinion_pos))
            else:
                ##Calculate the aspect type
                aspect_for_target = _best_aspect_label(span.get_span_ids(),
                                                       aspects)
                opinion_targets.append(
                    (opinion_tokens, aspect_for_target, opinion_lemmas,
                     opinion_pos))
            already_counted[this_type].add(string_wids)

    del xml_obj
    sys.stderr.write('\tNumber of opinion expressions: %d\n' %
                     len(opinion_expressions))
    sys.stderr.write('\tNumber of opinion targets: %d\n' %
                     len(opinion_targets))
    sys.stderr.write('\tNumber of characters of the text: %d\n' %
                     len(whole_text))
    return opinion_expressions, opinion_targets, whole_text