# Imports assumed by these snippets: KafNafParserPy is the CLTL KAF/NAF parser;
# on Python 3 the stdlib module is configparser (ConfigParser on Python 2).
import configparser as ConfigParser
from KafNafParserPy import KafNafParser

def map_opinion_labels(input_file, output_file, config_file):
    # Load the mapping from the config_file
    mapping = {}
    parser = ConfigParser.ConfigParser()
    parser.read(config_file)
    for mapped_opinion, values_in_corpus in parser.items('valid_opinions'):
        values = [v for v in values_in_corpus.split(';') if v != '']
        for v in values:
            mapping[v] = mapped_opinion
    del parser
    ##################

    input_kaf = KafNafParser(input_file)
    remove_these = []
    for opinion in input_kaf.get_opinions():
        exp = opinion.get_expression()
        polarity = exp.get_polarity()
        if polarity in mapping:
            mapped_polarity = mapping[polarity]
        else:
            opi_id = opinion.get_id()
            remove_these.append(opi_id)
            mapped_polarity = polarity

        exp.set_polarity(mapped_polarity)

    for opi_id in remove_these:
        input_kaf.remove_this_opinion(opi_id)
    input_kaf.dump(output_file)
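
A minimal usage sketch for map_opinion_labels(). The [valid_opinions] section name and the ';'-separated value lists come from the parsing code above; the concrete polarity labels and file names below are hypothetical placeholders:

# labels.cfg (hypothetical contents):
#   [valid_opinions]
#   positive = Positive;StrongPositive
#   negative = Negative;StrongNegative
#
# Opinions whose polarity is not listed under [valid_opinions] are removed:
map_opinion_labels('review.kaf', 'review.mapped.kaf', 'labels.cfg')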
    def train(self,list_training_files, out_folder):
        self.folder= out_folder
        os.mkdir(self.folder)
        print('Creating output folder %s' % self.folder)
        
        training_fd = open(os.path.join(self.folder,TRAIN_FILE),'w')
        
        
        for this_file in list_training_files:
            print('\tEncoding training file %s' % this_file)
            
            this_obj = KafNafParser(this_file)
            num_pos = num_neg = 0
            for opinion in this_obj.get_opinions():
                opinion_expression = opinion.get_expression()
                polarity = opinion_expression.get_polarity()
                
                span_obj = opinion_expression.get_span()
                if span_obj is None:
                    continue
                
                list_term_ids = span_obj.get_span_ids()
                features = self.extract_features(this_obj, list_term_ids)
                
            
                int_features = self.encode_string_features(features, update_index=True) #Map feat index --> frequency
                
                if len(int_features) != 0:                
                    this_class = None
                    if self.is_positive(polarity):
                        this_class = '+1'
                        num_pos += 1
                    elif self.is_negative(polarity):
                        this_class = '-1'
                        num_neg += 1
                    
                    if this_class is not None:
                        self.write_example_to_file(training_fd, this_class, int_features)

            #END FOR
            print('\t\tNum positive examples: %d' % num_pos)
            print('\t\tNum negative examples: %d' % num_neg)
        training_fd.close()
        print('Training file at %s' % training_fd.name)
        
        ##RUN THE TRAINING
        training_cmd = [SVM_LEARN]
        
        training_cmd.append(training_fd.name)
        
        whole_model_file = os.path.join(self.folder, MODEL_FILE)
        training_cmd.append(whole_model_file)
        ret_code = check_call(training_cmd)
        print('Training done on %s with code %d' % (whole_model_file,ret_code))
        
        #Save also the index
        whole_index_file = os.path.join(self.folder,INDEX_FILE)
        index_fd = open(whole_index_file,'wb')
        pickle.dump(self.index_features, index_fd, -1)
        index_fd.close()
        print('Feature index saved to %s with %d features' % (whole_index_file,len(self.index_features)))
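
write_example_to_file() is not shown in this snippet; below is a minimal sketch of what it plausibly writes, assuming SVM_LEARN points at SVMlight's svm_learn and therefore expects the SVMlight input format (one '<class> <feature>:<value> ...' line per example, feature ids in ascending order):

def write_example_to_file(fd, this_class, int_features):
    # int_features maps feature index --> frequency (see encode_string_features)
    pairs = ' '.join('%d:%d' % (feat, freq)
                     for feat, freq in sorted(int_features.items()))
    fd.write('%s %s\n' % (this_class, pairs))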
def main(inputfile,
         this_type,
         folder,
         overall_parameters={},
         detected_dse={},
         log=False):
    files = []
    output_fd = None
    if this_type == 'train':
        output_fd = open(folder + '/' + TRAINING_FILENAME, 'w')

        ##Save the parameters
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_parameter = open(parameter_filename, 'wb')  # pickle needs a binary file on Python 3
        pickler.dump(overall_parameters, fd_parameter, protocol=0)
        print('Parameters saved to file %s' % parameter_filename, file=sys.stderr)
        fd_parameter.close()

        #Input is a file with a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()

    elif this_type == 'tag':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_param = open(parameter_filename, 'rb')  # pickle needs a binary file on Python 3
        overall_parameters = pickler.load(fd_param)
        fd_param.close()

        #Input is a single file
        files.append(inputfile)

        #Output FD will be a temporary file
        output_fd = tempfile.NamedTemporaryFile('w', delete=False)
    elif this_type == 'test':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_param = open(parameter_filename, 'rb')  # pickle needs a binary file on Python 3
        these_overall_parameters = pickler.load(fd_param)
        fd_param.close()
        for opt, val in these_overall_parameters.items():
            overall_parameters[opt] = val

        #Input is a files with a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
        output_fd = open(folder + '/' + TESTING_FILENAME, 'w')

    gold_fd = None
    gold_filename = overall_parameters.get('gold_standard')
    if gold_filename is not None:
        gold_fd = open(gold_filename, 'w')

    for filename in files:
        if log:
            print('HOLDER: processing file', filename, file=sys.stderr)

        if isinstance(filename, KafNafParser):
            naf_obj = filename
        else:
            naf_obj = KafNafParser(filename)

        create_structures(naf_obj, filename)

        #Extract all the opinions
        opinions_per_sentence = defaultdict(list)

        num_opinions = 0

        for opinion in naf_obj.get_opinions():
            exp = opinion.get_expression()
            if exp is not None:
                p = exp.get_polarity()
                if p != 'NON-OPINIONATED':
                    #if p.startswith('D-'):
                    holder = opinion.get_holder()
                    if holder is not None:
                        span = holder.get_span()
                        if span is not None:
                            span_ids = span.get_span_ids()
                            if len(span_ids) != 0:
                                sentence_id = get_sentence_id_for_opinion(
                                    naf_obj, opinion)
                                if sentence_id is not None:
                                    opinions_per_sentence[sentence_id].append(
                                        opinion)
                                    num_opinions += 1

        if log:
            print('\tNum of opinions:', num_opinions, file=sys.stderr)

        if this_type == 'train':
            # For training, one sequence is created for every DSE/opinion
            # (the same sentence may therefore appear more than once)
            sentences_with_opinions = set()
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    sentences_with_opinions.add(this_sentence)
                    create_sequence(naf_obj,
                                    this_type,
                                    this_sentence,
                                    overall_parameters,
                                    opinion,
                                    output=output_fd)

            #Include the rest of the sentences without opinions
            '''
            for sentence_id in naf_obj.list_sentence_ids:
                if sentence_id not in sentences_with_opinions:
                    create_sequence(naf_obj, sentence_id, overall_parameters, list_opinions=[])
            '''

        elif this_type == 'tag':
            # Obtain the opinions per sentence
            opinions_per_sentence = defaultdict(list)
            for list_name_ids, list_words in detected_dse:
                list_ids = [v[v.rfind('#') + 1:] for v in list_name_ids]
                first_token = naf_obj.get_token(list_ids[0])
                sentence_for_opinion = first_token.get_sent()
                opinions_per_sentence[sentence_for_opinion].append(list_ids)

            for this_sentence, these_opinions in opinions_per_sentence.items():
                for list_dse_token_ids in these_opinions:
                    create_sequence(naf_obj,
                                    this_type,
                                    this_sentence,
                                    overall_parameters,
                                    opinion=list_dse_token_ids,
                                    output=output_fd,
                                    log=log)

        elif this_type == 'test':
            opinion_list = []
            '''
            for sentence_id in naf_obj.list_sentence_ids:
                if sentence_id in opinions_per_sentence:
                    for this_sentence, these_opinions in opinions_per_sentence.items():
                        for opinion in these_opinions:
                            create_sequence(naf_obj, this_type, this_sentence, overall_parameters,opinion, output = output_fd)
                            opinion_list.append(opinion)
                else:
                    create_sequence(naf_obj, this_type, sentence_id, overall_parameters,opinion=None, output = output_fd)
               
            '''
            #For testing, one sequence is created for every opinion
            #(only sentences that contain opinions are included)
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    create_sequence(naf_obj,
                                    this_type,
                                    this_sentence,
                                    overall_parameters,
                                    opinion,
                                    output=output_fd)
                    opinion_list.append(opinion)

            ## Create the gold standard data also
            if gold_fd is not None:
                create_gold_standard_holder(naf_obj, opinion_list, gold_fd)

    if gold_fd is not None:
        gold_fd.close()
        print('Gold standard in the file %s' % gold_fd.name, file=sys.stderr)

    output_fd.close()
    return output_fd.name
def main(inputfile, this_type, folder, overall_parameters={}, log=False):
    files = []
    output_fd = None
    if this_type == 'train':
        if not os.path.isdir(folder):
            os.mkdir(folder)
        res_fol = os.path.join(folder,RESOURCES_FOLDER)
        if not os.path.isdir(res_fol):
            os.mkdir(res_fol)
        output_fd = open(folder+'/'+TRAINING_FILENAME,'w')
            
        ##Save the parameters
        parameter_filename = os.path.join(folder,PARAMETERS_FILENAME)
        fd_parameter = open(parameter_filename,'wb')  # pickle needs a binary file
        pickler.dump(overall_parameters,fd_parameter,protocol=0)
        print('Parameters saved to file %s' % parameter_filename, file=sys.stderr)
        fd_parameter.close()
        
        #Input is a file with a list of files
        fin = open(inputfile,'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
        
    elif this_type == 'tag':
        parameter_filename = os.path.join(folder,PARAMETERS_FILENAME)
        fd_param = open(parameter_filename,'rb')
        try:
            overall_parameters = pickler.load(fd_param,encoding='bytes')
        except TypeError:
            overall_parameters = pickler.load(fd_param)
        fd_param.close()

        #Input is a single file
        files.append(inputfile)
        
        #Output FD will be a temporary file
        output_fd = tempfile.NamedTemporaryFile('w', delete=False)
    elif this_type == 'test':
        parameter_filename = os.path.join(folder,PARAMETERS_FILENAME)
        fd_param = open(parameter_filename,'rb')  # pickle needs a binary file
        these_overall_parameters = pickler.load(fd_param)
        fd_param.close()
        for opt, val in list(these_overall_parameters.items()):
            overall_parameters[opt] = val
        
        #Input is a file with a list of files
        fin = open(inputfile,'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
        output_fd = open(folder+'/'+TESTING_FILENAME,'w')
        
          
    ##Load the sentiment-nva-gi42.txt
    ##overall_parameters['sentiment-nva-gi42'] = load_sentiment_nva_gi42()  
    
    
    ##overall_parameters['lexOut_90000_monovalue'] = load_lexOut_90000()

    ###if overall_parameters['use_mpqa_lexicon']:
    from mpqa_lexicon import MPQA_subjectivity_lexicon
    overall_parameters['mpqa_lexicon'] = MPQA_subjectivity_lexicon()
    
    
    if overall_parameters.get('use_wordnet_lexicon', False):
        from wordnet_lexicon import WordnetLexicon
        wordnet_lexicon_expression = WordnetLexicon()
        complete_wn_filename = os.path.join(folder, RESOURCES_FOLDER, WORDNET_LEXICON_FILENAME) 

        if this_type == 'train':
            #We create it from the training files
            print('Creating WORDNET LEXICON FILE from %d files and storing it on %s' % (len(files), complete_wn_filename), file=sys.stderr)
            wordnet_lexicon_expression.create_from_files(files,'expression')
            wordnet_lexicon_expression.save_to_file(complete_wn_filename)
        else:
            #READ IT
            wordnet_lexicon_expression.load_from_file(complete_wn_filename)
        overall_parameters['wordnet_lexicon'] = wordnet_lexicon_expression
        
    gold_fd = None    
    gold_filename = overall_parameters.get('gold_standard')
    if gold_filename is not None:
        gold_fd = open(gold_filename ,'w')
          
    #Processing every file
    
    #### FOR THE CUSTOM LEXICON
    #from customized_lexicon import CustomizedLexicon
    #overall_parameters['custom_lexicon'] = CustomizedLexicon()
    #overall_parameters['custom_lexicon'].load_from_filename('EXP.nl')
    ###########################

    #from customized_lexicon import CustomizedLexicon
    #overall_parameters['custom_lexicon'] = CustomizedLexicon()
    #overall_parameters['custom_lexicon'].load_for_language('it')
    
    for filename in files:
        if log:
            print('EXPRESSION: processing file', filename, file=sys.stderr)
        
        if isinstance(filename,KafNafParser):
            naf_obj = filename
        else:
            naf_obj = KafNafParser(filename)
        create_structures(naf_obj, filename)
        
        #Extract all the opinions
        opinions_per_sentence = defaultdict(list)
        num_opinions = 0
        for opinion in naf_obj.get_opinions():
            exp = opinion.get_expression()
            if exp is not None:
                p = exp.get_polarity()
                
                if p != 'NON-OPINIONATED':
                    #if p.startswith('D-'):           
                    sentence_id = get_sentence_id_for_opinion(naf_obj,opinion)
                    if sentence_id is not None:
                        opinions_per_sentence[sentence_id].append(opinion)
                        num_opinions += 1
        if log:
            print('\tNum of opinions:', num_opinions, file=sys.stderr)
        
        
        if this_type == 'train':
            ############################
            # One sequence per sentence
            ############################
            for sentence_id in naf_obj.list_sentence_ids:
                opinions_in_sent = opinions_per_sentence.get(sentence_id,[])
                if len(opinions_in_sent) != 0:
                    ##Only sentences with opinions
                    create_sequence(naf_obj, sentence_id, overall_parameters, opinions_in_sent, output = output_fd)
        elif this_type == 'test':
            #TESTING CASE
            #For the testing, one sequence is created for every sentence
            for sentence_id in naf_obj.list_sentence_ids:
                opinions_in_sent = opinions_per_sentence.get(sentence_id,[])
                if len(opinions_in_sent) != 0:
                    #Only tested on sentences with opinions
                    create_sequence(naf_obj, sentence_id, overall_parameters, opinions_in_sent,output = output_fd)
                    
            ## Create the gold standard data also
            opinion_list = []
            for this_sentence, these_opinions in list(opinions_per_sentence.items()):
                opinion_list.extend(these_opinions)
            if gold_fd is not None:
                create_gold_standard(naf_obj,opinion_list,gold_fd)
        elif this_type == 'tag':
            #TAGGING CASE
            # All the sentences are considered
            for sentence_id in naf_obj.list_sentence_ids:
                create_sequence(naf_obj, sentence_id, overall_parameters, list_opinions = [],output = output_fd, log=log)
            
            
    if gold_fd is not None:
        gold_fd.close() 
        print('Gold standard in the file %s' % gold_fd.name, file=sys.stderr)
        
    output_fd.close()
    return output_fd.name
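
A hypothetical invocation sketch (paths are placeholders). Per the branches above, 'train' and 'test' expect inputfile to list one KAF/NAF file per line, while 'tag' takes a single KAF/NAF file and writes the sequences to a temporary file:

# train: encodes the training sequences and saves the parameters in the folder
train_path = main('train_list.txt', 'train', 'expression_model',
                  {'use_wordnet_lexicon': True}, log=True)

# tag: reloads the saved parameters and returns the temporary output file
tagged_path = main('review.kaf', 'tag', 'expression_model')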
def process_file(this_file,token_freq):
    xml_obj = KafNafParser(this_file)
    print('Processing file', this_file, file=sys.stderr)
    token_for_wid = {}
    order_for_wid = {}
    opinion_expressions = []
    opinion_targets = []
    whole_text = ' '
    for n, token in enumerate(xml_obj.get_tokens()):
        text = token.get_text().lower()
        token_freq[text] += 1
        token_for_wid[token.get_id()] = text
        order_for_wid[token.get_id()] = n
        whole_text += text + ' '
    wids_for_tid = {}
    lemma_for_wid = {}
    pos_for_wid = {}
    for term in xml_obj.get_terms():
        tid = term.get_id()
        wids = term.get_span().get_span_ids()
        wids_for_tid[tid] = wids
        for wid in wids:
            lemma_for_wid[wid] = term.get_lemma()
            pos_for_wid[wid] = term.get_pos()
        
    
    ##Properties!
    aspects = [] ## [(label,term_span)...]
    
    for prop in xml_obj.get_properties():  # 'prop' avoids shadowing the builtin 'property'
        for refs in prop.get_references():
            for span in refs:
                aspects.append((prop.get_type(), span.get_span_ids()))
       
    
    
    already_counted = {EXP:set(), TAR:set()}
    
    for opinion in xml_obj.get_opinions():   
        for this_type, opinion_obj in [(EXP,opinion.get_expression()),(TAR,opinion.get_target())]:
            if opinion_obj is not None:
                #Check the polarity only after the None guard (get_expression() may return None)
                if this_type is EXP and opinion_obj.get_polarity() == 'NON-OPINIONATED':
                    continue
                span = opinion_obj.get_span()
                if span is not None:
                    list_wids = []
                    for tid in span.get_span_ids():
                        list_wids.extend(wids_for_tid.get(tid,[]))
                    list_wids.sort(key=lambda wid: order_for_wid[wid])  ##Sort according to the order of the tokens
                    
                    string_wids = '#'.join(list_wids)
                    opinion_tokens = ' '.join( token_for_wid[wid] for wid in list_wids)
                    opinion_lemmas = ' '.join( lemma_for_wid[wid] for wid in list_wids)
                    opinion_pos    = ' '.join( pos_for_wid[wid]   for wid in list_wids)
                    
                   
                    if string_wids not in already_counted[this_type]:
                        if this_type == EXP:
                            polarity = (opinion_obj.get_polarity()).lower()
                            opinion_expressions.append((opinion_tokens,polarity,opinion_lemmas,opinion_pos))
                        else:
                            ##Calculate the aspect type
                            possible_aspects = []
                            target_ids = span.get_span_ids()
                            for aspect_label, aspect_span in aspects:
                                num_in_common = len(set(target_ids) & set(aspect_span))
                                if num_in_common != 0:
                                    possible_aspects.append((aspect_label,num_in_common,len(aspect_span)))
                            aspect_for_target = 'unknown'

                            if len(possible_aspects) != 0:
                                ##Sort by the overlap size first, then by the length of the aspect
                                aspect_for_target = sorted(possible_aspects,key=lambda t: (t[1],t[2]), reverse=True)[0][0]
                            opinion_targets.append((opinion_tokens,aspect_for_target, opinion_lemmas,opinion_pos))
                        already_counted[this_type].add(string_wids)    
      
    del xml_obj
    print('\tNumber of opinion expressions:', len(opinion_expressions), file=sys.stderr)
    print('\tNumber of opinion targets:', len(opinion_targets), file=sys.stderr)
    print('\tNumber of characters of the text:', len(whole_text), file=sys.stderr)
    return opinion_expressions, opinion_targets, whole_text
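
A minimal usage sketch, assuming token_freq is a counter-like mapping (the function does token_freq[text] += 1, so a collections.Counter works); the file name is a placeholder:

from collections import Counter

token_freq = Counter()
expressions, targets, whole_text = process_file('review.kaf', token_freq)
for tokens, polarity, lemmas, pos in expressions:
    print('%s -> %s' % (polarity, tokens))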
    my_polarity_classifier = PolarityClassifier('nl')
    my_polarity_classifier.load_models(sys.argv[1])

    OK = WR = 1
    for example_file in files:
        this_obj = KafNafParser(example_file)

        my_polarity_classifier.classify_kaf_naf_object(this_obj)
        this_obj.dump()

        break  # NOTE: stops after the first file; the evaluation code below is unreachable

        GOLD = {}
        list_ids_term_ids = []
        for opinion in this_obj.get_opinions():
            op_exp = opinion.get_expression()
            polarity = op_exp.get_polarity()
            term_ids = op_exp.get_span().get_span_ids()
            list_ids_term_ids.append((opinion.get_id(), term_ids))
            GOLD[opinion.get_id()] = polarity

        class_for_opinion_id, features_for_opinion_id = my_polarity_classifier.classify_list_opinions(
            this_obj, list_ids_term_ids)
        for oid, c in list(class_for_opinion_id.items()):
            #print '%s Gold:%s   System:%s'  % (oid,GOLD[oid],c)
            #print '\tFeatures:', features_for_opinion_id[oid]
            if c.lower() in GOLD[oid].lower():
                OK += 1
            else:
                WR += 1

if __name__ == '__main__':
    import glob
    #feature_file = 'my_feat_file'
    #fd = open(feature_file,'w')
    #for kaf_file in glob.glob('/home/izquierdo/data/opinion_annotations_en/kaf/hotel/*.kaf'):
    #    print kaf_file
    #    knaf_obj = KafNafParser(kaf_file)
    #    extract_features_polarity_classifier_from_kaf(knaf_obj, fd)
    #fd.close()
    #print ' Feature file in ',feature_file
    #train_polarity_classifier(feature_file)
    kaf_obj = KafNafParser('dutch00011_f1b91e00bddbf62fbb35e4755e786406.kaf')
    list_terms = []
    list_ids = []
    for opinion in kaf_obj.get_opinions():
        exp = opinion.get_expression()
        pol = exp.get_polarity()
        if pol in ['Positive','Negative','StrongPositive','StrongNegative']:
            this_id = (opinion.get_id(),pol)
            ids = exp.get_span().get_span_ids()
            list_ids.append(this_id)
            list_terms.append(ids)
    index_filename = '/home/izquierdo/cltl_repos/opinion_miner_deluxe/check_me/polarity_classifier/index.features'
    model_filename = '/home/izquierdo/cltl_repos/opinion_miner_deluxe/check_me/polarity_classifier/model.svm'
    svm_path = '/home/izquierdo/bin/svm_classify'
    results = classify(kaf_obj,list_terms,index_filename,model_filename, svm_path)
    for n in range(len(results)):
        print(list_ids[n], results[n])
def main(inputfile, this_type, folder, overall_parameters={}, detected_dse={}, log=False):
    files = []
    output_fd = None
    if this_type == 'train':
        output_fd = open(folder+'/'+TRAINING_FILENAME,'w')
            
        ##Save the parameters
        parameter_filename = os.path.join(folder,PARAMETERS_FILENAME)
        fd_parameter = open(parameter_filename,'wb')  # pickle needs a binary file
        pickler.dump(overall_parameters,fd_parameter,protocol=0)
        print('Parameters saved to file %s' % parameter_filename, file=sys.stderr)
        fd_parameter.close()
        
        #Input is a file with a list of files
        fin = open(inputfile,'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
        
    elif this_type == 'tag':
        parameter_filename = os.path.join(folder,PARAMETERS_FILENAME)
        fd_param = open(parameter_filename,'rb')  # pickle needs a binary file
        overall_parameters = pickler.load(fd_param)
        fd_param.close()

        #Input is a single file
        files.append(inputfile)
        
        #Output FD will be a temporary file
        output_fd = tempfile.NamedTemporaryFile('w', delete=False)
    elif this_type == 'test':
        parameter_filename = os.path.join(folder,PARAMETERS_FILENAME)
        fd_param = open(parameter_filename,'rb')  # pickle needs a binary file
        these_overall_parameters = pickler.load(fd_param)
        fd_param.close()
        for opt, val in these_overall_parameters.items():
            overall_parameters[opt] = val
        
        #Input is a file with a list of files
        fin = open(inputfile,'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
        output_fd = open(folder+'/'+TESTING_FILENAME,'w')
     
      
    gold_fd = None    
    gold_filename = overall_parameters.get('gold_standard')
    if gold_filename is not None:
        gold_fd = open(gold_filename ,'w')
          

    for filename in files:
        if log:
            print('TARGET: processing file', filename, file=sys.stderr)
        
        if isinstance(filename,KafNafParser):
            naf_obj = filename
        else:
            naf_obj = KafNafParser(filename)
            
        create_structures(naf_obj, filename)
        
        #Extract all the opinions
        opinions_per_sentence = defaultdict(list)
        num_opinions = 0
       
        
        for opinion in naf_obj.get_opinions():
            exp = opinion.get_expression()
            if exp is not None:
                p = exp.get_polarity()
                if p != 'NON-OPINIONATED':
                    target = opinion.get_target()
                    if target is not None:  
                        span = target.get_span()
                        if span is not None:
                            S = span.get_span_ids()
                            if len(S) != 0:    
                                sentence_id = get_sentence_id_for_opinion(naf_obj,opinion)
                                if sentence_id is not None:
                                    opinions_per_sentence[sentence_id].append(opinion)
                                    num_opinions += 1
                    
        if log:
            print('\tNum of opinions:', num_opinions, file=sys.stderr)
        
        if this_type == 'train':
            # For training, one sequence is created for every DSE/opinion
            # (the same sentence may therefore appear more than once)
            sentences_with_opinions = set()
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    sentences_with_opinions.add(this_sentence)
                    create_sequence(naf_obj, this_type, this_sentence, overall_parameters, opinion, output = output_fd)
            
            #Include the rest of the sentences without opinions
            '''
            for sentence_id in naf_obj.list_sentence_ids:
                if sentence_id not in sentences_with_opinions:
                    create_sequence(naf_obj, sentence_id, overall_parameters, list_opinions=[])
            '''
                
        elif this_type=='tag':
            # Obtain the opinions per sentence
            opinions_per_sentence = defaultdict(list)
            for list_name_ids, list_words in detected_dse:
                list_ids = [v[v.rfind('#')+1:] for v in list_name_ids]
                first_token = naf_obj.get_token(list_ids[0])
                sentence_for_opinion = first_token.get_sent()
                opinions_per_sentence[sentence_for_opinion].append(list_ids)
                
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for list_dse_token_ids in these_opinions:
                    create_sequence(naf_obj, this_type, this_sentence, overall_parameters, opinion = list_dse_token_ids, output = output_fd,log=log)  

        elif this_type=='test':
            #For testing, one sequence is created for every opinion (only sentences with opinions are included)
            opinion_list = []
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    create_sequence(naf_obj, this_type, this_sentence, overall_parameters,opinion, output = output_fd)
                    opinion_list.append(opinion)
   
            if gold_fd is not None:
                create_gold_standard_target(naf_obj,opinion_list,gold_fd)
            
            
    if gold_fd is not None:
        gold_fd.close() 
        print('Gold standard in the file %s' % gold_fd.name, file=sys.stderr)
        
    output_fd.close()
    return output_fd.name