def map_opinion_labels(input_file, output_file, config_file):
    """Rewrite opinion-expression polarity labels in a KAF/NAF file.

    The [valid_opinions] section of *config_file* maps a target label to a
    ';'-separated list of corpus labels.  Opinions whose polarity appears in
    that list are relabelled with the target label; opinions with any other
    polarity are removed.  The result is dumped to *output_file*.
    """
    cfg = ConfigParser.ConfigParser()
    cfg.read(config_file)
    # Invert the config section: corpus_label -> normalised target label.
    label_for = {}
    for target_label, corpus_labels in cfg.items('valid_opinions'):
        for corpus_label in corpus_labels.split(';'):
            if corpus_label != '':
                label_for[corpus_label] = target_label
    del cfg

    knaf = KafNafParser(input_file)
    unmapped_ids = []
    for opinion in knaf.get_opinions():
        expression = opinion.get_expression()
        original_label = expression.get_polarity()
        new_label = label_for.get(original_label)
        if new_label is None:
            # Label not covered by the mapping: keep its value for now but
            # schedule the whole opinion for removal below.
            unmapped_ids.append(opinion.get_id())
            new_label = original_label
        expression.set_polarity(new_label)

    for opinion_id in unmapped_ids:
        knaf.remove_this_opinion(opinion_id)
    knaf.dump(output_file)
def map_opinion_labels(input_file, output_file, config_file):
    """Normalise polarity labels of the opinions in *input_file*.

    Labels listed in the [valid_opinions] section of *config_file* (values
    are ';'-separated corpus labels) are mapped onto the section keys; any
    opinion with an unlisted polarity is dropped.  Writes to *output_file*.
    """
    parser = ConfigParser.ConfigParser()
    parser.read(config_file)
    # corpus label -> normalised label, built from the [valid_opinions] section
    mapping = {
        value: mapped_opinion
        for mapped_opinion, values_in_corpus in parser.items('valid_opinions')
        for value in values_in_corpus.split(';')
        if value != ''
    }
    del parser

    input_kaf = KafNafParser(input_file)
    remove_these = []
    for opinion in input_kaf.get_opinions():
        exp = opinion.get_expression()
        polarity = exp.get_polarity()
        if polarity not in mapping:
            # Unknown label: the opinion will be removed after the loop.
            remove_these.append(opinion.get_id())
        # Unmapped polarities fall back to their current value (a no-op set).
        exp.set_polarity(mapping.get(polarity, polarity))

    for opi_id in remove_these:
        input_kaf.remove_this_opinion(opi_id)
    input_kaf.dump(output_file)
def train(self, list_training_files, out_folder):
    """Train the SVM polarity model from a list of KAF/NAF files.

    Creates *out_folder* (must not exist yet), encodes every opinion
    expression in the given files as SVM-light examples, runs the external
    learner, and pickles the feature index next to the model.
    """
    self.folder = out_folder
    os.mkdir(self.folder)
    print('Creating output folder %s' % self.folder)
    training_fd = open(os.path.join(self.folder, TRAIN_FILE), 'w')
    for this_file in list_training_files:
        print('\tEncoding training file %s' % this_file)
        this_obj = KafNafParser(this_file)
        num_pos = num_neg = 0
        for opinion in this_obj.get_opinions():
            opinion_expression = opinion.get_expression()
            polarity = opinion_expression.get_polarity()
            span_obj = opinion_expression.get_span()
            if span_obj is None:
                # Expression without a term span: nothing to extract features from.
                continue
            list_term_ids = span_obj.get_span_ids()
            features = self.extract_features(this_obj, list_term_ids)
            # Map feat index --> frequency
            int_features = self.encode_string_features(features, update_index=True)
            if len(int_features) != 0:
                # '+1'/'-1' are the SVM-light class labels.
                this_class = None
                if self.is_positive(polarity):
                    this_class = '+1'
                    num_pos += 1
                elif self.is_negative(polarity):
                    this_class = '-1'
                    num_neg += 1
                if this_class is not None:
                    self.write_example_to_file(training_fd, this_class, int_features)
        # END FOR (opinions of one file)
        print('\t\tNum positive examples: %d' % num_pos)
        print('\t\tNum negative examples: %d' % num_neg)
    training_fd.close()
    print('Training file at %s' % training_fd.name)
    ## RUN THE TRAINING (external SVM-light learner)
    training_cmd = [SVM_LEARN]
    training_cmd.append(training_fd.name)
    whole_model_file = os.path.join(self.folder, MODEL_FILE)
    training_cmd.append(whole_model_file)
    ret_code = check_call(training_cmd)
    print('Training done on %s with code %d' % (whole_model_file, ret_code))
    # Save also the index (needed at classification time to re-encode features)
    whole_index_file = os.path.join(self.folder, INDEX_FILE)
    index_fd = open(whole_index_file, 'wb')
    pickle.dump(self.index_features, index_fd, -1)
    index_fd.close()
    print('Feature index saved to %s with %d features' % (whole_index_file, len(self.index_features)))
def main(inputfile, this_type, folder, overall_parameters=None, detected_dse=None, log=False):
    """Build sequences for opinion HOLDER detection.

    inputfile  - for 'train'/'test': a file listing one KAF/NAF path per line;
                 for 'tag': a single KAF/NAF file (or an already-parsed
                 KafNafParser object).
    this_type  - 'train', 'tag' or 'test'; selects output location and whether
                 the parameters are saved ('train') or loaded ('tag'/'test').
    folder     - model folder holding the parameter and output files.
    overall_parameters - feature/config options; merged with the stored ones
                 on 'test'.  May contain 'gold_standard' (a path to write).
    detected_dse - for 'tag': already-detected opinion expressions as
                 (list_name_ids, list_words) pairs.
    Returns the name of the file containing the encoded sequences.
    """
    # BUGFIX: the defaults used to be shared mutable dicts ({}); the 'test'
    # branch mutates overall_parameters, so state leaked across calls.
    if overall_parameters is None:
        overall_parameters = {}
    if detected_dse is None:
        detected_dse = {}

    files = []
    output_fd = None
    if this_type == 'train':
        output_fd = open(folder + '/' + TRAINING_FILENAME, 'w')
        # Save the parameters (binary mode: pickle streams are bytes)
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_parameter = open(parameter_filename, 'wb')
        pickler.dump(overall_parameters, fd_parameter, protocol=0)
        print('Parameters saved to file %s' % parameter_filename, file=sys.stderr)
        fd_parameter.close()
        # Input is a file with a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
    elif this_type == 'tag':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_param = open(parameter_filename, 'rb')
        overall_parameters = pickler.load(fd_param)
        fd_param.close()
        # Input is a single file
        files.append(inputfile)
        # Output FD will be a temporary file
        output_fd = tempfile.NamedTemporaryFile('w', delete=False)
    elif this_type == 'test':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_param = open(parameter_filename, 'rb')
        these_overall_parameters = pickler.load(fd_param)
        fd_param.close()
        # Stored parameters override/extend the caller-supplied ones
        for opt, val in these_overall_parameters.items():
            overall_parameters[opt] = val
        # Input is a file with a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
        output_fd = open(folder + '/' + TESTING_FILENAME, 'w')

    gold_fd = None
    gold_filename = overall_parameters.get('gold_standard')
    if gold_filename is not None:
        gold_fd = open(gold_filename, 'w')

    for filename in files:
        if log:
            print('HOLDER: processing file', filename, file=sys.stderr)
        if isinstance(filename, KafNafParser):
            naf_obj = filename
        else:
            naf_obj = KafNafParser(filename)
        create_structures(naf_obj, filename)

        # Group opinions by sentence; only opinionated opinions that have a
        # non-empty holder span are kept.
        opinions_per_sentence = defaultdict(list)
        num_opinions = 0
        for opinion in naf_obj.get_opinions():
            exp = opinion.get_expression()
            if exp is not None:
                p = exp.get_polarity()
                if p != 'NON-OPINIONATED':
                    holder = opinion.get_holder()
                    if holder is not None:
                        span = holder.get_span()
                        if span is not None:
                            span_ids = span.get_span_ids()
                            if len(span_ids) != 0:
                                sentence_id = get_sentence_id_for_opinion(naf_obj, opinion)
                                if sentence_id is not None:
                                    opinions_per_sentence[sentence_id].append(opinion)
                                    num_opinions += 1
        if log:
            print('\tNum of opinions:', num_opinions, file=sys.stderr)

        if this_type == 'train':
            # One sequence per opinion (sentences may repeat)
            sentences_with_opinions = set()
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    sentences_with_opinions.add(this_sentence)
                    create_sequence(naf_obj, this_type, this_sentence,
                                    overall_parameters, opinion, output=output_fd)
        elif this_type == 'tag':
            # Rebuild opinions_per_sentence from the already-detected DSEs
            opinions_per_sentence = defaultdict(list)
            for list_name_ids, list_words in detected_dse:
                list_ids = [v[v.rfind('#') + 1:] for v in list_name_ids]
                first_token = naf_obj.get_token(list_ids[0])
                sentence_for_opinion = first_token.get_sent()
                opinions_per_sentence[sentence_for_opinion].append(list_ids)
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for list_dse_token_ids in these_opinions:
                    create_sequence(naf_obj, this_type, this_sentence,
                                    overall_parameters, opinion=list_dse_token_ids,
                                    output=output_fd, log=log)
        elif this_type == 'test':
            # One sequence per opinion; only sentences containing opinions
            opinion_list = []
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    create_sequence(naf_obj, this_type, this_sentence,
                                    overall_parameters, opinion, output=output_fd)
                    opinion_list.append(opinion)
            # Create the gold standard data as well
            if gold_fd is not None:
                create_gold_standard_holder(naf_obj, opinion_list, gold_fd)

    if gold_fd is not None:
        gold_fd.close()
        print('Gold standard in the file %s' % gold_fd.name, file=sys.stderr)
    return output_fd.name
def main(inputfile, type, folder, overall_parameters=None, log=False):
    """Build sequences for opinion EXPRESSION detection.

    inputfile - for 'train'/'test': a file with one KAF/NAF path per line;
                for 'tag': a single KAF/NAF file (or KafNafParser object).
    type      - 'train', 'tag' or 'test'.  (Parameter name kept for backward
                compatibility even though it shadows the builtin.)
    folder    - model folder for parameters, resources and output files.
    Returns the name of the file with the encoded sequences.
    """
    # BUGFIX: the default used to be a shared mutable dict ({}) that this
    # function always mutates (mpqa_lexicon / wordnet_lexicon entries and the
    # merged test parameters), leaking state across calls.
    if overall_parameters is None:
        overall_parameters = {}

    files = []
    output_fd = None
    if type == 'train':
        if not os.path.isdir(folder):
            os.mkdir(folder)
        res_fol = os.path.join(folder, RESOURCES_FOLDER)
        if not os.path.isdir(res_fol):
            os.mkdir(res_fol)
        output_fd = open(folder + '/' + TRAINING_FILENAME, 'w')
        # Save the parameters (binary mode: pickle streams are bytes)
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_parameter = open(parameter_filename, 'wb')
        pickler.dump(overall_parameters, fd_parameter, protocol=0)
        print('Parameters saved to file %s' % parameter_filename, file=sys.stderr)
        fd_parameter.close()
        # Input is a file with a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
    elif type == 'tag':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_param = open(parameter_filename, 'rb')
        overall_parameters = pickler.load(fd_param)
        fd_param.close()
        # Input is a single file
        files.append(inputfile)
        # Output FD will be a temporary file
        output_fd = tempfile.NamedTemporaryFile('w', delete=False)
    elif type == 'test':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_param = open(parameter_filename, 'rb')
        these_overall_parameters = pickler.load(fd_param)
        fd_param.close()
        # Stored parameters override/extend the caller-supplied ones
        for opt, val in these_overall_parameters.items():
            overall_parameters[opt] = val
        # Input is a file with a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
        output_fd = open(folder + '/' + TESTING_FILENAME, 'w')

    # The MPQA subjectivity lexicon is always loaded
    from mpqa_lexicon import MPQA_subjectivity_lexicon
    overall_parameters['mpqa_lexicon'] = MPQA_subjectivity_lexicon()

    if overall_parameters.get('use_wordnet_lexicon', False):
        from wordnet_lexicon import WordnetLexicon
        wordnet_lexicon_expression = WordnetLexicon()
        complete_wn_filename = os.path.join(folder, RESOURCES_FOLDER,
                                            WORDNET_LEXICON_FILENAME)
        if type == 'train':
            # We create it from the training files
            print('Creating WORDNET LEXICON FILE from %d files and storing it on %s'
                  % (len(files), complete_wn_filename), file=sys.stderr)
            wordnet_lexicon_expression.create_from_files(files, 'expression')
            wordnet_lexicon_expression.save_to_file(complete_wn_filename)
        else:
            # Read it back for tagging/testing
            wordnet_lexicon_expression.load_from_file(complete_wn_filename)
        overall_parameters['wordnet_lexicon'] = wordnet_lexicon_expression

    gold_fd = None
    gold_filename = overall_parameters.get('gold_standard')
    if gold_filename is not None:
        gold_fd = open(gold_filename, 'w')

    for filename in files:
        if log:
            print('EXPRESSION: processing file', filename, file=sys.stderr)
        if isinstance(filename, KafNafParser):
            naf_obj = filename
        else:
            naf_obj = KafNafParser(filename)
        create_structures(naf_obj, filename)

        # Group the opinionated opinions by sentence
        opinions_per_sentence = defaultdict(list)
        num_opinions = 0
        for opinion in naf_obj.get_opinions():
            exp = opinion.get_expression()
            if exp is not None:
                p = exp.get_polarity()
                if p != 'NON-OPINIONATED':
                    sentence_id = get_sentence_id_for_opinion(naf_obj, opinion)
                    if sentence_id is not None:
                        opinions_per_sentence[sentence_id].append(opinion)
                        num_opinions += 1
        if log:
            print('\tNum of opinions:', num_opinions, file=sys.stderr)

        if type == 'train':
            # One sequence per sentence; only sentences with opinions
            for sentence_id in naf_obj.list_sentence_ids:
                opinions_in_sent = opinions_per_sentence.get(sentence_id, [])
                if len(opinions_in_sent) != 0:
                    create_sequence(naf_obj, sentence_id, overall_parameters,
                                    opinions_in_sent, output=output_fd)
        elif type == 'test':
            # Only tested on sentences with opinions
            for sentence_id in naf_obj.list_sentence_ids:
                opinions_in_sent = opinions_per_sentence.get(sentence_id, [])
                if len(opinions_in_sent) != 0:
                    create_sequence(naf_obj, sentence_id, overall_parameters,
                                    opinions_in_sent, output=output_fd)
            # Create the gold standard data as well
            opinion_list = []
            for this_sentence, these_opinions in opinions_per_sentence.items():
                opinion_list.extend(these_opinions)
            if gold_fd is not None:
                create_gold_standard(naf_obj, opinion_list, gold_fd)
        elif type == 'tag':
            # Tagging considers all sentences
            for sentence_id in naf_obj.list_sentence_ids:
                create_sequence(naf_obj, sentence_id, overall_parameters,
                                list_opinions=[], output=output_fd, log=log)

    if gold_fd is not None:
        gold_fd.close()
        print('Gold standard in the file %s' % gold_fd.name, file=sys.stderr)
    output_fd.close()
    return output_fd.name
def main(inputfile, type, folder, overall_parameters=None, log=False):
    """Build sequences for opinion EXPRESSION detection (Python-3 port).

    inputfile - for 'train'/'test': a file with one KAF/NAF path per line;
                for 'tag': a single KAF/NAF file (or KafNafParser object).
    type      - 'train', 'tag' or 'test'.  (Parameter name kept for backward
                compatibility even though it shadows the builtin.)
    folder    - model folder for parameters, resources and output files.
    Returns the name of the file with the encoded sequences.
    """
    # BUGFIX: the default used to be a shared mutable dict ({}) that this
    # function always mutates, leaking state across calls.
    if overall_parameters is None:
        overall_parameters = {}

    files = []
    output_fd = None
    if type == 'train':
        if not os.path.isdir(folder):
            os.mkdir(folder)
        res_fol = os.path.join(folder, RESOURCES_FOLDER)
        if not os.path.isdir(res_fol):
            os.mkdir(res_fol)
        output_fd = open(folder + '/' + TRAINING_FILENAME, 'w')
        # BUGFIX: pickle streams are bytes; dumping to a 'w' text file raises
        # TypeError on Python 3.
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_parameter = open(parameter_filename, 'wb')
        pickler.dump(overall_parameters, fd_parameter, protocol=0)
        print('Parameters saved to file %s' % parameter_filename, file=sys.stderr)
        fd_parameter.close()
        # Input is a file with a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
    elif type == 'tag':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_param = open(parameter_filename, 'rb')
        # encoding='bytes' handles pickles written by Python 2; the TypeError
        # fallback keeps this working where load() lacks that keyword.
        try:
            overall_parameters = pickler.load(fd_param, encoding='bytes')
        except TypeError:
            overall_parameters = pickler.load(fd_param)
        fd_param.close()
        # Input is a single file
        files.append(inputfile)
        # Output FD will be a temporary file
        output_fd = tempfile.NamedTemporaryFile('w', delete=False)
    elif type == 'test':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        # BUGFIX: was opened in text mode ('r'); pickle requires binary.
        # Mirrors the 'tag' branch.
        fd_param = open(parameter_filename, 'rb')
        try:
            these_overall_parameters = pickler.load(fd_param, encoding='bytes')
        except TypeError:
            these_overall_parameters = pickler.load(fd_param)
        fd_param.close()
        # Stored parameters override/extend the caller-supplied ones
        for opt, val in list(these_overall_parameters.items()):
            overall_parameters[opt] = val
        # Input is a file with a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
        output_fd = open(folder + '/' + TESTING_FILENAME, 'w')

    # The MPQA subjectivity lexicon is always loaded
    from mpqa_lexicon import MPQA_subjectivity_lexicon
    overall_parameters['mpqa_lexicon'] = MPQA_subjectivity_lexicon()

    if overall_parameters.get('use_wordnet_lexicon', False):
        from wordnet_lexicon import WordnetLexicon
        wordnet_lexicon_expression = WordnetLexicon()
        complete_wn_filename = os.path.join(folder, RESOURCES_FOLDER,
                                            WORDNET_LEXICON_FILENAME)
        if type == 'train':
            # We create it from the training files
            print('Creating WORDNET LEXICON FILE from %d files and storing it on %s'
                  % (len(files), complete_wn_filename), file=sys.stderr)
            wordnet_lexicon_expression.create_from_files(files, 'expression')
            wordnet_lexicon_expression.save_to_file(complete_wn_filename)
        else:
            # READ IT back for tagging/testing
            wordnet_lexicon_expression.load_from_file(complete_wn_filename)
        overall_parameters['wordnet_lexicon'] = wordnet_lexicon_expression

    gold_fd = None
    gold_filename = overall_parameters.get('gold_standard')
    if gold_filename is not None:
        gold_fd = open(gold_filename, 'w')

    for filename in files:
        if log:
            print('EXPRESSION: processing file', filename, file=sys.stderr)
        if isinstance(filename, KafNafParser):
            naf_obj = filename
        else:
            naf_obj = KafNafParser(filename)
        create_structures(naf_obj, filename)

        # Group the opinionated opinions by sentence
        opinions_per_sentence = defaultdict(list)
        num_opinions = 0
        for opinion in naf_obj.get_opinions():
            exp = opinion.get_expression()
            if exp is not None:
                p = exp.get_polarity()
                if p != 'NON-OPINIONATED':
                    sentence_id = get_sentence_id_for_opinion(naf_obj, opinion)
                    if sentence_id is not None:
                        opinions_per_sentence[sentence_id].append(opinion)
                        num_opinions += 1
        if log:
            print('\tNum of opinions:', num_opinions, file=sys.stderr)

        if type == 'train':
            # One sequence per sentence; only sentences with opinions
            for sentence_id in naf_obj.list_sentence_ids:
                opinions_in_sent = opinions_per_sentence.get(sentence_id, [])
                if len(opinions_in_sent) != 0:
                    create_sequence(naf_obj, sentence_id, overall_parameters,
                                    opinions_in_sent, output=output_fd)
        elif type == 'test':
            # Only tested on sentences with opinions
            for sentence_id in naf_obj.list_sentence_ids:
                opinions_in_sent = opinions_per_sentence.get(sentence_id, [])
                if len(opinions_in_sent) != 0:
                    create_sequence(naf_obj, sentence_id, overall_parameters,
                                    opinions_in_sent, output=output_fd)
            # Create the gold standard data as well
            opinion_list = []
            for this_sentence, these_opinions in list(opinions_per_sentence.items()):
                opinion_list.extend(these_opinions)
            if gold_fd is not None:
                create_gold_standard(naf_obj, opinion_list, gold_fd)
        elif type == 'tag':
            # Tagging considers all sentences
            for sentence_id in naf_obj.list_sentence_ids:
                create_sequence(naf_obj, sentence_id, overall_parameters,
                                list_opinions=[], output=output_fd, log=log)

    if gold_fd is not None:
        gold_fd.close()
        print('Gold standard in the file %s' % gold_fd.name, file=sys.stderr)
    output_fd.close()
    return output_fd.name
def process_file(this_file, token_freq):
    """Extract opinion expressions and targets from one KAF/NAF file.

    Updates *token_freq* (a Counter-like mapping) in place with lowercased
    token frequencies.  Returns a triple
        (opinion_expressions, opinion_targets, whole_text)
    where each expression is (tokens, polarity, lemmas, pos), each target is
    (tokens, aspect_label, lemmas, pos) and whole_text is the space-joined
    lowercased document text.
    """
    xml_obj = KafNafParser(this_file)
    print('Processing file', this_file, file=sys.stderr)
    token_for_wid = {}
    order_for_wid = {}
    opinion_expressions = []
    opinion_targets = []
    whole_text = ' '
    for n, token in enumerate(xml_obj.get_tokens()):
        text = token.get_text().lower()
        token_freq[text] += 1
        token_for_wid[token.get_id()] = text
        order_for_wid[token.get_id()] = n
        whole_text += text + ' '

    # Map each term to its word ids, and each word id to its lemma/POS
    wids_for_tid = {}
    lemma_for_wid = {}
    pos_for_wid = {}
    for term in xml_obj.get_terms():
        tid = term.get_id()
        wids = term.get_span().get_span_ids()
        wids_for_tid[tid] = wids
        for wid in wids:
            lemma_for_wid[wid] = term.get_lemma()
            pos_for_wid[wid] = term.get_pos()

    ## Properties: (aspect_label, term_span) pairs
    aspects = []
    for property in xml_obj.get_properties():
        for refs in property.get_references():
            for span in refs:
                aspects.append((property.get_type(), span.get_span_ids()))

    already_counted = {EXP: set(), TAR: set()}
    for opinion in xml_obj.get_opinions():
        for this_type, opinion_obj in [(EXP, opinion.get_expression()),
                                       (TAR, opinion.get_target())]:
            # BUGFIX: the None check must come before dereferencing; the
            # polarity test used to run first and crashed with an
            # AttributeError on opinions without an expression.
            if opinion_obj is None:
                continue
            if this_type is EXP and opinion_obj.get_polarity() == 'NON-OPINIONATED':
                continue
            span = opinion_obj.get_span()
            if span is None:
                continue
            list_wids = []
            for tid in span.get_span_ids():
                list_wids.extend(wids_for_tid.get(tid, []))
            # Sorted according to the order of the tokens
            list_wids.sort(key=lambda wid: order_for_wid[wid])
            string_wids = '#'.join(list_wids)
            opinion_tokens = ' '.join(token_for_wid[wid] for wid in list_wids)
            opinion_lemmas = ' '.join(lemma_for_wid[wid] for wid in list_wids)
            opinion_pos = ' '.join(pos_for_wid[wid] for wid in list_wids)
            if string_wids not in already_counted[this_type]:
                if this_type == EXP:
                    polarity = (opinion_obj.get_polarity()).lower()
                    opinion_expressions.append((opinion_tokens, polarity,
                                                opinion_lemmas, opinion_pos))
                else:
                    ## Pick the aspect label whose span overlaps the target
                    possible_aspects = []
                    target_ids = span.get_span_ids()
                    for aspect_label, aspect_span in aspects:
                        num_in_common = len(set(target_ids) & set(aspect_span))
                        if num_in_common != 0:
                            possible_aspects.append((aspect_label, num_in_common,
                                                     len(aspect_span)))
                    aspect_for_target = 'unknown'
                    if len(possible_aspects) != 0:
                        ## Sort by the overlap first, then by the length of the aspect
                        aspect_for_target = sorted(possible_aspects,
                                                   key=lambda t: (t[1], t[2]),
                                                   reverse=True)[0][0]
                    opinion_targets.append((opinion_tokens, aspect_for_target,
                                            opinion_lemmas, opinion_pos))
                already_counted[this_type].add(string_wids)
    del xml_obj
    print('\tNumber of opinion expressions:', len(opinion_expressions), file=sys.stderr)
    print('\tNumber of opinion targets:', len(opinion_targets), file=sys.stderr)
    print('\tNumber of characters of the text:', len(whole_text), file=sys.stderr)
    return opinion_expressions, opinion_targets, whole_text
# Ad-hoc evaluation of the polarity classifier on the first example file.
# `files` is defined elsewhere in this script; sys.argv[1] is the model folder.
my_polarity_classifier = PolarityClassifier('nl')
my_polarity_classifier.load_models(sys.argv[1])
# NOTE(review): counters start at 1, not 0 — presumably to avoid a later
# division by zero; confirm before relying on absolute counts.
OK = WR = 1
for example_file in files:
    this_obj = KafNafParser(example_file)
    my_polarity_classifier.classify_kaf_naf_object(this_obj)
    this_obj.dump()
    break  # only the first file is evaluated
GOLD = {}
list_ids_term_ids = []
for opinion in this_obj.get_opinions():
    op_exp = opinion.get_expression()
    polarity = op_exp.get_polarity()
    term_ids = op_exp.get_span().get_span_ids()
    list_ids_term_ids.append((opinion.get_id(), term_ids))
    GOLD[opinion.get_id()] = polarity
class_for_opinion_id, features_for_opinion_id = my_polarity_classifier.classify_list_opinions(
    this_obj, list_ids_term_ids)
for oid, c in list(class_for_opinion_id.items()):
    #print '%s Gold:%s System:%s' % (oid,GOLD[oid],c)
    #print '\tFeatures:', features_for_opinion_id[oid]
    # Substring match: system label counts as correct if contained in the gold label.
    if c.lower() in GOLD[oid].lower():
        OK += 1
    else:
        WR += 1
def process_file(this_file, token_freq):
    """Extract opinion expressions and targets from a single KAF/NAF file.

    *token_freq* (a Counter-like mapping) is updated in place with the
    lowercased token frequencies.  Returns
        (opinion_expressions, opinion_targets, whole_text):
    expressions are (tokens, polarity, lemmas, pos) tuples, targets are
    (tokens, aspect_label, lemmas, pos) tuples, and whole_text is the
    space-joined lowercased document text.
    """
    xml_obj = KafNafParser(this_file)
    print('Processing file', this_file, file=sys.stderr)
    token_for_wid = {}
    order_for_wid = {}
    opinion_expressions = []
    opinion_targets = []
    whole_text = ' '
    for n, token in enumerate(xml_obj.get_tokens()):
        text = token.get_text().lower()
        token_freq[text] += 1
        token_for_wid[token.get_id()] = text
        order_for_wid[token.get_id()] = n
        whole_text += text + ' '

    # term id -> word ids, word id -> lemma/POS
    wids_for_tid = {}
    lemma_for_wid = {}
    pos_for_wid = {}
    for term in xml_obj.get_terms():
        tid = term.get_id()
        wids = term.get_span().get_span_ids()
        wids_for_tid[tid] = wids
        for wid in wids:
            lemma_for_wid[wid] = term.get_lemma()
            pos_for_wid[wid] = term.get_pos()

    ## Properties: (aspect_label, term_span) pairs
    aspects = []
    for property in xml_obj.get_properties():
        for refs in property.get_references():
            for span in refs:
                aspects.append((property.get_type(), span.get_span_ids()))

    already_counted = {EXP: set(), TAR: set()}
    for opinion in xml_obj.get_opinions():
        for this_type, opinion_obj in [(EXP, opinion.get_expression()),
                                       (TAR, opinion.get_target())]:
            # BUGFIX: guard against a missing expression/target BEFORE calling
            # get_polarity(); the old order raised AttributeError when the
            # expression was None.
            if opinion_obj is None:
                continue
            if this_type is EXP and opinion_obj.get_polarity() == 'NON-OPINIONATED':
                continue
            span = opinion_obj.get_span()
            if span is None:
                continue
            list_wids = []
            for tid in span.get_span_ids():
                list_wids.extend(wids_for_tid.get(tid, []))
            ## Sorted according to the order of the tokens
            list_wids.sort(key=lambda wid: order_for_wid[wid])
            string_wids = '#'.join(list_wids)
            opinion_tokens = ' '.join(token_for_wid[wid] for wid in list_wids)
            opinion_lemmas = ' '.join(lemma_for_wid[wid] for wid in list_wids)
            opinion_pos = ' '.join(pos_for_wid[wid] for wid in list_wids)
            if string_wids not in already_counted[this_type]:
                if this_type == EXP:
                    polarity = (opinion_obj.get_polarity()).lower()
                    opinion_expressions.append(
                        (opinion_tokens, polarity, opinion_lemmas, opinion_pos))
                else:
                    ## Calculate the aspect type by maximal span overlap
                    possible_aspects = []
                    target_ids = span.get_span_ids()
                    for aspect_label, aspect_span in aspects:
                        num_in_common = len(set(target_ids) & set(aspect_span))
                        if num_in_common != 0:
                            possible_aspects.append(
                                (aspect_label, num_in_common, len(aspect_span)))
                    aspect_for_target = 'unknown'
                    if len(possible_aspects) != 0:
                        ## Sort by overlap first, then by aspect length
                        aspect_for_target = sorted(possible_aspects,
                                                   key=lambda t: (t[1], t[2]),
                                                   reverse=True)[0][0]
                    opinion_targets.append(
                        (opinion_tokens, aspect_for_target,
                         opinion_lemmas, opinion_pos))
                already_counted[this_type].add(string_wids)
    del xml_obj
    print('\tNumber of opinion expressions:', len(opinion_expressions), file=sys.stderr)
    print('\tNumber of opinion targets:', len(opinion_targets), file=sys.stderr)
    print('\tNumber of characters of the text:', len(whole_text), file=sys.stderr)
    return opinion_expressions, opinion_targets, whole_text
if __name__ == '__main__':
    # Smoke test: classify the opinions of one hard-coded KAF file with the
    # stand-alone SVM classifier and print the predicted polarity per opinion.
    kaf_obj = KafNafParser('dutch00011_f1b91e00bddbf62fbb35e4755e786406.kaf')
    list_terms = []
    list_ids = []
    for opinion in kaf_obj.get_opinions():
        exp = opinion.get_expression()
        pol = exp.get_polarity()
        if pol in ['Positive', 'Negative', 'StrongPositive', 'StrongNegative']:
            this_id = (opinion.get_id(), pol)
            ids = exp.get_span().get_span_ids()
            list_ids.append(this_id)
            list_terms.append(ids)
    # Hard-coded local paths of a trained model — adjust to your environment.
    index_filename = '/home/izquierdo/cltl_repos/opinion_miner_deluxe/check_me/polarity_classifier/index.features'
    model_filename = '/home/izquierdo/cltl_repos/opinion_miner_deluxe/check_me/polarity_classifier/model.svm'
    svm_path = '/home/izquierdo/bin/svm_classify'
    results = classify(kaf_obj, list_terms, index_filename, model_filename, svm_path)
    # BUGFIX: was a Python-2 print statement (syntax error under Python 3).
    for this_id, result in zip(list_ids, results):
        print(this_id, result)
def train(self, list_training_files, out_folder):
    """Train the SVM polarity model.

    Encodes the opinion expressions of every file in *list_training_files*
    as SVM-light examples inside the freshly created *out_folder*, runs the
    external learner, and pickles the feature index next to the model.
    """
    self.folder = out_folder
    os.mkdir(self.folder)
    print('Creating output folder %s' % self.folder)
    training_fd = open(os.path.join(self.folder, TRAIN_FILE), 'w')

    for training_file in list_training_files:
        print('\tEncoding training file %s' % training_file)
        knaf = KafNafParser(training_file)
        num_pos = num_neg = 0
        for opinion in knaf.get_opinions():
            expression = opinion.get_expression()
            polarity = expression.get_polarity()
            span = expression.get_span()
            if span is None:
                # No term span, so no features can be extracted.
                continue
            term_ids = span.get_span_ids()
            str_features = self.extract_features(knaf, term_ids)
            # Map feat index --> frequency
            int_features = self.encode_string_features(str_features, update_index=True)
            if not int_features:
                continue
            label = None
            if self.is_positive(polarity):
                label = '+1'
                num_pos += 1
            elif self.is_negative(polarity):
                label = '-1'
                num_neg += 1
            if label is not None:
                self.write_example_to_file(training_fd, label, int_features)
        print('\t\tNum positive examples: %d' % num_pos)
        print('\t\tNum negative examples: %d' % num_neg)

    training_fd.close()
    print('Training file at %s' % training_fd.name)

    # Run the external SVM-light learner on the encoded examples.
    whole_model_file = os.path.join(self.folder, MODEL_FILE)
    training_cmd = [SVM_LEARN, training_fd.name, whole_model_file]
    ret_code = check_call(training_cmd)
    print('Training done on %s with code %d' % (whole_model_file, ret_code))

    # Persist the feature index alongside the model for classification time.
    whole_index_file = os.path.join(self.folder, INDEX_FILE)
    with open(whole_index_file, 'wb') as index_fd:
        pickle.dump(self.index_features, index_fd, -1)
    print('Feature index saved to %s with %d features' %
          (whole_index_file, len(self.index_features)))
# Ad-hoc evaluation of the polarity classifier on the first example file.
# `files` is defined elsewhere in this script; sys.argv[1] is the model folder.
my_polarity_classifier = PolarityClassifier('nl')
my_polarity_classifier.load_models(sys.argv[1])
# NOTE(review): counters start at 1, not 0 — presumably to avoid a later
# division by zero; confirm before relying on absolute counts.
OK = WR = 1
for example_file in files:
    this_obj = KafNafParser(example_file)
    my_polarity_classifier.classify_kaf_naf_object(this_obj)
    this_obj.dump()
    break  # only the first file is evaluated
GOLD = {}
list_ids_term_ids = []
for opinion in this_obj.get_opinions():
    op_exp = opinion.get_expression()
    polarity = op_exp.get_polarity()
    term_ids = op_exp.get_span().get_span_ids()
    list_ids_term_ids.append((opinion.get_id(),term_ids))
    GOLD[opinion.get_id()] = polarity
class_for_opinion_id, features_for_opinion_id = my_polarity_classifier.classify_list_opinions(this_obj, list_ids_term_ids)
for oid, c in list(class_for_opinion_id.items()):
    #print '%s Gold:%s System:%s' % (oid,GOLD[oid],c)
    #print '\tFeatures:', features_for_opinion_id[oid]
    # Substring match: system label counts as correct if contained in the gold label.
    if c.lower() in GOLD[oid].lower():
        OK +=1
    # NOTE(review): the `else: WR += 1` branch present in the sibling copy of
    # this script is missing here — the chunk appears truncated; verify
    # against the original file before using WR.
def main(inputfile, this_type, folder, overall_parameters=None, detected_dse=None, log=False):
    """Build sequences for opinion TARGET detection.

    inputfile  - for 'train'/'test': a file listing one KAF/NAF path per line;
                 for 'tag': a single KAF/NAF file (or an already-parsed
                 KafNafParser object).
    this_type  - 'train', 'tag' or 'test'; selects output location and whether
                 the parameters are saved ('train') or loaded ('tag'/'test').
    folder     - model folder holding the parameter and output files.
    overall_parameters - feature/config options; merged with the stored ones
                 on 'test'.  May contain 'gold_standard' (a path to write).
    detected_dse - for 'tag': already-detected opinion expressions as
                 (list_name_ids, list_words) pairs.
    Returns the name of the file containing the encoded sequences.
    """
    # BUGFIX: the defaults used to be shared mutable dicts ({}); the 'test'
    # branch mutates overall_parameters, so state leaked across calls.
    if overall_parameters is None:
        overall_parameters = {}
    if detected_dse is None:
        detected_dse = {}

    files = []
    output_fd = None
    if this_type == 'train':
        output_fd = open(folder + '/' + TRAINING_FILENAME, 'w')
        # Save the parameters (binary mode: pickle streams are bytes)
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_parameter = open(parameter_filename, 'wb')
        pickler.dump(overall_parameters, fd_parameter, protocol=0)
        print('Parameters saved to file %s' % parameter_filename, file=sys.stderr)
        fd_parameter.close()
        # Input is a file with a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
    elif this_type == 'tag':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_param = open(parameter_filename, 'rb')
        overall_parameters = pickler.load(fd_param)
        fd_param.close()
        # Input is a single file
        files.append(inputfile)
        # Output FD will be a temporary file
        output_fd = tempfile.NamedTemporaryFile('w', delete=False)
    elif this_type == 'test':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_param = open(parameter_filename, 'rb')
        these_overall_parameters = pickler.load(fd_param)
        fd_param.close()
        # Stored parameters override/extend the caller-supplied ones
        for opt, val in these_overall_parameters.items():
            overall_parameters[opt] = val
        # Input is a file with a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
        output_fd = open(folder + '/' + TESTING_FILENAME, 'w')

    gold_fd = None
    gold_filename = overall_parameters.get('gold_standard')
    if gold_filename is not None:
        gold_fd = open(gold_filename, 'w')

    for filename in files:
        if log:
            print('TARGET: processing file', filename, file=sys.stderr)
        if isinstance(filename, KafNafParser):
            naf_obj = filename
        else:
            naf_obj = KafNafParser(filename)
        create_structures(naf_obj, filename)

        # Group opinions by sentence; only opinionated opinions that have a
        # non-empty target span are kept.
        opinions_per_sentence = defaultdict(list)
        num_opinions = 0
        for opinion in naf_obj.get_opinions():
            exp = opinion.get_expression()
            if exp is not None:
                p = exp.get_polarity()
                if p != 'NON-OPINIONATED':
                    target = opinion.get_target()
                    if target is not None:
                        span = target.get_span()
                        if span is not None:
                            span_ids = span.get_span_ids()
                            if len(span_ids) != 0:
                                sentence_id = get_sentence_id_for_opinion(naf_obj, opinion)
                                if sentence_id is not None:
                                    opinions_per_sentence[sentence_id].append(opinion)
                                    num_opinions += 1
        if log:
            print('\tNum of opinions:', num_opinions, file=sys.stderr)

        if this_type == 'train':
            # One sequence per opinion (sentences may repeat)
            sentences_with_opinions = set()
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    sentences_with_opinions.add(this_sentence)
                    create_sequence(naf_obj, this_type, this_sentence,
                                    overall_parameters, opinion, output=output_fd)
        elif this_type == 'tag':
            # Rebuild opinions_per_sentence from the already-detected DSEs
            opinions_per_sentence = defaultdict(list)
            for list_name_ids, list_words in detected_dse:
                list_ids = [v[v.rfind('#') + 1:] for v in list_name_ids]
                first_token = naf_obj.get_token(list_ids[0])
                sentence_for_opinion = first_token.get_sent()
                opinions_per_sentence[sentence_for_opinion].append(list_ids)
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for list_dse_token_ids in these_opinions:
                    create_sequence(naf_obj, this_type, this_sentence,
                                    overall_parameters, opinion=list_dse_token_ids,
                                    output=output_fd, log=log)
        elif this_type == 'test':
            # One sequence per opinion; only sentences containing opinions
            opinion_list = []
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    create_sequence(naf_obj, this_type, this_sentence,
                                    overall_parameters, opinion, output=output_fd)
                    opinion_list.append(opinion)
            if gold_fd is not None:
                create_gold_standard_target(naf_obj, opinion_list, gold_fd)

    if gold_fd is not None:
        gold_fd.close()
        print('Gold standard in the file %s' % gold_fd.name, file=sys.stderr)
    return output_fd.name