    def process_single_file(self, file):
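        """Tokenise one KAF/NAF file and write its n-grams to the per-length
        output files.

        Two passes: first build a wid -> POS map from the term layer, then
        group tokens into sentences as (wid, text) pairs, starting a new
        sentence whenever token.get_sent() changes; finally emit every
        n-gram of each sentence (token form and POS form, tab-separated).
        """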
        try:
            xml_obj = KafNafParser(file)
        except Exception:
            print>>sys.stderr,'Error parsing',file,': skipped'
            return

        print>>sys.stderr,'Processing file', os.path.basename(file), 'Type:',xml_obj.get_type()
        self.langs[xml_obj.get_language()] += 1
        sentences = []
        current_sent = []
        this_sent = None

        pos_for_wid = {} ## POS tag for each token id (wid)
        for term in xml_obj.get_terms():
            pos = term.get_pos()
            for wid in term.get_span().get_span_ids():
                pos_for_wid[wid] = pos

        for token in xml_obj.get_tokens():
            wid = token.get_id()
            value = token.get_text()
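            ## Normalise the token: optional lowercasing, punctuation mapped
            ## to 'PUN', and '*' mapped to 'STAR'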
            if self.convert_to_lowercase:
                value = value.lower()
                
            if value in self.punctuation:
                value = 'PUN'
                
            if value == '*':
                value = 'STAR'
            
            sentence = token.get_sent()
            if this_sent is not None and sentence != this_sent:  ## There is a new sent
                sentences.append(current_sent)
                current_sent = []
            current_sent.append((wid,value))
            this_sent = sentence
        ## Add the last sentence as well
        sentences.append(current_sent)
        
        for sentence in sentences:
            if self.include_sentence_delimiters:
                sentence.insert(0,('xxx','<S>'))
                sentence.append(('xxx','</S>'))
        
            for idx in range(len(sentence)):
                for ngramlen in range(self.min_ngram_len,self.max_ngram_len+1):
                    file_desc = self.get_file_desc_for_ngram(ngramlen)
                    start = idx
                    end = start + ngramlen
                    if end <= len(sentence):
                        this_ngram = '\t'.join(value for wid, value in sentence[start:end])
                        this_ngram_pos = '\t'.join(pos_for_wid.get(wid,'X') for wid, value in sentence[start:end])
                        file_desc.write(this_ngram.encode('utf-8')+'\t'+DELIMITER+'\t'+this_ngram_pos+'\n')
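
# A minimal, self-contained sketch of the n-gram windowing performed in the
# loop above, on made-up tokens (no KAF input, POS lookup, or output files;
# every name below is illustrative only):
demo_sentence = [('w0', '<S>'), ('w1', 'the'), ('w2', 'cat'), ('w3', '</S>')]
for idx in range(len(demo_sentence)):
    for ngramlen in range(1, 3):  # n-gram lengths 1 and 2
        if idx + ngramlen <= len(demo_sentence):
            print '\t'.join(value for wid, value in demo_sentence[idx:idx + ngramlen])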
Example n. 2
def extract_all_features():
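    """Extract features from every training file into one .feat file per
    input under the configured feature folder, write per-file headers to the
    expression-target and expression-holder relation files, and log how many
    opinions were skipped because of their polarity label."""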
    train_files = load_training_files()
    logging.debug('Loaded '+str(len(train_files))+' files')

    feat_folder = my_config_manager.get_feature_folder_name()
    label_feats = separator = None
    my_stdout, my_stderr = sys.stdout,sys.stderr
    
    rel_exp_tar_filename = my_config_manager.get_relation_exp_tar_training_filename()
    exp_tar_rel_fic = open(rel_exp_tar_filename,'w')
   
    rel_exp_hol_filename = my_config_manager.get_relation_exp_hol_training_filename()
    exp_hol_rel_fic = open(rel_exp_hol_filename,'w') 
    
    filename_features_polarity_classifier = my_config_manager.get_filename_features_polarity_classifier()
    fd_filename_features_polarity_classifier = open(filename_features_polarity_classifier,'w')
    
     

    ## Configuration for the relational classifier
    use_these_lexicons = []
    use_deps_now = my_config_manager.get_use_dependencies()
    use_toks_lems_now = my_config_manager.get_use_tokens_lemmas()
      
    #accepted_opinions = my_config_manager.get_mapping_valid_opinions(map_all_to_this=OPINION_EXPRESSION)
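    # accepted_opinions maps the polarity labels to keep; opinions with any
    # other label are tallied in polarities_found_and_skipped below.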
    accepted_opinions = my_config_manager.get_mapping_valid_opinions(map_all_to_this=None)
    mapping_positive_negative = my_config_manager.get_mapping_valid_opinions()
    use_dependencies_now = my_config_manager.get_use_dependencies()
    polarities_found_and_skipped = []
    for num_file, train_file in enumerate(train_files):
        logging.debug('Extracting features '+os.path.basename(train_file))
        base_name = os.path.basename(train_file)
        out_file = os.path.join(feat_folder,'file#'+str(num_file)+'#'+base_name+".feat")
        err_file = out_file+'.log'
        

        kaf_naf_obj = KafNafParser(train_file)
        print>>sys.stderr,'Extracting features from',train_file
        
        if num_file == 0: #The first time we load the lexicons
            lang = kaf_naf_obj.get_language()
            use_these_lexicons = load_lexicons(my_config_manager,lang)
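        # extract_features_from_kaf_naf_file returns the feature labels, the
        # separator used in the .feat file, and the opinion polarities it
        # found but skipped.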
            
        label_feats, separator, pols_skipped_this = extract_features_from_kaf_naf_file(kaf_naf_obj,out_file,err_file, 
                                                                                       accepted_opinions=accepted_opinions, 
                                                                                       lexicons = use_these_lexicons)
         
        polarities_found_and_skipped.extend(pols_skipped_this)
        print>>exp_tar_rel_fic,'#'+train_file
        print>>exp_hol_rel_fic,'#'+train_file
            
        # Set valid_opinions to None to use all the possible opinions in the KAF file for extracting relations;
        # set valid_opinions = accepted_opinions for filtering.
        '''
        create_rel_exp_tar_training(kaf_naf_obj, output=exp_tar_rel_fic, valid_opinions=None,
                                    use_dependencies=use_dependencies_now,use_tokens=use_toks_lems_now,
                                    use_lemmas=use_toks_lems_now,
                                    log=err_file)
        
        create_rel_exp_hol_training(kaf_naf_obj ,output=exp_hol_rel_fic, valid_opinions=None,
                                    use_dependencies=use_dependencies_now,use_tokens=use_toks_lems_now,
                                    use_lemmas=use_toks_lems_now)
            
        '''
        ##Extract features for the polarity classifier
        #for mpqa there will be no polarity classifier
        #extract_features_polarity_classifier_from_kaf(kaf_naf_obj,fd_filename_features_polarity_classifier,mapping_positive_negative)
        
    fd_filename_features_polarity_classifier.close()
    ##Show, just for information, how many instances were skipped because the polarity of the opinion expression was not allowed
    count = defaultdict(int)
    for exp_label in polarities_found_and_skipped:
        count[exp_label] += 1
    info = '\nOpinions skipped because the polarity label is not included in the configuration\n'
    info += 'Accepted opinions: '+' '.join(accepted_opinions.keys())+'\n'
    info += 'Number of complete opinions skipped\n'
    for label, c in count.items():
        info+=' '+label+' :'+str(c)+'\n'
    info+='\n'
    logging.debug(info)
    ###################################################

    exp_tar_rel_fic.close()
    exp_hol_rel_fic.close()

    #Re-set the stdout and stderr
    sys.stdout,sys.stderr = my_stdout, my_stderr
    #Save label_feats in a file
    filename = my_config_manager.get_feature_desc_filename()
    fic = open(filename,'w')
    fic.write(' '.join(label_feats)+'\n')
    fic.close()
    logging.debug('Description of features --> '+filename)



if __name__=='__main__':
	this_folder = os.path.dirname(os.path.realpath(__file__))

	if sys.stdin.isatty():
		print>>sys.stderr,'Input stream required.'
		print>>sys.stderr,'Example usage: cat myUTF8file.kaf |',sys.argv[0]
		sys.exit(-1)


	input_obj = KafNafParser(sys.stdin)
	my_lang = input_obj.get_language()

	complete_path_to_treetagger = find_treetagger()
	if complete_path_to_treetagger is None:
		print>>sys.stderr,'TreeTagger could not be found. You need to specify where TreeTagger is installed in one of 2 ways:'
		print>>sys.stderr,'\t1) Update the TREE_TAGGER_PATH variable in the file lib/__init__.py'
		print>>sys.stderr,'\t2) Update your TREE_TAGGER_PATH environment variable'
		sys.exit(-1)
        
        
	# In the latest version of TreeTagger all the command names have been changed from X-utf8 to just X
	# /cmd/tree-tagger-english-utf8 ==> /cmd/tree-tagger-english
	# This could be a problem if another version of TreeTagger is being used.
	if my_lang == 'en':
		treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-english'
		mapping_file = this_folder +'/mappings/english.map.treetagger.kaf.csv'
def extract_all_features():
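    """Extract features from every training file, first building or copying
    the domain lexicons (expression, target, propagation) when configured,
    and write the expression-target and expression-holder relation training
    data alongside the per-file .feat outputs."""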
    train_files = load_training_files()
    logging.debug('Loaded '+str(len(train_files))+' files')

    feat_folder = my_config_manager.get_feature_folder_name()
    label_feats = separator = None
    my_stdout, my_stderr = sys.stdout,sys.stderr
    
    rel_exp_tar_filename = my_config_manager.get_relation_exp_tar_training_filename()
    exp_tar_rel_fic = open(rel_exp_tar_filename,'w')
   
    rel_exp_hol_filename = my_config_manager.get_relation_exp_hol_training_filename()
    exp_hol_rel_fic = open(rel_exp_hol_filename,'w') 
    
    ### LEXICON FROM THE DOMAIN
    expressions_lexicon = None
    targets_lexicon = None
    this_propagation_lexicon = None
    if my_config_manager.get_use_training_lexicons():
        # Create the lexicons
        
        ##GUESS THE LANG:
        first_train_file = train_files[0]
        obj = KafNafParser(first_train_file)
        lang = obj.get_language()
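        # The language of the first training file drives the $LANG
        # substitutions in the lexicon filenames below.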
        
        expression_lexicon_filename = my_config_manager.get_expression_lexicon_filename()
        target_lexicon_filename = my_config_manager.get_target_lexicon_filename()
        
        
        this_exp_lex = my_config_manager.get_use_this_expression_lexicon()            
        this_tar_lex = my_config_manager.get_use_this_target_lexicon()

        
        if this_exp_lex is None or this_tar_lex is None:
            path_to_lex_creator = '/home/izquierdo/opener_repos/opinion-domain-lexicon-acquisition/acquire_from_annotated_data.py'
            training_filename = my_config_manager.get_file_training_list()
            lexicons_manager.create_lexicons(path_to_lex_creator,training_filename,expression_lexicon_filename,target_lexicon_filename)
        
        ##If a pre-built lexicon was supplied, copy it over the generated one:
        if this_exp_lex is not None:
            if "$LANG" in this_exp_lex:
                this_exp_lex = this_exp_lex.replace('$LANG',lang)
            shutil.copy(this_exp_lex, expression_lexicon_filename)
            
        if this_tar_lex is not None:
            if "$LANG" in this_tar_lex:
                this_tar_lex = this_tar_lex.replace('$LANG',lang)
            shutil.copy(this_tar_lex,target_lexicon_filename)
        
        expressions_lexicon = lexicons_manager.load_lexicon(expression_lexicon_filename)
        targets_lexicon =  lexicons_manager.load_lexicon(target_lexicon_filename)
        
        this_propagation_lexicon = my_config_manager.get_propagation_lexicon_name()
        if this_propagation_lexicon is not None:
            if "$LANG" in this_propagation_lexicon:
                this_propagation_lexicon = this_propagation_lexicon.replace('$LANG',lang)
                
        print>>sys.stderr,'Propagated lexicon',this_propagation_lexicon
        
        
        

    ## Configuration for the relational classifier
    use_deps_now = my_config_manager.get_use_dependencies()
    use_toks_lems_now = my_config_manager.get_use_tokens_lemmas()
      
    accepted_opinions = my_config_manager.get_mapping_valid_opinions()
    use_dependencies_now = my_config_manager.get_use_dependencies()
    polarities_found_and_skipped = []
    for num_file, train_file in enumerate(train_files):
        logging.debug('Extracting features '+os.path.basename(train_file))
        base_name = os.path.basename(train_file)
        out_file = os.path.join(feat_folder,'file#'+str(num_file)+'#'+base_name+".feat")
        err_file = out_file+'.log'
        
        #Creates the output file
        # Returns the labels for the features and the separator used
        try:
            kaf_naf_obj = KafNafParser(train_file)

            label_feats, separator, pols_skipped_this = extract_features_from_kaf_naf_file(kaf_naf_obj,out_file,err_file,
                                                                                           accepted_opinions=accepted_opinions,
                                                                                           exp_lex=expressions_lexicon,
                                                                                           tar_lex=targets_lexicon,
                                                                                           propagation_lex_filename=this_propagation_lexicon)
            polarities_found_and_skipped.extend(pols_skipped_this)
            print>>exp_tar_rel_fic,'#'+train_file
            print>>exp_hol_rel_fic,'#'+train_file
            # Set valid_opinions to None to use all the possible opinions in the KAF file for extracting relations
            create_rel_exp_tar_training(kaf_naf_obj, output=exp_tar_rel_fic, valid_opinions=accepted_opinions,use_dependencies=use_dependencies_now,use_tokens=use_toks_lems_now,use_lemmas=use_toks_lems_now)
            create_rel_exp_hol_training(kaf_naf_obj, output=exp_hol_rel_fic, valid_opinions=accepted_opinions,use_dependencies=use_dependencies_now,use_tokens=use_toks_lems_now,use_lemmas=use_toks_lems_now)
        except Exception as e:
            sys.stdout, sys.stderr = my_stdout, my_stderr
            print>>sys.stderr,str(e),dir(e)
        
    ##Show, just for information, how many instances were skipped because the polarity of the opinion expression was not allowed
    count = defaultdict(int)
    for exp_label in polarities_found_and_skipped:
        count[exp_label] += 1
    info = '\nOpinions skipped because the polarity label is not included in the configuration\n'
    info += 'Accepted opinions: '+' '.join(accepted_opinions.keys())+'\n'
    info += 'Number of complete opinions skipped\n'
    for label, c in count.items():
        info+=' '+label+' :'+str(c)+'\n'
    info+='\n'
    logging.debug(info)
    ###################################################

    exp_tar_rel_fic.close()
    exp_hol_rel_fic.close()

    #Re-set the stdout and stderr
    sys.stdout,sys.stderr = my_stdout, my_stderr
    #Save label_feats in a file
    filename = my_config_manager.get_feature_desc_filename()
    fic = open(filename,'w')
    fic.write(' '.join(label_feats)+'\n')
    fic.close()
    logging.debug('Description of features --> '+filename)