def get_sentence(naf: KafNafParser, term: Cterm) -> int:
    """Return the sentence number shared by all tokens of *term*.

    :param naf: parsed NAF/KAF document the term belongs to
    :param term: the term whose sentence is requested
    :raises Exception: if the term's tokens map to zero or to more than
        one sentence (a term must live in exactly one sentence)
    """
    tokens = [
        naf.get_token(tid)
        for tid in naf.get_dict_tokens_for_termid(term.get_id())
    ]
    sentences = {t.get_sent() for t in tokens}
    if len(sentences) != 1:
        # Bug fix: the original interpolated the bound method `term.get_id`
        # (e.g. "<bound method ...>") instead of calling it.
        raise Exception(
            f"Term {term.get_id()}:{term.get_lemma()} did not map to single sentence: {sentences}"
        )
    return sentences.pop()
def main(inputfile, this_type, folder, overall_parameters=None, detected_dse=None, log=False):
    """Generate CRF training/tagging/testing sequences for opinion HOLDERS.

    :param inputfile: for 'train'/'test', a file listing one NAF/KAF path per
        line; for 'tag', a single NAF/KAF file (or a KafNafParser instance)
    :param this_type: one of 'train', 'tag' or 'test'
    :param folder: model folder where parameters and output files live
    :param overall_parameters: feature/config parameters; for 'test' it is
        updated in place with the stored training parameters
    :param detected_dse: (tag only) iterable of (list_name_ids, list_words)
        pairs of externally detected direct subjective expressions
    :param log: emit progress messages to stderr
    :return: name of the file the sequences were written to
    """
    # Avoid shared mutable default arguments (original used {} defaults).
    if overall_parameters is None:
        overall_parameters = {}
    if detected_dse is None:
        detected_dse = {}

    files = []
    output_fd = None
    if this_type == 'train':
        output_fd = open(os.path.join(folder, TRAINING_FILENAME), 'w')
        # Save the parameters so 'tag'/'test' runs can reload them.
        # Pickle files must be opened in binary mode on Python 3.
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        with open(parameter_filename, 'wb') as fd_parameter:
            pickler.dump(overall_parameters, fd_parameter, protocol=0)
        print('Parameters saved to file %s' % parameter_filename, file=sys.stderr)
        # Input is a file with a list of files.
        with open(inputfile, 'r') as fin:
            files = [line.strip() for line in fin]
    elif this_type == 'tag':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        with open(parameter_filename, 'rb') as fd_param:
            overall_parameters = pickler.load(fd_param)
        # Input is a single file; output goes to a temporary file.
        files.append(inputfile)
        output_fd = tempfile.NamedTemporaryFile('w', delete=False)
    elif this_type == 'test':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        with open(parameter_filename, 'rb') as fd_param:
            these_overall_parameters = pickler.load(fd_param)
        # Merge the stored parameters into the caller's dict (in place, so
        # the caller sees the merged configuration).
        for opt, val in these_overall_parameters.items():
            overall_parameters[opt] = val
        # Input is a file with a list of files.
        with open(inputfile, 'r') as fin:
            files = [line.strip() for line in fin]
        output_fd = open(os.path.join(folder, TESTING_FILENAME), 'w')

    gold_fd = None
    gold_filename = overall_parameters.get('gold_standard')
    if gold_filename is not None:
        gold_fd = open(gold_filename, 'w')

    for filename in files:
        if log:
            print('HOLDER: processing file', filename, file=sys.stderr)
        if isinstance(filename, KafNafParser):
            naf_obj = filename
        else:
            naf_obj = KafNafParser(filename)
        create_structures(naf_obj, filename)

        # Collect opinionated opinions that have a non-empty holder span,
        # grouped by the sentence they occur in.
        opinions_per_sentence = defaultdict(list)
        num_opinions = 0
        for opinion in naf_obj.get_opinions():
            exp = opinion.get_expression()
            if exp is None or exp.get_polarity() == 'NON-OPINIONATED':
                continue
            holder = opinion.get_holder()
            if holder is None:
                continue
            span = holder.get_span()
            if span is None or len(span.get_span_ids()) == 0:
                continue
            sentence_id = get_sentence_id_for_opinion(naf_obj, opinion)
            if sentence_id is not None:
                opinions_per_sentence[sentence_id].append(opinion)
                num_opinions += 1
        if log:
            print('\tNum of opinions:', num_opinions, file=sys.stderr)

        if this_type == 'train':
            # One sequence per opinion (sentences may repeat across opinions).
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    create_sequence(naf_obj, this_type, this_sentence,
                                    overall_parameters, opinion,
                                    output=output_fd)
        elif this_type == 'tag':
            # Group the externally detected DSEs by sentence; the token ids
            # arrive as "...#wID" strings, keep only the id after the '#'.
            opinions_per_sentence = defaultdict(list)
            for list_name_ids, list_words in detected_dse:
                list_ids = [v[v.rfind('#') + 1:] for v in list_name_ids]
                first_token = naf_obj.get_token(list_ids[0])
                opinions_per_sentence[first_token.get_sent()].append(list_ids)
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for list_dse_token_ids in these_opinions:
                    create_sequence(naf_obj, this_type, this_sentence,
                                    overall_parameters,
                                    opinion=list_dse_token_ids,
                                    output=output_fd, log=log)
        elif this_type == 'test':
            # One sequence per gold opinion; only sentences that actually
            # contain opinions are included.
            opinion_list = []
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    create_sequence(naf_obj, this_type, this_sentence,
                                    overall_parameters, opinion,
                                    output=output_fd)
                    opinion_list.append(opinion)
            # Also emit the gold standard data when requested.
            if gold_fd is not None:
                create_gold_standard_holder(naf_obj, opinion_list, gold_fd)

    if gold_fd is not None:
        gold_fd.close()
        print('Gold standard in the file %s' % gold_fd.name, file=sys.stderr)

    # Close (and flush) the output before handing its name to the caller;
    # the original leaked this handle.
    output_name = output_fd.name
    output_fd.close()
    return output_name
def get_word(naf: KafNafParser, term: Cterm) -> str:
    """Get the word(s) belonging to a term, joining them if there's more than one"""
    token_ids = naf.get_dict_tokens_for_termid(term.get_id())
    ordered_tokens = sort_tokens(naf.get_token(tid) for tid in token_ids)
    words = [tok.get_text() for tok in ordered_tokens]
    return " ".join(words)
#!/usr/bin/env python
"""Repair terms whose lemma was wrongly tagged as 'i'.

Reads a NAF/KAF document from stdin; any term lemmatized as 'i' whose
surface form is a form of "to be" gets its lemma reset to 'be' (POS VBZ).
The (possibly modified) document is dumped to stdout.
"""
from KafNafParserPy import KafNafParser
import sys

if __name__ == '__main__':
    naf_obj = KafNafParser(sys.stdin)
    # Surface forms of "be" that the lemmatizer may have collapsed to 'i'.
    be_forms = {'is', 'are', 'were', 'was', "'s"}
    for term in naf_obj.get_terms():
        if term.get_lemma() != 'i':
            continue
        first_token_id = term.get_span().get_span_ids()[0]
        surface = naf_obj.get_token(first_token_id).get_text()
        if surface in be_forms:
            term.set_lemma('be')
            term.set_pos('VBZ')
    naf_obj.dump()
): #get the entities identified through NER if entity.get_type( ) == "LOCATION": #if it's a location, then let's get its term references entity_type = entity.get_type() #get the entity type entity_id = entity.get_id() #get the entity id ref = entity.get_node().find( "references" ) #find the references node in the KAF file targets = ref.find("span").findall( "target" ) #get a list of all the targets in the references node term_id = targets[0].attrib[ "id"] #get the first target term id, so we can generate the term text in the next two lines word_id = my_parser.get_term(term_id).get_node().find( "span").find("target").attrib["id"] #get the word id word = my_parser.get_token( word_id).get_text() #get the word text #simpledict[word]=freqdict[word] #generate a simple tally that ignores whether each LOCATION is valid; #commented out since OpenRefine will be used to validate sentence = "" #start a blank KWIC sentence variable for target in targets: #iterate through all targets in the references element for the entity node tid = int( my_parser.get_term( target.attrib["id"]).get_node().find( "span").find("target").attrib["id"] [1:]) #get the token id for the target for x in range( tid - 15, tid + 15 ): #iterate through the tokens 15 before and 15 after the target if my_parser.get_token("w" + str(x)):
def main(inputfile, this_type, folder, overall_parameters=None, detected_dse=None, log=False):
    """Generate CRF training/tagging/testing sequences for opinion TARGETS.

    :param inputfile: for 'train'/'test', a file listing one NAF/KAF path per
        line; for 'tag', a single NAF/KAF file (or a KafNafParser instance)
    :param this_type: one of 'train', 'tag' or 'test'
    :param folder: model folder where parameters and output files live
    :param overall_parameters: feature/config parameters; for 'test' it is
        updated in place with the stored training parameters
    :param detected_dse: (tag only) iterable of (list_name_ids, list_words)
        pairs of externally detected direct subjective expressions
    :param log: emit progress messages to stderr
    :return: name of the file the sequences were written to
    """
    # Avoid shared mutable default arguments (original used {} defaults).
    if overall_parameters is None:
        overall_parameters = {}
    if detected_dse is None:
        detected_dse = {}

    files = []
    output_fd = None
    if this_type == 'train':
        output_fd = open(os.path.join(folder, TRAINING_FILENAME), 'w')
        # Save the parameters so 'tag'/'test' runs can reload them.
        # Pickle files must be opened in binary mode on Python 3.
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        with open(parameter_filename, 'wb') as fd_parameter:
            pickler.dump(overall_parameters, fd_parameter, protocol=0)
        print('Parameters saved to file %s' % parameter_filename, file=sys.stderr)
        # Input is a file with a list of files.
        with open(inputfile, 'r') as fin:
            files = [line.strip() for line in fin]
    elif this_type == 'tag':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        with open(parameter_filename, 'rb') as fd_param:
            overall_parameters = pickler.load(fd_param)
        # Input is a single file; output goes to a temporary file.
        files.append(inputfile)
        output_fd = tempfile.NamedTemporaryFile('w', delete=False)
    elif this_type == 'test':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        with open(parameter_filename, 'rb') as fd_param:
            these_overall_parameters = pickler.load(fd_param)
        # Merge the stored parameters into the caller's dict (in place, so
        # the caller sees the merged configuration).
        for opt, val in these_overall_parameters.items():
            overall_parameters[opt] = val
        # Input is a file with a list of files.
        with open(inputfile, 'r') as fin:
            files = [line.strip() for line in fin]
        output_fd = open(os.path.join(folder, TESTING_FILENAME), 'w')

    gold_fd = None
    gold_filename = overall_parameters.get('gold_standard')
    if gold_filename is not None:
        gold_fd = open(gold_filename, 'w')

    for filename in files:
        if log:
            print('TARGET: processing file', filename, file=sys.stderr)
        if isinstance(filename, KafNafParser):
            naf_obj = filename
        else:
            naf_obj = KafNafParser(filename)
        create_structures(naf_obj, filename)

        # Collect opinionated opinions that have a non-empty target span,
        # grouped by the sentence they occur in.
        opinions_per_sentence = defaultdict(list)
        num_opinions = 0
        for opinion in naf_obj.get_opinions():
            exp = opinion.get_expression()
            if exp is None or exp.get_polarity() == 'NON-OPINIONATED':
                continue
            target = opinion.get_target()
            if target is None:
                continue
            span = target.get_span()
            if span is None or len(span.get_span_ids()) == 0:
                continue
            sentence_id = get_sentence_id_for_opinion(naf_obj, opinion)
            if sentence_id is not None:
                opinions_per_sentence[sentence_id].append(opinion)
                num_opinions += 1
        if log:
            print('\tNum of opinions:', num_opinions, file=sys.stderr)

        if this_type == 'train':
            # One sequence per opinion (sentences may repeat across opinions).
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    create_sequence(naf_obj, this_type, this_sentence,
                                    overall_parameters, opinion,
                                    output=output_fd)
        elif this_type == 'tag':
            # Group the externally detected DSEs by sentence; the token ids
            # arrive as "...#wID" strings, keep only the id after the '#'.
            opinions_per_sentence = defaultdict(list)
            for list_name_ids, list_words in detected_dse:
                list_ids = [v[v.rfind('#') + 1:] for v in list_name_ids]
                first_token = naf_obj.get_token(list_ids[0])
                opinions_per_sentence[first_token.get_sent()].append(list_ids)
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for list_dse_token_ids in these_opinions:
                    create_sequence(naf_obj, this_type, this_sentence,
                                    overall_parameters,
                                    opinion=list_dse_token_ids,
                                    output=output_fd, log=log)
        elif this_type == 'test':
            # One sequence per gold opinion; only sentences that actually
            # contain opinions are included.
            opinion_list = []
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    create_sequence(naf_obj, this_type, this_sentence,
                                    overall_parameters, opinion,
                                    output=output_fd)
                    opinion_list.append(opinion)
            # Also emit the gold standard data when requested.
            if gold_fd is not None:
                create_gold_standard_target(naf_obj, opinion_list, gold_fd)

    if gold_fd is not None:
        gold_fd.close()
        print('Gold standard in the file %s' % gold_fd.name, file=sys.stderr)

    # Close (and flush) the output before handing its name to the caller;
    # the original leaked this handle.
    output_name = output_fd.name
    output_fd.close()
    return output_name