def convert(self, id, result, format):
    assert format == "csv"
    _int = lambda x: None if x is None else int(x)
    naf = KafNafParser(BytesIO(result.encode("utf-8")))
    deps = {dep.get_to(): (dep.get_function(), dep.get_from())
            for dep in naf.get_dependencies()}
    tokendict = {token.get_id(): token for token in naf.get_tokens()}
    s = StringIO()
    w = csv.writer(s)
    w.writerow(["id", "token_id", "offset", "sentence", "para", "word",
                "term_id", "lemma", "pos", "pos1", "parent", "relation"])
    for term in naf.get_terms():
        tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
        for token in tokens:
            tid = term.get_id()
            pos = term.get_pos()
            pos1 = POSMAP[pos]
            row = [id, token.get_id(), _int(token.get_offset()),
                   _int(token.get_sent()), _int(token.get_para()),
                   token.get_text(), tid, term.get_lemma(), pos, pos1]
            if tid in deps:
                rel, parent = deps[tid]
                row += [parent, rel.split("/")[-1]]
            else:
                row += [None, None]
            w.writerow(row)
    return s.getvalue()

def from_naf(self, article, naf):
    def _int(x):
        return None if x is None else int(x)

    naf = KafNafParser(BytesIO(naf.encode("utf-8")))
    deps = {dep.get_to(): (dep.get_function(), dep.get_from())
            for dep in naf.get_dependencies()}
    tokendict = {token.get_id(): token for token in naf.get_tokens()}
    for term in naf.get_terms():
        tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
        for token in tokens:
            tid = term.get_id()
            tok = {"aid": article,
                   "token_id": token.get_id(),
                   "offset": _int(token.get_offset()),
                   "sentence": _int(token.get_sent()),
                   "para": _int(token.get_para()),
                   "word": token.get_text(),
                   "term_id": tid,
                   "lemma": term.get_lemma(),
                   "pos": term.get_pos()}
            if tid in deps:
                rel, parent = deps[tid]
                tok['parent'] = parent
                tok['relation'] = rel.split("/")[-1]
            yield tok

def test_create_terms():
    """
    Can we create terms via the create_{term,token} functions?
    """
    naf = KafNafParser(type="NAF")
    input = [(u'dit', u'dit', u'O', u'VNW'),
             (u'is', u'zijn', u'V', u'WW'),
             (u'een', u'een', u'D', u'LID'),
             (u'test', u'test', u'N', u'N')]
    offset = 0
    for (word, lemma, pos, morph) in input:
        token = naf.create_wf(word, 1, offset)
        offset += len(word)
        term = naf.create_term(lemma, pos, morph, [token])
    tokens = {t.get_id(): t for t in naf.get_tokens()}
    assert_equal(len(tokens), 4)
    result = {}
    for term in naf.get_terms():
        for token_id in term.get_span().get_span_ids():
            token = tokens[token_id]
            result[term.get_id()] = (token.get_text(), term.get_lemma(),
                                     term.get_pos(), term.get_morphofeat())
    result = [result[tid] for tid in sorted(result.keys())]
    assert_equal(input, result)

def process_single_file(self, file):
    try:
        xml_obj = KafNafParser(file)
    except:
        print>>sys.stderr, 'Error parsing', file, ': skipped'
        return

    print>>sys.stderr, 'Processing file', os.path.basename(file), 'Type:', xml_obj.get_type()
    self.langs[xml_obj.get_language()] += 1

    sentences = []
    current_sent = []
    this_sent = None

    pos_for_wid = {}  # For each token id (wid), its pos
    for term in xml_obj.get_terms():
        pos = term.get_pos()
        for wid in term.get_span().get_span_ids():
            pos_for_wid[wid] = pos

    for token in xml_obj.get_tokens():
        wid = token.get_id()
        value = token.get_text()
        if self.convert_to_lowercase:
            value = value.lower()
        if value in self.punctuation:
            value = 'PUN'
        if value == '*':
            value = 'STAR'
        sentence = token.get_sent()
        if this_sent is not None and sentence != this_sent:
            # There is a new sentence
            sentences.append(current_sent)
            current_sent = []
        current_sent.append((wid, value))
        this_sent = sentence
    # Add the last sentence as well
    sentences.append(current_sent)

    for sentence in sentences:
        if self.include_sentence_delimiters:
            sentence.insert(0, ('xxx', '<S>'))
            sentence.append(('xxx', '</S>'))
        for idx in range(0, len(sentence)):
            for ngramlen in range(self.min_ngram_len, self.max_ngram_len + 1):
                file_desc = self.get_file_desc_for_ngram(ngramlen)
                start = idx
                end = start + ngramlen
                if end <= len(sentence):
                    this_ngram = '\t'.join(value for wid, value in sentence[start:end])
                    this_ngram_pos = '\t'.join(pos_for_wid.get(wid, 'X')
                                               for wid, value in sentence[start:end])
                    file_desc.write(this_ngram.encode('utf-8') + '\t' + DELIMITER +
                                    '\t' + this_ngram_pos + '\n')

def test_corenlp_naf():
    _check_corenlp()
    naf_bytes = corenlp.corenlp_naf("John shoots himself", annotators=corenlp.LEMMATIZER)
    print naf_bytes
    naf = KafNafParser(BytesIO(naf_bytes))
    terms = {t.get_id(): t.get_lemma() for t in naf.get_terms()}
    assert_equal(set(terms.values()), {"John", "shoot", "himself"})

def test_frog_saf():
    _check_frog()
    naf_str = frog._process("Mark Rutte werkte gisteren nog bij de Vrije Universiteit in Amsterdam")
    naf = KafNafParser(BytesIO(naf_str))
    lemmata = {t.get_lemma() for t in naf.get_terms()}
    assert_equal(lemmata, {"Mark_Rutte", "werken", "gisteren", "nog", "bij", "de",
                           "vrij", "universiteit", "in", "Amsterdam"})

def create_naf(text):
    naf = KafNafParser(type="NAF")
    naf.set_version("3.0")
    naf.set_language("nl")
    naf.lang = "nl"
    naf.raw = text
    naf.set_raw(naf.raw)
    return naf

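# Minimal usage sketch for create_naf (the text and the added token are hypothetical,
# not taken from the original code): wrap raw text in a fresh NAF object, add a word
# form with create_wf(word, sentence_number, offset) and dump the XML to stdout.
naf = create_naf("Dit is een test")
naf.create_wf("Dit", 1, 0)
naf.dump()
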
def test_dump():
    """
    Can we use naf.dump() to stdout and file?

    Make sure to run with nosetests -s, otherwise python3 will err
    """
    naf = KafNafParser(type="NAF")
    token = naf.create_wf("\xd8lleg\xe5rd", 1, 1)
    expected = '<![CDATA[\xd8lleg\xe5rd]]></wf>'

    # do we get an error on dumping to stdout without redirect?
    naf.dump()

    # Can we dump to stdout?
    with capture_stdout() as s:
        naf.dump()
    output = s.getvalue().decode("utf-8")
    assert_in(expected, output)

    # Can we dump to a named file?
    f = tempfile.NamedTemporaryFile(suffix=".xml", delete=False)
    try:
        naf.dump(f.name)
        f.close()
        output = open(f.name, mode='rb').read().decode('utf-8')
    finally:
        os.remove(f.name)
    assert_in(expected, output)

def get_sentence(naf: KafNafParser, term: Cterm) -> int:
    tokens = [naf.get_token(tid)
              for tid in naf.get_dict_tokens_for_termid(term.get_id())]
    sent = {t.get_sent() for t in tokens}
    if len(sent) != 1:
        raise Exception(
            f"Term {term.get_id()}:{term.get_lemma()} did not map to a single sentence: {sent}"
        )
    return sent.pop()

def main(argv):
    conversion = ""
    try:
        opts, args = getopt.getopt(argv, "hkn", ["tokaf", "tonaf"])
    except getopt.GetoptError:
        print 'could not parse options. Correct usage: \n\n kaf-naf-parser.py --tokaf --tonaf'
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print 'test.py --tokaf --tonaf'
            sys.exit()
        elif opt in ("-k", "--tokaf"):
            conversion = "to-kaf"
        elif opt in ("-n", "--tonaf"):
            conversion = "to-naf"
    if conversion == "":
        conversion = "kaf-naf"
    obj = KafNafParser(sys.stdin)
    if conversion == "to-kaf":
        obj.to_kaf()
    if conversion == "to-naf":
        obj.to_naf()
    obj.dump()

def run_and_compare(in_filename, out_filename, correct_out_filename,
                    use_subprocess=True, **kwargs):
    """
    Runs the system with `in_filename` as input and `out_filename` as output
    and then compares the result to `correct_out_filename`.

    Because some header data changes (as it should), the contents of
    `correct_out_filename` will be formatted using a call to `str.format`
    with the following keyword arguments:

     - version
     - timestamp
     - beginTimestamp
     - endTimestamp
     - hostname
    """
    with open(in_filename) as fd, open(out_filename, 'wb') as out:
        if use_subprocess:
            run_with_subprocess(fd, out, **kwargs)
        else:
            run_without_subprocess(fd, out, **kwargs)

    with open(out_filename) as out, open(correct_out_filename) as correct:
        # Check something happened and that the result can be parsed
        outnaf = KafNafParser(out_filename)

        # Get the header information to be able to compare raw files
        our_header_layer = list(outnaf.get_linguisticProcessors())[-1]
        assert our_header_layer.get_layer() == 'coreferences'

        processors = list(our_header_layer.get_linguistic_processors())
        assert len(processors) == 1
        our_header_data = processors[0]

        correct = correct.read().format(
            version=our_header_data.get_version(),
            timestamp=our_header_data.get_timestamp(),
            beginTimestamp=our_header_data.get_beginTimestamp(),
            endTimestamp=our_header_data.get_endTimestamp(),
            hostname=our_header_data.get_hostname(),
        )
        assert correct == out.read()

def alpino(cls, data: bytes) -> bytes:
    data = BytesIO(data)
    try:
        data = KafNafParser(data)
    except XMLSyntaxError:
        pass  # alpino can parse raw text
    return dump_naf(alpinonaf.parse(data))

def from_naf(self, article, naf):
    naf = KafNafParser(BytesIO(naf.encode("utf-8")))
    tokendict = {token.get_id(): token for token in naf.get_tokens()}
    for term in naf.get_terms():
        tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
        for token in tokens:
            yield {"aid": article.pk,
                   "token_id": token.get_id(),
                   "offset": token.get_offset(),
                   "sentence": token.get_sent(),
                   "para": token.get_para(),
                   "word": token.get_text(),
                   "term_id": term.get_id(),
                   "lemma": term.get_lemma(),
                   "pos": term.get_pos()}

def read_training_data(file_name):
    """Read kaf/naf and match the aspects with the words"""
    parser = KafNafParser(PATH_ANNOTATED_DATA + file_name)
    terms = list(parser.get_terms())

    # create a token dictionary containing the naf info
    tokens_container = dict()
    for token_el in parser.get_tokens():
        token_node = token_el.node
        token_id = token_node.get('wid').replace('w', 't')
        token_info = token_node.attrib
        tokens_container[token_id] = token_info

    properties = list(parser.get_properties())
    handled_properties, term_dict = handle_properties(properties, terms, tokens_container)
    return terms, properties, handled_properties, term_dict, tokens_container

def convert(self, id, result, format):
    assert format == "csv"
    naf = KafNafParser(BytesIO(result.encode("utf-8")))
    memo = self._csv_memo(naf)
    tokendict = {token.get_id(): token for token in naf.get_tokens()}
    s = StringIO()
    w = csv.writer(s)
    w.writerow(self._csv_header())
    for term in naf.get_terms():
        tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
        for token in tokens:
            row = [id] + list(self._csv_row(memo, term, token))
            w.writerow(row)
    return s.getvalue()

def map_opinion_labels(input_file, output_file, config_file):
    # Load the mapping from the config_file
    mapping = {}
    parser = ConfigParser.ConfigParser()
    parser.read(config_file)
    for mapped_opinion, values_in_corpus in parser.items('valid_opinions'):
        values = [v for v in values_in_corpus.split(';') if v != '']
        for v in values:
            mapping[v] = mapped_opinion
    del parser

    input_kaf = KafNafParser(input_file)
    remove_these = []
    for opinion in input_kaf.get_opinions():
        exp = opinion.get_expression()
        polarity = exp.get_polarity()
        if polarity in mapping:
            mapped_polarity = mapping[polarity]
        else:
            opi_id = opinion.get_id()
            remove_these.append(opi_id)
            mapped_polarity = polarity
        exp.set_polarity(mapped_polarity)

    for opi_id in remove_these:
        input_kaf.remove_this_opinion(opi_id)
    input_kaf.dump(output_file)

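# Sketch of the config_file format that map_opinion_labels expects (the label names
# below are hypothetical examples): each option in the [valid_opinions] section maps
# a normalized label to a ';'-separated list of corpus labels; opinions whose polarity
# is not listed are removed from the output.
#
#   [valid_opinions]
#   positive = Positive;StrongPositive
#   negative = Negative;StrongNegative
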
def extract_data_file(filename, label_gold, label_system, this_temp_folder=None, get_random=False):
    if this_temp_folder is None:
        temp_folder = mkdtemp()
    else:
        temp_folder = this_temp_folder

    fd_gold = open(temp_folder + '/' + __gold_filename__, 'a')
    fd_system = open(temp_folder + '/' + __system_filename__, 'a')

    input_obj = KafNafParser(filename)
    for term in input_obj.get_terms():
        # Get gold
        term_id = term.get_id()
        results_gold = []
        results_system = []
        for ext_ref in term.get_external_references():
            resource = ext_ref.get_resource()
            if resource == label_gold:
                results_gold.append((ext_ref.get_reference(), ext_ref.get_confidence()))
            elif resource == label_system:
                results_system.append((ext_ref.get_reference(), ext_ref.get_confidence()))

        if len(results_gold) > 0:
            best_gold_label, best_gold_value = get_max_from_list(results_gold)
            fd_gold.write(filename + '\t' + term_id + '\t' + best_gold_label + '\n')

            if get_random:
                best_system_label, best_system_value = get_random_from_list(results_system)
            else:
                best_system_label, best_system_value = get_max_from_list(results_system)
            if best_system_label is not None:
                fd_system.write(filename + '\t' + term_id + '\t' + best_system_label + '\n')

    fd_gold.close()
    fd_system.close()

    # Create the "fake" sense.mappings
    fd_map = open(temp_folder + '/' + __sense_mapping__, 'w')
    fd_map.close()
    return temp_folder

def test_corenlp2naf():
    xml = open(os.path.join(os.path.dirname(__file__), "test_corenlp.xml")).read()
    naf_bytes = corenlp.corenlp2naf(xml, corenlp.PARSER)
    naf = KafNafParser(BytesIO(naf_bytes))

    terms = {t.get_id(): t.get_lemma() for t in naf.get_terms()}
    assert_equal(set(terms.values()),
                 {"John", "attack", "I", "in", "London", "hit", "he", "back", "."})
    london = [t for t in naf.get_terms() if t.get_lemma() == 'London'][0]
    assert_equal(london.get_pos(), 'R')
    assert_equal(london.get_morphofeat(), 'NNP')

    ents = {}
    for e in naf.get_entities():
        for ref in e.get_references():
            for term_id in ref.get_span().get_span_ids():
                ents[terms[term_id]] = e.get_type()
    assert_equal(ents, {"John": "PERSON", "London": "LOCATION"})

    deps = {terms[d.get_from()]: (d.get_function(), terms[d.get_to()])
            for d in naf.get_dependencies()}
    expected = {'I': ('nsubj', 'hit'),
                'John': ('nsubj', 'attack'),
                'London': ('prep_in', 'attack'),
                'back': ('advmod', 'hit'),
                'he': ('dobj', 'hit')}
    assert_equal(deps, expected)

    corefs = []
    for coref in naf.get_corefs():
        corefs.append(set())
        for span in coref.get_spans():
            corefs[-1] |= {terms[t] for t in span.get_span_ids()}
    assert_in({"John", "he"}, corefs)

def test_header():
    """
    Do the functions to set header attributes work correctly?

    Make sure to run with nosetests -s, otherwise python3 will err
    """
    naf = KafNafParser(type="NAF")
    naf.header = CHeader(type=naf.type)
    naf.root.insert(0, naf.header.get_node())

    naf.header.set_uri("http://example.com")
    assert_equal("http://example.com", naf.header.get_uri())

    naf.header.set_publicId("123")
    assert_equal("123", naf.header.get_publicId())

    # test if properties are serialized/deserialized correctly
    b = BytesIO()
    naf.dump(b)
    b.seek(0)
    naf2 = KafNafParser(b, type="NAF")
    assert_equal("http://example.com", naf2.header.get_uri())
    assert_equal("123", naf2.header.get_publicId())

def single_main(
        cls,
        output_file,
        naf_file,
        naf_extension=c.NAF_EXTENSION,
        validate=c.VALIDATE,
        uniqueyfy=c.UNIQUEYFY,
        fill_non_consecutive_coref_spans=c.FILL_NON_CONSECUTIVE_COREF_SPANS,
        sentence_filter=c.SENTENCE_DEFAULT_FILTER,
        conll_columns=c.CONLL_COLUMNS,
        conll_defaults=c.CONLL_DEFAULTS,
        min_column_spacing=c.MIN_COLUMN_SPACING,
        on_missing=c.CONLL_ON_MISSING):
    # Read document ID
    document_id = document_ID_from_filename(naf_file, naf_extension)
    cls.check_document_id(document_id, naf_file, on_missing['document_id'])

    # Read data
    reader = NAFReader(validate=validate)
    nafobj = KafNafParser(naf_file)
    sentences = reader.extract_sentences(nafobj)
    coref_sets = reader.extract_coref_sets(nafobj)
    del reader, nafobj

    add_word_numbers(sentences)

    CorefConverter(
        sentences,
        uniqueyfy=uniqueyfy,
        fill_spans=fill_non_consecutive_coref_spans,
    ).add_data_from_coref_sets(coref_sets)
    del coref_sets

    sentences = filter(sentence_filter, sentences)

    # Save the data to CoNLL
    cls.write_conll(
        filename=output_file,
        writer=CoNLLWriter(
            defaults=conll_defaults,
            min_column_spacing=min_column_spacing,
            on_missing=on_missing,
            columns=conll_columns),
        document_id=document_id,
        sentences=sentences)

def _test_file(this_file):
    input_fd = open(this_file)
    result = subprocess.check_output(os.path.join(__here__, 'run_parser.sh'), stdin=input_fd)
    my_obj = KafNafParser(BytesIO(result))

    # Check the terms
    terms = [term for term in my_obj.get_terms()]
    assert_equal(len(terms), 12)
    assert_equal(my_obj.get_term('t_4').get_lemma(), 'mooi')
    assert_equal(my_obj.get_term('t_4').get_pos(), 'adj')

    # Check constituents
    trees = [tree for tree in my_obj.get_trees()]
    assert_equal(len(trees), 2)
    assert_equal(trees[0].get_terminals_as_list()[1].get_span().get_span_ids(), ['t_1'])

    # Check dependencies
    dependencies = [dep for dep in my_obj.get_dependencies()]
    assert_equal(len(dependencies), 10)
    assert_equal(dependencies[5].get_function(), 'hd/su')

from KafNafParserPy import KafNafParser

if __name__ == '__main__':
    files = []
    fd = open('nl.list.test')
    for line in fd:
        files.append(line.strip())
    fd.close()

    my_polarity_classifier = PolarityClassifier('nl')
    my_polarity_classifier.load_models(sys.argv[1])

    OK = WR = 1
    for example_file in files:
        this_obj = KafNafParser(example_file)
        my_polarity_classifier.classify_kaf_naf_object(this_obj)
        this_obj.dump()
        break

    GOLD = {}
    list_ids_term_ids = []
    for opinion in this_obj.get_opinions():
        op_exp = opinion.get_expression()
        polarity = op_exp.get_polarity()
        term_ids = op_exp.get_span().get_span_ids()
        list_ids_term_ids.append((opinion.get_id(), term_ids))
        GOLD[opinion.get_id()] = polarity

def get_terms_in_sentence(naf: KafNafParser, sent: int) -> Iterable[Cterm]:
    tokens = sort_tokens(t for t in naf.get_tokens() if t.get_sent() == sent)
    tokenids = [t.get_id() for t in tokens]
    return sort_terms(naf, [naf.get_term(tid)
                            for tid in naf.map_tokens_to_terms(tokenids)])

def find_terms(naf: KafNafParser, words: Sequence[str]) -> Iterable[Cterm]:
    """Find all terms whose lemma or word form is in the list of words"""
    for t in naf.get_terms():
        if t.get_lemma() in words or get_word(naf, t) in words:
            yield t

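# Usage sketch for the two helpers above (the file name and word list are hypothetical):
# print the lemmas of sentence 1, then look up terms matching a small word list.
naf = KafNafParser("example.naf")
for term in get_terms_in_sentence(naf, 1):
    print(term.get_lemma())
for term in find_terms(naf, ["test", "mooi"]):
    print(term.get_id(), term.get_lemma())
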
def add_file(filename, data_lexelt, reftype='lexical_key'):
    obj = KafNafParser(filename)

    tokens_per_sent = {}
    sent_for_token = {}
    sents_in_order = []
    for token in obj.get_tokens():
        sentid = token.get_sent()
        if sentid not in sents_in_order:
            sents_in_order.append(sentid)
        sent_for_token[token.get_id()] = sentid
        if sentid not in tokens_per_sent:
            tokens_per_sent[sentid] = []
        tokens_per_sent[sentid].append((token.get_id(), token.get_text()))

    annotated_lemmas = []  # LIST of (full_id, token ids, lemma, pos, synset)
    for term in obj.get_terms():
        synset_label = None
        for ext_ref in term.get_external_references():
            if ext_ref.get_reftype() == 'lexical_key':
                synset_label = term.get_lemma() + '%' + ext_ref.get_reference()
            elif ext_ref.get_reftype() == 'sense' and ext_ref.get_resource() == 'WordNet-3.0':
                synset_label = ext_ref.get_reference()
            if synset_label is not None:
                break
        if synset_label is not None:
            annotated_lemmas.append((filename + '#' + term.get_id(),
                                     term.get_span().get_span_ids(),
                                     term.get_lemma(),
                                     term.get_pos(),
                                     synset_label))

    for full_id, token_ids, lemma, pos, synset_label in annotated_lemmas:
        # CREATE NEW INSTANCE
        this_key = lemma + '.' + pos.lower()[0]
        if this_key not in data_lexelt:
            data_lexelt[this_key] = Clexelt(this_key, pos)

        if not data_lexelt[this_key].exists(full_id):
            # Create the new instance
            new_instance = Cinstance()
            new_instance.id = full_id
            new_instance.docsrc = filename
            new_instance.key = synset_label

            tokens = []
            target_indexes = []
            this_sent = sent_for_token[token_ids[0]]
            index = sents_in_order.index(this_sent)
            start_idx = max(index - 2, 0)
            end_idx = min(index + 2, len(sents_in_order) - 1)
            selected_sents = sents_in_order[start_idx:end_idx + 1]
            num_token = 0
            for current_sent in selected_sents:
                for token_id, token_text in tokens_per_sent[str(current_sent)]:
                    tokens.append(token_text)
                    if token_id in token_ids:
                        target_indexes.append(num_token)
                    num_token += 1
            new_instance.tokens = tokens[:]
            new_instance.index_head = target_indexes[:]
            data_lexelt[this_key].add_instance(new_instance)

def main(inputfile, this_type, folder, overall_parameters={}, detected_dse={}, log=False):
    files = []
    output_fd = None
    if this_type == 'train':
        output_fd = open(folder + '/' + TRAINING_FILENAME, 'w')

        # Save the parameters
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_parameter = open(parameter_filename, 'w')
        pickler.dump(overall_parameters, fd_parameter, protocol=0)
        print >> sys.stderr, 'Parameters saved to file %s' % parameter_filename
        fd_parameter.close()

        # Input is a file with a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
    elif this_type == 'tag':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_param = open(parameter_filename, 'r')
        overall_parameters = pickler.load(fd_param)
        fd_param.close()

        # Input is a single file
        files.append(inputfile)

        # Output FD will be a temporary file
        output_fd = tempfile.NamedTemporaryFile('w', delete=False)
    elif this_type == 'test':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_param = open(parameter_filename, 'r')
        these_overall_parameters = pickler.load(fd_param)
        fd_param.close()
        for opt, val in these_overall_parameters.items():
            overall_parameters[opt] = val

        # Input is a file with a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
        output_fd = open(folder + '/' + TESTING_FILENAME, 'w')

    gold_fd = None
    gold_filename = overall_parameters.get('gold_standard')
    if gold_filename is not None:
        gold_fd = open(gold_filename, 'w')

    for filename in files:
        if log:
            print >> sys.stderr, 'HOLDER: processing file', filename
        if isinstance(filename, KafNafParser):
            naf_obj = filename
        else:
            naf_obj = KafNafParser(filename)
        create_structures(naf_obj, filename)

        # Extract all the opinions that have a holder with a non-empty span
        opinions_per_sentence = defaultdict(list)
        num_opinions = 0
        for opinion in naf_obj.get_opinions():
            exp = opinion.get_expression()
            if exp is not None:
                p = exp.get_polarity()
                if p != 'NON-OPINIONATED':
                    holder = opinion.get_holder()
                    if holder is not None:
                        span = holder.get_span()
                        if span is not None:
                            span_ids = span.get_span_ids()
                            if len(span_ids) != 0:
                                sentence_id = get_sentence_id_for_opinion(naf_obj, opinion)
                                if sentence_id is not None:
                                    opinions_per_sentence[sentence_id].append(opinion)
                                    num_opinions += 1
        if log:
            print >> sys.stderr, '\tNum of opinions:', num_opinions

        if this_type == 'train':
            # For training, one sequence is created for every opinion (DSE),
            # so the same sentence can appear more than once
            sentences_with_opinions = set()
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    sentences_with_opinions.add(this_sentence)
                    create_sequence(naf_obj, this_type, this_sentence,
                                    overall_parameters, opinion, output=output_fd)
        elif this_type == 'tag':
            # Obtain the opinions per sentence from the detected DSEs
            opinions_per_sentence = defaultdict(list)
            for list_name_ids, list_words in detected_dse:
                list_ids = [v[v.rfind('#') + 1:] for v in list_name_ids]
                first_token = naf_obj.get_token(list_ids[0])
                sentence_for_opinion = first_token.get_sent()
                opinions_per_sentence[sentence_for_opinion].append(list_ids)
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for list_dse_token_ids in these_opinions:
                    create_sequence(naf_obj, this_type, this_sentence, overall_parameters,
                                    opinion=list_dse_token_ids, output=output_fd, log=log)
        elif this_type == 'test':
            # For testing, one sequence is created for every sentence that has opinions
            opinion_list = []
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    create_sequence(naf_obj, this_type, this_sentence,
                                    overall_parameters, opinion, output=output_fd)
                    opinion_list.append(opinion)
            # Create the gold standard data as well
            if gold_fd is not None:
                create_gold_standard_holder(naf_obj, opinion_list, gold_fd)

    if gold_fd is not None:
        gold_fd.close()
        print >> sys.stderr, 'Gold standard in the file %s' % gold_fd.name

    return output_fd.name

def main(inputfile, this_type, folder, overall_parameters={}, detected_dse={}, log=False):
    files = []
    output_fd = None
    if this_type == 'train':
        output_fd = open(folder + '/' + TRAINING_FILENAME, 'w')

        # Save the parameters
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_parameter = open(parameter_filename, 'w')
        pickler.dump(overall_parameters, fd_parameter, protocol=0)
        print>>sys.stderr, 'Parameters saved to file %s' % parameter_filename
        fd_parameter.close()

        # Input is a file with a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
    elif this_type == 'tag':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_param = open(parameter_filename, 'r')
        overall_parameters = pickler.load(fd_param)
        fd_param.close()

        # Input is a single file
        files.append(inputfile)

        # Output FD will be a temporary file
        output_fd = tempfile.NamedTemporaryFile('w', delete=False)
    elif this_type == 'test':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_param = open(parameter_filename, 'r')
        these_overall_parameters = pickler.load(fd_param)
        fd_param.close()
        for opt, val in these_overall_parameters.items():
            overall_parameters[opt] = val

        # Input is a file with a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
        output_fd = open(folder + '/' + TESTING_FILENAME, 'w')

    gold_fd = None
    gold_filename = overall_parameters.get('gold_standard')
    if gold_filename is not None:
        gold_fd = open(gold_filename, 'w')

    for filename in files:
        if log:
            print>>sys.stderr, 'TARGET: processing file', filename
        if isinstance(filename, KafNafParser):
            naf_obj = filename
        else:
            naf_obj = KafNafParser(filename)
        create_structures(naf_obj, filename)

        # Extract all the opinions that have a target with a non-empty span
        opinions_per_sentence = defaultdict(list)
        num_opinions = 0
        for opinion in naf_obj.get_opinions():
            exp = opinion.get_expression()
            if exp is not None:
                p = exp.get_polarity()
                if p != 'NON-OPINIONATED':
                    target = opinion.get_target()
                    if target is not None:
                        span = target.get_span()
                        if span is not None:
                            S = span.get_span_ids()
                            if len(S) != 0:
                                sentence_id = get_sentence_id_for_opinion(naf_obj, opinion)
                                if sentence_id is not None:
                                    opinions_per_sentence[sentence_id].append(opinion)
                                    num_opinions += 1
        if log:
            print>>sys.stderr, '\tNum of opinions:', num_opinions

        if this_type == 'train':
            # For training, one sequence is created for every opinion (DSE),
            # so the same sentence can appear more than once
            sentences_with_opinions = set()
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    sentences_with_opinions.add(this_sentence)
                    create_sequence(naf_obj, this_type, this_sentence,
                                    overall_parameters, opinion, output=output_fd)
        elif this_type == 'tag':
            # Obtain the opinions per sentence from the detected DSEs
            opinions_per_sentence = defaultdict(list)
            for list_name_ids, list_words in detected_dse:
                list_ids = [v[v.rfind('#') + 1:] for v in list_name_ids]
                first_token = naf_obj.get_token(list_ids[0])
                sentence_for_opinion = first_token.get_sent()
                opinions_per_sentence[sentence_for_opinion].append(list_ids)
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for list_dse_token_ids in these_opinions:
                    create_sequence(naf_obj, this_type, this_sentence, overall_parameters,
                                    opinion=list_dse_token_ids, output=output_fd, log=log)
        elif this_type == 'test':
            # For testing, one sequence is created for every sentence with opinions
            opinion_list = []
            for this_sentence, these_opinions in opinions_per_sentence.items():
                for opinion in these_opinions:
                    create_sequence(naf_obj, this_type, this_sentence,
                                    overall_parameters, opinion, output=output_fd)
                    opinion_list.append(opinion)
            if gold_fd is not None:
                create_gold_standard_target(naf_obj, opinion_list, gold_fd)

    if gold_fd is not None:
        gold_fd.close()
        print>>sys.stderr, 'Gold standard in the file %s' % gold_fd.name

    return output_fd.name

def nafobj(naffile_coref):
    from KafNafParserPy import KafNafParser
    return KafNafParser(naffile_coref)

def get_naf(input_filename):
    try:
        naf = KafNafParser(input_filename)
    except XMLSyntaxError:
        with open(input_filename) as input_file:
            input = input_file.read()
        if "<NAF" in input and "</NAF>" in input:
            # I'm guessing this should be a NAF file but something is wrong
            logger.exception("Error parsing NAF file")
            raise
        naf = KafNafParser(type="NAF")
        naf.set_version("3.0")
        naf.set_language("nl")
        naf.lang = "nl"
        naf.raw = input
        naf.set_raw(naf.raw)
    return naf

def create_training_sentences(folder_tag_in, folder_kaf_in, opinion_layers, non_opinion, folder_out):
    # Remove the output folder if it exists and create it again
    if os.path.exists(folder_out):
        shutil.rmtree(folder_out)
    os.mkdir(folder_out)

    total_sents_opi = total_sents_no_opi = 0
    for tag_file in glob.glob(os.path.join(folder_tag_in, '*.tag')):
        basename = os.path.basename(tag_file).replace('.tag', '')
        kaf_file = os.path.join(folder_kaf_in, basename + '.kaf')
        if os.path.exists(kaf_file):
            # From the tag file we extract the token ids for opinions and for non-opinionated
            opinion_wids = set()     # token ids annotated as opinions
            no_opinion_wids = set()  # token ids annotated as no opinions
            fd = open(tag_file, 'rb')
            for line in fd:
                fields = line.strip().split('\t')
                wid = fields[0]
                for opinion_idx in opinion_layers:
                    if fields[opinion_idx] == 'Opinion':
                        opinion_wids.add(wid)
                if non_opinion is not None and fields[non_opinion] == 'NON-OPINIONATED':
                    no_opinion_wids.add(wid)
            fd.close()

            # Obtain the sentences that are opinionated (positive) and not (negative)
            # The negatives are:
            #   If there are non-opinionated: just the non-opinionated
            #   If not --> all the rest that are not positive
            sentences = {}
            all_sent_ids = set()
            sent_for_token_id = {}
            kaf_obj = KafNafParser(kaf_file)
            for token in kaf_obj.get_tokens():
                token_id = token.get_id()
                sent_id = token.get_sent()
                token_value = token.get_text()
                if sent_id not in sentences:
                    sentences[sent_id] = []
                sentences[sent_id].append(token_value)
                all_sent_ids.add(sent_id)
                sent_for_token_id[token_id] = sent_id

            positive_sents = set()
            negative_sents = set()
            # Positive sents are the sentences for the opinion ids
            for token_id in opinion_wids:
                positive_sents.add(sent_for_token_id[token_id])
            # Negative sents
            if non_opinion is not None:
                # In this case the negatives are just the sentences of the no_opinion_wids
                for token_id in no_opinion_wids:
                    negative_sents.add(sent_for_token_id[token_id])
            else:
                # In this case the negatives are all the sentences but the positive ones
                negative_sents = all_sent_ids - positive_sents

            # Free some memory
            del opinion_wids
            del no_opinion_wids
            del kaf_obj

            # Store the results in the file
            output_file = os.path.join(folder_out, basename + '.sents')
            fd_out = open(output_file, 'w')
            fd_out.write('#' + tag_file + '\n')
            for sent_id in sorted(list(positive_sents)):
                text = ' '.join(sentences[sent_id])
                fd_out.write('+ ' + text.encode('utf-8') + '\n')
            for sent_id in sorted(list(negative_sents)):
                text = ' '.join(sentences[sent_id])
                fd_out.write('- ' + text.encode('utf-8') + '\n')
            fd_out.close()

            #print 'Processed ', basename
            #print '  Subjective sents:', len(positive_sents)
            #print '  Non subje. sents:', len(negative_sents)
            total_sents_opi += len(positive_sents)
            total_sents_no_opi += len(negative_sents)
        else:
            print 'KAF FILE NOT FOUND', kaf_file
    return total_sents_opi, total_sents_no_opi

from KafNafParserPy import KafNafParser
import sys

if __name__ == '__main__':
    # Load WordNet sense index
    synset_for_skey = {}
    path_to_index_sense = '/home/izquierdo/wordnets/wordnet-3.0/dict/index.sense'
    fd = open(path_to_index_sense)
    for line in fd:
        fields = line.split()
        synset_for_skey[fields[0]] = fields[1]
    fd.close()

    naf_obj = KafNafParser(sys.stdin)
    for term in naf_obj.get_terms():
        this_skey = None
        this_synset = None
        ref_skey = ref_synset = None
        for ext_ref in term.get_external_references():
            if ext_ref.get_reftype() == 'sense':
                this_skey = ext_ref.get_reference()
                ref_skey = ext_ref
            if ext_ref.get_reftype() == 'ilidef':
                this_synset = ext_ref.get_reference()
                ref_synset = ext_ref
        if this_synset == '':
            print >> sys.stderr, term.get_id()

if __name__ == '__main__':
    import glob

    #feature_file = 'my_feat_file'
    #fd = open(feature_file, 'w')
    #for kaf_file in glob.glob('/home/izquierdo/data/opinion_annotations_en/kaf/hotel/*.kaf'):
    #    print kaf_file
    #    knaf_obj = KafNafParser(kaf_file)
    #    extract_features_polarity_classifier_from_kaf(knaf_obj, fd)
    #fd.close()
    #print ' Feature file in ', feature_file
    #train_polarity_classifier(feature_file)

    kaf_obj = KafNafParser('dutch00011_f1b91e00bddbf62fbb35e4755e786406.kaf')
    list_terms = []
    list_ids = []
    for opinion in kaf_obj.get_opinions():
        exp = opinion.get_expression()
        pol = exp.get_polarity()
        if pol in ['Positive', 'Negative', 'StrongPositive', 'StrongNegative']:
            this_id = (opinion.get_id(), pol)
            ids = exp.get_span().get_span_ids()
            list_ids.append(this_id)
            list_terms.append(ids)

    index_filename = '/home/izquierdo/cltl_repos/opinion_miner_deluxe/check_me/polarity_classifier/index.features'
    model_filename = '/home/izquierdo/cltl_repos/opinion_miner_deluxe/check_me/polarity_classifier/model.svm'
    svm_path = '/home/izquierdo/bin/svm_classify'
    results = classify(kaf_obj, list_terms, index_filename, model_filename, svm_path)
    for n in range(len(results)):

def extract_all_features():
    train_files = load_training_files()
    logging.debug('Loaded ' + str(len(train_files)) + ' files')
    feat_folder = my_config_manager.get_feature_folder_name()
    label_feats = separator = None
    my_stdout, my_stderr = sys.stdout, sys.stderr

    rel_exp_tar_filename = my_config_manager.get_relation_exp_tar_training_filename()
    exp_tar_rel_fic = open(rel_exp_tar_filename, 'w')
    rel_exp_hol_filename = my_config_manager.get_relation_exp_hol_training_filename()
    exp_hol_rel_fic = open(rel_exp_hol_filename, 'w')

    ### LEXICON FROM THE DOMAIN
    expressions_lexicon = None
    targets_lexicon = None
    if my_config_manager.get_use_training_lexicons():
        # Create the lexicons
        # GUESS THE LANG:
        first_train_file = train_files[0]
        obj = KafNafParser(first_train_file)
        lang = obj.get_language()

        expression_lexicon_filename = my_config_manager.get_expression_lexicon_filename()
        target_lexicon_filename = my_config_manager.get_target_lexicon_filename()
        this_exp_lex = my_config_manager.get_use_this_expression_lexicon()
        this_tar_lex = my_config_manager.get_use_this_target_lexicon()

        if this_exp_lex is None or this_tar_lex is None:
            path_to_lex_creator = '/home/izquierdo/opener_repos/opinion-domain-lexicon-acquisition/acquire_from_annotated_data.py'
            training_filename = my_config_manager.get_file_training_list()
            lexicons_manager.create_lexicons(path_to_lex_creator, training_filename,
                                             expression_lexicon_filename, target_lexicon_filename)

        # Once created we have to copy the previous one in case:
        if this_exp_lex is not None:
            if "$LANG" in this_exp_lex:
                this_exp_lex = this_exp_lex.replace('$LANG', lang)
            shutil.copy(this_exp_lex, expression_lexicon_filename)
        if this_tar_lex is not None:
            if "$LANG" in this_tar_lex:
                this_tar_lex = this_tar_lex.replace('$LANG', lang)
            shutil.copy(this_tar_lex, target_lexicon_filename)

        expressions_lexicon = lexicons_manager.load_lexicon(expression_lexicon_filename)
        targets_lexicon = lexicons_manager.load_lexicon(target_lexicon_filename)

    this_propagation_lexicon = my_config_manager.get_propagation_lexicon_name()
    if this_propagation_lexicon is not None:
        if "$LANG" in this_propagation_lexicon:
            this_propagation_lexicon = this_propagation_lexicon.replace('$LANG', lang)
        print>>sys.stderr, 'Propagated lexicon', this_propagation_lexicon

    # Configuration for the relational classifier
    use_deps_now = my_config_manager.get_use_dependencies()
    use_toks_lems_now = my_config_manager.get_use_tokens_lemmas()
    accepted_opinions = my_config_manager.get_mapping_valid_opinions()
    use_dependencies_now = my_config_manager.get_use_dependencies()

    polarities_found_and_skipped = []
    for num_file, train_file in enumerate(train_files):
        logging.debug('Extracting features ' + os.path.basename(train_file))
        base_name = os.path.basename(train_file)
        out_file = os.path.join(feat_folder, 'file#' + str(num_file) + '#' + base_name + ".feat")
        err_file = out_file + '.log'

        # Creates the output file
        # Returns the labels for the features and the separator used
        if True:
            kaf_naf_obj = KafNafParser(train_file)
            label_feats, separator, pols_skipped_this = extract_features_from_kaf_naf_file(
                kaf_naf_obj, out_file, err_file,
                accepted_opinions=accepted_opinions,
                exp_lex=expressions_lexicon,
                tar_lex=targets_lexicon,
                propagation_lex_filename=this_propagation_lexicon)
            polarities_found_and_skipped.extend(pols_skipped_this)

            print>>exp_tar_rel_fic, '#' + train_file
            print>>exp_hol_rel_fic, '#' + train_file
            # SET valid_opinions to None to use all the possible opinions in the KAF file
            # for extracting relations
            create_rel_exp_tar_training(kaf_naf_obj, output=exp_tar_rel_fic,
                                        valid_opinions=accepted_opinions,
                                        use_dependencies=use_dependencies_now,
                                        use_tokens=use_toks_lems_now,
                                        use_lemmas=use_toks_lems_now)
            create_rel_exp_hol_training(kaf_naf_obj, output=exp_hol_rel_fic,
                                        valid_opinions=accepted_opinions,
                                        use_dependencies=use_dependencies_now,
                                        use_tokens=use_toks_lems_now,
                                        use_lemmas=use_toks_lems_now)
        if False:  # except Exception as e:
            sys.stdout, sys.stderr = my_stdout, my_stderr
            print>>sys.stderr, str(e), dir(e)
            pass

    # Show, just for information, how many instances have been skipped because
    # the polarity of the opinion expression was not allowed
    count = defaultdict(int)
    for exp_label in polarities_found_and_skipped:
        count[exp_label] += 1
    info = '\nOpinions skipped because the polarity label is not included in the configuration\n'
    info += 'Accepted opinions: ' + ' '.join(accepted_opinions.keys()) + '\n'
    info += 'Number of complete opinions skipped\n'
    for label, c in count.items():
        info += ' ' + label + ' :' + str(c) + '\n'
    info += '\n'
    logging.debug(info)

    # Re-set the stdout and stderr
    exp_tar_rel_fic.close()
    exp_hol_rel_fic.close()
    sys.stdout, sys.stderr = my_stdout, my_stderr

    # Save label_feats and separator in a file
    filename = my_config_manager.get_feature_desc_filename()
    fic = open(filename, 'w')
    fic.write(' '.join(label_feats) + '\n')
    fic.close()
    logging.debug('Description of features --> ' + filename)

def main(inputfile, type, folder, overall_parameters={}, log=False):
    files = []
    output_fd = None
    if type == 'train':
        if not os.path.isdir(folder):
            os.mkdir(folder)
        res_fol = os.path.join(folder, RESOURCES_FOLDER)
        if not os.path.isdir(res_fol):
            os.mkdir(res_fol)
        output_fd = open(folder + '/' + TRAINING_FILENAME, 'w')

        # Save the parameters
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_parameter = open(parameter_filename, 'w')
        pickler.dump(overall_parameters, fd_parameter, protocol=0)
        print('Parameters saved to file %s' % parameter_filename, file=sys.stderr)
        fd_parameter.close()

        # Input is a file with a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
    elif type == 'tag':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_param = open(parameter_filename, 'rb')
        try:
            overall_parameters = pickler.load(fd_param, encoding='bytes')
        except TypeError:
            overall_parameters = pickler.load(fd_param)
        fd_param.close()

        # Input is a single file
        files.append(inputfile)

        # Output FD will be a temporary file
        output_fd = tempfile.NamedTemporaryFile('w', delete=False)
    elif type == 'test':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_param = open(parameter_filename, 'r')
        these_overall_parameters = pickler.load(fd_param)
        fd_param.close()
        for opt, val in list(these_overall_parameters.items()):
            overall_parameters[opt] = val

        # Input is a file with a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
        output_fd = open(folder + '/' + TESTING_FILENAME, 'w')

    # if overall_parameters['use_mpqa_lexicon']:
    from mpqa_lexicon import MPQA_subjectivity_lexicon
    overall_parameters['mpqa_lexicon'] = MPQA_subjectivity_lexicon()

    if overall_parameters.get('use_wordnet_lexicon', False):
        from wordnet_lexicon import WordnetLexicon
        wordnet_lexicon_expression = WordnetLexicon()
        complete_wn_filename = os.path.join(folder, RESOURCES_FOLDER, WORDNET_LEXICON_FILENAME)
        if type == 'train':
            # We create it from the training files
            print('Creating WORDNET LEXICON FILE from %d files and storing it on %s'
                  % (len(files), complete_wn_filename), file=sys.stderr)
            wordnet_lexicon_expression.create_from_files(files, 'expression')
            wordnet_lexicon_expression.save_to_file(complete_wn_filename)
        else:
            # READ IT
            wordnet_lexicon_expression.load_from_file(complete_wn_filename)
        overall_parameters['wordnet_lexicon'] = wordnet_lexicon_expression

    gold_fd = None
    gold_filename = overall_parameters.get('gold_standard')
    if gold_filename is not None:
        gold_fd = open(gold_filename, 'w')

    # Processing every file
    for filename in files:
        if log:
            print('EXPRESSION: processing file', filename, file=sys.stderr)
        if isinstance(filename, KafNafParser):
            naf_obj = filename
        else:
            naf_obj = KafNafParser(filename)
        create_structures(naf_obj, filename)

        # Extract all the opinions
        opinions_per_sentence = defaultdict(list)
        num_opinions = 0
        for opinion in naf_obj.get_opinions():
            exp = opinion.get_expression()
            if exp is not None:
                p = exp.get_polarity()
                if p != 'NON-OPINIONATED':
                    sentence_id = get_sentence_id_for_opinion(naf_obj, opinion)
                    if sentence_id is not None:
                        opinions_per_sentence[sentence_id].append(opinion)
                        num_opinions += 1
        if log:
            print('\tNum of opinions:', num_opinions, file=sys.stderr)

        if type == 'train':
            ############################
            # One sequence per sentence
            ############################
            for sentence_id in naf_obj.list_sentence_ids:
                opinions_in_sent = opinions_per_sentence.get(sentence_id, [])
                if len(opinions_in_sent) != 0:
                    # Only sentences with opinions
                    create_sequence(naf_obj, sentence_id, overall_parameters,
                                    opinions_in_sent, output=output_fd)
        elif type == 'test':
            # TESTING CASE
            # For the testing, one sequence is created for every sentence
            for sentence_id in naf_obj.list_sentence_ids:
                opinions_in_sent = opinions_per_sentence.get(sentence_id, [])
                if len(opinions_in_sent) != 0:
                    # Only tested on sentences with opinions
                    create_sequence(naf_obj, sentence_id, overall_parameters,
                                    opinions_in_sent, output=output_fd)

            # Create the gold standard data as well
            opinion_list = []
            for this_sentence, these_opinions in list(opinions_per_sentence.items()):
                opinion_list.extend(these_opinions)
            if gold_fd is not None:
                create_gold_standard(naf_obj, opinion_list, gold_fd)
        elif type == 'tag':
            # TAGGING CASE
            # All the sentences are considered
            for sentence_id in naf_obj.list_sentence_ids:
                create_sequence(naf_obj, sentence_id, overall_parameters,
                                list_opinions=[], output=output_fd, log=log)

    if gold_fd is not None:
        gold_fd.close()
        print('Gold standard in the file %s' % gold_fd.name, file=sys.stderr)

    output_fd.close()
    return output_fd.name

def corenlp2naf(xml_bytes, annotators):
    """
    Convert CoreNLP XML output into a NAF bytestring
    """
    naf = KafNafParser(type="NAF")

    try:
        doc = Document(xml_bytes)
    except:
        log.exception("Error on parsing xml")
        raise

    terms = {}  # (xml_sentid, xml_tokenid) : term
    for sent in doc.sentences:
        for t in sent.tokens:
            wf = naf.create_wf(t.word, sent.id, t.character_offset_begin)
            term = naf.create_term(t.lemma, POSMAP[t.pos], t.pos, [wf])
            terms[sent.id, t.id] = term
            if t.ner not in (None, 'O'):
                naf.create_entity(t.ner, [term.get_id()])
        if sent.collapsed_ccprocessed_dependencies:
            dependencies = True
            for dep in sent.collapsed_ccprocessed_dependencies.links:
                if dep.type != 'root':
                    child = terms[sent.id, dep.dependent.idx]
                    parent = terms[sent.id, dep.governor.idx]
                    comment = "{t}({o}, {s})".format(s=child.get_lemma(), t=dep.type,
                                                     o=parent.get_lemma())
                    naf.create_dependency(child.get_id(), parent.get_id(), dep.type,
                                          comment=comment)

    if doc.coreferences:
        for coref in doc.coreferences:
            cterms = set()
            for m in coref.mentions:
                cterms |= {terms[m.sentence.id, t.id].get_id() for t in m.tokens}
            naf.create_coreference("term", cterms)

    for annotator in annotators:
        if annotator in LAYERMAP:
            naf.create_linguistic_processor(LAYERMAP[annotator],
                                            "CoreNLP {annotator}".format(**locals()),
                                            get_corenlp_version())
    s = BytesIO()
    naf.dump(s)
    return s.getvalue()

def get_naf_from_sentences(sentences):
    naf_obj = KafNafParser(type="NAF")
    naf_obj.set_version("3.0")
    naf_obj.set_language("nl")
    naf_obj.lang = "nl"
    naf_obj.raw = '\n'.join([' '.join(s) for s in sentences])
    naf_obj.set_raw(naf_obj.raw)

    # Create text layer
    wcount = 1
    offsets = {}
    txt = naf_obj.get_raw()
    token_ids = []
    for sid, sentence in enumerate(sentences):
        token_ids_sub = []
        for token in sentence:
            token_obj = KafNafParserPy.Cwf(type=naf_obj.get_type())
            token_id = 'w{}'.format(wcount)
            token_length = len(token)
            offsets[wcount] = txt.find(token, offsets.get(wcount - 1, 0))
            token_obj.set_id(token_id)
            token_obj.set_length(str(token_length))
            # token_obj.set_offset(str(offset))  # Is this correct????
            token_obj.set_para('1')
            token_obj.set_sent(str(sid + 1))
            token_obj.set_text(token)
            token_obj.set_offset(str(offsets[wcount]))
            token_ids_sub.append(token_id)
            wcount += 1
            naf_obj.add_wf(token_obj)
        token_ids.append(token_ids_sub)

    # Create term layer
    term_ids = []
    count_terms = 0
    for sid, (sentence, token_ids_sub) in enumerate(zip(sentences, token_ids)):
        term_ids_sub = []
        logger.info('Creating the term layer...')
        for num_token, (token, token_id) in enumerate(zip(sentence, token_ids_sub)):
            new_term_id = 't_' + str(count_terms)
            count_terms += 1
            term_ids_sub.append(new_term_id)
            term_obj = KafNafParserPy.Cterm(type=naf_obj.get_type())
            term_obj.set_id(new_term_id)
            new_span = KafNafParserPy.Cspan()
            new_span.create_from_ids([token_id])
            term_obj.set_span(new_span)
            naf_obj.add_term(term_obj)
        term_ids.append(term_ids_sub)
    return naf_obj, term_ids

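# Usage sketch (the example sentences are hypothetical): build a NAF object from
# pre-tokenized sentences; the function returns the object plus the term ids per
# sentence, and the result can be serialized with dump().
sentences = [["Dit", "is", "een", "test"], ["Nog", "een", "zin"]]
naf_obj, term_ids = get_naf_from_sentences(sentences)
naf_obj.dump()
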
        map[fields[0]] = fields[1]
    fic.close()
    return map


if __name__ == '__main__':
    this_folder = os.path.dirname(os.path.realpath(__file__))

    if sys.stdin.isatty():
        print>>sys.stderr, 'Input stream required.'
        print>>sys.stderr, 'Example usage: cat myUTF8file.kaf |', sys.argv[0]
        sys.exit(-1)

    input_obj = KafNafParser(sys.stdin)
    my_lang = input_obj.get_language()

    complete_path_to_treetagger = find_treetagger()
    if complete_path_to_treetagger is None:
        print>>sys.stderr, 'Treetagger could not be found. You need to specify where treetagger is installed in 2 ways:'
        print>>sys.stderr, '\t1) Update the TREE_TAGGER_PATH variable in the file lib/__init__.py'
        print>>sys.stderr, '\t2) Update your TREE_TAGGER_PATH environment variable'
        sys.exit(0)

    # In the latest version of treetagger the command names have been changed from
    # X-utf8 to just X, e.g. /cmd/tree-tagger-english-utf8 ==> /cmd/tree-tagger-english
    # This could be a problem in case another version of treetagger is being used.
    if my_lang == 'en':
        treetagger_cmd = complete_path_to_treetagger + '/cmd/tree-tagger-english'

def _get_text(a, to_naf=False, lang='nl'):
    result = "\n\n".join([_normalize(a[x]) for x in ('headline', 'text')])
    if to_naf:
        naf = KafNafParser(type="NAF")
        naf.header = CHeader(type=naf.type)
        naf.root.insert(0, naf.header.get_node())
        naf.set_language(lang)
        naf.set_raw(result)
        naf.set_version("3.0")

        fd = CfileDesc()
        if 'author' in a:
            fd.set_author(a['author'])
        if 'headline' in a:
            fd.set_title(a['headline'])
        if 'date' in a:
            fd.set_creationtime(a['date'])
        if 'medium' in a:
            fd.set_magazine(a['medium'])
        if 'page' in a:
            fd.set_pages(str(a['page']))
        if 'section' in a:
            fd.set_section(a['section'])
        naf.header.set_fileDesc(fd)
        naf.header.set_publicId(a['uuid'])
        # if 'url' in a:
        #     naf.header.set_uri(a['url'])

        b = BytesIO()
        naf.dump(b)
        result = b.getvalue().decode("utf-8")
    return result

from KafNafParserPy import KafNafParser, Clp, Crole, Cspan, Cpredicate
from KafNafParserPy.span_data import Ctarget
import sys
import datetime
import time
import pprint
import re

# Make sure you get the order of the input files right
nafinput = sys.argv[1]
timblpredictions = sys.argv[2]

my_parser = KafNafParser(nafinput)

# Create header info
lp = Clp()
lp.set_name('SoNaR-News-trained-SRL')
lp.set_version('1.1')
lp.set_timestamp()
my_parser.add_linguistic_processor('srl', lp)

# If the naf file already contains predicates, store those to make sure
# you don't overwrite them or create new predicate elements for existing predicates
roles = []
predicate_spans = []
for predicate in my_parser.get_predicates():
    for role in predicate.get_roles():
        role_id = role.get_id()

def main(inputfile, type, folder, overall_parameters={}, log=False):
    files = []
    output_fd = None
    if type == 'train':
        if not os.path.isdir(folder):
            os.mkdir(folder)
        res_fol = os.path.join(folder, RESOURCES_FOLDER)
        if not os.path.isdir(res_fol):
            os.mkdir(res_fol)
        output_fd = open(folder + '/' + TRAINING_FILENAME, 'w')

        # Save the parameters
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_parameter = open(parameter_filename, 'w')
        pickler.dump(overall_parameters, fd_parameter, protocol=0)
        print>>sys.stderr, 'Parameters saved to file %s' % parameter_filename
        fd_parameter.close()

        # Input is a file with a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
    elif type == 'tag':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_param = open(parameter_filename, 'r')
        overall_parameters = pickler.load(fd_param)
        fd_param.close()

        # Input is a single file
        files.append(inputfile)

        # Output FD will be a temporary file
        output_fd = tempfile.NamedTemporaryFile('w', delete=False)
    elif type == 'test':
        parameter_filename = os.path.join(folder, PARAMETERS_FILENAME)
        fd_param = open(parameter_filename, 'r')
        these_overall_parameters = pickler.load(fd_param)
        fd_param.close()
        for opt, val in these_overall_parameters.items():
            overall_parameters[opt] = val

        # Input is a file with a list of files
        fin = open(inputfile, 'r')
        for line in fin:
            files.append(line.strip())
        fin.close()
        output_fd = open(folder + '/' + TESTING_FILENAME, 'w')

    # if overall_parameters['use_mpqa_lexicon']:
    from mpqa_lexicon import MPQA_subjectivity_lexicon
    overall_parameters['mpqa_lexicon'] = MPQA_subjectivity_lexicon()

    if overall_parameters.get('use_wordnet_lexicon', False):
        from wordnet_lexicon import WordnetLexicon
        wordnet_lexicon_expression = WordnetLexicon()
        complete_wn_filename = os.path.join(folder, RESOURCES_FOLDER, WORDNET_LEXICON_FILENAME)
        if type == 'train':
            # We create it from the training files
            print>>sys.stderr, 'Creating WORDNET LEXICON FILE from %d files and storing it on %s' % (len(files), complete_wn_filename)
            wordnet_lexicon_expression.create_from_files(files, 'expression')
            wordnet_lexicon_expression.save_to_file(complete_wn_filename)
        else:
            # READ IT
            wordnet_lexicon_expression.load_from_file(complete_wn_filename)
        overall_parameters['wordnet_lexicon'] = wordnet_lexicon_expression

    gold_fd = None
    gold_filename = overall_parameters.get('gold_standard')
    if gold_filename is not None:
        gold_fd = open(gold_filename, 'w')

    # Processing every file
    for filename in files:
        if log:
            print>>sys.stderr, 'EXPRESSION: processing file', filename
        if isinstance(filename, KafNafParser):
            naf_obj = filename
        else:
            naf_obj = KafNafParser(filename)
        create_structures(naf_obj, filename)

        # Extract all the opinions
        opinions_per_sentence = defaultdict(list)
        num_opinions = 0
        for opinion in naf_obj.get_opinions():
            exp = opinion.get_expression()
            if exp is not None:
                p = exp.get_polarity()
                if p != 'NON-OPINIONATED':
                    sentence_id = get_sentence_id_for_opinion(naf_obj, opinion)
                    if sentence_id is not None:
                        opinions_per_sentence[sentence_id].append(opinion)
                        num_opinions += 1
        if log:
            print>>sys.stderr, '\tNum of opinions:', num_opinions

        if type == 'train':
            ############################
            # One sequence per sentence
            ############################
            for sentence_id in naf_obj.list_sentence_ids:
                opinions_in_sent = opinions_per_sentence.get(sentence_id, [])
                if len(opinions_in_sent) != 0:
                    # Only sentences with opinions
                    create_sequence(naf_obj, sentence_id, overall_parameters,
                                    opinions_in_sent, output=output_fd)
        elif type == 'test':
            # TESTING CASE
            # For the testing, one sequence is created for every sentence
            for sentence_id in naf_obj.list_sentence_ids:
                opinions_in_sent = opinions_per_sentence.get(sentence_id, [])
                if len(opinions_in_sent) != 0:
                    # Only tested on sentences with opinions
                    create_sequence(naf_obj, sentence_id, overall_parameters,
                                    opinions_in_sent, output=output_fd)

            # Create the gold standard data as well
            opinion_list = []
            for this_sentence, these_opinions in opinions_per_sentence.items():
                opinion_list.extend(these_opinions)
            if gold_fd is not None:
                create_gold_standard(naf_obj, opinion_list, gold_fd)
        elif type == 'tag':
            # TAGGING CASE
            # All the sentences are considered
            for sentence_id in naf_obj.list_sentence_ids:
                create_sequence(naf_obj, sentence_id, overall_parameters,
                                list_opinions=[], output=output_fd, log=log)

    if gold_fd is not None:
        gold_fd.close()
        print>>sys.stderr, 'Gold standard in the file %s' % gold_fd.name

    output_fd.close()
    return output_fd.name

def create_training_sentences(folder_tag_in, folder_kaf_in, opinion_layers, non_opinion, folder_out):
    #Remove the output folder if it exists and create it again
    if os.path.exists(folder_out):
        shutil.rmtree(folder_out)
    os.mkdir(folder_out)

    total_sents_opi = total_sents_no_opi = 0
    for tag_file in glob.glob(os.path.join(folder_tag_in, '*.tag')):
        basename = os.path.basename(tag_file).replace('.tag', '')
        kaf_file = os.path.join(folder_kaf_in, basename + '.kaf')
        if os.path.exists(kaf_file):
            ##From the tag file we extract the token ids for opinions and for non-opinionated
            opinion_wids = set()     #token ids annotated as opinions
            no_opinion_wids = set()  #token ids annotated as no opinions
            fd = open(tag_file, 'rb')
            for line in fd:
                fields = line.strip().split('\t')
                wid = fields[0]
                for opinion_idx in opinion_layers:
                    if fields[opinion_idx] == 'Opinion':
                        opinion_wids.add(wid)
                if non_opinion is not None and fields[non_opinion] == 'NON-OPINIONATED':
                    no_opinion_wids.add(wid)
            fd.close()

            #########
            # Obtain the sentences that are opinionated (positive) and not (negative)
            # The negatives are:
            #   If there are non-opinionated tokens: just the non-opinionated sentences
            #   If not --> all the rest that are not positive
            #####
            sentences = {}
            all_sent_ids = set()
            sent_for_token_id = {}
            kaf_obj = KafNafParser(kaf_file)
            for token in kaf_obj.get_tokens():
                token_id = token.get_id()
                sent_id = token.get_sent()
                token_value = token.get_text()
                if sent_id not in sentences:
                    sentences[sent_id] = []
                sentences[sent_id].append(token_value)
                all_sent_ids.add(sent_id)
                sent_for_token_id[token_id] = sent_id

            positive_sents = set()
            negative_sents = set()
            ##Positive sents are the sentences of the opinion ids
            for token_id in opinion_wids:
                positive_sents.add(sent_for_token_id[token_id])

            #Negative sents
            if non_opinion is not None:
                #In this case the negatives are just the sentences of the no_opinion_wids
                for token_id in no_opinion_wids:
                    negative_sents.add(sent_for_token_id[token_id])
            else:
                #In this case the negatives are all the sentences but the positive ones
                negative_sents = all_sent_ids - positive_sents

            #Free some memory
            del opinion_wids
            del no_opinion_wids
            del kaf_obj

            ##Store the results in the file
            output_file = os.path.join(folder_out, basename + '.sents')
            fd_out = open(output_file, 'w')
            fd_out.write('#' + tag_file + '\n')
            for sent_id in sorted(list(positive_sents)):
                text = ' '.join(sentences[sent_id])
                fd_out.write('+ ' + text.encode('utf-8') + '\n')
            for sent_id in sorted(list(negative_sents)):
                text = ' '.join(sentences[sent_id])
                fd_out.write('- ' + text.encode('utf-8') + '\n')
            fd_out.close()

            #print 'Processed ',basename
            #print '  Subjective sents:',len(positive_sents)
            #print '  Non subje. sents:',len(negative_sents)
            total_sents_opi += len(positive_sents)
            total_sents_no_opi += len(negative_sents)
        else:
            print 'KAF FILE NOT FOUND', kaf_file
    return total_sents_opi, total_sents_no_opi
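# --- Illustrative usage sketch (not part of the original code) ---
# create_training_sentences() expects a folder of .tag files, a folder with the
# matching .kaf files, the column indexes that carry the 'Opinion' annotation,
# the (optional) column index of the NON-OPINIONATED layer, and an output
# folder. All paths and column indexes below are hypothetical assumptions.
def _example_create_sentences():
    n_opi, n_no_opi = create_training_sentences('annotations_tag', 'corpus_kaf',
                                                opinion_layers=[1, 2],
                                                non_opinion=3,
                                                folder_out='sentence_data')
    print>>sys.stderr, 'Opinionated sentences: %d, non-opinionated: %d' % (n_opi, n_no_opi)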
def train(self, list_training_files, out_folder):
    self.folder = out_folder
    os.mkdir(self.folder)
    print('Creating output folder %s' % self.folder)
    training_fd = open(os.path.join(self.folder, TRAIN_FILE), 'w')
    for this_file in list_training_files:
        print('\tEncoding training file %s' % this_file)
        this_obj = KafNafParser(this_file)
        num_pos = num_neg = 0
        for opinion in this_obj.get_opinions():
            opinion_expression = opinion.get_expression()
            polarity = opinion_expression.get_polarity()
            span_obj = opinion_expression.get_span()
            if span_obj is None:
                continue
            list_term_ids = span_obj.get_span_ids()
            features = self.extract_features(this_obj, list_term_ids)
            int_features = self.encode_string_features(features, update_index=True)  #Map feat index --> frequency
            if len(int_features) != 0:
                this_class = None
                if self.is_positive(polarity):
                    this_class = '+1'
                    num_pos += 1
                elif self.is_negative(polarity):
                    this_class = '-1'
                    num_neg += 1
                if this_class is not None:
                    self.write_example_to_file(training_fd, this_class, int_features)
        #END FOR
        print('\t\tNum positive examples: %d' % num_pos)
        print('\t\tNum negative examples: %d' % num_neg)
    training_fd.close()
    print('Training file at %s' % training_fd.name)

    ##RUN THE TRAINING
    training_cmd = [SVM_LEARN]
    training_cmd.append(training_fd.name)
    whole_model_file = os.path.join(self.folder, MODEL_FILE)
    training_cmd.append(whole_model_file)
    ret_code = check_call(training_cmd)
    print('Training done on %s with code %d' % (whole_model_file, ret_code))

    #Save also the index
    whole_index_file = os.path.join(self.folder, INDEX_FILE)
    index_fd = open(whole_index_file, 'wb')
    pickle.dump(self.index_features, index_fd, -1)
    index_fd.close()
    print('Feature index saved to %s with %d features' % (whole_index_file, len(self.index_features)))
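# --- Illustrative usage note (not part of the original code) ---
# train() above is a method of a classifier class whose definition is not shown
# in this snippet. It assumes the class provides extract_features(),
# encode_string_features(), is_positive(), is_negative(),
# write_example_to_file() and an index_features attribute, and that the
# SVM_LEARN, TRAIN_FILE, MODEL_FILE and INDEX_FILE constants are defined at
# module level. The out_folder passed in must not exist yet, since the method
# calls os.mkdir() on it before writing the SVM-Light training file, the model
# and the pickled feature index into it.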
def extract_all_features():
    train_files = load_training_files()
    logging.debug('Loaded ' + str(len(train_files)) + ' files')
    feat_folder = my_config_manager.get_feature_folder_name()
    label_feats = separator = None
    my_stdout, my_stderr = sys.stdout, sys.stderr

    rel_exp_tar_filename = my_config_manager.get_relation_exp_tar_training_filename()
    exp_tar_rel_fic = open(rel_exp_tar_filename, 'w')

    rel_exp_hol_filename = my_config_manager.get_relation_exp_hol_training_filename()
    exp_hol_rel_fic = open(rel_exp_hol_filename, 'w')

    filename_features_polarity_classifier = my_config_manager.get_filename_features_polarity_classifier()
    fd_filename_features_polarity_classifier = open(filename_features_polarity_classifier, 'w')

    ##Configuration for the relational classifier
    use_these_lexicons = []
    use_deps_now = my_config_manager.get_use_dependencies()
    use_toks_lems_now = my_config_manager.get_use_tokens_lemmas()
    #accepted_opinions = my_config_manager.get_mapping_valid_opinions(map_all_to_this=OPINION_EXPRESSION)
    accepted_opinions = my_config_manager.get_mapping_valid_opinions(map_all_to_this=None)
    mapping_positive_negative = my_config_manager.get_mapping_valid_opinions()
    use_dependencies_now = my_config_manager.get_use_dependencies()

    polarities_found_and_skipped = []
    for num_file, train_file in enumerate(train_files):
        logging.debug('Extracting features ' + os.path.basename(train_file))
        base_name = os.path.basename(train_file)
        out_file = os.path.join(feat_folder, 'file#' + str(num_file) + '#' + base_name + ".feat")
        err_file = out_file + '.log'
        kaf_naf_obj = KafNafParser(train_file)
        print>>sys.stderr, 'Extracting features from', train_file
        if num_file == 0:
            #The first time we load the lexicons
            lang = kaf_naf_obj.get_language()
            use_these_lexicons = load_lexicons(my_config_manager, lang)

        label_feats, separator, pols_skipped_this = extract_features_from_kaf_naf_file(kaf_naf_obj, out_file, err_file,
                                                                                        accepted_opinions=accepted_opinions,
                                                                                        lexicons=use_these_lexicons)
        polarities_found_and_skipped.extend(pols_skipped_this)

        print>>exp_tar_rel_fic, '#' + train_file
        print>>exp_hol_rel_fic, '#' + train_file

        #Set valid_opinions to None to use all the possible opinions in the KAF file for extracting relations
        #Set valid_opinions = accepted_opinions for filtering
        '''
        create_rel_exp_tar_training(kaf_naf_obj, output=exp_tar_rel_fic, valid_opinions=None,
                                    use_dependencies=use_dependencies_now, use_tokens=use_toks_lems_now,
                                    use_lemmas=use_toks_lems_now, log=err_file)

        create_rel_exp_hol_training(kaf_naf_obj, output=exp_hol_rel_fic, valid_opinions=None,
                                    use_dependencies=use_dependencies_now, use_tokens=use_toks_lems_now,
                                    use_lemmas=use_toks_lems_now)
        '''

        ##Extract features for the polarity classifier
        #for mpqa there will be no polarity classifier
        #extract_features_polarity_classifier_from_kaf(kaf_naf_obj, fd_filename_features_polarity_classifier, mapping_positive_negative)

    fd_filename_features_polarity_classifier.close()

    ##Show, just for information, how many instances have been skipped because the polarity of the opinion expression was not allowed
    count = defaultdict(int)
    for exp_label in polarities_found_and_skipped:
        count[exp_label] += 1

    info = '\nOpinions skipped because the polarity label is not included in the configuration\n'
    info += 'Accepted opinions: ' + ' '.join(accepted_opinions.keys()) + '\n'
    info += 'Number of complete opinions skipped\n'
    for label, c in count.items():
        info += '  ' + label + ' :' + str(c) + '\n'
    info += '\n'
    logging.debug(info)

    ###################################################
    #Re-set the stdout and stderr
    exp_tar_rel_fic.close()
    exp_hol_rel_fic.close()
    sys.stdout, sys.stderr = my_stdout, my_stderr

    #Save label_feats and separator in a file
    filename = my_config_manager.get_feature_desc_filename()
    fic = open(filename, 'w')
    fic.write(' '.join(label_feats) + '\n')
    fic.close()
    logging.debug('Description of features --> ' + filename)
def corenlp2naf(xml_bytes, annotators):
    """
    Convert CoreNLP XML output into a NAF document and return it serialized as bytes
    """
    naf = KafNafParser(type="NAF")

    try:
        doc = Document(xml_bytes)
    except:
        log.exception("Error on parsing xml")
        raise

    terms = {}  # (xml_sentid, xml_tokenid) : term
    for sent in doc.sentences:
        for t in sent.tokens:
            wf = naf.create_wf(t.word, sent.id, t.character_offset_begin)
            term = naf.create_term(t.lemma, POSMAP[t.pos], t.pos, [wf])
            terms[sent.id, t.id] = term
            if t.ner not in (None, 'O'):
                naf.create_entity(t.ner, [term.get_id()])
        if sent.collapsed_ccprocessed_dependencies:
            dependencies = True
            for dep in sent.collapsed_ccprocessed_dependencies.links:
                if dep.type != 'root':
                    child = terms[sent.id, dep.dependent.idx]
                    parent = terms[sent.id, dep.governor.idx]
                    comment = "{t}({o}, {s})".format(s=child.get_lemma(), t=dep.type, o=parent.get_lemma())
                    naf.create_dependency(child.get_id(), parent.get_id(), dep.type, comment=comment)
    if doc.coreferences:
        for coref in doc.coreferences:
            cterms = set()
            for m in coref.mentions:
                cterms |= {terms[m.sentence.id, t.id].get_id() for t in m.tokens}
            naf.create_coreference("term", cterms)
    for annotator in annotators:
        if annotator in LAYERMAP:
            naf.create_linguistic_processor(LAYERMAP[annotator], "CoreNLP {annotator}".format(**locals()),
                                            get_corenlp_version())
    s = BytesIO()
    naf.dump(s)
    return s.getvalue()
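# --- Illustrative usage sketch (not part of the original code) ---
# corenlp2naf() takes the raw CoreNLP XML output plus the list of annotators
# that were requested, and returns the serialized NAF document. The file names
# 'corenlp_output.xml' and 'output.naf' are hypothetical; annotators not listed
# in LAYERMAP are simply not recorded as linguistic processors.
def _example_corenlp_conversion():
    with open('corenlp_output.xml', 'rb') as f:
        xml_bytes = f.read()
    naf_bytes = corenlp2naf(xml_bytes, annotators=["tokenize", "pos", "lemma"])
    with open('output.naf', 'wb') as f:
        f.write(naf_bytes)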
def process_file(this_file, token_freq):
    xml_obj = KafNafParser(this_file)
    print>>sys.stderr, 'Processing file', this_file

    token_for_wid = {}
    order_for_wid = {}
    opinion_expressions = []
    opinion_targets = []
    whole_text = ' '
    for n, token in enumerate(xml_obj.get_tokens()):
        text = token.get_text().lower()
        token_freq[text] += 1
        token_for_wid[token.get_id()] = text
        order_for_wid[token.get_id()] = n
        whole_text += text + ' '

    wids_for_tid = {}
    lemma_for_wid = {}
    pos_for_wid = {}
    for term in xml_obj.get_terms():
        tid = term.get_id()
        wids = term.get_span().get_span_ids()
        wids_for_tid[tid] = wids
        for wid in wids:
            lemma_for_wid[wid] = term.get_lemma()
            pos_for_wid[wid] = term.get_pos()

    ##Properties
    aspects = []  ## [(label, term_span) ...]
    for property in xml_obj.get_properties():
        for refs in property.get_references():
            for span in refs:
                aspects.append((property.get_type(), span.get_span_ids()))

    already_counted = {EXP: set(), TAR: set()}
    for opinion in xml_obj.get_opinions():
        for this_type, opinion_obj in [(EXP, opinion.get_expression()), (TAR, opinion.get_target())]:
            if opinion_obj is None:
                continue
            if this_type is EXP and opinion_obj.get_polarity() == 'NON-OPINIONATED':
                continue
            span = opinion_obj.get_span()
            if span is not None:
                list_wids = []
                for tid in span.get_span_ids():
                    list_wids.extend(wids_for_tid.get(tid, []))
                ##Sorted according to the order of the tokens
                list_wids.sort(key=lambda wid: order_for_wid[wid])
                string_wids = '#'.join(list_wids)
                opinion_tokens = ' '.join(token_for_wid[wid] for wid in list_wids)
                opinion_lemmas = ' '.join(lemma_for_wid[wid] for wid in list_wids)
                opinion_pos = ' '.join(pos_for_wid[wid] for wid in list_wids)
                if string_wids not in already_counted[this_type]:
                    if this_type == EXP:
                        polarity = (opinion_obj.get_polarity()).lower()
                        opinion_expressions.append((opinion_tokens, polarity, opinion_lemmas, opinion_pos))
                    else:
                        ##Calculate the aspect type
                        possible_aspects = []
                        target_ids = span.get_span_ids()
                        for aspect_label, aspect_span in aspects:
                            num_in_common = len(set(target_ids) & set(aspect_span))
                            if num_in_common != 0:
                                possible_aspects.append((aspect_label, num_in_common, len(aspect_span)))
                        aspect_for_target = 'unknown'
                        if len(possible_aspects) != 0:
                            ##Sort by the number in common first, and by the length of the aspect second
                            aspect_for_target = sorted(possible_aspects, key=lambda t: (t[1], t[2]), reverse=True)[0][0]
                        opinion_targets.append((opinion_tokens, aspect_for_target, opinion_lemmas, opinion_pos))
                    already_counted[this_type].add(string_wids)

    del xml_obj
    print>>sys.stderr, '\tNumber of opinion expressions:', len(opinion_expressions)
    print>>sys.stderr, '\tNumber of opinion targets:', len(opinion_targets)
    print>>sys.stderr, '\tNumber of characters of the text:', len(whole_text)
    return opinion_expressions, opinion_targets, whole_text
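# --- Illustrative usage sketch (not part of the original code) ---
# process_file() updates the token_freq counter passed in and returns the
# opinion expressions, opinion targets and the concatenated lowercased text of
# one KAF/NAF file. The input file name 'review.kaf' is a hypothetical example.
def _example_process_single_review():
    from collections import defaultdict
    token_freq = defaultdict(int)
    expressions, targets, text = process_file('review.kaf', token_freq)
    for tokens, polarity, lemmas, pos in expressions:
        print>>sys.stderr, polarity, ':', tokens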
if __name__ == '__main__':
    files = []
    fd = open('nl.list.test')
    for line in fd:
        files.append(line.strip())
    fd.close()

    my_polarity_classifier = PolarityClassifier('nl')
    my_polarity_classifier.load_models(sys.argv[1])

    OK = WR = 1
    for example_file in files:
        this_obj = KafNafParser(example_file)
        my_polarity_classifier.classify_kaf_naf_object(this_obj)
        this_obj.dump()
        break

        GOLD = {}
        list_ids_term_ids = []
        for opinion in this_obj.get_opinions():
            op_exp = opinion.get_expression()
            polarity = op_exp.get_polarity()
            term_ids = op_exp.get_span().get_span_ids()
            list_ids_term_ids.append((opinion.get_id(), term_ids))
            GOLD[opinion.get_id()] = polarity
# Author: Marieke van Erp ([email protected])
# Date: 27 September 2014
#
# Update 23 February 2015: better constituent extraction for feature generation
# with help from Ruben Izquierdo

from KafNafParserPy import KafNafParser
import re
import sys
from collections import OrderedDict
import codecs

input = sys.stdin
my_parser = KafNafParser(input)

### We first need a list of the predicates that we want to create feature vectors for
predicates = {}
for term_obj in my_parser.get_terms():
    predicate = re.match("WW", term_obj.get_morphofeat())
    if predicate is not None:
        predicates[term_obj.get_id()] = term_obj.get_pos()
        #print term_obj.get_id(), term_obj.get_morphofeat(), term_obj.get_lemma()

# We need the dependencies to find out the structure of the argument patterns
# and also to know which verbs are auxiliary verbs and which ones are main verbs
dependencies = {}
for dep_obj in my_parser.get_dependencies():
    relparts = dep_obj.get_function().split('/')
    rel_from = relparts[0]