def test_corenlp2naf():
    xml = open(os.path.join(os.path.dirname(__file__), "test_corenlp.xml")).read()
    naf_bytes = corenlp.corenlp2naf(xml, corenlp.PARSER)
    naf = KafNafParser(BytesIO(naf_bytes))

    terms = {t.get_id(): t.get_lemma() for t in naf.get_terms()}
    assert_equal(set(terms.values()),
                 {"John", "attack", "I", "in", "London", "hit", "he", "back", "."})
    london = [t for t in naf.get_terms() if t.get_lemma() == 'London'][0]
    assert_equal(london.get_pos(), 'R')
    assert_equal(london.get_morphofeat(), 'NNP')

    ents = {}
    for e in naf.get_entities():
        for ref in e.get_references():
            for term_id in ref.get_span().get_span_ids():
                ents[terms[term_id]] = e.get_type()
    assert_equal(ents, {"John": "PERSON", "London": "LOCATION"})

    deps = {terms[d.get_from()]: (d.get_function(), terms[d.get_to()])
            for d in naf.get_dependencies()}
    expected = {'I': ('nsubj', 'hit'),
                'John': ('nsubj', 'attack'),
                'London': ('prep_in', 'attack'),
                'back': ('advmod', 'hit'),
                'he': ('dobj', 'hit')}
    assert_equal(deps, expected)

    corefs = []
    for coref in naf.get_corefs():
        corefs.append(set())
        for span in coref.get_spans():
            corefs[-1] |= {terms[t] for t in span.get_span_ids()}
    assert_in({"John", "he"}, corefs)
def test_create_terms():
    """
    Can we create_terms via the create_{term,token} functions?
    """
    naf = KafNafParser(type="NAF")
    input = [(u'dit', u'dit', u'O', u'VNW'),
             (u'is', u'zijn', u'V', u'WW'),
             (u'een', u'een', u'D', u'LID'),
             (u'test', u'test', u'N', u'N')]
    offset = 0
    for (word, lemma, pos, morph) in input:
        token = naf.create_wf(word, 1, offset)
        offset += len(word)
        term = naf.create_term(lemma, pos, morph, [token])

    tokens = {t.get_id(): t for t in naf.get_tokens()}
    assert_equal(len(tokens), 4)

    result = {}
    for term in naf.get_terms():
        for token_id in term.get_span().get_span_ids():
            token = tokens[token_id]
            result[term.get_id()] = (token.get_text(), term.get_lemma(),
                                     term.get_pos(), term.get_morphofeat())
    result = [result[tid] for tid in sorted(result.keys())]
    assert_equal(input, result)
def process_single_file(self, file):
    try:
        xml_obj = KafNafParser(file)
    except:
        print >> sys.stderr, 'Error parsing', file, ': skipped'
        return

    print >> sys.stderr, 'Processing file', os.path.basename(file), 'Type:', xml_obj.get_type()
    self.langs[xml_obj.get_language()] += 1

    sentences = []
    current_sent = []
    this_sent = None
    pos_for_wid = {}  ## For each token id (wid) the pos of it
    for term in xml_obj.get_terms():
        w_ids = term.get_span().get_span_ids()
        pos = term.get_pos()
        for wid in w_ids:
            pos_for_wid[wid] = pos

    for token in xml_obj.get_tokens():
        wid = token.get_id()
        value = token.get_text()
        if self.convert_to_lowercase:
            value = value.lower()
        if value in self.punctuation:
            value = 'PUN'
        if value == '*':
            value = 'STAR'
        sentence = token.get_sent()
        if this_sent is not None and sentence != this_sent:
            ## There is a new sent
            sentences.append(current_sent)
            current_sent = []
        current_sent.append((wid, value))
        this_sent = sentence
    ## Add the last sentence as well
    sentences.append(current_sent)

    for sentence in sentences:
        if self.include_sentence_delimiters:
            sentence.insert(0, ('xxx', '<S>'))
            sentence.append(('xxx', '</S>'))
        for idx in range(0, len(sentence)):
            for ngramlen in range(self.min_ngram_len, self.max_ngram_len + 1):
                file_desc = self.get_file_desc_for_ngram(ngramlen)
                start = idx
                end = start + ngramlen
                if end <= len(sentence):
                    this_ngram = '\t'.join(value for wid, value in sentence[start:end])
                    this_ngram_pos = '\t'.join(pos_for_wid.get(wid, 'X')
                                               for wid, value in sentence[start:end])
                    file_desc.write(this_ngram.encode('utf-8') + '\t' + DELIMITER + '\t' + this_ngram_pos + '\n')
def from_naf(self, article, naf):
    def _int(x):
        return None if x is None else int(x)

    naf = KafNafParser(BytesIO(naf.encode("utf-8")))
    deps = {dep.get_to(): (dep.get_function(), dep.get_from())
            for dep in naf.get_dependencies()}
    tokendict = {token.get_id(): token for token in naf.get_tokens()}

    for term in naf.get_terms():
        tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
        for token in tokens:
            tid = term.get_id()
            tok = {"aid": article,
                   "token_id": token.get_id(),
                   "offset": _int(token.get_offset()),
                   "sentence": _int(token.get_sent()),
                   "para": _int(token.get_para()),
                   "word": token.get_text(),
                   "term_id": tid,
                   "lemma": term.get_lemma(),
                   "pos": term.get_pos()}
            if tid in deps:
                rel, parent = deps[tid]
                tok['parent'] = parent
                tok['relation'] = rel.split("/")[-1]
            yield tok
def convert(self, id, result, format):
    assert format == "csv"
    _int = lambda x: None if x is None else int(x)
    naf = KafNafParser(BytesIO(result.encode("utf-8")))
    deps = {dep.get_to(): (dep.get_function(), dep.get_from())
            for dep in naf.get_dependencies()}
    tokendict = {token.get_id(): token for token in naf.get_tokens()}

    s = StringIO()
    w = csv.writer(s)
    w.writerow(["id", "token_id", "offset", "sentence", "para", "word",
                "term_id", "lemma", "pos", "pos1", "parent", "relation"])
    for term in naf.get_terms():
        tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
        for token in tokens:
            tid = term.get_id()
            pos = term.get_pos()
            pos1 = POSMAP[pos]
            # include the sentence column so the row lines up with the header
            row = [id, token.get_id(), _int(token.get_offset()),
                   _int(token.get_sent()), _int(token.get_para()),
                   token.get_text(), tid, term.get_lemma(), pos, pos1]
            if tid in deps:
                rel, parent = deps[tid]
                row += [parent, rel.split("/")[-1]]
            else:
                row += [None, None]
            w.writerow(row)
    return s.getvalue()
def test_frog_saf():
    _check_frog()
    naf_str = frog._process("Mark Rutte werkte gisteren nog bij de Vrije Universiteit in Amsterdam")
    naf = KafNafParser(BytesIO(naf_str))
    lemmata = {t.get_lemma() for t in naf.get_terms()}
    assert_equal(lemmata, {"Mark_Rutte", "werken", "gisteren", "nog", "bij", "de",
                           "vrij", "universiteit", "in", "Amsterdam"})
def test_corenlp_naf():
    _check_corenlp()
    naf_bytes = corenlp.corenlp_naf("John shoots himself", annotators=corenlp.LEMMATIZER)
    print naf_bytes
    naf = KafNafParser(BytesIO(naf_bytes))
    terms = {t.get_id(): t.get_lemma() for t in naf.get_terms()}
    assert_equal(set(terms.values()), {"John", "shoot", "himself"})
def from_naf(self, article, naf):
    # `article` has to be passed in: its primary key is stored as "aid" on every token
    naf = KafNafParser(BytesIO(naf.encode("utf-8")))
    tokendict = {token.get_id(): token for token in naf.get_tokens()}
    for term in naf.get_terms():
        tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
        for token in tokens:
            yield {"aid": article.pk,
                   "token_id": token.get_id(),
                   "offset": token.get_offset(),
                   "sentence": token.get_sent(),
                   "para": token.get_para(),
                   "word": token.get_text(),
                   "term_id": term.get_id(),
                   "lemma": term.get_lemma(),
                   "pos": term.get_pos()}
def convert(self, id, result, format):
    assert format == "csv"
    naf = KafNafParser(BytesIO(result.encode("utf-8")))
    memo = self._csv_memo(naf)
    tokendict = {token.get_id(): token for token in naf.get_tokens()}

    s = StringIO()
    w = csv.writer(s)
    w.writerow(self._csv_header())
    for term in naf.get_terms():
        tokens = [tokendict[id] for id in term.get_span().get_span_ids()]
        for token in tokens:
            tid = term.get_id()
            pos = term.get_pos()
            pos1 = POSMAP[pos]
            row = [id] + list(self._csv_row(memo, term, token))
            w.writerow(row)
    return s.getvalue()
def read_training_data(file_name):
    """
    Read a kaf/naf file and match the aspects with the words
    """
    parser = KafNafParser(PATH_ANNOTATED_DATA + file_name)
    terms = list(parser.get_terms())

    # create token dictionary containing naf info
    tokens_container = dict()
    for token_el in parser.get_tokens():
        token_node = token_el.node
        token_id = token_node.get('wid').replace('w', 't')
        token_info = token_node.attrib
        tokens_container[token_id] = token_info

    properties = list(parser.get_properties())
    handled_properties, term_dict = handle_properties(properties, terms, tokens_container)
    return terms, properties, handled_properties, term_dict, tokens_container
def extract_data_file(filename, label_gold, label_system, this_temp_folder=None, get_random=False):
    if this_temp_folder is None:
        temp_folder = mkdtemp()
    else:
        temp_folder = this_temp_folder

    fd_gold = open(temp_folder + '/' + __gold_filename__, 'a')
    fd_system = open(temp_folder + '/' + __system_filename__, 'a')

    input_obj = KafNafParser(filename)
    for term in input_obj.get_terms():
        #Get gold
        term_id = term.get_id()
        results_gold = []
        results_system = []
        for ext_ref in term.get_external_references():
            resource = ext_ref.get_resource()
            if resource == label_gold:
                results_gold.append((ext_ref.get_reference(), ext_ref.get_confidence()))
            elif resource == label_system:
                results_system.append((ext_ref.get_reference(), ext_ref.get_confidence()))

        if len(results_gold) > 0:
            best_gold_label, best_gold_value = get_max_from_list(results_gold)
            fd_gold.write(filename + '\t' + term_id + '\t' + best_gold_label + '\n')

            if get_random:
                best_system_label, best_system_value = get_random_from_list(results_system)
            else:
                best_system_label, best_system_value = get_max_from_list(results_system)

            if best_system_label is not None:
                fd_system.write(filename + '\t' + term_id + '\t' + best_system_label + '\n')

    fd_gold.close()
    fd_system.close()

    #Create the "fake" sense.mappings
    fd_map = open(temp_folder + '/' + __sense_mapping__, 'w')
    fd_map.close()
    return temp_folder
def _test_file(this_file):
    input_fd = open(this_file)
    result = subprocess.check_output(os.path.join(__here__, 'run_parser.sh'), stdin=input_fd)
    my_obj = KafNafParser(BytesIO(result))

    #Check the terms
    terms = [term for term in my_obj.get_terms()]
    assert_equal(len(terms), 12)
    assert_equal(my_obj.get_term('t_4').get_lemma(), 'mooi')
    assert_equal(my_obj.get_term('t_4').get_pos(), 'adj')

    #Check constituents
    trees = [tree for tree in my_obj.get_trees()]
    assert_equal(len(trees), 2)
    assert_equal(trees[0].get_terminals_as_list()[1].get_span().get_span_ids(), ['t_1'])

    #Check dependencies
    dependencies = [dep for dep in my_obj.get_dependencies()]
    assert_equal(len(dependencies), 10)
    assert_equal(dependencies[5].get_function(), 'hd/su')
def process_file(this_file, token_freq):
    xml_obj = KafNafParser(this_file)
    print >> sys.stderr, 'Processing file', this_file

    token_for_wid = {}
    order_for_wid = {}
    opinion_expressions = []
    opinion_targets = []
    whole_text = ' '
    for n, token in enumerate(xml_obj.get_tokens()):
        text = token.get_text().lower()
        token_freq[text] += 1
        token_for_wid[token.get_id()] = text
        order_for_wid[token.get_id()] = n
        whole_text += text + ' '

    wids_for_tid = {}
    lemma_for_wid = {}
    pos_for_wid = {}
    for term in xml_obj.get_terms():
        tid = term.get_id()
        wids = term.get_span().get_span_ids()
        wids_for_tid[tid] = wids
        for wid in wids:
            lemma_for_wid[wid] = term.get_lemma()
            pos_for_wid[wid] = term.get_pos()

    ##Properties!
    aspects = []  ## [(label, term_span) ...]
    for property in xml_obj.get_properties():
        for refs in property.get_references():
            for span in refs:
                aspects.append((property.get_type(), span.get_span_ids()))

    already_counted = {EXP: set(), TAR: set()}
    for opinion in xml_obj.get_opinions():
        for this_type, opinion_obj in [(EXP, opinion.get_expression()), (TAR, opinion.get_target())]:
            if this_type is EXP and opinion_obj.get_polarity() == 'NON-OPINIONATED':
                continue
            if opinion_obj is not None:
                span = opinion_obj.get_span()
                if span is not None:
                    list_wids = []
                    for tid in span.get_span_ids():
                        list_wids.extend(wids_for_tid.get(tid, []))
                    ##Sorted according to the order of the tokens
                    list_wids.sort(key=lambda wid: order_for_wid[wid])
                    string_wids = '#'.join(list_wids)
                    opinion_tokens = ' '.join(token_for_wid[wid] for wid in list_wids)
                    opinion_lemmas = ' '.join(lemma_for_wid[wid] for wid in list_wids)
                    opinion_pos = ' '.join(pos_for_wid[wid] for wid in list_wids)
                    if string_wids not in already_counted[this_type]:
                        if this_type == EXP:
                            polarity = (opinion_obj.get_polarity()).lower()
                            opinion_expressions.append((opinion_tokens, polarity, opinion_lemmas, opinion_pos))
                        else:
                            ##Calculate the aspect type
                            possible_aspects = []
                            target_ids = span.get_span_ids()
                            for aspect_label, aspect_span in aspects:
                                num_in_common = len(set(target_ids) & set(aspect_span))
                                if num_in_common != 0:
                                    possible_aspects.append((aspect_label, num_in_common, len(aspect_span)))
                            aspect_for_target = 'unknown'
                            if len(possible_aspects) != 0:
                                ##Sorting by the number in common first, and by the length of the aspect second
                                aspect_for_target = sorted(possible_aspects, key=lambda t: (t[1], t[2]), reverse=True)[0][0]
                            opinion_targets.append((opinion_tokens, aspect_for_target, opinion_lemmas, opinion_pos))
                        already_counted[this_type].add(string_wids)

    del xml_obj
    print >> sys.stderr, '\tNumber of opinion expressions:', len(opinion_expressions)
    print >> sys.stderr, '\tNumber of opinion targets:', len(opinion_targets)
    print >> sys.stderr, '\tNumber of characters of the text:', len(whole_text)
    return opinion_expressions, opinion_targets, whole_text
from KafNafParserPy import KafNafParser
import sys

if __name__ == '__main__':
    #Load Wordnet (sense key -> synset offset)
    synset_for_skey = {}
    path_to_index_sense = '/home/izquierdo/wordnets/wordnet-3.0/dict/index.sense'
    fd = open(path_to_index_sense)
    for line in fd:
        fields = line.split()
        synset_for_skey[fields[0]] = fields[1]
    fd.close()

    naf_obj = KafNafParser(sys.stdin)
    for term in naf_obj.get_terms():
        this_skey = None
        this_synset = None
        ref_skey = ref_synset = None
        for ext_ref in term.get_external_references():
            if ext_ref.get_reftype() == 'sense':
                this_skey = ext_ref.get_reference()
                ref_skey = ext_ref
            if ext_ref.get_reftype() == 'ilidef':
                this_synset = ext_ref.get_reference()
                ref_synset = ext_ref

        if this_synset == '':
            print >> sys.stderr, term.get_id()

        # guard against terms that carry no sense reference at all
        if this_skey is not None and '%3:' in this_skey:
            this_skey = this_skey.replace('%3:', '%5:')
def load_naf_stdin():
    """Load a dataset in NAF format.

    Use this function to create a new ConlluDataset from a NAF file,
    read from stdin.

    NOTE: you can only add to NAF files, not create one from scratch.
    """
    my_parser = KafNafParser(sys.stdin)
    my_dataset = ConlluDataset()

    # a big look-up table: for any NAF id, return a hash with
    # {sent_id, token_id} in the ConlluDataset
    naf2conll_id = {}

    # collect the sentences in a hash, indexed by token_obj.get_sent()
    sentences = {}

    # iterate over the tokens to get: ID, FORM
    for token_obj in my_parser.get_tokens():
        # (string) identifier of the sentence
        sent_id = token_obj.get_sent()
        if sent_id in sentences:
            sentence = sentences[sent_id]
        else:
            sentence = Sentence(sent_id=sent_id)
            sentences[sent_id] = sentence

        # (string) number of the token in the sentence, starting at '1'
        token_id = '{}'.format(len(sentence) + 1)  # ID

        new_token = Token([
            token_id,              # ID
            token_obj.get_text(),  # FORM
            '_',                   # LEMMA
            '_',                   # UPOS
            '_',                   # XPOS
            '_',                   # FEATS
            '0',                   # HEAD -> to be overwritten later
            'root',                # DEPREL -> to be overwritten later
            '_',                   # DEPS
            '_'                    # MISC
        ])
        sentence.add(new_token)

        # to match a NAF span to conll tokens, we need sent_id and token_id
        naf2conll_id[token_obj.get_id()] = {
            'sent_id': sent_id,
            'token_id': token_id
        }

    # iterate over the terms to get: LEMMA, XPOS, UPOS, FEATS, sent_id, nafid
    for term_obj in my_parser.get_terms():
        # span
        # TODO: for now, assume terms map one-on-one on tokens
        nafid = term_obj.get_span().get_span_ids()
        if len(nafid) > 1:
            logging.error('Multi-word tokens not implemented yet.')
            return
        nafid = nafid[0]

        conllid = naf2conll_id[nafid]
        sent_id = conllid['sent_id']
        sentence = sentences[sent_id]
        token_id = conllid['token_id']
        token = sentence[token_id]

        # store the identifier of the NAF term on the token, so we can add
        # information to the NAF later.
        token.nafid = term_obj.get_id()

        token.LEMMA = term_obj.get_lemma()

        # NAF pos='' is in lower case, UD UPOS is upper case
        token.UPOS = term_obj.get_pos().upper()

        # naf: A(B,C) -> ud: A|B|C
        xpos = term_obj.get_morphofeat()
        if xpos:
            token.XPOS = xpos.replace('(', '|').replace(')', '').replace(',', '|')
            if token.XPOS[-1] == '|':
                token.XPOS = token.XPOS[:-1]

        # look for an external reference containing FEATS
        for ext_ref in term_obj.get_external_references():
            if ext_ref.get_reftype() == 'FEATS':
                token.FEATS = ext_ref.get_reference()

        # to match NAF dependencies to conll tokens, we need sent_id and token_id
        naf2conll_id[term_obj.get_id()] = {
            'sent_id': sent_id,
            'token_id': token_id
        }

    # iterate over the dependencies to get: HEAD, DEPREL
    for dep_obj in my_parser.get_dependencies():
        # from
        conllid = naf2conll_id[dep_obj.get_from()]
        sent_id = conllid['sent_id']
        sentence = sentences[sent_id]
        token_id = conllid['token_id']
        token_from = sentence[token_id]

        # to
        conllid = naf2conll_id[dep_obj.get_to()]
        sent_id = conllid['sent_id']
        sentence = sentences[sent_id]
        token_id = conllid['token_id']
        token_to = sentence[token_id]

        # function
        depfunc = dep_obj.get_function()

        token_to.HEAD = token_from.ID
        token_to.DEPREL = depfunc

    # A final conversion of our list of sentences to a ConlluDataset
    for sent_id in sentences:
        sentence = sentences[sent_id]

        # construct the sentence.full_text
        raw_tokens = []
        for token in sentence:
            raw_tokens.append(token.FORM)
        sentence.full_text = ' '.join(raw_tokens)

        # add to the dataset
        my_dataset.add(sentence)

    my_dataset.naf2conll_id = naf2conll_id

    return my_dataset, my_parser
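# Hedged usage sketch for load_naf_stdin (not part of the original module): it
# assumes the script is run with a NAF document piped on stdin, e.g.
# `python convert_naf.py < document.naf`, and only inspects naf2conll_id,
# which load_naf_stdin stores on the returned dataset.
if __name__ == '__main__':
    my_dataset, my_parser = load_naf_stdin()
    # every NAF token/term id now maps to its conll {sent_id, token_id}
    print(len(my_dataset.naf2conll_id))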
def find_terms(naf: KafNafParser, words: Sequence[str]) -> Iterable[Cterm]:
    """Find all terms whose lemma or word form is in the list of words"""
    for t in naf.get_terms():
        if t.get_lemma() in words or get_word(naf, t) in words:
            yield t
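# Hedged example for find_terms (illustrative only): "document.naf" and the
# word list are placeholders, and get_word is assumed to be the module's own
# helper that returns a term's surface form.
def _demo_find_terms(path: str = "document.naf") -> None:
    naf = KafNafParser(path)
    for term in find_terms(naf, ["London", "attack"]):
        print(term.get_id(), term.get_lemma())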
# with help from Ruben Izquierdo
from KafNafParserPy import KafNafParser
import re
import sys
from collections import OrderedDict
import codecs

input = sys.stdin
my_parser = KafNafParser(input)

### We first need a list of the predicates that we want to create feature vectors for
predicates = {}
for term_obj in my_parser.get_terms():
    predicate = re.match("WW", term_obj.get_morphofeat())
    if predicate is not None:
        predicates[term_obj.get_id()] = term_obj.get_pos()
        #print term_obj.get_id(), term_obj.get_morphofeat(), term_obj.get_lemma()

# We need the dependencies to find out the structure of the argument patterns
# and also to know which verbs are auxiliary verbs and which ones are main verbs
dependencies = {}
for dep_obj in my_parser.get_dependencies():
    relparts = dep_obj.get_function().split('/')
    rel_from = relparts[0]
    rel_to = relparts[1]
    dep_id = dep_obj.get_from() + '-' + dep_obj.get_to()
    dependencies[dep_id] = dep_obj.get_function()
                    required=True)
parser.add_argument('-ov', dest='output_version',
                    help='Output WN version of the synsets', required=True)
parser.add_argument('-or', dest='output_res_label',
                    help='Output resource label for synset references', required=True)
args = parser.parse_args()

mapping = load_mapping(args.input_version, args.output_version)
obj = KafNafParser(args.input_file)
for term in obj.get_terms():
    source_synset = None
    for ext_ref in term.get_external_references():
        if ext_ref.get_resource() == args.input_res_label and ext_ref.get_reftype() == 'synset':
            source_synset = ext_ref.get_reference()
            break

    if source_synset is not None:
        fields = source_synset.split('-')
        this_synset = fields[1]
        short_pos = fields[2]
        if short_pos == 'a':
            this_pos = ADJ
        elif short_pos == 'n':
            this_pos = NOUN
        elif short_pos == 'r':
            this_pos = ADV
        elif short_pos == 'v':
            this_pos = VERB
        else:
            this_pos = None
def add_file(filename, data_lexelt, reftype='lexical_key'):
    obj = KafNafParser(filename)

    tokens_per_sent = {}
    sent_for_token = {}
    sents_in_order = []
    for token in obj.get_tokens():
        sentid = token.get_sent()
        if sentid not in sents_in_order:
            sents_in_order.append(sentid)
        sent_for_token[token.get_id()] = sentid
        if sentid not in tokens_per_sent:
            tokens_per_sent[sentid] = []
        tokens_per_sent[sentid].append((token.get_id(), token.get_text()))

    annotated_lemmas = []  # LIST of (full_id, token_ids, lemma, pos, synset)
    for term in obj.get_terms():
        synset_label = None
        for ext_ref in term.get_external_references():
            if ext_ref.get_reftype() == 'lexical_key':
                synset_label = term.get_lemma() + '%' + ext_ref.get_reference()
            elif ext_ref.get_reftype() == 'sense' and ext_ref.get_resource() == 'WordNet-3.0':
                synset_label = ext_ref.get_reference()
            if synset_label is not None:
                break
        if synset_label is not None:
            annotated_lemmas.append((filename + '#' + term.get_id(),
                                     term.get_span().get_span_ids(),
                                     term.get_lemma(),
                                     term.get_pos(),
                                     synset_label))

    for full_id, token_ids, lemma, pos, synset_label in annotated_lemmas:
        #CREATE NEW INSTANCE
        this_key = lemma + '.' + pos.lower()[0]
        if this_key not in data_lexelt:
            data_lexelt[this_key] = Clexelt(this_key, pos)

        if not data_lexelt[this_key].exists(full_id):
            #Create the new instance
            new_instance = Cinstance()
            new_instance.id = full_id
            new_instance.docsrc = filename
            new_instance.key = synset_label

            tokens = []
            target_indexes = []
            this_sent = sent_for_token[token_ids[0]]
            index = sents_in_order.index(this_sent)
            start_idx = max(index - 2, 0)
            end_idx = min(index + 2, len(sents_in_order) - 1)
            selected_sents = sents_in_order[start_idx:end_idx + 1]
            num_token = 0
            for current_sent in selected_sents:
                for token_id, token_text in tokens_per_sent[str(current_sent)]:
                    tokens.append(token_text)
                    if token_id in token_ids:
                        target_indexes.append(num_token)
                    num_token += 1
            new_instance.tokens = tokens[:]
            new_instance.index_head = target_indexes[:]
            data_lexelt[this_key].add_instance(new_instance)
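# Hedged usage sketch for add_file (not from the original module): the file
# names are placeholders; data_lexelt ends up keyed by "lemma.p" (lemma plus
# the first letter of the POS), with Clexelt values from the surrounding code.
data_lexelt = {}
for naf_file in ['doc1.naf', 'doc2.naf']:
    add_file(naf_file, data_lexelt)
for lexelt_key in sorted(data_lexelt):
    print(lexelt_key)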