def convert(self, id, result, format):
    """Render a NAF document as CSV text, one row per (term, token) pair.

    Args:
        id: Document identifier, written in the first column of every row.
        result: NAF XML as a text string.
        format: Requested output format; only ``"csv"`` is supported.

    Returns:
        The CSV text.  The first line is the header; every following line
        holds the document id, the token fields (id, offset, sentence,
        paragraph, text), the term fields (id, lemma, pos and the POSMAP
        value for that pos) and, when the term is the target of a
        dependency, the id of the head term and the last ``/``-separated
        part of the dependency function.

    Raises:
        KeyError: if a term's pos has no entry in POSMAP, or a term span
            refers to an unknown token id.
    """
    assert format == "csv"

    def _int(value):
        return None if value is None else int(value)

    naf = KafNafParser(BytesIO(result.encode("utf-8")))
    # Keyed by the dependent term: term id -> (function, head term id).
    deps = {dep.get_to(): (dep.get_function(), dep.get_from())
            for dep in naf.get_dependencies()}
    tokendict = {token.get_id(): token for token in naf.get_tokens()}

    s = StringIO()
    w = csv.writer(s)
    w.writerow(["id", "token_id", "offset", "sentence", "para", "word",
                "term_id", "lemma", "pos", "pos1", "parent", "relation"])
    for term in naf.get_terms():
        # The loop variable is deliberately not called `id`, so the document
        # id argument can never be shadowed or overwritten by a token id.
        tokens = [tokendict[wid] for wid in term.get_span().get_span_ids()]
        if not tokens:
            continue

        tid = term.get_id()
        lemma = term.get_lemma()
        pos = term.get_pos()
        pos1 = POSMAP[pos]
        if tid in deps:
            rel, parent = deps[tid]
            dep_cells = [parent, rel.split("/")[-1]]
        else:
            dep_cells = [None, None]

        for token in tokens:
            # One cell per header column, in header order; the sentence
            # number sits between offset and paragraph.
            w.writerow([id,
                        token.get_id(),
                        _int(token.get_offset()),
                        _int(token.get_sent()),
                        _int(token.get_para()),
                        token.get_text(),
                        tid, lemma, pos, pos1] + dep_cells)
    return s.getvalue()
def test_create_terms():
    """
    Can we create_terms via the create_{term,token} functions?

    Builds four word forms with one term each, then checks that reading the
    document back yields the same (word, lemma, pos, morphofeat) tuples.
    """
    expected = [
        (u'dit', u'dit', u'O', u'VNW'),
        (u'is', u'zijn', u'V', u'WW'),
        (u'een', u'een', u'D', u'LID'),
        (u'test', u'test', u'N', u'N'),
    ]

    naf = KafNafParser(type="NAF")
    position = 0
    for word, lemma, pos, morph in expected:
        wf = naf.create_wf(word, 1, position)
        position += len(word)
        naf.create_term(lemma, pos, morph, [wf])

    tokens_by_id = dict((wf.get_id(), wf) for wf in naf.get_tokens())
    assert_equal(len(tokens_by_id), 4)

    # One entry per term id; a later token of the same term would overwrite it.
    seen = {}
    for term in naf.get_terms():
        for wid in term.get_span().get_span_ids():
            wf = tokens_by_id[wid]
            seen[term.get_id()] = (wf.get_text(), term.get_lemma(),
                                   term.get_pos(), term.get_morphofeat())

    actual = [seen[term_id] for term_id in sorted(seen)]
    assert_equal(expected, actual)
def from_naf(self, article, naf):
    """Yield one flat dict per (term, token) pair of a NAF document.

    Args:
        article: Value stored unchanged under the ``"aid"`` key of each dict.
        naf: NAF XML as a text string.

    Yields:
        Dicts with the keys aid, token_id, offset, sentence, para, word,
        term_id, lemma and pos.  When the term is the target of a
        dependency, ``parent`` (the head term id) and ``relation`` (the last
        ``/``-separated part of the dependency function) are added.
    """
    def _as_int(value):
        if value is None:
            return None
        return int(value)

    document = KafNafParser(BytesIO(naf.encode("utf-8")))

    # dependent term id -> (function, head term id)
    heads = {}
    for dep in document.get_dependencies():
        heads[dep.get_to()] = (dep.get_function(), dep.get_from())

    tokens_by_id = dict((t.get_id(), t) for t in document.get_tokens())

    for term in document.get_terms():
        # Resolve the whole span first so an unknown token id fails before
        # anything is yielded for this term.
        span_tokens = [tokens_by_id[wid]
                       for wid in term.get_span().get_span_ids()]
        for token in span_tokens:
            term_id = term.get_id()
            record = {
                "aid": article,
                "token_id": token.get_id(),
                "offset": _as_int(token.get_offset()),
                "sentence": _as_int(token.get_sent()),
                "para": _as_int(token.get_para()),
                "word": token.get_text(),
                "term_id": term_id,
                "lemma": term.get_lemma(),
                "pos": term.get_pos(),
            }
            dependency = heads.get(term_id)
            if dependency is not None:
                function, head = dependency
                record['parent'] = head
                record['relation'] = function.split("/")[-1]
            yield record
def process_single_file(self, file):
    """Write the token n-grams, with aligned POS n-grams, of one KAF/NAF file.

    The file is parsed, its language is counted in ``self.langs``, the tokens
    are grouped into sentences by their sentence id, and for every n-gram
    length from ``self.min_ngram_len`` to ``self.max_ngram_len`` one line per
    n-gram is written to the descriptor returned by
    ``self.get_file_desc_for_ngram``.  A line holds the tab-joined tokens,
    DELIMITER and the tab-joined POS tags.

    Tokens are lower-cased when ``self.convert_to_lowercase`` is set;
    punctuation becomes 'PUN' and '*' becomes 'STAR'.  Tokens without a term
    get the POS 'X'.

    Args:
        file: Path (or other input accepted by KafNafParser) of the document.

    A file that cannot be parsed is reported on stderr and skipped.
    """
    try:
        xml_obj = KafNafParser(file)
    except Exception:
        # Best effort: skip unreadable input, but let KeyboardInterrupt and
        # SystemExit propagate instead of swallowing them.
        print >> sys.stderr, 'Error parsing', file, ': skipped'
        return

    print >> sys.stderr, 'Processing file', os.path.basename(file), 'Type:', xml_obj.get_type()
    self.langs[xml_obj.get_language()] += 1

    # For each token id (wid), the pos of the term that spans it.
    pos_for_wid = {}
    for term in xml_obj.get_terms():
        pos = term.get_pos()
        for wid in term.get_span().get_span_ids():
            pos_for_wid[wid] = pos

    sentences = []
    current_sent = []
    this_sent = None
    for token in xml_obj.get_tokens():
        value = token.get_text()
        if self.convert_to_lowercase:
            value = value.lower()
        if value in self.punctuation:
            value = 'PUN'
        if value == '*':
            value = 'STAR'

        sentence = token.get_sent()
        if this_sent is not None and sentence != this_sent:
            # A new sentence starts: close the previous one.
            sentences.append(current_sent)
            current_sent = []
        current_sent.append((token.get_id(), value))
        this_sent = sentence
    # Add the last sentence as well.
    sentences.append(current_sent)

    for sentence in sentences:
        if self.include_sentence_delimiters:
            # 'xxx' is not a real token id, so the delimiters get POS 'X'.
            sentence.insert(0, ('xxx', '<S>'))
            sentence.append(('xxx', '</S>'))

        for start in range(len(sentence)):
            for ngramlen in range(self.min_ngram_len, self.max_ngram_len + 1):
                file_desc = self.get_file_desc_for_ngram(ngramlen)
                end = start + ngramlen
                if end <= len(sentence):
                    window = sentence[start:end]
                    this_ngram = '\t'.join(value for wid, value in window)
                    this_ngram_pos = '\t'.join(pos_for_wid.get(wid, 'X') for wid, value in window)
                    file_desc.write(this_ngram.encode('utf-8') + '\t' + DELIMITER + '\t' + this_ngram_pos + '\n')
def test_create_terms():
    """
    Can we create_terms via the create_{term,token} functions?

    Round trip: create one word form and one term per input tuple, then read
    tokens and terms back and compare with the input.
    """
    rows = [(u'dit', u'dit', u'O', u'VNW'),
            (u'is', u'zijn', u'V', u'WW'),
            (u'een', u'een', u'D', u'LID'),
            (u'test', u'test', u'N', u'N')]

    naf = KafNafParser(type="NAF")
    cursor = 0
    for word, lemma, pos, morph in rows:
        word_form = naf.create_wf(word, 1, cursor)
        cursor += len(word)
        naf.create_term(lemma, pos, morph, [word_form])

    word_forms = {}
    for word_form in naf.get_tokens():
        word_forms[word_form.get_id()] = word_form
    assert_equal(len(word_forms), 4)

    # term id -> (text, lemma, pos, morphofeat) of the last token in its span
    by_term = {}
    for term in naf.get_terms():
        for wid in term.get_span().get_span_ids():
            by_term[term.get_id()] = (word_forms[wid].get_text(),
                                      term.get_lemma(),
                                      term.get_pos(),
                                      term.get_morphofeat())

    read_back = [by_term[term_id] for term_id in sorted(by_term)]
    assert_equal(rows, read_back)
def from_naf(self, naf):
    """Yield one flat dict per (term, token) pair of a NAF document.

    Args:
        naf: NAF XML as a text string.

    Yields:
        Dicts with the keys aid, token_id, offset, sentence, para, word,
        term_id, lemma and pos; token values are passed through exactly as
        the parser returns them.

    NOTE(review): ``article`` is neither a parameter nor a local of this
    method.  It has to be resolvable from an enclosing or module scope,
    otherwise producing the first row raises NameError -- confirm where it
    is meant to come from.
    """
    document = KafNafParser(BytesIO(naf.encode("utf-8")))

    tokens_by_id = {}
    for tok in document.get_tokens():
        tokens_by_id[tok.get_id()] = tok

    for term in document.get_terms():
        # Resolve the whole span first so an unknown token id fails before
        # anything is yielded for this term.
        span_tokens = [tokens_by_id[wid]
                       for wid in term.get_span().get_span_ids()]
        for token in span_tokens:
            yield {
                "aid": article.pk,
                "token_id": token.get_id(),
                "offset": token.get_offset(),
                "sentence": token.get_sent(),
                "para": token.get_para(),
                "word": token.get_text(),
                "term_id": term.get_id(),
                "lemma": term.get_lemma(),
                "pos": term.get_pos(),
            }
def read_training_data(file_name):
    """Read a KAF/NAF file and match its aspects (properties) with the words.

    Args:
        file_name: File name, appended to PATH_ANNOTATED_DATA as-is.

    Returns:
        A tuple ``(terms, properties, handled_properties, term_dict,
        tokens_container)``: the term and property objects of the document
        as lists, the two values produced by ``handle_properties``, and a
        dict that maps a term-style id to the XML attributes of the token.
    """
    parser = KafNafParser(PATH_ANNOTATED_DATA + file_name)
    terms = list(parser.get_terms())

    # Token dictionary holding the raw attributes of every word form.
    tokens_container = {}
    for token_el in parser.get_tokens():
        # get_id() is the parser's own accessor for the token id, so this
        # does not depend on how the id attribute is spelled in the XML
        # (presumably 'wid' in KAF and 'id' in NAF -- verify against the
        # parser).
        # Turning 'w' into 't' assumes term ids mirror token ids (w12 -> t12).
        token_id = token_el.get_id().replace('w', 't')
        tokens_container[token_id] = token_el.node.attrib

    properties = list(parser.get_properties())
    handled_properties, term_dict = handle_properties(properties, terms, tokens_container)
    return terms, properties, handled_properties, term_dict, tokens_container
def convert(self, id, result, format):
    """Render a NAF document as CSV text, one row per (term, token) pair.

    The header comes from ``self._csv_header()``.  Each data row is the
    document id followed by the cells produced by
    ``self._csv_row(memo, term, token)``, where ``memo`` is whatever
    ``self._csv_memo(naf)`` returned for the parsed document.

    Args:
        id: Document identifier, written as the first cell of every row.
        result: NAF XML as a text string.
        format: Requested output format; only ``"csv"`` is supported.

    Returns:
        The CSV text.

    Raises:
        KeyError: if a term span refers to an unknown token id.
    """
    assert format == "csv"
    naf = KafNafParser(BytesIO(result.encode("utf-8")))
    memo = self._csv_memo(naf)
    tokendict = {token.get_id(): token for token in naf.get_tokens()}

    s = StringIO()
    w = csv.writer(s)
    w.writerow(self._csv_header())
    for term in naf.get_terms():
        # The loop variable is deliberately not called `id`, so the document
        # id argument can never be shadowed or overwritten by a token id.
        tokens = [tokendict[wid] for wid in term.get_span().get_span_ids()]
        for token in tokens:
            # All per-term values are derived inside _csv_row; nothing is
            # looked up here that the row does not use.
            w.writerow([id] + list(self._csv_row(memo, term, token)))
    return s.getvalue()
def process_file(this_file, token_freq):
    """Collect opinion expressions, opinion targets and the text of one file.

    Args:
        this_file: Input accepted by KafNafParser (e.g. a file path).
        token_freq: Counter-like mapping; the count of every lower-cased
            token text is incremented in place.

    Returns:
        A tuple ``(opinion_expressions, opinion_targets, whole_text)``:
        * opinion_expressions: list of (tokens, polarity, lemmas, pos),
          polarity lower-cased, the other fields space-joined strings;
        * opinion_targets: list of (tokens, aspect, lemmas, pos), where
          aspect is the type of the best overlapping property or 'unknown';
        * whole_text: all lower-cased tokens, each followed by one space,
          with one leading space.

    Every distinct token span is reported once per kind (expression/target).
    Expressions whose polarity is 'NON-OPINIONATED' are ignored.
    """
    xml_obj = KafNafParser(this_file)
    print >> sys.stderr, 'Processing file', this_file

    token_for_wid = {}
    order_for_wid = {}
    opinion_expressions = []
    opinion_targets = []
    text_parts = [' ']
    for n, token in enumerate(xml_obj.get_tokens()):
        text = token.get_text().lower()
        token_freq[text] += 1
        wid = token.get_id()
        token_for_wid[wid] = text
        order_for_wid[wid] = n
        text_parts.append(text + ' ')
    # Joined once instead of growing a string inside the loop.
    whole_text = ''.join(text_parts)

    wids_for_tid = {}
    lemma_for_wid = {}
    pos_for_wid = {}
    for term in xml_obj.get_terms():
        wids = term.get_span().get_span_ids()
        wids_for_tid[term.get_id()] = wids
        lemma = term.get_lemma()
        pos = term.get_pos()
        for wid in wids:
            lemma_for_wid[wid] = lemma
            pos_for_wid[wid] = pos

    # Properties: [(label, term_span), ...]
    aspects = []
    for property in xml_obj.get_properties():
        for refs in property.get_references():
            for span in refs:
                aspects.append((property.get_type(), span.get_span_ids()))

    already_counted = {EXP: set(), TAR: set()}
    for opinion in xml_obj.get_opinions():
        for this_type, opinion_obj in [(EXP, opinion.get_expression()), (TAR, opinion.get_target())]:
            # The None check must come before any attribute access: an
            # opinion may lack an expression or a target.
            if opinion_obj is None:
                continue
            if this_type == EXP and opinion_obj.get_polarity() == 'NON-OPINIONATED':
                continue
            span = opinion_obj.get_span()
            if span is None:
                continue

            target_ids = span.get_span_ids()
            list_wids = []
            for tid in target_ids:
                list_wids.extend(wids_for_tid.get(tid, []))
            # Sorted according to the order of the tokens in the document.
            list_wids.sort(key=lambda wid: order_for_wid[wid])

            string_wids = '#'.join(list_wids)
            opinion_tokens = ' '.join(token_for_wid[wid] for wid in list_wids)
            opinion_lemmas = ' '.join(lemma_for_wid[wid] for wid in list_wids)
            opinion_pos = ' '.join(pos_for_wid[wid] for wid in list_wids)

            if string_wids in already_counted[this_type]:
                continue

            if this_type == EXP:
                polarity = opinion_obj.get_polarity().lower()
                opinion_expressions.append((opinion_tokens, polarity, opinion_lemmas, opinion_pos))
            else:
                # Calculate the aspect type: the property sharing the most
                # terms with the target; ties go to the longer property span,
                # then to the property seen first.
                target_set = set(target_ids)
                possible_aspects = []
                for aspect_label, aspect_span in aspects:
                    num_in_common = len(target_set & set(aspect_span))
                    if num_in_common != 0:
                        possible_aspects.append((aspect_label, num_in_common, len(aspect_span)))

                aspect_for_target = 'unknown'
                if possible_aspects:
                    aspect_for_target = max(possible_aspects, key=lambda t: (t[1], t[2]))[0]
                opinion_targets.append((opinion_tokens, aspect_for_target, opinion_lemmas, opinion_pos))
            already_counted[this_type].add(string_wids)

    del xml_obj
    print >> sys.stderr, '\tNumber of opinion expressions:', len(opinion_expressions)
    print >> sys.stderr, '\tNumber of opinion targets:', len(opinion_targets)
    print >> sys.stderr, '\tNumber of characters of the text:', len(whole_text)
    return opinion_expressions, opinion_targets, whole_text
def create_training_sentences(folder_tag_in, folder_kaf_in, opinion_layers, non_opinion, folder_out): #Remove the outputfolder if exists and create it again if os.path.exists(folder_out): shutil.rmtree(folder_out) os.mkdir(folder_out) total_sents_opi = total_sents_no_opi = 0 for tag_file in glob.glob(os.path.join(folder_tag_in, '*.tag')): basename = os.path.basename(tag_file).replace('.tag', '') kaf_file = os.path.join(folder_kaf_in, basename + '.kaf') if os.path.exists(kaf_file): ##From the tag file we extract the token ids for opinions and for non opinionated opinion_wids = set() #token ids annotated as opinions no_opinion_wids = set() #token ids annotated as no opinions fd = open(tag_file, 'rb') for line in fd: fields = line.strip().split('\t') wid = fields[0] for opinion_idx in opinion_layers: if fields[opinion_idx] == 'Opinion': opinion_wids.add(wid) if non_opinion is not None and fields[ non_opinion] == 'NON-OPINIONATED': no_opinion_wids.add(wid) fd.close() ######### ### # Obtain the sentences that are opinionated (positive) and not (negative) # The negatives are: # If there are non-opinionated: just the non opinionated # If not --> all the rest that are not positive ##### sentences = {} all_sent_ids = set() sent_for_token_id = {} kaf_obj = KafNafParser(kaf_file) for token in kaf_obj.get_tokens(): token_id = token.get_id() sent_id = token.get_sent() token_value = token.get_text() if sent_id not in sentences: sentences[sent_id] = [] sentences[sent_id].append(token_value) all_sent_ids.add(sent_id) sent_for_token_id[token_id] = sent_id ### positive_sents = set() negative_sents = set() ##Positive sents are the sentences for the opinion_ids for token_id in opinion_wids: positive_sents.add(sent_for_token_id[token_id]) #### #Negative sents if non_opinion is not None: #In this case the negative are just the sentence of the no_opinion_wids for token_id in no_opinion_wids: negative_sents.add(sent_for_token_id[token_id]) else: #In this case the negative are all the 
sentences but the positive ones negative_sents = all_sent_ids - positive_sents #Free some memory del opinion_wids del no_opinion_wids del kaf_obj ##Store the results in the file output_file = os.path.join(folder_out, basename + '.sents') fd_out = open(output_file, 'w') fd_out.write('#' + tag_file + '\n') for sent_id in sorted(list(positive_sents)): text = ' '.join(sentences[sent_id]) fd_out.write('+ ' + text.encode('utf-8') + '\n') for sent_id in sorted(list(negative_sents)): text = ' '.join(sentences[sent_id]) fd_out.write('- ' + text.encode('utf-8') + '\n') fd_out.close() #print 'Processed ',basename #print ' Subjective sents:',len(positive_sents) #print ' Non subje. sents:',len(negative_sents) total_sents_opi += len(positive_sents) total_sents_no_opi += len(negative_sents) else: print 'KAF FILE NOT FOUND', kaf_file return total_sents_opi, total_sents_no_opi
treetagger_cmd = complete_path_to_treetagger+'/cmd/tree-tagger-spanish' mapping_file = this_folder +'/mappings/spanish.map.treetagger.kaf.csv' model = 'Spanish models' else: ## Default is dutch print>>sys.stderr,'Language',my_lang,'not supported by this wrapper' sys.exit(0) map_tt_to_kaf = loadMapping(mapping_file) ## Create the input text for reference_tokens = [] sentences = [] prev_sent='-200' aux = [] for token in input_obj.get_tokens(): sent_id = token.get_sent() word = token.get_text() w_id = token.get_id() if sent_id != prev_sent: if len(aux) != 0: sentences.append(aux) aux = [] aux.append((word,w_id)) prev_sent = sent_id if len(aux)!=0: sentences.append(aux) num_term = 0
def get_terms_in_sentence(naf: KafNafParser, sent: int) -> Iterable[Cterm]:
    """Return the terms of one sentence, ordered by ``sort_terms``.

    The tokens whose ``get_sent()`` equals ``sent`` are put in order with
    ``sort_tokens``, mapped to term ids with ``naf.map_tokens_to_terms`` and
    resolved to term objects.
    """
    in_sentence = (tok for tok in naf.get_tokens() if tok.get_sent() == sent)
    token_ids = [tok.get_id() for tok in sort_tokens(in_sentence)]
    terms = [naf.get_term(term_id)
             for term_id in naf.map_tokens_to_terms(token_ids)]
    return sort_terms(naf, terms)
def process_file(this_file, token_freq):
    """Collect opinion expressions, opinion targets and the text of one file.

    Args:
        this_file: Input accepted by KafNafParser (e.g. a file path).
        token_freq: Counter-like mapping; the count of every lower-cased
            token text is incremented in place.

    Returns:
        A tuple ``(opinion_expressions, opinion_targets, whole_text)``:
        * opinion_expressions: list of (tokens, polarity, lemmas, pos),
          polarity lower-cased, the other fields space-joined strings;
        * opinion_targets: list of (tokens, aspect, lemmas, pos), where
          aspect is the type of the best overlapping property or 'unknown';
        * whole_text: all lower-cased tokens, each followed by one space,
          with one leading space.

    Every distinct token span is reported once per kind (expression/target).
    Expressions whose polarity is 'NON-OPINIONATED' are ignored.
    """
    xml_obj = KafNafParser(this_file)
    print >> sys.stderr, 'Processing file', this_file

    token_for_wid = {}
    order_for_wid = {}
    opinion_expressions = []
    opinion_targets = []
    text_parts = [' ']
    for n, token in enumerate(xml_obj.get_tokens()):
        text = token.get_text().lower()
        token_freq[text] += 1
        wid = token.get_id()
        token_for_wid[wid] = text
        order_for_wid[wid] = n
        text_parts.append(text + ' ')
    # Joined once instead of growing a string inside the loop.
    whole_text = ''.join(text_parts)

    wids_for_tid = {}
    lemma_for_wid = {}
    pos_for_wid = {}
    for term in xml_obj.get_terms():
        wids = term.get_span().get_span_ids()
        wids_for_tid[term.get_id()] = wids
        lemma = term.get_lemma()
        pos = term.get_pos()
        for wid in wids:
            lemma_for_wid[wid] = lemma
            pos_for_wid[wid] = pos

    # Properties: [(label, term_span), ...]
    aspects = []
    for property in xml_obj.get_properties():
        for refs in property.get_references():
            for span in refs:
                aspects.append((property.get_type(), span.get_span_ids()))

    already_counted = {EXP: set(), TAR: set()}
    for opinion in xml_obj.get_opinions():
        for this_type, opinion_obj in [(EXP, opinion.get_expression()),
                                       (TAR, opinion.get_target())]:
            # The None check must come before any attribute access: an
            # opinion may lack an expression or a target.
            if opinion_obj is None:
                continue
            if this_type == EXP and opinion_obj.get_polarity() == 'NON-OPINIONATED':
                continue
            span = opinion_obj.get_span()
            if span is None:
                continue

            target_ids = span.get_span_ids()
            list_wids = []
            for tid in target_ids:
                list_wids.extend(wids_for_tid.get(tid, []))
            # Sorted according to the order of the tokens in the document.
            list_wids.sort(key=lambda wid: order_for_wid[wid])

            string_wids = '#'.join(list_wids)
            opinion_tokens = ' '.join(token_for_wid[wid] for wid in list_wids)
            opinion_lemmas = ' '.join(lemma_for_wid[wid] for wid in list_wids)
            opinion_pos = ' '.join(pos_for_wid[wid] for wid in list_wids)

            if string_wids in already_counted[this_type]:
                continue

            if this_type == EXP:
                polarity = opinion_obj.get_polarity().lower()
                opinion_expressions.append(
                    (opinion_tokens, polarity, opinion_lemmas, opinion_pos))
            else:
                # Calculate the aspect type: the property sharing the most
                # terms with the target; ties go to the longer property span,
                # then to the property seen first.
                target_set = set(target_ids)
                possible_aspects = []
                for aspect_label, aspect_span in aspects:
                    num_in_common = len(target_set & set(aspect_span))
                    if num_in_common != 0:
                        possible_aspects.append(
                            (aspect_label, num_in_common, len(aspect_span)))

                aspect_for_target = 'unknown'
                if possible_aspects:
                    aspect_for_target = max(possible_aspects,
                                            key=lambda t: (t[1], t[2]))[0]
                opinion_targets.append(
                    (opinion_tokens, aspect_for_target, opinion_lemmas,
                     opinion_pos))
            already_counted[this_type].add(string_wids)

    del xml_obj
    print >> sys.stderr, '\tNumber of opinion expressions:', len(
        opinion_expressions)
    print >> sys.stderr, '\tNumber of opinion targets:', len(opinion_targets)
    print >> sys.stderr, '\tNumber of characters of the text:', len(whole_text)
    return opinion_expressions, opinion_targets, whole_text
def load_naf_stdin():
    """Build a ConlluDataset from a NAF document read from stdin.

    Three passes over the NAF layers fill the conll tokens: the tokens give
    ID and FORM, the terms give LEMMA, UPOS, XPOS and FEATS, and the
    dependencies give HEAD and DEPREL.  Terms are assumed to map one-on-one
    onto tokens.

    NOTE: you can only add to NAF files, not create one from scratch.

    Returns:
        A ``(dataset, parser)`` tuple.  The dataset carries the lookup table
        ``naf2conll_id`` that maps every NAF token id and term id to
        ``{'sent_id': ..., 'token_id': ...}``.  If a term spans more than
        one token, an error is logged and ``None`` is returned instead.
    """
    my_parser = KafNafParser(sys.stdin)
    my_dataset = ConlluDataset()

    # Look-up table: NAF id -> {sent_id, token_id} in the ConlluDataset.
    naf2conll_id = {}
    # Sentences indexed by the sentence identifier of their tokens.
    sentences = {}

    def conll_token(naf_id):
        # Resolve a NAF token/term id to the conll token created for it.
        where = naf2conll_id[naf_id]
        return sentences[where['sent_id']][where['token_id']]

    # Pass 1 -- tokens: ID, FORM
    for wf in my_parser.get_tokens():
        sent_id = wf.get_sent()
        sentence = sentences.get(sent_id)
        if sentence is None:
            sentence = Sentence(sent_id=sent_id)
            sentences[sent_id] = sentence

        # (string) number of the token in the sentence, starting at '1'
        token_id = str(len(sentence) + 1)
        sentence.add(Token([
            token_id,       # ID
            wf.get_text(),  # FORM
            '_',            # LEMMA
            '_',            # UPOS
            '_',            # XPOS
            '_',            # FEATS
            '0',            # HEAD -> to be overwritten later
            'root',         # DEPREL -> to be overwritten later
            '_',            # DEPS
            '_',            # MISC
        ]))
        naf2conll_id[wf.get_id()] = {'sent_id': sent_id, 'token_id': token_id}

    # Pass 2 -- terms: LEMMA, UPOS, XPOS, FEATS
    for term in my_parser.get_terms():
        # TODO: for now, assume terms map one-on-one on tokens
        span_ids = term.get_span().get_span_ids()
        if len(span_ids) > 1:
            logging.error('Multi-word tokens not implemented yet.')
            return

        where = naf2conll_id[span_ids[0]]
        sent_id = where['sent_id']
        token_id = where['token_id']
        token = sentences[sent_id][token_id]

        # Keep the NAF term id on the token, so information can be added to
        # the NAF later.
        token.nafid = term.get_id()
        token.LEMMA = term.get_lemma()
        # NAF pos is in lower case, UD UPOS is upper case.
        token.UPOS = term.get_pos().upper()

        # naf: A(B,C) -> ud: A|B|C
        xpos = term.get_morphofeat()
        if xpos:
            token.XPOS = xpos.replace('(', '|').replace(')', '').replace(',', '|')
            if token.XPOS[-1] == '|':
                token.XPOS = token.XPOS[:-1]

        # An external reference of type FEATS holds the features.
        for ext_ref in term.get_external_references():
            if ext_ref.get_reftype() == 'FEATS':
                token.FEATS = ext_ref.get_reference()

        # Dependencies refer to term ids, so register the term id as well.
        naf2conll_id[term.get_id()] = {'sent_id': sent_id, 'token_id': token_id}

    # Pass 3 -- dependencies: HEAD, DEPREL
    for dep in my_parser.get_dependencies():
        head = conll_token(dep.get_from())
        dependent = conll_token(dep.get_to())
        function = dep.get_function()
        dependent.HEAD = head.ID
        dependent.DEPREL = function

    # Finally, move the sentences into the dataset.
    for sentence in sentences.values():
        sentence.full_text = ' '.join(token.FORM for token in sentence)
        my_dataset.add(sentence)

    my_dataset.naf2conll_id = naf2conll_id
    return my_dataset, my_parser
def create_training_sentences(folder_tag_in,folder_kaf_in, opinion_layers,non_opinion,folder_out): #Remove the outputfolder if exists and create it again if os.path.exists(folder_out): shutil.rmtree(folder_out) os.mkdir(folder_out) total_sents_opi = total_sents_no_opi = 0 for tag_file in glob.glob(os.path.join(folder_tag_in,'*.tag')): basename = os.path.basename(tag_file).replace('.tag','') kaf_file = os.path.join(folder_kaf_in,basename+'.kaf') if os.path.exists(kaf_file): ##From the tag file we extract the token ids for opinions and for non opinionated opinion_wids = set() #token ids annotated as opinions no_opinion_wids = set() #token ids annotated as no opinions fd = open(tag_file,'rb') for line in fd: fields = line.strip().split('\t') wid = fields[0] for opinion_idx in opinion_layers: if fields[opinion_idx] == 'Opinion': opinion_wids.add(wid) if non_opinion is not None and fields[non_opinion] == 'NON-OPINIONATED': no_opinion_wids.add(wid) fd.close() ######### ### # Obtain the sentences that are opinionated (positive) and not (negative) # The negatives are: # If there are non-opinionated: just the non opinionated # If not --> all the rest that are not positive ##### sentences = {} all_sent_ids = set() sent_for_token_id = {} kaf_obj = KafNafParser(kaf_file) for token in kaf_obj.get_tokens(): token_id = token.get_id() sent_id = token.get_sent() token_value = token.get_text() if sent_id not in sentences: sentences[sent_id] = [] sentences[sent_id].append(token_value) all_sent_ids.add(sent_id) sent_for_token_id[token_id] = sent_id ### positive_sents = set() negative_sents = set() ##Positive sents are the sentences for the opinion_ids for token_id in opinion_wids: positive_sents.add(sent_for_token_id[token_id]) #### #Negative sents if non_opinion is not None: #In this case the negative are just the sentence of the no_opinion_wids for token_id in no_opinion_wids: negative_sents.add(sent_for_token_id[token_id]) else: #In this case the negative are all the sentences but 
the positive ones negative_sents = all_sent_ids - positive_sents #Free some memory del opinion_wids del no_opinion_wids del kaf_obj ##Store the results in the file output_file = os.path.join(folder_out,basename+'.sents') fd_out = open(output_file,'w') fd_out.write('#'+tag_file+'\n') for sent_id in sorted(list(positive_sents)): text = ' '.join(sentences[sent_id]) fd_out.write('+ '+text.encode('utf-8')+'\n') for sent_id in sorted(list(negative_sents)): text = ' '.join(sentences[sent_id]) fd_out.write('- '+text.encode('utf-8')+'\n') fd_out.close() #print 'Processed ',basename #print ' Subjective sents:',len(positive_sents) #print ' Non subje. sents:',len(negative_sents) total_sents_opi += len(positive_sents) total_sents_no_opi += len(negative_sents) else: print 'KAF FILE NOT FOUND',kaf_file return total_sents_opi, total_sents_no_opi
def add_file(filename, data_lexelt, reftype='lexical_key'):
    """Add the sense-annotated terms of one KAF/NAF file to ``data_lexelt``.

    A term counts as annotated when it has an external reference of type
    'lexical_key' (label ``<lemma>%<reference>``) or of type 'sense' with
    resource 'WordNet-3.0' (label ``<reference>``); the first such reference
    wins.  For each annotated term a Cinstance is created whose context is
    the term's sentence plus up to two sentences before and after it, and
    whose ``index_head`` holds the positions of the term's tokens in that
    context.

    Args:
        filename: Input accepted by KafNafParser; also used as the instance
            ``docsrc`` and as prefix of the instance id
            (``<filename>#<term id>``).
        data_lexelt: Dict from ``<lemma>.<first letter of pos>`` to Clexelt,
            updated in place.  Instances whose id already exists are skipped.
        reftype: NOTE(review): currently not consulted -- the reference
            types are matched against fixed names; confirm whether it should
            select the reference type.
    """
    obj = KafNafParser(filename)

    tokens_per_sent = {}
    sent_for_token = {}
    sents_in_order = []
    # Position of each sentence in sents_in_order; avoids list scans.
    position_of_sent = {}
    for token in obj.get_tokens():
        sentid = token.get_sent()
        token_id = token.get_id()
        sent_for_token[token_id] = sentid
        if sentid not in tokens_per_sent:
            position_of_sent[sentid] = len(sents_in_order)
            sents_in_order.append(sentid)
            tokens_per_sent[sentid] = []
        tokens_per_sent[sentid].append((token_id, token.get_text()))

    # LIST of (full_id, token ids, lemma, pos, synset)
    annotated_lemmas = []
    for term in obj.get_terms():
        synset_label = None
        for ext_ref in term.get_external_references():
            if ext_ref.get_reftype() == 'lexical_key':
                synset_label = term.get_lemma() + '%' + ext_ref.get_reference()
            elif ext_ref.get_reftype() == 'sense' and ext_ref.get_resource() == 'WordNet-3.0':
                synset_label = ext_ref.get_reference()
            if synset_label is not None:
                break

        if synset_label is not None:
            annotated_lemmas.append(
                (filename + '#' + term.get_id(),
                 term.get_span().get_span_ids(),
                 term.get_lemma(),
                 term.get_pos(),
                 synset_label))

    for full_id, token_ids, lemma, pos, synset_label in annotated_lemmas:
        this_key = lemma + '.' + pos.lower()[0]
        if this_key not in data_lexelt:
            data_lexelt[this_key] = Clexelt(this_key, pos)
        if data_lexelt[this_key].exists(full_id):
            continue

        # Create the new instance.
        new_instance = Cinstance()
        new_instance.id = full_id
        new_instance.docsrc = filename
        new_instance.key = synset_label

        # Context window: the sentence of the first token of the term, with
        # up to two sentences on each side (slicing clamps the upper end).
        index = position_of_sent[sent_for_token[token_ids[0]]]
        selected_sents = sents_in_order[max(index - 2, 0):index + 3]

        head_ids = set(token_ids)
        tokens = []
        target_indexes = []
        for current_sent in selected_sents:
            # Sentence ids are used as stored, matching the dict keys.
            for token_id, token_text in tokens_per_sent[current_sent]:
                if token_id in head_ids:
                    target_indexes.append(len(tokens))
                tokens.append(token_text)

        new_instance.tokens = tokens
        new_instance.index_head = target_indexes
        data_lexelt[this_key].add_instance(new_instance)
def process_single_file(self, file):
    """Write the token n-grams, with aligned POS n-grams, of one KAF/NAF file.

    The file is parsed, its language is counted in ``self.langs``, the tokens
    are grouped into sentences by their sentence id, and for every n-gram
    length from ``self.min_ngram_len`` to ``self.max_ngram_len`` one line per
    n-gram is written to the descriptor returned by
    ``self.get_file_desc_for_ngram``.  A line holds the tab-joined tokens,
    DELIMITER and the tab-joined POS tags.

    Tokens are lower-cased when ``self.convert_to_lowercase`` is set;
    punctuation becomes 'PUN' and '*' becomes 'STAR'.  Tokens without a term
    get the POS 'X'.

    Args:
        file: Path (or other input accepted by KafNafParser) of the document.

    A file that cannot be parsed is reported on stderr and skipped.
    """
    try:
        xml_obj = KafNafParser(file)
    except Exception:
        # Best effort: skip unreadable input, but let KeyboardInterrupt and
        # SystemExit propagate instead of swallowing them.
        print >> sys.stderr, 'Error parsing', file, ': skipped'
        return

    print >> sys.stderr, 'Processing file', os.path.basename(
        file), 'Type:', xml_obj.get_type()
    self.langs[xml_obj.get_language()] += 1

    # For each token id (wid), the pos of the term that spans it.
    pos_for_wid = {}
    for term in xml_obj.get_terms():
        pos = term.get_pos()
        for wid in term.get_span().get_span_ids():
            pos_for_wid[wid] = pos

    sentences = []
    current_sent = []
    this_sent = None
    for token in xml_obj.get_tokens():
        value = token.get_text()
        if self.convert_to_lowercase:
            value = value.lower()
        if value in self.punctuation:
            value = 'PUN'
        if value == '*':
            value = 'STAR'

        sentence = token.get_sent()
        if this_sent is not None and sentence != this_sent:
            # A new sentence starts: close the previous one.
            sentences.append(current_sent)
            current_sent = []
        current_sent.append((token.get_id(), value))
        this_sent = sentence
    # Add the last sentence as well.
    sentences.append(current_sent)

    for sentence in sentences:
        if self.include_sentence_delimiters:
            # 'xxx' is not a real token id, so the delimiters get POS 'X'.
            sentence.insert(0, ('xxx', '<S>'))
            sentence.append(('xxx', '</S>'))

        for start in range(len(sentence)):
            for ngramlen in range(self.min_ngram_len,
                                  self.max_ngram_len + 1):
                file_desc = self.get_file_desc_for_ngram(ngramlen)
                end = start + ngramlen
                if end <= len(sentence):
                    window = sentence[start:end]
                    this_ngram = '\t'.join(value for wid, value in window)
                    this_ngram_pos = '\t'.join(
                        pos_for_wid.get(wid, 'X') for wid, value in window)
                    file_desc.write(
                        this_ngram.encode('utf-8') + '\t' + DELIMITER +
                        '\t' + this_ngram_pos + '\n')