import logging

import KafNafParserPy
from KafNafParserPy import KafNafParser

logger = logging.getLogger(__name__)


def get_naf_from_sentences(sentences):
    """Build a NAF object (raw text, text layer, term layer) from tokenised sentences."""
    naf_obj = KafNafParser(type="NAF")
    naf_obj.set_version("3.0")
    naf_obj.set_language("nl")
    raw = '\n'.join(' '.join(s) for s in sentences)
    naf_obj.set_raw(raw)

    # Create the text layer: one <wf> element per token, with character
    # offsets computed against the raw text.
    wcount = 1
    search_from = 0
    token_ids = []
    for sid, sentence in enumerate(sentences):
        token_ids_sub = []
        for token in sentence:
            # Search from the end of the previous token so that repeated
            # tokens get their own offset instead of all pointing at the
            # first occurrence.
            offset = raw.find(token, search_from)
            search_from = offset + len(token)

            token_obj = KafNafParserPy.Cwf(type=naf_obj.get_type())
            token_id = 'w{}'.format(wcount)
            token_obj.set_id(token_id)
            token_obj.set_length(str(len(token)))
            token_obj.set_offset(str(offset))
            token_obj.set_para('1')
            token_obj.set_sent(str(sid + 1))
            token_obj.set_text(token)
            naf_obj.add_wf(token_obj)

            token_ids_sub.append(token_id)
            wcount += 1
        token_ids.append(token_ids_sub)

    # Create the term layer: one <term> per token, spanning exactly that token.
    logger.info('Creating the term layer...')
    term_ids = []
    count_terms = 0
    for sentence, token_ids_sub in zip(sentences, token_ids):
        term_ids_sub = []
        for token_id in token_ids_sub:
            new_term_id = 't_{}'.format(count_terms)
            count_terms += 1
            term_ids_sub.append(new_term_id)

            term_obj = KafNafParserPy.Cterm(type=naf_obj.get_type())
            term_obj.set_id(new_term_id)
            new_span = KafNafParserPy.Cspan()
            new_span.create_from_ids([token_id])
            term_obj.set_span(new_span)
            naf_obj.add_term(term_obj)
        term_ids.append(term_ids_sub)

    return naf_obj, term_ids
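
# A minimal usage sketch, assuming the module is run directly; the Dutch
# example sentences are made up for illustration. dump() is KafNafParserPy's
# serialiser and writes the generated NAF XML to stdout.
if __name__ == '__main__':
    example_sentences = [['Dit', 'is', 'een', 'zin', '.'],
                         ['En', 'dit', 'ook', '.']]
    naf, _term_ids = get_naf_from_sentences(example_sentences)
    naf.dump()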
def process_single_file(self, file):
    try:
        xml_obj = KafNafParser(file)
    except Exception:
        print >> sys.stderr, 'Error parsing', file, ': skipped'
        return
    print >> sys.stderr, 'Processing file', os.path.basename(file), 'Type:', xml_obj.get_type()
    self.langs[xml_obj.get_language()] += 1

    # Map each token id (wid) to the POS of the term that spans it.
    pos_for_wid = {}
    for term in xml_obj.get_terms():
        pos = term.get_pos()
        for wid in term.get_span().get_span_ids():
            pos_for_wid[wid] = pos

    # Group the tokens into sentences, normalising the token text.
    sentences = []
    current_sent = []
    this_sent = None
    for token in xml_obj.get_tokens():
        wid = token.get_id()
        value = token.get_text()
        if self.convert_to_lowercase:
            value = value.lower()
        if value in self.punctuation:
            value = 'PUN'
        if value == '*':
            value = 'STAR'
        sentence = token.get_sent()
        if this_sent is not None and sentence != this_sent:
            # A new sentence starts here.
            sentences.append(current_sent)
            current_sent = []
        current_sent.append((wid, value))
        this_sent = sentence
    # Add the last sentence as well.
    if current_sent:
        sentences.append(current_sent)

    # Write every n-gram of the configured lengths, token n-gram and POS
    # n-gram separated by DELIMITER (a stand-alone sketch of this sliding
    # window follows the function).
    for sentence in sentences:
        if self.include_sentence_delimiters:
            sentence.insert(0, ('xxx', '<S>'))
            sentence.append(('xxx', '</S>'))
        for idx in range(len(sentence)):
            for ngramlen in range(self.min_ngram_len, self.max_ngram_len + 1):
                end = idx + ngramlen
                if end > len(sentence):
                    break
                file_desc = self.get_file_desc_for_ngram(ngramlen)
                this_ngram = '\t'.join(value for wid, value in sentence[idx:end])
                this_ngram_pos = '\t'.join(pos_for_wid.get(wid, 'X') for wid, value in sentence[idx:end])
                file_desc.write(this_ngram.encode('utf-8') + '\t' + DELIMITER + '\t' + this_ngram_pos + '\n')
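
# Stand-alone sketch of the sliding-window n-gram extraction used in
# process_single_file() above; extract_ngrams() and its arguments are
# illustrative names, not part of the original class.
def extract_ngrams(tokens, min_len, max_len):
    """Return all n-grams of length min_len..max_len as tuples, left to right."""
    ngrams = []
    for start in range(len(tokens)):
        for n in range(min_len, max_len + 1):
            if start + n > len(tokens):
                break
            ngrams.append(tuple(tokens[start:start + n]))
    return ngrams

# extract_ngrams(['<S>', 'dit', 'is', 'PUN', '</S>'], 2, 3)
# -> [('<S>', 'dit'), ('<S>', 'dit', 'is'), ('dit', 'is'), ('dit', 'is', 'PUN'), ...]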
# (Fragment of the term-creation loop; assumes
#  from KafNafParserPy import Cterm, Cspan, Clp)
not_use = set()
for id_new, type_term, pos_kaf, pos, lemma, span in new_terms:
    if id_new in not_use:
        continue
    # Multiword case: if a token is covered by more than one original term,
    # join those terms' lemmas into a single lowercased lemma and mark the
    # merged terms so they are not emitted again.
    new_lemma = ''
    for tokenid in span:
        if len(terms_for_token[tokenid]) > 1:
            new_lemma += (''.join(data[t][2] for t in terms_for_token[tokenid])).lower()
            not_use |= set(terms_for_token[tokenid])
    if new_lemma != '':
        lemma = new_lemma

    new_term = Cterm(type=input_obj.get_type())
    new_term.set_id(id_new)
    new_term.set_type(type_term)
    new_term.set_pos(pos_kaf)
    new_term.set_morphofeat(pos)
    new_term.set_lemma(lemma)
    term_span = Cspan()
    term_span.create_from_ids(span)
    new_term.set_span(term_span)
    input_obj.add_term(new_term)
# End for each sentence

# Record this processing run in the NAF header.
my_lp = Clp()
my_lp.set_name('TreeTagger model ' + model)
my_lp.set_version(__version__)
my_lp.set_timestamp()
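
# Presumed continuation of the fragment above: register the processor on the
# term layer and serialise. add_linguistic_processor() and dump() are standard
# KafNafParserPy calls; the layer name 'terms' is an assumption based on the
# terms being added here.
input_obj.add_linguistic_processor('terms', my_lp)
input_obj.dump()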