Example no. 1
import logging

import KafNafParserPy
from KafNafParserPy import KafNafParser

logger = logging.getLogger(__name__)


def get_naf_from_sentences(sentences):
    """Build a NAF object with text and term layers from tokenized sentences."""
    naf_obj = KafNafParser(type="NAF")
    naf_obj.set_version("3.0")
    naf_obj.set_language("nl")
    # Raw text: tokens joined by spaces, sentences joined by newlines
    naf_obj.set_raw('\n'.join(' '.join(s) for s in sentences))
    # Create the text (token) layer
    wcount = 1
    offsets = {}
    txt = naf_obj.get_raw()
    search_from = 0  # running search position, so repeated tokens get distinct offsets
    token_ids = []
    for sid, sentence in enumerate(sentences):
        token_ids_sub = []
        for token in sentence:
            token_obj = KafNafParserPy.Cwf(type=naf_obj.get_type())
            token_id = 'w{}'.format(wcount)
            token_length = len(token)
            # Search from the end of the previous match so that repeated
            # tokens are not all mapped to the first occurrence
            offsets[wcount] = txt.find(token, search_from)
            search_from = offsets[wcount] + token_length
            token_obj.set_id(token_id)
            token_obj.set_length(str(token_length))
            token_obj.set_offset(str(offsets[wcount]))
            token_obj.set_para('1')
            token_obj.set_sent(str(sid + 1))
            token_obj.set_text(token)
            token_ids_sub.append(token_id)
            wcount += 1
            naf_obj.add_wf(token_obj)
        token_ids.append(token_ids_sub)
    # Create the term layer: one term per token
    term_ids = []
    count_terms = 0
    logger.info('Creating the term layer...')
    for token_ids_sub in token_ids:
        term_ids_sub = []
        for token_id in token_ids_sub:
            new_term_id = 't_' + str(count_terms)
            count_terms += 1
            term_ids_sub.append(new_term_id)
            term_obj = KafNafParserPy.Cterm(type=naf_obj.get_type())
            term_obj.set_id(new_term_id)
            new_span = KafNafParserPy.Cspan()
            new_span.create_from_ids([token_id])
            term_obj.set_span(new_span)
            naf_obj.add_term(term_obj)
        term_ids.append(term_ids_sub)

    return naf_obj, term_ids
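
A minimal usage sketch; the input sentences and output filename below are
made up for illustration, and dump() is KafNafParserPy's serializer:

if __name__ == '__main__':
    # Pre-tokenized input: one list of token strings per sentence
    sents = [['De', 'kat', 'slaapt', '.'], ['De', 'hond', 'blaft', '.']]
    naf, term_ids = get_naf_from_sentences(sents)
    print(term_ids)          # [['t_0', 't_1', 't_2', 't_3'], ['t_4', ...]]
    naf.dump('example.naf')  # write the NAF XML to disk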
# Fragment from a TreeTagger-based term creation routine; new_terms,
# terms_for_token, data, input_obj, model and __version__ come from the
# enclosing function, and the merge loop below runs once per sentence.
not_use = set()
for id_new, type_term, pos_kaf, pos, lemma, span in new_terms:
    if id_new not in not_use:
        # When a token is covered by more than one term candidate, merge
        # the lemmas of all covering terms into a single lemma and remember
        # the absorbed term ids so they are not emitted again
        new_lemma = ''
        for tokenid in span:
            if len(terms_for_token[tokenid]) > 1:
                new_lemma += (''.join(
                    data[t][2] for t in terms_for_token[tokenid])).lower()
                not_use |= set(terms_for_token[tokenid])
        if new_lemma != '':
            lemma = new_lemma

        # Build the term element and attach it to the NAF object
        new_term = Cterm(type=input_obj.get_type())
        new_term.set_id(id_new)
        new_term.set_type(type_term)
        new_term.set_pos(pos_kaf)
        new_term.set_morphofeat(pos)
        new_term.set_lemma(lemma)
        term_span = Cspan()
        term_span.create_from_ids(span)
        new_term.set_span(term_span)
        input_obj.add_term(new_term)
## End for each sentence

# Record this tool as a linguistic processor in the NAF header
my_lp = Clp()
my_lp.set_name('Treetagger model ' + model)
my_lp.set_version(__version__)
my_lp.set_timestamp()
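
The fragment stops before the processor header is attached; a minimal
continuation sketch, assuming KafNafParserPy's add_linguistic_processor()
and dump() methods:

# Sketch (not part of the original fragment): register the processor on
# the term layer and serialize the enriched document
input_obj.add_linguistic_processor('terms', my_lp)
input_obj.dump()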
Example no. 4
    # Method of an n-gram extraction class (requires: import os, sys and
    # from KafNafParserPy import KafNafParser; DELIMITER is a module-level
    # separator constant). The enclosing class provides self.langs,
    # self.punctuation, self.convert_to_lowercase,
    # self.include_sentence_delimiters, self.min_ngram_len,
    # self.max_ngram_len and self.get_file_desc_for_ngram().
    def process_single_file(self, file):
        try:
            xml_obj = KafNafParser(file)
        except Exception:
            print('Error parsing', file, ': skipped', file=sys.stderr)
            return

        print('Processing file', os.path.basename(file),
              'Type:', xml_obj.get_type(), file=sys.stderr)
        self.langs[xml_obj.get_language()] += 1
        sentences = []
        current_sent = []
        this_sent = None

        pos_for_wid = {}  ## Map each token id (wid) to the POS of its term
        for term in xml_obj.get_terms():
            w_ids = term.get_span().get_span_ids()
            pos = term.get_pos()
            for wid in w_ids:
                pos_for_wid[wid] = pos

        for token in xml_obj.get_tokens():
            wid = token.get_id()
            value = token.get_text()
            if self.convert_to_lowercase:
                value = value.lower()

            if value in self.punctuation:
                value = 'PUN'

            if value == '*':
                value = 'STAR'

            sentence = token.get_sent()
            if this_sent is not None and sentence != this_sent:  ## There is a new sent
                sentences.append(current_sent)
                current_sent = []
            current_sent.append((wid, value))
            this_sent = sentence
        ## Add the last sentence as well (skip if the file had no tokens)
        if current_sent:
            sentences.append(current_sent)

        for sentence in sentences:
            if self.include_sentence_delimiters:
                sentence.insert(0, ('xxx', '<S>'))
                sentence.append(('xxx', '</S>'))

            for idx in range(len(sentence)):
                for ngramlen in range(self.min_ngram_len,
                                      self.max_ngram_len + 1):
                    file_desc = self.get_file_desc_for_ngram(ngramlen)
                    start = idx
                    end = start + ngramlen
                    if end <= len(sentence):
                        this_ngram = '\t'.join(
                            value for wid, value in sentence[start:end])
                        this_ngram_pos = '\t'.join(
                            pos_for_wid.get(wid, 'X')
                            for wid, value in sentence[start:end])
                        # Assumes the per-length files are opened in text
                        # mode with utf-8 encoding
                        file_desc.write(this_ngram + '\t' + DELIMITER +
                                        '\t' + this_ngram_pos + '\n')
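
A hedged driver sketch; the NgramExtractor class name and the attributes set
below are assumptions, since only process_single_file is shown above:

import glob

extractor = NgramExtractor()           # hypothetical enclosing class
extractor.convert_to_lowercase = True  # attributes read by the method above
extractor.include_sentence_delimiters = True
extractor.min_ngram_len = 1
extractor.max_ngram_len = 3
for naf_file in glob.glob('corpus/*.naf'):
    extractor.process_single_file(naf_file)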