def __init__(self, tagger_model, abst_slots):
    """Set up the morphological analyzer.

    @param tagger_model: path to a MorphoDiTa tagger model file
    @param abst_slots: comma-separated names of slots to abstract
    """
    # slots eligible for abstraction/delexicalization
    self._abst_slots = set(abst_slots.split(','))
    # tagger plus the morphology and tokenizer derived from it
    self._tagger = Tagger.load(tagger_model)
    self._analyzer = self._tagger.getMorpho()
    self._tokenizer = self._tagger.newTokenizer()
    # reusable SWIG vector buffers for tokenization and analysis
    self._forms_buf = Forms()
    self._tokens_buf = TokenRanges()
    self._analyses_buf = Analyses()
    self._indices_buf = Indices()
    # surface-form dictionary: token tuple -> list of (lemma, tag)
    self._sf_dict = {}
    self._sf_max_len = 0
def lemmatize(file, output_file):
    """Lemmatize a (possibly gzipped) text file with MorphoDiTa.

    For every input line, each tagged sentence is written out as
    space-separated "<lemma>___<tag>" tokens, one sentence per output line.

    @param file: input file path (opened via open_gz), one text per line
    @param output_file: output file path (opened via open_gz) for lemma___tag tokens
    """
    # NOTE: the parameter name `file` shadows the builtin, but renaming it
    # would break keyword-argument callers, so it is kept.
    morphodita_model = os.path.join(
        dir_cur, 'czech-morfflex-pdt-131112-raw_lemmas.tagger-best_accuracy')
    tagger = Tagger.load(morphodita_model)
    assert tagger
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    tokenizer = tagger.newTokenizer()
    assert tokenizer
    with open_gz(output_file, 'w') as out, open_gz(file) as f:
        for line in f:
            tokenizer.setText(line)
            while tokenizer.nextSentence(forms, tokens):
                tagger.tag(forms, lemmas)
                # one "<lemma>___<tag>" item per token; generator expression
                # replaces the original list(map(lambda ...)) construct
                out.write(" ".join(
                    str(l.lemma).strip() + '___' + str(l.tag).strip()
                    for l in lemmas))
                out.write('\n')
def __init__(self):
    """Load the fast Czech MorphoDiTa tagger and prepare output buffers."""
    # the model file ships next to this module
    self.morphodita_model = os.path.join(dir_cur,
                                         'czech-morfflex-131112.tagger-fast')
    self.tagger = Tagger.load(self.morphodita_model)
    self.tokenizer = self.tagger.newTokenizer()
    # reusable MorphoDiTa result buffers
    self.forms = Forms()
    self.lemmas = TaggedLemmas()
    self.tokens = TokenRanges()
def __init__(self, tagger_model):
    """Load a MorphoDiTa tagger and allocate reusable buffers.

    @param tagger_model: path to the tagger model file
    @raise IOError: if the model file does not exist
    """
    # fail early with a clear message rather than inside Tagger.load
    if not os.path.isfile(tagger_model):
        raise IOError('File %s does not exist' % tagger_model)
    self._tagger = Tagger.load(tagger_model)
    self._tokenizer = self._tagger.newTokenizer()
    # buffers reused across calls to avoid reallocation
    self._tags_buf = TaggedLemmas()
    self._forms_buf = Forms()
    self._tokens_buf = TokenRanges()
def __init__(self, model_file):
    """Instantiate Morphodita from a provided model file.

    :param model_file: Path to the model file.
    :type model_file: str
    """
    # imported lazily so the package is only required when this class is used
    from ufal.morphodita import Tagger, Forms, TaggedLemmas, TokenRanges
    self.tagger = Tagger.load(model_file)
    self.tokenizer = self.tagger.newTokenizer()
    # reusable MorphoDiTa output buffers
    self.forms = Forms()
    self.lemmas = TaggedLemmas()
    self.tokens = TokenRanges()
def __init__(self, tagger_model, abst_slots):
    """Initialize the analyzer: load the tagger, derive the morphology and
    tokenizer, and prepare empty buffers and surface-form dictionaries.

    @param tagger_model: path to a MorphoDiTa tagger model file
    @param abst_slots: comma-separated names of slots to abstract
    """
    self._abst_slots = set(abst_slots.split(','))
    self._tagger = Tagger.load(tagger_model)
    self._analyzer = self._tagger.getMorpho()
    self._tokenizer = self._tagger.newTokenizer()
    # reusable SWIG vector buffers
    self._forms_buf = Forms()
    self._tokens_buf = TokenRanges()
    self._analyses_buf = Analyses()
    self._indices_buf = Indices()
    # surface forms: token tuple -> [(lemma, tag)], plus the reverse mapping
    # (form, lemma, tag) -> (slot, value)
    self._sf_dict = {}
    self._rev_sf_dict = {}
    self._sf_max_len = 0
def create_lemmas(self, text):
    """Tokenize and tag the given text, returning one Lemma per token.

    @param text: input text (may contain several sentences)
    @return: list of Lemma(lemma, tag, form) objects for all tokens
    """
    _forms = Forms()
    _lemmas = TaggedLemmas()
    _tokens = TokenRanges()
    self.tokenizer.setText(text)
    lemmas = []
    while self.tokenizer.nextSentence(_forms, _tokens):
        self.tagger.tag(_forms, _lemmas)
        # iterate lemmas and surface forms in lockstep instead of indexing;
        # the original also read _tokens[i] into an unused local, now dropped
        for lemma, form in zip(_lemmas, _forms):
            lemmas.append(Lemma(lemma.lemma, lemma.tag, form))
    return lemmas
def lemmatize_and_replace_entities(file, output_file):
    """Lemmatize a (possibly gzipped) file and replace named entities.

    Tokens covered by a recognized named entity are written as a single
    "@!ENT!<type>[!<type>...]" placeholder; all other tokens are written
    as their lemma. One sentence is written per output line.

    @param file: input file path (opened via open_gz)
    @param output_file: output file path (opened via open_gz)
    """
    nametag_model = os.path.join(dir_cur, 'czech-cnec2.0-140304.ner')
    morphodita_model = os.path.join(dir_cur, 'czech-morfflex-131112.tagger-fast')
    tagger = Tagger.load(morphodita_model)
    assert tagger
    ner = Ner.load(nametag_model)
    assert ner
    # reusable MorphoDiTa / NameTag buffers
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    entities = NamedEntities()
    tokenizer = ner.newTokenizer()
    assert tokenizer
    with open_gz(output_file, 'w') as out, open_gz(file) as f:
        for line in f:
            tokenizer.setText(line)
            while tokenizer.nextSentence(forms, tokens):
                tagger.tag(forms, lemmas)
                ner.recognize(forms, entities)
                sorted_entities = sort_entities(entities)
                # stack of currently open entities: end token index + type
                open_entities = []
                open_entities_type = []
                e = 0  # cursor into sorted_entities
                for i in range(len(tokens)):
                    lemma = lemmas[i]
                    token = tokens[i]
                    # surface form of the token (kept for reference; unused)
                    word = line[token.start:token.start + token.length]
                    # open all entities starting at this token
                    while e < len(
                            sorted_entities) and sorted_entities[e].start == i:
                        # entity end = start + length - 1 (token indices)
                        open_entities.append(sorted_entities[e].start +
                                             sorted_entities[e].length - 1)
                        open_entities_type.append(sorted_entities[e].type)
                        e += 1
                    if len(open_entities) == 0:
                        # plain token: emit its lemma
                        out.write(str(lemma.lemma) + ' ')
                    else:
                        # token inside one or more entities: emit a placeholder
                        # joining all currently open entity types
                        out.write("@!ENT!%s " % ('!'.join(open_entities_type)))
                    # close entities that end at this token
                    while open_entities and open_entities[-1] == i:
                        open_entities.pop()
                        open_entities_type.pop()
                # NOTE(review): newline placement reconstructed as one per
                # sentence (matching the sibling lemmatize()) — confirm against
                # the original formatting
                out.write('\n')
class Reader(object):
    """Morphological analyzer/delexicalizer for DA + text data (Python 3 variant).

    Loads a MorphoDiTa tagger, optionally a dictionary of proper-name surface
    forms, analyzes sentences into (form, lemma, tag) triples, and produces
    delexicalized DAs/texts plus abstraction (Abst) instructions.
    """

    def __init__(self, tagger_model, abst_slots):
        # tagger plus derived morphology and tokenizer
        self._tagger = Tagger.load(tagger_model)
        self._analyzer = self._tagger.getMorpho()
        self._tokenizer = self._tagger.newTokenizer()
        # slots eligible for delexicalization
        self._abst_slots = set(abst_slots.split(','))
        # reusable SWIG vector buffers
        self._forms_buf = Forms()
        self._tokens_buf = TokenRanges()
        self._analyses_buf = Analyses()
        self._indices_buf = Indices()
        # surface forms: token tuple -> [(lemma, tag)]; reverse: (form, lemma, tag) -> (slot, value)
        self._sf_dict = {}
        self._rev_sf_dict = {}
        self._sf_max_len = 0

    def load_surface_forms(self, surface_forms_fname):
        """Load all proper name surface forms from a file."""
        with codecs.open(surface_forms_fname, 'rb', 'UTF-8') as fh:
            data = json.load(fh)
        for slot, values in data.items():
            for value in values.keys():
                for surface_form in values[value]:
                    lemma, form, tag = surface_form.split("\t")
                    form_toks = form.lower().split(" ")
                    if slot == 'street':
                        # add street number placeholders to addresses
                        lemma += ' _'
                        form_toks.append('_')
                    form_toks = tuple(form_toks)
                    # track the longest surface form (search window size)
                    self._sf_max_len = max((self._sf_max_len, len(form_toks)))
                    if form_toks not in self._sf_dict:
                        self._sf_dict[form_toks] = []
                    self._sf_dict[form_toks].append((lemma, tag))
                    self._rev_sf_dict[(form.lower(), lemma, tag)] = (slot, value)

    def _get_surface_form_taggedlemmas(self, forms_in):
        """Given a tokens deque, return the form & list of tagged lemmas (analyses)
        for the proper name in the list of forms at the current position, if
        applicable. If there is no proper name at the beginning of the tokens
        deque, return (None, None).

        @param forms_in: a deque of forms tokens
        @return: (form, tagged lemmas list) or (None, None)
        """
        # try the longest match first, then progressively shorter ones
        for test_len in range(min(self._sf_max_len, len(forms_in)), 0, -1):
            # test the string, handle number placeholders
            full_substr = [form for form in islice(forms_in, 0, test_len)]
            test_substr = tuple(['_' if re.match(r'^[0-9]+$', form)
                                 else form.lower()
                                 for form in full_substr])
            if test_substr in self._sf_dict:
                tls = TaggedLemmas()
                nums = [num for num in full_substr
                        if re.match(r'^[0-9]+$', num)]
                for lemma, tag in self._sf_dict[test_substr]:
                    tls.push_back(TaggedLemma())
                    for num in nums:
                        # replace number placeholders by actual values
                        lemma = re.sub(r'_', num, lemma, count=1)
                    tls[-1].lemma = lemma
                    tls[-1].tag = tag
                for _ in range(len(test_substr)):
                    # move on in the sentence
                    forms_in.popleft()
                return " ".join(full_substr), tls
        return None, None

    def analyze(self, sent):
        """Perform morphological analysis on the given sentence, preferring
        analyses from the list of surface forms.

        @param sent: input sentence (string)
        @return: list of (form, lemma, tag) tuples
        """
        self._tokenizer.setText(sent)
        analyzed = []
        while self._tokenizer.nextSentence(self._forms_buf, self._tokens_buf):
            forms_in = deque(self._forms_buf)
            self._forms_buf.resize(0)
            self._analyses_buf.resize(0)  # reset previous analyses
            while forms_in:
                form, analyses = self._get_surface_form_taggedlemmas(forms_in)
                if form:
                    # our custom analysis (matched a known surface form)
                    self._analyses_buf.push_back(analyses)
                else:
                    # Morphodita analysis
                    form = forms_in.popleft()
                    analyses = TaggedLemmas()
                    self._analyzer.analyze(form, 1, analyses)
                    for i in range(len(analyses)):
                        # shorten lemmas (must access the vector directly)
                        analyses[i].lemma = self._analyzer.rawLemma(analyses[i].lemma)
                    self._analyses_buf.push_back(analyses)
                self._forms_buf.push_back(form)
            # tag according to the given analysis
            self._tagger.tagAnalyzed(self._forms_buf, self._analyses_buf,
                                     self._indices_buf)
            analyzed.extend([(f, a[idx].lemma, a[idx].tag)
                             for (f, a, idx) in zip(self._forms_buf,
                                                    self._analyses_buf,
                                                    self._indices_buf)])
        return analyzed

    def process_dataset(self, input_data):
        """Load DAs & sentences, obtain abstraction instructions, and store it
        all in member variables (to be used later by writing methods).

        @param input_data: path to the input JSON file with the data
        @return: list of Inst(da, text, delex_da, delex_text, abst) tuples
        """
        # load data from JSON
        self._das = []
        self._texts = []
        with codecs.open(input_data, 'r', encoding='UTF-8') as fh:
            data = json.load(fh)
            for inst in data:
                da = DA.parse_cambridge_da(inst['da'])
                da.sort()
                self._das.append(da)
                self._texts.append(self.analyze(inst['text']))
        # delexicalize DAs and sentences
        self._create_delex_texts()
        self._create_delex_das()
        # return the result
        out = []
        for da, text, delex_da, delex_text, abst in zip(self._das, self._texts,
                                                        self._delex_das,
                                                        self._delex_texts,
                                                        self._absts):
            out.append(Inst(da, text, delex_da, delex_text, abst))
        return out

    def _create_delex_texts(self):
        """Delexicalize texts in the buffers and save them separately in the
        member variables, along with the delexicalization instructions used
        for the operation."""
        self._delex_texts = []
        self._absts = []
        for text_idx, (text, da) in enumerate(zip(self._texts, self._das)):
            delex_text = []
            absts = []
            # do the delexicalization, keep track of which slots we used
            for tok_idx, (form, lemma, tag) in enumerate(text):
                # abstract away from numbers
                abst_form = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', form.lower())
                abst_lemma = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', lemma)
                # try to find if the surface form belongs to some slot
                slot, value = self._rev_sf_dict.get((abst_form, abst_lemma, tag),
                                                    (None, None))
                # if we found a slot, get back the numbers
                if slot:
                    for num_match in re.finditer(r'(?: |^)([0-9]+)(?: |$)', lemma):
                        value = re.sub(r'_', num_match.group(1), value, count=1)
                # fall back to directly comparing against the DA value
                else:
                    slot = da.has_value(lemma)
                    value = lemma
                # if we found something, delexicalize it
                # (check if the value corresponds to the DA!)
                if (slot and slot in self._abst_slots and
                        da.value_for_slot(slot) not in [None, 'none', 'dont_care'] and
                        value in da.value_for_slot(slot)):
                    delex_text.append(('X-' + slot, 'X-' + slot, tag))
                    absts.append(Abst(slot, value, form, tok_idx, tok_idx + 1))
                # otherwise keep the token as it is
                else:
                    delex_text.append((form, lemma, tag))
            # fix coordinated delexicalized values
            self._delex_fix_coords(delex_text, da, absts)
            covered_slots = set([a.slot for a in absts])
            # check and warn if we left something non-delexicalized
            for dai in da:
                if (dai.slot in self._abst_slots and
                        dai.value not in [None, 'none', 'dont_care'] and
                        dai.slot not in covered_slots):
                    log_info("Cannot delexicalize slot %s at %d:\nDA: %s\nTx: %s\n" %
                             (dai.slot, text_idx, str(da),
                              " ".join([form for form, _, _ in text])))
            # save the delexicalized text and the delexicalization instructions
            self._delex_texts.append(delex_text)
            self._absts.append(absts)

    def _delex_fix_coords(self, text, da, absts):
        """Fix (merge) coordinated values in delexicalized text
        (X-slot and X-slot -> X-slot). Modifies the input list directly.

        @param text: list of form-lemma-tag tokens of the delexicalized sentence
        @return: None
        """
        idx = 0
        while idx < len(absts) - 1:
            # merge when two same-slot abstractions are separated by exactly
            # one token and the DA value itself is a coordination
            if (absts[idx].slot == absts[idx+1].slot and
                    absts[idx].end + 1 == absts[idx + 1].start and
                    re.search(r' (and|or) ', da.value_for_slot(absts[idx].slot))):
                # shift positions of all following abstractions
                for abst in absts[idx+2:]:
                    abst.start -= 2
                    abst.end -= 2
                absts[idx].value = da.value_for_slot(absts[idx].slot)
                del text[absts[idx].end:absts[idx + 1].end]
                del absts[idx + 1]
            idx += 1

    def _create_delex_das(self):
        """Delexicalize DAs in the buffers, save them separately."""
        out = []
        for da in self._das:
            delex_da = DA()
            for dai in da:
                # replace concrete values of abstracted slots by X-<slot>
                delex_dai = DAI(dai.da_type, dai.slot,
                                'X-' + dai.slot
                                if (dai.value not in [None, 'none', 'dont_care'] and
                                    dai.slot in self._abst_slots)
                                else dai.value)
                delex_da.append(delex_dai)
            out.append(delex_da)
        self._delex_das = out
class MorphoAnalyzer(object):
    """Morphological analyzer/delexicalizer for separate DA and text files.

    NOTE(review): this class uses Python 2 constructs (`iteritems`, `xrange`,
    `print >> fh`, `unicode`) — it will not run under Python 3 as-is; confirm
    the target interpreter before porting.
    """

    def __init__(self, tagger_model, abst_slots):
        # tagger plus derived morphology and tokenizer
        self._tagger = Tagger.load(tagger_model)
        self._analyzer = self._tagger.getMorpho()
        self._tokenizer = self._tagger.newTokenizer()
        # slots eligible for delexicalization
        self._abst_slots = set(abst_slots.split(','))
        # reusable SWIG vector buffers
        self._forms_buf = Forms()
        self._tokens_buf = TokenRanges()
        self._analyses_buf = Analyses()
        self._indices_buf = Indices()
        # surface forms: token tuple -> [(lemma, tag)]; reverse: (form, lemma, tag) -> (slot, value)
        self._sf_dict = {}
        self._rev_sf_dict = {}
        self._sf_max_len = 0

    def load_surface_forms(self, surface_forms_fname):
        """Load all proper name surface forms from a file."""
        with codecs.open(surface_forms_fname, 'rb', 'UTF-8') as fh:
            data = json.load(fh)
        for slot, values in data.iteritems():
            for value in values.keys():
                for surface_form in values[value]:
                    lemma, form, tag = surface_form.split("\t")
                    form_toks = form.lower().split(" ")
                    if slot == 'street':
                        # add street number placeholders to addresses
                        lemma += ' _'
                        form_toks.append('_')
                    form_toks = tuple(form_toks)
                    # track the longest surface form (search window size)
                    self._sf_max_len = max((self._sf_max_len, len(form_toks)))
                    if form_toks not in self._sf_dict:
                        self._sf_dict[form_toks] = []
                    self._sf_dict[form_toks].append((lemma, tag))
                    self._rev_sf_dict[(form.lower(), lemma, tag)] = (slot, value)

    def _get_surface_form_taggedlemmas(self, forms_in):
        """Given a tokens deque, return the form & list of tagged lemmas (analyses)
        for the proper name in the list of forms at the current position, if
        applicable. If there is no proper name at the beginning of the tokens
        deque, return (None, None).

        @param forms_in: a deque of forms tokens
        @return: (form, tagged lemmas list) or (None, None)
        """
        # try the longest match first, then progressively shorter ones
        for test_len in xrange(min(self._sf_max_len, len(forms_in)), 0, -1):
            # test the string, handle number placeholders
            full_substr = [form for form in islice(forms_in, 0, test_len)]
            test_substr = tuple(['_' if re.match(r'^[0-9]+$', form)
                                 else form.lower()
                                 for form in full_substr])
            if test_substr in self._sf_dict:
                tls = TaggedLemmas()
                nums = [num for num in full_substr
                        if re.match(r'^[0-9]+$', num)]
                for lemma, tag in self._sf_dict[test_substr]:
                    tls.push_back(TaggedLemma())
                    for num in nums:
                        # replace number placeholders by actual values
                        lemma = re.sub(r'_', num, lemma, count=1)
                    tls[-1].lemma = lemma
                    tls[-1].tag = tag
                for _ in xrange(len(test_substr)):
                    # move on in the sentence
                    forms_in.popleft()
                return " ".join(full_substr), tls
        return None, None

    def analyze(self, sent):
        """Perform morphological analysis on the given sentence, preferring
        analyses from the list of surface forms.

        @param sent: input sentence (string)
        @return: list of (form, lemma, tag) tuples
        """
        self._tokenizer.setText(sent)
        analyzed = []
        while self._tokenizer.nextSentence(self._forms_buf, self._tokens_buf):
            forms_in = deque(self._forms_buf)
            self._forms_buf.resize(0)
            self._analyses_buf.resize(0)  # reset previous analyses
            while forms_in:
                form, analyses = self._get_surface_form_taggedlemmas(forms_in)
                if form:
                    # our custom analysis (matched a known surface form)
                    self._analyses_buf.push_back(analyses)
                else:
                    # Morphodita analysis
                    form = forms_in.popleft()
                    analyses = TaggedLemmas()
                    self._analyzer.analyze(form, 1, analyses)
                    for i in xrange(len(analyses)):
                        # shorten lemmas (must access the vector directly)
                        analyses[i].lemma = self._analyzer.rawLemma(analyses[i].lemma)
                    self._analyses_buf.push_back(analyses)
                self._forms_buf.push_back(form)
            # tag according to the given analysis
            self._tagger.tagAnalyzed(self._forms_buf, self._analyses_buf,
                                     self._indices_buf)
            analyzed.extend([(f, a[idx].lemma, a[idx].tag)
                             for (f, a, idx) in zip(self._forms_buf,
                                                    self._analyses_buf,
                                                    self._indices_buf)])
        return analyzed

    def process_files(self, input_text_file, input_da_file, skip_hello=False):
        """Load DAs & sentences, obtain abstraction instructions, and store it
        all in member variables (to be used later by writing methods).

        @param input_text_file: path to the input file with sentences
        @param input_da_file: path to the input file with DAs
        @param skip_hello: skip hello() DAs (remove them from the output?)
        """
        # load DAs
        self._das = []
        with codecs.open(input_da_file, 'r', encoding='UTF-8') as fh:
            for line in fh:
                self._das.append(DA.parse(line.strip()))
        # load & process sentences
        self._sents = []
        with codecs.open(input_text_file, 'r', encoding='UTF-8') as fh:
            for line in fh:
                self._sents.append(self.analyze(line.strip()))
        assert(len(self._das) == len(self._sents))
        # skip hello() DAs, if required
        if skip_hello:
            pos = 0
            while pos < len(self._das):
                da = self._das[pos]
                if len(da) == 1 and da[0].da_type == 'hello':
                    del self._das[pos]
                    del self._sents[pos]
                else:
                    pos += 1
        # delexicalize DAs and sentences
        self._delex_texts()
        self._delex_das()

    def buf_length(self):
        """Return the number of sentence-DA pairs currently loaded in the buffer."""
        return len(self._sents)

    def _write_plain(self, output_file, data_items):
        # one item per line, converted via unicode()
        with codecs.open(output_file, 'wb', encoding='UTF-8') as fh:
            for data_item in data_items:
                print >> fh, unicode(data_item)

    def _write_conll(self, output_file, data_items):
        # CoNLL-U-like morphology output: one token per line, blank line
        # between sentences; spaces inside tokens replaced by underscores
        with codecs.open(output_file, 'wb', encoding='UTF-8') as fh:
            for line in data_items:
                for idx, tok in enumerate(line, start=1):
                    print >> fh, "\t".join((str(idx),
                                            tok[0].replace(' ', '_'),
                                            tok[1].replace(' ', '_'),
                                            '_', tok[2], '_', '0',
                                            '_', '_', '_'))
                print >> fh

    def _write_interleaved(self, output_file, data_items):
        # "lemma tag lemma tag ..." per sentence line
        with codecs.open(output_file, 'wb', encoding='UTF-8') as fh:
            for line in data_items:
                for _, lemma, tag in line:
                    print >> fh, lemma.replace(' ', '_'), tag,
                print >> fh

    def write_text(self, data_file, out_format, subrange, delex=False):
        """Write output sentences for the given data subrange.

        @param data_file: output file name
        @param out_format: output format ('conll' -- CoNLL-U morphology, \
            'interleaved' -- lemma/tag interleaved, 'plain' -- plain text)
        @param subrange: data range (slice) from buffers to write
        @param delex: delexicalize? false by default
        """
        if delex:
            texts = self._delexed_texts[subrange]
        else:
            texts = self._sents[subrange]
        if out_format == 'interleaved':
            self._write_interleaved(data_file, texts)
        elif out_format == 'conll':
            self._write_conll(data_file, texts)
        else:
            self._write_plain(data_file,
                              [" ".join([form for form, _, _ in sent])
                               for sent in texts])

    def write_absts(self, data_file, subrange):
        """Write delexicalization/abstraction instructions (for the given data
        subrange).

        @param data_file: output file name
        @param subrange: data range (slice) from buffers to write
        """
        self._write_plain(data_file,
                          ["\t".join([unicode(abst_) for abst_ in abst])
                           for abst in self._absts[subrange]])

    def write_das(self, data_file, subrange, delex=False):
        """Write DAs (for the given subrange).

        @param data_file: output file name
        @param subrange: data range (slice) from buffers to write
        @param delex: delexicalize? false by default
        """
        if delex:
            das = self._delexed_das[subrange]
        else:
            das = self._das[subrange]
        self._write_plain(data_file, das)

    def _delex_das(self):
        """Delexicalize DAs in the buffers, save them separately."""
        out = []
        for da in self._das:
            delex_da = DA()
            for dai in da:
                # replace concrete values of abstracted slots by X-<slot>
                delex_dai = DAI(dai.da_type, dai.slot,
                                'X-' + dai.slot
                                if (dai.value not in [None, 'none', 'dont_care'] and
                                    dai.slot in self._abst_slots)
                                else dai.value)
                delex_da.append(delex_dai)
            out.append(delex_da)
        self._delexed_das = out

    def _delex_texts(self):
        """Delexicalize texts in the buffers and save them separately in the
        member variables, along with the delexicalization instructions used
        for the operation."""
        self._delexed_texts = []
        self._absts = []
        for text_idx, (text, da) in enumerate(zip(self._sents, self._das)):
            delex_text = []
            absts = []
            # do the delexicalization, keep track of which slots we used
            for tok_idx, (form, lemma, tag) in enumerate(text):
                # abstract away from numbers
                abst_form = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', form.lower())
                abst_lemma = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', lemma)
                # try to find if the surface form belongs to some slot
                slot, value = self._rev_sf_dict.get((abst_form, abst_lemma, tag),
                                                    (None, None))
                # if we found a slot, get back the numbers
                if slot:
                    for num_match in re.finditer(r'(?: |^)([0-9]+)(?: |$)', lemma):
                        value = re.sub(r'_', num_match.group(1), value, count=1)
                # fall back to directly comparing against the DA value
                else:
                    slot = da.has_value(lemma)
                    value = lemma
                # if we found something, delexicalize it
                if (slot and slot in self._abst_slots and
                        da.value_for_slot(slot) not in [None, 'none', 'dont_care']):
                    delex_text.append(('X-' + slot, 'X-' + slot, tag))
                    absts.append(Abst(slot, value, form, tok_idx, tok_idx + 1))
                # otherwise keep the token as it is
                else:
                    delex_text.append((form, lemma, tag))
            # fix coordinated delexicalized values
            self._delex_fix_coords(delex_text, da, absts)
            covered_slots = set([a.slot for a in absts])
            # check and warn if we left something non-delexicalized
            for dai in da:
                if (dai.slot in self._abst_slots and
                        dai.value not in [None, 'none', 'dont_care'] and
                        dai.slot not in covered_slots):
                    log_info("Cannot delexicalize slot %s at %d:\nDA: %s\nTx: %s\n" %
                             (dai.slot, text_idx, unicode(da),
                              " ".join([form for form, _, _ in text])))
            # save the delexicalized text and the delexicalization instructions
            self._delexed_texts.append(delex_text)
            self._absts.append(absts)

    def _delex_fix_coords(self, text, da, absts):
        """Fix (merge) coordinated values in delexicalized text
        (X-slot and X-slot -> X-slot). Modifies the input list directly.

        @param text: list of form-lemma-tag tokens of the delexicalized sentence
        @return: None
        """
        idx = 0
        while idx < len(absts) - 1:
            # merge when two same-slot abstractions are separated by exactly
            # one token and the DA value itself is a coordination
            if (absts[idx].slot == absts[idx+1].slot and
                    absts[idx].end + 1 == absts[idx + 1].start and
                    re.search(r' (and|or) ', da.value_for_slot(absts[idx].slot))):
                # shift positions of all following abstractions
                for abst in absts[idx+2:]:
                    abst.start -= 2
                    abst.end -= 2
                absts[idx].value = da.value_for_slot(absts[idx].slot)
                del text[absts[idx].end:absts[idx + 1].end]
                del absts[idx + 1]
            idx += 1
class MorphoAnalyzer(object):
    """Morphological analyzer/delexicalizer for separate DA and text files
    (auto-formatted duplicate of the class above).

    NOTE(review): Python 2 only (`iteritems`, `xrange`, `print >> fh`,
    `unicode`) — confirm the target interpreter before porting.
    """

    def __init__(self, tagger_model, abst_slots):
        # tagger plus derived morphology and tokenizer
        self._tagger = Tagger.load(tagger_model)
        self._analyzer = self._tagger.getMorpho()
        self._tokenizer = self._tagger.newTokenizer()
        # slots eligible for delexicalization
        self._abst_slots = set(abst_slots.split(','))
        # reusable SWIG vector buffers
        self._forms_buf = Forms()
        self._tokens_buf = TokenRanges()
        self._analyses_buf = Analyses()
        self._indices_buf = Indices()
        # surface forms: token tuple -> [(lemma, tag)]; reverse: (form, lemma, tag) -> (slot, value)
        self._sf_dict = {}
        self._rev_sf_dict = {}
        self._sf_max_len = 0

    def load_surface_forms(self, surface_forms_fname):
        """Load all proper name surface forms from a file."""
        with codecs.open(surface_forms_fname, 'rb', 'UTF-8') as fh:
            data = json.load(fh)
        for slot, values in data.iteritems():
            for value in values.keys():
                for surface_form in values[value]:
                    lemma, form, tag = surface_form.split("\t")
                    form_toks = form.lower().split(" ")
                    if slot == 'street':
                        # add street number placeholders to addresses
                        lemma += ' _'
                        form_toks.append('_')
                    form_toks = tuple(form_toks)
                    # track the longest surface form (search window size)
                    self._sf_max_len = max((self._sf_max_len, len(form_toks)))
                    if form_toks not in self._sf_dict:
                        self._sf_dict[form_toks] = []
                    self._sf_dict[form_toks].append((lemma, tag))
                    self._rev_sf_dict[(form.lower(), lemma, tag)] = (slot, value)

    def _get_surface_form_taggedlemmas(self, forms_in):
        """Given a tokens deque, return the form & list of tagged lemmas (analyses)
        for the proper name in the list of forms at the current position, if
        applicable. If there is no proper name at the beginning of the tokens
        deque, return (None, None).

        @param forms_in: a deque of forms tokens
        @return: (form, tagged lemmas list) or (None, None)
        """
        # try the longest match first, then progressively shorter ones
        for test_len in xrange(min(self._sf_max_len, len(forms_in)), 0, -1):
            # test the string, handle number placeholders
            full_substr = [form for form in islice(forms_in, 0, test_len)]
            test_substr = tuple([
                '_' if re.match(r'^[0-9]+$', form) else form.lower()
                for form in full_substr
            ])
            if test_substr in self._sf_dict:
                tls = TaggedLemmas()
                nums = [
                    num for num in full_substr if re.match(r'^[0-9]+$', num)
                ]
                for lemma, tag in self._sf_dict[test_substr]:
                    tls.push_back(TaggedLemma())
                    for num in nums:
                        # replace number placeholders by actual values
                        lemma = re.sub(r'_', num, lemma, count=1)
                    tls[-1].lemma = lemma
                    tls[-1].tag = tag
                for _ in xrange(len(test_substr)):
                    # move on in the sentence
                    forms_in.popleft()
                return " ".join(full_substr), tls
        return None, None

    def analyze(self, sent):
        """Perform morphological analysis on the given sentence, preferring
        analyses from the list of surface forms.

        @param sent: input sentence (string)
        @return: list of (form, lemma, tag) tuples
        """
        self._tokenizer.setText(sent)
        analyzed = []
        while self._tokenizer.nextSentence(self._forms_buf, self._tokens_buf):
            forms_in = deque(self._forms_buf)
            self._forms_buf.resize(0)
            self._analyses_buf.resize(0)  # reset previous analyses
            while forms_in:
                form, analyses = self._get_surface_form_taggedlemmas(forms_in)
                if form:
                    # our custom analysis (matched a known surface form)
                    self._analyses_buf.push_back(analyses)
                else:
                    # Morphodita analysis
                    form = forms_in.popleft()
                    analyses = TaggedLemmas()
                    self._analyzer.analyze(form, 1, analyses)
                    for i in xrange(len(analyses)):
                        # shorten lemmas (must access the vector directly)
                        analyses[i].lemma = self._analyzer.rawLemma(
                            analyses[i].lemma)
                    self._analyses_buf.push_back(analyses)
                self._forms_buf.push_back(form)
            # tag according to the given analysis
            self._tagger.tagAnalyzed(self._forms_buf, self._analyses_buf,
                                     self._indices_buf)
            analyzed.extend([
                (f, a[idx].lemma, a[idx].tag) for (f, a, idx) in zip(
                    self._forms_buf, self._analyses_buf, self._indices_buf)
            ])
        return analyzed

    def process_files(self, input_text_file, input_da_file, skip_hello=False):
        """Load DAs & sentences, obtain abstraction instructions, and store it
        all in member variables (to be used later by writing methods).

        @param input_text_file: path to the input file with sentences
        @param input_da_file: path to the input file with DAs
        @param skip_hello: skip hello() DAs (remove them from the output?)
        """
        # load DAs
        self._das = []
        with codecs.open(input_da_file, 'r', encoding='UTF-8') as fh:
            for line in fh:
                self._das.append(DA.parse(line.strip()))
        # load & process sentences
        self._sents = []
        with codecs.open(input_text_file, 'r', encoding='UTF-8') as fh:
            for line in fh:
                self._sents.append(self.analyze(line.strip()))
        assert (len(self._das) == len(self._sents))
        # skip hello() DAs, if required
        if skip_hello:
            pos = 0
            while pos < len(self._das):
                da = self._das[pos]
                if len(da) == 1 and da[0].da_type == 'hello':
                    del self._das[pos]
                    del self._sents[pos]
                else:
                    pos += 1
        # delexicalize DAs and sentences
        self._delex_texts()
        self._delex_das()

    def buf_length(self):
        """Return the number of sentence-DA pairs currently loaded in the buffer."""
        return len(self._sents)

    def _write_plain(self, output_file, data_items):
        # one item per line, converted via unicode()
        with codecs.open(output_file, 'wb', encoding='UTF-8') as fh:
            for data_item in data_items:
                print >> fh, unicode(data_item)

    def _write_conll(self, output_file, data_items):
        # CoNLL-U-like morphology output: one token per line, blank line
        # between sentences; spaces inside tokens replaced by underscores
        with codecs.open(output_file, 'wb', encoding='UTF-8') as fh:
            for line in data_items:
                for idx, tok in enumerate(line, start=1):
                    print >> fh, "\t".join((str(idx),
                                            tok[0].replace(' ', '_'),
                                            tok[1].replace(' ', '_'),
                                            '_', tok[2], '_', '0',
                                            '_', '_', '_'))
                print >> fh

    def _write_interleaved(self, output_file, data_items):
        # "lemma tag lemma tag ..." per sentence line
        with codecs.open(output_file, 'wb', encoding='UTF-8') as fh:
            for line in data_items:
                for _, lemma, tag in line:
                    print >> fh, lemma.replace(' ', '_'), tag,
                print >> fh

    def write_text(self, data_file, out_format, subrange, delex=False):
        """Write output sentences for the given data subrange.

        @param data_file: output file name
        @param out_format: output format ('conll' -- CoNLL-U morphology, \
            'interleaved' -- lemma/tag interleaved, 'plain' -- plain text)
        @param subrange: data range (slice) from buffers to write
        @param delex: delexicalize? false by default
        """
        if delex:
            texts = self._delexed_texts[subrange]
        else:
            texts = self._sents[subrange]
        if out_format == 'interleaved':
            self._write_interleaved(data_file, texts)
        elif out_format == 'conll':
            self._write_conll(data_file, texts)
        else:
            self._write_plain(
                data_file,
                [" ".join([form for form, _, _ in sent]) for sent in texts])

    def write_absts(self, data_file, subrange):
        """Write delexicalization/abstraction instructions (for the given data
        subrange).

        @param data_file: output file name
        @param subrange: data range (slice) from buffers to write
        """
        self._write_plain(data_file, [
            "\t".join([unicode(abst_) for abst_ in abst])
            for abst in self._absts[subrange]
        ])

    def write_das(self, data_file, subrange, delex=False):
        """Write DAs (for the given subrange).

        @param data_file: output file name
        @param subrange: data range (slice) from buffers to write
        @param delex: delexicalize? false by default
        """
        if delex:
            das = self._delexed_das[subrange]
        else:
            das = self._das[subrange]
        self._write_plain(data_file, das)

    def _delex_das(self):
        """Delexicalize DAs in the buffers, save them separately."""
        out = []
        for da in self._das:
            delex_da = DA()
            for dai in da:
                # replace concrete values of abstracted slots by X-<slot>
                delex_dai = DAI(
                    dai.da_type, dai.slot, 'X-' + dai.slot if
                    (dai.value not in [None, 'none', 'dont_care'] and
                     dai.slot in self._abst_slots) else dai.value)
                delex_da.append(delex_dai)
            out.append(delex_da)
        self._delexed_das = out

    def _delex_texts(self):
        """Delexicalize texts in the buffers and save them separately in the
        member variables, along with the delexicalization instructions used
        for the operation."""
        self._delexed_texts = []
        self._absts = []
        for text_idx, (text, da) in enumerate(zip(self._sents, self._das)):
            delex_text = []
            absts = []
            # do the delexicalization, keep track of which slots we used
            for tok_idx, (form, lemma, tag) in enumerate(text):
                # abstract away from numbers
                abst_form = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', form.lower())
                abst_lemma = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', lemma)
                # try to find if the surface form belongs to some slot
                slot, value = self._rev_sf_dict.get(
                    (abst_form, abst_lemma, tag), (None, None))
                # if we found a slot, get back the numbers
                if slot:
                    for num_match in re.finditer(r'(?: |^)([0-9]+)(?: |$)',
                                                 lemma):
                        value = re.sub(r'_', num_match.group(1), value, count=1)
                # fall back to directly comparing against the DA value
                else:
                    slot = da.has_value(lemma)
                    value = lemma
                # if we found something, delexicalize it
                if (slot and slot in self._abst_slots and
                        da.value_for_slot(slot) not in
                        [None, 'none', 'dont_care']):
                    delex_text.append(('X-' + slot, 'X-' + slot, tag))
                    absts.append(Abst(slot, value, form, tok_idx, tok_idx + 1))
                # otherwise keep the token as it is
                else:
                    delex_text.append((form, lemma, tag))
            # fix coordinated delexicalized values
            self._delex_fix_coords(delex_text, da, absts)
            covered_slots = set([a.slot for a in absts])
            # check and warn if we left something non-delexicalized
            for dai in da:
                if (dai.slot in self._abst_slots and
                        dai.value not in [None, 'none', 'dont_care'] and
                        dai.slot not in covered_slots):
                    log_info(
                        "Cannot delexicalize slot %s at %d:\nDA: %s\nTx: %s\n" %
                        (dai.slot, text_idx, unicode(da), " ".join(
                            [form for form, _, _ in text])))
            # save the delexicalized text and the delexicalization instructions
            self._delexed_texts.append(delex_text)
            self._absts.append(absts)

    def _delex_fix_coords(self, text, da, absts):
        """Fix (merge) coordinated values in delexicalized text
        (X-slot and X-slot -> X-slot). Modifies the input list directly.

        @param text: list of form-lemma-tag tokens of the delexicalized sentence
        @return: None
        """
        idx = 0
        while idx < len(absts) - 1:
            # merge when two same-slot abstractions are separated by exactly
            # one token and the DA value itself is a coordination
            if (absts[idx].slot == absts[idx + 1].slot and
                    absts[idx].end + 1 == absts[idx + 1].start and
                    re.search(r' (and|or) ',
                              da.value_for_slot(absts[idx].slot))):
                # shift positions of all following abstractions
                for abst in absts[idx + 2:]:
                    abst.start -= 2
                    abst.end -= 2
                absts[idx].value = da.value_for_slot(absts[idx].slot)
                del text[absts[idx].end:absts[idx + 1].end]
                del absts[idx + 1]
            idx += 1
def __init__(self, tagger_model):
    """Load the MorphoDiTa tagger and set up tokenization/tagging buffers.

    @param tagger_model: path to the tagger model file
    """
    self.__tagger = Tagger.load(tagger_model)
    self.__tokenizer = self.__tagger.newTokenizer()
    # reusable buffers, filled by the tokenizer/tagger on each call
    self.__lemmas_buf = TaggedLemmas()
    self.__forms_buf = Forms()
    self.__tokens_buf = TokenRanges()
def pos_tagging(self, text: str, stem=False, preprocess=True):
    """
    Perform pos tagging of given text

    :param text: input text
    :param stem: use stem of word or just lemma
    :param preprocess: use preprocess
    :return: list of list of tagged words: List[List[WordPos]]
    """
    # reusable MorphoDiTa buffers
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    forms = Forms()
    sentences = []
    # keep a copy aligned with the token ranges for surface-text extraction
    vanilla_text = text
    # remove diacritic
    text = unidecode(text)
    if preprocess:
        # remove stop words (replaced by empty strings, spacing preserved)
        text = " ".join([
            w if w not in self.preprocesor.stop_words else ""
            for w in text.split()
        ])
        # lower all text
        text = text.lower()
        # replace smileys
        text = self.preprocesor.replace_emoji(text)
        vanilla_text = text
    # POS taging
    self.tokenizer.setText(text)
    while self.tokenizer.nextSentence(forms, tokens):
        sentence = []
        self.tagger.tag(forms, lemmas)
        for i in range(len(lemmas)):
            lemma = lemmas[i].lemma
            tag = lemmas[i].tag
            token = tokens[i]
            # original surface text of this token
            token_text = vanilla_text[token.start:token.start + token.length]
            # remove diacritic
            lemma = unidecode(lemma)
            # eng flag
            eng_word = False
            # '-' is not boundary token
            # boundary token (tag class "Z" = punctuation) closes the sentence
            if tag[0] == "Z" and lemma != "-":
                if not preprocess:
                    sentence.append(WordPos(lemma, tag, token_text))
                if sentence:
                    sentences.append(sentence)
                    sentence = []
                continue
            # dont stem english words
            if lemma.find("angl") != -1:
                eng_word = True
            # remove additional informations
            lemma = lemma.split("_")[0]
            lemma = re.sub(r'-\d*$', '', lemma)
            # Stem
            if stem and not eng_word:
                lemma = cz_stem(lemma)
            # NOTE(review): precedence makes this
            # (lemma and not preprocess) or len(lemma) > 2, which evaluates the
            # same as lemma and (not preprocess or len(lemma) > 2) for all
            # inputs; parenthesize if the intent is ever changed
            if lemma and not preprocess or len(lemma) > 2:
                sentence.append(WordPos(lemma, tag, token_text))
        if sentence:
            sentences.append(sentence)
    return sentences