Example #1
    def __init__(self, tagger_model, abst_slots):
        self._tagger = Tagger.load(tagger_model)
        self._analyzer = self._tagger.getMorpho()
        self._tokenizer = self._tagger.newTokenizer()
        self._abst_slots = set(abst_slots.split(','))

        self._forms_buf = Forms()
        self._tokens_buf = TokenRanges()
        self._analyses_buf = Analyses()
        self._indices_buf = Indices()

        self._sf_dict = {}
        self._sf_max_len = 0
Example #2
def lemmatize(file, output_file):
    morphodita_model = os.path.join(
        dir_cur, 'czech-morfflex-pdt-131112-raw_lemmas.tagger-best_accuracy')
    tagger = Tagger.load(morphodita_model)
    assert tagger
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    tokenizer = tagger.newTokenizer()
    assert tokenizer
    with open_gz(output_file, 'w') as out, open_gz(file) as f:
        for line in f:
            tokenizer.setText(line)
            lemmatized = []
            while tokenizer.nextSentence(forms, tokens):
                tagger.tag(forms, lemmas)
                # collect "lemma___tag" tokens for every sentence on this line
                lemmatized.extend("%s___%s" % (lemma.lemma.strip(), lemma.tag.strip())
                                  for lemma in lemmas)
            out.write(" ".join(lemmatized) + '\n')
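
For quick experiments without the gzip I/O, the same tagging loop can be driven from a plain string. A minimal sketch, assuming only that a MorphoDiTa tagger model file is available; lemmatize_string is an illustrative helper, not part of the example above:

from ufal.morphodita import Tagger, Forms, TaggedLemmas, TokenRanges

def lemmatize_string(text, tagger_model):
    """Return a list of 'lemma___tag' tokens for the given text (sketch)."""
    tagger = Tagger.load(tagger_model)
    assert tagger, "cannot load tagger model: %s" % tagger_model
    forms, lemmas, tokens = Forms(), TaggedLemmas(), TokenRanges()
    tokenizer = tagger.newTokenizer()
    tokenizer.setText(text)
    out = []
    while tokenizer.nextSentence(forms, tokens):
        tagger.tag(forms, lemmas)
        # one "lemma___tag" token per word, in sentence order
        out.extend("%s___%s" % (l.lemma.strip(), l.tag.strip()) for l in lemmas)
    return out
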
Example #3
 def __init__(self):
     self.morphodita_model = os.path.join(
         dir_cur, 'czech-morfflex-131112.tagger-fast')
     self.tagger = Tagger.load(self.morphodita_model)
     self.forms = Forms()
     self.lemmas = TaggedLemmas()
     self.tokens = TokenRanges()
     self.tokenizer = self.tagger.newTokenizer()
Example #4
 def __init__(self, tagger_model):
     if not os.path.isfile(tagger_model):
         raise IOError('File %s does not exist' % tagger_model)
     self._tagger = Tagger.load(tagger_model)
     self._tokenizer = self._tagger.newTokenizer()
     self._forms_buf = Forms()
     self._tokens_buf = TokenRanges()
     self._tags_buf = TaggedLemmas()
Example #5
    def __init__(self, model_file):
        """
        Instantiates Morphodita from a provided model file.

        :param model_file: Path to the model file.
        :type model_file: str
        """
        from ufal.morphodita import Tagger, Forms, TaggedLemmas, TokenRanges
        self.tagger = Tagger.load(model_file)
        self.forms = Forms()
        self.lemmas = TaggedLemmas()
        self.tokens = TokenRanges()
        self.tokenizer = self.tagger.newTokenizer()
Example #6
    def __init__(self, tagger_model, abst_slots):
        self._tagger = Tagger.load(tagger_model)
        self._analyzer = self._tagger.getMorpho()
        self._tokenizer = self._tagger.newTokenizer()
        self._abst_slots = set(abst_slots.split(','))

        self._forms_buf = Forms()
        self._tokens_buf = TokenRanges()
        self._analyses_buf = Analyses()
        self._indices_buf = Indices()

        self._sf_dict = {}
        self._rev_sf_dict = {}
        self._sf_max_len = 0
Example #7
 def create_lemmas(self, text):
     _forms = Forms()
     _lemmas = TaggedLemmas()
     _tokens = TokenRanges()
     self.tokenizer.setText(text)
     lemmas = []
     while self.tokenizer.nextSentence(_forms, _tokens):
         self.tagger.tag(_forms, _lemmas)
         for i in range(len(_lemmas)):
             lemma = _lemmas[i]
             token = _tokens[i]
             form = _forms[i]
             lemmas.append(Lemma(lemma.lemma, lemma.tag, form))
     return lemmas
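
Examples #5 and #7 appear to come from the same small wrapper; a hedged usage sketch, assuming the class is called Morphodita (as its docstring in example #5 suggests) and that the Lemma objects returned by create_lemmas expose .lemma and .tag as in example #7; the model file name is a placeholder:

# illustrative usage; 'czech-morfflex-pdt.tagger' is a placeholder model path
analyzer = Morphodita('czech-morfflex-pdt.tagger')
for lemma in analyzer.create_lemmas(u'Praha je hlavní město České republiky.'):
    print(lemma.lemma, lemma.tag)
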
Example #8
def lemmatize_and_replace_entities(file, output_file):
    nametag_model = os.path.join(dir_cur, 'czech-cnec2.0-140304.ner')
    morphodita_model = os.path.join(dir_cur,
                                    'czech-morfflex-131112.tagger-fast')
    tagger = Tagger.load(morphodita_model)
    assert tagger
    ner = Ner.load(nametag_model)
    assert ner
    forms = Forms()
    lemmas = TaggedLemmas()
    tokens = TokenRanges()
    entities = NamedEntities()
    tokenizer = ner.newTokenizer()
    assert tokenizer
    with open_gz(output_file, 'w') as out, open_gz(file) as f:
        for line in f:
            tokenizer.setText(line)
            while tokenizer.nextSentence(forms, tokens):
                tagger.tag(forms, lemmas)
                ner.recognize(forms, entities)
                sorted_entities = sort_entities(entities)
                open_entities = []
                open_entities_type = []
                e = 0
                for i in range(len(tokens)):
                    lemma = lemmas[i]
                    token = tokens[i]
                    word = line[token.start:token.start + token.length]
                    while e < len(
                            sorted_entities) and sorted_entities[e].start == i:
                        open_entities.append(sorted_entities[e].start +
                                             sorted_entities[e].length - 1)
                        open_entities_type.append(sorted_entities[e].type)
                        e += 1
                    if len(open_entities) == 0:
                        out.write(str(lemma.lemma) + ' ')
                    else:
                        out.write("@!ENT!%s " % ('!'.join(open_entities_type)))
                    while open_entities and open_entities[-1] == i:
                        open_entities.pop()
                        open_entities_type.pop()
            out.write('\n')
Example #9
class Reader(object):

    def __init__(self, tagger_model, abst_slots):
        self._tagger = Tagger.load(tagger_model)
        self._analyzer = self._tagger.getMorpho()
        self._tokenizer = self._tagger.newTokenizer()
        self._abst_slots = set(abst_slots.split(','))

        self._forms_buf = Forms()
        self._tokens_buf = TokenRanges()
        self._analyses_buf = Analyses()
        self._indices_buf = Indices()

        self._sf_dict = {}
        self._rev_sf_dict = {}
        self._sf_max_len = 0

    def load_surface_forms(self, surface_forms_fname):
        """Load all proper name surface forms from a file."""
        with codecs.open(surface_forms_fname, 'rb', 'UTF-8') as fh:
            data = json.load(fh)
        for slot, values in data.items():
            for value in values.keys():
                for surface_form in values[value]:
                    lemma, form, tag = surface_form.split("\t")
                    form_toks = form.lower().split(" ")
                    if slot == 'street':  # add street number placeholders to addresses
                        lemma += ' _'
                        form_toks.append('_')
                    form_toks = tuple(form_toks)
                    self._sf_max_len = max((self._sf_max_len, len(form_toks)))
                    if form_toks not in self._sf_dict:
                        self._sf_dict[form_toks] = []
                    self._sf_dict[form_toks].append((lemma, tag))
                    self._rev_sf_dict[(form.lower(), lemma, tag)] = (slot, value)

    def _get_surface_form_taggedlemmas(self, forms_in):
        """Given a tokens deque, return the form & list of tagged lemmas (analyses)
        for the proper name in the list of forms at the current position, if applicable.
        If there is no proper name at the beginning of the tokens deque, return (None, None).

        @param forms_in: a deque of forms tokens
        @return: (form, tagged lemmas list) or (None, None)
        """
        for test_len in range(min(self._sf_max_len, len(forms_in)), 0, -1):
            # test the string, handle number placeholders
            full_substr = [form for form in islice(forms_in, 0, test_len)]
            test_substr = tuple(['_' if re.match(r'^[0-9]+$', form) else form.lower()
                                 for form in full_substr])
            if test_substr in self._sf_dict:
                tls = TaggedLemmas()
                nums = [num for num in full_substr if re.match(r'^[0-9]+$', num)]
                for lemma, tag in self._sf_dict[test_substr]:
                    tls.push_back(TaggedLemma())
                    for num in nums:  # replace number placeholders by actual values
                        lemma = re.sub(r'_', num, lemma, count=1)
                    tls[-1].lemma = lemma
                    tls[-1].tag = tag
                for _ in range(len(test_substr)):  # move on in the sentence
                    forms_in.popleft()
                return " ".join(full_substr), tls
        return None, None

    def analyze(self, sent):
        """Perform morphological analysis on the given sentence, preferring analyses from the
        list of surface forms. Return a list of tuples (form, lemma, tag)."""
        self._tokenizer.setText(sent)
        analyzed = []
        while self._tokenizer.nextSentence(self._forms_buf, self._tokens_buf):

            forms_in = deque(self._forms_buf)
            self._forms_buf.resize(0)
            self._analyses_buf.resize(0)  # reset previous analyses

            while forms_in:
                form, analyses = self._get_surface_form_taggedlemmas(forms_in)
                if form:
                    # our custom analysis
                    self._analyses_buf.push_back(analyses)
                else:
                    # Morphodita analysis
                    form = forms_in.popleft()
                    analyses = TaggedLemmas()
                    self._analyzer.analyze(form, 1, analyses)
                    for i in range(len(analyses)):  # shorten lemmas (must access the vector directly)
                        analyses[i].lemma = self._analyzer.rawLemma(analyses[i].lemma)
                    self._analyses_buf.push_back(analyses)

                self._forms_buf.push_back(form)

            # tag according to the given analysis
            self._tagger.tagAnalyzed(self._forms_buf, self._analyses_buf, self._indices_buf)
            analyzed.extend([(f, a[idx].lemma, a[idx].tag)
                             for (f, a, idx)
                             in zip(self._forms_buf, self._analyses_buf, self._indices_buf)])
        return analyzed

    def process_dataset(self, input_data):
        """Load DAs & sentences, obtain abstraction instructions, and store it all in member
        variables (to be used later by writing methods).
        @param input_data: path to the input JSON file with the data
        """
        # load data from JSON
        self._das = []
        self._texts = []
        with codecs.open(input_data, 'r', encoding='UTF-8') as fh:
            data = json.load(fh)
            for inst in data:
                da = DA.parse_cambridge_da(inst['da'])
                da.sort()
                self._das.append(da)
                self._texts.append(self.analyze(inst['text']))

        # delexicalize DAs and sentences
        self._create_delex_texts()
        self._create_delex_das()

        # return the result
        out = []
        for da, text, delex_da, delex_text, abst in zip(self._das, self._texts, self._delex_das, self._delex_texts, self._absts):
            out.append(Inst(da, text, delex_da, delex_text, abst))
        return out

    def _create_delex_texts(self):
        """Delexicalize texts in the buffers and save them separately in the member variables,
        along with the delexicalization instructions used for the operation."""
        self._delex_texts = []
        self._absts = []
        for text_idx, (text, da) in enumerate(zip(self._texts, self._das)):
            delex_text = []
            absts = []
            # do the delexicalization, keep track of which slots we used
            for tok_idx, (form, lemma, tag) in enumerate(text):
                # abstract away from numbers
                abst_form = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', form.lower())
                abst_lemma = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', lemma)
                # try to find if the surface form belongs to some slot
                slot, value = self._rev_sf_dict.get((abst_form, abst_lemma, tag), (None, None))
                # if we found a slot, get back the numbers
                if slot:
                    for num_match in re.finditer(r'(?: |^)([0-9]+)(?: |$)', lemma):
                        value = re.sub(r'_', num_match.group(1), value, count=1)
                # fall back to directly comparing against the DA value
                else:
                    slot = da.has_value(lemma)
                    value = lemma

                # if we found something, delexicalize it (check if the value corresponds to the DA!)
                if (slot and slot in self._abst_slots and
                        da.value_for_slot(slot) not in [None, 'none', 'dont_care'] and
                        value in da.value_for_slot(slot)):
                    delex_text.append(('X-' + slot, 'X-' + slot, tag))
                    absts.append(Abst(slot, value, form, tok_idx, tok_idx + 1))
                # otherwise keep the token as it is
                else:
                    delex_text.append((form, lemma, tag))
            # fix coordinated delexicalized values
            self._delex_fix_coords(delex_text, da, absts)
            covered_slots = set([a.slot for a in absts])
            # check and warn if we left something non-delexicalized
            for dai in da:
                if (dai.slot in self._abst_slots and
                        dai.value not in [None, 'none', 'dont_care'] and
                        dai.slot not in covered_slots):
                    log_info("Cannot delexicalize slot  %s  at %d:\nDA: %s\nTx: %s\n" %
                             (dai.slot,
                              text_idx,
                              str(da),
                              " ".join([form for form, _, _ in text])))
            # save the delexicalized text and the delexicalization instructions
            self._delex_texts.append(delex_text)
            self._absts.append(absts)

    def _delex_fix_coords(self, text, da, absts):
        """Fix (merge) coordinated values in delexicalized text (X-slot and X-slot -> X-slot).
        Modifies the input list directly.

        @param text: list of form-lemma-tag tokens of the delexicalized sentence
        @return: None
        """
        idx = 0
        while idx < len(absts) - 1:
            if (absts[idx].slot == absts[idx+1].slot and
                    absts[idx].end + 1 == absts[idx + 1].start and
                    re.search(r' (and|or) ', da.value_for_slot(absts[idx].slot))):
                for abst in absts[idx+2:]:
                    abst.start -= 2
                    abst.end -= 2
                absts[idx].value = da.value_for_slot(absts[idx].slot)
                del text[absts[idx].end:absts[idx + 1].end]
                del absts[idx + 1]
            idx += 1

    def _create_delex_das(self):
        """Delexicalize DAs in the buffers, save them separately."""
        out = []
        for da in self._das:
            delex_da = DA()
            for dai in da:
                delex_dai = DAI(dai.da_type, dai.slot,
                                'X-' + dai.slot
                                if (dai.value not in [None, 'none', 'dont_care'] and
                                    dai.slot in self._abst_slots)
                                else dai.value)
                delex_da.append(delex_dai)
            out.append(delex_da)
        self._delex_das = out
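
A possible driver for the Reader class above, assuming a Czech MorphoDiTa tagger model, a surface-form JSON in the tab-separated format expected by load_surface_forms, and a dataset JSON with 'da' and 'text' fields as read by process_dataset; all file names and slot names are placeholders:

# illustrative driver; file names and the slot list are placeholders
reader = Reader('czech-morfflex-pdt.tagger', 'name,area,food,price_range')
reader.load_surface_forms('surface_forms.json')
for form, lemma, tag in reader.analyze(u'Chtěl bych levnou restauraci v centru.'):
    print(form, lemma, tag)
insts = reader.process_dataset('dataset.json')  # list of Inst(da, text, delex_da, delex_text, abst)
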
Example #10
class MorphoAnalyzer(object):

    def __init__(self, tagger_model, abst_slots):
        self._tagger = Tagger.load(tagger_model)
        self._analyzer = self._tagger.getMorpho()
        self._tokenizer = self._tagger.newTokenizer()
        self._abst_slots = set(abst_slots.split(','))

        self._forms_buf = Forms()
        self._tokens_buf = TokenRanges()
        self._analyses_buf = Analyses()
        self._indices_buf = Indices()

        self._sf_dict = {}
        self._rev_sf_dict = {}
        self._sf_max_len = 0

    def load_surface_forms(self, surface_forms_fname):
        """Load all proper name surface forms from a file."""
        with codecs.open(surface_forms_fname, 'rb', 'UTF-8') as fh:
            data = json.load(fh)
        for slot, values in data.iteritems():
            for value in values.keys():
                for surface_form in values[value]:
                    lemma, form, tag = surface_form.split("\t")
                    form_toks = form.lower().split(" ")
                    if slot == 'street':  # add street number placeholders to addresses
                        lemma += ' _'
                        form_toks.append('_')
                    form_toks = tuple(form_toks)
                    self._sf_max_len = max((self._sf_max_len, len(form_toks)))
                    if form_toks not in self._sf_dict:
                        self._sf_dict[form_toks] = []
                    self._sf_dict[form_toks].append((lemma, tag))
                    self._rev_sf_dict[(form.lower(), lemma, tag)] = (slot, value)

    def _get_surface_form_taggedlemmas(self, forms_in):
        """Given a tokens deque, return the form & list of tagged lemmas (analyses)
        for the proper name in the list of forms at the current position, if applicable.
        If there is no proper name at the beginning of the tokens deque, return (None, None).

        @param forms_in: a deque of forms tokens
        @return: (form, tagged lemmas list) or (None, None)
        """
        for test_len in xrange(min(self._sf_max_len, len(forms_in)), 0, -1):
            # test the string, handle number placeholders
            full_substr = [form for form in islice(forms_in, 0, test_len)]
            test_substr = tuple(['_' if re.match(r'^[0-9]+$', form) else form.lower()
                                 for form in full_substr])
            if test_substr in self._sf_dict:
                tls = TaggedLemmas()
                nums = [num for num in full_substr if re.match(r'^[0-9]+$', num)]
                for lemma, tag in self._sf_dict[test_substr]:
                    tls.push_back(TaggedLemma())
                    for num in nums:  # replace number placeholders by actual values
                        lemma = re.sub(r'_', num, lemma, count=1)
                    tls[-1].lemma = lemma
                    tls[-1].tag = tag
                for _ in xrange(len(test_substr)):  # move on in the sentence
                    forms_in.popleft()
                return " ".join(full_substr), tls
        return None, None

    def analyze(self, sent):
        """Perform morphological analysis on the given sentence, preferring analyses from the
        list of surface forms. Return a list of tuples (form, lemma, tag)."""
        self._tokenizer.setText(sent)
        analyzed = []
        while self._tokenizer.nextSentence(self._forms_buf, self._tokens_buf):

            forms_in = deque(self._forms_buf)
            self._forms_buf.resize(0)
            self._analyses_buf.resize(0)  # reset previous analyses

            while forms_in:
                form, analyses = self._get_surface_form_taggedlemmas(forms_in)
                if form:
                    # our custom analysis
                    self._analyses_buf.push_back(analyses)
                else:
                    # Morphodita analysis
                    form = forms_in.popleft()
                    analyses = TaggedLemmas()
                    self._analyzer.analyze(form, 1, analyses)
                    for i in xrange(len(analyses)):  # shorten lemmas (must access the vector directly)
                        analyses[i].lemma = self._analyzer.rawLemma(analyses[i].lemma)
                    self._analyses_buf.push_back(analyses)

                self._forms_buf.push_back(form)

            # tag according to the given analysis
            self._tagger.tagAnalyzed(self._forms_buf, self._analyses_buf, self._indices_buf)
            analyzed.extend([(f, a[idx].lemma, a[idx].tag)
                             for (f, a, idx)
                             in zip(self._forms_buf, self._analyses_buf, self._indices_buf)])
        return analyzed

    def process_files(self, input_text_file, input_da_file, skip_hello=False):
        """Load DAs & sentences, obtain abstraction instructions, and store it all in member
        variables (to be used later by writing methods).
        @param input_text_file: path to the input file with sentences
        @param input_da_file: path to the input file with DAs
        @param skip_hello: skip hello() DAs (remove them from the output?)
        """
        # load DAs
        self._das = []
        with codecs.open(input_da_file, 'r', encoding='UTF-8') as fh:
            for line in fh:
                self._das.append(DA.parse(line.strip()))
        # load & process sentences
        self._sents = []
        with codecs.open(input_text_file, 'r', encoding='UTF-8') as fh:
            for line in fh:
                self._sents.append(self.analyze(line.strip()))
        assert(len(self._das) == len(self._sents))
        # skip hello() DAs, if required
        if skip_hello:
            pos = 0
            while pos < len(self._das):
                da = self._das[pos]
                if len(da) == 1 and da[0].da_type == 'hello':
                    del self._das[pos]
                    del self._sents[pos]
                else:
                    pos += 1
        # delexicalize DAs and sentences
        self._delex_texts()
        self._delex_das()

    def buf_length(self):
        """Return the number of sentence-DA pairs currently loaded in the buffer."""
        return len(self._sents)

    def _write_plain(self, output_file, data_items):
        with codecs.open(output_file, 'wb', encoding='UTF-8') as fh:
            for data_item in data_items:
                print >> fh, unicode(data_item)

    def _write_conll(self, output_file, data_items):
        with codecs.open(output_file, 'wb', encoding='UTF-8') as fh:
            for line in data_items:
                for idx, tok in enumerate(line, start=1):
                    print >> fh, "\t".join((str(idx),
                                            tok[0].replace(' ', '_'),
                                            tok[1].replace(' ', '_'),
                                            '_', tok[2], '_',
                                            '0', '_', '_', '_'))
                print >> fh

    def _write_interleaved(self, output_file, data_items):
        with codecs.open(output_file, 'wb', encoding='UTF-8') as fh:
            for line in data_items:
                for _, lemma, tag in line:
                    print >> fh, lemma.replace(' ', '_'), tag,
                print >> fh

    def write_text(self, data_file, out_format, subrange, delex=False):
        """Write output sentences for the given data subrange.
        @param data_file: output file name
        @param out_format: output format ('conll' -- CoNLL-U morphology, \
            'interleaved' -- lemma/tag interleaved, 'plain' -- plain text)
        @param subrange: data range (slice) from buffers to write
        @param delex: delexicalize? false by default
        """
        if delex:
            texts = self._delexed_texts[subrange]
        else:
            texts = self._sents[subrange]
        if out_format == 'interleaved':
            self._write_interleaved(data_file, texts)
        elif out_format == 'conll':
            self._write_conll(data_file, texts)
        else:
            self._write_plain(data_file, [" ".join([form for form, _, _ in sent])
                                          for sent in texts])

    def write_absts(self, data_file, subrange):
        """Write delexicalization/abstraction instructions (for the given data subrange).
        @param data_file: output file name
        @param subrange: data range (slice) from buffers to write
        """
        self._write_plain(data_file, ["\t".join([unicode(abst_) for abst_ in abst])
                                      for abst in self._absts[subrange]])

    def write_das(self, data_file, subrange, delex=False):
        """Write DAs (for the given subrange).
        @param data_file: output file name
        @param subrange: data range (slice) from buffers to write
        @param delex: delexicalize? false by default
        """
        if delex:
            das = self._delexed_das[subrange]
        else:
            das = self._das[subrange]
        self._write_plain(data_file, das)

    def _delex_das(self):
        """Delexicalize DAs in the buffers, save them separately."""
        out = []
        for da in self._das:
            delex_da = DA()
            for dai in da:
                delex_dai = DAI(dai.da_type, dai.slot,
                                'X-' + dai.slot
                                if (dai.value not in [None, 'none', 'dont_care'] and
                                    dai.slot in self._abst_slots)
                                else dai.value)
                delex_da.append(delex_dai)
            out.append(delex_da)
        self._delexed_das = out

    def _delex_texts(self):
        """Delexicalize texts in the buffers and save them separately in the member variables,
        along with the delexicalization instructions used for the operation."""
        self._delexed_texts = []
        self._absts = []
        for text_idx, (text, da) in enumerate(zip(self._sents, self._das)):
            delex_text = []
            absts = []
            # do the delexicalization, keep track of which slots we used
            for tok_idx, (form, lemma, tag) in enumerate(text):
                # abstract away from numbers
                abst_form = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', form.lower())
                abst_lemma = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', lemma)
                # try to find if the surface form belongs to some slot
                slot, value = self._rev_sf_dict.get((abst_form, abst_lemma, tag), (None, None))
                # if we found a slot, get back the numbers
                if slot:
                    for num_match in re.finditer(r'(?: |^)([0-9]+)(?: |$)', lemma):
                        value = re.sub(r'_', num_match.group(1), value, count=1)
                # fall back to directly comparing against the DA value
                else:
                    slot = da.has_value(lemma)
                    value = lemma

                # if we found something, delexicalize it
                if (slot and slot in self._abst_slots and
                        da.value_for_slot(slot) not in [None, 'none', 'dont_care']):
                    delex_text.append(('X-' + slot, 'X-' + slot, tag))
                    absts.append(Abst(slot, value, form, tok_idx, tok_idx + 1))
                # otherwise keep the token as it is
                else:
                    delex_text.append((form, lemma, tag))
            # fix coordinated delexicalized values
            self._delex_fix_coords(delex_text, da, absts)
            covered_slots = set([a.slot for a in absts])
            # check and warn if we left something non-delexicalized
            for dai in da:
                if (dai.slot in self._abst_slots and
                        dai.value not in [None, 'none', 'dont_care'] and
                        dai.slot not in covered_slots):
                    log_info("Cannot delexicalize slot  %s  at %d:\nDA: %s\nTx: %s\n" %
                             (dai.slot,
                              text_idx,
                              unicode(da),
                              " ".join([form for form, _, _ in text])))
            # save the delexicalized text and the delexicalization instructions
            self._delexed_texts.append(delex_text)
            self._absts.append(absts)

    def _delex_fix_coords(self, text, da, absts):
        """Fix (merge) coordinated values in delexicalized text (X-slot and X-slot -> X-slot).
        Modifies the input list directly.

        @param text: list of form-lemma-tag tokens of the delexicalized sentence
        @return: None
        """
        idx = 0
        while idx < len(absts) - 1:
            if (absts[idx].slot == absts[idx+1].slot and
                    absts[idx].end + 1 == absts[idx + 1].start and
                    re.search(r' (and|or) ', da.value_for_slot(absts[idx].slot))):
                for abst in absts[idx+2:]:
                    abst.start -= 2
                    abst.end -= 2
                absts[idx].value = da.value_for_slot(absts[idx].slot)
                del text[absts[idx].end:absts[idx + 1].end]
                del absts[idx + 1]
            idx += 1
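
Examples #10 and #11 target Python 2 (xrange, dict.iteritems, unicode, print >> fh). For comparison, a Python 3 sketch of the two simplest output helpers; the standalone function names are illustrative, not part of the original class:

import codecs

def write_plain(output_file, data_items):
    # Python 3 counterpart of _write_plain: one item per line
    with codecs.open(output_file, 'w', encoding='UTF-8') as fh:
        for data_item in data_items:
            print(str(data_item), file=fh)

def write_interleaved(output_file, data_items):
    # Python 3 counterpart of _write_interleaved: "lemma tag" pairs, one sentence per line
    with codecs.open(output_file, 'w', encoding='UTF-8') as fh:
        for line in data_items:
            print(" ".join("%s %s" % (lemma.replace(' ', '_'), tag)
                           for _, lemma, tag in line), file=fh)
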
Example #11
class MorphoAnalyzer(object):
    def __init__(self, tagger_model, abst_slots):
        self._tagger = Tagger.load(tagger_model)
        self._analyzer = self._tagger.getMorpho()
        self._tokenizer = self._tagger.newTokenizer()
        self._abst_slots = set(abst_slots.split(','))

        self._forms_buf = Forms()
        self._tokens_buf = TokenRanges()
        self._analyses_buf = Analyses()
        self._indices_buf = Indices()

        self._sf_dict = {}
        self._rev_sf_dict = {}
        self._sf_max_len = 0

    def load_surface_forms(self, surface_forms_fname):
        """Load all proper name surface forms from a file."""
        with codecs.open(surface_forms_fname, 'rb', 'UTF-8') as fh:
            data = json.load(fh)
        for slot, values in data.iteritems():
            for value in values.keys():
                for surface_form in values[value]:
                    lemma, form, tag = surface_form.split("\t")
                    form_toks = form.lower().split(" ")
                    if slot == 'street':  # add street number placeholders to addresses
                        lemma += ' _'
                        form_toks.append('_')
                    form_toks = tuple(form_toks)
                    self._sf_max_len = max((self._sf_max_len, len(form_toks)))
                    if form_toks not in self._sf_dict:
                        self._sf_dict[form_toks] = []
                    self._sf_dict[form_toks].append((lemma, tag))
                    self._rev_sf_dict[(form.lower(), lemma, tag)] = (slot,
                                                                     value)

    def _get_surface_form_taggedlemmas(self, forms_in):
        """Given a tokens deque, return the form & list of tagged lemmas (analyses)
        for the proper name in the list of forms at the current position, if applicable.
        If there is no proper name at the beginning of the tokens deque, return (None, None).

        @param forms_in: a deque of forms tokens
        @return: (form, tagged lemmas list) or (None, None)
        """
        for test_len in xrange(min(self._sf_max_len, len(forms_in)), 0, -1):
            # test the string, handle number placeholders
            full_substr = [form for form in islice(forms_in, 0, test_len)]
            test_substr = tuple([
                '_' if re.match(r'^[0-9]+$', form) else form.lower()
                for form in full_substr
            ])
            if test_substr in self._sf_dict:
                tls = TaggedLemmas()
                nums = [
                    num for num in full_substr if re.match(r'^[0-9]+$', num)
                ]
                for lemma, tag in self._sf_dict[test_substr]:
                    tls.push_back(TaggedLemma())
                    for num in nums:  # replace number placeholders by actual values
                        lemma = re.sub(r'_', num, lemma, count=1)
                    tls[-1].lemma = lemma
                    tls[-1].tag = tag
                for _ in xrange(len(test_substr)):  # move on in the sentence
                    forms_in.popleft()
                return " ".join(full_substr), tls
        return None, None

    def analyze(self, sent):
        """Perform morphological analysis on the given sentence, preferring analyses from the
        list of surface forms. Return a list of tuples (form, lemma, tag)."""
        self._tokenizer.setText(sent)
        analyzed = []
        while self._tokenizer.nextSentence(self._forms_buf, self._tokens_buf):

            forms_in = deque(self._forms_buf)
            self._forms_buf.resize(0)
            self._analyses_buf.resize(0)  # reset previous analyses

            while forms_in:
                form, analyses = self._get_surface_form_taggedlemmas(forms_in)
                if form:
                    # our custom analysis
                    self._analyses_buf.push_back(analyses)
                else:
                    # Morphodita analysis
                    form = forms_in.popleft()
                    analyses = TaggedLemmas()
                    self._analyzer.analyze(form, 1, analyses)
                    for i in xrange(
                            len(analyses)
                    ):  # shorten lemmas (must access the vector directly)
                        analyses[i].lemma = self._analyzer.rawLemma(
                            analyses[i].lemma)
                    self._analyses_buf.push_back(analyses)

                self._forms_buf.push_back(form)

            # tag according to the given analysis
            self._tagger.tagAnalyzed(self._forms_buf, self._analyses_buf,
                                     self._indices_buf)
            analyzed.extend([
                (f, a[idx].lemma, a[idx].tag) for (f, a, idx) in zip(
                    self._forms_buf, self._analyses_buf, self._indices_buf)
            ])
        return analyzed

    def process_files(self, input_text_file, input_da_file, skip_hello=False):
        """Load DAs & sentences, obtain abstraction instructions, and store it all in member
        variables (to be used later by writing methods).
        @param input_text_file: path to the input file with sentences
        @param input_da_file: path to the input file with DAs
        @param skip_hello: skip hello() DAs (remove them from the output?)
        """
        # load DAs
        self._das = []
        with codecs.open(input_da_file, 'r', encoding='UTF-8') as fh:
            for line in fh:
                self._das.append(DA.parse(line.strip()))
        # load & process sentences
        self._sents = []
        with codecs.open(input_text_file, 'r', encoding='UTF-8') as fh:
            for line in fh:
                self._sents.append(self.analyze(line.strip()))
        assert (len(self._das) == len(self._sents))
        # skip hello() DAs, if required
        if skip_hello:
            pos = 0
            while pos < len(self._das):
                da = self._das[pos]
                if len(da) == 1 and da[0].da_type == 'hello':
                    del self._das[pos]
                    del self._sents[pos]
                else:
                    pos += 1
        # delexicalize DAs and sentences
        self._delex_texts()
        self._delex_das()

    def buf_length(self):
        """Return the number of sentence-DA pairs currently loaded in the buffer."""
        return len(self._sents)

    def _write_plain(self, output_file, data_items):
        with codecs.open(output_file, 'wb', encoding='UTF-8') as fh:
            for data_item in data_items:
                print >> fh, unicode(data_item)

    def _write_conll(self, output_file, data_items):
        with codecs.open(output_file, 'wb', encoding='UTF-8') as fh:
            for line in data_items:
                for idx, tok in enumerate(line, start=1):
                    print >> fh, "\t".join((str(idx), tok[0].replace(' ', '_'),
                                            tok[1].replace(' ', '_'), '_',
                                            tok[2], '_', '0', '_', '_', '_'))
                print >> fh

    def _write_interleaved(self, output_file, data_items):
        with codecs.open(output_file, 'wb', encoding='UTF-8') as fh:
            for line in data_items:
                for _, lemma, tag in line:
                    print >> fh, lemma.replace(' ', '_'), tag,
                print >> fh

    def write_text(self, data_file, out_format, subrange, delex=False):
        """Write output sentences for the given data subrange.
        @param data_file: output file name
        @param out_format: output format ('conll' -- CoNLL-U morphology, \
            'interleaved' -- lemma/tag interleaved, 'plain' -- plain text)
        @param subrange: data range (slice) from buffers to write
        @param delex: delexicalize? false by default
        """
        if delex:
            texts = self._delexed_texts[subrange]
        else:
            texts = self._sents[subrange]
        if out_format == 'interleaved':
            self._write_interleaved(data_file, texts)
        elif out_format == 'conll':
            self._write_conll(data_file, texts)
        else:
            self._write_plain(
                data_file,
                [" ".join([form for form, _, _ in sent]) for sent in texts])

    def write_absts(self, data_file, subrange):
        """Write delexicalization/abstraction instructions (for the given data subrange).
        @param data_file: output file name
        @param subrange: data range (slice) from buffers to write
        """
        self._write_plain(data_file, [
            "\t".join([unicode(abst_) for abst_ in abst])
            for abst in self._absts[subrange]
        ])

    def write_das(self, data_file, subrange, delex=False):
        """Write DAs (for the given subrange).
        @param data_file: output file name
        @param subrange: data range (slice) from buffers to write
        @param delex: delexicalize? false by default
        """
        if delex:
            das = self._delexed_das[subrange]
        else:
            das = self._das[subrange]
        self._write_plain(data_file, das)

    def _delex_das(self):
        """Delexicalize DAs in the buffers, save them separately."""
        out = []
        for da in self._das:
            delex_da = DA()
            for dai in da:
                delex_dai = DAI(
                    dai.da_type, dai.slot, 'X-' + dai.slot if
                    (dai.value not in [None, 'none', 'dont_care']
                     and dai.slot in self._abst_slots) else dai.value)
                delex_da.append(delex_dai)
            out.append(delex_da)
        self._delexed_das = out

    def _delex_texts(self):
        """Delexicalize texts in the buffers and save them separately in the member variables,
        along with the delexicalization instructions used for the operation."""
        self._delexed_texts = []
        self._absts = []
        for text_idx, (text, da) in enumerate(zip(self._sents, self._das)):
            delex_text = []
            absts = []
            # do the delexicalization, keep track of which slots we used
            for tok_idx, (form, lemma, tag) in enumerate(text):
                # abstract away from numbers
                abst_form = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', form.lower())
                abst_lemma = re.sub(r'( |^)[0-9]+( |$)', r'\1_\2', lemma)
                # try to find if the surface form belongs to some slot
                slot, value = self._rev_sf_dict.get(
                    (abst_form, abst_lemma, tag), (None, None))
                # if we found a slot, get back the numbers
                if slot:
                    for num_match in re.finditer(r'(?: |^)([0-9]+)(?: |$)',
                                                 lemma):
                        value = re.sub(r'_',
                                       num_match.group(1),
                                       value,
                                       count=1)
                # fall back to directly comparing against the DA value
                else:
                    slot = da.has_value(lemma)
                    value = lemma

                # if we found something, delexicalize it
                if (slot and slot in self._abst_slots
                        and da.value_for_slot(slot)
                        not in [None, 'none', 'dont_care']):
                    delex_text.append(('X-' + slot, 'X-' + slot, tag))
                    absts.append(Abst(slot, value, form, tok_idx, tok_idx + 1))
                # otherwise keep the token as it is
                else:
                    delex_text.append((form, lemma, tag))
            # fix coordinated delexicalized values
            self._delex_fix_coords(delex_text, da, absts)
            covered_slots = set([a.slot for a in absts])
            # check and warn if we left something non-delexicalized
            for dai in da:
                if (dai.slot in self._abst_slots
                        and dai.value not in [None, 'none', 'dont_care']
                        and dai.slot not in covered_slots):
                    log_info(
                        "Cannot delexicalize slot  %s  at %d:\nDA: %s\nTx: %s\n"
                        % (dai.slot, text_idx, unicode(da), " ".join(
                            [form for form, _, _ in text])))
            # save the delexicalized text and the delexicalization instructions
            self._delexed_texts.append(delex_text)
            self._absts.append(absts)

    def _delex_fix_coords(self, text, da, absts):
        """Fix (merge) coordinated values in delexicalized text (X-slot and X-slot -> X-slot).
        Modifies the input list directly.

        @param text: list of form-lemma-tag tokens of the delexicalized sentence
        @return: None
        """
        idx = 0
        while idx < len(absts) - 1:
            if (absts[idx].slot == absts[idx + 1].slot
                    and absts[idx].end + 1 == absts[idx + 1].start
                    and re.search(r' (and|or) ',
                                  da.value_for_slot(absts[idx].slot))):
                for abst in absts[idx + 2:]:
                    abst.start -= 2
                    abst.end -= 2
                absts[idx].value = da.value_for_slot(absts[idx].slot)
                del text[absts[idx].end:absts[idx + 1].end]
                del absts[idx + 1]
            idx += 1
Example #12
    def __init__(self, tagger_model):
        self.__tagger = Tagger.load(tagger_model)
        self.__tokenizer = self.__tagger.newTokenizer()
        self.__forms_buf = Forms()
        self.__tokens_buf = TokenRanges()
        self.__lemmas_buf = TaggedLemmas()

    def pos_tagging(self, text: str, stem=False, preprocess=True):
        """
        Perform pos tagging of given text
        :param text: input text
        :param stem: use stem of word or just lemma
        :param preprocess: use preprocess
        :return: list of list of tagged words: List[List[WordPos]]
        """
        lemmas = TaggedLemmas()
        tokens = TokenRanges()
        forms = Forms()
        sentences = []

        vanilla_text = text
        # remove diacritics
        text = unidecode(text)
        if preprocess:
            # remove stop words
            text = " ".join([
                w if w not in self.preprocesor.stop_words else ""
                for w in text.split()
            ])
            # lower all text
            text = text.lower()
            # replace smileys
            text = self.preprocesor.replace_emoji(text)
            vanilla_text = text

        # POS tagging
        self.tokenizer.setText(text)
        while self.tokenizer.nextSentence(forms, tokens):
            sentence = []
            self.tagger.tag(forms, lemmas)
            for i in range(len(lemmas)):
                lemma = lemmas[i].lemma
                tag = lemmas[i].tag
                token = tokens[i]
                token_text = vanilla_text[token.start:token.start +
                                          token.length]
                # remove diacritics
                lemma = unidecode(lemma)
                # flag for English loanwords (not stemmed below)
                eng_word = False

                # punctuation (tag "Z") splits the sentence; '-' is not treated as a boundary
                if tag[0] == "Z" and lemma != "-":
                    if not preprocess:
                        sentence.append(WordPos(lemma, tag, token_text))
                    if sentence:
                        sentences.append(sentence)
                    sentence = []
                    continue
                # don't stem English loanwords (lemma annotation contains "angl")
                if lemma.find("angl") != -1:
                    eng_word = True

                # remove additional information (technical lemma suffixes)
                lemma = lemma.split("_")[0]
                lemma = re.sub(r'-\d*$', '', lemma)

                # Stem
                if stem and not eng_word:
                    lemma = cz_stem(lemma)
                if lemma and not preprocess or len(lemma) > 2:
                    sentence.append(WordPos(lemma, tag, token_text))
            if sentence:
                sentences.append(sentence)

        return sentences
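
The pos_tagging method above references self.tagger, self.tokenizer and self.preprocesor, which are not defined in the __init__ shown with it. A self-contained sketch of just its core tagging loop, without the preprocessing and stemming steps; simple_pos_tagging and the WordPos named tuple are assumptions for illustration:

from collections import namedtuple
from ufal.morphodita import Tagger, Forms, TaggedLemmas, TokenRanges

# assumed shape of the WordPos container used in example #12
WordPos = namedtuple('WordPos', 'lemma tag token')

def simple_pos_tagging(text, tagger_model):
    """Sketch: split text into sentences of (lemma, tag, surface token) triples."""
    tagger = Tagger.load(tagger_model)
    assert tagger, "cannot load tagger model: %s" % tagger_model
    forms, lemmas, tokens = Forms(), TaggedLemmas(), TokenRanges()
    tokenizer = tagger.newTokenizer()
    tokenizer.setText(text)
    sentences = []
    while tokenizer.nextSentence(forms, tokens):
        sentence = []
        tagger.tag(forms, lemmas)
        for i in range(len(lemmas)):
            raw_lemma = lemmas[i].lemma.split('_')[0]  # strip technical suffixes (cf. example #12)
            surface = text[tokens[i].start:tokens[i].start + tokens[i].length]
            sentence.append(WordPos(raw_lemma, lemmas[i].tag, surface))
        sentences.append(sentence)
    return sentences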