Example #1
    def before_file(self, fileobj, info={}):
        if not self.chain:
            ext = output_filetype_ext
            self.chain = filetype.printer_class(ext)("candidates")
            self.chain.handle_meta(Meta(None, None, None), info)
            self.candidate_factory = CandidateFactory()
            self.all_entities = collections.OrderedDict()

        self.chain.before_file(fileobj, info)
        self.current_corpus_name = re.sub(
            ".*/", "", re.sub("\.(xml|info)", "", fileobj.name))
Example #2
class GrepHandler(filetype.ChainedInputHandler):
    """For each entity in the file, match it against patterns
    and output it if the match was successful.
    """
    def before_file(self, fileobj, info={}):
        if not self.chain:
            self.chain = self.make_printer(info, output_filetype_ext)
            self.candidate_factory = CandidateFactory()
            self.global_dict = {}
        self.chain.before_file(fileobj, info)


    def handle_candidate(self, original_cand, info={}):
        matched = False
        for match_ngram, indexes in self._iter_matches(original_cand):
            matched = True
            # XXX not implementing global `annotate` for now
            if only_the_matching_subpart:
                cand = self.candidate_factory.make(match_ngram)
                self.chain.handle(cand, info)

        if matched and not only_the_matching_subpart:
            self.chain.handle(original_cand, info)


    def handle_sentence(self, sentence, info={}):
        matched = False
        for match_ngram, indexes in self._iter_matches(sentence):
            matched = True
            cand = self.candidate_factory.make(match_ngram)
            cand.add_sources("{}:{}".format(sentence.id_number,
                    ",".join(unicode(wn+1) for wn in indexes)))

            if only_the_matching_subpart:
                subsent = sentence.sub_sentence(indexes)
                self.chain.handle(subsent, info)
            elif annotate:
                mweo = MWEOccurrence(sentence, cand, indexes)
                sentence.mweoccurs.append(mweo)

        if matched and not only_the_matching_subpart:
            self.chain.handle(sentence, info)


    def _iter_matches(self, entity):
        for pattern in input_patterns:
            for (match_ngram, indexes) in pattern.matches(entity,
                    match_distance=match_distance, id_order=id_order,
                    overlapping=not non_overlapping):
                yield match_ngram, indexes
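
The handler above boils down to one output policy: emit each matching subpart when only_the_matching_subpart is set, otherwise emit the whole entity once if anything matched. A toy, self-contained sketch of that policy (the token lists and the bigram matcher are invented stand-ins for mwetoolkit entities and patterns):

def grep_entity(entity, iter_matches, emit, only_subpart=False):
    # Same shape as GrepHandler: per-subpart output, or one whole-entity
    # output when at least one pattern matched.
    matched = False
    for match_ngram, indexes in iter_matches(entity):
        matched = True
        if only_subpart:
            emit(match_ngram)
    if matched and not only_subpart:
        emit(entity)

def iter_matches(tokens):
    # Toy matcher: find the bigram "kick bucket" anywhere in the token list.
    for i in range(len(tokens) - 1):
        if tokens[i:i + 2] == ["kick", "bucket"]:
            yield tokens[i:i + 2], (i, i + 1)

out = []
grep_entity(["they", "kick", "bucket"], iter_matches, out.append)
assert out == [["they", "kick", "bucket"]]   # whole entity, since it matched
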
Example #3
    def before_file(self, fileobj, info={}):
        if not self.chain:
            ext = output_filetype_ext
            self.chain = filetype.printer_class(ext)("candidates")
            self.chain.handle_meta(Meta(None,None,None), info)
            self.candidate_factory = CandidateFactory()
            self.all_entities = collections.OrderedDict()

        self.chain.before_file(fileobj, info)
        self.current_corpus_name = re.sub(".*/", "",
                re.sub("\.(xml|info)", "", fileobj.name))
Example #4
class CandidatesGeneratorHandler(filetype.ChainedInputHandler):
    r"""An InputHandler that generates Candidates."""
    
    def before_file(self, fileobj, info={}):
        if not self.chain:
            ext = output_filetype_ext
            self.chain = filetype.printer_class(ext)("candidates")
            self.chain.handle_meta(Meta(None,None,None), info)
            self.candidate_factory = CandidateFactory()
            self.all_entities = collections.OrderedDict()

        self.chain.before_file(fileobj, info)
        self.current_corpus_name = re.sub(".*/", "",
                re.sub("\.(xml|info)", "", fileobj.name))


    def handle_sentence(self, sentence, info={}):
        """For each sentence in the corpus, generates all the candidates that match
        at least one pattern in the patterns file (-p option) or all the
        ngrams that are in the valid range (-n option).
        
        @param sentence A `Sentence` that is being read from the XML file.    
        """
        global patterns, ignore_pos, surface_instead_lemmas, \
               longest_pattern, shortest_pattern

        already_matched = set()

        for pattern in patterns:
            for (match_ngram, wordnums) in pattern.matches(sentence,
                    match_distance=match_distance, id_order=id_order,
                    overlapping=not non_overlapping):
                wordnums_string = ",".join(unicode(wn+1) for wn in wordnums)
                if wordnums_string in already_matched:
                    continue
                already_matched.add( wordnums_string )

                if ignore_pos :    
                    match_ngram.set_all( pos=WILDCARD )
                ngram_real = unicode(match_ngram.to_string())

                if( surface_instead_lemmas ) :
                    match_ngram.set_all( lemma=WILDCARD )
                else :
                    for word in match_ngram:
                        # (Still uses surface if lemma is unavailable)
                        if word.lemma != WILDCARD:
                            word.surface = WILDCARD

                ngram_basestring = unicode(match_ngram.to_string())
                info_for_ngram_basestring = self.all_entities.setdefault(ngram_basestring, {})
                (surfaces_dict, total_freq) = info_for_ngram_basestring \
                        .get(self.current_corpus_name, ({}, 0))
                freq_surface = surfaces_dict.setdefault(ngram_real, [])

                # Append the id of the source sentence. The number of items in
                # surfaces_dict[form] is the number of occurrences of that form.
                source_sent_id = str( sentence.id_number ) + ":" + wordnums_string
                surfaces_dict[ ngram_real ].append( source_sent_id )
                info_for_ngram_basestring[self.current_corpus_name] \
                        = (surfaces_dict, total_freq + 1)


    def finish(self):
        self.print_candidates(self.chain)
        self.chain.finish()


    def print_candidates(self, chain):
        """Prints a XML file (mwetoolkit-candidates.dtd) from a temporary 
        candidates file generated by the treat_sentence callback function. 
        Repeated candidates are not printed several times: instead, each base 
        form has a joint frequency of the candidate in the corpus. Since the
        new version of the "count.py" script, this initial frequency is only
        printed if you explicitely ask to do it through the -f option.

        @param filename: The file name of the corpus from which we generate the
        candidates.
        """
        global print_cand_freq, print_source
        verbose("Outputting candidates file...")
        for ngram_basestring, info in self.all_entities.iteritems() :
            cand = self.candidate_factory.make()
            cand.from_string(ngram_basestring)
            for corpus_name, (surface_dict, total_freq) in info.iteritems():
                if print_cand_freq :
                   freq = Frequency( corpus_name, total_freq )
                   cand.add_frequency( freq )
                for occur_string in surface_dict.keys() :
                    occur_form = Ngram( None, None )
                    occur_form.from_string(occur_string)
                    sources = surface_dict[occur_string]
                    freq_value = len(sources)
                    freq = Frequency( corpus_name, freq_value )
                    occur_form.add_frequency( freq )
                    if print_source:
                        occur_form.add_sources(sources)
                    cand.add_occur( occur_form )
            chain.handle_candidate(cand, info)
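
handle_sentence above accumulates everything into self.all_entities before anything is printed. A sketch of the shape that dictionary takes after two hypothetical matches (the key strings are simplified stand-ins for what Ngram.to_string() actually returns):

import collections

all_entities = collections.OrderedDict()
all_entities["kick bucket"] = {          # base form (lemmas, POS wildcarded)
    "bnc": (                             # self.current_corpus_name
        {"kicked bucket": ["3:1,2"],     # surface form -> "sent_id:positions"
         "kicks bucket":  ["7:4,5"]},
        2,                               # total_freq: matches in this corpus
    ),
}
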
Example #5
class CandidatesGeneratorHandler(filetype.ChainedInputHandler):
    r"""An InputHandler that generates Candidates."""
    def before_file(self, fileobj, info={}):
        if not self.chain:
            ext = output_filetype_ext
            self.chain = filetype.printer_class(ext)("candidates")
            self.chain.handle_meta(Meta(None, None, None), info)
            self.candidate_factory = CandidateFactory()
            self.all_entities = collections.OrderedDict()

        self.chain.before_file(fileobj, info)
        self.current_corpus_name = re.sub(
            ".*/", "", re.sub("\.(xml|info)", "", fileobj.name))

    def handle_sentence(self, sentence, info={}):
        """For each sentence in the corpus, generates all the candidates that match
        at least one pattern in the patterns file (-p option) or all the
        ngrams that are in the valid range (-n option).
        
        @param sentence A `Sentence` that is being read from the XML file.    
        """
        global patterns, ignore_pos, surface_instead_lemmas, \
               longest_pattern, shortest_pattern

        already_matched = set()

        for pattern in patterns:
            for (match_ngram,
                 wordnums) in pattern.matches(sentence,
                                              match_distance=match_distance,
                                              id_order=id_order,
                                              overlapping=not non_overlapping):
                wordnums_string = ",".join(unicode(wn + 1) for wn in wordnums)
                if wordnums_string in already_matched:
                    continue
                already_matched.add(wordnums_string)

                if ignore_pos:
                    match_ngram.set_all(pos=WILDCARD)
                ngram_real = unicode(match_ngram.to_string())

                if (surface_instead_lemmas):
                    match_ngram.set_all(lemma=WILDCARD)
                else:
                    for word in match_ngram:
                        # (Still uses surface if lemma is unavailable)
                        if word.lemma != WILDCARD:
                            word.surface = WILDCARD

                ngram_basestring = unicode(match_ngram.to_string())
                info_for_ngram_basestring = self.all_entities.setdefault(
                    ngram_basestring, {})
                (surfaces_dict, total_freq) = info_for_ngram_basestring \
                        .get(self.current_corpus_name, ({}, 0))
                freq_surface = surfaces_dict.setdefault(ngram_real, [])

                # Append the id of the source sentence. The number of items in
                # surfaces_dict[form] is the number of occurrences of that form.
                source_sent_id = str(
                    sentence.id_number) + ":" + wordnums_string
                surfaces_dict[ngram_real].append(source_sent_id)
                info_for_ngram_basestring[self.current_corpus_name] \
                        = (surfaces_dict, total_freq + 1)

    def finish(self):
        self.print_candidates(self.chain)
        self.chain.finish()

    def print_candidates(self, chain):
        """Prints a XML file (mwetoolkit-candidates.dtd) from a temporary 
        candidates file generated by the treat_sentence callback function. 
        Repeated candidates are not printed several times: instead, each base 
        form has a joint frequency of the candidate in the corpus. Since the
        new version of the "count.py" script, this initial frequency is only
        printed if you explicitely ask to do it through the -f option.

        @param filename: The file name of the corpus from which we generate the
        candidates.
        """
        global print_cand_freq, print_source
        verbose("Outputting candidates file...")
        for ngram_basestring, info in self.all_entities.iteritems():
            cand = self.candidate_factory.make()
            cand.from_string(ngram_basestring)
            for corpus_name, (surface_dict, total_freq) in info.iteritems():
                if print_cand_freq:
                    freq = Frequency(corpus_name, total_freq)
                    cand.add_frequency(freq)
                for occur_string in surface_dict.keys():
                    occur_form = Ngram(None, None)
                    occur_form.from_string(occur_string)
                    sources = surface_dict[occur_string]
                    freq_value = len(sources)
                    freq = Frequency(corpus_name, freq_value)
                    occur_form.add_frequency(freq)
                    if print_source:
                        occur_form.add_sources(sources)
                    cand.add_occur(occur_form)
            chain.handle_candidate(cand, info)
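
print_candidates then emits one candidate per base form; a self-contained sketch of how its frequency numbers relate, using hypothetical data:

surface_dict = {"kicked bucket": ["3:1,2", "9:2,3"],   # 2 occurrences
                "kicks bucket":  ["7:4,5"]}            # 1 occurrence
total_freq = 3

# handle_sentence appends one source id and adds 1 to total_freq per match,
# so the candidate-level count equals the sum of the per-surface counts.
assert total_freq == sum(len(srcs) for srcs in surface_dict.values())
# With -f: Frequency("bnc", 3) on the candidate.
# Per surface form: Frequency("bnc", 2) and Frequency("bnc", 1), plus the
# source sentence ids when print_source is set.
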
Example #6
 def __init__(self, *args, **kwargs):
     super(NGramCounterHandler, self).__init__(*args, **kwargs)
     self.candidate_factory = CandidateFactory()
     self.chain = None
Example #7
class NGramCounterHandler(filetype.InputHandler):
    def __init__(self, *args, **kwargs):
        super(NGramCounterHandler, self).__init__(*args, **kwargs)
        self.candidate_factory = CandidateFactory()
        self.chain = None

    def handle_sentence(self, sentence, info={}):
        """Count all ngrams being considered in the sentence."""
        global corpus_size

        # 'shelve' does not speak Unicode; we must convert Unicode strings back to
        # plain bytestrings to use them as keys.
        words = [getattr(w, base_attr).encode('utf-8') for w in sentence]

        for ngram_size in range(1, max_ngram + 2):
            for i in range(len(words) - ngram_size + 1):
                ngram = words[i : i+ngram_size]
                ngram_key = key(ngram)
                count = ngram_counts.get(ngram_key, 0)
                ngram_counts[ngram_key] = count + 1
                selected_candidates[ngram_key] = True

        corpus_size += len(words)

    
    def before_file(self, fileobj, info={}):
        if self.chain is None:
            self.chain = self.make_printer(info, None)
            self.chain.before_file(fileobj, info)
            m = Meta(None,None,None)
            m.add_corpus_size(CorpusSize("corpus", corpus_size))
            m.add_meta_feat(MetaFeat("glue", "real"))
            self.chain.handle_meta(m)

    def after_file(self, fileobj, info={}):
        global corpus_size_f
        corpus_size_f = float(corpus_size)
        verbose("Selecting ngrams through LocalMaxs...")
        self.localmaxs()
        verbose("Outputting candidates file...")

        for ngram_key in selected_candidates:
            if selected_candidates[ngram_key] and ngram_counts[ngram_key] >= min_frequency:
                self.dump_ngram(ngram_key, None)
        self.chain.after_file(fileobj, info)


    def localmaxs(self):
        """The LocalMaxs algorithm. Check whether each of the extracted
        ngrams is a local maximum in terms of glue value.
        """
        for ngram_key in ngram_counts:
            ngram = unkey(ngram_key)
            if len(ngram) >= min_ngram and len(ngram) <= max_ngram + 1:
                left = ngram[:-1]
                right = ngram[1:]
                this_glue = glue(ngram)

                for subgram in [left, right]:
                    subglue = glue(subgram)
                    subkey = key(subgram)
                    if this_glue < subglue:
                        selected_candidates[ngram_key] = False
                    elif subglue < this_glue:
                        selected_candidates[subkey] = False
            else:
                selected_candidates[ngram_key] = False


    def dump_ngram(self, ngram_key, cand_id=None):
        """Print an ngram as XML."""
        ngram = unkey(ngram_key)
        cand = self.candidate_factory.make(id_number=cand_id)
        for value in ngram:
            word = Word(WILDCARD, WILDCARD, WILDCARD, WILDCARD)
            setattr(word, base_attr, value.decode('utf-8'))
            cand.append(word)
        freq = Frequency('corpus', ngram_counts[ngram_key])
        cand.add_frequency(freq)
        cand.add_feat(Feature('glue', glue(ngram)))
        self.chain.handle_candidate(cand)
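
localmaxs() keeps an ngram only if neither of its (n-1)-gram neighbours has a strictly higher glue value. A self-contained toy version of that test, with a crude stand-in for glue() (the real glue function is defined elsewhere in the script and is not shown here):

counts = {("new",): 10, ("york",): 9, ("new", "york"): 8, ("in",): 30}

def glue(ngram):
    # Toy association score (SCP-like for bigrams); only illustrative.
    if len(ngram) == 1:
        return 0.0
    left, right = ngram[:-1], ngram[1:]
    return counts.get(ngram, 0) ** 2 / float(counts.get(left, 1) * counts.get(right, 1))

# ("new", "york") survives the LocalMaxs test because its glue is not lower
# than that of either of its (n-1)-grams:
print(glue(("new", "york")) >= max(glue(("new",)), glue(("york",))))   # True -> kept
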
Example #8
 def before_file(self, fileobj, info={}):
     if not self.chain:
         self.chain = self.make_printer(info, output_filetype_ext)
         self.candidate_factory = CandidateFactory()
         self.global_dict = {}
     self.chain.before_file(fileobj, info)
Example #9
 def before_file(self, fileobj, info={}):
     if not self.chain:
         self.chain = self.make_printer(info, None)
         self.candidate_factory = CandidateFactory()
     self.chain.before_file(fileobj, info)
Example #10
class EvaluatorHandler(filetype.ChainedInputHandler):
    def before_file(self, fileobj, info={}):
        if not self.chain:
            self.chain = self.make_printer(info, None)
            self.candidate_factory = CandidateFactory()
        self.chain.before_file(fileobj, info)

    def handle_meta(self, meta, info={}) :
        """Adds new meta-TP class corresponding to the evaluation of the candidate
        list according to a reference gold standard. Automatic evaluation is
        2-class only, the class values are "True" and "False" for true and
        false positives.

        @param meta The `Meta` header that is being read from the XML file.
        """
        global gs_name
        meta.add_meta_tpclass( MetaTPClass( gs_name, "{True,False}" ) )
        self.chain.handle_meta(meta)


    def handle_candidate(self, candidate_i, info={}) :
        """For each candidate, verifies whether it is contained in the reference
        list (in which case it is a *True* positive) or else, it is not in the
        reference list (in which case it is a *False* positive, i.e. a random
        ngram that does not constitute a MWE).

        @param candidate_i The `Candidate` that is being read from the XML file.
        """
        global ignore_pos
        global gs_name
        global ignore_case
        global entity_counter
        global tp_counter
        global pre_gs
        global lemma_or_surface
        global fuzzy_pre_gs

        true_positive = False
        #pdb.set_trace()
        candidate = self.candidate_factory.make()
        for w in candidate_i :
            copy_w = Word( w.surface, w.lemma, w.pos, w.syn)
            candidate.append( copy_w )    
        
        if ignore_pos :
            candidate.set_all( pos=WILDCARD )     # reference has type Pattern
        pre_gs_key = candidate.to_string()
        if ignore_case :
            pre_gs_key = pre_gs_key.lower()
        entries_to_check = pre_gs.get( pre_gs_key, [] )

        if lemma_or_surface:
            entries_to_check += fuzzy_pre_gs.get(WORD_SEPARATOR.join([w.lemma for w in candidate]), [])
            entries_to_check += fuzzy_pre_gs.get(WORD_SEPARATOR.join([w.surface for w in candidate]), [])

        for gold_entry in entries_to_check :
            if gold_entry.match( candidate, ignore_case=ignore_case, lemma_or_surface=lemma_or_surface ) :
                true_positive = True
                break # Stop at first positive match

        if true_positive :
            candidate_i.add_tpclass( TPClass( gs_name, "True" ) )
            tp_counter = tp_counter + 1
        else :
            candidate_i.add_tpclass( TPClass( gs_name, "False" ) )
        self.chain.handle_candidate(candidate_i, info)
        entity_counter += 1


    def finish(self):
        precision = float( tp_counter ) / float( entity_counter )
        recall = float( tp_counter ) / float( ref_counter )
        if precision + recall > 0 :
            fmeasure =  ( 2 * precision * recall) / ( precision + recall )
        else :
            fmeasure = 0.0

        footer = """\
            ====================
            Nb. of true positives: {tp}
            Nb. of candidates: {ca}
            Nb. of references: {refs}
            Precision: {p:.6f}
            Recall: {r:.6f}
            F-measure: {f:.6f}
            ===================="""
        footer = footer.format(tp=tp_counter, ca=entity_counter,
                refs=ref_counter, p=precision, r=recall, f=fmeasure)
        footer = textwrap.dedent(footer)
        self.chain.handle_comment(footer)
        super(EvaluatorHandler, self).finish()
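
The footer arithmetic in finish() is ordinary precision/recall/F-measure; a quick self-contained check with hypothetical counts:

tp_counter, entity_counter, ref_counter = 30, 50, 60
precision = float(tp_counter) / float(entity_counter)        # 0.6
recall = float(tp_counter) / float(ref_counter)               # 0.5
fmeasure = (2 * precision * recall) / (precision + recall)    # ~0.545455
print("{0:.6f} {1:.6f} {2:.6f}".format(precision, recall, fmeasure))
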
Example #11
 def __init__(self, *args, **kwargs):
     super(NGramCounterHandler, self).__init__(*args, **kwargs)
     self.candidate_factory = CandidateFactory()
     self.chain = None
Example #12
class NGramCounterHandler(filetype.InputHandler):
    def __init__(self, *args, **kwargs):
        super(NGramCounterHandler, self).__init__(*args, **kwargs)
        self.candidate_factory = CandidateFactory()
        self.chain = None

    def handle_sentence(self, sentence, info={}):
        """Count all ngrams being considered in the sentence."""
        global corpus_size

        # 'shelve' does not speak Unicode; we must convert Unicode strings back to
        # plain bytestrings to use them as keys.
        words = [getattr(w, base_attr).encode('utf-8') for w in sentence]

        for ngram_size in range(1, max_ngram + 2):
            for i in range(len(words) - ngram_size + 1):
                ngram = words[i:i + ngram_size]
                ngram_key = key(ngram)
                count = ngram_counts.get(ngram_key, 0)
                ngram_counts[ngram_key] = count + 1
                selected_candidates[ngram_key] = True

        corpus_size += len(words)

    def before_file(self, fileobj, info={}):
        if self.chain is None:
            self.chain = self.make_printer(info, None)
            self.chain.before_file(fileobj, info)
            m = Meta(None, None, None)
            m.add_corpus_size(CorpusSize("corpus", corpus_size))
            m.add_meta_feat(MetaFeat("glue", "real"))
            self.chain.handle_meta(m)

    def after_file(self, fileobj, info={}):
        global corpus_size_f
        corpus_size_f = float(corpus_size)
        verbose("Selecting ngrams through LocalMaxs...")
        self.localmaxs()
        verbose("Outputting candidates file...")

        for ngram_key in selected_candidates:
            if (selected_candidates[ngram_key]
                    and ngram_counts[ngram_key] >= min_frequency):
                self.dump_ngram(ngram_key, None)
        self.chain.after_file(fileobj, info)

    def localmaxs(self):
        """The LocalMaxs algorithm. Check whether each of the extracted
        ngrams is a local maximum in terms of glue value.
        """
        for ngram_key in ngram_counts:
            ngram = unkey(ngram_key)
            if len(ngram) >= min_ngram and len(ngram) <= max_ngram + 1:
                left = ngram[:-1]
                right = ngram[1:]
                this_glue = glue(ngram)

                for subgram in [left, right]:
                    subglue = glue(subgram)
                    subkey = key(subgram)
                    if this_glue < subglue:
                        selected_candidates[ngram_key] = False
                    elif subglue < this_glue:
                        selected_candidates[subkey] = False
            else:
                selected_candidates[ngram_key] = False

    def dump_ngram(self, ngram_key, cand_id=None):
        """Print an ngram as XML."""
        ngram = unkey(ngram_key)
        cand = self.candidate_factory.make(id_number=cand_id)
        for value in ngram:
            word = Word(WILDCARD, WILDCARD, WILDCARD, WILDCARD)
            setattr(word, base_attr, value.decode('utf-8'))
            cand.append(word)
        freq = Frequency('corpus', ngram_counts[ngram_key])
        cand.add_frequency(freq)
        cand.add_feat(Feature('glue', glue(ngram)))
        self.chain.handle_candidate(cand)
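
The counter relies on key()/unkey() helpers defined elsewhere in the script. A plausible, purely hypothetical sketch is a reversible join over the bytestring tokens, so that each ngram can be used directly as a shelve key:

SEP = b"\x01"   # assumed separator, never occurring inside a token

def key(ngram):
    return SEP.join(ngram)

def unkey(ngram_key):
    return ngram_key.split(SEP)

assert unkey(key([b"new", b"york"])) == [b"new", b"york"]
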
Example #13
 def before_file(self, fileobj, info={}):
     if not self.chain:
         self.chain = self.make_printer(info, None)
         self.candidate_factory = CandidateFactory()
     self.chain.before_file(fileobj, info)
Example #14
class EvaluatorHandler(filetype.ChainedInputHandler):
    def before_file(self, fileobj, info={}):
        if not self.chain:
            self.chain = self.make_printer(info, None)
            self.candidate_factory = CandidateFactory()
        self.chain.before_file(fileobj, info)

    def handle_meta(self, meta, info={}):
        """Adds new meta-TP class corresponding to the evaluation of the candidate
        list according to a reference gold standard. Automatic evaluation is
        2-class only, the class values are "True" and "False" for true and
        false positives.

        @param meta The `Meta` header that is being read from the XML file.
        """
        global gs_name
        meta.add_meta_tpclass(MetaTPClass(gs_name, "{True,False}"))
        self.chain.handle_meta(meta)

    def handle_candidate(self, candidate_i, info={}):
        """For each candidate, verifies whether it is contained in the reference
        list (in which case it is a *True* positive) or else, it is not in the
        reference list (in which case it is a *False* positive, i.e. a random
        ngram that does not constitute a MWE).

        @param candidate_i The `Candidate` that is being read from the XML file.
        """
        global ignore_pos
        global gs_name
        global ignore_case
        global entity_counter
        global tp_counter
        global pre_gs
        global lemma_or_surface
        global fuzzy_pre_gs

        true_positive = False
        #pdb.set_trace()
        candidate = self.candidate_factory.make()
        for w in candidate_i:
            copy_w = Word(w.surface, w.lemma, w.pos, w.syn)
            candidate.append(copy_w)

        if ignore_pos:
            candidate.set_all(pos=WILDCARD)  # reference has type Pattern
        pre_gs_key = candidate.to_string()
        if ignore_case:
            pre_gs_key = pre_gs_key.lower()
        entries_to_check = pre_gs.get(pre_gs_key, [])

        if lemma_or_surface:
            entries_to_check += fuzzy_pre_gs.get(
                WORD_SEPARATOR.join([w.lemma for w in candidate]), [])
            entries_to_check += fuzzy_pre_gs.get(
                WORD_SEPARATOR.join([w.surface for w in candidate]), [])

        for gold_entry in entries_to_check:
            if gold_entry.match(candidate,
                                ignore_case=ignore_case,
                                lemma_or_surface=lemma_or_surface):
                true_positive = True
                break  # Stop at first positive match

        if true_positive:
            candidate_i.add_tpclass(TPClass(gs_name, "True"))
            tp_counter = tp_counter + 1
        else:
            candidate_i.add_tpclass(TPClass(gs_name, "False"))
        self.chain.handle_candidate(candidate_i, info)
        entity_counter += 1

    def finish(self):
        precision = float(tp_counter) / float(entity_counter)
        recall = float(tp_counter) / float(ref_counter)
        if precision + recall > 0:
            fmeasure = (2 * precision * recall) / (precision + recall)
        else:
            fmeasure = 0.0

        footer = """\
            ====================
            Nb. of true positives: {tp}
            Nb. of candidates: {ca}
            Nb. of references: {refs}
            Precision: {p:.6f}
            Recall: {r:.6f}
            F-measure: {f:.6f}
            ===================="""
        footer = footer.format(tp=tp_counter,
                               ca=entity_counter,
                               refs=ref_counter,
                               p=precision,
                               r=recall,
                               f=fmeasure)
        footer = textwrap.dedent(footer)
        self.chain.handle_comment(footer)
        super(EvaluatorHandler, self).finish()
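
A note on the footer formatting in finish(): the template opens with a backslash after the triple quotes (suppressing the leading newline) and is passed through textwrap.dedent() after formatting, which strips the common indentation. A minimal standalone illustration:

import textwrap

footer = """\
    ====================
    Precision: {p:.6f}
    ===================="""
print(textwrap.dedent(footer.format(p=0.6)))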