Example n. 1
    def test(self, test_sequence, **kwargs):
        """
        Tests the HiddenMarkovModelTagger instance.

        :param test_sequence: a sequence of labeled test instances
        :type test_sequence: list(list)
        :param verbose: boolean flag indicating whether testing should
            print verbose output
        :type verbose: bool
        """

        def words(sent):
            return [word for (word, tag) in sent]

        def tags(sent):
            return [tag for (word, tag) in sent]

        test_sequence = LazyMap(self._transform.transform, test_sequence)
        predicted_sequence = LazyMap(self._tag, LazyMap(words, test_sequence))

        if kwargs.get('verbose', False):
            # This will be used again later for accuracy so there's no sense
            # in tagging it twice.
            test_sequence = list(test_sequence)
            predicted_sequence = list(predicted_sequence)

            for test_sent, predicted_sent in zip(test_sequence,
                                                 predicted_sequence):
                print('Test:',
                      ' '.join('%s/%s' % (str(token), str(tag))
                               for (token, tag) in test_sent))
                print()
                print('Untagged:',
                      ' '.join(str(token) for (token, tag) in test_sent))
                print()
                print('HMM-tagged:',
                      ' '.join('%s/%s' % (str(token), str(tag))
                               for (token, tag) in predicted_sent))
                print()
                print('Entropy:',
                      self.entropy([(token, None) for
                                    (token, tag) in predicted_sent]))
                print()
                print('-' * 60)

        test_tags = LazyConcatenation(LazyMap(tags, test_sequence))
        predicted_tags = LazyConcatenation(LazyMap(tags, predicted_sequence))

        acc = accuracy(test_tags, predicted_tags)

        count = sum([len(sent) for sent in test_sequence])

        print('accuracy over %d tokens: %.2f' % (count, acc * 100))
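
The pattern above — wrap the data in a LazyMap, then flatten with LazyConcatenation — recurs in every example below. A minimal, self-contained sketch of what that pipeline computes, assuming the two classes are importable from nltk.util (their traditional location):

from nltk.util import LazyConcatenation, LazyMap

tagged_sents = [
    [("the", "DT"), ("dog", "NN")],
    [("dogs", "NNS"), ("bark", "VBP")],
]

def tags(sent):
    return [tag for (word, tag) in sent]

# No work happens here; both views compute elements on demand.
tag_stream = LazyConcatenation(LazyMap(tags, tagged_sents))

print(list(tag_stream))  # ['DT', 'NN', 'NNS', 'VBP']
print(tag_stream[2])     # 'NNS' -- elements are produced lazily, as requested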
Example n. 2
    def get_ne(self, fileids=None, tagset=None):
        self._require(self.NE)

        def get_ne_inn(grid):
            return self._get_ne(grid, tagset)

        return LazyConcatenation(LazyMap(get_ne_inn, self._grids(fileids)))
Example n. 3
    def _chunked_sents(self, sents, depth=0):
        """
        @return: A list of sentence chunk trees which are flatter than the
            original trees.
        @rtype: C{list} of C{list} of C{Tree}
        @param sents: A list of sentence trees.
        @type sents: C{list} of C{list} of C{Tree}
        @param depth: How deep to read nested chunks off of the trees. If
            depth is None, all possible chunk subtrees are returned;
            otherwise, chunks are returned starting at the highest level 0,
            then the next highest 1, etc.
        @type depth: C{int}
        """
        def __chunked_sent(sent):
            chunked_sent = []
            for chunk in sent:
                # If the chunk is a Tree, append its immediate subtrees.
                if isinstance(chunk, Tree):
                    chunked_sent.append(list(chunk))
                # If the chunk is not a tree, append it.
                else:
                    chunked_sent.append(chunk)
            return chunked_sent

        # If depth is None, return all possible subtrees
        if depth is None:
            return LazyMap(lambda sent: sent.subtrees(), sents)
        # If depth is too small, no need to recurse and read further.
        if depth < 1:
            return sents
        # Otherwise, apply __chunked_sent() and recurse.
        return self._chunked_sents(
            LazyConcatenation(LazyMap(__chunked_sent, sents)), depth - 1)
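
For readers unfamiliar with how one chunk level is peeled off, here is a small sketch of what "immediate subtrees" means for a chunked sentence; it uses nltk.tree.Tree, and the toy sentence is purely an illustrative assumption:

from nltk.tree import Tree

sent = Tree.fromstring("(S (NP (DT the) (NN dog)) (VBD barked))")

# list(tree) yields the immediate subtrees, i.e. one chunk level down.
print(list(sent))
# [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['dog'])]), Tree('VBD', ['barked'])]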
Example n. 4
    def full_tagged_words(self, fileids=None, tagset=None):
        #self._require(self.WORDS, self.POS, self.TEXTLAYER, self.CHUNK, self.LEMMA)#, self.ENTITYID)
        def get_tagged_words(grid):
            return self._get_full_tagged_words(grid, tagset)

        return LazyConcatenation(
            LazyMap(get_tagged_words, self._grids(fileids)))
Example n. 5
 def tagged_sents(self):
     sents = self.sents()
     batch_indices = range(len(sents) // 1024 + 1)
     return LazyConcatenation(
         LazyMap(lambda i: self._tagger.batch_tag(sents[i * 1024:(i + 1) * 1024]),
                 batch_indices))
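
A hedged, self-contained sketch of the same batching idea with a stand-in tagger; the batch size and the tag_batch function are illustrative assumptions, not the reader's API:

from nltk.util import LazyConcatenation, LazyMap

sents = [["a", "b"], ["c"], ["d", "e", "f"], ["g"]]
BATCH = 2

def tag_batch(batch):
    # Stand-in for self._tagger.batch_tag (renamed tag_sents in newer NLTK).
    return [[(w, "X") for w in sent] for sent in batch]

batch_indices = range(len(sents) // BATCH + 1)
tagged_sents = LazyConcatenation(
    LazyMap(lambda i: tag_batch(sents[i * BATCH:(i + 1) * BATCH]), batch_indices))

print(len(list(tagged_sents)))  # 4 -- one tagged sentence per input sentence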
Example n. 6
    def tagged_sents(self,
                     fileids=None,
                     speaker='ALL',
                     stem=False,
                     relation=None,
                     strip_space=True,
                     replace=False):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
            If there is manually-annotated relation info, it will return
            tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """
        sent = True
        pos = True
        if not self._lazy:
            return [
                self._get_words(fileid, speaker, sent, stem, relation, pos,
                                strip_space, replace)
                for fileid in self.abspaths(fileids)
            ]

        get_words = lambda fileid: self._get_words(
            fileid, speaker, sent, stem, relation, pos, strip_space, replace)
        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
Example n. 7
    def tagged_words(self, fileids=None, tagset=None):
        self._require(self.WORDS, self.POS)

        def get_tagged_words(grid):
            return self._get_tagged_words(grid, tagset)

        return LazyConcatenation(LazyMap(get_tagged_words, self._grids(fileids)))
Example n. 8
    def __init__(self, source):
        if hasattr(source, 'words'):  # bridge to the text corpus reader
            source = [source.words(f) for f in source.fileids()]

        self._texts = source
        Text.__init__(self, LazyConcatenation(source))
        self._idf_cache = {}
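
A sketch of the same bridging idea with plain token lists; nltk.text.Text is assumed as the base class, and any iterable of tokens works as its source:

from nltk.text import Text
from nltk.util import LazyConcatenation

doc_words = [["the", "dog", "barked"], ["the", "cat", "meowed"]]
collection = Text(LazyConcatenation(doc_words))
print(collection.count("the"))  # 2 -- tokens drawn from both documents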
Example n. 9
    def words(self,
              fileids=None,
              speaker='ALL',
              stem=False,
              relation=False,
              strip_space=True,
              replace=False):
        """
        :return: the given file(s) as a list of words
        :rtype: list(str)

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of (stem, index,
            dependent_index)
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """
        sent = None
        pos = False
        if not self._lazy:
            return [
                self._get_words(fileid, speaker, sent, stem, relation, pos,
                                strip_space, replace)
                for fileid in self.abspaths(fileids)
            ]

        get_words = lambda fileid: self._get_words(
            fileid, speaker, sent, stem, relation, pos, strip_space, replace)
        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
Example n. 10
    def get_tags(self, fileids=None, tagset=None, tags=[]):
        required = []
        for tag in tags:
            if tag == 'offset':
                required.append(self.OFFSET)
            if tag == 'len':
                required.append(self.LEN)
            if tag == 'words':
                required.append(self.WORDS)
            if tag == 'pos':
                required.append(self.POS)
            if tag == 'tree':
                required.append(self.TREE)
            if tag == 'ne':
                required.append(self.NE)
            if tag == 'srl':
                required.append(self.SRL)
            if tag == 'ignore':
                required.append(self.IGNORE)
            if tag == 'chunk':
                required.append(self.CHUNK)

        self._require(*required)

        def get_tags_inn(grid, tags=tags):
            return self._get_tags(grid, tagset, tags=tags)

        return LazyConcatenation(LazyMap(get_tags_inn, self._grids(fileids)))
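
The if-chain above can be expressed more compactly with a lookup table. A possible refactoring sketch; the column constants are assumed to exist on the reader exactly as used above:

def _required_columns(reader, tags):
    """Map tag names to the reader's column constants; unknown tags are ignored."""
    column_by_tag = {
        'offset': reader.OFFSET, 'len': reader.LEN, 'words': reader.WORDS,
        'pos': reader.POS, 'tree': reader.TREE, 'ne': reader.NE,
        'srl': reader.SRL, 'ignore': reader.IGNORE, 'chunk': reader.CHUNK,
    }
    return [column_by_tag[tag] for tag in tags if tag in column_by_tag]

# inside get_tags():  self._require(*_required_columns(self, tags))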
Example n. 11
 def chunked_words(self, fileids=None, chunk_types=None,
                   tagset=None):
     self._require(self.WORDS, self.POS, self.CHUNK)
     if chunk_types is None: chunk_types = self._chunk_types
     def get_chunked_words(grid): # capture chunk_types as local var
         return self._get_chunked_words(grid, chunk_types, tagset)
     return LazyConcatenation(LazyMap(get_chunked_words,
                                      self._grids(fileids)))
Example n. 12
    def tagged_words(self, fileids=None, simplify_tags=False):
        self._require(self.WORDS, self.POS)

        def get_tagged_words(grid):
            return self._get_tagged_words(grid, simplify_tags)

        return LazyConcatenation(
            LazyMap(get_tagged_words, self._grids(fileids)))
Example n. 13
 def sents(self, fileids=None):
     """
     @return: A list of sentences.
     @rtype: C{list} of C{list} of C{str}
     @param fileids: A list of corpus files.
     @type fileids: C{list} of C{str} or regular expression        
     """
     return LazyConcatenation(self.paras(fileids))
Example n. 14
 def srl_instances(self, fileids=None, pos_in_tree=None, flatten=True):
     self._require(self.WORDS, self.POS, self.TREE, self.SRL)
     if pos_in_tree is None: pos_in_tree = self._pos_in_tree
     def get_srl_instances(grid): # capture pos_in_tree as local var
         return self._get_srl_instances(grid, pos_in_tree)
     result = LazyMap(get_srl_instances, self._grids(fileids))
     if flatten: result = LazyConcatenation(result)
     return result
Example n. 15
    def lemmatised_words(self, fileids=None):
        """Retrieve a list of lemmatised words. Words are encoded as tuples in
           C{(word, lemma)} form.
        @return: A list of C{(word, lemma)} tuples.
        @rtype: C{list} of C{(word, lemma)}
        """

        self._require(self.WORDS, self.LEMMA)
        return LazyConcatenation(
            LazyMap(self._get_lemmatised_words, self._grids(fileids)))
Example n. 16
 def iob_words(self, fileids=None):
     """
     @return: a list of word/tag/IOB tuples 
     @rtype: C{list} of C{tuple}
     @param fileids: the list of fileids that make up this corpus 
     @type fileids: C{None} or C{str} or C{list}
     """
     self._require(self.WORDS, self.POS, self.CHUNK)
     return LazyConcatenation(
         LazyMap(self._get_iob_words, self._grids(fileids)))
Example n. 17
    def chunked_words(self, fileids=None):
        """Retrieve a list of chunked words. Words are encoded as C{(word, tag)}
           and chunks are encoded as trees over C{(word, tag)} leaves.
        @return: A list of words and chunks.
        @rtype: C{list} of (C{(str,str)} and L{Tree})
        """

        self._require(self.WORDS, self.POS, self.PARENT)
        return LazyConcatenation(
            LazyMap(self._get_chunked_words, self._grids(fileids)))
Example n. 18
    def dep_srl_instances(self, fileids=None, flatten=True):
        self._require(self.WORDS, self.POS, self.HEAD, self.DEPREL,
                      self.FILLPRED, self.SRL)

        def get_dep_srl_instances(grid):
            return self._get_dep_srl_instances(grid)

        result = LazyMap(get_dep_srl_instances, self._grids(fileids))
        if flatten: result = LazyConcatenation(result)
        return result
Example n. 19
    def morphological_words(self, fileids=None):
        """Retrieve a list of sentences with the words' morphological type.
           Words are encoded as tuples in C{(word, morph)} form.
        @return: A list of sentences with words and their morphological type.
        @rtype: C{list} of C{(word, morph)}
        """

        self._require(self.WORDS, self.MORPH)
        return LazyConcatenation(
            LazyMap(self._get_morphological_words, self._grids(fileids)))
Example n. 20
 def words(self, fileids=None):
     """
     @return: A list of words.
     @rtype: C{list} of C{str}
     @param fileids: A list of corpus files.
     @type fileids: C{list} of C{str} or regular expression 
     """
     # Concatenate the list of lists given by sents().
     return LazyConcatenation(self.sents(fileids))
Example n. 21
 def iob_words(self, fileids=None, **kwargs):
     """
     @return: A list of word/iob/other tag tuples.
     @rtype: C{list} of C{tuple}
     @param fileids: A list of corpus files.
     @type fileids: C{list} of C{str} or regular expression 
     @kwparam depth: Depth of chunk parsing for nested chunks.
     @type depth: C{int}        
     """
     # Concatenate the list of lists given by iob_sents().
     return LazyConcatenation(self.iob_sents(fileids, **kwargs))
Example n. 22
    def _paras(self, fileids=None):
        """
        @return: A list of paragraphs.
        @rtype: C{list} of C{Tree}
        @param fileids: A list of corpus files.
        @type fileids: C{list} of C{str} or regular expression.
        """
        def __para(doc):
            return list(doc.text)

        return LazyConcatenation(LazyMap(__para, self.parsed_docs(fileids)))
Example n. 23
 def iob_words(self, fileids=None, tagset=None):
     """
     :return: a list of word/tag/IOB tuples
     :rtype: list(tuple)
     :param fileids: the list of fileids that make up this corpus
     :type fileids: None or str or list
     """
     self._require(self.WORDS, self.POS, self.CHUNK)
     def get_iob_words(grid):
         return self._get_iob_words(grid, tagset)
     return LazyConcatenation(LazyMap(get_iob_words, self._grids(fileids)))
Example n. 24
    def _sents(self, fileids=None):
        """
        @return: A list of sentence trees.
        @rtype: C{list} of C{list} of C{Tree}
        @param fileids: A list of corpus files.
        @type fileids: C{list} of C{str} or regular expression      
        """
        def __sents(para):
            return list(para)

        # Flatten this because it's a list of list of trees for each doc. It
        # doesn't matter which doc the list is from so chain them together.
        return LazyConcatenation(LazyMap(__sents, self._paras(fileids)))
Example n. 25
def concat(docs):
    """
    Concatenate together the contents of multiple documents from a
    single corpus, using an appropriate concatenation function.  This
    utility function is used by corpus readers when the user requests
    more than one document at a time.
    """
    if len(docs) == 1:
        return docs[0]
    if len(docs) == 0:
        raise ValueError("concat() expects at least one object!")

    types = set(d.__class__ for d in docs)

    # If they're all strings, use string concatenation.
    if all(isinstance(doc, str) for doc in docs):
        return "".join(docs)

    # If they're all corpus views, then use ConcatenatedCorpusView.
    for typ in types:
        if not issubclass(typ,
                          (StreamBackedCorpusView, ConcatenatedCorpusView)):
            break
    else:
        return ConcatenatedCorpusView(docs)

    # If they're all lazy sequences, use a lazy concatenation
    for typ in types:
        if not issubclass(typ, AbstractLazySequence):
            break
    else:
        return LazyConcatenation(docs)

    # Otherwise, see what we can do:
    if len(types) == 1:
        typ = list(types)[0]

        if issubclass(typ, list):
            return reduce((lambda a, b: a + b), docs, [])

        if issubclass(typ, tuple):
            return reduce((lambda a, b: a + b), docs, ())

        if ElementTree.iselement(typ):
            xmltree = ElementTree.Element("documents")
            for doc in docs:
                xmltree.append(doc)
            return xmltree

    # No method found!
    raise ValueError("Don't know how to concatenate types: %r" % types)
Example n. 26
    def iob_words(self, fileids=None, columns=None, convert2iob=[]):
        """
        :return: a list of word/tag/IOB tuples
        :rtype: list(tuple)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        """
        if columns is None:
            columns = [c for c in self.columns if c != "ignore"]

        self._require(*columns)

        def get_iob_words(grid):
            return self._get_converted_iob_words(grid,
                                                 columns,
                                                 convert2iob=convert2iob)

        return LazyConcatenation(LazyMap(get_iob_words, self._grids(fileids)))
Example n. 27
    def iob_words(self, fileids=None, tagset=None,
                  column=ConllCorpusReader.CHUNK):
        """Returns IOB annotations as tuples.

        Args:
            fileids: The list of fileids that make up this corpus.
            tagset: The tagset.
            column: The column to get the IOB annotations from, e.g. 'ne' for
                named entities or 'pos' for POS tags.

        Returns:
            A list of word/tag/IOB tuples.
        """
        self._require(self.WORDS, self.POS, self.CHUNK)

        def get_iob_words(grid):
            return self._get_iob_words(grid, tagset, column)

        return LazyConcatenation(LazyMap(get_iob_words, self._grids(fileids)))
Example n. 28
    def mentions(self, fileids=None, **kwargs):
        """
        @return: A list of mentions as the tuple of 
            ([words...], id, referent, type)
        @rtype: C{list} of C{list} of C{tuple}
        @param fileids: A list of corpus files.
        @type fileids: C{list} of C{str} or regular expression 
        @kwparam depth: Depth of chunk parsing for nested chunks.
        @type depth: C{int}  
        @kwparam concat: Concatenate sentence lists into one list; works like
            itertools.chain(). Defaults to False.
        @type concat: C{bool}
        @kwparam nonmentions: Return nonmentions as well as mentions. Defaults
            to False.
        @type nonmentions: C{bool}              
        """
        def __mentions(sent):
            mentions = []
            # Map each sentence subtree into a tuple.
            for token in map(tree2tuple, sent):
                # If the token type is COREF then append the token contents
                # and everything but the token type.
                if token[1] == 'COREF':
                    mentions.append(token[:1] + token[2:])
                # If including nonmentions, append the token contents only.
                elif kwargs.get('nonmentions'):
                    mentions.append(token[:1])
            return mentions

        # TODO: Is depth doing what it's expected to?
        depth = kwargs.get('depth', 0)
        sents = self._chunked_sents(self._sents(fileids), depth)
        # Concatenate the lists.
        if kwargs.get('concat'):
            return LazyConcatenation(LazyMap(__mentions, sents))
        # Or not.
        else:
            return LazyMap(__mentions, sents)
Example n. 29
    def chunks(self, fileids=None, **kwargs):
        """
        @return: A list of chunked sents where chunks are multi-word strings.
        @rtype: C{list} of C{list} of C{str}
        @param fileids: A list of corpus files.
        @type fileids: C{list} of C{str} or regular expression 
        @kwparam depth: Depth of chunk parsing for nested chunks.
        @type depth: C{int}        
        @kwparam concat: Concatenate sentence lists into one list; works like
            itertools.chain()
        @type concat: C{bool}
        """
        def __chunks(sent):
            chunks = []
            for token in sent:
                # If the token is a list of chunk pieces, append the piece's
                # contents as a string.
                if isinstance(token, list):
                    # TODO: Better if able to reverse Treebank-style
                    # tokenization. The join leaves some weird whitespace.
                    chunks.append(' '.join([word[0] for word in token]))
                # If the token is a tuple, append the token's contents.
                elif isinstance(token, tuple):
                    chunks.append(token[0])
                # Unexpected token type: fail loudly.
                else:
                    raise ValueError('unexpected token type: %r' % (token,))
            return chunks

        sents = self.chunked_sents(fileids, **kwargs)
        # Concatenate the lists.
        if kwargs.get('concat'):
            return LazyConcatenation(LazyMap(__chunks, sents))
        # Or not.
        else:
            return LazyMap(__chunks, sents)
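
The effect of the concat keyword in the two methods above boils down to wrapping (or not wrapping) the mapped result in LazyConcatenation. A toy illustration; the data and the lambda are assumptions, not the MUC reader's output:

from nltk.util import LazyConcatenation, LazyMap

sents = [["New", "York", "is", "big"], ["I", "agree"]]
per_sent = LazyMap(lambda sent: [sent[0]], sents)

print(list(per_sent))                     # [['New'], ['I']] -- one list per sentence
print(list(LazyConcatenation(per_sent)))  # ['New', 'I']     -- flattened, as with concat=True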
Example n. 30
 def words(self, fileids=None):
     self._require(self.WORDS)
     return LazyConcatenation(LazyMap(self._get_words,
                                      self._grids(fileids)))