Example n. 1
    def test(self, test_sequence, **kwargs):
        """
        Tests the HiddenMarkovModelTagger instance.

        :param test_sequence: a sequence of labeled test instances
        :type test_sequence: list(list)
        :param verbose: boolean flag indicating whether testing should
            print verbose output
        :type verbose: bool
        """

        def words(sent):
            return [word for (word, tag) in sent]

        def tags(sent):
            return [tag for (word, tag) in sent]

        test_sequence = LazyMap(self._transform.transform, test_sequence)
        predicted_sequence = LazyMap(self._tag, LazyMap(words, test_sequence))

        if kwargs.get('verbose', False):
            # This will be used again later for accuracy so there's no sense
            # in tagging it twice.
            test_sequence = list(test_sequence)
            predicted_sequence = list(predicted_sequence)

            for test_sent, predicted_sent in zip(test_sequence,
                                                 predicted_sequence):
                print('Test:',
                      ' '.join('%s/%s' % (str(token), str(tag))
                               for (token, tag) in test_sent))
                print()
                print('Untagged:',
                      ' '.join(str(token) for (token, tag) in test_sent))
                print()
                print('HMM-tagged:',
                      ' '.join('%s/%s' % (str(token), str(tag))
                               for (token, tag) in predicted_sent))
                print()
                print('Entropy:',
                      self.entropy([(token, None) for
                                    (token, tag) in predicted_sent]))
                print()
                print('-' * 60)

        test_tags = LazyConcatenation(LazyMap(tags, test_sequence))
        predicted_tags = LazyConcatenation(LazyMap(tags, predicted_sequence))

        acc = accuracy(test_tags, predicted_tags)

        count = sum([len(sent) for sent in test_sequence])

        print('accuracy over %d tokens: %.2f' % (count, acc * 100))
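
The pattern above — wrap the data in a LazyMap, then flatten with LazyConcatenation — recurs in every example below. A minimal, self-contained sketch of what that pipeline computes, assuming the two classes are importable from nltk.util (their traditional location):

from nltk.util import LazyConcatenation, LazyMap

tagged_sents = [
    [("the", "DT"), ("dog", "NN")],
    [("dogs", "NNS"), ("bark", "VBP")],
]

def tags(sent):
    return [tag for (word, tag) in sent]

# No work happens here; both views compute elements on demand.
tag_stream = LazyConcatenation(LazyMap(tags, tagged_sents))

print(list(tag_stream))  # ['DT', 'NN', 'NNS', 'VBP']
print(tag_stream[2])     # 'NNS' -- elements are produced lazily, as requested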
Example n. 2
    def get_ne(self, fileids=None, tagset=None):
        self._require(self.NE)

        def get_ne_inn(grid):
            return self._get_ne(grid, tagset)

        return LazyConcatenation(LazyMap(get_ne_inn, self._grids(fileids)))
Example n. 3
    def _chunked_sents(self, sents, depth=0):
        """
        @return: A list of sentence chunk trees which are flatter than the
            original trees.
        @rtype: C{list} of C{list} of C{Tree}
        @param sents: A list of sentence trees.
        @type sents: C{list} of C{list} of C{Tree}
        @param depth: How deep to read nested chunks off of the trees. If
            depth is None, all possible chunk subtrees are returned;
            otherwise, chunks are returned starting at the highest level 0,
            then the next highest 1, etc.
        @type depth: C{int}
        """
        def __chunked_sent(sent):
            chunked_sent = []
            for chunk in sent:
                # If the chunk is a Tree, append its immediate subtrees.
                if isinstance(chunk, Tree):
                    chunked_sent.append(list(chunk))
                # If the chunk is not a tree, append it.
                else:
                    chunked_sent.append(chunk)
            return chunked_sent

        # If depth is None, return all possible subtrees
        if depth is None:
            return LazyMap(lambda sent: sent.subtrees(), sents)
        # If depth is too small, no need to recurse and read further.
        if depth < 1:
            return sents
        # Otherwise, apply __chunked_sent() and recurse.
        return self._chunked_sents(
            LazyConcatenation(LazyMap(__chunked_sent, sents)), depth - 1)
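
For readers unfamiliar with how one chunk level is peeled off, here is a small sketch of what "immediate subtrees" means for a chunked sentence; it uses nltk.tree.Tree, and the toy sentence is purely an illustrative assumption:

from nltk.tree import Tree

sent = Tree.fromstring("(S (NP (DT the) (NN dog)) (VBD barked))")

# list(tree) yields the immediate subtrees, i.e. one chunk level down.
print(list(sent))
# [Tree('NP', [Tree('DT', ['the']), Tree('NN', ['dog'])]), Tree('VBD', ['barked'])]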
Example n. 4
    def full_tagged_words(self, fileids=None, tagset=None):
        #self._require(self.WORDS, self.POS, self.TEXTLAYER, self.CHUNK, self.LEMMA)#, self.ENTITYID)
        def get_tagged_words(grid):
            return self._get_full_tagged_words(grid, tagset)

        return LazyConcatenation(
            LazyMap(get_tagged_words, self._grids(fileids)))
Example n. 5
 def tagged_sents(self):
     sents = self.sents()
     batch_indices = range(len(sents) // 1024 + 1)
     return LazyConcatenation(
         LazyMap(lambda i: self._tagger.batch_tag(sents[i * 1024:(i + 1) * 1024]),
                 batch_indices))
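
A hedged, self-contained sketch of the same batching idea with a stand-in tagger; the batch size and the tag_batch function are illustrative assumptions, not the reader's API:

from nltk.util import LazyConcatenation, LazyMap

sents = [["a", "b"], ["c"], ["d", "e", "f"], ["g"]]
BATCH = 2

def tag_batch(batch):
    # Stand-in for self._tagger.batch_tag (renamed tag_sents in newer NLTK).
    return [[(w, "X") for w in sent] for sent in batch]

batch_indices = range(len(sents) // BATCH + 1)
tagged_sents = LazyConcatenation(
    LazyMap(lambda i: tag_batch(sents[i * BATCH:(i + 1) * BATCH]), batch_indices))

print(len(list(tagged_sents)))  # 4 -- one tagged sentence per input sentence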
Example n. 6
    def tagged_sents(self,
                     fileids=None,
                     speaker='ALL',
                     stem=False,
                     relation=None,
                     strip_space=True,
                     replace=False):
        """
        :return: the given file(s) as a list of
            sentences, each encoded as a list of ``(word,tag)`` tuples.
        :rtype: list(list(tuple(str,str)))

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of ``(str,pos,relation_list)``.
            If there is manually-annotated relation info, it will return
            tuples of ``(str,pos,test_relation_list,str,pos,gold_relation_list)``
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """
        sent = True
        pos = True
        if not self._lazy:
            return [
                self._get_words(fileid, speaker, sent, stem, relation, pos,
                                strip_space, replace)
                for fileid in self.abspaths(fileids)
            ]

        get_words = lambda fileid: self._get_words(
            fileid, speaker, sent, stem, relation, pos, strip_space, replace)
        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
Example n. 7
    def tagged_words(self, fileids=None, tagset=None):
        self._require(self.WORDS, self.POS)

        def get_tagged_words(grid):
            return self._get_tagged_words(grid, tagset)

        return LazyConcatenation(LazyMap(get_tagged_words, self._grids(fileids)))
Example n. 8
    def __init__(self, source):
        if hasattr(source, 'words'):  # bridge to the text corpus reader
            source = [source.words(f) for f in source.fileids()]

        self._texts = source
        Text.__init__(self, LazyConcatenation(source))
        self._idf_cache = {}
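
A sketch of the same bridging idea with plain token lists; nltk.text.Text is assumed as the base class, and any iterable of tokens works as its source:

from nltk.text import Text
from nltk.util import LazyConcatenation

doc_words = [["the", "dog", "barked"], ["the", "cat", "meowed"]]
collection = Text(LazyConcatenation(doc_words))
print(collection.count("the"))  # 2 -- tokens drawn from both documents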
Example n. 9
    def words(self,
              fileids=None,
              speaker='ALL',
              stem=False,
              relation=False,
              strip_space=True,
              replace=False):
        """
        :return: the given file(s) as a list of words
        :rtype: list(str)

        :param speaker: If specified, select specific speaker(s) defined
            in the corpus. Default is 'ALL' (all participants). Common choices
            are 'CHI' (the child), 'MOT' (mother), ['CHI','MOT'] (exclude
            researchers)
        :param stem: If true, then use word stems instead of word strings.
        :param relation: If true, then return tuples of (stem, index,
            dependent_index)
        :param strip_space: If true, then strip trailing spaces from word
            tokens. Otherwise, leave the spaces on the tokens.
        :param replace: If true, then use the replaced (intended) word instead
            of the original word (e.g., 'wat' will be replaced with 'watch')
        """
        sent = None
        pos = False
        if not self._lazy:
            return [
                self._get_words(fileid, speaker, sent, stem, relation, pos,
                                strip_space, replace)
                for fileid in self.abspaths(fileids)
            ]

        get_words = lambda fileid: self._get_words(
            fileid, speaker, sent, stem, relation, pos, strip_space, replace)
        return LazyConcatenation(LazyMap(get_words, self.abspaths(fileids)))
Example n. 10
    def get_tags(self, fileids=None, tagset=None, tags=[]):
        required = []
        for tag in tags:
            if tag == 'offset':
                required.append(self.OFFSET)
            if tag == 'len':
                required.append(self.LEN)
            if tag == 'words':
                required.append(self.WORDS)
            if tag == 'pos':
                required.append(self.POS)
            if tag == 'tree':
                required.append(self.TREE)
            if tag == 'ne':
                required.append(self.NE)
            if tag == 'srl':
                required.append(self.SRL)
            if tag == 'ignore':
                required.append(self.IGNORE)
            if tag == 'chunk':
                required.append(self.CHUNK)

        self._require(*required)

        def get_tags_inn(grid, tags=tags):
            return self._get_tags(grid, tagset, tags=tags)

        return LazyConcatenation(LazyMap(get_tags_inn, self._grids(fileids)))
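
The if-chain above can be expressed more compactly with a lookup table. A possible refactoring sketch; the column constants are assumed to exist on the reader exactly as used above:

def _required_columns(reader, tags):
    """Map tag names to the reader's column constants; unknown tags are ignored."""
    column_by_tag = {
        'offset': reader.OFFSET, 'len': reader.LEN, 'words': reader.WORDS,
        'pos': reader.POS, 'tree': reader.TREE, 'ne': reader.NE,
        'srl': reader.SRL, 'ignore': reader.IGNORE, 'chunk': reader.CHUNK,
    }
    return [column_by_tag[tag] for tag in tags if tag in column_by_tag]

# inside get_tags():  self._require(*_required_columns(self, tags))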
Example n. 11
 def chunked_words(self, fileids=None, chunk_types=None,
                   tagset=None):
     self._require(self.WORDS, self.POS, self.CHUNK)
     if chunk_types is None: chunk_types = self._chunk_types
     def get_chunked_words(grid): # capture chunk_types as local var
         return self._get_chunked_words(grid, chunk_types, tagset)
     return LazyConcatenation(LazyMap(get_chunked_words,
                                      self._grids(fileids)))
Example n. 12
    def tagged_words(self, fileids=None, simplify_tags=False):
        self._require(self.WORDS, self.POS)

        def get_tagged_words(grid):
            return self._get_tagged_words(grid, simplify_tags)

        return LazyConcatenation(
            LazyMap(get_tagged_words, self._grids(fileids)))
Example n. 13
 def sents(self, fileids=None):
     """
     @return: A list of sentences.
     @rtype: C{list} of C{list} of C{str}
     @param fileids: A list of corpus files.
     @type fileids: C{list} of C{str} or regular expression        
     """
     return LazyConcatenation(self.paras(fileids))
Example n. 14
 def srl_instances(self, fileids=None, pos_in_tree=None, flatten=True):
     self._require(self.WORDS, self.POS, self.TREE, self.SRL)
     if pos_in_tree is None: pos_in_tree = self._pos_in_tree
     def get_srl_instances(grid): # capture pos_in_tree as local var
         return self._get_srl_instances(grid, pos_in_tree)
     result = LazyMap(get_srl_instances, self._grids(fileids))
     if flatten: result = LazyConcatenation(result)
     return result
Example n. 15
    def lemmatised_words(self, fileids=None):
        """Retrieve a list of lemmatised words. Words are encoded as tuples in
           C{(word, lemma)} form.
        @return: A list of C{(word, lemma)} tuples.
        @rtype: C{list} of C{(word, lemma)}
        """

        self._require(self.WORDS, self.LEMMA)
        return LazyConcatenation(
            LazyMap(self._get_lemmatised_words, self._grids(fileids)))
Example n. 16
 def iob_words(self, fileids=None):
     """
     @return: a list of word/tag/IOB tuples 
     @rtype: C{list} of C{tuple}
     @param fileids: the list of fileids that make up this corpus 
     @type fileids: C{None} or C{str} or C{list}
     """
     self._require(self.WORDS, self.POS, self.CHUNK)
     return LazyConcatenation(
         LazyMap(self._get_iob_words, self._grids(fileids)))
Example n. 17
    def chunked_words(self, fileids=None):
        """Retrieve a list of chunked words. Words are encoded as C{(word, tag)}
           and chunks are encoded as trees over C{(word, tag)} leaves.
        @return: A list of words and chunks.
        @rtype: C{list} of (C{(str,str)} and L{Tree})
        """

        self._require(self.WORDS, self.POS, self.PARENT)
        return LazyConcatenation(
            LazyMap(self._get_chunked_words, self._grids(fileids)))
Example n. 18
    def dep_srl_instances(self, fileids=None, flatten=True):
        self._require(self.WORDS, self.POS, self.HEAD, self.DEPREL,
                      self.FILLPRED, self.SRL)

        def get_dep_srl_instances(grid):
            return self._get_dep_srl_instances(grid)

        result = LazyMap(get_dep_srl_instances, self._grids(fileids))
        if flatten: result = LazyConcatenation(result)
        return result
Example n. 19
    def morphological_words(self, fileids=None):
        """Retrieve a list of sentences with the words' morphological type.
           Words are encoded as tuples in C{(word, morph)} form.
        @return: A list of sentences with words and their morphological type.
        @rtype: C{list} of C{(word, morph)}
        """

        self._require(self.WORDS, self.MORPH)
        return LazyConcatenation(
            LazyMap(self._get_morphological_words, self._grids(fileids)))
Example n. 20
 def words(self, fileids=None):
     """
     @return: A list of words.
     @rtype: C{list} of C{str}
     @param fileids: A list of corpus files.
     @type fileids: C{list} of C{str} or regular expression 
     """
     # Concatenate the list of lists given by sents().
     return LazyConcatenation(self.sents(fileids))
Example n. 21
 def iob_words(self, fileids=None, **kwargs):
     """
     @return: A list of word/iob/other tag tuples.
     @rtype: C{list} of C{tuple}
     @param fileids: A list of corpus files.
     @type fileids: C{list} of C{str} or regular expression 
     @kwparam depth: Depth of chunk parsing for nested chunks.
     @type depth: C{int}        
     """
     # Concatenate the list of lists given by iob_sents().
     return LazyConcatenation(self.iob_sents(fileids, **kwargs))
Example n. 22
    def _paras(self, fileids=None):
        """
        @return: A list of paragraphs.
        @rtype: C{list} of C{Tree}
        @param fileids: A list of corpus files.
        @type fileids: C{list} of C{str} or regular expression.
        """
        def __para(doc):
            return list(doc.text)

        return LazyConcatenation(LazyMap(__para, self.parsed_docs(fileids)))
Example n. 23
 def iob_words(self, fileids=None, tagset=None):
     """
     :return: a list of word/tag/IOB tuples
     :rtype: list(tuple)
     :param fileids: the list of fileids that make up this corpus
     :type fileids: None or str or list
     """
     self._require(self.WORDS, self.POS, self.CHUNK)
     def get_iob_words(grid):
         return self._get_iob_words(grid, tagset)
     return LazyConcatenation(LazyMap(get_iob_words, self._grids(fileids)))
Example n. 24
    def _sents(self, fileids=None):
        """
        @return: A list of sentence trees.
        @rtype: C{list} of C{list} of C{Tree}
        @param fileids: A list of corpus files.
        @type fileids: C{list} of C{str} or regular expression      
        """
        def __sents(para):
            return list(para)

        # Flatten this because it's a list of list of trees for each doc. It
        # doesn't matter which doc the list is from so chain them together.
        return LazyConcatenation(LazyMap(__sents, self._paras(fileids)))
Example n. 25
def concat(docs):
    """
    Concatenate together the contents of multiple documents from a
    single corpus, using an appropriate concatenation function.  This
    utility function is used by corpus readers when the user requests
    more than one document at a time.
    """
    if len(docs) == 1:
        return docs[0]
    if len(docs) == 0:
        raise ValueError("concat() expects at least one object!")

    types = set(d.__class__ for d in docs)

    # If they're all strings, use string concatenation.
    if all(isinstance(doc, str) for doc in docs):
        return "".join(docs)

    # If they're all corpus views, then use ConcatenatedCorpusView.
    for typ in types:
        if not issubclass(typ,
                          (StreamBackedCorpusView, ConcatenatedCorpusView)):
            break
    else:
        return ConcatenatedCorpusView(docs)

    # If they're all lazy sequences, use a lazy concatenation
    for typ in types:
        if not issubclass(typ, AbstractLazySequence):
            break
    else:
        return LazyConcatenation(docs)

    # Otherwise, see what we can do:
    if len(types) == 1:
        typ = list(types)[0]

        if issubclass(typ, list):
            return reduce((lambda a, b: a + b), docs, [])

        if issubclass(typ, tuple):
            return reduce((lambda a, b: a + b), docs, ())

        if ElementTree.iselement(typ):
            xmltree = ElementTree.Element("documents")
            for doc in docs:
                xmltree.append(doc)
            return xmltree

    # No method found!
    raise ValueError("Don't know how to concatenate types: %r" % types)
Example n. 26
    def iob_words(self, fileids=None, columns=None, convert2iob=[]):
        """
        :return: a list of word/tag/IOB tuples
        :rtype: list(tuple)
        :param fileids: the list of fileids that make up this corpus
        :type fileids: None or str or list
        """
        if columns is None:
            columns = [c for c in self.columns if c != "ignore"]

        self._require(*columns)

        def get_iob_words(grid):
            return self._get_converted_iob_words(grid,
                                                 columns,
                                                 convert2iob=convert2iob)

        return LazyConcatenation(LazyMap(get_iob_words, self._grids(fileids)))
Example n. 27
    def iob_words(self, fileids=None, tagset=None,
                  column=ConllCorpusReader.CHUNK):
        """Returns IOB annotations as tuples.

        Args:
            fileids: The list of fileids that make up this corpus.
            tagset: The tagset.
            column: The column to get the IOB annotations from, e.g. 'ne' for
                named entities or 'pos' for POS tags.

        Returns:
            A list of word/tag/IOB tuples.
        """
        self._require(self.WORDS, self.POS, self.CHUNK)

        def get_iob_words(grid):
            return self._get_iob_words(grid, tagset, column)

        return LazyConcatenation(LazyMap(get_iob_words, self._grids(fileids)))
Example n. 28
    def mentions(self, fileids=None, **kwargs):
        """
        @return: A list of mentions as the tuple of 
            ([words...], id, referent, type)
        @rtype: C{list} of C{list} of C{tuple}
        @param fileids: A list of corpus files.
        @type fileids: C{list} of C{str} or regular expression 
        @kwparam depth: Depth of chunk parsing for nested chunks.
        @type depth: C{int}  
        @kwparam concat: Concatenate sentence lists into one list; works like
            itertools.chain(). Defaults to False.
        @type concat: C{bool}
        @kwparam nonmentions: Return nonmentions as well as mentions. Defaults
            to False.
        @type nonmentions: C{bool}              
        """
        def __mentions(sent):
            mentions = []
            # Map each sentence subtree into a tuple.
            for token in map(tree2tuple, sent):
                # If the token type is COREF then append the token contents
                # and everything but the token type.
                if token[1] == 'COREF':
                    mentions.append(token[:1] + token[2:])
                # If including nonmentions, append the token contents only.
                elif kwargs.get('nonmentions'):
                    mentions.append(token[:1])
            return mentions

        # TODO: Is depth doing what it's expected to?
        depth = kwargs.get('depth', 0)
        sents = self._chunked_sents(self._sents(fileids), depth)
        # Concatenate the lists.
        if kwargs.get('concat'):
            return LazyConcatenation(LazyMap(__mentions, sents))
        # Or not.
        else:
            return LazyMap(__mentions, sents)
Example n. 29
    def chunks(self, fileids=None, **kwargs):
        """
        @return: A list of chunked sents where chunks are multi-word strings.
        @rtype: C{list} of C{list} of C{str}
        @param fileids: A list of corpus files.
        @type fileids: C{list} of C{str} or regular expression 
        @kwparam depth: Depth of chunk parsing for nested chunks.
        @type depth: C{int}        
        @kwparam concat: Concatenate sentence lists into one list; works like
            itertools.chain()
        @type concat: C{bool}
        """
        def __chunks(sent):
            chunks = []
            for token in sent:
                # If the token is a list of chunk pieces, append the piece's
                # contents as a string.
                if isinstance(token, list):
                    # TODO: Better if able to reverse Treebank-style
                    # tokenization. The join leaves some weird whitespace.
                    chunks.append(' '.join([word[0] for word in token]))
                # If the token is a tuple, append the token's contents.
                elif isinstance(token, tuple):
                    chunks.append(token[0])
                # Unexpected token type: fail loudly.
                else:
                    raise ValueError('unexpected token type: %r' % (token,))
            return chunks

        sents = self.chunked_sents(fileids, **kwargs)
        # Concatenate the lists.
        if kwargs.get('concat'):
            return LazyConcatenation(LazyMap(__chunks, sents))
        # Or not.
        else:
            return LazyMap(__chunks, sents)
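
The effect of the concat keyword in the two methods above boils down to wrapping (or not wrapping) the mapped result in LazyConcatenation. A toy illustration; the data and the lambda are assumptions, not the MUC reader's output:

from nltk.util import LazyConcatenation, LazyMap

sents = [["New", "York", "is", "big"], ["I", "agree"]]
per_sent = LazyMap(lambda sent: [sent[0]], sents)

print(list(per_sent))                     # [['New'], ['I']] -- one list per sentence
print(list(LazyConcatenation(per_sent)))  # ['New', 'I']     -- flattened, as with concat=True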
Example n. 30
 def words(self, fileids=None):
     self._require(self.WORDS)
     return LazyConcatenation(LazyMap(self._get_words,
                                      self._grids(fileids)))