Exemple #1
0
    def parse(self, corenlppath = False, operations = False, copula_head = True,
              speaker_segmentation = False, memory_mb = False, *args, **kwargs):
        """
        Run the parser over this plaintext corpus, saving the result to disk

           >>> parsed = corpus.parse(speaker_segmentation = True)

        Extra positional and keyword arguments are handed on to
        ``corpkit.make_corpus``. Any ``parse`` or ``tokenise`` keyword given
        by the caller is discarded, because this method sets both itself.

        :param corenlppath: folder containing corenlp jar files
        :type corenlppath: str

        :param operations: which kinds of annotations to do
        :type operations: str

        :param copula_head: Make copula head in dependency parse
        :type copula_head: bool

        :param speaker_segmentation: add speaker name to parser output if your corpus is script-like
        :type speaker_segmentation: bool

        :param memory_mb: Amount of memory in MB for parser
        :type memory_mb: int

        :raises ValueError: if ``self.datatype`` is not ``'plaintext'``

        :returns: The newly created :class:`corpkit.corpus.Corpus`
        """
        from corpkit import make_corpus
        from corpkit.corpus import Corpus

        if self.datatype != 'plaintext':
            raise ValueError('parse method can only be used on plaintext corpora.')

        # The mode switches are fixed below, so caller-supplied copies are dropped.
        for reserved in ('parse', 'tokenise'):
            kwargs.pop(reserved, None)

        # Fixed options first, then whatever else the caller passed through.
        # No key can collide: the named parameters never land in kwargs and
        # the two reserved keys were just removed.
        options = dict(parse=True,
                       tokenise=False,
                       corenlppath=corenlppath,
                       operations=operations,
                       copula_head=copula_head,
                       speaker_segmentation=speaker_segmentation,
                       memory_mb=memory_mb)
        options.update(kwargs)

        made = make_corpus(self.path, *args, **options)
        return Corpus(made)
Exemple #2
0
    def tokenise(self, *args, **kwargs):
        """
        Tokenise a plaintext corpus, saving to disk

           >>> tok = corpus.tokenise()

        Extra positional and keyword arguments are passed through to
        ``corpkit.make_corpus``. Any ``parse`` or ``tokenise`` keyword given
        by the caller is discarded, because this method sets both itself.

        :param nltk_data_path: path to tokeniser if not found automatically
        :type nltk_data_path: str

        :raises ValueError: if ``self.datatype`` is not ``'plaintext'``

        :returns: The newly created :class:`corpkit.corpus.Corpus`
        """
        # Check the corpus type first so an unsuitable corpus fails fast,
        # before any of the corpus-building machinery is imported.
        if self.datatype != 'plaintext':
            raise ValueError('tokenise method can only be used on plaintext corpora.')

        from corpkit import make_corpus
        from corpkit.corpus import Corpus

        # The mode switches are fixed below, so caller-supplied copies are dropped.
        kwargs.pop('parse', None)
        kwargs.pop('tokenise', None)

        return Corpus(make_corpus(self.path, parse = False, tokenise = True, *args, **kwargs))