def parse(self, corenlppath=False, operations=False, copula_head=True,
          speaker_segmentation=False, memory_mb=False, *args, **kwargs):
    """
    Parse an unparsed plaintext corpus with CoreNLP, saving the parsed
    version to disk.

    >>> parsed = corpus.parse(speaker_segmentation = True)

    :param corenlppath: folder containing corenlp jar files
    :type corenlppath: str
    :param operations: which kinds of annotations to do
    :type operations: str
    :param speaker_segmentation: add speaker name to parser output if your
                                 corpus is script-like
    :type speaker_segmentation: bool
    :param memory_mb: Amount of memory in MB for parser
    :type memory_mb: int
    :param copula_head: Make copula head in dependency parse
    :type copula_head: bool
    :returns: The newly created :class:`corpkit.corpus.Corpus`
    """
    # Deferred project imports, as in the original.
    from make import make_corpus
    from corpus import Corpus

    # Only plaintext corpora can be parsed.
    if self.datatype != 'plaintext':
        raise ValueError('parse method can only be used on plaintext corpora.')

    # Discard caller-supplied flags that we set explicitly below.
    for clashing in ('parse', 'tokenise'):
        kwargs.pop(clashing, None)

    # Collect the parser configuration in one place. The explicit method
    # parameters cannot also appear in **kwargs (they bind to the named
    # parameters first), and 'parse'/'tokenise' were popped above, so this
    # merge cannot silently shadow anything.
    options = dict(parse=True,
                   tokenise=False,
                   corenlppath=corenlppath,
                   operations=operations,
                   copula_head=copula_head,
                   speaker_segmentation=speaker_segmentation,
                   memory_mb=memory_mb)
    options.update(kwargs)

    made = make_corpus(self.path, *args, **options)
    return Corpus(made)
def tokenise(self, *args, **kwargs):
    """
    Tokenise a plaintext corpus, saving to disk

    :param nltk_data_path: path to tokeniser if not found automatically
    :type nltk_data_path: str

    :Example:

    >>> tok = corpus.tokenise()
    >>> tok
    <corpkit.corpus.Corpus instance: speeches-tokenised; 9 subcorpora>

    :returns: The newly created :class:`corpkit.corpus.Corpus`
    """
    from corpkit import make_corpus
    from corpus import Corpus
    # Tokenisation only makes sense for plaintext input.
    if self.datatype != 'plaintext':
        # Fixed: the message previously said 'parse method' — a copy-paste
        # from parse() that misreported which method was misused.
        raise ValueError('tokenise method can only be used on plaintext corpora.')
    # Remove flags that would clash with the explicit ones passed below.
    kwargs.pop('parse', None)
    kwargs.pop('tokenise', None)
    return Corpus(make_corpus(self.path,
                              parse=False,
                              tokenise=True,
                              *args, **kwargs))
def tokenise(self, *args, **kwargs):
    """
    Tokenise a plaintext corpus, saving to disk

    :param nltk_data_path: path to tokeniser if not found automatically
    :type nltk_data_path: str

    :Example:

    >>> tok = corpus.tokenise()
    >>> tok
    <corpkit.corpus.Corpus instance: speeches-tokenised; 9 subcorpora>

    :returns: The newly created :class:`corpkit.corpus.Corpus`
    """
    from corpkit import make_corpus
    from corpus import Corpus
    # Tokenisation only makes sense for plaintext input.
    if self.datatype != 'plaintext':
        # Fixed: the message previously said 'parse method' — a copy-paste
        # from parse() that misreported which method was misused.
        raise ValueError(
            'tokenise method can only be used on plaintext corpora.')
    # Remove flags that would clash with the explicit ones passed below.
    kwargs.pop('parse', None)
    kwargs.pop('tokenise', None)
    return Corpus(
        make_corpus(self.path, parse=False, tokenise=True, *args, **kwargs))
def parse(self, corenlppath=False, operations=False, copula_head=True,
          speaker_segmentation=False, memory_mb=False, *args, **kwargs):
    """
    Parse an unparsed plaintext corpus with CoreNLP, saving the parsed
    version to disk.

    :param corenlppath: folder containing corenlp jar files
    :type corenlppath: str
    :param operations: which kinds of annotations to do
    :type operations: str
    :param speaker_segmentation: add speaker name to parser output if your
                                 corpus is script-like
    :type speaker_segmentation: bool
    :param memory_mb: Amount of memory in MB for parser
    :type memory_mb: int
    :param copula_head: Make copula head in dependency parse
    :type copula_head: bool

    :Example:

    >>> parsed = corpus.parse(speaker_segmentation = True)
    >>> parsed
    <corpkit.corpus.Corpus instance: speeches-parsed; 9 subcorpora>

    :returns: The newly created :class:`corpkit.corpus.Corpus`
    """
    # Deferred project imports, as in the original.
    from make import make_corpus
    from corpus import Corpus

    # Only plaintext corpora can be parsed.
    if self.datatype != 'plaintext':
        raise ValueError(
            'parse method can only be used on plaintext corpora.')

    # Discard caller-supplied flags that we set explicitly below.
    for clashing in ('parse', 'tokenise'):
        kwargs.pop(clashing, None)

    # Collect the parser configuration in one place. The explicit method
    # parameters cannot also appear in **kwargs (they bind to the named
    # parameters first), and 'parse'/'tokenise' were popped above, so this
    # merge cannot silently shadow anything.
    options = dict(parse=True,
                   tokenise=False,
                   corenlppath=corenlppath,
                   operations=operations,
                   copula_head=copula_head,
                   speaker_segmentation=speaker_segmentation,
                   memory_mb=memory_mb)
    options.update(kwargs)

    made = make_corpus(self.path, *args, **options)
    return Corpus(made)
def tokenise(self, *args, **kwargs):
    """
    Tokenise a plaintext corpus, saving to disk

    >>> tok = corpus.tokenise()

    :param nltk_data_path: path to tokeniser if not found automatically
    :type nltk_data_path: str

    :returns: The newly created :class:`corpkit.corpus.Corpus`
    """
    from corpkit import make_corpus
    from corpus import Corpus
    # Tokenisation only makes sense for plaintext input.
    if self.datatype != "plaintext":
        # Fixed: the message previously said 'parse method' — a copy-paste
        # from parse() that misreported which method was misused.
        raise ValueError("tokenise method can only be used on plaintext corpora.")
    # Remove flags that would clash with the explicit ones passed below.
    kwargs.pop("parse", None)
    kwargs.pop("tokenise", None)
    return Corpus(make_corpus(self.path, parse=False, tokenise=True, *args, **kwargs))