# Required imports for the tokenizer-loading helpers below. `open_gz` is
# assumed to be a project-local helper that transparently opens plain or
# gzip-compressed files.
import os
import pickle
import logging

import nltk.data
from pkg_resources import resource_filename


def load_sent_tokenizer(sentence_tokenizer, add_abbrev_types=None, del_sent_starters=None):
    """Load a Punkt sentence tokenizer and return a (tokenizer, tokenize) pair.

    Falls back to (None, identity) when no tokenizer is configured or loading fails.
    """
    _sentence_tokenizer = None
    _sentence_tokenize = lambda x: [x]
    if sentence_tokenizer is not None:
        if sentence_tokenizer[0] == 'nltk_data':
            punkt = nltk.data.load(sentence_tokenizer[1])
            # TODO: why was the (now commented-out) line below here?
            # return punkt, punkt.tokenize
            return punkt, punkt.sentences_from_text
        elif sentence_tokenizer[0] == 'data':
            tokenizer_path = os.path.join('..', 'data', sentence_tokenizer[1])
            tokenizer_path = resource_filename(__name__, tokenizer_path)
            if os.path.exists(tokenizer_path):
                with open_gz(tokenizer_path, 'rb') as fhandle:
                    try:
                        punkt = pickle.load(fhandle)
                    except EOFError:
                        logging.warning("Could not load tokenizer from %s", tokenizer_path)
                        return _sentence_tokenizer, _sentence_tokenize
                    if add_abbrev_types:
                        punkt._params.abbrev_types = punkt._params.abbrev_types | set(add_abbrev_types)
                    if del_sent_starters:
                        punkt._params.sent_starters = punkt._params.sent_starters - set(del_sent_starters)
                    return punkt, punkt.sentences_from_text
            else:
                logging.warning("Tokenizer not found at %s", tokenizer_path)
        else:
            raise ValueError("Invalid sentence tokenizer class")
    return _sentence_tokenizer, _sentence_tokenize
# Variant of load_sent_tokenizer that treats any non-'nltk_data' specification
# as a package-relative path instead of requiring a 'data' prefix.
def load_sent_tokenizer(sentence_tokenizer, add_abbrev_types=None, del_sent_starters=None):
    _sentence_tokenizer = None
    _sentence_tokenize = lambda x: [x]
    if sentence_tokenizer is not None:
        if sentence_tokenizer[0] == 'nltk_data':
            punkt = nltk.data.load(sentence_tokenizer[1])
            # TODO: why was the (now commented-out) line below here?
            # return punkt, punkt.tokenize
            return punkt, punkt.sentences_from_text
        else:
            tokenizer_path = os.path.join('..', *sentence_tokenizer)
            tokenizer_path = resource_filename(__name__, tokenizer_path)
            if os.path.exists(tokenizer_path):
                with open_gz(tokenizer_path, 'rb') as fhandle:
                    try:
                        punkt = pickle.load(fhandle)
                    except EOFError:
                        logging.warning("Could not load tokenizer from %s", tokenizer_path)
                        return _sentence_tokenizer, _sentence_tokenize
                    if add_abbrev_types:
                        punkt._params.abbrev_types = punkt._params.abbrev_types | set(add_abbrev_types)
                    if del_sent_starters:
                        punkt._params.sent_starters = punkt._params.sent_starters - set(del_sent_starters)
                    return punkt, punkt.sentences_from_text
            else:
                logging.warning("Tokenizer not found at %s", tokenizer_path)
    return _sentence_tokenizer, _sentence_tokenize
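# Minimal usage sketch (illustrative, not part of the original module): the
# 'nltk_data' spec below points at NLTK's pre-trained English Punkt model,
# which must already be available locally, e.g. via nltk.download('punkt').
tokenizer, tokenize = load_sent_tokenizer(
    ('nltk_data', 'tokenizers/punkt/english.pickle'))
print(tokenize("Dr. Smith went to Washington. He arrived on Monday."))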
def write_file(self, fname, lines):
    """Write `lines` to a (possibly gzip-compressed) file under self.tmp_dir and return its path."""
    fname = os.path.join(self.tmp_dir, fname)
    with open_gz(fname, mode='w') as fh:
        for line in lines:
            fh.write(line)
    return fname
def get_data_alt(args):
    """Load pickled train/test bags from args.vectors and vectorize them."""
    with open_gz(args.vectors, "rb") as fh:
        train_X, train_y = pickle.load(fh)
        test_X, test_y = pickle.load(fh)
    # Fit the vectorizer on both the train and test bags, then map each split
    # to feature vectors.
    vect = BagVectorizer().fit(train_X).fit(test_X)
    train_X = vect.transform(train_X)
    test_X = vect.transform(test_X)
    return train_X, test_X, np.asarray(train_y), np.asarray(test_y)
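# Minimal usage sketch (illustrative): `args` only needs a `vectors` attribute
# pointing at the pickled (X, y) pairs; the filename below is hypothetical.
from argparse import Namespace

train_X, test_X, train_y, test_y = get_data_alt(
    Namespace(vectors="vectors.pkl.gz"))
print(train_y.shape, test_y.shape)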