Example #1
def load_sent_tokenizer(sentence_tokenizer, add_abbrev_types=None, del_sent_starters=None):
    """Return a (tokenizer, sentence-tokenize callable) pair, falling back to a
    pass-through tokenizer when nothing is configured or loading fails."""
    _sentence_tokenizer = None
    _sentence_tokenize = lambda x: [x]
    if sentence_tokenizer is not None:
        if sentence_tokenizer[0] == 'nltk_data':
            punkt = nltk.data.load(sentence_tokenizer[1])
            # TODO: why was the (now commented-out) line below here?
            # return punkt, punkt.tokenize
            return punkt, punkt.sentences_from_text
        elif sentence_tokenizer[0] == 'data':
            tokenizer_path = os.path.join('..', 'data', sentence_tokenizer[1])
            tokenizer_path = resource_filename(__name__, tokenizer_path)
            if os.path.exists(tokenizer_path):
                with open_gz(tokenizer_path, 'rb') as fhandle:
                    try:
                        punkt = pickle.load(fhandle)
                    except EOFError:
                        logging.warning("Could not load tokenizer from %s", tokenizer_path)
                        return _sentence_tokenizer, _sentence_tokenize
                if add_abbrev_types:
                    punkt._params.abbrev_types = punkt._params.abbrev_types | set(add_abbrev_types)
                if del_sent_starters:
                    punkt._params.sent_starters = punkt._params.sent_starters - set(del_sent_starters)
                return punkt, punkt.sentences_from_text
            else:
                logging.warning("Tokenizer not found at %s", tokenizer_path)
        else:
            raise ValueError("Invalid sentence tokenizer class")
    return _sentence_tokenizer, _sentence_tokenize
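A minimal usage sketch, assuming NLTK's punkt English model ('tokenizers/punkt/english.pickle') is installed; the ('nltk_data', ...) form hands the resource path straight to nltk.data.load, and passing None selects the pass-through fallback:

# Hypothetical usage of load_sent_tokenizer; the resource name assumes the NLTK punkt model.
tokenizer, tokenize = load_sent_tokenizer(('nltk_data', 'tokenizers/punkt/english.pickle'))
for sentence in tokenize("Dr. Smith stayed home. He was ill."):
    print(sentence)

# With no tokenizer configured, the fallback wraps the whole text in a single-element list.
_, passthrough = load_sent_tokenizer(None)
assert passthrough("One blob of text.") == ["One blob of text."]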
Example #2
def load_sent_tokenizer(sentence_tokenizer, add_abbrev_types=None, del_sent_starters=None):
    _sentence_tokenizer = None
    _sentence_tokenize = lambda x: [x]
    if sentence_tokenizer is not None:
        if sentence_tokenizer[0] == 'nltk_data':
            punkt = nltk.data.load(sentence_tokenizer[1])
            # TODO: why was the (now commented-out) line below here?
            # return punkt, punkt.tokenize
            return punkt, punkt.sentences_from_text
        else:
            tokenizer_path = os.path.join('..', *sentence_tokenizer)
            tokenizer_path = resource_filename(__name__, tokenizer_path)
            if os.path.exists(tokenizer_path):
                with open_gz(tokenizer_path, 'rb') as fhandle:
                    try:
                        punkt = pickle.load(fhandle)
                    except EOFError:
                        logging.warning("Could not load tokenizer from %s", tokenizer_path)
                        return _sentence_tokenizer, _sentence_tokenize
                if add_abbrev_types:
                    punkt._params.abbrev_types = punkt._params.abbrev_types | set(add_abbrev_types)
                if del_sent_starters:
                    punkt._params.sent_starters = punkt._params.sent_starters - set(del_sent_starters)
                return punkt, punkt.sentences_from_text
            else:
                logging.warning("Tokenizer not found at %s", tokenizer_path)
    return _sentence_tokenizer, _sentence_tokenize
Example #3
    def write_file(self, fname, lines):
        """Write the given lines to fname under self.tmp_dir via open_gz and return the full path."""
        fname = os.path.join(self.tmp_dir, fname)

        with open_gz(fname, mode='w') as fh:
            for line in lines:
                fh.write(line)

        return fname
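open_gz itself is not shown in these examples; a plausible stand-in, assuming it merely dispatches on the .gz suffix, is sketched below together with a quick round-trip in the spirit of the helper above (the temporary-directory setup is illustrative):

import gzip
import os
import tempfile

def open_gz(path, mode='r'):
    # Assumed behaviour: route *.gz paths through gzip in text mode, else the builtin open.
    if path.endswith('.gz'):
        return gzip.open(path, mode if mode[-1] in 'tb' else mode + 't')
    return open(path, mode)

tmp_dir = tempfile.mkdtemp()
path = os.path.join(tmp_dir, 'sample.txt.gz')
with open_gz(path, mode='w') as fh:
    for line in ['alpha\n', 'beta\n']:
        fh.write(line)
with open_gz(path) as fh:
    assert fh.readlines() == ['alpha\n', 'beta\n']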
Example #4
def get_data_alt(args):
    """Load pickled train/test data from args.vectors, vectorize it, and return
    (train_X, test_X, train_y, test_y) with the labels as numpy arrays."""
    with open_gz(args.vectors, "rb") as fh:
        train_X, train_y = pickle.load(fh)
        test_X, test_y = pickle.load(fh)
    vect = BagVectorizer().fit(train_X).fit(test_X)
    train_X = vect.transform(train_X)
    test_X = vect.transform(test_X)
    return train_X, test_X, np.asarray(train_y), np.asarray(test_y)
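A hypothetical writer matching the loader above; the file layout (two objects pickled back-to-back into one gzip stream) is inferred from the two successive pickle.load calls, and gzip.open stands in for open_gz:

import gzip
import pickle

def dump_vectors(path, train_X, train_y, test_X, test_y):
    # Two pickles appended to one stream, so two successive pickle.load calls
    # recover (train_X, train_y) and then (test_X, test_y).
    with gzip.open(path, 'wb') as fh:
        pickle.dump((train_X, train_y), fh)
        pickle.dump((test_X, test_y), fh)

dump_vectors('vectors.pkl.gz',
             train_X=[['red', 'cat'], ['blue', 'dog']], train_y=[0, 1],
             test_X=[['red', 'dog']], test_y=[1])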