def run(args):
    """Vectorize the input records and pickle a train/test split.

    Records are read with ``read_json_lines`` from every entry of
    ``args.input``, chained together and passed, along with a fresh
    ``enumerator()``, to ``vectorize_sentences``.  The resulting pairs are
    split with ``train_test_split(test_size=0.2, random_state=0)``.

    Two objects are pickled back to back into ``args.output``: first
    ``(X_train, y_train)``, then ``(X_test, y_test)``.
    """
    enum = enumerator()
    readers = [read_json_lines(fn) for fn in args.input]
    pairs = list(vectorize_sentences(enum, chain(*readers)))

    # Transpose the list of (x, y) pairs into two parallel tuples.
    features, labels = zip(*pairs)

    split = train_test_split(features, labels, test_size=0.2, random_state=0)
    X_train, X_test, y_train, y_test = split

    # Order matters to whoever reads the file: train first, then test.
    for part in ((X_train, y_train), (X_test, y_test)):
        pickle.dump(part, args.output)
def fit(self, X, y=None):
    """Register every label occurring in ``X`` in the vocabulary.

    Parameters
    ----------
    X : iterable of iterables
        Each row is iterated and every label in it is looked up in the
        vocabulary.
    y : ignored
        Not used by this method.

    Returns
    -------
    self

    Side effects
    ------------
    Sets ``self.vocabulary_`` to the (possibly extended) vocabulary and
    ``self.feature_names_`` to a list of its keys.
    """
    # Reuse an existing non-empty vocabulary; otherwise start a new one.
    # getattr() keeps fit() usable on an instance where ``vocabulary_`` was
    # never assigned.
    enum = getattr(self, 'vocabulary_', None) or enumerator()
    for row in X:
        for lbl in row:
            # Bare lookup, result discarded.  NOTE(review): presumably the
            # enumerator registers unseen keys on lookup -- confirm against
            # its definition.
            enum[lbl]
    self.vocabulary_ = enum
    # Snapshot the keys as a list: on Python 3 ``keys()`` of a dict is a
    # live view that cannot be indexed or pickled.
    self.feature_names_ = list(enum.keys())
    return self
def run(args):
    """Build the vectorized data set and dump its train and test parts.

    Every path in ``args.input`` is opened through ``read_json_lines``; the
    resulting streams are concatenated and fed to ``vectorize_sentences``
    together with a new ``enumerator()``.  The (X, y) pairs are then divided
    by ``train_test_split`` using ``test_size=0.2`` and ``random_state=0``.

    ``args.output`` receives two consecutive pickles: the training pair
    ``(X_train, y_train)`` followed by the test pair ``(X_test, y_test)``.
    """
    enum = enumerator()

    json_streams = (read_json_lines(fn) for fn in args.input)
    vectorized = vectorize_sentences(enum, chain(*json_streams))
    samples = list(vectorized)

    # zip(*...) turns [(x0, y0), (x1, y1), ...] into (x0, x1, ...), (y0, y1, ...).
    inputs, targets = zip(*samples)

    train_in, test_in, train_out, test_out = train_test_split(
        inputs,
        targets,
        test_size=0.2,
        random_state=0,
    )

    training_part = (train_in, train_out)
    testing_part = (test_in, test_out)
    pickle.dump(training_part, args.output)
    pickle.dump(testing_part, args.output)
def vectorize_sentences(input_iter):
    """Lazily yield ``(ids, label)`` pairs for each record of ``input_iter``.

    For every record, the sequences stored under ``'X'`` are flattened into
    one stream of items, each item is looked up in a single ``enumerator()``
    shared by all records, and the list of lookup results is yielded
    together with the record's ``'Y'`` value.

    NOTE(review): presumably the enumerator maps each distinct item to a
    stable integer id -- confirm against its definition.
    """
    vocab = enumerator()
    for record in input_iter:
        token_ids = []
        # chain(*...) flattens the nested sequences one level.
        for token in chain(*record['X']):
            token_ids.append(vocab[token])
        yield (token_ids, record['Y'])