# Benchmark WordBag extraction on a Dask backend. normalize_text is the
# text-cleaning function defined elsewhere in the benchmark script.
import time

import numpy as np
import pandas as pd
from nltk.stem.porter import PorterStemmer
from wordbatch.batcher import Batcher
from wordbatch.extractors import WordBag
from wordbatch.pipelines import WordBatch
from wordbatch.transformers import Dictionary, Tokenizer


def bench_wordbatch_wordbag(input, data_size, partitions, client):
    texts = pd.read_csv(input, nrows=data_size, squeeze=True)
    stemmer = PorterStemmer()
    # One minibatch per Dask partition.
    batch_size = data_size // partitions
    batcher = Batcher(procs=1, minibatch_size=batch_size, backend="dask",
                      backend_handle=client)
    wb = WordBatch(normalize_text=normalize_text,
                   dictionary=Dictionary(min_df=10, max_words=1000000, verbose=0),
                   tokenizer=Tokenizer(spellcor_count=2, spellcor_dist=2,
                                       stemmer=stemmer),
                   extractor=WordBag(hash_ngrams=0, norm='l2', tf='binary',
                                     idf=50.0),
                   batcher=batcher, verbose=0)
    start = time.time()
    t = wb.fit_transform(texts)
    duration = time.time() - start
    # Return a checksum of the feature matrix plus the elapsed wall time.
    return (np.sum(t.data), duration)
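# Hypothetical driver for bench_wordbatch_wordbag; the CSV path, worker count
# and sizes below are placeholders, not part of the benchmark itself. A local
# dask.distributed Client serves as the backend handle the Batcher expects.
from dask.distributed import Client

if __name__ == "__main__":
    client = Client(n_workers=4)
    checksum, secs = bench_wordbatch_wordbag("train.csv", data_size=40000,
                                             partitions=4, client=client)
    print("checksum:", checksum, "seconds:", secs)
    client.close()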
def test_wordbatch(self):
    # Smoke test: constructing WordBatch with an (extractor_class, params)
    # tuple must not raise.
    WordBatch(extractor=(WordBag, {"hash_ngrams": 2,
                                   "hash_ngrams_weights": [0.5, -1.0],
                                   "hash_size": 2**23, "norm": 'l2',
                                   "tf": 'log', "idf": 50.0}))
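# Sketch of a fuller round-trip test (the inputs and assertion are assumptions,
# not from the original suite): fit_transform with the tuple-configured
# extractor should return one hashed feature row per input text.
def test_wordbatch_fit_transform(self):
    wb = WordBatch(extractor=(WordBag, {"hash_ngrams": 2,
                                        "hash_ngrams_weights": [0.5, -1.0],
                                        "hash_size": 2**23, "norm": 'l2',
                                        "tf": 'log', "idf": 50.0}))
    X = wb.fit_transform(["a first short text", "a second short text"])
    self.assertEqual(X.shape[0], 2)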
# Assumes the example-script context: WordBatch, WordVec, Hstack, FTRL and the
# helpers (normalize_text, shuffle, pkl, gzip, json, os, threading) are
# imported at module level.
class WordvecRegressor(object):
    def __init__(self, pickle_model="", datadir=None, batcher=None):
        # Stack two GloVe embeddings (100d Twitter + 50d Wikipedia) into a
        # single 150-dimensional feature vector per document.
        self.wb = WordBatch(normalize_text, extractor=Hstack([
            WordVec(wordvec_file="../../../data/word2vec/glove.twitter.27B.100d.txt.gz",
                    normalize_text=normalize_text, encoding="utf8"),
            WordVec(wordvec_file="../../../data/word2vec/glove.6B.50d.txt.gz",
                    normalize_text=normalize_text, encoding="utf8")]))
        # Equivalent sklearn-style pipeline, kept for reference:
        # from wordbatch.pipelines import FeatureUnion
        # from wordbatch.transformers import Dictionary, TextNormalizer
        # from sklearn.pipeline import Pipeline
        # tn = TextNormalizer(normalize_text=normalize_text)
        # dct = Dictionary()
        # vec1 = WordVec(wordvec_file="../../../data/word2vec/glove.twitter.27B.100d.txt.gz",
        #                normalize_text=normalize_text, encoding="utf8", dictionary=dct)
        # vec2 = WordVec(wordvec_file="../../../data/word2vec/glove.6B.50d.txt.gz",
        #                normalize_text=normalize_text, encoding="utf8", dictionary=dct)
        # self.wb = Pipeline(steps=[("tn", tn), ("dct", dct),
        #                           ("vecs", FeatureUnion([("vec1", vec1), ("vec2", vec2)]))])
        self.batcher = batcher
        self.clf = FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=100 + 50,
                        iters=1, inv_link="identity")
        if datadir is None:
            (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)

    def fit_batch(self, texts, labels, rcount):
        texts, labels = shuffle(texts, labels)
        print("Transforming", rcount)
        # texts = self.wb.fit_transform(texts, tn__batcher=self.batcher,
        #                               dct__reset=False, dct__batcher=self.batcher)
        texts = self.wb.fit_transform(texts)
        print("Training", rcount)
        self.clf.fit(texts, labels, reset=False)

    def train(self, datadir, pickle_model=""):
        texts = []
        labels = []
        training_data = os.listdir(datadir)
        rcount = 0
        batchsize = 80000
        p = None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    # if rcount > 1000000: break
                    try:
                        line = json.loads(line.strip())
                    except Exception:
                        continue
                    for review in line["Reviews"]:
                        rcount += 1
                        if rcount % 100000 == 0:
                            print(rcount)
                        if rcount % 6 != 0:
                            continue
                        if "Overall" not in review["Ratings"]:
                            continue
                        texts.append(review["Content"])
                        # Rescale the 1-5 star rating to the range [-1, 1].
                        labels.append((float(review["Ratings"]["Overall"]) - 3) * 0.5)
                        if len(texts) % batchsize == 0:
                            # Train on the filled batch in a background thread
                            # while the next batch is read from disk.
                            if p is not None:
                                p.join()
                            p = threading.Thread(target=self.fit_batch,
                                                 args=(texts, labels, rcount))
                            p.start()
                            texts = []
                            labels = []
        if p is not None:
            p.join()
        self.fit_batch(texts, labels, rcount)
        # if pickle_model != "":
        #     with gzip.open(pickle_model, 'wb') as model_file:
        #         backend = self.wb.batcher.backend
        #         backend_handle = self.wb.batcher.backend_handle
        #         self.wb.batcher.backend = "serial"
        #         self.wb.batcher.backend_handle = None
        #         pkl.dump((self.wb, self.clf), model_file, protocol=2)
        #         self.wb.batcher.backend = backend
        #         self.wb.batcher.backend_handle = backend_handle

    def predict(self, texts):
        vecs = self.wb.transform(texts)
        return self.clf.predict(vecs)
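# Hypothetical training run for WordvecRegressor; the data directory, model
# path and Batcher settings are placeholders, not from the original script.
batcher = Batcher(procs=8, minibatch_size=10000, backend="multiprocessing")
model = WordvecRegressor(pickle_model="wordvec_model.pkl.gz",
                         datadir="../../../data/json", batcher=batcher)
print(model.predict(["Clean rooms and friendly staff, but noisy at night."]))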
class WordbagRegressor(object):
    def __init__(self, pickle_model="", datadir=None, batcher=None):
        self.wb = WordBatch(normalize_text=normalize_text,
                            extractor=WordBag(hash_ngrams=3,
                                              hash_ngrams_weights=[-1.0, -1.0, 1.0],
                                              hash_size=2**23, norm='l2',
                                              tf='binary', idf=50.0),
                            batcher=batcher)
        self.clf = FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2**23,
                        iters=1, inv_link="identity")
        if datadir is None:
            (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)

    def fit_batch(self, texts, labels, rcount):
        texts, labels = self.wb.batcher.shuffle_batch(texts, labels, rcount)
        print("Transforming", rcount)
        texts = self.wb.fit_transform(texts, reset=False)
        print("Training", rcount)
        self.clf.fit(texts, labels, reset=False)

    def train(self, datadir, pickle_model=""):
        texts = []
        labels = []
        training_data = os.listdir(datadir)
        rcount = 0
        batchsize = 100000
        p = None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    # if rcount > 1000000: break
                    try:
                        line = json.loads(line.strip())
                    except Exception:
                        continue
                    for review in line["Reviews"]:
                        rcount += 1
                        if rcount % 100000 == 0:
                            print(rcount)
                        if rcount % 7 != 0:
                            continue
                        if "Overall" not in review["Ratings"]:
                            continue
                        texts.append(review["Content"])
                        # Rescale the 1-5 star rating to the range [-1, 1].
                        labels.append((float(review["Ratings"]["Overall"]) - 3) * 0.5)
                        if len(texts) % batchsize == 0:
                            # Train on the filled batch in a background thread
                            # while the next batch is read from disk.
                            if p is not None:
                                p.join()
                            p = threading.Thread(target=self.fit_batch,
                                                 args=(texts, labels, rcount))
                            p.start()
                            texts = []
                            labels = []
        if p is not None:
            p.join()
        self.fit_batch(texts, labels, rcount)
        self.wb.dictionary_freeze = True
        if pickle_model != "":
            with gzip.open(pickle_model, 'wb') as model_file:
                # Swap in a picklable serial backend before dumping, then
                # restore the original backend handle.
                backend = self.wb.batcher.backend
                backend_handle = self.wb.batcher.backend_handle
                self.wb.batcher.backend = "serial"
                self.wb.batcher.backend_handle = None
                pkl.dump((self.wb, self.clf), model_file, protocol=2)
                self.wb.batcher.backend = backend
                self.wb.batcher.backend_handle = backend_handle

    def predict(self, texts):
        counts = self.wb.transform(texts)
        return self.clf.predict(counts)
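# Hypothetical reload of a model saved by WordbagRegressor.train(); the path
# is a placeholder. With datadir left as None, the constructor takes the
# pkl.load branch, and the restored batcher runs on the "serial" backend that
# was set just before dumping.
model = WordbagRegressor(pickle_model="wordbag_model.pkl.gz")
preds = model.predict(["Loved the breakfast", "Would not stay again"])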
]
data_sizes = [40000, 80000, 160000, 320000, 640000, 1280000]

for task in tasks:
    for data_size in data_sizes:
        texts_chunk = texts[:data_size]
        print("Task:", task, "Data size:", data_size)
        for backend in backends:
            batcher = Batcher(procs=16, minibatch_size=5000,
                              backend=backend[0], backend_handle=backend[1])
            # try:
            with timer("Completed: [" + task + "," + str(len(texts_chunk)) + ","
                       + backend[0] + "]"), warnings.catch_warnings():
                warnings.simplefilter("ignore")
                if task == "ApplyBatch":
                    hv = HashingVectorizer(decode_error='ignore', n_features=2**25,
                                           preprocessor=normalize_text,
                                           ngram_range=(1, 2), norm='l2')
                    t = ApplyBatch(hv.transform, batcher=batcher).transform(texts_chunk)
                    print(t.shape, t.data[:5])
                if task == "WordBag":
                    wb = WordBatch(normalize_text=normalize_text,
                                   dictionary=Dictionary(min_df=10, max_words=1000000,
                                                         verbose=0),
                                   tokenizer=Tokenizer(spellcor_count=2,
                                                       spellcor_dist=2,
                                                       stemmer=stemmer),
                                   extractor=WordBag(hash_ngrams=0, norm='l2',
                                                     tf='binary', idf=50.0),
                                   batcher=batcher, verbose=0)
                    t = wb.fit_transform(texts_chunk)
                    print(t.shape, t.data[:5])
            # except:
            #     print("Failed [" + task + "," + str(len(texts_chunk)) + ","
            #           + backend[0] + "]")
        print("")
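# Hypothetical reconstruction of the set-up the loop above assumes; the real
# definitions were truncated before the closing "]" kept above, so the names
# and values here are illustrative, not the originals.
from contextlib import contextmanager
import time

from dask.distributed import Client

@contextmanager
def timer(name):
    # Minimal timing context manager matching the loop's usage.
    start = time.time()
    yield
    print(name, "%.2f s" % (time.time() - start))

tasks = ["ApplyBatch", "WordBag"]
backends = [
    ["serial", ""],           # single-process baseline
    ["multiprocessing", ""],  # local process pool
    ["dask", Client()],       # distributed scheduler handle
]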
class WordseqRegressor(object):
    def __init__(self, pickle_model="", datadir=None, batcher=None):
        # Pin all RNG seeds and limit TensorFlow threading (TF1-style session
        # config) so runs are reproducible.
        seed = 10002
        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=multiprocessing.cpu_count() // 2,
            inter_op_parallelism_threads=1)
        os.environ['PYTHONHASHSEED'] = str(seed)
        np.random.seed(seed + 1)
        random.seed(seed + 2)
        tf.set_random_seed(seed + 3)
        K.set_session(tf.Session(graph=tf.get_default_graph(), config=session_conf))
        self.maxlen = 200
        self.max_words = 20000
        self.wb = WordBatch(normalize_text,
                            dictionary=Dictionary(max_words=self.max_words),
                            extractor=WordSeq(seq_maxlen=self.maxlen),
                            batcher=batcher)
        # Small 1D-CNN regressor over the padded word-index sequences.
        self.model = Sequential()
        self.model.add(Embedding(self.max_words + 2, 20, input_length=self.maxlen))
        self.model.add(Conv1D(activation="relu", padding="same", strides=1,
                              filters=10, kernel_size=3))
        self.model.add(Dropout(0.5))
        self.model.add(BatchNormalization())
        self.model.add(GlobalMaxPooling1D())
        self.model.add(Dense(1))
        self.model.compile(loss='mean_squared_error', optimizer='adam',
                           metrics=['mean_squared_error'])
        if datadir is None:
            self.model = load_model(pickle_model)
            self.wb = pkl.load(gzip.open(pickle_model + ".wb", 'rb'))
        else:
            self.train(datadir, pickle_model)

    def transform_batch(self, texts, batch_data):
        # Runs in a worker thread; BatchData is a plain holder object whose
        # .texts attribute carries the transformed batch back to the caller.
        batch_data.texts = self.wb.fit_transform(texts, reset=False)

    def train(self, datadir, pickle_model=""):
        texts = []
        labels = []
        training_data = os.listdir(datadir)
        rcount = 0
        texts2 = []
        batchsize = 100000
        batch_data = BatchData()
        p_input = None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    # if rcount > 1000000: break
                    try:
                        line = json.loads(line.strip())
                    except Exception:
                        continue
                    for review in line["Reviews"]:
                        rcount += 1
                        if rcount % 100000 == 0:
                            print(rcount)
                        if rcount % 8 != 0:
                            continue
                        if "Overall" not in review["Ratings"]:
                            continue
                        texts.append(review["Content"])
                        # Rescale the 1-5 star rating to the range [-1, 1].
                        labels.append((float(review["Ratings"]["Overall"]) - 3) * 0.5)
                        if len(texts) % batchsize == 0:
                            # Transform the filled batch in a background
                            # thread while the next one is read from disk.
                            if p_input is not None:
                                p_input.join()
                                texts2.append(batch_data.texts)
                            p_input = threading.Thread(target=self.transform_batch,
                                                       args=(texts, batch_data))
                            p_input.start()
                            texts = []
        if p_input is not None:
            p_input.join()
            texts2.append(batch_data.texts)
        texts2.append(self.wb.fit_transform(texts, reset=False))
        del texts
        texts = sp.vstack(texts2)
        self.wb.dictionary_freeze = True
        # Hold out the last 1000 reviews for validation.
        test = (np.array(texts[-1000:]), np.array(labels[-1000:]))
        train = (np.array(texts[:-1000]), np.array(labels[:-1000]))
        self.model.fit(train[0], train[1], batch_size=2048, epochs=2,
                       validation_data=(test[0], test[1]))
        if pickle_model != "":
            self.model.save(pickle_model)
            # Swap in a picklable serial backend before dumping the WordBatch,
            # then restore the original backend handle.
            backend = self.wb.batcher.backend
            backend_handle = self.wb.batcher.backend_handle
            self.wb.batcher.backend = "serial"
            self.wb.batcher.backend_handle = None
            with gzip.open(pickle_model + ".wb", 'wb') as model_file:
                pkl.dump(self.wb, model_file, protocol=2)
            self.wb.batcher.backend = backend
            self.wb.batcher.backend_handle = backend_handle

    def predict_batch(self, texts):
        results = [x[0] for x in
                   self.model.predict(np.array(self.wb.transform(texts)))]
        return results
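# Hypothetical inference with a model saved by WordseqRegressor.train(); the
# path is a placeholder. load_model() reads the Keras model from pickle_model
# and the fitted WordBatch comes from pickle_model + ".wb".
model = WordseqRegressor(pickle_model="wordseq_model")
print(model.predict_batch(["Best hotel on the strip", "Avoid this place"]))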
class WordhashRegressor(object):
    def __init__(self, pickle_model="", datadir=None, batcher=None):
        self.wb = WordBatch(normalize_text,
                            tokenizer=Tokenizer(stemmer=stemmer),
                            extractor=WordHash(decode_error='ignore',
                                               n_features=2**25,
                                               ngram_range=(1, 2), norm='l2'),
                            batcher=batcher)
        self.clf = FM_FTRL(D=2**25, D_fm=4, iters=1, inv_link="identity",
                           threads=multiprocessing.cpu_count() // 2)
        if datadir is None:
            (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)

    def transform_batch(self, texts, batch_data):
        # Runs in a worker thread; results come back via batch_data.texts.
        batch_data.texts = self.wb.fit_transform(texts, reset=False)

    def train(self, datadir, pickle_model=""):
        texts = []
        labels = []
        training_data = os.listdir(datadir)
        rcount = 0
        texts2 = []
        batchsize = 100000
        batch_data = BatchData()
        p_input = None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    # if rcount > 1000000: break
                    try:
                        line = json.loads(line.strip())
                    except Exception:
                        continue
                    for review in line["Reviews"]:
                        rcount += 1
                        if rcount % 100000 == 0:
                            print(rcount)
                        if rcount % 9 != 0:
                            continue
                        if "Overall" not in review["Ratings"]:
                            continue
                        texts.append(review["Content"])
                        # Rescale the 1-5 star rating to the range [-1, 1].
                        labels.append((float(review["Ratings"]["Overall"]) - 3) * 0.5)
                        if len(texts) % batchsize == 0:
                            # Transform the filled batch in a background
                            # thread while the next one is read from disk.
                            if p_input is not None:
                                p_input.join()
                                texts2.append(batch_data.texts)
                            p_input = threading.Thread(target=self.transform_batch,
                                                       args=(texts, batch_data))
                            p_input.start()
                            texts = []
        if p_input is not None:
            p_input.join()
            texts2.append(batch_data.texts)
        texts2.append(self.wb.fit_transform(texts, reset=False))
        del texts
        if len(texts2) == 1:
            texts = texts2[0]
        else:
            texts = ssp.vstack(texts2)
        self.wb.dictionary_freeze = True
        self.clf.fit(texts, labels)
        if pickle_model != "":
            with gzip.open(pickle_model, 'wb') as model_file:
                # Swap in a picklable serial backend before dumping, then
                # restore the original backend handle.
                backend = self.wb.batcher.backend
                backend_handle = self.wb.batcher.backend_handle
                self.wb.batcher.backend = "serial"
                self.wb.batcher.backend_handle = None
                pkl.dump((self.wb, self.clf), model_file, protocol=2)
                self.wb.batcher.backend = backend
                self.wb.batcher.backend_handle = backend_handle

    def predict(self, texts):
        counts = self.wb.transform(texts)
        return self.clf.predict(counts)
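# The save paths in WordbagRegressor, WordseqRegressor and WordhashRegressor
# all repeat the same swap-to-serial-backend dance before pickling. A
# hypothetical helper (not part of the original scripts) could factor it out:
import gzip
import pickle as pkl

def pickle_with_serial_backend(wb, clf, path):
    # Detach the (unpicklable) parallel backend handle, dump, then restore.
    backend, handle = wb.batcher.backend, wb.batcher.backend_handle
    wb.batcher.backend, wb.batcher.backend_handle = "serial", None
    try:
        with gzip.open(path, 'wb') as model_file:
            pkl.dump((wb, clf), model_file, protocol=2)
    finally:
        wb.batcher.backend, wb.batcher.backend_handle = backend, handle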