Example #1
def bench_wordbatch_wordbag(input, data_size, partitions, client):
    # Note: squeeze=True was removed in pandas 2.0; on newer pandas use
    # pd.read_csv(...).squeeze("columns") instead.
    texts = pd.read_csv(input, nrows=data_size, squeeze=True)

    stemmer = PorterStemmer()
    batch_size = data_size // partitions
    batcher = Batcher(procs=1,
                      minibatch_size=batch_size,
                      backend="dask",
                      backend_handle=client)
    wb = WordBatch(normalize_text=normalize_text,
                   dictionary=Dictionary(min_df=10,
                                         max_words=1000000,
                                         verbose=0),
                   tokenizer=Tokenizer(spellcor_count=2,
                                       spellcor_dist=2,
                                       stemmer=stemmer),
                   extractor=WordBag(hash_ngrams=0,
                                     norm='l2',
                                     tf='binary',
                                     idf=50.0),
                   batcher=batcher,
                   verbose=0)

    start = time.time()
    t = wb.fit_transform(texts)
    duration = time.time() - start
    return (np.sum(t.data), duration)
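This benchmark returns a checksum of the transformed sparse matrix together with the elapsed seconds. A minimal way to drive it might look like the sketch below; the Dask cluster setup, file path, and sizes are placeholders rather than values from the original benchmark.

# Sketch only: local Dask cluster as the batcher backend; path and sizes are placeholders.
from dask.distributed import Client

client = Client(n_workers=4)
checksum, seconds = bench_wordbatch_wordbag("train_texts.csv", data_size=40000,
                                            partitions=8, client=client)
print("checksum:", checksum, "seconds:", seconds)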
Example #2
    def test_wordbatch(self):
        WordBatch(extractor=(WordBag, {
            "hash_ngrams": 2,
            "hash_ngrams_weights": [0.5, -1.0],
            "hash_size": 2**23,
            "norm": 'l2',
            "tf": 'log',
            "idf": 50.0}))
Example #3
    def __init__(self, pickle_model="", datadir=None, batcher=None):
        self.wb = WordBatch(normalize_text,
                            tokenizer=Tokenizer(stemmer=stemmer),
                            extractor=WordHash(decode_error='ignore',
                                               n_features=2**25,
                                               ngram_range=(1, 2),
                                               norm='l2'),
                            batcher=batcher)
        self.clf = FM_FTRL(D=2**25,
                           D_fm=4,
                           iters=1,
                           inv_link="identity",
                           threads=multiprocessing.cpu_count() // 2)
        if datadir == None:
            (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)
Example #4
    def __init__(self, pickle_model="", datadir=None, batcher=None):
        self.wb = WordBatch(normalize_text=normalize_text,
                            extractor=WordBag(
                                hash_ngrams=3,
                                hash_ngrams_weights=[-1.0, -1.0, 1.0],
                                hash_size=2**23,
                                norm='l2',
                                tf='binary',
                                idf=50.0),
                            batcher=batcher)

        self.clf = FTRL(alpha=1.0,
                        beta=1.0,
                        L1=0.00001,
                        L2=1.0,
                        D=2**23,
                        iters=1,
                        inv_link="identity")
        if datadir == None:
            (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)
Example #5
    def __init__(self, pickle_model="", datadir=None, batcher=None):
        seed = 10002
        # tf.ConfigProto, tf.Session, tf.get_default_graph and tf.set_random_seed
        # are TensorFlow 1.x APIs (tf.compat.v1 under TensorFlow 2.x).
        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=multiprocessing.cpu_count() // 2,
            inter_op_parallelism_threads=1)
        os.environ['PYTHONHASHSEED'] = str(seed)
        np.random.seed(seed + 1)
        random.seed(seed + 2)
        tf.set_random_seed(seed + 3)
        K.set_session(
            tf.Session(graph=tf.get_default_graph(), config=session_conf))

        self.maxlen = 200
        self.max_words = 20000
        self.wb = WordBatch(normalize_text,
                            dictionary=Dictionary(max_words=self.max_words),
                            extractor=WordSeq(seq_maxlen=self.maxlen),
                            batcher=batcher)
        self.model = Sequential()
        self.model.add(
            Embedding(self.max_words + 2, 20, input_length=self.maxlen))
        self.model.add(
            Conv1D(activation="relu",
                   padding="same",
                   strides=1,
                   filters=10,
                   kernel_size=3))
        self.model.add(Dropout(0.5))
        self.model.add(BatchNormalization())
        self.model.add(GlobalMaxPooling1D())
        self.model.add(Dense(1))
        self.model.compile(loss='mean_squared_error',
                           optimizer='adam',
                           metrics=['mean_squared_error'])
        if datadir == None:
            self.model = load_model(pickle_model)
            self.wb = pkl.load(gzip.open(pickle_model + ".wb", 'rb'))
        else:
            self.train(datadir, pickle_model)
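The seed and session block above uses the TensorFlow 1.x API. A rough TensorFlow 2.x port of the same setup might look like this sketch (an assumption about how one would translate it, not part of the original example):

# Hypothetical TF 2.x equivalent of the seeding/threading setup above.
import multiprocessing, os, random
import numpy as np
import tensorflow as tf

seed = 10002
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed + 1)
random.seed(seed + 2)
tf.random.set_seed(seed + 3)
# Thread limits must be configured before TensorFlow initializes its runtime.
tf.config.threading.set_intra_op_parallelism_threads(multiprocessing.cpu_count() // 2)
tf.config.threading.set_inter_op_parallelism_threads(1)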
Example #6
    def __init__(self, pickle_model="", datadir=None, batcher=None):
        self.wb = WordBatch(
            normalize_text,
            extractor=Hstack([
                WordVec(wordvec_file=
                        "../../../data/word2vec/glove.twitter.27B.100d.txt.gz",
                        normalize_text=normalize_text,
                        encoding="utf8"),
                WordVec(
                    wordvec_file="../../../data/word2vec/glove.6B.50d.txt.gz",
                    normalize_text=normalize_text,
                    encoding="utf8")
            ]))
        # from wordbatch.pipelines import FeatureUnion
        # from wordbatch.transformers import Dictionary, TextNormalizer
        # from sklearn.pipeline import Pipeline
        # tn= TextNormalizer(normalize_text=normalize_text)
        # dct= Dictionary()
        # vec1= WordVec(wordvec_file="../../../data/word2vec/glove.twitter.27B.100d.txt.gz",
        # 			  normalize_text=normalize_text, encoding="utf8", dictionary= dct)
        # vec2= WordVec(wordvec_file="../../../data/word2vec/glove.6B.50d.txt.gz",
        # 			  normalize_text=normalize_text, encoding="utf8", dictionary= dct)
        # self.wb = Pipeline(steps= [("tn", tn), ("dct", dct), ("vecs", FeatureUnion([("vec1", vec1), ("vec2", vec2)]))])
        self.batcher = batcher

        self.clf = FTRL(alpha=1.0,
                        beta=1.0,
                        L1=0.00001,
                        L2=1.0,
                        D=100 + 50,
                        iters=1,
                        inv_link="identity")

        if datadir == None:
            (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)
Example #7
class WordvecRegressor(object):
    def __init__(self, pickle_model="", datadir=None, batcher=None):
        self.wb = WordBatch(
            normalize_text,
            extractor=Hstack([
                WordVec(wordvec_file=
                        "../../../data/word2vec/glove.twitter.27B.100d.txt.gz",
                        normalize_text=normalize_text,
                        encoding="utf8"),
                WordVec(
                    wordvec_file="../../../data/word2vec/glove.6B.50d.txt.gz",
                    normalize_text=normalize_text,
                    encoding="utf8")
            ]))
        # from wordbatch.pipelines import FeatureUnion
        # from wordbatch.transformers import Dictionary, TextNormalizer
        # from sklearn.pipeline import Pipeline
        # tn= TextNormalizer(normalize_text=normalize_text)
        # dct= Dictionary()
        # vec1= WordVec(wordvec_file="../../../data/word2vec/glove.twitter.27B.100d.txt.gz",
        # 			  normalize_text=normalize_text, encoding="utf8", dictionary= dct)
        # vec2= WordVec(wordvec_file="../../../data/word2vec/glove.6B.50d.txt.gz",
        # 			  normalize_text=normalize_text, encoding="utf8", dictionary= dct)
        # self.wb = Pipeline(steps= [("tn", tn), ("dct", dct), ("vecs", FeatureUnion([("vec1", vec1), ("vec2", vec2)]))])
        self.batcher = batcher

        self.clf = FTRL(alpha=1.0,
                        beta=1.0,
                        L1=0.00001,
                        L2=1.0,
                        D=100 + 50,
                        iters=1,
                        inv_link="identity")

        if datadir == None:
            (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)

    def fit_batch(self, texts, labels, rcount):
        texts, labels = shuffle(texts, labels)
        print("Transforming", rcount)
        #texts= self.wb.fit_transform(texts, tn__batcher=self.batcher, dct__reset= False, dct__batcher= self.batcher)
        texts = self.wb.fit_transform(texts)
        print("Training", rcount)
        self.clf.fit(texts, labels, reset=False)

    def train(self, datadir, pickle_model=""):
        texts = []
        labels = []
        training_data = os.listdir(datadir)
        rcount = 0
        batchsize = 80000

        p = None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    #if rcount > 1000000: break
                    try:
                        line = json.loads(line.strip())
                    except:
                        continue
                    for review in line["Reviews"]:
                        rcount += 1
                        if rcount % 100000 == 0: print(rcount)
                        if rcount % 6 != 0: continue
                        if "Overall" not in review["Ratings"]: continue
                        texts.append(review["Content"])
                        labels.append(
                            (float(review["Ratings"]["Overall"]) - 3) * 0.5)
                        if len(texts) % batchsize == 0:
                            if p != None: p.join()
                            p = threading.Thread(target=self.fit_batch,
                                                 args=(texts, labels, rcount))
                            p.start()
                            texts = []
                            labels = []
        if p != None: p.join()
        self.fit_batch(texts, labels, rcount)

        # if pickle_model!="":
        # 	with gzip.open(pickle_model, 'wb') as model_file:
        # 		backend = self.wb.batcher.backend
        # 		backend_handle = self.wb.batcher.backend_handle
        # 		self.wb.batcher.backend = "serial"
        # 		self.wb.batcher.backend_handle = None
        # 		pkl.dump((self.wb, self.clf), model_file, protocol=2)
        # 		self.wb.batcher.backend = backend
        # 		self.wb.batcher.backend_handle = backend_handle

    def predict(self, texts):
        vecs = self.wb.transform(texts)
        return self.clf.predict(vecs)
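A hedged end-to-end sketch for this class; the directory of review JSON files and the batcher settings are placeholders (train() expects files whose lines are JSON objects containing a "Reviews" list):

# Sketch only: train on a directory of review JSON files, then score new texts.
batcher = Batcher(procs=1, minibatch_size=10000, backend="serial")
model = WordvecRegressor(pickle_model="", datadir="../data/json_reviews",
                         batcher=batcher)
print(model.predict(["Great room and friendly staff",
                     "Terrible location, would not stay again"]))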
Example #8
class WordbagRegressor(object):
    def __init__(self, pickle_model="", datadir=None, batcher=None):
        self.wb = WordBatch(normalize_text=normalize_text,
                            extractor=WordBag(
                                hash_ngrams=3,
                                hash_ngrams_weights=[-1.0, -1.0, 1.0],
                                hash_size=2**23,
                                norm='l2',
                                tf='binary',
                                idf=50.0),
                            batcher=batcher)

        self.clf = FTRL(alpha=1.0,
                        beta=1.0,
                        L1=0.00001,
                        L2=1.0,
                        D=2**23,
                        iters=1,
                        inv_link="identity")
        if datadir == None:
            (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)

    def fit_batch(self, texts, labels, rcount):
        texts, labels = self.wb.batcher.shuffle_batch(texts, labels, rcount)
        print("Transforming", rcount)
        texts = self.wb.fit_transform(texts, reset=False)
        print("Training", rcount)
        self.clf.fit(texts, labels, reset=False)

    def train(self, datadir, pickle_model=""):
        texts = []
        labels = []
        training_data = os.listdir(datadir)
        rcount = 0
        batchsize = 100000

        p = None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    #if rcount > 1000000: break
                    try:
                        line = json.loads(line.strip())
                    except:
                        continue
                    for review in line["Reviews"]:
                        rcount += 1
                        if rcount % 100000 == 0: print(rcount)
                        if rcount % 7 != 0: continue
                        if "Overall" not in review["Ratings"]: continue
                        texts.append(review["Content"])
                        labels.append(
                            (float(review["Ratings"]["Overall"]) - 3) * 0.5)
                        if len(texts) % batchsize == 0:
                            if p != None: p.join()
                            p = threading.Thread(target=self.fit_batch,
                                                 args=(texts, labels, rcount))
                            p.start()
                            texts = []
                            labels = []
        if p != None: p.join()
        self.fit_batch(texts, labels, rcount)

        self.wb.dictionary_freeze = True

        if pickle_model != "":
            with gzip.open(pickle_model, 'wb') as model_file:
                backend = self.wb.batcher.backend
                backend_handle = self.wb.batcher.backend_handle
                self.wb.batcher.backend = "serial"
                self.wb.batcher.backend_handle = None
                pkl.dump((self.wb, self.clf), model_file, protocol=2)
                self.wb.batcher.backend = backend
                self.wb.batcher.backend_handle = backend_handle

    def predict(self, texts):
        counts = self.wb.transform(texts)
        return self.clf.predict(counts)
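Since train() pickles (wb, clf) with the batcher switched to the serial backend, a saved WordbagRegressor can later be restored by passing only pickle_model and leaving datadir as None (sketch; the file name is a placeholder):

# Sketch only: datadir=None takes the pickle-loading branch of __init__.
model = WordbagRegressor(pickle_model="wordbag_model.pkl.gz")
print(model.predict(["The pool was great but the rooms were noisy"]))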
Example #9
]

data_sizes = [40000, 80000, 160000, 320000, 640000, 1280000]

for task in tasks:
    for data_size in data_sizes:
        texts_chunk = texts[:data_size]
        print("Task:", task, "Data size:", data_size)
        for backend in backends:
            batcher = Batcher(procs=16, minibatch_size=5000,
                              backend=backend[0], backend_handle=backend[1])
            # try:
            with timer("Completed: [" + task + "," + str(len(texts_chunk)) + "," + backend[0] + "]"), \
                    warnings.catch_warnings():
                warnings.simplefilter("ignore")
                if task == "ApplyBatch":
                    hv = HashingVectorizer(decode_error='ignore', n_features=2**25,
                                           preprocessor=normalize_text,
                                           ngram_range=(1, 2), norm='l2')
                    t = ApplyBatch(hv.transform, batcher=batcher).transform(texts_chunk)
                    print(t.shape, t.data[:5])

                if task == "WordBag":
                    wb = WordBatch(normalize_text=normalize_text,
                                   dictionary=Dictionary(min_df=10, max_words=1000000, verbose=0),
                                   tokenizer=Tokenizer(spellcor_count=2, spellcor_dist=2, stemmer=stemmer),
                                   extractor=WordBag(hash_ngrams=0, norm='l2', tf='binary', idf=50.0),
                                   batcher=batcher,
                                   verbose=0)
                    t = wb.fit_transform(texts_chunk)
                    print(t.shape, t.data[:5])
            # except:
            #     print("Failed [" + task + "," + str(len(texts_chunk)) + "," + backend[0] + "]")
        print("")
Example #10
class WordseqRegressor():
    def __init__(self, pickle_model="", datadir=None, batcher=None):
        seed = 10002
        # tf.ConfigProto, tf.Session, tf.get_default_graph and tf.set_random_seed
        # are TensorFlow 1.x APIs (tf.compat.v1 under TensorFlow 2.x).
        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=multiprocessing.cpu_count() // 2,
            inter_op_parallelism_threads=1)
        os.environ['PYTHONHASHSEED'] = str(seed)
        np.random.seed(seed + 1)
        random.seed(seed + 2)
        tf.set_random_seed(seed + 3)
        K.set_session(
            tf.Session(graph=tf.get_default_graph(), config=session_conf))

        self.maxlen = 200
        self.max_words = 20000
        self.wb = WordBatch(normalize_text,
                            dictionary=Dictionary(max_words=self.max_words),
                            extractor=WordSeq(seq_maxlen=self.maxlen),
                            batcher=batcher)
        self.model = Sequential()
        self.model.add(
            Embedding(self.max_words + 2, 20, input_length=self.maxlen))
        self.model.add(
            Conv1D(activation="relu",
                   padding="same",
                   strides=1,
                   filters=10,
                   kernel_size=3))
        self.model.add(Dropout(0.5))
        self.model.add(BatchNormalization())
        self.model.add(GlobalMaxPooling1D())
        self.model.add(Dense(1))
        self.model.compile(loss='mean_squared_error',
                           optimizer='adam',
                           metrics=['mean_squared_error'])
        if datadir == None:
            self.model = load_model(pickle_model)
            self.wb = pkl.load(gzip.open(pickle_model + ".wb", 'rb'))
        else:
            self.train(datadir, pickle_model)

    def transform_batch(self, texts, batch_data):
        batch_data.texts = self.wb.fit_transform(texts, reset=False)

    def train(self, datadir, pickle_model=""):
        texts = []
        labels = []
        training_data = os.listdir(datadir)
        rcount = 0
        texts2 = []
        batchsize = 100000

        batch_data = BatchData()
        p_input = None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    #if rcount > 1000000: break
                    try:
                        line = json.loads(line.strip())
                    except:
                        continue
                    for review in line["Reviews"]:
                        rcount += 1
                        if rcount % 100000 == 0: print(rcount)
                        if rcount % 8 != 0: continue
                        if "Overall" not in review["Ratings"]: continue
                        texts.append(review["Content"])
                        labels.append(
                            (float(review["Ratings"]["Overall"]) - 3) * 0.5)
                        if len(texts) % batchsize == 0:
                            if p_input != None:
                                p_input.join()
                                texts2.append(batch_data.texts)
                            p_input = threading.Thread(
                                target=self.transform_batch,
                                args=(texts, batch_data))
                            p_input.start()
                            texts = []
        if p_input != None:
            p_input.join()
            texts2.append(batch_data.texts)
        texts2.append(self.wb.partial_fit_transform(texts))
        del (texts)
        texts = sp.vstack(texts2)
        self.wb.dictionary_freeze = True
        test = (np.array(texts[-1000:]), np.array(labels[-1000:]))
        train = (np.array(texts[:-1000]), np.array(labels[:-1000]))

        self.model.fit(train[0],
                       train[1],
                       batch_size=2048,
                       epochs=2,
                       validation_data=(test[0], test[1]))
        if pickle_model != "":
            self.model.save(pickle_model)
            backend = self.wb.batcher.backend
            backend_handle = self.wb.batcher.backend_handle
            self.wb.batcher.backend = "serial"
            self.wb.batcher.backend_handle = None
            with gzip.open(pickle_model + ".wb", 'wb') as model_file:
                pkl.dump(self.wb, model_file, protocol=2)
            self.wb.batcher.backend = backend
            self.wb.batcher.backend_handle = backend_handle

    def predict_batch(self, texts):
        results = [
            x[0]
            for x in self.model.predict(np.array(self.wb.transform(texts)))
        ]
        return results
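Unlike the FTRL-based classes, this one persists two artifacts: the Keras model at pickle_model and the WordBatch pipeline at pickle_model + ".wb". Restoring and scoring is then just (sketch; the file name is a placeholder):

# Sketch only: datadir=None loads the saved Keras model and its ".wb" pipeline.
model = WordseqRegressor(pickle_model="wordseq_model.h5")
print(model.predict_batch(["Clean, quiet and close to the beach"]))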
Example #11
class WordhashRegressor(object):
    def __init__(self, pickle_model="", datadir=None, batcher=None):
        self.wb = WordBatch(normalize_text,
                            tokenizer=Tokenizer(stemmer=stemmer),
                            extractor=WordHash(decode_error='ignore',
                                               n_features=2**25,
                                               ngram_range=(1, 2),
                                               norm='l2'),
                            batcher=batcher)
        self.clf = FM_FTRL(D=2**25,
                           D_fm=4,
                           iters=1,
                           inv_link="identity",
                           threads=multiprocessing.cpu_count() // 2)
        if datadir == None:
            (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)

    def transform_batch(self, texts, batch_data):
        batch_data.texts = self.wb.fit_transform(texts, reset=False)

    def train(self, datadir, pickle_model=""):
        texts = []
        labels = []
        training_data = os.listdir(datadir)
        rcount = 0
        texts2 = []
        batchsize = 100000

        batch_data = BatchData()
        p_input = None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    # if rcount > 1000000: break
                    try:
                        line = json.loads(line.strip())
                    except:
                        continue
                    for review in line["Reviews"]:
                        rcount += 1
                        if rcount % 100000 == 0: print(rcount)
                        if rcount % 9 != 0: continue
                        if "Overall" not in review["Ratings"]: continue
                        texts.append(review["Content"])
                        labels.append(
                            (float(review["Ratings"]["Overall"]) - 3) * 0.5)
                        if len(texts) % batchsize == 0:
                            if p_input != None:
                                p_input.join()
                                texts2.append(batch_data.texts)
                            p_input = threading.Thread(
                                target=self.transform_batch,
                                args=(texts, batch_data))
                            p_input.start()
                            texts = []
        if p_input != None:
            p_input.join()
            texts2.append(batch_data.texts)
            texts2.append(self.wb.fit_transform(texts, reset=False))
        del (texts)
        if len(texts2) == 1: texts = texts2[0]
        else: texts = ssp.vstack(texts2)

        self.wb.dictionary_freeze = True

        self.clf.fit(texts, labels)
        if pickle_model != "":
            with gzip.open(pickle_model, 'wb') as model_file:
                backend = self.wb.batcher.backend
                backend_handle = self.wb.batcher.backend_handle
                self.wb.batcher.backend = "serial"
                self.wb.batcher.backend_handle = None
                pkl.dump((self.wb, self.clf), model_file, protocol=2)
                self.wb.batcher.backend = backend
                self.wb.batcher.backend_handle = backend_handle

    def predict(self, texts):
        counts = self.wb.transform(texts)
        return self.clf.predict(counts)
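WordHash is configured here with the same keyword arguments that scikit-learn's HashingVectorizer accepts, and Example #9 applies HashingVectorizer directly with near-identical settings, so an equivalent single-process transform can be sketched against scikit-learn for comparison:

# Single-process comparison using scikit-learn's HashingVectorizer directly.
from sklearn.feature_extraction.text import HashingVectorizer

hv = HashingVectorizer(decode_error='ignore', n_features=2**25,
                       ngram_range=(1, 2), norm='l2')
X = hv.transform(["Breakfast was included and the staff was helpful"])
print(X.shape)  # (1, 33554432)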