model_new.initialize(dataset=(sentence_length, batch_size))  # set up buffers before accepting reviews

xdev = be.zeros((sentence_length, 1), dtype=np.int32)  # bsz is 1, feature size
xbuf = np.zeros((1, sentence_length), dtype=np.int32)
oov = 2
start = 1
index_from = 3
pad_char = 0
vocab, rev_vocab = pickle.load(open(args.vocab_file, 'rb'))

while True:
    line = input('Enter a review from the testData.tsv file\n')

    # clean the input
    tokens = clean_string(line).strip().split()

    # check for oov and add start
    sent = [len(vocab) + 1 if t not in vocab else vocab[t] for t in tokens]
    sent = [start] + [w + index_from for w in sent]
    sent = [oov if w >= vocab_size else w for w in sent]

    # pad sentences
    xbuf[:] = 0
    trunc = sent[-sentence_length:]
    xbuf[0, -len(trunc):] = trunc

    xdev[:] = xbuf.T.copy()
    y_pred = model_new.fprop(xdev, inference=True)  # inference flag disables dropout

    print("Sent - {0}".format(xbuf))
    print("Pred - {0}".format(y_pred.get().T))
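The listings in this section all call a clean_string helper that is not shown here. A minimal sketch of what such a helper might look like follows, assuming the usual regex-based cleanup of punctuation and contractions; the exact rules are an assumption, not the author's implementation.

import re

def clean_string(s):
    """Hypothetical sketch: separate punctuation and contractions so that
    .strip().split() yields clean, lowercased tokens."""
    s = re.sub(r"[^A-Za-z0-9(),!?'`]", " ", s)  # drop unexpected characters
    s = re.sub(r"'s", " 's", s)                 # split common contractions
    s = re.sub(r"'ve", " 've", s)
    s = re.sub(r"n't", " n't", s)
    s = re.sub(r"!", " ! ", s)                  # surround punctuation with spaces
    s = re.sub(r"\(", " ( ", s)
    s = re.sub(r"\)", " ) ", s)
    s = re.sub(r"\?", " ? ", s)
    s = re.sub(r"\s{2,}", " ", s)               # collapse repeated whitespace
    return s.strip().lower()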
def build_data_train(path='.', filepath='labeledTrainData.tsv', vocab_file=None,
                     vocab=None, skip_headers=True, train_ratio=0.8):
    """
    Loads the data file and writes an h5 file with records of
    {y, review_text, review_int}.
    Typically two passes over the data: the 1st pass builds the vocab and
    pre-processes the text (WARNING: to get phrases, we need to go through
    multiple passes), and the 2nd pass converts the text into integers.
    We deal with integers from there on.
    WARNING: we use h5 just as a proof of concept for handling large datasets;
    the dataset may fit entirely in memory as a numpy array.
    """
    fname_h5 = filepath + '.h5'
    if vocab_file is None:
        fname_vocab = filepath + '.vocab'
    else:
        fname_vocab = vocab_file

    if not os.path.exists(fname_h5) or not os.path.exists(fname_vocab):
        # create the h5 store - NOTE: hdf5 is a row-oriented store and we slice rows
        # reviews_text holds the metadata and processed text
        # reviews_train/reviews_valid hold the ratings and integer-encoded reviews
        h5f = h5py.File(fname_h5, 'w')
        shape, maxshape = (2 ** 16,), (None,)
        dt = np.dtype([('y', np.uint8),
                       ('split', bool),
                       ('num_words', np.uint16),
                       # WARNING: vlen=bytes in python 3
                       ('text', h5py.special_dtype(vlen=str))])
        reviews_text = h5f.create_dataset('reviews', shape=shape, maxshape=maxshape,
                                          dtype=dt, compression='gzip')
        reviews_train = h5f.create_dataset(
            'train', shape=shape, maxshape=maxshape,
            dtype=h5py.special_dtype(vlen=np.int32), compression='gzip')
        reviews_valid = h5f.create_dataset(
            'valid', shape=shape, maxshape=maxshape,
            dtype=h5py.special_dtype(vlen=np.int32), compression='gzip')

        wdata = np.zeros((1,), dtype=dt)

        # init vocab only for train data
        build_vocab = False
        if vocab is None:
            vocab = defaultdict(int)
            build_vocab = True
        nsamples = 0

        # open the file, skip the headers if needed
        f = open(filepath, 'r')
        if skip_headers:
            f.readline()

        for i, line in enumerate(f):
            _, rating, review = line.strip().split('\t')

            # clean the review
            review = clean_string(review)
            review_words = review.strip().split()
            num_words = len(review_words)
            split = int(np.random.rand() < train_ratio)

            # create record
            wdata['y'] = int(float(rating))
            wdata['text'] = review
            wdata['num_words'] = num_words
            wdata['split'] = split
            reviews_text[i] = wdata

            # update the vocab if needed
            if build_vocab:
                for word in review_words:
                    vocab[word] += 1

            nsamples += 1

        # histogram of class labels, sentence length
        ratings, counts = np.unique(reviews_text['y'][:nsamples], return_counts=True)
        sen_len, sen_len_counts = np.unique(reviews_text['num_words'][:nsamples],
                                            return_counts=True)
        vocab_size = len(vocab)
        nclass = len(ratings)
        reviews_text.attrs['vocab_size'] = vocab_size
        reviews_text.attrs['nrows'] = nsamples
        reviews_text.attrs['nclass'] = nclass
        reviews_text.attrs['class_distribution'] = counts
        print("vocabulary size - ", vocab_size)
        print("# of samples - ", nsamples)
        print("# of classes", nclass)
        print("class distribution - ", ratings, counts)
        sen_counts = list(zip(sen_len, sen_len_counts))
        sen_counts = sorted(sen_counts, key=lambda kv: kv[1], reverse=True)
        print("sentence length - ", len(sen_len), sen_len, sen_len_counts)

        # WARNING: assume the vocab is on the order of ~4-5 million words.
        # sort the vocab and re-assign ids by frequency. Useful for downstream tasks.
        # only done for train data
        if build_vocab:
            vocab_sorted = sorted(vocab.items(), key=lambda kv: kv[1], reverse=True)
            vocab = {}
            for i, (t, _) in enumerate(vocab_sorted):
                vocab[t] = i

        # map text to integers
        ntrain = 0
        nvalid = 0
        for i in range(nsamples):
            text = reviews_text[i]['text']
            y = int(reviews_text[i]['y'])
            split = reviews_text[i]['split']
            text_int = [y] + [vocab[t] for t in text.strip().split()]
            if split:
                reviews_train[ntrain] = text_int
                ntrain += 1
            else:
                reviews_valid[nvalid] = text_int
                nvalid += 1

        reviews_text.attrs['ntrain'] = ntrain
        reviews_text.attrs['nvalid'] = nvalid
        print("# of train - {0}, # of valid - {1}".format(
            reviews_text.attrs['ntrain'], reviews_text.attrs['nvalid']))

        # close open files
        h5f.close()
        f.close()

    if not os.path.exists(fname_vocab):
        rev_vocab = {}
        for wrd, wrd_id in vocab.items():
            rev_vocab[wrd_id] = wrd
        print("vocabulary from IMDB dataset is saved into {}".format(fname_vocab))
        pickle.dump((vocab, rev_vocab), open(fname_vocab, 'wb'))

    return fname_h5, fname_vocab
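A short usage sketch for the function above, assuming labeledTrainData.tsv from the Kaggle IMDB dataset sits in the working directory; the attribute names come from the listing, everything else is illustrative.

import pickle
import h5py

# build (or reuse) the h5 store and the pickled vocabulary
fname_h5, fname_vocab = build_data_train(filepath='labeledTrainData.tsv')

# inspect what was written
with h5py.File(fname_h5, 'r') as h5f:
    reviews = h5f['reviews']
    print(reviews.attrs['vocab_size'], reviews.attrs['ntrain'], reviews.attrs['nvalid'])

with open(fname_vocab, 'rb') as fp:
    vocab, rev_vocab = pickle.load(fp)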
def tokenize(s, eos=True):
    s = clean_string(s)
    if eos and len(s) > 0:
        return (s + ' <eos>').strip().split()
    else:
        return s.strip().split()
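A quick usage sketch for tokenize; the exact tokens depend on how clean_string handles punctuation, so only the <eos> behaviour is checked here.

tokens = tokenize("This movie was great!")                    # cleaned tokens ending in '<eos>'
tokens_no_eos = tokenize("This movie was great!", eos=False)  # no end-of-sentence marker
assert tokens[-1] == '<eos>' and '<eos>' not in tokens_no_eos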
input_numpy = np.zeros((sentence_length, 1), dtype=np.int32)
correct_predictions = 0
neg_files = 0
total_filecount = 0
neg_predictions = 0
pos_predictions = 0

for file in os.listdir('data/test/neg'):
    if file.endswith('.txt'):
        filename = 'data/test/neg/' + str(file)
        file = open(filename, 'r')
        movie_review = file.read()
        file.close()
        tokens = clean_string(movie_review).strip().split()

        # map words to vocab ids, handle oov, and add the start token
        sentence = [len(vocab) + 1 if t not in vocab else vocab[t] for t in tokens]
        sentence = [start] + [w + index_from for w in sentence]
        sentence = [oov if w >= vocab_size else w for w in sentence]

        # truncate and pad
        trunc = sentence[-sentence_length:]   # take the last sentence_length words
        input_numpy[:] = 0                    # fill with zeros
        input_numpy[-len(trunc):, 0] = trunc  # place the input into the numpy array
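The listing above stops once the input buffer is filled, even though it declares several prediction counters. A hedged sketch of how the rest of the loop body might use them follows; the be.array call, the 0.5 threshold, and the [negative, positive] row order of the softmax output are assumptions, not the author's code.

        # hypothetical continuation of the loop body above
        y_pred = model_new.fprop(be.array(input_numpy), inference=True)
        prob_positive = y_pred.get()[1, 0]  # assumed [negative, positive] row order

        total_filecount += 1
        if prob_positive > 0.5:
            pos_predictions += 1
        else:
            neg_predictions += 1
            correct_predictions += 1  # every file in this directory is a negative review

print("accuracy on negative reviews: {0:.3f}".format(
    correct_predictions / float(total_filecount)))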
xdev = be.zeros((sentence_length, 1), dtype=np.int32)  # bsz is 1, feature size
xbuf = np.zeros((1, sentence_length), dtype=np.int32)
oov = 2
start = 1
index_from = 3
pad_char = 0
vocab, rev_vocab = pickle.load(open(args.vocab_file, 'rb'))

# walk over the reviews in the text files, making inferences
for dirpath, dirs, files in os.walk(args.review_files):
    for file in files:
        with open(os.path.join(dirpath, file), 'r') as myfile:
            data = myfile.read()

        # clean the input
        tokens = clean_string(data).strip().split()

        # check for oov and add start
        sent = [len(vocab) + 1 if t not in vocab else vocab[t] for t in tokens]
        sent = [start] + [w + index_from for w in sent]
        sent = [oov if w >= vocab_size else w for w in sent]

        # pad sentences
        xbuf[:] = 0
        trunc = sent[-sentence_length:]
        xbuf[0, -len(trunc):] = trunc

        xdev[:] = xbuf.T.copy()
        y_pred = model_new.fprop(xdev, inference=True)  # inference flag disables dropout

        with open(os.path.join(args.output_dir, file), "w") as output_file:
            output_file.write("Pred - {0}\n".format(y_pred.get().T))
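If a human-readable label is preferable to raw probabilities, the write-out at the end of the loop could be extended as sketched below; the [negative, positive] column order of the transposed output and the 0.5 threshold are assumptions.

        # hypothetical variant of the write-out above, adding a text label
        probs = y_pred.get().T                                   # shape (1, 2) for one review
        label = 'positive' if probs[0, 1] > 0.5 else 'negative'  # assumed column order
        with open(os.path.join(args.output_dir, file), "w") as output_file:
            output_file.write("Pred - {0} ({1})\n".format(probs, label))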