Example #1
model_new.initialize(dataset=(sentence_length, batch_size))

# setup buffers before accepting reviews
xdev = be.zeros((sentence_length, 1), dtype=np.int32)  # device buffer of shape (sentence_length, batch size 1)
xbuf = np.zeros((1, sentence_length), dtype=np.int32)
oov = 2
start = 1
index_from = 3
pad_char = 0
vocab, rev_vocab = pickle.load(open(args.vocab_file, 'rb'))

while True:
    line = input('Enter a Review from testData.tsv file \n')

    # clean the input
    tokens = clean_string(line).strip().split()

    # check for oov and add start
    sent = [len(vocab) + 1 if t not in vocab else vocab[t] for t in tokens]
    sent = [start] + [w + index_from for w in sent]
    sent = [oov if w >= vocab_size else w for w in sent]

    # pad sentences
    xbuf[:] = 0
    trunc = sent[-sentence_length:]
    xbuf[0, -len(trunc):] = trunc
    xdev[:] = xbuf.T.copy()
    y_pred = model_new.fprop(xdev, inference=True)  # inference=True disables dropout

    print("Sent - {0}".format(xbuf))
    print("Pred - {0} ".format(y_pred.get().T))
Example #3
def build_data_train(path='.',
                     filepath='labeledTrainData.tsv',
                     vocab_file=None,
                     vocab=None,
                     skip_headers=True,
                     train_ratio=0.8):
    """
    Loads the data file and spits out a h5 file with record of
    {y, review_text, review_int}
    Typically two passes over the data.
    1st pass is for vocab and pre-processing. (WARNING: to get phrases, we need to go
    though multiple passes). 2nd pass is converting text into integers. We will deal with integers
    from thereafter.

    WARNING: we use h5 just as proof of concept for handling large datasets
    Datasets may fit entirely in memory as numpy as array

    """

    fname_h5 = filepath + '.h5'
    if vocab_file is None:
        fname_vocab = filepath + '.vocab'
    else:
        fname_vocab = vocab_file

    if not os.path.exists(fname_h5) or not os.path.exists(fname_vocab):
        # create the h5 store - NOTE: hdf5 is row-oriented store and we slice rows
        # reviews_text holds the metadata and processed text file
        # reviews_int holds the ratings, ints
        h5f = h5py.File(fname_h5, 'w')
        shape, maxshape = (2**16, ), (None, )
        dt = np.dtype([
            ('y', np.uint8),
            ('split', bool),
            ('num_words', np.uint16),
            # WARNING: h5py may hand vlen strings back as bytes under Python 3
            ('text', h5py.special_dtype(vlen=str))
        ])
        reviews_text = h5f.create_dataset('reviews',
                                          shape=shape,
                                          maxshape=maxshape,
                                          dtype=dt,
                                          compression='gzip')
        reviews_train = h5f.create_dataset(
            'train',
            shape=shape,
            maxshape=maxshape,
            dtype=h5py.special_dtype(vlen=np.int32),
            compression='gzip')

        reviews_valid = h5f.create_dataset(
            'valid',
            shape=shape,
            maxshape=maxshape,
            dtype=h5py.special_dtype(vlen=np.int32),
            compression='gzip')

        wdata = np.zeros((1, ), dtype=dt)

        # init vocab only for train data
        build_vocab = False
        if vocab is None:
            vocab = defaultdict(int)
            build_vocab = True
        nsamples = 0

        # open the file, skip the headers if needed
        f = open(filepath, 'r')
        if skip_headers:
            f.readline()

        for i, line in enumerate(f):
            _, rating, review = line.strip().split('\t')

            # clean the review
            review = clean_string(review)
            review_words = review.strip().split()
            num_words = len(review_words)
            split = int(np.random.rand() < train_ratio)

            # create record
            wdata['y'] = int(float(rating))
            wdata['text'] = review
            wdata['num_words'] = num_words
            wdata['split'] = split
            reviews_text[i] = wdata

            # update the vocab if needed
            if build_vocab:
                for word in review_words:
                    vocab[word] += 1

            nsamples += 1

        # histogram of class labels, sentence length
        ratings, counts = np.unique(reviews_text['y'][:nsamples],
                                    return_counts=True)
        sen_len, sen_len_counts = np.unique(
            reviews_text['num_words'][:nsamples], return_counts=True)
        vocab_size = len(vocab)
        nclass = len(ratings)
        reviews_text.attrs['vocab_size'] = vocab_size
        reviews_text.attrs['nrows'] = nsamples
        reviews_text.attrs['nclass'] = nclass
        reviews_text.attrs['class_distribution'] = counts
        print "vocabulary size - ", vocab_size
        print "# of samples - ", nsamples
        print "# of classes", nclass
        print "class distribution - ", ratings, counts
        sen_counts = zip(sen_len, sen_len_counts)
        sen_counts = sorted(sen_counts, key=lambda kv: kv[1], reverse=True)
        print "sentence length - ", len(sen_len), sen_len, sen_len_counts

        # WARNING: assume vocab is of order ~4-5 million words.
        # sort the vocab and re-assign ids by frequency; useful for downstream tasks
        # only done for train data
        if build_vocab:
            vocab_sorted = sorted(vocab.items(),
                                  key=lambda kv: kv[1],
                                  reverse=True)
            vocab = {}
            for i, (t, _) in enumerate(vocab_sorted):
                vocab[t] = i

        # map text to integers
        ntrain = 0
        nvalid = 0
        for i in range(nsamples):
            text = reviews_text[i]['text']
            if isinstance(text, bytes):  # h5py may return vlen strings as bytes
                text = text.decode('utf-8')
            y = int(reviews_text[i]['y'])
            split = reviews_text[i]['split']
            text_int = [y] + [vocab[t] for t in text.strip().split()]
            if split:
                reviews_train[ntrain] = text_int
                ntrain += 1
            else:
                reviews_valid[nvalid] = text_int
                nvalid += 1
        reviews_text.attrs['ntrain'] = ntrain
        reviews_text.attrs['nvalid'] = nvalid
        print "# of train - {0}, # of valid - {1}".format(
            reviews_text.attrs['ntrain'], reviews_text.attrs['nvalid'])
        # close open files
        h5f.close()
        f.close()

    if not os.path.exists(fname_vocab):
        rev_vocab = {}
        for wrd, wrd_id in vocab.items():
            rev_vocab[wrd_id] = wrd
        print("vocabulary from IMDB dataset is saved into {}".format(
            fname_vocab))
        pickle.dump((vocab, rev_vocab), open(fname_vocab, 'wb'))

    return fname_h5, fname_vocab
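A hedged usage sketch of build_data_train; the file name and the h5py/pickle read-back pattern below are illustrative assumptions, not part of the original example:

import pickle

import h5py

# build (or reuse) the preprocessed store and the vocabulary file
fname_h5, fname_vocab = build_data_train(filepath='labeledTrainData.tsv')

# inspect what was written
with h5py.File(fname_h5, 'r') as h5f:
    reviews = h5f['reviews']
    print(reviews.attrs['nrows'], reviews.attrs['vocab_size'], reviews.attrs['nclass'])
    print(h5f['train'][0][:10])  # first training record: [label, word id, word id, ...]

with open(fname_vocab, 'rb') as vf:
    vocab, rev_vocab = pickle.load(vf)
print(len(vocab), rev_vocab[0])  # rev_vocab maps id 0 to the most frequent word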
Example #4
def tokenize(s, eos=True):
    s = clean_string(s)
    if eos and len(s) > 0:
        return (s + ' <eos>').strip().split()
    else:
        return s.strip().split()
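A quick check of the expected behaviour, assuming the clean_string sketch above (the exact tokens depend on the real clean_string):

print(tokenize("A great movie!"))             # e.g. ['a', 'great', 'movie', '!', '<eos>']
print(tokenize("A great movie!", eos=False))  # e.g. ['a', 'great', 'movie', '!']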
Example #5
def build_data_train(path='.', filepath='labeledTrainData.tsv', vocab_file=None,
                     vocab=None, skip_headers=True, train_ratio=0.8):
    """
    Loads the data file and spits out a h5 file with record of
    {y, review_text, review_int}
    Typically two passes over the data.
    1st pass is for vocab and pre-processing. (WARNING: to get phrases, we need to go
    though multiple passes). 2nd pass is converting text into integers. We will deal with integers
    from thereafter.

    WARNING: we use h5 just as proof of concept for handling large datasets
    Datasets may fit entirely in memory as numpy as array

    """

    fname_h5 = filepath + '.h5'
    if vocab_file is None:
        fname_vocab = filepath + '.vocab'
    else:
        fname_vocab = vocab_file

    if not os.path.exists(fname_h5) or not os.path.exists(fname_vocab):
        # create the h5 store - NOTE: hdf5 is row-oriented store and we slice rows
        # reviews_text holds the metadata and processed text file
        # reviews_int holds the ratings, ints
        h5f = h5py.File(fname_h5, 'w')
        shape, maxshape = (2 ** 16,), (None, )
        dt = np.dtype([('y', np.uint8),
                       ('split', bool),
                       ('num_words', np.uint16),
                       # WARNING: h5py may hand vlen strings back as bytes under Python 3
                       ('text', h5py.special_dtype(vlen=str))
                       ])
        reviews_text = h5f.create_dataset('reviews', shape=shape, maxshape=maxshape,
                                          dtype=dt, compression='gzip')
        reviews_train = h5f.create_dataset(
            'train', shape=shape, maxshape=maxshape,
            dtype=h5py.special_dtype(vlen=np.int32), compression='gzip')

        reviews_valid = h5f.create_dataset(
            'valid', shape=shape, maxshape=maxshape,
            dtype=h5py.special_dtype(vlen=np.int32), compression='gzip')

        wdata = np.zeros((1, ), dtype=dt)

        # init vocab only for train data
        build_vocab = False
        if vocab is None:
            vocab = defaultdict(int)
            build_vocab = True
        nsamples = 0

        # open the file, skip the headers if needed
        f = open(filepath, 'r')
        if skip_headers:
            f.readline()

        for i, line in enumerate(f):
            _, rating, review = line.strip().split('\t')

            # clean the review
            review = clean_string(review)
            review_words = review.strip().split()
            num_words = len(review_words)
            split = int(np.random.rand() < train_ratio)

            # create record
            wdata['y'] = int(float(rating))
            wdata['text'] = review
            wdata['num_words'] = num_words
            wdata['split'] = split
            reviews_text[i] = wdata

            # update the vocab if needed
            if build_vocab:
                for word in review_words:
                    vocab[word] += 1

            nsamples += 1

        # histogram of class labels, sentence length
        ratings, counts = np.unique(
            reviews_text['y'][:nsamples], return_counts=True)
        sen_len, sen_len_counts = np.unique(
            reviews_text['num_words'][:nsamples], return_counts=True)
        vocab_size = len(vocab)
        nclass = len(ratings)
        reviews_text.attrs['vocab_size'] = vocab_size
        reviews_text.attrs['nrows'] = nsamples
        reviews_text.attrs['nclass'] = nclass
        reviews_text.attrs['class_distribution'] = counts
        print "vocabulary size - ", vocab_size
        print "# of samples - ", nsamples
        print "# of classes", nclass
        print "class distribution - ", ratings, counts
        sen_counts = zip(sen_len, sen_len_counts)
        sen_counts = sorted(sen_counts, key=lambda kv: kv[1], reverse=True)
        print "sentence length - ", len(sen_len), sen_len, sen_len_counts

        # WARNING: assume vocab is of order ~4-5 million words.
        # sort the vocab and re-assign ids by frequency; useful for downstream tasks
        # only done for train data
        if build_vocab:
            vocab_sorted = sorted(
                vocab.items(), key=lambda kv: kv[1], reverse=True)
            vocab = {}
            for i, (t, _) in enumerate(vocab_sorted):
                vocab[t] = i

        # map text to integers
        ntrain = 0
        nvalid = 0
        for i in range(nsamples):
            text = reviews_text[i]['text']
            if isinstance(text, bytes):  # h5py may return vlen strings as bytes
                text = text.decode('utf-8')
            y = int(reviews_text[i]['y'])
            split = reviews_text[i]['split']
            text_int = [y] + [vocab[t] for t in text.strip().split()]
            if split:
                reviews_train[ntrain] = text_int
                ntrain += 1
            else:
                reviews_valid[nvalid] = text_int
                nvalid += 1
        reviews_text.attrs['ntrain'] = ntrain
        reviews_text.attrs['nvalid'] = nvalid
        print "# of train - {0}, # of valid - {1}".format(reviews_text.attrs['ntrain'],
                                                          reviews_text.attrs['nvalid'])
        # close open files
        h5f.close()
        f.close()

    if not os.path.exists(fname_vocab):
        rev_vocab = {}
        for wrd, wrd_id in vocab.items():
            rev_vocab[wrd_id] = wrd
        print("vocabulary from IMDB dataset is saved into {}".format(fname_vocab))
        pickle.dump((vocab, rev_vocab), open(fname_vocab, 'wb'))

    return fname_h5, fname_vocab
Example #6
input_numpy = np.zeros((sentence_length, 1), dtype=np.int32)

correct_predictions = 0
neg_files = 0
total_filecount = 0
neg_predictions = 0
pos_predictions = 0

for file in os.listdir('data/test/neg'):
    if file.endswith('.txt'):
        filename = 'data/test/neg/' + file
        with open(filename, 'r') as f:
            movie_review = f.read()

        tokens = clean_string(movie_review).strip().split()

        # map tokens to vocabulary ids: mark oov words and add the start token
        sentence = [
            len(vocab) + 1 if t not in vocab else vocab[t] for t in tokens
        ]
        sentence = [start] + [w + index_from for w in sentence]
        sentence = [oov if w >= vocab_size else w for w in sentence]

        # truncate and pad
        trunc = sentence[-sentence_length:]   # take the last sentence_length words
        input_numpy[:] = 0                    # fill with zeros (padding)
        input_numpy[-len(trunc):, 0] = trunc  # place the review at the end of the buffer
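The snippet above is cut off before the inference step. A hypothetical continuation (the xdev device buffer, the two-class softmax layout with row 1 as "positive", and the 0.5 threshold are all assumptions, not part of the original example) might look like this:

        xdev = be.zeros((sentence_length, 1), dtype=np.int32)  # in practice, allocate once outside the loop
        xdev[:] = input_numpy
        y_pred = model_new.fprop(xdev, inference=True)  # inference=True disables dropout
        prob_positive = float(y_pred.get()[1, 0])       # assumed layout: row 0 negative, row 1 positive

        # tally results: every file under data/test/neg is a negative review
        total_filecount += 1
        neg_files += 1
        if prob_positive < 0.5:
            neg_predictions += 1
            correct_predictions += 1  # predicted negative on a negative review
        else:
            pos_predictions += 1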
Example #7
xdev = be.zeros((sentence_length, 1), dtype=np.int32)  # device buffer of shape (sentence_length, batch size 1)
xbuf = np.zeros((1, sentence_length), dtype=np.int32)
oov = 2
start = 1
index_from = 3
pad_char = 0
vocab, rev_vocab = pickle.load(open(args.vocab_file, 'rb'))

# walk over the reviews in the text files, making inferences
for dirpath, dirs, files in os.walk(args.review_files):
    for file in files:
        with open(os.path.join(dirpath, file), 'r') as myfile:
            data = myfile.read()

            # clean the input
            tokens = clean_string(data).strip().split()

            # check for oov and add start
            sent = [len(vocab) + 1 if t not in vocab else vocab[t] for t in tokens]
            sent = [start] + [w + index_from for w in sent]
            sent = [oov if w >= vocab_size else w for w in sent]

            # pad sentences
            xbuf[:] = 0
            trunc = sent[-sentence_length:]
            xbuf[0, -len(trunc):] = trunc
            xdev[:] = xbuf.T.copy()
            y_pred = model_new.fprop(xdev, inference=True)  # inference=True disables dropout

            with open(os.path.join(args.output_dir, file), "w") as output_file:
                output_file.write("Pred - {0}\n".format(y_pred.get().T))