def prepare_data(self, data_fields, wv_size=600):
    test_data = Data(self.file_name, self.file_path)
    test_df = test_data.csv_df(data_fields)
    # make a copy of the original tweets for later use
    original_df = test_df.copy()
    # pre-process data (same as how we trained)
    test_data.pre_process(test_df)
    # then convert using word2vec
    model = test_data.build_wordvec(size=wv_size, verbose=False)
    # take a look at the max_len of the test set, although we still have to use max_len from train
    max_len_test = test_data.max_len(test_df)
    data = test_data.convert2vec(test_df, self.max_len_train, model,
                                 name='test_' + self.file_name)
    test_data.save_vec(data, name='test_' + self.file_name)
    self.data = data
    self.test_data = test_data
    self.test_df = test_df
    self.original_df = original_df
    print ">>>Done preparing data.<<<\n"
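# Usage sketch (assumptions: the enclosing class name `TweetPredictor` and its
# constructor arguments below are hypothetical, shown only to illustrate the call
# order and what prepare_data leaves on the instance):
#
#     predictor = TweetPredictor(file_name='new_tweets', file_path='data/')
#     predictor.prepare_data(data_fields=['text'], wv_size=600)
#     predictor.data         # word2vec-encoded tweets, padded to self.max_len_train
#     predictor.original_df  # untouched copies of the raw tweets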
data = np.load(data_file)
label = np.load(label_file)

# load original tweets
# ---------------------------------------------------------------------------------
sports_dic = {
    'basketball': 1,
    'hockey': 2,
    'baseball': 3,
    'tennis': 4,
    'volleyball': 5
}
sp_data = Data(sports_dic, file_path)
sp_df = sp_data.csv_df(['text'])  # load data
rm_hashtags = ['#' + s for s in sports_dic.keys()]
sp_data.pre_process(sp_df, rm_list=rm_hashtags)  # pre-process data
sp_df.drop(['tokenized'], axis=1, inplace=True)

# ---------------------------------------------------------------------------------
# set up lstm structure
n_classes = 5
hm_epochs = 20
batch_size = 50
chunk_size = data.shape[2]
n_chunks = data.shape[1]
rnn_size = 300

x = tf.placeholder('float', [None, n_chunks, chunk_size])  # height x width
y = tf.placeholder('float')
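# The layers that consume these placeholders are not shown in this section. Below is a
# minimal sketch of the usual TF 1.x construction for inputs shaped
# [batch, n_chunks, chunk_size]; the function name `recurrent_neural_network`, the
# random_normal-initialized readout layer, and the use of the last time-step output are
# assumptions modeled on the standard static_rnn pattern, not necessarily this repo's code.
def recurrent_neural_network(x):
    layer = {'weights': tf.Variable(tf.random_normal([rnn_size, n_classes])),
             'biases': tf.Variable(tf.random_normal([n_classes]))}

    # reshape [batch, n_chunks, chunk_size] into a list of n_chunks tensors,
    # each [batch, chunk_size], as static_rnn expects
    x = tf.transpose(x, [1, 0, 2])
    x = tf.reshape(x, [-1, chunk_size])
    x = tf.split(x, n_chunks, 0)

    lstm_cell = tf.contrib.rnn.BasicLSTMCell(rnn_size)
    outputs, states = tf.contrib.rnn.static_rnn(lstm_cell, x, dtype=tf.float32)

    # score the n_classes sports from the last time step's output
    return tf.matmul(outputs[-1], layer['weights']) + layer['biases']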