def prepareForTest(dataset_path="~/tweetnet/data/text_data.pkl"):
    # Load the pickled test split ("rb" so binary pickle protocols load correctly)
    text_data = pickle.load(open(expanduser(dataset_path), "rb"))
    testTweets, testHashtags, testMw = text_data[0], text_data[1], text_data[2]
    testTweetSequence, testHashtagSequence, testMwSequence = text_data[3], text_data[4], text_data[5]
    testStartIdx = text_data[6]

    # Load the word2vec embedding dictionary and build the hashtag dictionary
    # used to map predicted embeddings back to hashtag strings
    dictionary = pickle.load(open(expanduser("~/tweetnet/data/word2vec_dict.pkl"), "rb"))
    htDic = createHtDict(dictionary, testHashtags)

    return (htDic, testTweets, testHashtags, testMw,
            testTweetSequence, testHashtagSequence, testMwSequence, testStartIdx)
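# createHtDict is called throughout but defined elsewhere in the repo; the
# version below is only a minimal sketch of what it plausibly does, assuming
# it maps each hashtag string to its word2vec embedding and skips hashtags
# that are missing from the dictionary.
def createHtDict(dictionary, hashtags):
    htDic = {}
    for ht in hashtags:
        if ht in dictionary:
            htDic[ht] = dictionary[ht]
    return htDic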
nTestData = len(testTweets)
nTrainData = len(trainTweets)
nTestSequences = len(testTweetSequence)
nTrainSequences = len(trainTweetSequence)
print "Number of testing sequences: ", nTestSequences
print "Number of training sequences: ", nTrainSequences
print "Number of testing tweets: ", nTestData
print "Number of training tweets: ", nTrainData

# Load the word2vec dictionary ("rb" so binary pickle protocols load correctly)
dictionary = pickle.load(open(expanduser("~/tweetnet/data/word2vec_dict.pkl"), "rb"))

# Create the hashtag dictionary from the test hashtags
htDic = createHtDict(dictionary, testHashtags)

numEpochs = 50
lamb = 0.0001

# Build the cLSTM model: a single LSTM layer over the character sequences
print("Start building model ....")
model = Sequential()
model.add(LSTM(numHiddenFirst, input_shape=(sequenceLength, inputSize)))
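# The layers below are a hedged sketch of one plausible way to finish the
# model, not the repo's confirmed architecture: a softmax over the output
# vocabulary (outputSize is assumed to be defined alongside sequenceLength
# and inputSize) with a standard classification compile. lamb is presumably
# an L2 weight-decay coefficient applied wherever the full script adds its
# regularized layers.
model.add(Dense(outputSize))
model.add(Activation("softmax"))
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")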
dictionary = pickle.load(open(expanduser("~/tweetnet/data/word2vec_dict.pkl"), "rb"))

# Each entry of `hashtags` is a whitespace-separated line whose second and
# third tokens are the input and target hashtags; look up their 300-d
# word2vec embeddings.
data = numpy.zeros([len(hashtags), 300])
label = numpy.zeros([len(hashtags), 300])
inputStringLabel = []
outputStringLabel = []
for i in range(len(hashtags)):
    listHashtag = hashtags[i].split()
    data[i, :] = dictionary[listHashtag[1]]
    label[i, :] = dictionary[listHashtag[2]]
    inputStringLabel.append(listHashtag[1])
    outputStringLabel.append(listHashtag[2])

htDic = createHtDict(dictionary, outputStringLabel)

# Train/test split
trainPercent = 0.99
nTrainData = numpy.round(len(data) * trainPercent).astype(int)
topN = 10
nEpoch = 5000
logAllPredictions = True

trainData = data[0:nTrainData]
trainLabel = label[0:nTrainData]
# Everything after the training slice is held out for testing
testData = data[nTrainData:]
testInputStringLabel = inputStringLabel[nTrainData:]
testOutputStringLabel = outputStringLabel[nTrainData:]
print testData.shape

model = Sequential()
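# A hedged sketch of the 300-d -> 300-d embedding regressor the split above
# feeds: the layer sizes, the cosine_proximity loss, and the top-N lookup
# below are assumptions for illustration, not the repo's confirmed setup.
model.add(Dense(512, input_dim=300, activation="relu"))
model.add(Dense(300))
model.compile(loss="cosine_proximity", optimizer="adam")
model.fit(trainData, trainLabel, nb_epoch=nEpoch)  # nb_epoch is the Keras 1.x argument name

# Rank candidate hashtags by cosine similarity between a predicted embedding
# and each entry of htDic, keeping the topN closest matches.
def topNHashtags(predicted, htDic, topN):
    scores = []
    for ht, vec in htDic.items():
        sim = numpy.dot(predicted, vec) / (
            numpy.linalg.norm(predicted) * numpy.linalg.norm(vec) + 1e-8)
        scores.append((sim, ht))
    scores.sort(reverse=True)
    return [ht for _, ht in scores[:topN]]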