import numpy as np
from collections import Counter
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


def obtainData(typeOfData="train"):
    """Load one split, tokenize it, and return padded sequences with one-hot labels."""
    # loadData is defined elsewhere in this repo.
    fileName = "20_train" if typeOfData == "train" else "20_test"
    sentences, emojis = loadData(fileName)
    texts = [' '.join(ele) for ele in sentences]
    # Fit the tokenizer on this split only and map each sentence to word ids.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    wordIdMap = tokenizer.word_index
    # Pad/truncate every sequence to the length of the longest sentence in the split.
    data = pad_sequences(sequences, padding='post', truncating='post')
    # to_categorical assumes the labels are already integer ids here.
    labels = to_categorical(np.asarray(emojis))
    return data, labels, wordIdMap
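# Note: obtainData() fits a separate Tokenizer per split, so the word ids of
# "20_train" and "20_test" are not guaranteed to agree; buildDataFull() below
# avoids this by fitting a single Tokenizer over all splits. A minimal usage
# sketch (assumes loadData() is defined elsewhere in this repo and the data
# files are on disk):
#
#   trainX, trainY, wordIdMap = obtainData("train")
#   print(trainX.shape, trainY.shape, len(wordIdMap))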
def buildDataFull():
    """Tokenize all splits together so word ids are consistent, then re-split."""
    # fileNames = ["20_train", "20_validation", "20_test"]  # alternative dataset (overwritten below)
    fileNames = ["5_train", "5_test"]
    sentences = []
    emojis = []
    trainLength = 0
    testLength = 0
    for ele in fileNames:
        currS, currE = loadData(ele)
        # Record how many examples belong to the first (train) and second (test) files.
        if trainLength == 0:
            trainLength = len(currE)
        elif testLength == 0:
            testLength = len(currE)
        sentences += currS
        emojis += currE
    emojiCounts = Counter(emojis)
    print("number of emojis detected is: " + str(len(emojiCounts)))
    # Assign each distinct emoji a contiguous integer id, plus the reverse map.
    emojiIdMap = {}
    idEmojiMap = {}
    for index, emoji in enumerate(emojiCounts):
        emojiIdMap[emoji] = index
        idEmojiMap[index] = emoji
    emojiLabels = [emojiIdMap[ele] for ele in emojis]
    texts = [' '.join(ele) for ele in sentences]
    # One tokenizer over every split keeps word indices consistent across them.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    wordIdMap = tokenizer.word_index
    idWordMap = {i: w for w, i in wordIdMap.items()}
    data = pad_sequences(sequences, padding='post', truncating='post')
    maxLength = data.shape[1]
    labels = to_categorical(np.asarray(emojiLabels))
    # Slice the combined matrix back into train (front) and test (tail) splits.
    trainX = data[:trainLength]
    testX = data[-testLength:]
    trainY = labels[:trainLength]
    testY = labels[-testLength:]
    return trainX, testX, trainY, testY, wordIdMap, maxLength, idEmojiMap
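# A minimal end-to-end sketch of how buildDataFull()'s outputs would feed a
# Keras classifier. Assumptions: loadData() and the "5_train"/"5_test" files
# exist; the Sequential model below is illustrative only, not this project's
# actual architecture.
if __name__ == "__main__":
    from keras.models import Sequential
    from keras.layers import Embedding, LSTM, Dense

    trainX, testX, trainY, testY, wordIdMap, maxLength, idEmojiMap = buildDataFull()

    model = Sequential()
    # +1 because Keras Tokenizer indices start at 1 and 0 is reserved for padding.
    model.add(Embedding(len(wordIdMap) + 1, 100, input_length=maxLength))
    model.add(LSTM(128))
    model.add(Dense(trainY.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    model.fit(trainX, trainY, validation_data=(testX, testY),
              epochs=5, batch_size=32)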