Example 1
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


def obtainData(typeOfData="train"):
    # loadData is a project-specific helper that returns tokenized
    # sentences and their emoji labels for the given file.
    fileName = "20_train" if typeOfData == "train" else "20_test"
    sentences, emojis = loadData(fileName)

    # Rejoin the token lists into strings and convert them to integer sequences.
    texts = [' '.join(ele) for ele in sentences]
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    wordIdMap = tokenizer.word_index
    # Zero-pad (and truncate) every sequence to the length of the longest one.
    data = pad_sequences(sequences, padding='post', truncating='post')

    # One-hot encode the integer emoji labels.
    labels = to_categorical(np.asarray(emojis))

    return data, labels, wordIdMap
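
Note that each call fits a fresh Tokenizer, so the word indices produced for the train and test files are not guaranteed to line up; Example 2 below avoids this by tokenizing both splits together. A minimal usage sketch, assuming the loadData helper and the "20_train"/"20_test" files from the original project are available:

# Hypothetical usage; loadData and the data files are assumed to exist.
trainData, trainLabels, trainWordIds = obtainData("train")
testData, testLabels, testWordIds = obtainData("test")
print(trainData.shape, trainLabels.shape)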
Example 2
from collections import Counter

import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


def buildDataFull():
    # The original source also listed ["20_train", "20_validation", "20_test"],
    # but the split logic below assumes exactly two files.
    fileNames = ["5_train", "5_test"]
    sentences = []
    emojis = []
    trainLength = 0
    testLength = 0
    for ele in fileNames:
        currS, currE = loadData(ele)
        # Record how many samples came from each file so the combined
        # data can be split back into train and test sets later.
        if trainLength == 0:
            trainLength = len(currE)
        elif testLength == 0:
            testLength = len(currE)
        sentences += currS
        emojis += currE

    emojiCounts = Counter(emojis)
    print("number of emojis detected is: " + str(len(emojiCounts)))
    # Map each distinct emoji to an integer id, and keep the inverse map.
    emojiIdMap = {}
    idEmojiMap = {}
    for index, emoji in enumerate(emojiCounts):
        emojiIdMap[emoji] = index
        idEmojiMap[index] = emoji
    emojiLabels = [emojiIdMap[ele] for ele in emojis]

    # Tokenize train and test together so both splits share one word index.
    texts = [' '.join(ele) for ele in sentences]
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    wordIdMap = tokenizer.word_index
    idWordMap = {i: w for w, i in wordIdMap.items()}
    data = pad_sequences(sequences, padding='post', truncating='post')
    maxLength = data.shape[1]

    # One-hot encode the integer emoji labels.
    labels = to_categorical(np.asarray(emojiLabels))

    # The first trainLength rows came from the train file,
    # the last testLength rows from the test file.
    trainX = data[:trainLength]
    testX = data[-testLength:]

    trainY = labels[:trainLength]
    testY = labels[-testLength:]

    return trainX, testX, trainY, testY, wordIdMap, maxLength, idEmojiMap
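
The returned values plug directly into an embedding-based classifier. A minimal usage sketch, again assuming loadData and the "5_train"/"5_test" files exist; the model architecture and hyperparameters here are illustrative, not taken from the original source:

from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# Hypothetical usage of the prepared data.
trainX, testX, trainY, testY, wordIdMap, maxLength, idEmojiMap = buildDataFull()

model = Sequential([
    # +1 because Keras word indices start at 1; index 0 is the pad value.
    Embedding(input_dim=len(wordIdMap) + 1, output_dim=64,
              input_length=maxLength),
    LSTM(64),
    # One output unit per emoji class, matching the one-hot labels.
    Dense(trainY.shape[1], activation='softmax'),
])
model.compile(optimizer='adam', loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(trainX, trainY, validation_data=(testX, testY), epochs=5)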