# ---------------------------------------------------------------------------
# Load the evaluation ("test") tweets and collect one parallel list per
# feature for later vectorisation.
# NOTE(review): testDescription and testLinks are appended to below but are
# not (re)initialised in this section -- presumably they are created earlier
# in the file; confirm before reordering.
# ---------------------------------------------------------------------------
testLocations = []
testSource = []
testTexts = []
testUserName = []
testTimeZone = []
testUtc = []
testUserIds = []
testUserLang = []
testCreatedAt = []

# Context manager ensures the handle is closed even on error
# (the original called open() and never closed the file).
with open(testFile) as f:
    for line in f:
        instance = parseJsonLine(line)

        testDescription.append(str(instance.description))
        testLinks.append(extractPreprocessUrl(instance.urls))
        testLocations.append(str(instance.location))
        testSource.append(str(instance.source))
        testTexts.append(instance.text)
        testUserName.append(str(instance.name))
        testTimeZone.append(str(instance.timezone))
        testUtc.append(str(instance.utcOffset))
        testUserLang.append(str(instance.userLanguage))
        # Tweet time bucketed as "<hour>-<rounded minute>".
        testCreatedAt.append(
            str(instance.createdAt.hour) + "-" +
            str(roundMinutes(instance.createdAt.minute)))
        # NOTE(review): the user *name* is stored as the id -- looks
        # intentional (screen name used as identifier) but worth confirming.
        testUserIds.append(instance.userName)
#############################
def _one_hot_if_known(encoder, values):
    """Boolean one-hot matrix over *encoder*'s classes.

    Rows whose value was never seen by the encoder stay all-False instead of
    raising, mirroring the original unknown-domain/TLD handling.
    """
    matrix = np.zeros((len(values), len(encoder.classes_)), dtype="bool")
    for i, value in enumerate(values):
        if value in encoder.classes_:
            matrix[i, encoder.transform([value])[0]] = True
    return matrix


def _one_hot(encoder, values):
    """Boolean one-hot matrix for values the *encoder* is guaranteed to know."""
    indices = encoder.transform(values)
    matrix = np.zeros((len(values), len(encoder.classes_)), dtype="bool")
    for i, index in enumerate(indices):
        matrix[i, index] = True
    return matrix


def _texts_to_padded(tokenizer, texts, maxlen):
    """Tokenize *texts* and pad/truncate each sequence to *maxlen*."""
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(np.asarray(sequences), maxlen=maxlen)


def _encode_batch(instances, labels):
    """Vectorise one batch of parsed tweets into the model's named inputs.

    Returns the ``(inputs_dict, class_indices)`` tuple yielded to the model.
    """
    links = [extractPreprocessUrl(i.urls) for i in instances]
    created = [str(i.createdAt.hour) + "-" +
               str(roundMinutes(i.createdAt.minute)) for i in instances]

    inputs = {
        'inputDescription': _texts_to_padded(
            descriptionTokenizer,
            [str(i.description) for i in instances],
            MAX_DESC_SEQUENCE_LENGTH),
        # URL domain and top-level domain; unseen values become all-zero rows.
        'inputDomain': _one_hot_if_known(
            domainEncoder, [link[0] for link in links]),
        'inputTld': _one_hot_if_known(
            tldEncoder, [link[1] for link in links]),
        'inputLocation': _texts_to_padded(
            locationTokenizer,
            [str(i.location) for i in instances],
            MAX_LOC_SEQUENCE_LENGTH),
        'inputSource': _one_hot(
            sourceEncoder, [str(i.source) for i in instances]),
        'inputText': _texts_to_padded(
            textTokenizer,
            [i.text for i in instances],
            MAX_TEXT_SEQUENCE_LENGTH),
        'inputUser': _texts_to_padded(
            nameTokenizer,
            [str(i.name) for i in instances],
            MAX_NAME_SEQUENCE_LENGTH),
        'inputTimeZone': _texts_to_padded(
            timeZoneTokenizer,
            [str(i.timezone) for i in instances],
            MAX_TZ_SEQUENCE_LENGTH),
        'inputUTC': _one_hot(
            utcEncoder, [str(i.utcOffset) for i in instances]),
        # User language (63 languages).
        'inputUserLang': _one_hot(
            langEncoder, [str(i.userLanguage) for i in instances]),
        # Tweet time bucketed to "<hour>-<rounded minute>" (120 steps).
        'inputTweetTime': _one_hot(timeEncoder, created),
    }
    return inputs, classEncoder.transform(labels)


def batch_generator(twitterFile, goldstandard, batch_size=64):
    """Endlessly yield training batches from a gzipped file of tweet JSON.

    Args:
        twitterFile: path to a gzip file with one JSON-encoded tweet per line.
        goldstandard: mapping from tweet id to a place object whose ``_name``
            attribute is the class label.
        batch_size: number of tweets per yielded batch.

    Yields:
        ``(inputs_dict, class_indices)`` tuples suitable for Keras'
        ``fit_generator``.

    Notes:
        * A trailing partial batch at end of file is discarded before the
          file is re-read (same as the original behaviour).
        * A tweet whose id is missing from *goldstandard* raises ``KeyError``
          (unchanged from the original).
        * The original also collected ``instance.userMentions`` into a list
          that was never used; that dead code has been removed.
    """
    while True:  # loop forever so the consumer can draw unlimited batches
        with gzip.open(twitterFile, 'rb') as file:
            instances = []  # parsed tweets of the current batch
            labels = []     # gold labels, aligned with `instances`
            for line in file:
                instance = parseJsonLine(line.decode('utf-8'))
                instances.append(instance)
                labels.append(goldstandard[instance.id]._name)

                if len(instances) == batch_size:
                    yield _encode_batch(instances, labels)
                    instances = []
                    labels = []
# ---------------------------------------------------------------------------
# Build the training feature lists from the in-memory tweet mapping.
# Tweet time is encoded as a point on the unit circle (sin/cos of the
# second-of-day) so that 23:59 and 00:00 are represented as neighbours.
# NOTE(review): trainLabels is appended to below but not initialised in this
# section -- presumably it is created earlier in the file; confirm.
# ---------------------------------------------------------------------------
trainDescription = []
trainLinks = []
trainLocation = []
trainSource = []
trainTexts = []
trainUserName = []
trainTZ = []
trainUtc = []
trainUserLang = []
trainSinTime = []
trainCosTime = []

SECONDS_PER_DAY = 24 * 60 * 60

# Iterate the values directly; the original re-looked-up
# tweetToTextMapping[key] for every single field.
for tweet in tweetToTextMapping.values():
    trainLabels.append(tweet.place._name)
    trainDescription.append(str(tweet.description))
    trainLinks.append(extractPreprocessUrl(tweet.urls))
    trainLocation.append(str(tweet.location))
    trainSource.append(str(tweet.source))
    trainTexts.append(tweet.text)
    trainUserName.append(str(tweet.name))
    trainTZ.append(str(tweet.timezone))
    trainUtc.append(str(tweet.utcOffset))
    trainUserLang.append(str(tweet.userLanguage))

    # Second of the day mapped onto [0, 2*pi).
    second_of_day = (tweet.createdAt.hour * 3600
                     + tweet.createdAt.minute * 60
                     + tweet.createdAt.second)
    angle = 2 * np.pi * second_of_day / SECONDS_PER_DAY
    trainSinTime.append(np.sin(angle))
    trainCosTime.append(np.cos(angle))

# (n, 2) array: cyclic time representation, one row per tweet.
trainCreatedAt = np.column_stack((trainSinTime, trainCosTime))