def enrichAllRedditPlaces(): with open(utils.getFullPathFromDataFileName('places.json')) as data_file: places = json.load(data_file) placeCoordinateDictionary = {} for place in places: cityName = place["name"] cityProperName = place["properName"] coordinates = place["coordinates"] placeCoordinateDictionary[cityName] = str(coordinates) print '' print '' print 'About to enrich all reddit data' fileName = 'reddit/allRedditComments' # Enrich with weather data print 'Enriching with Weather Data' augmentWeather.enrichWithWeather(fileName, placeCoordinateDictionary) # Enrich with sentiment print 'Enriching with Sentiment' sentiment.enrichWithSentiment(fileName) # Clean data print 'Cleaning Data' clean.clean(fileName) # Group Data print 'Grouping Data' groupDataByHour(fileName)
def enrichAllPlaces(): with open(utils.getFullPathFromDataFileName('places.json')) as data_file: places = json.load(data_file) for place in places: cityName = place["name"] cityProperName = place["properName"] coordinates = place["coordinates"] print '' print '' print 'About to enrich: ', cityProperName if cityDataExists(cityName): print 'City Data found for: ', cityProperName # # Enrich with weather data # print 'Enriching with Weather Data' # augmentWeather.enrichWithWeather(cityName, coordinates) # # Enrich with sentiment # print 'Enriching with Sentiment' # sentiment.enrichWithSentiment(cityName) # # # Clean data # print 'Cleaning Data' # clean.clean(cityName) # Group Data print 'Grouping Data' groupDataByHour(cityName) else: print 'No data file found for: ', cityProperName
def retrieveJsonData(jsonFileNames):
    """Load each JSON file named in jsonFileNames and return the
    concatenation of their list-valued contents."""
    combined = []
    for fileName in jsonFileNames:
        filePath = utils.getFullPathFromDataFileName(fileName)
        with open(filePath) as handle:
            combined.extend(json.load(handle))
    return combined
def enrichWithSentiment(cityName): inputFilePath = utils.getFullPathFromDataFileName(cityName + '_weather.json') outputFilePath = utils.getFullPathFromDataFileName( cityName + '_weather_sentiment.json') if 'Reddit' in cityName and sys.getdefaultencoding() != 'utf-8': reload(sys) sys.setdefaultencoding('utf-8') count = 0 with open(inputFilePath) as data_file: data = json.load(data_file) print 'Adding sentiments to tweet list of length: ', len(data) for tweet in data: if count % 100000 == 0: print "Adding sentiment data: ", count count = count + 1 tweet_body = tweet['body'] if 'Reddit' in cityName: if type(tweet_body) == unicode: tweet_body = tweet_body.encode('utf-8') tweet_body = tweet_body.strip() else: tweet_body = str(tweet_body) sent_score = 0 try: sent_score = getSentScore(tweet_body) # if tweet_body: # tweet_word = tweet_body.lower().split() # # print tweet_word # for word in tweet_word: # word = word.rstrip('?:!.,;"!@') # word = word.replace("\n", "") # if word in sentScores: # # print word # sent_score = sent_score + float(sentScores[word]) except Exception, (e): print str(e) tweet['sentiment'] = sent_score print 'Saving file to ', outputFilePath with open(outputFilePath, 'w') as outfile: json.dump(data, outfile) print 'File saved to ', outputFilePath
def getPresavedWeatherData(gps):
    """Return the cached weather dict for the given GPS string, or an empty
    dict when no cache file exists yet."""
    cachePath = utils.getFullPathFromDataFileName(
        'weather/weatherData_' + gps + '.json')
    if not os.path.isfile(cachePath):
        return dict()
    with open(cachePath) as cacheFile:
        return json.load(cacheFile)
def clean(cityName): inputPath = utils.getFullPathFromDataFileName(cityName + '_weather_sentiment.json') outputPath = utils.getFullPathFromDataFileName(cityName + '_weather_sentiment_clean.json') with open(inputPath) as data_file: dataEntries = json.load(data_file) count = 0 cleanData = [] for dataEntry in dataEntries: if count % 100000 == 0: print "Cleaning data -- count: ", count count = count + 1 if hasNonNeutralSentiment(dataEntry) and hasWeatherData(dataEntry): addCreatedField(dataEntry) fixWeatherData(dataEntry) addSentimentLabel(dataEntry) cleanData.append(dataEntry) print 'Saving file: ', outputPath with open(outputPath, 'w') as outfile: json.dump(cleanData, outfile) print 'Saved file: ', outputPath
def countGroupData(cityName): inputPath = utils.getFullPathFromDataFileName( cityName + '_weather_sentiment_clean_grouped.json') with open(inputPath) as data_file: dataEntries = json.load(data_file) for data in dataEntries: if data['sentiment_average'] < -1 or data['sentiment_average'] > 1: print 'average is bad' print data if data['sentiment_percent_positive'] < 0 or data[ 'sentiment_percent_positive'] > 1: print 'percent is bad:' print data return len(dataEntries)
def printClassVariables():
    """Concatenate the grouped per-city data sets into one DataFrame and
    dump it to csv/full_data.csv."""
    cities = [
        'chicago',
        'denver',
        'detroit',
        'houston',
        'manhattan',
        'phoenix',
        'sanFrancisco',
        'seattle',
    ]
    jsonFileNames = [
        city + '_weather_sentiment_clean_grouped.json' for city in cities
    ]
    dataframe = pd.DataFrame(retrieveJsonData(jsonFileNames))
    outputPath = utils.getFullPathFromDataFileName('csv/full_data.csv')
    dataframe.to_csv(outputPath)
def countAllPlaces(): with open(utils.getFullPathFromDataFileName('places.json')) as data_file: places = json.load(data_file) sum = 0 placeCounts = [] for place in places: cityName = place["name"] cityProperName = place["properName"] if cityDataExists(cityName): count = countGroupData(cityName) placeCounts.append((cityProperName, count)) sum = sum + count else: print 'No data file found for: ', cityProperName placeCountsSorted = sorted(((v, k) for k, v in placeCounts), reverse=True) for key, value in placeCountsSorted: print value + ': ' + str(key) print sum
def countRedditData(): # inputPath = utils.getFullPathFromDataFileName('reddit/allRedditComments.json') # inputPath = utils.getFullPathFromDataFileName('reddit/allRedditComments_weather.json') # inputPath = utils.getFullPathFromDataFileName('reddit/allRedditComments_weather_sentiment.json') # inputPath = utils.getFullPathFromDataFileName('reddit/allRedditComments_weather_sentiment_clean.json') inputPath = utils.getFullPathFromDataFileName( 'reddit/allRedditComments_weather_sentiment_clean_grouped.json') dataEntries = json.load(open(inputPath)) counts = {} # cityKey = 'city' cityKey = 'location' for dataEntry in dataEntries: city = dataEntry[cityKey] if city not in counts: counts[city] = {} counts[city]['count'] = 1 counts[city]['weatherCount'] = 1 else: counts[city]['count'] = counts[city]['count'] + 1 if 'temperature' in dataEntry: counts[city]['weatherCount'] = counts[city]['weatherCount'] + 1 print counts print len(dataEntries) return len(dataEntries)
def cityDataExists(cityName):
    """Return True when a raw <cityName>.json data file is present."""
    return os.path.isfile(
        utils.getFullPathFromDataFileName(cityName + '.json'))
def countData(cityName):
    """Return the number of raw entries in <cityName>.json."""
    dataPath = utils.getFullPathFromDataFileName(cityName + '.json')
    with open(dataPath) as dataFile:
        return len(json.load(dataFile))
def groupDataByHour(cityName): inputPath = utils.getFullPathFromDataFileName( cityName + '_weather_sentiment_clean.json') outputPath = utils.getFullPathFromDataFileName( cityName + '_weather_sentiment_clean_grouped.json') print 'Opening data from: ', inputPath with open(inputPath) as data_file: dataEntries = json.load(data_file) print 'Data of length: ', len(dataEntries) count = 0 groupedData = {} for dataEntry in dataEntries: if count % 100000 == 0: print "Grouping data -- count: ", count count = count + 1 sentiment = dataEntry['sentiment'] sentimentScore = float(dataEntry['sentimentScore']) if 'location' in dataEntry: location = dataEntry['location'] elif 'city' in dataEntry: location = dataEntry['city'] timeHour = time.strftime('%Y-%m-%d %H', time.localtime(dataEntry['created'])) groupKey = makeUniqueKey(timeHour, location) if groupKey in groupedData: if groupedData[groupKey]['temperature'] != dataEntry[ 'temperature']: print '' print 'This data entry is wrong' print dataEntry oldCount = groupedData[groupKey]['num_data'] newCount = oldCount + 1.0 oldSentimentAverageScore = groupedData[groupKey][ 'sentiment_percent_positive'] newSentimentAverageScore = ( (oldSentimentAverageScore * oldCount) + sentimentScore) / newCount groupedData[groupKey][ 'sentiment_percent_positive'] = newSentimentAverageScore oldSentimentAverage = groupedData[groupKey][ 'sentiment_average'] newSentimentAverage = ( (oldSentimentAverage * oldCount) + sentiment) / newCount groupedData[groupKey][ 'sentiment_average'] = newSentimentAverage groupedData[groupKey]['num_data'] = newCount else: weatherColumnNames = [ 'cloudCover', 'temperature', 'dewPoint', 'visibility', 'apparentTemperature', 'pressure', 'precipIntensity', 'precipTypeNone', 'precipTypeRain', 'precipTypeSnow', 'humidity', 'windSpeed', 'precipProbability', 'precipType', 'icon', ] newDataEntry = { 'timeHour': timeHour, 'created': dataEntry['created'], 'time': dataEntry['time'], 'sentiment_average': sentiment, 
'sentiment_percent_positive': sentimentScore, 'num_data': 1.0 } if 'location' in dataEntry: newDataEntry['location'] = location elif 'city' in dataEntry: newDataEntry['location'] = location for weatherColumn in weatherColumnNames: newDataEntry[weatherColumn] = dataEntry[weatherColumn] groupedData[groupKey] = newDataEntry print 'Saving file: ', outputPath print '# values: ', str(len(groupedData)) with open(outputPath, 'w') as outfile: groupedDataValues = groupedData.values() json.dump(groupedDataValues, outfile) print 'Saved file: ', outputPath
def enrichWithWeather(location_name, coordinates): actualCityNameMap = { 'chicago': 'chicago', 'asburypark': 'asburyPark', 'denver': 'denver', 'detroit': 'detroit', 'houston': 'houston', 'nyc': 'manhattan', 'phoenix': 'phoenix', 'sanfrancisco': 'sanFrancisco', 'san francisco': 'sanFrancisco', 'seattle': 'seattle', 'manhattan': 'manhattan' } locationWeatherDictionary = {} print 'Getting weather data' if type(coordinates) == unicode: # weather = None weather = getNewWeather.getWeatherForCoordinates(coordinates) locationWeatherDictionary[location_name] = weather else: for place, coordinate in coordinates.iteritems(): weather = getNewWeather.getWeatherForCoordinates(coordinate) # weather = None locationWeatherDictionary[place] = weather dataFilePath = utils.getFullPathFromDataFileName(location_name + '.json') with open(dataFilePath) as data_file: jsonData = json.load(data_file) print 'Adding weather to data of length = ' + str(len(jsonData)) count = 0 for dataObject in jsonData: if count % 100000 == 0: print "Adding weather data: ", count count = count + 1 if 'created' in dataObject: datetime = dataObject['created'] city_ = str(dataObject['city'].lower().strip()) else: datetime = dataObject['created_at']['$date'] if 'location' not in dataObject: dataObject['location'] = location_name city_ = str(dataObject['location'].lower().strip()) # try: place = actualCityNameMap[city_] weather = locationWeatherDictionary[place] tweetWeather = getWeatherAtDatetime(datetime, locationWeatherDictionary[place]) dataObject.update(tweetWeather) # except: # print city_ outputPath = utils.getFullPathFromDataFileName(location_name + '_weather.json') print 'Saving file: ', outputPath with open(outputPath, 'w') as outfile: json.dump(jsonData, outfile) print 'Saved file: ', outputPath return outputPath
def savePresavedWeatherData(gps, json_data):
    """Persist fetched weather data to the per-GPS cache file."""
    cachePath = utils.getFullPathFromDataFileName(
        'weather/weatherData_' + gps + '.json')
    with open(cachePath, 'w') as cacheFile:
        json.dump(json_data, cacheFile)