import json
import os

import couchdb
import tweepy
from mpi4py import MPI


def put_data_into_couchdb(db_json, grid_json, data_json):
    # Load CouchDB connection settings.
    with open(db_json) as f:
        db_info = json.load(f)
    couch_user = db_info['user']
    couch_password = db_info['password']
    couch_host = db_info['host']
    couch_port = db_info['port']
    db_name = db_info['processed_database']
    raw_db_name = db_info['raw_database']
    # source_url = db_info['tweet_source']
    host_and_port = ("http://" + couch_user + ":" + couch_password + "@"
                     + couch_host + ":" + str(couch_port))
    couch = couchdb.Server(host_and_port)
    try:
        db = couch.create(db_name)  # create db
    except couchdb.http.PreconditionFailed:
        db = couch[db_name]  # existing
    try:
        raw_db = couch[raw_db_name]  # existing
    except couchdb.http.ResourceNotFound:
        raw_db = couch.create(raw_db_name)  # create db

    # Load the suburb grid polygons used by sentiment_polarity().
    with open(grid_json) as f:
        suburbs = json.load(f)

    f = open(data_json, encoding='utf-8')
    f.readline()  # Skip the dump header line
    count = 0
    while True:
        print(count)
        process_data = []
        raw_data = []
        for i in range(500):  # Process the dump in batches of 500 rows
            line = f.readline().strip("\n").strip()
            if line and line[-1] == ",":
                line = line[:-1]  # Drop the trailing comma between rows
            try:
                tweet = json.loads(line)
            except json.JSONDecodeError:
                # End of file (or the closing "]}" line): flush and stop.
                raw_db.update(raw_data)
                db.update(process_data)
                print(count)
                print("Done.")
                f.close()
                return
            twitter = tweet['doc']
            twitter.pop('_rev')  # Remove _rev so the doc can be stored
            raw_data.append(twitter)
            info = sentiment_polarity(twitter, suburbs)
            if info is not None:
                process_data.append(info)
            count += 1
        raw_db.update(raw_data)
        db.update(process_data)
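# `sentiment_polarity` is defined elsewhere in the project. Below is a minimal
# sketch of the interface the callers in this file rely on, assuming TextBlob
# for the polarity score and Shapely for the point-in-polygon test; the GeoJSON
# field names ('features', 'properties', 'name') are assumptions, not confirmed
# by the source. It returns a processed document, or None when the tweet
# cannot be placed in a grid cell.
from shapely.geometry import shape, Point
from textblob import TextBlob


def sentiment_polarity(twitter, suburbs):
    coords = (twitter.get('coordinates') or {}).get('coordinates')
    if coords is None:
        return None  # No geotag, so the tweet cannot be mapped to a suburb
    point = Point(coords[0], coords[1])  # GeoJSON order: (longitude, latitude)
    polarity = TextBlob(twitter.get('text', '')).sentiment.polarity
    for feature in suburbs.get('features', []):
        if shape(feature['geometry']).contains(point):
            return {'_id': twitter['id_str'],
                    'suburb': feature['properties'].get('name'),
                    'polarity': polarity}
    return None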
def output2couchdb(self, data):
    host_and_port = ("http://" + self.couch_user + ":" + self.couch_password
                     + "@" + self.couch_host + ":" + str(self.couch_port))
    couch = couchdb.Server(host_and_port)
    try:
        db = couch[self.db_name]  # existing
    except couchdb.http.ResourceNotFound:
        db = couch.create(self.db_name)  # create db
    try:
        raw_db = couch[self.raw_db_name]  # existing
    except couchdb.http.ResourceNotFound:
        raw_db = couch.create(self.raw_db_name)  # create db

    twitter = json.loads(data)  # data is already a str; no re-encoding needed
    # Use the tweet id as the document id so re-harvested tweets do not
    # create duplicates.
    raw_db[twitter['id_str']] = twitter
    info = sentiment_polarity(twitter, self.suburbs)
    if info is not None:
        db.update([info])
def get_tweets(db_json, access_json, grid_json, circle):
    with open(db_json) as f:
        db_info = json.load(f)
    with open(grid_json) as f:
        suburbs = json.load(f)
    couch_user = db_info['user']
    couch_password = db_info['password']
    couch_host = db_info['host']
    couch_port = db_info['port']
    db_name = db_info['processed_database']
    raw_db_name = db_info['raw_database']
    # source_url = db_info['tweet_source']
    host_and_port = ("http://" + couch_user + ":" + couch_password + "@"
                     + couch_host + ":" + str(couch_port))
    couch = couchdb.Server(host_and_port)
    try:
        db = couch[db_name]  # existing
    except couchdb.http.ResourceNotFound:
        db = couch.create(db_name)  # create db
    try:
        raw_db = couch[raw_db_name]  # existing
    except couchdb.http.ResourceNotFound:
        raw_db = couch.create(raw_db_name)  # create db

    api = tweepy.API(get_authorization(access_json), wait_on_rate_limit=True)
    # Search for tweets inside the given "lat,long,radius" circle and store
    # each one as it arrives.
    for status in tweepy.Cursor(api.search, q='', geocode=circle).items():
        twitter = status._json
        raw_db[twitter['id_str']] = twitter
        info = sentiment_polarity(twitter, suburbs)
        if info is not None:
            db.update([info])
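# `get_authorization` is not shown in this section. A minimal sketch, assuming
# `access_json` holds the four Twitter credentials under the key names used
# below (the key names are assumptions):
def get_authorization(access_json):
    with open(access_json) as f:
        keys = json.load(f)
    auth = tweepy.OAuthHandler(keys['consumer_key'], keys['consumer_secret'])
    auth.set_access_token(keys['access_token'], keys['access_token_secret'])
    return auth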
def put_data_into_couchdb(db_json, grid_json, data_json):
    with open(db_json) as f:
        db_info = json.load(f)
    couch_user = db_info['user']
    couch_password = db_info['password']
    couch_host = db_info['host']
    couch_port = db_info['port']
    db_name = db_info['processed_database']
    raw_db_name = db_info['raw_database']
    # source_url = db_info['tweet_source']
    host_and_port = ("http://" + couch_user + ":" + couch_password + "@"
                     + couch_host + ":" + str(couch_port))
    couch = couchdb.Server(host_and_port)
    try:
        db = couch.create(db_name)  # create db
    except couchdb.http.PreconditionFailed:
        db = couch[db_name]  # existing
    try:
        raw_db = couch.create(raw_db_name)  # create db
    except couchdb.http.PreconditionFailed:
        raw_db = couch[raw_db_name]  # existing

    with open(grid_json) as f:
        suburbs = json.load(f)

    comm = MPI.COMM_WORLD
    comm_size = comm.Get_size()
    rank = comm.Get_rank()

    if rank == 0:
        # Calculate a rough start offset for each rank by dividing the file
        # size evenly among the processes.
        FILE_SIZE = os.path.getsize(data_json)
        rough_start = [r * int(FILE_SIZE / comm_size) for r in range(comm_size)]
        start = [0] * (comm_size + 1)
        file = open(data_json, "r")
        start[0] = len(file.readline())  # Skip the dump header line
        # Snap each rough offset forward to the next line break so every rank
        # starts on a whole line. (Offsets are character counts, which assumes
        # a mostly-ASCII dump where they match byte positions.)
        for r in range(comm_size - 1):
            file.seek(rough_start[r + 1])
            start[r + 1] = rough_start[r + 1] + len(file.readline())
        start[-1] = FILE_SIZE
        file.close()
    else:
        start = None
    start = comm.bcast(start, root=0)

    file = open(data_json, "r")
    file.seek(start[rank])  # start[rank] already points at a line boundary

    count = 0
    batch_size = 100
    finished = False
    while not finished:
        process_data = []
        raw_data = []
        for i in range(batch_size):
            line = file.readline().strip("\n").strip()
            if line == ']}':
                finished = True  # Reached the dump footer
                break
            if len(line) == 0:
                continue
            if line[-1] == ",":
                line = line[:-1]  # Drop the trailing comma between rows
            tweet = json.loads(line)
            twitter = tweet['doc']
            twitter.pop('_rev')  # Remove _rev so the doc can be stored
            raw_data.append(twitter)
            info = sentiment_polarity(twitter, suburbs)
            if info is not None:
                process_data.append(info)
            count += 1
            if file.tell() >= start[rank + 1]:
                finished = True  # Reached the next rank's chunk
                break
        raw_db.update(raw_data)
        db.update(process_data)
    print(count)
    print("Done.")
    file.close()
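# The MPI variant above splits the dump file by offset, one chunk per rank, so
# every process parses and stores its own slice in parallel. It is assumed to
# be launched with one process per chunk, for example:
#   mpiexec -n 4 python put_data_into_couchdb.py
# (the script name and process count are illustrative).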
def put_data_into_couchdb(db_json, grid_json, data_json):
    with open(db_json) as f:
        db_info = json.load(f)
    couch_user = db_info['user']
    couch_password = db_info['password']
    couch_host = db_info['host']
    couch_port = db_info['port']
    db_name = db_info['processed_database']
    raw_db_name = db_info['raw_database']
    # source_url = db_info['tweet_source']
    host_and_port = ("http://" + couch_user + ":" + couch_password + "@"
                     + couch_host + ":" + str(couch_port))
    couch = couchdb.Server(host_and_port)
    try:
        db = couch.create(db_name)  # create db
    except couchdb.http.PreconditionFailed:
        db = couch[db_name]  # existing
    try:
        raw_db = couch[raw_db_name]  # existing
    except couchdb.http.ResourceNotFound:
        raw_db = couch.create(raw_db_name)  # create db

    with open(grid_json) as f:
        suburbs = json.load(f)
    with open(data_json, 'r', encoding='utf-8') as f:
        geocoded_tweets = json.load(f)

    rows = geocoded_tweets['rows']
    offset = 0
    limit = 500
    # Process the rows in batches; the slice clamps automatically on the
    # final partial batch.
    while offset < len(rows):
        print(offset)
        try:
            process_data = []
            raw_data = []
            for tweet in rows[offset:offset + limit]:
                twitter = tweet['doc']
                twitter.pop('_rev')  # Remove _rev so the doc can be re-stored
                raw_data.append(twitter)
                info = sentiment_polarity(twitter, suburbs)
                if info is not None:
                    process_data.append(info)
            raw_db.update(raw_data)
            db.update(process_data)
        except Exception as e:
            print("Skipping batch at offset", offset, ":", e)
        # Advance even after a failed batch, otherwise the loop never ends.
        offset += limit
    print(offset)
    print("Done.")
def put_data_into_couchdb(db_json, grid_json, start, end):
    with open(db_json) as f:
        db_info = json.load(f)
    couch_user = db_info['user']
    couch_password = db_info['password']
    couch_host = db_info['host']
    couch_port = db_info['port']
    db_name = db_info['processed_database']
    raw_db_name = db_info['raw_database']
    source_url = db_info['tweet_source']
    host_and_port = ("http://" + couch_user + ":" + couch_password + "@"
                     + couch_host + ":" + str(couch_port))
    couch = couchdb.Server(host_and_port)
    try:
        db = couch.create(db_name)  # create db
    except couchdb.http.PreconditionFailed:
        db = couch[db_name]  # existing
    try:
        raw_db = couch[raw_db_name]  # existing
    except couchdb.http.ResourceNotFound:
        raw_db = couch.create(raw_db_name)  # create db

    with open(grid_json) as f:
        suburbs = json.load(f)

    # Ask the source for its total row count, then page through it 100 rows
    # at a time. `auth` is assumed to be defined at module level.
    total_rows = get_geocoded_tweets(source_url, start_key=start, end_key=end,
                                     skip=0, limit=1, auth=auth)['total_rows']
    limit = 100
    for i in range(int(total_rows / limit)):
        skip = i * limit
        try:
            geocoded_tweets = get_geocoded_tweets(source_url, start_key=start,
                                                  end_key=end, skip=skip,
                                                  limit=limit, auth=auth)
            print(len(geocoded_tweets['rows']))
            process_data = []
            raw_data = []
            for tweet in geocoded_tweets['rows']:
                twitter = tweet['doc']
                twitter.pop('_rev')  # Remove _rev so the doc can be stored
                raw_data.append(twitter)
                info = sentiment_polarity(twitter, suburbs)
                if info is not None:
                    process_data.append(info)
            raw_db.update(raw_data)
            db.update(process_data)
        except Exception as e:
            print("Skipping page at skip =", skip, ":", e)

    # Fetch the final partial page left over after the full pages above.
    skip = int(total_rows / limit) * limit
    try:
        geocoded_tweets = get_geocoded_tweets(source_url, start_key=start,
                                              end_key=end, skip=skip,
                                              auth=auth)
        process_data = []
        raw_data = []
        for tweet in geocoded_tweets['rows']:
            twitter = tweet['doc']
            twitter.pop('_rev')  # Remove _rev so the doc can be stored
            raw_data.append(twitter)
            info = sentiment_polarity(twitter, suburbs)
            if info is not None:
                process_data.append(info)
        raw_db.update(raw_data)
        db.update(process_data)
    except Exception as e:
        print("Skipping final page at skip =", skip, ":", e)
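# `get_geocoded_tweets` is not shown in this section. A minimal sketch,
# assuming `source_url` is a CouchDB view URL and `auth` is a
# (user, password) tuple; the exact query parameters the real helper sends
# are assumptions:
import requests


def get_geocoded_tweets(source_url, start_key, end_key, skip, limit=None,
                        auth=None):
    params = {
        'startkey': json.dumps(start_key),  # CouchDB expects JSON-encoded keys
        'endkey': json.dumps(end_key),
        'skip': skip,
        'include_docs': 'true',  # Callers read tweet['doc'] from each row
    }
    if limit is not None:
        params['limit'] = limit
    response = requests.get(source_url, params=params, auth=auth)
    response.raise_for_status()
    return response.json()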