import json
import os
import re
import string
import traceback

import psycopg2


def build(self):
    conn = PostgresConnector().get_connection()
    cursor = conn.cursor()
    query = 'select id, entities, trend from organized_tweets'
    cursor.execute(query)
    id_column, entities_column, trend_column = 0, 1, 2
    # Stage the rows in a tab-delimited file so they can be bulk-loaded
    # with copy_from instead of row-by-row INSERTs.
    with open('copy_from.txt', 'w') as f:
        for row in cursor:
            tweet_id = row[id_column]
            trend = row[trend_column]
            hashtag_array = row[entities_column]
            json_array = json.loads(hashtag_array)
            hashtag_list = [hashtag['text'] for hashtag in json_array]
            # De-duplicate hashtags within a single tweet.
            hashtag_list_unique = list(set(hashtag_list))
            print 'Writing data to table for the tweet_id ' + str(tweet_id)
            for hashtag in hashtag_list_unique:
                f.write(str(tweet_id) + '\t' + hashtag.encode('utf-8') +
                        '\t' + trend + '\n')
    with open('copy_from.txt') as f:
        cursor.copy_from(f, 'id_entity', columns=('id', 'entity', 'trend'))
    conn.commit()
    os.remove('copy_from.txt')
def validate_data(self, vehicle_type, toll_type, date, price, vehicle_no):
    # Reject the transaction outright if any field is empty.
    if (vehicle_type.strip() == '' or toll_type.strip() == '' or
            price.strip() == '' or date.strip() == '' or
            vehicle_no.strip() == ''):
        raise Exception('input data has nulls')
    try:
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        # Parameterized query: letting psycopg2 do the quoting avoids
        # the SQL-injection risk of interpolating values with %.
        query = """
            INSERT INTO transactions
                (vehicle_type, toll_type, timestamp, price, vehicle_no)
            VALUES (%s, %s, %s, %s, %s)
        """
        cursor.execute(query, (vehicle_type, toll_type, date,
                               float(price), vehicle_no))
        conn.commit()
        return 'Success'
    except psycopg2.IntegrityError:
        raise Exception('Unique key constraint failed')
    except Exception as e:
        print e
        raise Exception('Something else went wrong')
def update_organized_tweets(self):
    tweet_id_dict = {}  # tweet ids already written, to avoid duplicates
    try:
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        cursor.execute('select id from location')
        location_column = 0
        for row_location in cursor:
            # Top 15 trends by tweet volume for this location.
            query = """
                select id, trend from trends
                where trend in (
                    select trend from (
                        select count(*) as c, trend from trends
                        where locationid = %s group by trend
                    ) as t1 order by c desc limit 15
                )
            """
            location_id = row_location[location_column]
            # Separate cursor so the location result set keeps iterating.
            cursor_trends = conn.cursor()
            cursor_trends.execute(query, (location_id,))
            trend_id_column, trend_name_column = 0, 1
            trend_count = 0
            for row in cursor_trends:
                trend_count += 1
                trend_id = row[trend_id_column]
                trend_name = row[trend_name_column]
                print 'Processing for trend ' + str(trend_id) + ' , ' + str(trend_count)
                cursor_tweets = conn.cursor()
                cursor_tweets.execute(
                    'select tweets from tweets where trendId = %s',
                    (str(trend_id),))
                tweets_column = 0
                # Stage one tab-delimited file per trend for bulk loading.
                with open(trend_name + '.txt', 'w') as f:
                    # rows of tweet arrays
                    for tweets_row in cursor_tweets:
                        tweets_json_array = tweets_row[tweets_column]
                        # tweets in a tweets array
                        for json_in in tweets_json_array:
                            id = json_in['id']
                            if tweet_id_dict.get(id) is not None:
                                continue  # already written for another trend
                            tweet_id_dict[id] = 1
                            # geo and place are deliberately flattened to
                            # 'none'; the raw values are not loaded yet.
                            geo = 'none'
                            place = 'none'
                            retweeted = json_in['retweeted']
                            in_reply_to_screen_name = ('none'
                                if json_in['in_reply_to_screen_name'] is None
                                else json_in['in_reply_to_screen_name'])
                            truncated = ('none'
                                if json_in['truncated'] is None
                                else json_in['truncated'])
                            source = json_in['source']
                            created_at = json_in['created_at']
                            user_id = json_in['user']['id']
                            # Strip non-printable characters, collapse
                            # whitespace and drop backslashes so the text
                            # is safe for the tab-delimited COPY file.
                            text = json_in['text'].strip()
                            text = str(filter(
                                lambda x: x in string.printable, text))
                            text = re.sub(r'\s+', ' ', text)
                            text = text.replace('\\', '')
                            entities = json_in['entities']['hashtags']
                            user_mentions = []  # mentions not loaded yet
                            retweet_count = json_in['retweet_count']
                            favorite_count = json_in['favorite_count']
                            f.write('\t'.join([
                                str(id), str(geo), str(retweeted),
                                str(in_reply_to_screen_name.encode('utf-8')),
                                str(truncated), str(source.encode('utf-8')),
                                str(created_at.encode('utf-8')), str(place),
                                str(user_id), text, str(json.dumps(entities)),
                                str(user_mentions), str(retweet_count),
                                str(favorite_count), str(trend_name),
                                str(location_id)]) + '\n')
                print 'Writing to table'
                with open(trend_name + '.txt') as f:
                    cursor_write = conn.cursor()
                    cursor_write.copy_from(
                        f, 'organized_tweets',
                        columns=('id', 'geo', 'retweeted',
                                 'in_reply_to_screen_name', 'truncated',
                                 'source', 'created_at', 'place', 'user_id',
                                 'text', 'entities', 'user_mentions',
                                 'retweet_count', 'favorite_count', 'trend',
                                 'location_id'))
                    conn.commit()
                os.remove(trend_name + '.txt')
    except Exception:
        print traceback.format_exc()