Example #1
	def get_data(self,locationid):
		entity_trend_dict = {} 
		try:
			conn = PostgresConnector().get_connection()
			cursor = conn.cursor()
			query = """select t1.entity,t2.trend from
						(select id,entity from id_entity ) as t1
						inner join
						(select id,trend from organized_tweets where trend in 
							(select trend from 
							(select count(*) as c,trend from 
								trends where locationid = %s group by trend)as t_in order 
							by c desc limit 15))as t2
						on
						t1.id = t2.id"""
			cursor.execute(query,(locationid,))
			entity_column = 0
			trend_column = 1
			for row in cursor:
				trend = row[trend_column]
				# group each entity under its trend
				entity_trend_dict.setdefault(trend, []).append(row[entity_column])


		except Exception:
			print traceback.format_exc()

		return entity_trend_dict
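Every example on this page calls a PostgresConnector helper that is not shown here. A minimal sketch of what it presumably looks like, assuming psycopg2 underneath; the host, database name, and credentials are placeholders, not values from the original code:

import psycopg2

class PostgresConnector(object):
    # hands out connections to the tweets database;
    # every connection setting below is an assumed placeholder
    def get_connection(self):
        return psycopg2.connect(host='localhost',
                                dbname='tweets',
                                user='postgres',
                                password='postgres')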
Example #2
    def build(self):
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = 'select id,entities,trend from organized_tweets'
        cursor.execute(query)
        id_column = 0
        entities_column = 1
        trend_column = 2

        with open('copy_from.txt','w') as f:
            for row in cursor:
                tweet_id = row[id_column]
                trend = row[trend_column]
                hashtag_array = row[entities_column]
                json_array = json.loads(hashtag_array)
                hashtag_list = [hashtag["text"] for hashtag in json_array]
                hashtag_list_unique = list(set(hashtag_list))
                print 'Writing data to table for the tweet_id ' + tweet_id
                for hashtag in hashtag_list_unique:
                    f.write(tweet_id + '\t' + hashtag.encode('utf-8') + '\t' + trend + '\n')

        with open('copy_from.txt') as f:
            cursor.copy_from(f, 'id_entity', columns=('id', 'entity','trend'))
            conn.commit()

        os.remove('copy_from.txt')
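copy_from streams the staged file into id_entity using tab as its default column separator, which is why the file is written tab-delimited. None of the examples create the table itself; a plausible definition that would accept this copy, with the column types being assumptions:

def create_id_entity_table():
    # column types are guesses based on how the values are written above
    conn = PostgresConnector().get_connection()
    cursor = conn.cursor()
    cursor.execute("""
        create table if not exists id_entity (
            id     text,
            entity text,
            trend  text
        )""")
    conn.commit()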
Example #3
    def build(self):
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = 'select id,entities,trend from organized_tweets'
        cursor.execute(query)
        id_column = 0
        entities_column = 1
        trend_column = 2

        with open('copy_from.txt', 'w') as f:
            for row in cursor:
                tweet_id = row[id_column]
                trend = row[trend_column]
                hashtag_array = row[entities_column]
                json_array = json.loads(hashtag_array)
                hashtag_list = [hashtag["text"] for hashtag in json_array]
                hashtag_list_unique = list(set(hashtag_list))
                print 'Writing data to table for the tweet_id ' + tweet_id
                for hashtag in hashtag_list_unique:
                    f.write(tweet_id + '\t' + hashtag.encode('utf-8') + '\t' +
                            trend + '\n')

        with open('copy_from.txt') as f:
            cursor.copy_from(f, 'id_entity', columns=('id', 'entity', 'trend'))
            conn.commit()

        os.remove('copy_from.txt')
Example #4
	def get_dates_location(self,locationid):
		min_max_date_list = [] 
		try:
			conn = PostgresConnector().get_connection()
			cursor = conn.cursor()
			query = """
			select max(date),min(date) from trends where trend in 
			(select t1.trend as trend from
			(select count(*) as c,trend from trends where 
				locationid = %s group by trend)as t1 order by c desc limit 15)
				and locationid = %s
			"""
			cursor.execute(query,(locationid,locationid))
			min_date_column = 1
			max_date_column = 0
			for row in cursor:
				min_max_date_dict = {}
				min_max_date_dict["min_date"] = str(row[min_date_column])
				min_max_date_dict["max_date"] = str(row[max_date_column])
				min_max_date_list.append(min_max_date_dict)

		 
		except Exception:
			print traceback.format_exc()

		return min_max_date_list
Example #5
    def get_trends(self, location_id, start_date, end_date):
        trends_list = []
        try:
            conn = PostgresConnector().get_connection()
            cursor = conn.cursor()
            query = """
			select c,trend from
			(select count(*) as c,trend from trends where 
				locationid = %s and date between %s and %s 
				and id in(select trendid from tweets)
				group by trend)
			as t1 order by c desc limit 15
			"""
            cursor.execute(query, (location_id, start_date, end_date))
            trend_column = 1
            count_column = 0
            for row in cursor:
                trend_count = {}
                trend_count["trend"] = row[trend_column]
                trend_count["count"] = row[count_column]
                trends_list.append(trend_count)

        except Exception as e:
            print e

        return trends_list
Example #6
    def get_dates_location(self, locationid):
        min_max_date_list = []
        try:
            conn = PostgresConnector().get_connection()
            cursor = conn.cursor()
            query = """
			select max(date),min(date) from trends where trend in 
			(select t1.trend as trend from
			(select count(*) as c,trend from trends where 
				locationid = %s group by trend)as t1 order by c desc limit 15)
				and locationid = %s
			"""
            cursor.execute(query, (locationid, locationid))
            min_date_column = 1
            max_date_column = 0
            for row in cursor:
                min_max_date_dict = {}
                min_max_date_dict["min_date"] = str(row[min_date_column])
                min_max_date_dict["max_date"] = str(row[max_date_column])
                min_max_date_list.append(min_max_date_dict)

        except Exception:
            print traceback.format_exc()

        return min_max_date_list
Example #7
	def get_total_documents(self):
		conn = PostgresConnector().get_connection() 
		cursor = conn.cursor() 
		query = 'select count(distinct(id)) from "IdEntity" '
		cursor.execute(query)
		count_of_distinct_id_column = 0
		total_documents_count = 0
		for row in cursor:
			total_documents_count = row[count_of_distinct_id_column]

		return total_documents_count
Example #8
	def get_sentiments(self):
		conn = PostgresConnector().get_connection()
		cursor = conn.cursor()
		query = """
		select text from organized_tweets limit 10
		"""
		cursor.execute(query)
		for row in cursor:
			text = row[0]
			blob = TextBlob(text,analyzer=NaiveBayesAnalyzer())
			print blob.sentiment
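TextBlob's NaiveBayesAnalyzer is trained on NLTK's movie-reviews corpus, so the corpora have to be downloaded once before this runs; a setup along these lines should be enough (python -m textblob.download_corpora is the packaged alternative):

import nltk

# one-time downloads used by TextBlob's NaiveBayesAnalyzer
nltk.download('movie_reviews')  # training data for the analyzer
nltk.download('punkt')          # tokenizer models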
Example #9
    def get_total_documents(self):
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = 'select count(distinct(id)) from "IdEntity" '
        cursor.execute(query)
        count_of_distinct_id_column = 0
        total_documents_count = 0
        for row in cursor:
            total_documents_count = row[count_of_distinct_id_column]

        return total_documents_count
Example #10
	def get_locations(self):
		conn = PostgresConnector().get_connection()
		cursor = conn.cursor()
		query = 'SELECT id,city from location'
		cursor.execute(query)
		id_column = 0 
		city_column = 1
		locations_list = []
		for row in cursor:
			id_location = {}
			id_location["geoid"] = row[id_column]
			id_location["city"] = row[city_column]
			locations_list.append(id_location)

		return locations_list
Example #11
    def build_tf(self):
        # using group by first get tf score for each entity
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = 'select count(id),entity from "IdEntity" group by entity'
        cursor.execute(query)
        count_of_id_column = 0
        entities_column = 1
        entity_id_dict = {}
        for row in cursor:
            count_of_id = row[count_of_id_column]
            entity = row[entities_column]
            entity_id_dict[entity] = count_of_id

        return entity_id_dict
Example #12
    def get_locations(self):
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = 'SELECT id,city from location'
        cursor.execute(query)
        id_column = 0
        city_column = 1
        locations_list = []
        for row in cursor:
            id_location = {}
            id_location["geoid"] = row[id_column]
            id_location["city"] = row[city_column]
            locations_list.append(id_location)

        return locations_list
Example #13
	def build_tf(self):
		# using group by first get tf score for each entity
		conn = PostgresConnector().get_connection() 
		cursor = conn.cursor() 
		query = 'select count(id),entity from "IdEntity" group by entity'
		cursor.execute(query)
		count_of_id_column = 0
		entities_column = 1
		entity_id_dict = {}
		for row in cursor:
			count_of_id = row[count_of_id_column]
			entity = row[entities_column]
			entity_id_dict[entity] = count_of_id

		return entity_id_dict			
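build_tf returns the number of tweets each entity appears in, and get_total_documents (above) returns the total document count; together these are the ingredients of a standard idf weight. A small sketch of how the two outputs could combine; the exact weighting formula is an assumption, not taken from this code:

import math

def build_idf(entity_count_dict, total_documents):
    # idf(entity) = log(N / df(entity)), with df taken from build_tf
    idf_dict = {}
    for entity, doc_count in entity_count_dict.items():
        idf_dict[entity] = math.log(float(total_documents) / doc_count)
    return idf_dict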
Example #14
    def get_tfidf(self, locationid, trend):
        tfidf_list = []
        try:
            conn = PostgresConnector().get_connection()
            cursor = conn.cursor()
            tfidf_query = """
			select entity,tf_idf_score from 
				(select t4.entity,sum(t4.tf_idf) as tf_idf_score
				from
				(select t1.id,t1.entity,t2.count_id,t3.count_entity,
				(1.0/t3.count_entity)*log((
				select count(*) from organized_tweets 
				where trend = %s 
				and location_id = %s )/t2.count_id) as tf_idf  from
					(select id,entity from id_entity where id in
					(select id from organized_tweets 
					where trend = %s 
					and location_id = %s)) as t1
				inner join
					(select entity,count(id) as count_id from id_entity where id in
					(select id from organized_tweets 
					where trend = %s 
					and location_id = %s)group by entity) as t2
				on
					t1.entity = t2.entity
				inner join
					(select id,count(entity) as count_entity from id_entity 
					where id in(select id from organized_tweets 
					where trend = %s 
					and location_id = %s )group by id) as t3
				on 
					t1.id = t3.id) as t4 group by entity)as t5 order by 
					tf_idf_score desc limit 100;
			"""
            cursor.execute(tfidf_query, (trend, locationid, trend, locationid,
                                         trend, locationid, trend, locationid))
            entity_column = 0
            tfidf_column = 1
            for row in cursor:
                entity_tfidf_score = {}
                entity_tfidf_score["entity"] = row[entity_column]
                entity_tfidf_score["tfidf"] = row[tfidf_column]
                tfidf_list.append(entity_tfidf_score)

            return tfidf_list
        except Exception:
            print traceback.format_exc()
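Reading the query inside out: within each tweet an entity is weighted by 1/count_entity (the tweet's term frequency) times log(N/count_id), where N is the number of tweets for the trend and location and count_id is how many of them contain the entity, and the weights are then summed per entity. The same arithmetic in plain Python, as a reference sketch rather than a drop-in replacement:

import math

def tf_idf_for_tweet(entities_in_tweet, entity_doc_counts, total_tweets):
    # entities_in_tweet: entities of one tweet (count_entity = its length)
    # entity_doc_counts: entity -> number of tweets containing it (count_id)
    scores = {}
    for entity in entities_in_tweet:
        tf = 1.0 / len(entities_in_tweet)
        idf = math.log(float(total_tweets) / entity_doc_counts[entity])
        scores[entity] = tf * idf
    return scores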
Example #15
    def get_tweets(self, trend, entity):
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query_tweets = """

			select text from organized_tweets
			where id in (select id from id_entity where 
			entity = %s) limit 50
		"""
        cursor.execute(query_tweets, (entity, ))
        text_list = []
        for row in cursor:
            text_dict = {}
            text_dict["name"] = row[0]
            text_list.append(text_dict)

        return text_list
Example #16
	def get_sentiments(self):

		conn = PostgresConnector().get_connection()
		cursor = conn.cursor()
		query = """
		select id,text from organized_tweets 
		"""
		cursor.execute(query)
		id_column = 0
		text_column = 1
		with open("sentiments.tsv","w") as f:
			for row in cursor:
				text = row[text_column]
				blob = TextBlob(text,analyzer=NaiveBayesAnalyzer())
				print 'writing for tweet with id ' + str(row[id_column])

				f.write(str(row[id_column])+'\t'+str(blob.sentiment.classification)+'\t'+str(blob.sentiment.p_pos)+'\t'+str(blob.sentiment.p_neg)+'\n')
Example #17
	def get_tfidf(self,locationid,trend):
		tfidf_list = []
		try:
			conn = PostgresConnector().get_connection()
			cursor = conn.cursor()
			tfidf_query = """
			select entity,tf_idf_score from 
				(select t4.entity,sum(t4.tf_idf) as tf_idf_score
				from
				(select t1.id,t1.entity,t2.count_id,t3.count_entity,
				(1.0/t3.count_entity)*log((
				select count(*) from organized_tweets 
				where trend = %s 
				and location_id = %s )/t2.count_id) as tf_idf  from
					(select id,entity from id_entity where id in
					(select id from organized_tweets 
					where trend = %s 
					and location_id = %s)) as t1
				inner join
					(select entity,count(id) as count_id from id_entity where id in
					(select id from organized_tweets 
					where trend = %s 
					and location_id = %s)group by entity) as t2
				on
					t1.entity = t2.entity
				inner join
					(select id,count(entity) as count_entity from id_entity 
					where id in(select id from organized_tweets 
					where trend = %s 
					and location_id = %s )group by id) as t3
				on 
					t1.id = t3.id) as t4 group by entity)as t5 order by 
					tf_idf_score desc;
			"""
			cursor.execute(tfidf_query,(trend,locationid,trend,locationid,trend,locationid,trend,locationid))
			entity_column = 0
			tfidf_column = 1
			for row in cursor:
				entity_tfidf_score = {}
				entity_tfidf_score["entity"] = row[entity_column]
				entity_tfidf_score["tfidf"] = row[tfidf_column]
				tfidf_list.append(entity_tfidf_score)

			return tfidf_list
		except Exception:
			print traceback.format_exc()
Example #18
    def get_sentiments(self):

        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = """
		select id,text from organized_tweets 
		"""
        cursor.execute(query)
        id_column = 0
        text_column = 1
        with open("sentiments.tsv", "w") as f:
            for row in cursor:
                text = row[text_column]
                blob = TextBlob(text, analyzer=NaiveBayesAnalyzer())
                print 'writing for tweet with id ' + str(row[id_column])

                f.write(
                    str(row[id_column]) + '\t' +
                    str(blob.sentiment.classification) + '\t' +
                    str(blob.sentiment.p_pos) + '\t' +
                    str(blob.sentiment.p_neg) + '\n')
Example #19
	def validate_data(self,vehicle_type,toll_type,date,price,vehicle_no):
		if any(field.strip() == '' for field in (vehicle_type,toll_type,price,date,vehicle_no)):
			# figure out what to return here!
			raise Exception('input data has nulls')
		else:
			try:
				conn = PostgresConnector().get_connection()
				cursor = conn.cursor()
				# pass values as parameters so the driver escapes them safely
				query = """ INSERT INTO transactions
					(vehicle_type,toll_type,timestamp,price,vehicle_no)
					values(%s,%s,%s,%s,%s)
					"""
				cursor.execute(query,(vehicle_type,toll_type,date,float(price),vehicle_no))
				conn.commit()
				return 'Success'
			except psycopg2.IntegrityError as e:
				raise Exception(' Unique key constraint failed ')
			except Exception as e:	
				print e
				raise Exception(' Something else went wrong')
Example #20
	def validate_data(self,vehicle_no,time):
		if vehicle_no.strip() == '' and time.strip() == '': 
			# figure out what to return here!
			raise Exception('input data has nulls')
		else:
			try:
				conn = PostgresConnector().get_connection()
				cursor = conn.cursor()
				query = ""
				params = ()
				if vehicle_no.strip() != '' and time.strip() != '':
					query = """ SELECT * FROM transactions where vehicle_no = %s
						and timestamp = %s order by timestamp """
					params = (vehicle_no,time)
				elif vehicle_no.strip() != '':
					query = """ select t2.vehicle_type,t2.toll_type,t1.timestamp,t1.price,t1.vehicle_no from
								(SELECT * FROM transactions where vehicle_no = %s) as t1
								inner join
								(select * from master_data) as t2
								on
								t2.vehicle_type_id = t1.vehicle_type
								and
								t2.toll_type_id = t1.toll_type """
					params = (vehicle_no,)
				else:
					query = """ SELECT * FROM transactions where timestamp = %s
						order by timestamp """
					params = (time,)

				print "selecting data from the table using the query %s" % (query,)
				# parameterized execution avoids quoting bugs and SQL injection
				cursor.execute(query,params)
				search_list = []
				for row in cursor:
					elements = {}
					elements['vehicle_type'] = row[0]
					elements['toll_type'] = row[1]
					elements['time'] = str(row[2])
					elements['price'] = row[3]
					elements['vehicle_no'] = row[4]
					search_list.append(elements)
				return search_list 
			except Exception:	
				print traceback.format_exc()
Example #21
 def build(self):
     conn = PostgresConnector().get_connection()
     cursor = conn.cursor()
     query = 'select id,hashtags from "organizedTweets" '
     cursor.execute(query)
     id_column = 0
     entities_column = 1
     entity_id_dict = {}
     for row in cursor:
         tweet_id = row[id_column]
         hashtag_array = row[entities_column]
         hashtag_list = [hashtag['text'] for hashtag in hashtag_array]
         for entity in hashtag_list:
             # collect the id of every tweet that mentions this hashtag
             entity_id_dict.setdefault(entity, []).append(tweet_id)
     return entity_id_dict
Example #22
	def build(self):
		conn = PostgresConnector().get_connection() 
		cursor = conn.cursor() 
		query = 'select id,hashtags from "organizedTweets" '
		cursor.execute(query)
		id_column = 0
		entities_column = 1
		entity_id_dict = {}
		for row in cursor:
			tweet_id = row[id_column]
			hashtag_array = row[entities_column]
			hashtag_list = [hashtag['text'] for hashtag in hashtag_array]
			for entity in hashtag_list:
				# collect the id of every tweet that mentions this hashtag
				entity_id_dict.setdefault(entity, []).append(tweet_id)
		return entity_id_dict
Example #23
	def get_data(self):
		out_list = []
		try:
			conn = PostgresConnector().get_connection()
			cursor = conn.cursor()
			query = """ SELECT vehicle_type,vehicle_type_id,
						toll_type,toll_type_id,price
						from master_data"""
			cursor.execute(query)
			resultset = cursor.fetchall()
			vehicle_dict = {}
			for row in resultset:
				vehicle_id = row[1]
				vehicle_name = row[0]
				journey_list = []
				if vehicle_id not in vehicle_dict:
					vehicle_dict_out = {}
					vehicle_dict[vehicle_id] = 1
					for row_in in resultset:
						journey_type = {}
						if row_in[1] == vehicle_id:
							journey_type['toll_type'] = row_in[2]
							journey_type['toll_id'] = str(row_in[3])
							journey_type['price'] = str(row_in[4])
							journey_list.append(journey_type)
					vehicle_dict_out['vehicle_type'] = vehicle_name
					vehicle_dict_out['vehicle_type_id'] = vehicle_id
					vehicle_dict_out['journey'] = journey_list
					out_list.append(vehicle_dict_out)

			return out_list 
		except Exception as e:	
			print e

			raise Exception(' Something went wrong while retrieving data')
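The nested pass over resultset groups toll rows under each vehicle type. With the rows sorted on vehicle_type_id, itertools.groupby expresses the same grouping in a single pass; a sketch, assuming the column order of the SELECT above:

from itertools import groupby

def group_master_data(resultset):
    # rows: (vehicle_type, vehicle_type_id, toll_type, toll_type_id, price)
    key = lambda row: row[1]
    out_list = []
    for vehicle_id, rows in groupby(sorted(resultset, key=key), key=key):
        rows = list(rows)
        out_list.append({
            'vehicle_type': rows[0][0],
            'vehicle_type_id': vehicle_id,
            'journey': [{'toll_type': row[2],
                         'toll_id': str(row[3]),
                         'price': str(row[4])} for row in rows],
        })
    return out_list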
Example #24
	def get_trends(self,location_id,start_date,end_date):
		trends_list = []	
		try:
			conn = PostgresConnector().get_connection()
			cursor = conn.cursor()
			query = """
			select c,trend from
			(select count(*) as c,trend from trends where 
				locationid = %s and date between %s and %s
				group by trend)as t1 order by c desc limit 15
			"""
			cursor.execute(query,(location_id,start_date,end_date))
			trend_column = 1
			count_column = 0
			for row in cursor:
				trend_count = {}
				trend_count["trend"] = row[trend_column]
				trend_count["count"] = row[count_column]
				trends_list.append(trend_count)

		except Exception as e:
			print e 

		return trends_list
Example #25
    def update_organized_tweets(self):
        tweet_id_dict = {}

        try:
            conn = PostgresConnector().get_connection()
            cursor = conn.cursor()
            query_location = 'select id from location'
            cursor.execute(query_location)
            location_column = 0

            for row_location in cursor:

                query = """
				select id,trend from trends 
				where trend in(select trend from (select count(*) as c,trend from 
					trends where locationid = %s group by trend)as t1 order 
					by c desc limit 80)
						"""
                # use a separate cursor so the outer location cursor is not shadowed
                cursor_trends = conn.cursor()
                location_id = row_location[location_column]
                cursor_trends.execute(query, (location_id, ))
                trend_id_column = 0
                trend_name_column = 1
                trend_count = 0

                for row in cursor_trends:
                    trend_count = trend_count + 1
                    trend_id = row[trend_id_column]
                    trend_name = row[trend_name_column]
                    print 'Processing for trend ' + str(trend_id) + ' , ' + str(
                        trend_count)
                    # parameterized query instead of string concatenation
                    query_tweets = 'select tweets from tweets where trendId = %s'
                    cursor_tweets = conn.cursor()
                    cursor_tweets.execute(query_tweets, (trend_id, ))
                    tweets_column = 0

                    with open(trend_name + '.txt', 'w') as f:

                        # rows of tweets array
                        for tweets_row in cursor_tweets:
                            tweets_json_array = tweets_row[tweets_column]

                            # tweets in a tweets array
                            for json_in in tweets_json_array:

                                id = json_in['id']
                                tweet_id_exists = tweet_id_dict.get(id)

                                if tweet_id_exists is None:
                                    #print jsonIn
                                    tweet_id_dict[id] = 1
                                    geo = 'none' if json_in[
                                        'geo'] is None else 'none'  #json['geo']
                                    retweeted = json_in['retweeted']
                                    in_reply_to_screen_name = 'none' if json_in[
                                        'in_reply_to_screen_name'] is None else json_in[
                                            'in_reply_to_screen_name']
                                    truncated = 'none' if json_in[
                                        'truncated'] is None else json_in[
                                            'truncated']
                                    source = json_in['source']
                                    created_at = json_in['created_at']
                                    place = 'none' if json_in[
                                        'place'] is None else 'none'  #json['place']
                                    user_id = json_in['user']['id']
                                    text = json_in['text'].strip()
                                    #text = " ".join(str(text).split())
                                    text = str(
                                        filter(lambda x: x in string.printable,
                                               text))
                                    #text = text.encode('utf-16')
                                    text = re.sub('\s+', ' ', text)
                                    text = text.replace('\\', '')
                                    entities = json_in['entities']['hashtags']
                                    # user mentions are dropped before writing
                                    user_mentions = []
                                    retweet_count = json_in['retweet_count']
                                    favorite_count = json_in['favorite_count']

                                    # if len(entities) > 0:
                                    # 	for entity in entities:
                                    # 		for k,v in entity.items():
                                    # 			if k in 'text':
                                    # 				entity_list = {}
                                    # 				new_v = entity[k]
                                    # 				new_v = str(new_v.encode('utf-8'))
                                    # 				new_v = filter(lambda x: x in string.printable,new_v)
                                    # 				#print id,check,new_v,len(new_v)
                                    # 				if len(new_v) > 0:
                                    # 					entity[k] = new_v
                                    # 				else:
                                    # 					entity[k] = ''

                                    #print id,geo,retweeted ,in_reply_to_screen_name ,truncated ,source ,created_at ,place ,user_id ,text ,entities ,user_mentions,retweet_count,favorite_count
                                    f.write(
                                        str(id) + '\t' + str(geo) + '\t' +
                                        str(retweeted) + '\t' + str(
                                            in_reply_to_screen_name.encode(
                                                'utf-8')) + '\t' +
                                        str(truncated) + '\t' +
                                        str(source.encode('utf-8')) + '\t' +
                                        str(created_at.encode('utf-8')) +
                                        '\t' + str(place) + '\t' +
                                        str(user_id) + '\t' + text + '\t' +
                                        str(json.dumps(entities)) + '\t' +
                                        str(user_mentions) + '\t' +
                                        str(retweet_count) + '\t' +
                                        str(favorite_count) + '\t' +
                                        str(trend_name) + '\t' +
                                        str(location_id) + '\n')

                                else:
                                    continue

                                # array of tweets json ends here
                                #break

                            # total number of tweets rows for a given trend ends here
                            #break

                    print 'Writing to table'

                    with open(trend_name + '.txt') as f:
                        cursor_write = conn.cursor()
                        cursor_write.copy_from(
                            f,
                            'organized_tweets',
                            columns=('id', 'geo', 'retweeted',
                                     'in_reply_to_screen_name', 'truncated',
                                     'source', 'created_at', 'place',
                                     'user_id', 'text', 'entities',
                                     'user_mentions', 'retweet_count',
                                     'favorite_count', 'trend', 'location_id'))

                    conn.commit()
                    os.remove(trend_name + '.txt')

                    # all trends finish here
                    #break

        except Exception:
            print traceback.format_exc()
Example #26
	def get_matrix(self,locationid):

		try:
			conn = PostgresConnector().get_connection()
			cursor = conn.cursor()
			query = """
			WITH 

			TREND_COUNT_TT AS
			(SELECT TREND,COUNT(*) AS TREND_COUNT 
				FROM TRENDS 
				WHERE LOCATIONID = %s GROUP BY TREND),


			TOP_TRENDS_TT AS
			(SELECT TREND FROM TREND_COUNT_TT ORDER BY TREND_COUNT DESC LIMIT 15),

			IDS_FOR_TOP_TRENDS_TT AS 
			(SELECT ID FROM ORGANIZED_TWEETS 
			WHERE TREND IN (SELECT TREND FROM TOP_TRENDS_TT) AND LOCATION_ID = %s),
			--SELECT * FROM IDS_FOR_TOP_TRENDS_TT

			ID_ENTITY_TOP_TRENDS_TT AS
			(SELECT TREND,ID,ENTITY 
			FROM ID_ENTITY
			WHERE ID IN (SELECT ID FROM IDS_FOR_TOP_TRENDS_TT)),


			TREND_ENTITY_TF_IDF_SUM_TT AS
			(SELECT TREND,ENTITY,COUNT(ID) TF_IDF_SUM 
			FROM ID_ENTITY
			WHERE ID IN (SELECT ID FROM IDS_FOR_TOP_TRENDS_TT)
			GROUP BY TREND,ENTITY),
			--SELECT * FROM TREND_ENTITY_TF_IDF_SUM_TT

			TREND_TF_IDF_SQ_SUM_TT AS
			(SELECT TREND, 
			SUM(TF_IDF_SUM*TF_IDF_SUM) AS TF_IDF_SQ_SUM
			FROM TREND_ENTITY_TF_IDF_SUM_TT
			GROUP BY TREND),

			COSINE_DIST_NUM_TT AS 
			(SELECT T1.TREND AS TREND1,T2.TREND AS TREND2,
			SUM(T1.TF_IDF_SUM*T2.TF_IDF_SUM) AS COSINE_NUM
			FROM TREND_ENTITY_TF_IDF_SUM_TT AS T1
			INNER JOIN TREND_ENTITY_TF_IDF_SUM_TT AS T2 ON T2.TREND>T1.TREND AND T1.ENTITY = T2.ENTITY
			GROUP BY T1.TREND,T2.TREND),

			COSINE_DIST_TT AS
			(SELECT TREND1,TREND2,
			COSINE_NUM/(SQRT(T2.TF_IDF_SQ_SUM)*SQRT(T3.TF_IDF_SQ_SUM)) AS COSINE_DIST
			FROM COSINE_DIST_NUM_TT AS T1
			INNER JOIN TREND_TF_IDF_SQ_SUM_TT AS T2 ON T1.TREND1=T2.TREND
			INNER JOIN TREND_TF_IDF_SQ_SUM_TT AS T3 ON T1.TREND2=T3.TREND)

			SELECT * FROM COSINE_DIST_TT ORDER BY TREND1,TREND2;
			"""

			cursor.execute(query,(locationid,locationid))
			trend1_column = 0
			trend2_column = 1
			distance_value_column = 2
			trends_list = [] 
			row_counter = 0
			max_columns = 15
			column_iteration = 1
			distance_matrix = [[0 for x in xrange(max_columns)] for x in xrange(max_columns)]

			for row in cursor:
				trend1 = row[trend1_column]
				trend2 = row[trend2_column]
				if trend1 not in trends_list:
					trends_list.append(trend1)	
				if trend2 not in trends_list:
					trends_list.append(trend2)

				# this is to check 0,0 1,1 and so on
				distance_matrix[row_counter][row_counter] = 0
				# this populates 1,2 and 2,1 and so on,
				# which avoids two loops

				distance_matrix[row_counter][column_iteration] = row[distance_value_column]	
				distance_matrix[column_iteration][row_counter] = row[distance_value_column]	

				column_iteration = column_iteration + 1
				if column_iteration == max_columns:
					row_counter = row_counter + 1
					column_iteration = row_counter + 1 
						

			return distance_matrix,trends_list

		except Exception:
			print traceback.format_exc()
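The chain of CTEs computes pairwise cosine similarity between the top trends over their entity-count vectors: COSINE_NUM is the dot product of two trend vectors and the denominator is the product of their norms. The same quantity in plain Python, for reference:

import math

def cosine_similarity(vec_a, vec_b):
    # vec_a, vec_b: dicts mapping entity -> count, one per trend
    # (the rows of TREND_ENTITY_TF_IDF_SUM_TT)
    num = sum(vec_a[e] * vec_b[e] for e in vec_a if e in vec_b)
    norm_a = math.sqrt(sum(v * v for v in vec_a.values()))
    norm_b = math.sqrt(sum(v * v for v in vec_b.values()))
    return num / (norm_a * norm_b)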
Example #27
	def update_organized_tweets(self):
		tweet_id_dict = {} 

		try:
			conn = PostgresConnector().get_connection()
			cursor = conn.cursor()
			query_location = 'select id from location'	
			cursor.execute(query_location)
			location_column = 0

			for row_location in cursor:

				query = """
				select id,trend from trends 
				where trend in(select trend from (select count(*) as c,trend from 
					trends where locationid = %s group by trend)as t1 order 
					by c desc limit 15)
						"""
				# use a separate cursor so the outer location cursor is not shadowed
				cursor_trends = conn.cursor()
				location_id = row_location[location_column]
				cursor_trends.execute(query,(location_id,))
				trend_id_column = 0
				trend_name_column = 1
				trend_count = 0

				for row in cursor_trends:
					trend_count = trend_count + 1
					trend_id = row[trend_id_column]
					trend_name = row[trend_name_column]
					print 'Processing for trend ' + str(trend_id) + ' , ' + str(trend_count)
					# parameterized query instead of string concatenation
					query_tweets = 'select tweets from tweets where trendId = %s'
					cursor_tweets = conn.cursor()
					cursor_tweets.execute(query_tweets,(trend_id,))
					tweets_column = 0

					with open(trend_name+'.txt','w') as f:

						# rows of tweets array
						for tweets_row in cursor_tweets:
							tweets_json_array = tweets_row[tweets_column]

							# tweets in a tweets array
							for json_in in tweets_json_array:

								id = json_in['id']
								tweet_id_exists = tweet_id_dict.get(id)

								if tweet_id_exists is None:
									#print jsonIn
									tweet_id_dict[id] = 1
									geo =  'none' if json_in['geo'] is None else 'none' #json['geo']
									retweeted = json_in['retweeted']
									in_reply_to_screen_name = 'none' if json_in['in_reply_to_screen_name'] is None else json_in['in_reply_to_screen_name']
									truncated = 'none' if json_in['truncated'] is None else json_in['truncated']
									source = json_in['source']
									created_at = json_in['created_at']
									place = 'none' if json_in['place'] is None else 'none'#json['place']
									user_id = json_in['user']['id']
									text = json_in['text'].strip()
									#text = " ".join(str(text).split())
									text = str(filter(lambda x: x in string.printable,text))
									#text = text.encode('utf-16')
									text = re.sub('\s+',' ',text)
									text = text.replace('\\','')
									entities = json_in['entities']['hashtags']
									# user mentions are dropped before writing
									user_mentions = []
									retweet_count = json_in['retweet_count']
									favorite_count = json_in['favorite_count']

									# if len(entities) > 0:
									# 	for entity in entities:
									# 		for k,v in entity.items():
									# 			if k in 'text':
									# 				entity_list = {}
									# 				new_v = entity[k]
									# 				new_v = str(new_v.encode('utf-8'))
									# 				new_v = filter(lambda x: x in string.printable,new_v)
									# 				#print id,check,new_v,len(new_v)
									# 				if len(new_v) > 0: 
									# 					entity[k] = new_v
									# 				else:
									# 					entity[k] = ''



									#print id,geo,retweeted ,in_reply_to_screen_name ,truncated ,source ,created_at ,place ,user_id ,text ,entities ,user_mentions,retweet_count,favorite_count
									f.write(str(id)+'\t'+str(geo)+'\t'+str(retweeted)+'\t'+str(in_reply_to_screen_name.encode('utf-8'))+'\t'+str(truncated)+'\t'+str(source.encode('utf-8'))+'\t'+str(created_at.encode('utf-8'))+'\t'+str(place)+'\t'+str(user_id)+'\t'+text+'\t'+str(json.dumps(entities))+'\t'+str(user_mentions)+'\t'+str(retweet_count)+'\t'+str(favorite_count)+'\t'+str(trend_name)+'\t'+str(location_id)+'\n')

								else:
									continue

								# array of tweets json ends here
								#break

							# total number of tweets rows for a given trend ends here
							#break

					print 'Writing to table'

					with open(trend_name+'.txt') as f:
						cursor_write = conn.cursor()
						cursor_write.copy_from(f,'organized_tweets',columns=('id','geo','retweeted','in_reply_to_screen_name','truncated','source','created_at','place','user_id','text','entities','user_mentions','retweet_count','favorite_count','trend','location_id'))

					conn.commit()
					os.remove(trend_name+'.txt')

					# all trends finish here
					#break



		except Exception:
			print traceback.format_exc()