Example #1
    def get_dates_location(self, locationid):
        min_max_date_list = []
        try:
            conn = PostgresConnector().get_connection()
            cursor = conn.cursor()
            query = """
			select max(date),min(date) from trends where trend in 
			(select t1.trend as trend from
			(select count(*) as c,trend from trends where 
				locationid = %s group by trend)as t1 order by c desc limit 15)
				and locationid = %s
			"""
            cursor.execute(query, (locationid, locationid))
            min_date_column = 1
            max_date_column = 0
            for row in cursor:
                min_max_date_dict = {}
                min_max_date_dict["min_date"] = str(row[min_date_column])
                min_max_date_dict["max_date"] = str(row[max_date_column])
                min_max_date_list.append(min_max_date_dict)

        except Exception:
            print(traceback.format_exc())

        return min_max_date_list
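All of the examples on this page assume a PostgresConnector helper that exposes get_connection(). The original class is not shown; a minimal sketch of such a wrapper around psycopg2 might look like the following (connection parameters are placeholders, not values from the source, and the context-manager/.cursor usage seen in later examples is omitted):

import psycopg2


class PostgresConnector(object):
    """Hypothetical sketch of the connector the examples rely on."""

    def __init__(self, host="127.0.0.1", port=5432, database="postgres",
                 user="postgres", password="postgres"):
        # store the parameters; a real implementation might also pool connections
        self._params = dict(host=host, port=port, dbname=database,
                            user=user, password=password)

    def get_connection(self):
        # psycopg2.connect returns a new DB-API connection object
        return psycopg2.connect(**self._params)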
Example #2
	def get_dates_location(self,locationid):
		min_max_date_list = [] 
		try:
			conn = PostgresConnector().get_connection()
			cursor = conn.cursor()
			query = """
			select max(date),min(date) from trends where trend in 
			(select t1.trend as trend from
			(select count(*) as c,trend from trends where 
				locationid = %s group by trend)as t1 order by c desc limit 15)
				and locationid = %s
			"""
			cursor.execute(query,(locationid,locationid))
			min_date_column = 1
			max_date_column = 0
			for row in cursor:
				min_max_date_dict = {}
				min_max_date_dict["min_date"] = str(row[min_date_column])
				min_max_date_dict["max_date"] = str(row[max_date_column])
				min_max_date_list.append(min_max_date_dict)

		 
		except Exception:
			print(traceback.format_exc())

		return min_max_date_list
Example #3
    def build(self):
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = 'select id,entities,trend from organized_tweets'
        cursor.execute(query)
        id_column = 0
        entities_column = 1
        trend_column = 2

        with open('copy_from.txt', 'w') as f:
            for row in cursor:
                tweet_id = row[id_column]
                trend = row[trend_column]
                hashtag_array = row[entities_column]
                json_array = json.loads(hashtag_array)
                hashtag_list = [hashtag["text"] for hashtag in json_array]
                hashtag_list_unique = list(set(hashtag_list))
                print 'Writing data to table for the tweet_id ' + tweet_id
                for hashtag in hashtag_list_unique:
                    f.write(tweet_id + '\t' + hashtag.encode('utf-8') + '\t' +
                            trend + '\n')

        with open('copy_from.txt') as f:
            cursor.copy_from(f, 'id_entity', columns=('id', 'entity', 'trend'))
            conn.commit()

        os.remove('copy_from.txt')
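The temporary copy_from.txt file above is only a staging buffer for cursor.copy_from. The same bulk load can be driven from an in-memory buffer instead; a sketch (assuming the rows are already available as text tuples) is:

import io


def bulk_load_entities(conn, rows):
    # rows: iterable of (tweet_id, hashtag, trend) text tuples
    buf = io.StringIO()
    for tweet_id, hashtag, trend in rows:
        buf.write(u"%s\t%s\t%s\n" % (tweet_id, hashtag, trend))
    buf.seek(0)
    cursor = conn.cursor()
    # copy_from defaults to a tab separator, matching the format written above
    cursor.copy_from(buf, 'id_entity', columns=('id', 'entity', 'trend'))
    conn.commit()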
Example #4
    def get_trends(self, location_id, start_date, end_date):
        trends_list = []
        try:
            conn = PostgresConnector().get_connection()
            cursor = conn.cursor()
            query = """
			select c,trend from
			(select count(*) as c,trend from trends where 
				locationid = %s and date between %s and %s 
				and id in(select trendid from tweets)
				group by trend)
			as t1 order by c desc limit 15
			"""
            cursor.execute(query, (location_id, start_date, end_date))
            trend_column = 1
            count_column = 0
            for row in cursor:
                trend_count = {}
                trend_count["trend"] = row[trend_column]
                trend_count["count"] = row[count_column]
                trends_list.append(trend_count)

        except Exception as e:
            print e

        return trends_list
Example #5
	def get_data(self,locationid):
		entity_trend_dict = {} 
		try:
			conn = PostgresConnector().get_connection()
			cursor = conn.cursor()
			query = """select t1.entity,t2.trend from
						(select id,entity from id_entity ) as t1
						inner join
						(select id,trend from organized_tweets where trend in 
							(select trend from 
							(select count(*) as c,trend from 
								trends where locationid = %s group by trend)as t_in order 
							by c desc limit 15))as t2
						on
						t1.id = t2.id"""
			cursor.execute(query,(locationid,))
			entity_column = 0
			trend_column = 1
			for row in cursor:
				trend = row[trend_column]
				if trend in entity_trend_dict:
					entity_trend_dict[trend].append(row[entity_column])
				else:
					entity_trend_dict[trend] = [row[entity_column]]


		except Exception:
			print traceback.format_exc()

		return entity_trend_dict
    def build(self):
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = 'select id,entities,trend from organized_tweets'
        cursor.execute(query)
        id_column = 0
        entities_column = 1
        trend_column = 2

        with open('copy_from.txt','w') as f:
            for row in cursor:
                tweet_id = row[id_column]
                trend = row[trend_column]
                hashtag_array = row[entities_column]
                json_array = json.loads(hashtag_array)
                hashtag_list = [hashtag["text"] for hashtag in json_array]
                hashtag_list_unique = list(set(hashtag_list))
                print 'Writing data to table for the tweet_id ' + tweet_id
                for hashtag in hashtag_list_unique:
                    f.write(tweet_id + '\t' + hashtag.encode('utf-8') + '\t' + trend + '\n')

        with open('copy_from.txt') as f:
            cursor.copy_from(f, 'id_entity', columns=('id', 'entity','trend'))
            conn.commit()

        os.remove('copy_from.txt')
    def get_total_documents(self):
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = 'select count(distinct(id)) from "IdEntity" '
        cursor.execute(query)
        count_of_distinct_id_column = 0
        total_documents_count = 0
        for row in cursor:
            total_documents_count = row[count_of_distinct_id_column]

        return total_documents_count
	def get_sentiments(self):
		conn = PostgresConnector().get_connection()
		cursor = conn.cursor()
		query = """
		select text from organized_tweets limit 10
		"""
		cursor.execute(query)
		for row in cursor:
			text = row[0]
			blob = TextBlob(text,analyzer=NaiveBayesAnalyzer())
			print blob.sentiment
	def get_total_documents(self):
		conn = PostgresConnector().get_connection() 
		cursor = conn.cursor() 
		query = 'select count(distinct(id)) from "IdEntity" '
		cursor.execute(query)
		count_of_distinct_id_column = 0
		total_documents_count = 0
		for row in cursor:
			total_documents_count = row[count_of_distinct_id_column]

		return total_documents_count
Example #10
    def get_locations(self):
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = 'SELECT id,city from location'
        cursor.execute(query)
        id_column = 0
        city_column = 1
        locations_list = []
        for row in cursor:
            id_location = {}
            id_location["geoid"] = row[id_column]
            id_location["city"] = row[city_column]
            locations_list.append(id_location)

        return locations_list
Example #11
    def build_tf(self):
        # using GROUP BY, count how many tweets (documents) contain each entity
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = 'select count(id),entity from "IdEntity" group by entity'
        cursor.execute(query)
        count_of_id_column = 0
        entities_column = 1
        entity_id_dict = {}
        for row in cursor:
            count_of_id = row[count_of_id_column]
            entity = row[entities_column]
            entity_id_dict[entity] = count_of_id

        return entity_id_dict
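build_tf() effectively returns, for each entity, the number of tweets it occurs in. Combined with get_total_documents() this yields an IDF score per entity; a hypothetical helper (not part of the original code) could be:

import math


def build_idf(total_documents, entity_document_counts):
    # entity_document_counts: {entity: number of tweets containing it}, as from build_tf()
    # total_documents: distinct tweet count, as from get_total_documents()
    return {entity: math.log(total_documents / float(count))
            for entity, count in entity_document_counts.items()}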
Example #12
	def get_locations(self):
		conn = PostgresConnector().get_connection()
		cursor = conn.cursor()
		query = 'SELECT id,city from location'
		cursor.execute(query)
		id_column = 0 
		city_column = 1
		locations_list = []
		for row in cursor:
			id_location = {}
			id_location["geoid"] = row[id_column]
			id_location["city"] = row[city_column]
			locations_list.append(id_location)

		return locations_list
Example #13
	def build_tf(self):
		# using GROUP BY, count how many tweets (documents) contain each entity
		conn = PostgresConnector().get_connection() 
		cursor = conn.cursor() 
		query = 'select count(id),entity from "IdEntity" group by entity'
		cursor.execute(query)
		count_of_id_column = 0
		entities_column = 1
		entity_id_dict = {}
		for row in cursor:
			count_of_id = row[count_of_id_column]
			entity = row[entities_column]
			entity_id_dict[entity] = count_of_id

		return entity_id_dict			
def train_model():
    np.random.seed(123)
    with open(os.path.join("..", "query_pull_1000v3.pkl"), 'rb') as f:
        query_pull = pickle.load(f)

        connector = PostgresConnector()
        env = DatabaseIndexesEnv(n=COLUMNS_AMOUNT,
                                 table_name=table_name,
                                 query_pull=query_pull,
                                 batch_size=BATCH_SIZE,
                                 connector=connector,
                                 k=3,
                                 max_episodes=1000)

        # Get the environment and extract the number of actions.
        env.seed(123)

        # Next, we build a very simple model.
        model = build_model()
        print(model.summary())

        # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
        # even the metrics!
        dqn = initialize_agent(model)

        # Okay, now it's time to learn something! Visualization is disabled here because it
        # slows down training quite a lot. You can always safely abort the training prematurely
        # using Ctrl + C.
        dqn.fit(env, nb_steps=50000, visualize=False, verbose=0, callbacks=[CustomEpisodeLogger()])

        # After training is done, we save the final weights.
        dqn.save_weights('dqn_{}_weights_6_4_2_1_2000_episodes_estimated.h5f'.format(ENV_NAME), overwrite=True)

        # Finally, evaluate our algorithm for 5 episodes.
        dqn.test(env, nb_episodes=5, visualize=False)
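build_model() and initialize_agent() are not included in the snippet above. A minimal sketch following the usual keras-rl DQN recipe might look like the following; the layer sizes, policy, and hyperparameters are assumptions, and COLUMNS_AMOUNT is the same constant already used above:

from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy


def build_model():
    # one Q-value output per candidate index column
    model = Sequential()
    model.add(Flatten(input_shape=(1, COLUMNS_AMOUNT)))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(COLUMNS_AMOUNT, activation='linear'))
    return model


def initialize_agent(model):
    dqn = DQNAgent(model=model, nb_actions=COLUMNS_AMOUNT,
                   memory=SequentialMemory(limit=50000, window_length=1),
                   policy=EpsGreedyQPolicy(), nb_steps_warmup=100,
                   target_model_update=1e-2)
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])
    return dqn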
Example #15
 def __init__(self,
              prefix="entity",
              window_size=2,
              entities_only=True,
              port=5436,
              log_file=os.path.join(os.path.dirname(__file__), "logs/SchemaCreator.log"),
              log_level=logging.INFO,
              log_verbose=True
              ):
     """
     Set up.
     :param prefix: (str) Prefix to the table names.
     :param port: (int) Used to connect to the Postgres tables.
     :param log_file: (os.path) Path to the file containing the logs.
     :param log_level: (logging.LEVEL) Specifies the level to be logged.
     :param log_verbose: (boolean) Specifies whether or not to look to stdout as well.
     """
     self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
     self.window_size = window_size
     self.prefix = prefix + "_" + str(self.window_size)
     self.entities_only = entities_only
     self.names = self.get_names(self.prefix)
     self.port = port
     self.pc = PostgresConnector(port=port)
     self.logger.info("Successfully registered SchemaGenerator.")
Example #16
def create_table(port):
    pc = PostgresConnector(port=port)

    with pc as opc:
        # add sentence index column but in separate table
        print("Starting with sentences...")
        opc.cursor.execute("CREATE TABLE sentences_neo4j AS TABLE sentences;")
        opc.cursor.execute("ALTER TABLE sentences_neo4j ADD COLUMN id int;")
        opc.cursor.execute("""WITH numbered (sid, document_id, sentence_id) AS
                            (select row_number() OVER() sid, * from sentences_neo4j)
                            UPDATE sentences_neo4j
                            SET id = numbered.sid
                            FROM numbered
                            WHERE sentences_neo4j.document_id = numbered.document_id AND
                            sentences_neo4j.sentence_id = numbered.sentence_id;""")

        # add term_occurrence index
        print("Starting with term occurrences...")
        opc.cursor.execute("CREATE TABLE term_occurrence_neo4j AS TABLE term_occurrence;")
        opc.cursor.execute("ALTER TABLE term_occurrence_neo4j ADD COLUMN id int;")
        opc.cursor.execute("""WITH numbered (sid, document_id, sentence_id, term_id) AS
                            (select row_number() OVER() sid, * from term_occurrence)
                            UPDATE term_occurrence_neo4j
                            SET  id = numbered.sid                                                
                            FROM numbered
                            WHERE term_occurrence_neo4j.document_id = numbered.document_id AND
                            term_occurrence_neo4j.sentence_id = numbered.sentence_id AND
                            term_occurrence_neo4j.term_id = numbered.term_id;""")
Example #17
    def get_tfidf(self, locationid, trend):
        tfidf_list = []
        try:
            conn = PostgresConnector().get_connection()
            cursor = conn.cursor()
            tfidf_query = """
			select entity,tf_idf_score from 
				(select t4.entity,sum(t4.tf_idf) as tf_idf_score
				from
				(select t1.id,t1.entity,t2.count_id,t3.count_entity,
				(1.0/t3.count_entity)*log((
				select count(*) from organized_tweets 
				where trend = %s 
				and location_id = %s )/t2.count_id) as tf_idf  from
					(select id,entity from id_entity where id in
					(select id from organized_tweets 
					where trend = %s 
					and location_id = %s)) as t1
				inner join
					(select entity,count(id) as count_id from id_entity where id in
					(select id from organized_tweets 
					where trend = %s 
					and location_id = %s)group by entity) as t2
				on
					t1.entity = t2.entity
				inner join
					(select id,count(entity) as count_entity from id_entity 
					where id in(select id from organized_tweets 
					where trend = %s 
					and location_id = %s )group by id) as t3
				on 
					t1.id = t3.id) as t4 group by entity)as t5 order by 
					tf_idf_score desc limit 100;
			"""
            cursor.execute(tfidf_query, (trend, locationid, trend, locationid,
                                         trend, locationid, trend, locationid))
            entity_column = 0
            tfidf_column = 1
            for row in cursor:
                entity_tfidf_score = {}
                entity_tfidf_score["entity"] = row[entity_column]
                entity_tfidf_score["tfidf"] = row[tfidf_column]
                tfidf_list.append(entity_tfidf_score)

            return tfidf_list
        except Exception:
            print traceback.format_exc()
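The SQL above scores each (tweet, entity) pair as (1 / number of entities in that tweet) * log(total tweets for the trend / tweets containing the entity), then sums the scores per entity and sorts them. The per-occurrence score mirrors this hypothetical Python helper:

import math


def tf_idf(entities_in_tweet, tweets_with_entity, total_trend_tweets):
    # tf: a single occurrence weighted by the number of entities in that tweet
    tf = 1.0 / entities_in_tweet
    # idf: entities that are rarer within the trend's tweets get a higher weight
    idf = math.log(float(total_trend_tweets) / tweets_with_entity)
    return tf * idf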
Example #18
	def get_tfidf(self,locationid,trend):
		tfidf_list = []
		try:
			conn = PostgresConnector().get_connection()
			cursor = conn.cursor()
			tfidf_query = """
			select entity,tf_idf_score from 
				(select t4.entity,sum(t4.tf_idf) as tf_idf_score
				from
				(select t1.id,t1.entity,t2.count_id,t3.count_entity,
				(1.0/t3.count_entity)*log((
				select count(*) from organized_tweets 
				where trend = %s 
				and location_id = %s )/t2.count_id) as tf_idf  from
					(select id,entity from id_entity where id in
					(select id from organized_tweets 
					where trend = %s 
					and location_id = %s)) as t1
				inner join
					(select entity,count(id) as count_id from id_entity where id in
					(select id from organized_tweets 
					where trend = %s 
					and location_id = %s)group by entity) as t2
				on
					t1.entity = t2.entity
				inner join
					(select id,count(entity) as count_entity from id_entity 
					where id in(select id from organized_tweets 
					where trend = %s 
					and location_id = %s )group by id) as t3
				on 
					t1.id = t3.id) as t4 group by entity)as t5 order by 
					tf_idf_score desc;
			"""
			cursor.execute(tfidf_query,(trend,locationid,trend,locationid,trend,locationid,trend,locationid))
			entity_column = 0
			tfidf_column = 1
			for row in cursor:
				entity_tfidf_score = {}
				entity_tfidf_score["entity"] = row[entity_column]
				entity_tfidf_score["tfidf"] = row[tfidf_column]
				tfidf_list.append(entity_tfidf_score)

			return tfidf_list
		except Exception :
			print traceback.format_exc()
Example #19
    def get_tweets(self, trend, entity):
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query_tweets = """

			select text from organized_tweets
			where id in (select id from id_entity where 
			entity = %s) limit 50
		"""
        cursor.execute(query_tweets, (entity, ))
        text_list = []
        for row in cursor:
            text_dict = {}
            text_dict["name"] = row[0]
            text_list.append(text_dict)

        return text_list
	def get_sentiments(self):

		conn = PostgresConnector().get_connection()
		cursor = conn.cursor()
		query = """
		select id,text from organized_tweets 
		"""
		cursor.execute(query)
		id_column = 0
		text_column = 1
		with open("sentiments.tsv","w") as f:
			for row in cursor:
				text = row[text_column]
				blob = TextBlob(text,analyzer=NaiveBayesAnalyzer())
				print 'writing for tweet with id ' +str(row[id_column])

				f.write(str(row[id_column])+'\t'+str(blob.sentiment.classification)+'\t'+str(blob.sentiment.p_pos)+'\t'+str(blob.sentiment.p_neg)+'\n')
Example #21
    def get_sentiments(self):

        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = """
		select id,text from organized_tweets 
		"""
        cursor.execute(query)
        id_column = 0
        text_column = 1
        with open("sentiments.tsv", "w") as f:
            for row in cursor:
                text = row[text_column]
                blob = TextBlob(text, analyzer=NaiveBayesAnalyzer())
                print 'writing for tweet with id ' + str(row[id_column])

                f.write(
                    str(row[id_column]) + '\t' +
                    str(blob.sentiment.classification) + '\t' +
                    str(blob.sentiment.p_pos) + '\t' +
                    str(blob.sentiment.p_neg) + '\n')
	def validate_data(self,vehicle_type,toll_type,date,price,vehicle_no):
		if vehicle_type.strip() == '' or toll_type.strip() == '' or price.strip() == '' or date.strip() == '' or vehicle_no.strip() =='':
			# figure out what to return here!
			raise Exception('input data has nulls')
		else:
			try:
				conn = PostgresConnector().get_connection()
				cursor = conn.cursor()
				# parameterized query: psycopg2 quotes and escapes the values safely
				query = """ INSERT INTO transactions
					(vehicle_type,toll_type,timestamp,price,vehicle_no)
					values (%s,%s,%s,%s,%s)
					"""
				cursor.execute(query, (vehicle_type, toll_type, date, float(price), vehicle_no))
				conn.commit()
				return 'Success'
			except psycopg2.IntegrityError as e:
				raise Exception(' Unique key constraint failed ')
			except Exception as e:	
				print e
				raise Exception(' Something else went wrong')
Example #23
	def validate_data(self,vehicle_no,time):
		if vehicle_no.strip() == '' and time.strip() == '': 
			# figure out what to return here!
			raise Exception('input data has nulls')
		else:
			try:
				conn = PostgresConnector().get_connection()
				cursor = conn.cursor()
				query = ""
				if vehicle_no.strip() != '' and time.strip() != '':
					query = """ SELECT * FROM transactions where vehicle_no = \'%s\' 
						and timestamp = \'%s\'  order by timestamp """ % (vehicle_no,time)
				elif vehicle_no.strip() != '' :
					query = """ select t2.vehicle_type,t2.toll_type,t1.timestamp,t1.price,t1.vehicle_no from
								(SELECT * FROM transactions where vehicle_no =\'%s\') as t1
								inner join
								(select * from master_data) as t2
								on
								t2.vehicle_type_id = t1.vehicle_type
								and 
								t2.toll_type_id = t1.toll_type """ % (vehicle_no,)

				else:
					query = """ SELECT * FROM transactions where timestamp = \'%s\' 
						order by timestamp """ % (time,)

				print "selecting data from the table using the query %s" % (query,)
				cursor.execute(query)
				search_list = []
				for row in cursor:
					elements = {}
					elements['vehicle_type'] = row[0]
					elements['toll_type'] = row[1]
					elements['time'] = str(row[2])
					elements['price'] = row[3]
					elements['vehicle_no'] = row[4]
					search_list.append(elements)
				return search_list 
			except Exception:	
				print traceback.format_exc()
Example #24
	def build(self):
		conn = PostgresConnector().get_connection() 
		cursor = conn.cursor() 
		query = 'select id,hashtags from "organizedTweets" '
		cursor.execute(query)
		id_column = 0
		entities_column = 1
		entity_id_dict = {}
		for row in cursor:
			tweet_id = row[id_column]
			hashtag_array = row[entities_column]
			hashtag_list = [hashtag['text'] for hashtag in hashtag_array]
			for entity in hashtag_list:
				if entity in entity_id_dict.keys():
					id_list = entity_id_dict[entity]
					id_list.append(tweet_id)
					entity_id_dict[entity] = id_list
				else:
					id_list = []
					id_list.append(tweet_id)
					entity_id_dict[entity] = id_list
		return entity_id_dict			
Example #25
 def build(self):
     conn = PostgresConnector().get_connection()
     cursor = conn.cursor()
     query = 'select id,hashtags from "organizedTweets" '
     cursor.execute(query)
     id_column = 0
     entities_column = 1
     entity_id_dict = {}
     for row in cursor:
         tweet_id = row[id_column]
         hashtag_array = row[entities_column]
         hashtag_list = [hashtag['text'] for hashtag in hashtag_array]
         for entity in hashtag_list:
             if entity in entity_id_dict.keys():
                 id_list = entity_id_dict[entity]
                 id_list.append(tweet_id)
                 entity_id_dict[entity] = id_list
             else:
                 id_list = []
                 id_list.append(tweet_id)
                 entity_id_dict[entity] = id_list
     return entity_id_dict
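The entity-to-tweet-id inverted index built above can be expressed more compactly with collections.defaultdict; a behaviour-equivalent sketch (assuming the same cursor rows of id and hashtags) is:

from collections import defaultdict


def build_index(cursor):
    # maps each hashtag text to the list of tweet ids it appears in
    entity_id_dict = defaultdict(list)
    for tweet_id, hashtag_array in cursor:
        for hashtag in hashtag_array:
            entity_id_dict[hashtag['text']].append(tweet_id)
    return dict(entity_id_dict)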
Example #26
	def get_data(self):
		out_list = []
		try:
			conn = PostgresConnector().get_connection()
			cursor = conn.cursor()
			query = """ SELECT vehicle_type,vehicle_type_id,
						toll_type,toll_type_id,price
						from master_data"""
			cursor.execute(query)
			resultset = cursor.fetchall()
			out_list = [] 
			vehicle_dict = {}
			for row in resultset:
				row_data = {}
				row_value_dict = {}
				id = row[1]
				vehicle_name = row[0]
				journey_list = []
				if id not in vehicle_dict.keys():
					vehicle_dict_out = {}
					vehicle_dict[id] = 1
					for row_in in resultset:
						journey_type = {}
						if row_in[1] == id:
							journey_type['toll_type'] = row_in[2]
							journey_type['toll_id'] = str(row_in[3])
							journey_type['price'] = str(row_in[4])
							journey_list.append(journey_type)	
					vehicle_dict_out['vehicle_type'] = vehicle_name
					vehicle_dict_out['vehicle_type_id'] = id
					vehicle_dict_out['journey'] = journey_list
					out_list.append(vehicle_dict_out)

			return out_list 
		except Exception as e:	
			print e

			raise Exception(' Something went wrong while retrieving data')
Example #27
	def get_trends(self,location_id,start_date,end_date):
		trends_list = []	
		try:
			conn = PostgresConnector().get_connection()
			cursor = conn.cursor()
			query = """
			select c,trend from
			(select count(*) as c,trend from trends where 
				locationid = %s and date between %s and %s group by trend)as t1 order by c desc limit 15
			"""
			cursor.execute(query,(location_id,start_date,end_date))
			trend_column = 1
			count_column = 0
			for row in cursor:
				trend_count = {}
				trend_count["trend"] = row[trend_column]
				trend_count["count"] = row[count_column]
				trends_list.append(trend_count)

		except Exception as e:
			print e 

		return trends_list
 def test_dqn_against_heuristic(self):
     np.random.seed(123)
     with open(path.join("..", "query_pull_1000v3.pkl"), 'rb') as f:
         query_pull = pickle.load(f)[0:5]
         workload = np.random.choice(query_pull, const.BATCH_SIZE)
         env = DatabaseIndexesEnv(n=const.COLUMNS_AMOUNT,
                                  table_name=table_name,
                                  query_pull=query_pull,
                                  batch_size=const.BATCH_SIZE,
                                  connector=PostgresConnector(),
                                  k=3,
                                  max_episodes=1)
         dqn = load_agent(
             path.join("..", "dqn_specific_{}.h5f".format(ENV_NAME)))
         results = dqn.test(env, nb_episodes=1)
         print(results)
         print(env.state)
         print(predict_on_workload(workload))
 def test_cache(self):
     np.random.seed(123)
     with open("..\query_pull_1000v3.pkl", 'rb') as f:
         query_pull = pickle.load(f)
         register(
             id='DatabaseIndexesEnv-v0',
             entry_point='dbenv:DatabaseIndexesEnv',
             kwargs={'n': const.COLUMNS_AMOUNT,
                     'table_name': "test_table",
                     'query_pull': query_pull,
                     'batch_size': 2,
                     'connector': PostgresConnector(),
                     'k': 3,
                     'max_episodes': 1}
         )
         env = gym.make('DatabaseIndexesEnv-v0')
         env.step(0)
         env.step(1)
         env.step(2)
         print(env.cache)
def train_model():
    np.random.seed(123)
    with open("query_pull_1000v2.pkl", 'rb') as f:
        query_pull = pickle.load(f)[0:5]

        register(id=ENV_NAME,
                 entry_point='dbenv:DatabaseIndexesEnv',
                 kwargs={
                     'n': COLUMNS_AMOUNT,
                     'table_name': table_name,
                     'query_pull': query_pull,
                     'batch_size': BATCH_SIZE,
                     'connector': PostgresConnector(),
                     'k': 3,
                     'max_episodes': episodes
                 })

        # Get the environment and extract the number of actions.
        env = gym.make(ENV_NAME)
        env.seed(123)

        # Next, we build a very simple model.
        model = build_model()
        print(model.summary())

        # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
        # even the metrics!
        dqn = initialize_agent(model)

        # Okay, now it's time to learn something! Visualization is disabled here because it
        # slows down training quite a lot. You can always safely abort the training prematurely
        # using Ctrl + C.
        dqn.fit(env, nb_steps=episodes, visualize=False, verbose=2)

        # After training is done, we save the final weights.
        dqn.save_weights('dqn_specific_{}.h5f'.format(ENV_NAME),
                         overwrite=True)

        # Finally, evaluate our algorithm for 5 episodes.
        dqn.test(env, nb_episodes=5, visualize=False)
Example #31
    result = []
    print("Number of empty hyperedges:")
    for i, prefix in enumerate(prefixes):
        with pc as open_pc:
            table = prefix + "hyperedges"

            open_pc.cursor.execute(
                "SELECT (SELECT MAX(edge_id) from {}) - "
                "(SELECT count(distinct edge_id) from {}) as diff".format(
                    table, table))

            result.append(open_pc.cursor.fetchall()[0][0])
        print("Results for {}: {}".format(table, result[i]))


if __name__ == "__main__":
    prefixes = ["", "entity_"]
    pc = PostgresConnector(port=5435)
    print()
    get_document_table_length(prefixes, pc)
    print()
    get_sentence_table_length(prefixes, pc)
    print()
    get_hyperedge_table_length(prefixes, pc)
    print()
    analyze_edge_size(prefixes, pc)
    print()
    analyze_term_frequency(prefixes, pc)
    print()
    get_number_of_empty_edges(prefixes, pc)
 def __init__(self):
   self.connector = PostgresConnector(CONFIG_FILE_NAME, CONFIG_SECTION_NAME)
   self.SQL_constructor = PostgresSQLConstructor()
Example #33
def get_indexes_qagent(index_amount, queries, Log=False):
    connector = PostgresConnector()
    if not table_exists(connector, table_name):
        create_table_2(connector)
        load_table(connector)

    #   make results repeatable
    np.random.seed(123)
    # gym configuration
    query_batch = list()

    env = gym.make('DatabaseIndexesEnv-v0')
    env.set_indices_num(index_amount)

    current_query_idx = 0
    query_batch = list()
    for workload in range(1):

        exploration_rate = 1.0  # represents the exploration rate to be decayed by the time
        initial_lr = 1.0  # Learning rate
        query_batch = list()
        Q_table_c.Q_table = {}
        query_batch = list()
        workload_selectivity_l = list()
        # 1. generate the queries per workload
        # 2. generate the cummlative selectivity per workload
        start = timer()
        for i in range(current_query_idx, current_query_idx + num_queries_batch):
            query_batch.append(queries[i]['query'])
            workload_selectivity_l.append(list(queries[i]['sf_array']))
        current_query_idx += num_queries_batch
        workload_selectivity = np.prod(workload_selectivity_l, axis=0).tolist()
        max_workload_selectivity = max(workload_selectivity)
        env.set_query_batch(query_batch)
        actions_taken = list()
        # as a heuristic: the indices with the lowest selectivity
        selectivity_indices = heapq.nsmallest(3, range(len(workload_selectivity)), workload_selectivity.__getitem__)
        if Log:
            print("Entering the q learning ..... the process can take time.")
            print(workload_selectivity)
        env.clear_cache()
        for episode in range(NUM_EPISODES):
            state = env.reset()
            actions_taken = list()
            # decay the exploration as the number of episodes grows, the Q table becomes more mature
            eps = exploration_rate / np.sqrt(episode + 1)
            eps = max(eps, min_exp_rate)
            episode_total_reward = 0
            episode_total_qreward = 0
            episode_strategy = []
            eta = max(min_lr, initial_lr * (0.85 ** (episode // 100)))
            ## now the learning comes
            for kk in range(3):
                # do exploration, i.e., choose a random actions
                # make sure the last step is exploitation unless the state is new
                if episode == 0:
                    episode_strategy.append("explore")
                    action = selectivity_indices[kk]
                    Q_table_c.Q_table[state_to_int(state)] = {}
                    Q_table_c.Q_table[state_to_int(state)][action] = 0
                elif (is_new_state(state) or (np.random.uniform(0, 1) < eps)) and episode != NUM_EPISODES - 1:
                    episode_strategy.append("explore")
                    # generate only actions that matches something with selectivity.
                    action = env.action_space.sample()
                    # high selectivity, not a good option for an index
                    while workload_selectivity[action] >= max_workload_selectivity:
                        action = env.action_space.sample()
                    if is_new_state(state):
                        Q_table_c.Q_table[state_to_int(state)] = {}
                    if action not in Q_table_c.Q_table[state_to_int(state)]:
                        Q_table_c.Q_table[state_to_int(state)][action] = 0
                else:
                    # else exploit choose the maximum value from the Q table
                    episode_strategy.append("exploit")
                    action = get_action_maximum_reward(state)[0]

                actions_taken.append(action)
                state_old_int = state_to_int(state)
                state_new, reward, done, _ = env.step(action)
                episode_total_reward += reward
                next_action = 0
                next_action_q_value = 0

                if is_new_state(state_new):
                    next_action = env.action_space.sample()
                    while (action == next_action or workload_selectivity[next_action] >= max_workload_selectivity):
                        next_action = env.action_space.sample()
                    next_action_q_value = 0

                else:
                    next_action, next_action_q_value = get_action_maximum_reward(state_new)

                Q_table_c.Q_table[state_old_int][action] += eta * (reward + GAMMA * next_action_q_value -
                                                                   Q_table_c.Q_table[state_old_int][action])
                episode_total_qreward += Q_table_c.Q_table[state_old_int][action]
                state, action = state_new, next_action
            actions_taken_s = ','.join(str(e) for e in actions_taken)
            if Log:
                print(
                    "episode num = '{0}', episode_total_immediate_rewards = '{1}', episode_total_reward = '{2}', current_state = '{3}', actions_taken = '{4}', strategy = {5}"
                        .format(episode, float(episode_total_reward), float(episode_total_qreward),
                                state_to_string(state), actions_taken_s,
                                episode_strategy))

        return actions_taken
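The inner update above is the standard tabular Q-learning rule, Q(s, a) += eta * (r + GAMMA * max_a' Q(s', a') - Q(s, a)). Isolated as a small helper (a sketch, using the same dictionary-of-dictionaries Q table as the snippet):

def q_update(q_table, state, action, reward, best_next_q, eta, gamma):
    # temporal-difference step toward the bootstrapped target r + gamma * max_a' Q(s', a')
    q_table[state][action] += eta * (reward + gamma * best_next_q - q_table[state][action])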
Example #34
	def update_organized_tweets(self):
		tweet_id_dict = {} 

		try:
			conn = PostgresConnector().get_connection()
			cursor = conn.cursor()
			query_location = 'select id from location'	
			cursor.execute(query_location)
			location_column = 0

			for row_location in cursor:

				query = """
				select id,trend from trends 
				where trend in(select trend from (select count(*) as c,trend from 
					trends where locationid = %s group by trend)as t1 order 
					by c desc limit 15)
						"""
				# use a separate cursor so the outer location cursor keeps iterating untouched
				cursor_trends = conn.cursor()
				location_id = row_location[location_column]
				cursor_trends.execute(query,(location_id,))
				trend_id_column = 0
				trend_name_column = 1
				trend_count = 0

				for row in cursor_trends:
					trend_count = trend_count + 1
					trend_id = row[trend_id_column]
					trend_name = row[trend_name_column]
					print 'Processing for trend ' + str(trend_id) + ' , ' + str(trend_count)
					query_tweets = 'select tweets from tweets where trendId = \''+str(trend_id)+'\''
					cursor_tweets = conn.cursor()
					cursor_tweets.execute(query_tweets)
					tweets_column = 0

					with open(trend_name+'.txt','w') as f:

						# rows of tweets array
						for tweets_row in cursor_tweets:
							tweets_json_array = tweets_row[tweets_column]

							# tweets in a tweets array
							for json_in in tweets_json_array:

								id = json_in['id']
								tweet_id_exists = tweet_id_dict.get(id)

								if tweet_id_exists is None:
									#print jsonIn
									tweet_id_dict[id] = 1
									geo =  'none' if json_in['geo'] is None else 'none' #json['geo']
									retweeted = json_in['retweeted']
									in_reply_to_screen_name = 'none' if json_in['in_reply_to_screen_name'] is None else json_in['in_reply_to_screen_name']
									truncated = 'none' if json_in['truncated'] is None else json_in['truncated']
									source = json_in['source']
									created_at = json_in['created_at']
									place = 'none' if json_in['place'] is None else 'none'#json['place']
									user_id = json_in['user']['id']
									text = json_in['text'].strip()
									#text = " ".join(str(text).split())
									text = str(filter(lambda x: x in string.printable,text))
									#text = text.encode('utf-16')
									text = re.sub('\s+',' ',text)
									text = text.replace('\\','')
									entities = json_in['entities']['hashtags']
									user_mentions = json_in['entities']['user_mentions']
									user_mentions = [] 
									retweet_count = json_in['retweet_count']
									favorite_count = json_in['favorite_count']

									# if len(entities) > 0:
									# 	for entity in entities:
									# 		for k,v in entity.items():
									# 			if k in 'text':
									# 				entity_list = {}
									# 				new_v = entity[k]
									# 				new_v = str(new_v.encode('utf-8'))
									# 				new_v = filter(lambda x: x in string.printable,new_v)
									# 				#print id,check,new_v,len(new_v)
									# 				if len(new_v) > 0: 
									# 					entity[k] = new_v
									# 				else:
									# 					entity[k] = ''



									#print id,geo,retweeted ,in_reply_to_screen_name ,truncated ,source ,created_at ,place ,user_id ,text ,entities ,user_mentions,retweet_count,favorite_count
									f.write(str(id)+'\t'+str(geo)+'\t'+str(retweeted)+'\t'+str(in_reply_to_screen_name.encode('utf-8'))+'\t'+str(truncated)+'\t'+str(source.encode('utf-8'))+'\t'+str(created_at.encode('utf-8'))+'\t'+str(place)+'\t'+str(user_id)+'\t'+text+'\t'+str(json.dumps(entities))+'\t'+str(user_mentions)+'\t'+str(retweet_count)+'\t'+str(favorite_count)+'\t'+str(trend_name)+'\t'+str(location_id)+'\n')

								else:
									continue

								# array of tweets json ends here
								#break

							# total number of tweets rows for a given trend ends here
							#break

					print 'Writing to table'

					with open(trend_name+'.txt') as f:
						cursor_write = conn.cursor()
						cursor_write.copy_from(f,'organized_tweets',columns=('id','geo','retweeted','in_reply_to_screen_name','truncated','source','created_at','place','user_id','text','entities','user_mentions','retweet_count','favorite_count','trend','location_id'))

					conn.commit()
					os.remove(trend_name+'.txt')

					# all trends finish here
					#break



		except Exception :
			print traceback.format_exc()
def run_qlearning():
    connector = PostgresConnector()
    query_pull = generate_query_pull('.query_pull', queries_amount, [4, 6],
                                     table_column_types, table_column_names,
                                     table_name, connector)
Example #36
	def get_matrix(self,locationid):

		try:
			conn = PostgresConnector().get_connection()
			cursor = conn.cursor()
			query = """
			WITH 

			TREND_COUNT_TT AS
			(SELECT TREND,COUNT(*) AS TREND_COUNT 
				FROM TRENDS 
				WHERE LOCATIONID = %s GROUP BY TREND),


			TOP_TRENDS_TT AS
			(SELECT TREND FROM TREND_COUNT_TT ORDER BY TREND_COUNT DESC LIMIT 15),

			IDS_FOR_TOP_TRENDS_TT AS 
			(SELECT ID FROM ORGANIZED_TWEETS 
			WHERE TREND IN (SELECT TREND FROM TOP_TRENDS_TT) AND LOCATION_ID = %s),
			--SELECT * FROM IDS_FOR_TOP_TRENDS_TT

			ID_ENTITY_TOP_TRENDS_TT AS
			(SELECT TREND,ID,ENTITY 
			FROM ID_ENTITY
			WHERE ID IN (SELECT ID FROM IDS_FOR_TOP_TRENDS_TT)),


			TREND_ENTITY_TF_IDF_SUM_TT AS
			(SELECT TREND,ENTITY,COUNT(ID) TF_IDF_SUM 
			FROM ID_ENTITY
			WHERE ID IN (SELECT ID FROM IDS_FOR_TOP_TRENDS_TT)
			GROUP BY TREND,ENTITY),
			--SELECT * FROM TREND_ENTITY_TF_IDF_SUM_TT

			TREND_TF_IDF_SQ_SUM_TT AS
			(SELECT TREND, 
			SUM(TF_IDF_SUM*TF_IDF_SUM) AS TF_IDF_SQ_SUM
			FROM TREND_ENTITY_TF_IDF_SUM_TT
			GROUP BY TREND),

			COSINE_DIST_NUM_TT AS 
			(SELECT T1.TREND AS TREND1,T2.TREND AS TREND2,
			SUM(T1.TF_IDF_SUM*T2.TF_IDF_SUM) AS COSINE_NUM
			FROM TREND_ENTITY_TF_IDF_SUM_TT AS T1
			INNER JOIN TREND_ENTITY_TF_IDF_SUM_TT AS T2 ON T2.TREND>T1.TREND AND T1.ENTITY = T2.ENTITY
			GROUP BY T1.TREND,T2.TREND),

			COSINE_DIST_TT AS
			(SELECT TREND1,TREND2,
			COSINE_NUM/(SQRT(T2.TF_IDF_SQ_SUM)*SQRT(T3.TF_IDF_SQ_SUM)) AS COSIND_DIST
			FROM COSINE_DIST_NUM_TT AS T1
			INNER JOIN TREND_TF_IDF_SQ_SUM_TT AS T2 ON T1.TREND1=T2.TREND
			INNER JOIN TREND_TF_IDF_SQ_SUM_TT AS T3 ON T1.TREND2=T3.TREND)

			SELECT * FROM COSINE_DIST_TT ORDER BY TREND1,TREND2;
			"""

			cursor.execute(query,(locationid,locationid))
			trend1_column = 0
			trend2_column = 1
			distance_value_column = 2
			trends_list = [] 
			row_counter = 0
			max_columns = 15
			column_iteration = 1
			distance_matrix = [[0 for x in xrange(max_columns)] for x in xrange(max_columns)]

			for row in cursor:
				trend1 = row[trend1_column]
				trend2 = row[trend2_column]
				if trend1 not in trends_list:
					trends_list.append(trend1)	
				if trend2 not in trends_list:
					trends_list.append(trend2)

				# this is to check 0,0 1,1 and so on
				distance_matrix[row_counter][row_counter] = 0
				# this populates 1,2 and 2,1 and so on 
				# this avoid 2 loops

				distance_matrix[row_counter][column_iteration] = row[distance_value_column]	
				distance_matrix[column_iteration][row_counter] = row[distance_value_column]	

				column_iteration = column_iteration + 1
				if column_iteration == max_columns:
					row_counter = row_counter + 1
					column_iteration = row_counter + 1 
						

			return distance_matrix,trends_list

		except Exception:
			print traceback.format_exc()
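The CTE chain above computes pairwise cosine similarity between the top trends, using per-entity weights as vector components. The same quantity, computed in Python over two {entity: weight} dictionaries, would look roughly like this sketch:

import math


def cosine_similarity(vec_a, vec_b):
    # vec_a, vec_b: {entity: tf_idf_sum} for two trends (cf. TREND_ENTITY_TF_IDF_SUM_TT)
    numerator = sum(vec_a[e] * vec_b[e] for e in set(vec_a) & set(vec_b))
    denominator = (math.sqrt(sum(v * v for v in vec_a.values())) *
                   math.sqrt(sum(v * v for v in vec_b.values())))
    return numerator / denominator if denominator else 0.0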
    def __init__(self,
                 num_distinct_documents=5000,
                 replace_entities=True,
                 max_term_length=127,
                 remove_stopwords=True,
                 custom_stopwords=[
                     ',', '.', '-', '\xa0', '“', '”', '"', '\n', '—', ':', '?',
                     'I', '(', ')'
                 ],
                 analyze=False,
                 document_table_name="documents",
                 sentence_table_name="sentences",
                 sentence_fields=OrderedDict({
                     "doc_id": "document_id",
                     "sen_id": "sentence_id",
                     "content": "sentence_text"
                 }),
                 term_table_name="terms",
                 term_sql_format=("term_id", "term_text", "is_entity"),
                 term_occurrence_table_name="term_occurrence",
                 term_occurrence_sql_format=("document_id", "sentence_id",
                                             "term_id"),
                 entity_table_name="entities",
                 entity_sql_format=("entity_id", "entity_type"),
                 database="postgres",
                 user="******",
                 password="******",
                 host="127.0.0.1",
                 port=5435,
                 log_file=os.path.join(os.path.dirname(__file__),
                                       "logs/TermGenerator.log"),
                 log_level=logging.INFO,
                 log_verbose=True):
        """
        Initializes various parameters, registers logger and MongoConnector, and sets up the limit.
        :param num_distinct_documents: (int) The number of distinct documents retrieved from the queries.
               For performance reasons, this should be limited during debugging/development.
               0 (Zero) represents no limit, in accordance with the MongoDB standard for .limit().
        :param replace_entities: (boolean) Whether or not the entities in the text should be replaced/recognised.
               The reason for this is that single terms might be merged together to one term, i.e. first and last name:
               "Dennis" "Aumiller" would be two separate terms in the traditional splitting (replace_entities=False),
               whereas - if set to true - "Dennis Aumiller" would represent only one entity.
        :param max_term_length: (int) Indicator of how long the terms are supposed to be (varchar property in table).
        :param remove_stopwords: (boolean) Determines whether or not stop words are removed. Currently, we are still
               deciding on the final set, but likely either one (or both) of NLTK and SpaCy's stop word lists.
        :param custom_stopwords: (list of strings) Additional words that will not be considered at adding-time.
        :param analyze: (boolean) Whether or not to include analytically relevant metrics.
        :param document_table_name: (str) Name of the table where the document information is stored.
        :param sentence_table_name: (str) Name of the table where the sentence information will be stored.
        :param sentence_fields: (OrderedDict) Structure of input to output values from MongoDB to postgres for the
               sentence table and its fields.
        :param term_table_name: (str) Name of the Postgres tables for the terms.
        :param term_sql_format: (tuple) Since those are generated locally, only a tuple of the PostgresColumns suffices.
        :param term_occurrence_table_name: (str) Name of the Postgres table for the term occurrences
        :param term_occurrence_sql_format: (tuple) Same as term_sql_format, but for the term occurrences.
        :param entity_table_name: (str) (Not implemented yet) Name of the table for the entity meta information.
        :param entity_sql_format: (str) Same as term_sql_format, but for entities.
        :param database: (str) database name.
        :param user: (str) User name to get access to the Postgres database.
        :param password: (str) Corresponding user password.
        :param host: (IP) IP address (in string format) for the host of the postgres database.
        :param port: (integer) Port at which to access the database.
        """
        # set up logger
        self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
        self.logger.info("Successfully registered logger to TermGenerator.")

        # register a MongoConnector
        self.mc = MongoConnector()
        self.logger.info(
            "Successfully registered MongoConnector to TermGenerator.")

        # PostgresConnector
        self.pc = PostgresConnector(database, user, password, host, port)
        self.logger.info(
            "Successfully registered PostgresConnector to DocumentGenerator.")

        self.num_distinct_documents = num_distinct_documents
        # do this earlier since we need it already for the distinct documents.
        self.document_table_name = document_table_name
        # get the distinct IDs for the documents so we can match against them later
        # since we have removed parts of the document collection, we have to make sure to get this from Postgres.
        self.logger.info("Parsing relevant documents from Postgres...")
        with self.pc as open_pc:
            open_pc.cursor.execute("SELECT document_id FROM {}".format(
                self.document_table_name))
            self.first_distinct_documents = list(open_pc.cursor.fetchall())
            # extract from the tuple structure
            self.first_distinct_documents = [
                el[0] for el in self.first_distinct_documents
            ]
            self.logger.info("Retrieved all relevant documents from Postgres.")

        # additionally restrict if we want only a number of documents.
        if self.num_distinct_documents != 0:
            self.logger.info(
                "Non-zero limit detected. Limiting to the first N entries.")
            self.first_distinct_documents = \
                self.first_distinct_documents[:self.num_distinct_documents]

        self.replace_entities = replace_entities
        self.analyze = analyze

        self.max_term_length = max_term_length

        self.nlp = spacy.load("en")

        # construct dictionary with the entries per document/sentence id pair. Thus, we can later check whether
        # there are any entities in the current sentence with higher efficiency.
        self.occurrence_dict = {}
        self.occurring_entities = []

        # start building the term dictionary/set, as well as an occurrence map. Since terms will be "post-processed",
        # it is first created as a list and later cast to Counter and set.
        self.terms = []  # cast into a set later on.
        self.term_in_sentence = set()
        self.term_id = {}
        self.term_is_entity = {}
        if self.analyze:
            self.term_count = Counter()
            self.entity_count = Counter()

        self.entities = []
        self.sentences = []
        self.processed_sentences = []

        # Postgres tables
        if not sentence_fields:
            self.logger.error("No sentence fields specified!")
        self.sentence_table_name = sentence_table_name
        self.sentence_fields = sentence_fields
        if not term_sql_format:
            self.logger.error("No term fields specified!")
        self.term_table_name = term_table_name
        self.term_sql_format = ", ".join(term_sql_format)
        if not term_occurrence_sql_format:
            self.logger.error("No term occurrence fields specified!")
        self.term_occurrence_table_name = term_occurrence_table_name
        self.term_occurrence_sql_format = ", ".join(term_occurrence_sql_format)
        if not entity_sql_format:
            self.logger.error("No entity fields specified!")
        self.entity_table_name = entity_table_name
        self.entity_sql_format = ", ".join(entity_sql_format)

        # value retrieving parse:
        self.sentence_values_to_retrieve = {
            key: 1
            for key in self.sentence_fields.keys()
        }
        # suppress _id if not present:
        if "_id" not in self.sentence_values_to_retrieve.keys():
            self.sentence_values_to_retrieve["_id"] = 0
        self.sentence_sql_format = ", ".join(
            [value for value in self.sentence_fields.values()])

        # create union of stop words, and add potentially custom stop words
        self.remove_stopwords = remove_stopwords
        self.removed_counter = 0
        self.stopwords = STOP_WORDS.union(set(stopwords.words("english")))
        # add custom stopwords.
        for word in custom_stopwords:
            self.stopwords.add(word)

        self.logger.info("Successfully initialized TermGenerator.")
    def __init__(self,
                 window_size=2,
                 limit_edges=False,
                 entities_only=False,
                 document_table_name="documents",
                 sentence_table_name="sentences",
                 entity_table_name="entities",
                 term_table_name="terms",
                 term_occurrence_table_name="term_occurrence",
                 hyperedge_table_name="hyperedges",
                 hyperedge_format=("edge_id", "term_id", "pos"),
                 hyperedge_document_table_name="hyperedge_document",
                 hyperedge_document_format=("edge_id", "document_id"),
                 hyperedge_sentence_table_name="hyperedge_sentences",
                 hyperedge_sentence_format=("edge_id", "document_id",
                                            "sentence_id", "pos"),
                 database="postgres",
                 user="******",
                 password="******",
                 host="127.0.0.1",
                 port=5435,
                 log_file=os.path.join(os.path.dirname(__file__),
                                       "logs/HyperedgeGenerator.log"),
                 log_level=logging.INFO,
                 log_verbose=True):
        """
        Initializes hyper edge generator class.
        :param window_size: (int) Number of sentences in each direction that will determine the context window size
               of the algorithm.
        :param limit_edges: (boolean) Experimental: Should limit the maximum number of terms per hyperedge. This would
               only be useful in context with other theoretical results.
        :param entities_only: (boolean) Indicating whether or not we should only take into account entity terms,
               and not the entirety of all term occurrences for the edges.
        :param document_table_name: (str) Name of the table where documents are stored.
        :param sentence_table_name: (str) Name of the table containing the sentences and their content.
        :param entity_table_name: (str) Name of the table containing the entity information and their properties.
        :param term_table_name: (str) Name of the table containing the terms and meta data.
        :param term_occurrence_table_name: (str) Name of the table containing term occurrence data.
        :param hyperedge_table_name: (str) Name of the table containing the general hyper edge information.
        :param hyperedge_format: (str) Table structure of hyper edge table.
        :param hyperedge_document_table_name: (str) Name of the table containing the document classification.
        :param hyperedge_document_format: (str) Table structure of hyper edge document table.
        :param hyperedge_sentence_table_name: (str) Name of the table containing the hyper edge sentence data.
        :param hyperedge_sentence_format: (str) Table structure of the hyper edge sentence table.
        :param database: (str) database name.
        :param user: (str) User name to get access to the Postgres database.
        :param password: (str) Corresponding user password.
        :param host: (IP) IP address (in string format) for the host of the postgres database.
        :param port: (integer) Port at which to access the database.
        :param log_file: (os.path) Path to the file containing the logs.
        :param log_level: (logging.LEVEL) Specifies the level to be logged.
        :param log_verbose: (boolean) Specifies whether or not to look to stdout as well.
        """

        self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
        self.logger.info(
            "Successfully registered logger to HyperedgeGenerator.")

        # important for hyperedges
        self.window_size = window_size
        self.limit_edges = limit_edges
        self.entities_only = entities_only

        # table names
        self.document_table_name = document_table_name
        self.sentence_table_name = sentence_table_name
        self.entity_table_name = entity_table_name
        self.term_table_name = term_table_name
        self.term_occurrence_table_name = term_occurrence_table_name
        self.hyperedge_table_name = hyperedge_table_name
        self.hyperedge_document_table_name = hyperedge_document_table_name
        self.hyperedge_sentence_table_name = hyperedge_sentence_table_name

        self.hyperedge_format = ", ".join([el for el in hyperedge_format])
        self.hyperedge_document_format = ", ".join(
            [el for el in hyperedge_document_format])
        self.hyperedge_sentence_format = ",".join(
            [el for el in hyperedge_sentence_format])

        self.pc = PostgresConnector(database, user, password, host, port)
        self.logger.info(
            "Successfully registered PostgresConnector to HyperedgeGenerator.")

        self.hyperedge = []
        self.hyperedge_sentence = []
        self.hyperedge_document = []
        self.all_hyperedges = []
        self.all_hyperedge_sentences = []

        # set up the "hyper edge ID counter", which is simply consecutive from 1.
        with self.pc as open_pc:
            if not check_table_existence(self.logger, open_pc,
                                         self.hyperedge_table_name):
                # __init__ cannot return a value; abort initialization if the table is missing
                return

            self.logger.info("Retrieving current hyper edge ID key...")
            open_pc.cursor.execute(
                "SELECT COUNT(DISTINCT h.edge_id) FROM {} as h".format(
                    self.hyperedge_table_name))
            # either start with 1 or get the current maximum
            self.hyperedge_ID = max(1, open_pc.cursor.fetchone()[0])
Example #39
def get_action_maximum_reward(state):
    # assert Q_table_c.Q_table[state_to_int(state)], "This state has no corresponding action: %r" % state_to_int(state)
    max_reward = float('-inf')
    int_state = state_to_int(state)
    actions_rewards_dict = Q_table_c.Q_table[int_state]
    for key, val in actions_rewards_dict.items():
        if val > max_reward:
            max_reward = val
            max_key = key
    return max_key, max_reward


register(
    id='DatabaseIndexesEnv-v0',
    entry_point='dbenvm:DatabaseIndexesEnv',
    kwargs={'n': len(table_column_names), 'table_name': table_name, 'query_batch': list(),
            'connector': PostgresConnector(), 'k': 3}
)


def get_indexes_qagent(index_amount, queries, Log=False):
    connector = PostgresConnector()
    if not table_exists(connector, table_name):
        create_table_2(connector)
        load_table(connector)

    #   make results repeatable
    np.random.seed(123)
    # gym configuration
    query_batch = list()

    env = gym.make('DatabaseIndexesEnv-v0')
Example #40
    def __init__(self,
                 fields=OrderedDict({
                     "_id": "document_id",
                     "title": "title",
                     "feedName": "feedName",
                     "category": "category",
                     "feedURL": "feedURL",
                     "published": "published"
                 }),
                 num_distinct_documents=0,
                 document_table_name="documents",
                 database="postgres",
                 user="******",
                 password="******",
                 host="127.0.0.1",
                 port=5435,
                 log_file=os.path.join(os.path.dirname(__file__),
                                       "logs/DocumentGenerator.log"),
                 log_level=logging.INFO,
                 log_verbose=True):
        """
        Initializes context, and sets up documents that will be parsed.
        Also establishes the PostgresConnector that will later be used to push the retrieved documents.
        :param fields: (OrderedDict) Key-value pairs that indicate a mapping of fields that should be retrieved (key),
               and the respective field it should be called in the SQL table. Ordered because SQL tables are.
        :param num_distinct_documents: (int) The number of distinct articles that should be used.
               Mainly for debugging purposes. 0 means all documents will be used (MongoDB treats a limit of 0 as no limit).
        :param document_table_name: (str) Name of the Postgres table that should contain the documents
        :param database: (str) database name.
        :param user: (str) User name to get access to the Postgres database.
        :param password: (str) Corresponding user password.
        :param host: (IP) IP address (in string format) for the host of the postgres database.
        :param port: (integer) Port at which to access the database.
        :param log_file: (os.path) Path to the file containing the logs.
        :param log_level: (logging.LEVEL) Specifies the level to be logged.
        :param log_verbose: (boolean) Specifies whether or not to log to stdout as well.
        """

        # set up logger
        self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
        self.logger.info(
            "Successfully registered logger to DocumentGenerator.")

        # register a MongoConnector
        self.mc = MongoConnector()
        self.logger.info(
            "Successfully registered MongoConnector to DocumentGenerator.")

        self.num_distinct_documents = num_distinct_documents
        # get the distinct IDs for the documents so we can match against them later
        if self.num_distinct_documents != 0:
            self.logger.info(
                "Non-zero limit detected. Fetching first N distinct document IDs now..."
            )
            with self.mc as open_mc:
                documents = open_mc.client[open_mc.news].articles
                self.first_documents = list(documents.find().limit(
                    self.num_distinct_documents))
                # for small enough number, and large enough document collection, this is more efficient:
                self.first_documents = [
                    el["_id"] for el in self.first_documents
                ]
                self.logger.info(
                    "Successfully registered relevant document IDs.")
        else:
            # needed to avoid later conflicts
            self.first_documents = []
        # set up PostgresConnector. Since we only use these once, I don't see any reason to store the connection
        # details locally again.
        self.pc = PostgresConnector(database, user, password, host, port)
        self.logger.info(
            "Successfully registered PostgresConnector to DocumentGenerator.")

        # format them into a reasonable format
        self.fields = fields
        if not self.fields:
            self.logger.error("No fields for MongoDB table specified!")
        self.values_to_retrieve = {key: 1 for key in self.fields.keys()}
        # suppress _id if not wanted, as it is returned by default.
        if "_id" not in self.values_to_retrieve.keys():
            self.values_to_retrieve["_id"] = 0
        # TODO
        self.sql_format = ", ".join([value for value in self.fields.values()])
        self.document_table_name = document_table_name

        # initialize remaining attributes here, as PEP 8 recommends defining them in __init__
        self.data = []
        self.logger.info("Successfully set up DocumentGenerator.")
Exemple #41
0
import psycopg2 as db
import csv
import os
import sys
sys.path.append(os.path.abspath("../lib/"))
from query_helper import comm_helper
from PostgresConnector import PostgresConnector

# ports = list(range(5435, 5440))
port = 5436
windows = [5, 10]

t = comm_helper("postgres", "", "127.0.0.1", str(port))
pc = PostgresConnector(port=port)


def query_and_write(filename, query, header):
    with pc as opc:
        print("Start querying table {}".format(filename))
        if os.path.isfile(filename):
            os.remove(filename)
        opc.cursor.execute(query)
        # This only happens for documents

        print("Start writing table {}.".format(filename))
        with open(filename, 'w') as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(header)

            while True:
                data = opc.cursor.fetchmany(65536)
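The excerpt stops inside the while True loop. A minimal sketch of how such a fetchmany batching loop is commonly finished, offered as an assumption about the missing tail rather than a reconstruction of the original file:

                # Hypothetical continuation: stop once the cursor is exhausted,
                # otherwise stream the current batch into the CSV file.
                if not data:
                    break
                csvwriter.writerows(data)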
Exemple #42
0
    def update_organized_tweets(self):
        tweet_id_dict = {}

        try:
            conn = PostgresConnector().get_connection()
            cursor = conn.cursor()
            query_location = 'select id from location'
            cursor.execute(query_location)
            location_column = 0

            for row_location in cursor:

                query = """
				select id,trend from trends 
				where trend in(select trend from (select count(*) as c,trend from 
					trends where locationid = %s group by trend)as t1 order 
					by c desc limit 80)
						"""
                cursor = conn.cursor()
                location_id = row_location[location_column]
                cursor.execute(query, (location_id, ))
                trend_id_column = 0
                trend_name_column = 1
                trend_count = 0

                for row in cursor:
                    trend_count = trend_count + 1
                    trend_id = row[trend_id_column]
                    trend_name = row[trend_name_column]
                    print 'Processing for trend ' + str(
                        trend_id) + ' , ' + str(trend_count)
                    query_tweets = 'select tweets from tweets where trendId = \'' + str(
                        trend_id) + '\''
                    cursor_tweets = conn.cursor()
                    cursor_tweets.execute(query_tweets)
                    tweets_column = 0

                    with open(trend_name + '.txt', 'w') as f:

                        # rows of tweets array
                        for tweets_row in cursor_tweets:
                            tweets_json_array = tweets_row[tweets_column]

                            # tweets in a tweets array
                            for json_in in tweets_json_array:

                                id = json_in['id']
                                tweet_id_exists = tweet_id_dict.get(id)

                                if tweet_id_exists is None:
                                    #print jsonIn
                                    tweet_id_dict[id] = 1
                                    geo = 'none' if json_in[
                                        'geo'] is None else 'none'  #json['geo']
                                    retweeted = json_in['retweeted']
                                    in_reply_to_screen_name = 'none' if json_in[
                                        'in_reply_to_screen_name'] is None else json_in[
                                            'in_reply_to_screen_name']
                                    truncated = 'none' if json_in[
                                        'truncated'] is None else json_in[
                                            'truncated']
                                    source = json_in['source']
                                    created_at = json_in['created_at']
                                    place = 'none' if json_in[
                                        'place'] is None else 'none'  #json['place']
                                    user_id = json_in['user']['id']
                                    text = json_in['text'].strip()
                                    #text = " ".join(str(text).split())
                                    text = str(
                                        filter(lambda x: x in string.printable,
                                               text))
                                    #text = text.encode('utf-16')
                                    text = re.sub('\s+', ' ', text)
                                    text = text.replace('\\', '')
                                    entities = json_in['entities']['hashtags']
                                    user_mentions = json_in['entities'][
                                        'user_mentions']
                                    user_mentions = []
                                    retweet_count = json_in['retweet_count']
                                    favorite_count = json_in['favorite_count']

                                    # if len(entities) > 0:
                                    # 	for entity in entities:
                                    # 		for k,v in entity.items():
                                    # 			if k in 'text':
                                    # 				entity_list = {}
                                    # 				new_v = entity[k]
                                    # 				new_v = str(new_v.encode('utf-8'))
                                    # 				new_v = filter(lambda x: x in string.printable,new_v)
                                    # 				#print id,check,new_v,len(new_v)
                                    # 				if len(new_v) > 0:
                                    # 					entity[k] = new_v
                                    # 				else:
                                    # 					entity[k] = ''

                                    #print id,geo,retweeted ,in_reply_to_screen_name ,truncated ,source ,created_at ,place ,user_id ,text ,entities ,user_mentions,retweet_count,favorite_count
                                    f.write(
                                        str(id) + '\t' + str(geo) + '\t' +
                                        str(retweeted) + '\t' + str(
                                            in_reply_to_screen_name.encode(
                                                'utf-8')) + '\t' +
                                        str(truncated) + '\t' +
                                        str(source.encode('utf-8')) + '\t' +
                                        str(created_at.encode('utf-8')) +
                                        '\t' + str(place) + '\t' +
                                        str(user_id) + '\t' + text + '\t' +
                                        str(json.dumps(entities)) + '\t' +
                                        str(user_mentions) + '\t' +
                                        str(retweet_count) + '\t' +
                                        str(favorite_count) + '\t' +
                                        str(trend_name) + '\t' +
                                        str(location_id) + '\n')

                                else:
                                    continue

                                # array of tweets json ends here
                                #break

                            # total number of tweets rows for a given trend ends here
                            #break

                    print 'Writing to table'

                    with open(trend_name + '.txt') as f:
                        cursor_write = conn.cursor()
                        cursor_write.copy_from(
                            f,
                            'organized_tweets',
                            columns=('id', 'geo', 'retweeted',
                                     'in_reply_to_screen_name', 'truncated',
                                     'source', 'created_at', 'place',
                                     'user_id', 'text', 'entities',
                                     'user_mentions', 'retweet_count',
                                     'favorite_count', 'trend', 'location_id'))

                    conn.commit()
                    os.remove(trend_name + '.txt')

                    # all trends finish here
                    #break

        except Exception:
            print traceback.format_exc()
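One detail worth flagging in this example: query_tweets is built by string concatenation, which breaks if a trend ID ever contains a quote and invites SQL injection. psycopg2 supports parameter binding for this case; a minimal sketch of the safer form, assuming the same conn and trend_id from the surrounding method:

# Hypothetical parameterized variant of the trendId lookup above; psycopg2 does the
# quoting, so unusual characters in trend_id cannot break the statement.
query_tweets = 'select tweets from tweets where trendId = %s'
cursor_tweets = conn.cursor()
cursor_tweets.execute(query_tweets, (str(trend_id),))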
Exemple #43
0
subjects = [(1, 'Accounting & Finance'), (2, 'Art & Design'),
            (3, 'Architecture'), (4, 'Manufacturing Engineering'), (5, 'Law'),
            (6, 'Economics & Econometrics'), (7, 'Medicine'),
            (8, 'Business & Management Studies'),
            (9, 'Engineering & Technology'), (10, 'Computer Science')]

# Data to be inserted into the DB
subjects_to_teachers = [(1, 1, 1), (2, 2, 2), (3, 3, 3), (4, 4, 4), (5, 5, 5),
                        (6, 6, 6), (7, 7, 7), (8, 8, 8), (9, 9, 9),
                        (10, 10, 10), (11, 11, 1), (12, 12, 2), (13, 13, 3),
                        (14, 14, 4), (15, 15, 5), (16, 16, 6), (17, 17, 7),
                        (18, 18, 8), (19, 19, 9), (20, 20, 10)]

# Connections to the databases
sqlite = SqliteConnector()
Postgres = PostgresConnector()
MySql = MySqlConnector()


# Function for creating DB1
def createDB():
    MySql.dropAllTables()
    MySql.createDatabase()
    MySql.executemany("INSERT INTO faculties VALUES (%s,%s)", faculties)
    MySql.executemany("INSERT INTO department VALUES (%s,%s,%s)", departments)
    MySql.executemany("INSERT INTO teachers VALUES (%s,%s,%s,%s,%s)", teachers)
    MySql.executemany("INSERT INTO subject VALUES (%s,%s)", subjects)
    MySql.executemany("INSERT INTO subjects_to_teachers VALUES (%s,%s,%s)",
                      subjects_to_teachers)
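If the same seed data also needs to go into Postgres, psycopg2's executemany accepts the same %s placeholder style. A minimal sketch using a plain psycopg2 connection rather than the project's PostgresConnector wrapper; the connection parameters are assumptions:

import psycopg2

# Hypothetical direct load of the same seed data into Postgres.
conn = psycopg2.connect(dbname="postgres", user="postgres",
                        host="127.0.0.1", port=5435)
with conn, conn.cursor() as cur:
    cur.executemany("INSERT INTO subject VALUES (%s, %s)", subjects)
    cur.executemany("INSERT INTO subjects_to_teachers VALUES (%s, %s, %s)",
                    subjects_to_teachers)
conn.close()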

class PostgresQuerier(object):
  
  def __init__(self):
    self.connector = PostgresConnector(CONFIG_FILE_NAME, CONFIG_SECTION_NAME)
    self.SQL_constructor = PostgresSQLConstructor()
  

    
  def GetVersion(self):
    SQL = self.SQL_constructor.GetVersion()
    self.connector.ExecuteSQL(SQL)
    return self.connector.GetResults(1)
    
  def GetCurrentDate(self):
    SQL = self.SQL_constructor.GetCurrentDate()
    self.connector.ExecuteSQL(SQL)
    return self.connector.GetResults(1)
    
  def _SetDatestyleGerman(self):
    SQL = "SET DATESTYLE=%s;";
    data = ("German",)
    self.connector.ExecuteSQL(SQL, data)
    return "Datestyle set to German"
    
  
    
  def SelectColumnsFromTable(self, columns, table, limit=100):
    SQL_select = self.SQL_constructor.GetSELECTColumns(columns)
    SQL_from = self.SQL_constructor.GetFROMTable(table)
    SQL_limit = self.SQL_constructor.GetLIMIT(limit)

    SQL = SQL_select + SQL_from + SQL_limit
    
    self.connector.ExecuteSQL(SQL)
    return self.connector.GetAllResults()
    
  def FindWordsInColumnsInTableThenBoldAndRank(self, search_words, columns, table, operation="OR"):
    search_phrase = self.SQL_constructor.ConstructSearchPhrase(search_words, operation)
    
    headline_columns = self.SQL_constructor.GetHeadlineColumnsWithPhrase(columns, search_phrase)
    rank_column = self.SQL_constructor.Getts_rankColumnByPhraseWithLengthPenaltyWithAlias(columns[0], search_phrase, 1, "rank")
    headline_columns.append(rank_column)
    
    vector_query_pairs = [self.SQL_constructor.GetVectorColumnHasQueryPhrase(x, search_phrase) for x in columns]
    
    SQL_select = self.SQL_constructor.GetSELECTColumns(headline_columns)
    SQL_from = self.SQL_constructor.GetFROMTable(table)
    SQL_where = self.SQL_constructor.GetWHERE()
    SQL_conditions = " OR".join(vector_query_pairs)
    SQL_order = self.SQL_constructor.GetORDERByAliasInDirection("rank", "DESC")

    SQL = SQL_select + SQL_from + SQL_where + SQL_conditions + SQL_order
    
    self.connector.ExecuteSQL(SQL)
    return self.connector.GetAllResults()
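The exact strings emitted by PostgresSQLConstructor are not shown in this excerpt, but the method's structure maps onto standard PostgreSQL full-text search. A hedged illustration of the kind of statement it presumably assembles; the helper name, the table, and the columns are placeholders:

  def _ExampleGeneratedSearchSQL(self):
    # Hypothetical illustration only: approximate shape of the statement assembled by
    # FindWordsInColumnsInTableThenBoldAndRank for columns ("title", "summary") on "movies".
    return ("SELECT ts_headline(title, to_tsquery('dancing | master')) AS title, "
            "ts_headline(summary, to_tsquery('dancing | master')) AS summary, "
            "ts_rank(to_tsvector(title), to_tsquery('dancing | master'), 1) AS rank "
            "FROM movies "
            "WHERE to_tsvector(title) @@ to_tsquery('dancing | master') "
            "OR to_tsvector(summary) @@ to_tsquery('dancing | master') "
            "ORDER BY rank DESC;")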
  
  def FindWordsInDocumentInTableThenBoldAndRank(self, search_words, document, table, operation="OR"):
    search_phrase = self.SQL_constructor.ConstructSearchPhrase(search_words, operation)
    document_columns = ["title", "categories", "summary", "description"]
    
    headline_columns = self.SQL_constructor.GetHeadlineColumnsWithPhrase(document_columns, search_phrase)
    rank_column = self.SQL_constructor.Getts_rankDocumentByPhraseWithLengthPenaltyWithAlias(document, search_phrase, 1, "rank")
    
    headline_columns.append(rank_column)
    
    SQL_select = self.SQL_constructor.GetSELECTColumns(headline_columns)
    SQL_from = self.SQL_constructor.GetFROMTable(table)
    SQL_where = self.SQL_constructor.GetWHERE()
    SQL_conditions = self.SQL_constructor.GetDocumentHasQueryPhrase(document, search_phrase)
    SQL_order = self.SQL_constructor.GetORDERByAliasInDirection("rank", "DESC")
    
    SQL = SQL_select + SQL_from + SQL_where + SQL_conditions + SQL_order
    
    self.connector.ExecuteSQL(SQL)
    return self.connector.GetAllResults()
    
    
    
  def SuggestBasedOnPhraseInColumnInTable(self, phrase, column, table):
    similarity_function = self.SQL_constructor.GetSimilarityOfColumnAndStringWithAlias(column, phrase, "sameness")
    
    SQL_select = self.SQL_constructor.GetSELECTColumns([column, similarity_function])
    SQL_from = self.SQL_constructor.GetFROMTable(table)
    SQL_order = self.SQL_constructor.GetORDERByAliasInDirection("sameness", "DESC")
    
    SQL = SQL_select + SQL_from + SQL_order
    
    self.connector.ExecuteSQL(SQL)
    return self.connector.GetResults(5)
  


  def PivotMovies(self, start_datetime, stop_datetime, granulation):
    iso_start_datetime = start_datetime.isoformat(" ")
    iso_end_datetime = stop_datetime.isoformat(" ")
    
    diff = stop_datetime - start_datetime
    days, seconds = diff.days, diff.seconds
    hours = days * 24 + seconds // 3600
    
    if granulation == "hour":
      format = ["time_interval" + str(x) + " bigint" for x in range(hours + 1)]
    elif granulation == "day":
      format = ["time_interval" + str(x) + " bigint" for x in range(days + 1)]
    else:
      raise Exception("WARNING: incorrect granulation; possible granulation are (hour, day)")
    
    format.insert(0, "query character varying(255)")
    
    sub_query = ("select query,"
                 " date_trunc(''" + granulation + "'', date_and_time) as periods,"
                 " count(*)"
                 " from queries"
                 " where date_and_time >= ''" + iso_start_datetime + "'' and"
                       " date_and_time <= ''" + iso_end_datetime + "''"
                 " group by query, periods"
                 " order by query, periods;")
    sequence = ("select d"
                " from generate_series(''" + iso_start_datetime + "''::timestamp,"
                                      "''" + iso_end_datetime + "'',"
                                      "''1 " + granulation + "'')"
                " d;")
    SQL = ("select * from crosstab('" + sub_query + "',"
                                  "'" + sequence + "')"
           "as ct(" + ", ".join(format) + ");")
           
    self.connector.ExecuteSQL(SQL)
    return self.connector.GetAllResults()
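Two things make PivotMovies work: crosstab() comes from the tablefunc extension, and because both the sub-query and the series query are passed to crosstab as string literals, every single quote inside them has to be doubled (hence the '' pairs above). A minimal sketch of the prerequisite as a hypothetical helper; the method name is an assumption:

  def _EnableCrosstabSupport(self):
    # Hypothetical helper: crosstab() is provided by the tablefunc extension and must
    # be installed once per database before PivotMovies can run.
    self.connector.ExecuteSQL("CREATE EXTENSION IF NOT EXISTS tablefunc;")
    self.connector.CommitTransaction()
    return "tablefunc extension enabled."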
                 
    
    
  def InsertIntoMovies(self, title, categories="No categories.", summary="No summary.", description="No description."):
    table = "movies"
    columns = ["title", "categories", "summary", "description"]
    values = [title, categories, summary, description]
    
    SQL_insert = self.SQL_constructor.GetINSERTIntoTableColumns(table, columns)
    SQL_values = self.SQL_constructor.GetVALUES(values)
    
    SQL = SQL_insert + SQL_values
    
    self.connector.ExecuteSQL(SQL)
    self.connector.CommitTransaction()
    return "Insertion successful."
  
  def _GetTableColumnsAfterIndex(self, table, index=0):
    return [x[0] for x in self.DescribeTable(table)][index:]
  
  
  
  def Prototype(self):
    SQL = ("SELECT title, similarity (title, 'The Dencing Master')"
           "FROM movies"
           "WHERE title % 'The Dencing Master'"
           )
    self.connector.ExecuteSQL(SQL)
    return self.connector.GetAllResults()
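similarity() and the % operator used in Prototype (and in SuggestBasedOnPhraseInColumnInTable above) are provided by the pg_trgm extension; the misspelled 'The Dencing Master' is presumably deliberate, since trigram similarity is exactly what lets such a typo still match. A minimal setup sketch as a hypothetical helper, assuming sufficient database privileges:

  def _EnableTrigramSupport(self):
    # Hypothetical helper: pg_trgm provides similarity() and the % fuzzy-match operator.
    self.connector.ExecuteSQL("CREATE EXTENSION IF NOT EXISTS pg_trgm;")
    # Optional: a trigram index keeps % lookups fast on large tables.
    self.connector.ExecuteSQL(
      "CREATE INDEX IF NOT EXISTS movies_title_trgm_idx "
      "ON movies USING gin (title gin_trgm_ops);")
    self.connector.CommitTransaction()
    return "Trigram support enabled."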
    
    
    
  def DescribeTable(self, table):
    SQL = "SELECT column_name FROM information_schema.columns WHERE table_name = '" + table + "'";
    self.connector.ExecuteSQL(SQL)
    return self.connector.GetAllResults()
  
  
  
  def Close(self):
    self.connector.Close()