def __init__(self,
             prefix="entity",
             window_size=2,
             entities_only=True,
             port=5436,
             log_file=os.path.join(os.path.dirname(__file__),
                                   "logs/SchemaCreator.log"),
             log_level=logging.INFO,
             log_verbose=True):
    """
    Configure logging, the table-name prefix and the Postgres connector.

    :param prefix: (str) Prefix to the table names. The window size is
        appended to it, separated by an underscore.
    :param window_size: (int) Stored on the instance and used as the suffix
        of the table-name prefix.
    :param entities_only: (boolean) Stored on the instance as given.
    :param port: (int) Used to connect to the Postgres tables.
    :param log_file: (os.path) Path to the file containing the logs.
    :param log_level: (logging.LEVEL) Specifies the level to be logged.
    :param log_verbose: (boolean) Forwarded to the logger set-up; presumably
        controls whether messages are echoed to stdout as well.
    """
    self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)

    self.window_size = window_size
    self.entities_only = entities_only

    # Effective prefix has the shape "<prefix>_<window_size>".
    self.prefix = "_".join((prefix, str(self.window_size)))
    self.names = self.get_names(self.prefix)

    self.port = port
    self.pc = PostgresConnector(port=port)
    self.logger.info("Successfully registered SchemaGenerator.")
def get_dates_location(self, locationid):
    """Return the date span covered by a location's most frequent trends.

    The query picks the 15 trends with the most rows for ``locationid`` and
    selects the maximum and minimum ``date`` of those trends at that location.

    :param locationid: value bound to both ``locationid = %s`` placeholders.
    :return: list of ``{"min_date": str, "max_date": str}`` dicts, one per
        result row. The list is empty when the query fails.
    """
    min_max_date_list = []
    try:
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = (
            " select max(date),min(date) from trends where trend in "
            "(select t1.trend as trend from "
            "(select count(*) as c,trend from trends where locationid = %s group by trend)as t1 "
            "order by c desc limit 15) and locationid = %s "
        )
        cursor.execute(query, (locationid, locationid))
        # Column positions follow the select list: max(date) first, then min(date).
        min_date_column = 1
        max_date_column = 0
        for row in cursor:
            min_max_date_list.append({
                "min_date": str(row[min_date_column]),
                "max_date": str(row[max_date_column]),
            })
    except Exception:
        # The formatted traceback used to be built and then discarded, which
        # hid every failure. Report it, but stay best-effort and still
        # return whatever was collected.
        print(traceback.format_exc())
    return min_max_date_list
def get_trends(self, location_id, start_date, end_date):
    """Return the most frequent trends of a location within a date range.

    Only trends whose id appears as ``trendid`` in the ``tweets`` table are
    counted; the 15 trends with the highest row count are returned.

    :param location_id: value bound to ``locationid = %s``.
    :param start_date: lower bound of the ``date between %s and %s`` filter.
    :param end_date: upper bound of the ``date between %s and %s`` filter.
    :return: list of ``{"trend": ..., "count": ...}`` dicts in descending
        count order. The list is empty when the query fails.
    """
    trends_list = []
    try:
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = (
            " select c,trend from "
            "(select count(*) as c,trend from trends "
            "where locationid = %s and date between %s and %s "
            "and id in(select trendid from tweets) group by trend) as t1 "
            "order by c desc limit 15 "
        )
        cursor.execute(query, (location_id, start_date, end_date))
        # Column positions follow the select list: count first, then trend.
        count_column = 0
        trend_column = 1
        for row in cursor:
            trends_list.append({
                "trend": row[trend_column],
                "count": row[count_column],
            })
    except Exception as e:
        # Parenthesised single-argument print behaves identically on
        # Python 2 and also parses on Python 3.
        print(e)
    return trends_list
def create_table(port):
    """Create ``*_neo4j`` copies of two tables and give each row a numeric id.

    ``sentences`` and ``term_occurrence`` are copied into ``sentences_neo4j``
    and ``term_occurrence_neo4j``. Each copy gets an extra ``id`` column that
    is filled from ``row_number()``.

    :param port: port handed to ``PostgresConnector``.
    """
    # Each phase is (progress message, SQL statements executed in order).
    phases = (
        # add sentence index column but in separate table
        ("Starting with ", (
            "CREATE TABLE sentences_neo4j AS TABLE sentences;",
            "ALTER TABLE sentences_neo4j ADD COLUMN id int;",
            "WITH numbered (sid, document_id, sentence_id) AS "
            "(select row_number() OVER() sid, * from sentences_neo4j) "
            "UPDATE sentences_neo4j SET id = numbered.sid FROM numbered "
            "WHERE sentences_neo4j.document_id = numbered.document_id "
            "AND sentences_neo4j.sentence_id = numbered.sentence_id;",
        )),
        # add term_occurrence index
        ("Starting with term occurrences...", (
            "CREATE TABLE term_occurrence_neo4j AS TABLE term_occurrence;",
            "ALTER TABLE term_occurrence_neo4j ADD COLUMN id int;",
            "WITH numbered (sid, document_id, sentence_id, term_id) AS "
            "(select row_number() OVER() sid, * from term_occurrence) "
            "UPDATE term_occurrence_neo4j SET id = numbered.sid FROM numbered "
            "WHERE term_occurrence_neo4j.document_id = numbered.document_id "
            "AND term_occurrence_neo4j.sentence_id = numbered.sentence_id "
            "AND term_occurrence_neo4j.term_id = numbered.term_id;",
        )),
    )

    pc = PostgresConnector(port=port)
    with pc as opc:
        for message, statements in phases:
            print(message)
            for statement in statements:
                pc.cursor.execute(statement)
def train_model():
    """Train a DQN agent on ``DatabaseIndexesEnv``, save its weights, test it.

    The query pull is unpickled from ``../query_pull_1000v3.pkl``. NumPy and
    the environment are both seeded with 123.
    """
    np.random.seed(123)

    pull_path = os.path.join("..", "query_pull_1000v3.pkl")
    with open(pull_path, 'rb') as pull_file:
        query_pull = pickle.load(pull_file)

    # Build the environment directly (no gym registry involved).
    env = DatabaseIndexesEnv(n=COLUMNS_AMOUNT,
                             table_name=table_name,
                             query_pull=query_pull,
                             batch_size=BATCH_SIZE,
                             connector=PostgresConnector(),
                             k=3,
                             max_episodes=1000)
    env.seed(123)

    # Network first, then the agent wrapped around it.
    model = build_model()
    print(model.summary())
    dqn = initialize_agent(model)

    # Training runs without visualisation.
    dqn.fit(env,
            nb_steps=50000,
            visualize=False,
            verbose=0,
            callbacks=[CustomEpisodeLogger()])

    # Persist the final weights, overwriting any previous file.
    weights_file = 'dqn_{}_weights_6_4_2_1_2000_episodes_estimated.h5f'.format(
        ENV_NAME)
    dqn.save_weights(weights_file, overwrite=True)

    # Evaluate the trained agent for 5 episodes.
    dqn.test(env, nb_episodes=5, visualize=False)
def get_data(self, locationid):
    """Group entities by trend for the top trends of a location.

    Joins ``id_entity`` with ``organized_tweets`` on the tweet id, restricted
    to the 15 trends with the most rows in ``trends`` for ``locationid``.

    :param locationid: value bound to ``locationid = %s``.
    :return: dict mapping each trend to the list of entities found for it,
        one list entry per joined row. The dict is empty when the query fails.
    """
    entity_trend_dict = {}
    try:
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = (
            "select t1.entity,t2.trend from "
            "(select id,entity from id_entity ) as t1 "
            "inner join "
            "(select id,trend from organized_tweets where trend in "
            "(select trend from "
            "(select count(*) as c,trend from trends where locationid = %s group by trend)as t_in "
            "order by c desc limit 15))as t2 "
            "on t1.id = t2.id"
        )
        cursor.execute(query, (locationid,))
        entity_column = 0
        trend_column = 1
        for row in cursor:
            # The original accumulated into an undefined name
            # (entity_id_dict). The resulting NameError was swallowed below,
            # so the function always returned {}. Fill the returned dict.
            entity_trend_dict.setdefault(row[trend_column], []).append(
                row[entity_column])
    except Exception:
        print(traceback.format_exc())
    return entity_trend_dict
def build(self):
    """Populate ``id_entity`` with one row per (tweet, distinct hashtag).

    Reads ``id``, ``entities`` and ``trend`` from ``organized_tweets``. The
    ``entities`` value is parsed as a JSON array of hashtag objects. Rows are
    staged as tab-separated lines in ``copy_from.txt``, bulk-loaded with
    ``copy_from``, committed, and the staging file is then deleted.
    """
    staging_path = 'copy_from.txt'

    conn = PostgresConnector().get_connection()
    cursor = conn.cursor()
    cursor.execute('select id,entities,trend from organized_tweets')

    with open(staging_path, 'w') as staging:
        for tweet_id, raw_entities, trend in cursor:
            # De-duplicate hashtags so each (tweet, hashtag) pair is written once.
            distinct_tags = list(
                set([tag["text"] for tag in json.loads(raw_entities)]))
            print('Writing data to table for the tweet_id ' + tweet_id)
            for tag in distinct_tags:
                staging.write(tweet_id + '\t' + tag.encode('utf-8') + '\t' +
                              trend + '\n')

    with open(staging_path) as staging:
        cursor.copy_from(staging, 'id_entity',
                         columns=('id', 'entity', 'trend'))
    conn.commit()
    os.remove(staging_path)
def get_total_documents(self):
    """Return the number of distinct ``id`` values in the ``"IdEntity"`` table.

    :return: the count taken from the last result row, or 0 if the cursor
        yields no rows.
    """
    cursor = PostgresConnector().get_connection().cursor()
    cursor.execute('select count(distinct(id)) from "IdEntity" ')

    total = 0
    for record in cursor:
        total = record[0]
    return total
def get_locations(self):
    """Return every row of the ``location`` table.

    :return: list of ``{"geoid": <id>, "city": <city>}`` dicts.
    """
    cursor = PostgresConnector().get_connection().cursor()
    cursor.execute('SELECT id,city from location')
    # Columns arrive in select-list order: id, city.
    return [{"geoid": record[0], "city": record[1]} for record in cursor]
def build_tf(self):
    """Count, for every entity, how many ``"IdEntity"`` rows reference it.

    :return: dict mapping entity -> ``count(id)`` from a ``group by entity``
        query.
    """
    cursor = PostgresConnector().get_connection().cursor()
    # One row per entity: (count(id), entity).
    cursor.execute('select count(id),entity from "IdEntity" group by entity')
    return {record[1]: record[0] for record in cursor}
def get_tfidf(self, locationid, trend):
    """Return the top-scoring entities for a trend at a location.

    The score is computed entirely in SQL. For each (tweet, entity) pair it
    is ``(1.0 / entities in the tweet) * log(tweets of the trend / tweets
    containing the entity)``, summed per entity. The 100 highest scores are
    returned.

    :param locationid: value bound to every ``location_id = %s`` placeholder.
    :param trend: value bound to every ``trend = %s`` placeholder.
    :return: list of ``{"entity": ..., "tfidf": ...}`` dicts in descending
        score order. The list is empty when the query fails.
    """
    tfidf_list = []
    try:
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        # NOTE(review): the log() argument divides count(*) by count(id).
        # If both are integers PostgreSQL truncates the quotient before the
        # logarithm -- confirm whether a float cast is wanted.
        tfidf_query = (
            " select entity,tf_idf_score from "
            "(select t4.entity,sum(t4.tf_idf) as tf_idf_score from "
            "(select t1.id,t1.entity,t2.count_id,t3.count_entity, "
            "(1.0/t3.count_entity)*log(( select count(*) from organized_tweets "
            "where trend = %s and location_id = %s )/t2.count_id) as tf_idf from "
            "(select id,entity from id_entity where id in "
            "(select id from organized_tweets where trend = %s and location_id = %s)) as t1 "
            "inner join "
            "(select entity,count(id) as count_id from id_entity where id in "
            "(select id from organized_tweets where trend = %s and location_id = %s)group by entity) as t2 "
            "on t1.entity = t2.entity "
            "inner join "
            "(select id,count(entity) as count_entity from id_entity where id in"
            "(select id from organized_tweets where trend = %s and location_id = %s )group by id) as t3 "
            "on t1.id = t3.id) as t4 "
            "group by entity)as t5 "
            "order by tf_idf_score desc limit 100; "
        )
        # The (trend, location) pair is bound four times, once per sub-select.
        cursor.execute(tfidf_query, (trend, locationid) * 4)
        entity_column = 0
        tfidf_column = 1
        for row in cursor:
            tfidf_list.append({
                "entity": row[entity_column],
                "tfidf": row[tfidf_column],
            })
    except Exception:
        print(traceback.format_exc())
    # Return the list on every path. The original fell off the end after an
    # error and handed callers None instead of a list.
    return tfidf_list
def get_tweets(self, trend, entity):
    """Return up to 50 tweet texts that contain the given entity.

    :param trend: accepted but not used by the query.
    :param entity: value bound to ``entity = %s``.
    :return: list of ``{"name": <tweet text>}`` dicts.
    """
    cursor = PostgresConnector().get_connection().cursor()
    query_tweets = (
        " select text from organized_tweets where id in "
        "(select id from id_entity where entity = %s) limit 50 "
    )
    cursor.execute(query_tweets, (entity, ))
    return [{"name": record[0]} for record in cursor]
def test_dqn_against_heuristic(self):
    """Run a saved DQN agent for one episode and print it next to the heuristic.

    Uses the first five queries of the pickled query pull. Prints the agent's
    test results, the final environment state, and the output of
    ``predict_on_workload`` for a randomly drawn workload.
    """
    np.random.seed(123)

    pull_path = path.join("..", "query_pull_1000v3.pkl")
    with open(pull_path, 'rb') as pull_file:
        query_pull = pickle.load(pull_file)[:5]

    # Drawn right after seeding so the sample is repeatable.
    workload = np.random.choice(query_pull, const.BATCH_SIZE)

    env = DatabaseIndexesEnv(n=const.COLUMNS_AMOUNT,
                             table_name=table_name,
                             query_pull=query_pull,
                             batch_size=const.BATCH_SIZE,
                             connector=PostgresConnector(),
                             k=3,
                             max_episodes=1)

    weights_path = path.join("..", "dqn_specific_{}.h5f".format(ENV_NAME))
    agent = load_agent(weights_path)

    outcome = agent.test(env, nb_episodes=1)
    print(outcome)
    print(env.state)
    print(predict_on_workload(workload))
def test_cache(self):
    """Register the index environment, take three steps and print its cache.

    The environment is registered under ``DatabaseIndexesEnv-v0`` with a batch
    size of 2 and a single episode, then stepped with actions 0, 1 and 2.
    """
    # Local import so the block does not depend on the file's import header.
    import os

    np.random.seed(123)
    # The original literal "..\query_pull_1000v3.pkl" contains the invalid
    # escape "\q" and only names the parent directory on Windows. Build the
    # path portably instead.
    with open(os.path.join("..", "query_pull_1000v3.pkl"), 'rb') as f:
        query_pull = pickle.load(f)

    register(
        id='DatabaseIndexesEnv-v0',
        entry_point='dbenv:DatabaseIndexesEnv',
        kwargs={
            'n': const.COLUMNS_AMOUNT,
            'table_name': "test_table",
            'query_pull': query_pull,
            'batch_size': 2,
            'connector': PostgresConnector(),
            'k': 3,
            'max_episodes': 1,
        })
    env = gym.make('DatabaseIndexesEnv-v0')

    for action in (0, 1, 2):
        env.step(action)
    print(env.cache)
def get_sentiments(self):
    """Classify every tweet's sentiment and write the results to a TSV file.

    Reads ``id`` and ``text`` from ``organized_tweets``. Writes one line per
    tweet to ``sentiments.tsv`` with the tab-separated fields id,
    classification, positive probability and negative probability.
    """
    conn = PostgresConnector().get_connection()
    cursor = conn.cursor()
    query = " select id,text from organized_tweets "
    cursor.execute(query)
    id_column = 0
    text_column = 1

    # Build the analyzer once and share it across all blobs. The original
    # created a fresh NaiveBayesAnalyzer per row, and each new instance has
    # to train its classifier again before it can analyse anything.
    analyzer = NaiveBayesAnalyzer()

    with open("sentiments.tsv", "w") as f:
        for row in cursor:
            tweet_id = str(row[id_column])
            blob = TextBlob(row[text_column], analyzer=analyzer)
            print('writing for tweet with id ' + tweet_id)
            sentiment = blob.sentiment
            f.write('\t'.join((
                tweet_id,
                str(sentiment.classification),
                str(sentiment.p_pos),
                str(sentiment.p_neg),
            )) + '\n')
def train_model():
    """Train a DQN agent on a gym-registered index environment.

    Only the first five queries of ``query_pull_1000v2.pkl`` are used. The
    environment is registered under ``ENV_NAME``. The final weights are saved
    to ``dqn_specific_<ENV_NAME>.h5f`` and the agent is tested for 5 episodes.
    """
    np.random.seed(123)

    with open("query_pull_1000v2.pkl", 'rb') as pull_file:
        query_pull = pickle.load(pull_file)[:5]

    env_kwargs = {
        'n': COLUMNS_AMOUNT,
        'table_name': table_name,
        'query_pull': query_pull,
        'batch_size': BATCH_SIZE,
        'connector': PostgresConnector(),
        'k': 3,
        'max_episodes': episodes,
    }
    register(id=ENV_NAME,
             entry_point='dbenv:DatabaseIndexesEnv',
             kwargs=env_kwargs)

    # Instantiate the registered environment and make it repeatable.
    env = gym.make(ENV_NAME)
    env.seed(123)

    # Network first, then the agent wrapped around it.
    model = build_model()
    print(model.summary())
    dqn = initialize_agent(model)

    # Training runs without visualisation.
    dqn.fit(env, nb_steps=episodes, visualize=False, verbose=2)

    # Persist the final weights, overwriting any previous file.
    dqn.save_weights('dqn_specific_{}.h5f'.format(ENV_NAME), overwrite=True)

    # Evaluate the trained agent for 5 episodes.
    dqn.test(env, nb_episodes=5, visualize=False)
def build(self):
    """Build an inverted index from hashtag text to tweet ids.

    Reads ``id`` and ``hashtags`` from ``"organizedTweets"``. Each hashtag
    entry is indexed by its ``'text'`` key.

    :return: dict mapping hashtag text -> list of tweet ids, one entry per
        occurrence.
    """
    cursor = PostgresConnector().get_connection().cursor()
    cursor.execute('select id,hashtags from "organizedTweets" ')

    index = {}
    for record in cursor:
        tweet_id, hashtags = record[0], record[1]
        for text in [tag['text'] for tag in hashtags]:
            index.setdefault(text, []).append(tweet_id)
    return index
def update_organized_tweets(self):
    """Flatten raw tweet JSON into the ``organized_tweets`` table.

    For every location, the trends matching that location's 80 most frequent
    trend names are selected. For each trend, the tweet arrays stored in
    ``tweets`` are flattened into tab-separated lines in ``<trend>.txt``.
    That file is bulk-loaded with ``copy_from``, committed and deleted.
    A tweet id is written only the first time it is seen during the run.

    Any exception aborts the run; its traceback is printed, not raised.
    """
    # Ids already written during this run (de-duplicates across trends).
    seen_tweet_ids = set()
    try:
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query_location = 'select id from location'
        cursor.execute(query_location)
        location_column = 0
        for row_location in cursor:
            query = (
                " select id,trend from trends where trend in"
                "(select trend from "
                "(select count(*) as c,trend from trends where locationid = %s group by trend)as t1 "
                "order by c desc limit 80) "
            )
            # Separate cursor, so the outer iteration over locations is not
            # disturbed by the per-location query.
            trend_cursor = conn.cursor()
            location_id = row_location[location_column]
            trend_cursor.execute(query, (location_id, ))
            trend_id_column = 0
            trend_name_column = 1
            trend_count = 0
            for row in trend_cursor:
                trend_count = trend_count + 1
                trend_id = row[trend_id_column]
                trend_name = row[trend_name_column]
                print('Processing for trend ' + trend_id + ' , ' +
                      str(trend_count))
                # Bind the trend id as a parameter instead of splicing it
                # into the SQL text between hand-written quotes.
                query_tweets = 'select tweets from tweets where trendId = %s'
                cursor_tweets = conn.cursor()
                cursor_tweets.execute(query_tweets, (str(trend_id), ))
                tweets_column = 0
                with open(trend_name + '.txt', 'w') as f:
                    # rows of tweets array
                    for tweets_row in cursor_tweets:
                        tweets_json_array = tweets_row[tweets_column]
                        # tweets in a tweets array
                        for json_in in tweets_json_array:
                            tweet_id = json_in['id']
                            if tweet_id in seen_tweet_ids:
                                continue
                            seen_tweet_ids.add(tweet_id)

                            # geo and place are always stored as 'none'.
                            # The keys are still read, so a tweet lacking
                            # them fails exactly as before.
                            geo = 'none' if json_in['geo'] is None else 'none'
                            retweeted = json_in['retweeted']
                            in_reply_to_screen_name = (
                                'none'
                                if json_in['in_reply_to_screen_name'] is None
                                else json_in['in_reply_to_screen_name'])
                            truncated = ('none'
                                         if json_in['truncated'] is None
                                         else json_in['truncated'])
                            source = json_in['source']
                            created_at = json_in['created_at']
                            place = ('none'
                                     if json_in['place'] is None else 'none')
                            user_id = json_in['user']['id']

                            # Normalise the text so it fits on one TSV line:
                            # printable characters only, whitespace runs
                            # collapsed, backslashes removed.
                            text = json_in['text'].strip()
                            text = str(
                                filter(lambda x: x in string.printable, text))
                            text = re.sub(r'\s+', ' ', text)
                            text = text.replace('\\', '')

                            entities = json_in['entities']['hashtags']
                            # The key is read, but mentions are always
                            # stored as an empty list.
                            user_mentions = json_in['entities'][
                                'user_mentions']
                            user_mentions = []
                            retweet_count = json_in['retweet_count']
                            favorite_count = json_in['favorite_count']

                            f.write(
                                str(tweet_id) + '\t' + str(geo) + '\t' +
                                str(retweeted) + '\t' +
                                str(in_reply_to_screen_name.encode('utf-8')) +
                                '\t' + str(truncated) + '\t' +
                                str(source.encode('utf-8')) + '\t' +
                                str(created_at.encode('utf-8')) + '\t' +
                                str(place) + '\t' + str(user_id) + '\t' +
                                text + '\t' + str(json.dumps(entities)) +
                                '\t' + str(user_mentions) + '\t' +
                                str(retweet_count) + '\t' +
                                str(favorite_count) + '\t' +
                                str(trend_name) + '\t' + str(location_id) +
                                '\n')

                print('Writing to table')
                with open(trend_name + '.txt') as f:
                    cursor_write = conn.cursor()
                    cursor_write.copy_from(
                        f,
                        'organized_tweets',
                        columns=('id', 'geo', 'retweeted',
                                 'in_reply_to_screen_name', 'truncated',
                                 'source', 'created_at', 'place', 'user_id',
                                 'text', 'entities', 'user_mentions',
                                 'retweet_count', 'favorite_count', 'trend',
                                 'location_id'))
                conn.commit()
                os.remove(trend_name + '.txt')
    except Exception:
        print(traceback.format_exc())
def __init__(self,
             window_size=2,
             limit_edges=False,
             entities_only=False,
             document_table_name="documents",
             sentence_table_name="sentences",
             entity_table_name="entities",
             term_table_name="terms",
             term_occurrence_table_name="term_occurrence",
             hyperedge_table_name="hyperedges",
             hyperedge_format=("edge_id", "term_id", "pos"),
             hyperedge_document_table_name="hyperedge_document",
             hyperedge_document_format=("edge_id", "document_id"),
             hyperedge_sentence_table_name="hyperedge_sentences",
             hyperedge_sentence_format=("edge_id", "document_id",
                                        "sentence_id", "pos"),
             database="postgres",
             user="******",
             password="******",
             host="127.0.0.1",
             port=5435,
             log_file=os.path.join(os.path.dirname(__file__),
                                   "logs/HyperedgeGenerator.log"),
             log_level=logging.INFO,
             log_verbose=True):
    """
    Initializes hyper edge generator class.

    :param window_size: (int) Number of sentences in each direction that will determine the context window size
        of the algorithm.
    :param limit_edges: (boolean) Experimental: Should limit the maximum number of terms per hyperedge. This would
        only be useful in context with other theoretical results.
    :param entities_only: (boolean) Indicating whether or not we should only take into account entity terms, and
        not the entirety of all term occurrences for the edges.
    :param document_table_name: (str) Name of the table where documents are stored.
    :param sentence_table_name: (str) Name of the table containing the sentences and their content.
    :param entity_table_name: (str) Name of the table containing the entity information and their properties.
    :param term_table_name: (str) Name of the table containing the terms and meta data.
    :param term_occurrence_table_name: (str) Name of the table containing term occurrence data.
    :param hyperedge_table_name: (str) Name of the table containing the general hyper edge information.
    :param hyperedge_format: (iterable of str) Column names of the hyper edge table; joined with ", ".
    :param hyperedge_document_table_name: (str) Name of the table containing the document classification.
    :param hyperedge_document_format: (iterable of str) Column names of the hyper edge document table;
        joined with ", ".
    :param hyperedge_sentence_table_name: (str) Name of the table containing the hyper edge sentence data.
    :param hyperedge_sentence_format: (iterable of str) Column names of the hyper edge sentence table;
        joined with "," (no space).
    :param database: (str) database name.
    :param user: (str) User name to get access to the Postgres database.
    :param password: (str) Corresponding user password.
    :param host: (IP) IP address (in string format) for the host of the postgres database.
    :param port: (integer) Port at which to access the database.
    :param log_file: (os.path) Path to the file containing the logs.
    :param log_level: (logging.LEVEL) Specifies the level to be logged.
    :param log_verbose: (boolean) Forwarded to the logger set-up; presumably controls logging to stdout as well.
    """
    self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
    self.logger.info(
        "Successfully registered logger to HyperedgeGenerator.")

    # important for hyperedges
    self.window_size = window_size
    self.limit_edges = limit_edges
    self.entities_only = entities_only

    # table names
    self.document_table_name = document_table_name
    self.sentence_table_name = sentence_table_name
    self.entity_table_name = entity_table_name
    self.term_table_name = term_table_name
    self.term_occurrence_table_name = term_occurrence_table_name
    self.hyperedge_table_name = hyperedge_table_name
    self.hyperedge_document_table_name = hyperedge_document_table_name
    self.hyperedge_sentence_table_name = hyperedge_sentence_table_name

    # Column lists as SQL fragments. The sentence format is joined
    # without a space, exactly as before.
    self.hyperedge_format = ", ".join(hyperedge_format)
    self.hyperedge_document_format = ", ".join(hyperedge_document_format)
    self.hyperedge_sentence_format = ",".join(hyperedge_sentence_format)

    self.pc = PostgresConnector(database, user, password, host, port)
    self.logger.info(
        "Successfully registered PostgresConnector to HyperedgeGenerator.")

    # Working buffers; they start out empty.
    self.hyperedge = []
    self.hyperedge_sentence = []
    self.hyperedge_document = []
    self.all_hyperedges = []
    self.all_hyperedge_sentences = []

    # set up the "hyper edge ID counter", which is simply consecutive from 1.
    # Default for the case that the hyperedge table does not exist.
    self.hyperedge_ID = 1
    with self.pc as open_pc:
        if not check_table_existence(self.logger, open_pc,
                                     self.hyperedge_table_name):
            # __init__ must return None. The original "return 0" raised a
            # TypeError instead of bailing out. Keep the default counter
            # and stop here.
            return

        self.logger.info("Retrieving current hyper edge ID key...")
        open_pc.cursor.execute(
            "SELECT COUNT(DISTINCT h.edge_id) FROM {} as h".format(
                self.hyperedge_table_name))
        # either start with 1 or continue from the number of stored edges
        # NOTE(review): this is COUNT(DISTINCT edge_id), not MAX(edge_id).
        # The two differ if edge ids have gaps -- confirm which is intended.
        self.hyperedge_ID = max(1, open_pc.cursor.fetchone()[0])
# assert Q_table_c.Q_table[state_to_int(state)], "This state has no corresponding action: %r" % state_to_int(state) max_reward = float('-inf') int_state = state_to_int(state) actions_rewards_dict = Q_table_c.Q_table[int_state] for key, val in actions_rewards_dict.items(): if val > max_reward: max_reward = val max_key = key return max_key, max_reward register( id='DatabaseIndexesEnv-v0', entry_point='dbenvm:DatabaseIndexesEnv', kwargs={'n': len(table_column_names), 'table_name': table_name, 'query_batch': list(), 'connector': PostgresConnector(), 'k': 3} ) def get_indexes_qagent(index_amount, queries, Log=False): connector = PostgresConnector() if not table_exists(connector, table_name): create_table_2(connector) load_table(connector) # make results repeatable np.random.seed(123) # gym configuration query_batch = list() env = gym.make('DatabaseIndexesEnv-v0')
def get_indexes_qagent(index_amount, queries, Log=False):
    """Choose index columns for a query workload with tabular Q-learning.

    Makes sure the benchmark table exists, builds one workload from the first
    ``num_queries_batch`` entries of ``queries`` and runs ``NUM_EPISODES``
    episodes of three steps each against the ``DatabaseIndexesEnv-v0`` gym
    environment. The Q table lives in ``Q_table_c.Q_table``.

    :param index_amount: passed to ``env.set_indices_num``.
    :param queries: sequence of dicts; each entry's ``'query'`` and
        ``'sf_array'`` keys are read.
    :param Log: when true, progress and per-episode summaries are printed.
    :return: the list of actions taken in the last episode.
    """
    connector = PostgresConnector()
    if not table_exists(connector, table_name):
        create_table_2(connector)
        load_table(connector)
    # make results repeatable
    np.random.seed(123)
    # gym configuration
    query_batch = list()
    env = gym.make('DatabaseIndexesEnv-v0')
    env.set_indices_num(index_amount)
    current_query_idx = 0
    query_batch = list()
    for workload in range(1):
        exploration_rate = 1.0  # represents the exploration rate to be decayed by the time
        initial_lr = 1.0  # Learning rate
        query_batch = list()
        # the Q table is reset for every workload
        Q_table_c.Q_table = {}
        query_batch = list()
        workload_selectivity_l = list()
        # 1. generate the queries per workload
        # 2. generate the cumulative selectivity per workload
        start = timer()
        for i in range(current_query_idx, current_query_idx + num_queries_batch):
            query_batch.append(queries[i]['query'])
            workload_selectivity_l.append(list(map(lambda x: x, queries[i]['sf_array'])))
        current_query_idx += num_queries_batch
        # element-wise product of the per-query sf_array values (axis 0 = queries)
        workload_selectivity = np.prod(workload_selectivity_l, axis=0).tolist()
        max_workload_selectivity = max(workload_selectivity)
        env.set_query_batch(query_batch)
        actions_taken = list()
        # as a heuristic: the indices with the lowest selectivity
        selectivity_indices = heapq.nsmallest(3, range(len(workload_selectivity)), workload_selectivity.__getitem__)
        if Log:
            print("Entering the q learning ..... the process can take time.")
            print(workload_selectivity)
        env.clear_cache()
        for episode in range(NUM_EPISODES):
            state = env.reset()
            actions_taken = list()
            # decay the exploration as the number of episodes grows, the Q table becomes more mature
            eps = exploration_rate / np.sqrt(episode + 1)
            eps = max(eps, min_exp_rate)
            episode_total_reward = 0
            episode_total_qreward = 0
            episode_strategy = []
            # learning rate: multiplied by 0.85 every 100 episodes, never below min_lr
            eta = max(min_lr, initial_lr * (0.85 ** (episode // 100)))
            ## now the learning comes
            for kk in range(3):
                # do exploration, i.e., choose a random actions
                # make sure the last step is exploitation unless the state is new
                if episode == 0:
                    # first episode: follow the lowest-selectivity heuristic
                    episode_strategy.append("explore")
                    action = selectivity_indices[kk]
                    Q_table_c.Q_table[state_to_int(state)] = {}
                    Q_table_c.Q_table[state_to_int(state)][action] = 0
                elif (is_new_state(state) or (np.random.uniform(0, 1) < eps)) and episode != NUM_EPISODES - 1:
                    episode_strategy.append("explore")
                    # generate only actions that matches something with selectivity.
                    action = env.action_space.sample()
                    # high selectivity, not a good option for an index
                    # NOTE(review): resamples until an action below the maximum selectivity is drawn;
                    # looks like this would not terminate if all selectivities were equal -- confirm.
                    while workload_selectivity[action] >= max_workload_selectivity:
                        action = env.action_space.sample()
                    if is_new_state(state):
                        Q_table_c.Q_table[state_to_int(state)] = {}
                    if action not in Q_table_c.Q_table[state_to_int(state)]:
                        Q_table_c.Q_table[state_to_int(state)][action] = 0
                else:
                    # else exploit choose the maximum value from the Q table
                    episode_strategy.append("exploit")
                    action = get_action_maximum_reward(state)[0]
                actions_taken.append(action)
                # remember the key of the state we act from before stepping
                state_old_int = state_to_int(state)
                state_new, reward, done, _ = env.step(action)
                episode_total_reward += reward
                next_action = 0
                next_action_q_value = 0
                if is_new_state(state_new):
                    # unseen successor state: its value estimate is taken as 0
                    next_action = env.action_space.sample()
                    while (action == next_action or workload_selectivity[next_action] >= max_workload_selectivity):
                        next_action = env.action_space.sample()
                    next_action_q_value = 0
                else:
                    next_action, next_action_q_value = get_action_maximum_reward(state_new)
                # move Q(s, a) towards reward + GAMMA * Q(s', a') by step size eta
                Q_table_c.Q_table[state_old_int][action] += eta * (reward + GAMMA * next_action_q_value - Q_table_c.Q_table[state_old_int][action])
                episode_total_qreward += Q_table_c.Q_table[state_old_int][action]
                state, action = state_new, next_action
            actions_taken_s = ','.join(str(e) for e in actions_taken)
            if Log:
                print(
                    "episode num = '{0}', episode_total_immediate_rewards = '{1}', episode_total_reward = '{2}', current_state = '{3}', actions_taken = '{4}', strategy = {5}"
                    .format(episode, float(episode_total_reward), float(episode_total_qreward), state_to_string(state), actions_taken_s, episode_strategy))
    return actions_taken
result = [] print("Number of empty hyperedges:") for i, prefix in enumerate(prefixes): with pc as open_pc: table = prefix + "hyperedges" open_pc.cursor.execute( "SELECT (SELECT MAX(edge_id) from {}) - " "(SELECT count(distinct edge_id) from {}) as diff".format( table, table)) result.append(open_pc.cursor.fetchall()[0][0]) print("Results for {}: {}".format(table, result[i])) if __name__ == "__main__": prefixes = ["", "entity_"] pc = PostgresConnector(port=5435) print() get_document_table_length(prefixes, pc) print() get_sentence_table_length(prefixes, pc) print() get_hyperedge_table_length(prefixes, pc) print() analyze_edge_size(prefixes, pc) print() analyze_term_frequency(prefixes, pc) print() get_number_of_empty_edges(prefixes, pc)
def __init__(self):
    """Create the Postgres connector and the SQL constructor.

    The connector is built from ``CONFIG_FILE_NAME`` and
    ``CONFIG_SECTION_NAME``.
    """
    connector = PostgresConnector(CONFIG_FILE_NAME, CONFIG_SECTION_NAME)
    self.connector = connector
    sql_constructor = PostgresSQLConstructor()
    self.SQL_constructor = sql_constructor
def __init__(self,
             num_distinct_documents=5000,
             replace_entities=True,
             max_term_length=127,
             remove_stopwords=True,
             custom_stopwords=[
                 ',', '.', '-', '\xa0', '“', '”', '"', '\n', '—', ':', '?',
                 'I', '(', ')'
             ],
             analyze=False,
             document_tabe_name="documents",
             sentence_table_name="sentences",
             sentence_fields=OrderedDict({
                 "doc_id": "document_id",
                 "sen_id": "sentence_id",
                 "content": "sentence_text"
             }),
             term_table_name="terms",
             term_sql_format=("term_id", "term_text", "is_entity"),
             term_occurrence_table_name="term_occurrence",
             term_occurrence_sql_format=("document_id", "sentence_id",
                                         "term_id"),
             entity_table_name="entities",
             entity_sql_format=("entity_id", "entity_type"),
             database="postgres",
             user="******",
             password="******",
             host="127.0.0.1",
             port=5435,
             log_file=os.path.join(os.path.dirname(__file__),
                                   "logs/TermGenerator.log"),
             log_level=logging.INFO,
             log_verbose=True):
    """
    Initializes various parameters, registers logger, MongoConnector and PostgresConnector, and sets up the limit.

    The mutable defaults (``custom_stopwords``, ``sentence_fields``) are only read here, but
    ``sentence_fields`` is stored by reference, so callers must not mutate ``self.sentence_fields``
    in place when relying on the default.

    :param num_distinct_documents: (int) The number of distinct documents retrieved from the queries.
        For performance reasons, this should be limited during debugging/development.
        0 (Zero) represents no limit, in accordance with the MongoDB standard for .limit().
    :param replace_entities: (boolean) Whether or not the entities in the text should be replaced/recognised.
        The reason for this is that single terms might be merged together to one term, i.e. first and last name:
        "Dennis" "Aumiller" would be two separate terms in the traditional splitting (replace_entities=False),
        whereas - if set to true - "Dennis Aumiller" would represent only one entity.
    :param max_term_length: (int) Indicator of how long the terms are supposed to be (varchar property in table).
    :param remove_stopwords: (boolean) Determines whether or not stop words are removed. The stop word set is the
        union of SpaCy's and NLTK's English stop word lists plus the custom stop words.
    :param custom_stopwords: (list of strings) Additional words that will not be considered at adding-time.
    :param analyze: (boolean) Whether or not to include analytically relevant metrics.
    :param document_tabe_name: (str) Name of the table where the document information is stored.
    :param sentence_table_name: (str) Name of the table where the sentence information will be stored.
    :param sentence_fields: (OrderedDict) Structure of input to output values from MongoDB to postgres for the
        sentence table and its fields.
    :param term_table_name: (str) Name of the Postgres tables for the terms.
    :param term_sql_format: (tuple) Since those are generated locally, only a tuple of the PostgresColumns suffices.
    :param term_occurrence_table_name: (str) Name of the Postgres table for the term occurrences
    :param term_occurrence_sql_format: (tuple) Same as term_sql_format, but for the term occurrences.
    :param entity_table_name: (str) (Not implemented yet) Name of the table for the entity meta information.
    :param entity_sql_format: (tuple) Same as term_sql_format, but for entities.
    :param database: (str) database name.
    :param user: (str) User name to get access to the Postgres database.
    :param password: (str) Corresponding user password.
    :param host: (IP) IP address (in string format) for the host of the postgres database.
    :param port: (integer) Port at which to access the database.
    :param log_file: (os.path) Path to the file containing the logs.
    :param log_level: (logging.LEVEL) Specifies the level to be logged.
    :param log_verbose: (boolean) Forwarded to the logger set-up; presumably controls logging to stdout as well.
    """
    # set up logger
    self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
    self.logger.info("Successfully registered logger to TermGenerator.")

    # register a MongoConnector
    self.mc = MongoConnector()
    self.logger.info(
        "Successfully registered MongoConnector to TermGenerator.")

    # PostgresConnector
    self.pc = PostgresConnector(database, user, password, host, port)
    # The original message named "DocumentGenerator" (copy-paste slip),
    # which made the log point at the wrong component.
    self.logger.info(
        "Successfully registered PostgresConnector to TermGenerator.")

    self.num_distinct_documents = num_distinct_documents
    # do this earlier since we need it already for the distinct documents.
    self.document_table_name = document_tabe_name

    # get the distinct IDs for the documents so we can match against them later
    # since we have removed parts of the document collection, we have to make sure to get this from Postgres.
    self.logger.info("Parsing relevant documents from Postgres...")
    with self.pc as open_pc:
        open_pc.cursor.execute("SELECT document_id FROM {}".format(
            self.document_table_name))
        self.first_distinct_documents = list(open_pc.cursor.fetchall())
        # extract from the tuple structure
        self.first_distinct_documents = [
            el[0] for el in self.first_distinct_documents
        ]
        self.logger.info("Retrieved all relevant documents from Postgres.")

    # additionally restrict if we want only a number of documents.
    if self.num_distinct_documents != 0:
        self.logger.info(
            "Non-zero limit detected. Limiting to the first N entries.")
        self.first_distinct_documents = self.first_distinct_documents[
            :self.num_distinct_documents]

    self.replace_entities = replace_entities
    self.analyze = analyze
    self.max_term_length = max_term_length
    self.nlp = spacy.load("en")

    # construct dictionary with the entries per document/sentence id pair. Thus, we can later check whether
    # there are any entities in the current sentence with higher efficiency.
    self.occurrence_dict = {}
    self.occurring_entities = []

    # start building the term dictionary/set, as well as an occurrence map. Since terms will be "post-processed",
    # it is first created as a list and later cast to Counter and set.
    self.terms = []  # cast into a set later on.
    self.term_in_sentence = set()
    self.term_id = {}
    self.term_is_entity = {}
    if self.analyze:
        self.term_count = Counter()
        self.entity_count = Counter()

    self.entities = []
    self.sentences = []
    self.processed_sentences = []

    # Postgres tables. Missing field specifications are only logged, as
    # before; they do not abort initialisation.
    if not sentence_fields:
        self.logger.error("No sentence fields specified!")
    self.sentence_table_name = sentence_table_name
    self.sentence_fields = sentence_fields

    if not term_sql_format:
        self.logger.error("No term fields specified!")
    self.term_table_name = term_table_name
    self.term_sql_format = ", ".join(term_sql_format)

    if not term_occurrence_sql_format:
        self.logger.error("No term occurrence fields specified!")
    self.term_occurrence_table_name = term_occurrence_table_name
    self.term_occurrence_sql_format = ", ".join(term_occurrence_sql_format)

    if not entity_sql_format:
        self.logger.error("No entity fields specified!")
    self.entity_table_name = entity_table_name
    self.entity_sql_format = ", ".join(entity_sql_format)

    # value retrieving parse: request every configured sentence field ...
    self.sentence_values_to_retrieve = {
        key: 1 for key in self.sentence_fields
    }
    # ... and suppress _id if not present:
    self.sentence_values_to_retrieve.setdefault("_id", 0)
    self.sentence_sql_format = ", ".join(self.sentence_fields.values())

    # create union of stop words, and add potentially custom stop words
    self.remove_stopwords = remove_stopwords
    self.removed_counter = 0
    self.stopwords = STOP_WORDS.union(set(stopwords.words("english")))
    # add custom stopwords.
    self.stopwords.update(custom_stopwords)

    self.logger.info("Successfully initialized TermGenerator.")
def run_qlearning():
    """
    Create a PostgresConnector and generate the query pull stored under '.query_pull'.

    NOTE(review): only these two setup statements are visible here and neither local is
    used afterwards, so the rest of the routine presumably follows -- confirm against the
    complete file before relying on this description.
    """
    # Connector built with its default settings; it is handed to generate_query_pull below.
    connector = PostgresConnector()
    # queries_amount, table_column_types, table_column_names and table_name are not defined
    # in this function and must come from an enclosing scope.
    # NOTE(review): [4, 6] is presumably the numbers of columns participating in the
    # generated queries -- confirm against generate_query_pull.
    query_pull = generate_query_pull('.query_pull', queries_amount, [4, 6],
                                     table_column_types, table_column_names,
                                     table_name, connector)
def __test_results(self, columns_participating):
    """
    Benchmark several index-selection methods against each other and print the results.

    Every round generates a fresh query pull, lets each method ('heuristic', 'qlearning',
    'dqn', 'supervised', 'random') choose indexes, and records the summed estimated
    execution time of the pull with those indexes in place. After each round the pairwise
    t-test p-values and the running mean per method are printed; the final means are
    printed once more after the last round.

    :param columns_participating: Forwarded to generate_query_pull and embedded in the
        name of the query pull file.
    :return: None. Results are only printed.
    """
    # Local import: `np.warnings` was merely numpy re-exporting the stdlib module and is
    # not available on current NumPy releases.
    import warnings

    def get_execution_time_for_indexes_configuration(indexes):
        """Create ``indexes``, sum the estimated time of all queries, then drop the indexes."""
        for index in indexes:
            add_index(connector, index, table_name)
        total_time = sum(
            get_estimated_execution_time(connector, query['query'])
            for query in queries)
        drop_indexes(connector, table_name)
        return total_time

    def add_execution_time_for_method_and_indexes_configuration(method, indexes):
        """Measure ``indexes`` and append the result to the timings recorded for ``method``."""
        # Measure first so that a failing measurement does not leave an empty entry behind.
        elapsed = get_execution_time_for_indexes_configuration(indexes)
        methods.setdefault(method, []).append(elapsed)

    def get_indexes_dqn():
        """Test the loaded DQN agent for one episode and return the truthy state positions."""
        env = DatabaseIndexesEnv(n=const.COLUMNS_AMOUNT,
                                 table_name=table_name,
                                 query_pull=queries,
                                 batch_size=const.BATCH_SIZE,
                                 connector=connector,
                                 k=3,
                                 max_episodes=1)
        dqn = load_agent(
            path.join(
                "..",
                "dqn_{}_weights_6_4_2_1_50000_episodes_estimated.h5f".format(ENV_NAME)))
        dqn.test(env, nb_episodes=1)
        return [position for position, flag in enumerate(env.state) if flag]

    connector = PostgresConnector()
    # Start from a clean table so earlier runs cannot influence the estimates.
    drop_indexes(connector, table_name)
    methods = {}
    i = 0
    warnings.filterwarnings('ignore')
    while True:
        queries = generate_query_pull(
            '../.test_query_pull_' + str(columns_participating) + '_' + str(i),
            self.__queries_amount, columns_participating, table_column_types,
            table_column_names, table_name, connector)
        i += 1

        # heuristic: the columns with the smallest summed 'sf_array' value.
        sf_array = np.array([query['sf_array'] for query in queries]).sum(axis=0)
        ranked_columns = sorted(enumerate(sf_array), key=lambda x: x[1])
        # The loop variable is deliberately not called `i`, so the round counter can
        # never be shadowed or overwritten.
        indexes_to_add = [
            column for column, _ in ranked_columns[:self.__index_amount]
        ]
        add_execution_time_for_method_and_indexes_configuration(
            'heuristic', indexes_to_add)

        indexes_to_add = get_indexes_qagent(self.__index_amount, queries, True)
        add_execution_time_for_method_and_indexes_configuration(
            'qlearning', indexes_to_add)
        # extra clean up to make sure no indices are left from the agent
        drop_indexes(connector, table_name)

        # dqn
        indexes_to_add = get_indexes_dqn()
        drop_indexes(connector, table_name)
        add_execution_time_for_method_and_indexes_configuration(
            'dqn', indexes_to_add)
        drop_indexes(connector, table_name)

        indexes_to_add = get_indexes_supervised(self.__index_amount, queries)
        add_execution_time_for_method_and_indexes_configuration(
            'supervised', indexes_to_add)
        drop_indexes(connector, table_name)

        indexes_to_add = random.sample(range(COLUMNS_AMOUNT),
                                       self.__index_amount)
        add_execution_time_for_method_and_indexes_configuration(
            'random', indexes_to_add)

        # Pairwise t-tests between the timing samples of all methods.
        p_values = [
            stats.ttest_ind(first, second)[1]
            for first, second in itertools.combinations(methods.values(), 2)
        ]
        print(p_values)
        # NOTE(review): the original condition was
        #   all(p_value < 0.01 for p_value in p_values) and i >= 5 or i >= 5
        # which reduces to `i >= 5`, so the significance check never influences when the
        # loop stops. The behaviour is kept; if significance was meant to allow an earlier
        # (or required for any) stop, the thresholds need to be revisited.
        if i >= 5:
            break
        print('try #' + str(i))
        for method, times in methods.items():
            print('{}: {}'.format(method, np.mean(times)))
        print('')
    for method, times in methods.items():
        print('{}: {}'.format(method, np.mean(times)))
import psycopg2 as db
import csv
import os
import sys

# Extend the import path with ../lib/ before importing the two project modules below.
sys.path.append(os.path.abspath("../lib/"))
from query_helper import comm_helper
from PostgresConnector import PostgresConnector

# ports = list(range(5435, 5440))
port = 5436
windows = [5, 10]
# NOTE(review): the positional arguments look like database/user, password, host and
# port -- confirm against comm_helper's signature.
t = comm_helper("postgres", "", "127.0.0.1", str(port))
pc = PostgresConnector(port=port)


def query_and_write(filename, query, header):
    """
    Execute ``query`` and write ``header`` as the first row of the CSV file ``filename``.

    An already existing file with that name is removed before the query is executed.

    NOTE(review): the fetch loop at the end is cut off in this view -- as shown it only
    fetches batches and never writes them or terminates. Confirm against the complete
    file.

    :param filename: Path of the CSV file to (re)create.
    :param query: SQL text passed to the cursor's execute().
    :param header: Row written first through csv.writer.
    """
    # The module-level connector is used as a context manager; the object it yields
    # exposes the cursor used below.
    with pc as opc:
        print("Start querying table {}".format(filename))
        if os.path.isfile(filename):
            os.remove(filename)
        opc.cursor.execute(query)
        # This only happens for documents
        print("Start writing table {}.".format(filename))
        with open(filename, 'w') as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(header)
            while True:
                # Pull the result set in batches of 65536 rows.
                data = opc.cursor.fetchmany(65536)
def __init__(self,
             fields=None,
             num_distinct_documents=0,
             document_table_name="documents",
             database="postgres",
             user="******",
             password="******",
             host="127.0.0.1",
             port=5435,
             log_file=os.path.join(os.path.dirname(__file__),
                                   "logs/DocumentGenerator.log"),
             log_level=logging.INFO,
             log_verbose=True):
    """
    Initializes context, and sets up documents that will be parsed.
    Also establishes the PostgresConnector that will later be used to push the retrieved documents.
    :param fields: (OrderedDict) Key-value pairs that indicate a mapping of fields that should be retrieved (key),
    and the respective field it should be called in the SQL table. Ordered because SQL tables are.
    If None (the default), the mapping _id -> document_id, title, feedName, category, feedURL, published
    is used, in exactly that order.
    :param num_distinct_documents: (int) As the name indicates, the number of distinct articles that should be used.
    Mainly for debugging purposes. 0 means all documents will be used, in accordance with MongoDB standards.
    :param document_table_name: (str) Name of the Postgres table that should contain the documents
    :param database: (str) database name.
    :param user: (str) User name to get access to the Postgres database.
    :param password: (str) Corresponding user password.
    :param host: (IP) IP address (in string format) for the host of the postgres database.
    :param port: (integer) Port at which to access the database.
    :param log_file: (os.path) Path to the file containing the logs.
    :param log_level: (logging.LEVEL) Specifies the level to be logged.
    :param log_verbose: (boolean) Specifies whether or not to log to stdout as well.
    """
    # set up logger
    self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
    self.logger.info("Successfully registered logger to DocumentGenerator.")

    # register a MongoConnector
    self.mc = MongoConnector()
    self.logger.info(
        "Successfully registered MongoConnector to DocumentGenerator.")

    self.num_distinct_documents = num_distinct_documents
    # get the distinct IDs for the documents so we can match against them later
    if self.num_distinct_documents != 0:
        self.logger.info(
            "Non-zero limit detected. Fetching first N distinct document IDs now..."
        )
        with self.mc as open_mc:
            documents = open_mc.client[open_mc.news].articles
            # for small enough number, and large enough document collection, fetching the
            # limited documents and extracting the IDs locally is more efficient.
            self.first_documents = [
                el["_id"]
                for el in documents.find().limit(self.num_distinct_documents)
            ]
        self.logger.info("Successfully registered relevant document IDs.")
    else:
        # needed to avoid later conflicts
        self.first_documents = []

    # set up PostgresConnector. Since we only use these once, I don't see any reason to store the connection
    # details locally again.
    self.pc = PostgresConnector(database, user, password, host, port)
    self.logger.info(
        "Successfully registered PostgresConnector to DocumentGenerator.")

    # Build the default mapping per instance and from ordered pairs: a dict literal does
    # not guarantee insertion order before Python 3.7, and a default created in the
    # signature would be one object shared by every instance.
    if fields is None:
        fields = OrderedDict([
            ("_id", "document_id"),
            ("title", "title"),
            ("feedName", "feedName"),
            ("category", "category"),
            ("feedURL", "feedURL"),
            ("published", "published"),
        ])

    # format them into a reasonable format
    self.fields = fields
    if not self.fields:
        self.logger.error("No fields for MongoDB table specified!")
    self.values_to_retrieve = {key: 1 for key in self.fields}
    # suppress _id if not wanted, as it is returned by default.
    if "_id" not in self.values_to_retrieve:
        self.values_to_retrieve["_id"] = 0
    # TODO (left by the original author without further details)
    self.sql_format = ", ".join(self.fields.values())
    self.document_table_name = document_table_name

    # preparation for later. According to PEP8
    self.data = []
    self.logger.info("Successfully set up DocumentGenerator.")
subjects = [
    (1, 'Accounting & Finance'),
    (2, 'Art & Design'),
    (3, 'Architecture'),
    (4, 'Manufacturing Engineering'),
    (5, 'Law'),
    (6, 'Economics & Econometrics'),
    (7, 'Medicine'),
    (8, 'Business & Management Studies'),
    (9, 'Engineering & Technology'),
    (10, 'Computer Science'),
]

# Data to be inserted into the DB
subjects_to_teachers = [
    (1, 1, 1), (2, 2, 2), (3, 3, 3), (4, 4, 4), (5, 5, 5),
    (6, 6, 6), (7, 7, 7), (8, 8, 8), (9, 9, 9), (10, 10, 10),
    (11, 11, 1), (12, 12, 2), (13, 13, 3), (14, 14, 4), (15, 15, 5),
    (16, 16, 6), (17, 17, 7), (18, 18, 8), (19, 19, 9), (20, 20, 10),
]

# Connect to the databases
sqlite = SqliteConnector()
Postgres = PostgresConnector()
MySql = MySqlConnector()


# Function that creates DB1
def createDB():
    """Drop all MySQL tables, create the database again and insert the seed rows."""
    MySql.dropAllTables()
    MySql.createDatabase()
    # One (statement, rows) pair per table, in the order the inserts have to run.
    seed_inserts = (
        ("INSERT INTO faculties VALUES (%s,%s)", faculties),
        ("INSERT INTO department VALUES (%s,%s,%s)", departments),
        ("INSERT INTO teachers VALUES (%s,%s,%s,%s,%s)", teachers),
        ("INSERT INTO subject VALUES (%s,%s)", subjects),
        ("INSERT INTO subjects_to_teachers VALUES (%s,%s,%s)",
         subjects_to_teachers),
    )
    for statement, rows in seed_inserts:
        MySql.executemany(statement, rows)
def get_matrix(self,locationid):
    """
    Build the symmetric trend-to-trend cosine matrix for one location.

    The query ranks the trends of the location by row count, keeps the top 15 and returns
    one row (trend1, trend2, value) per pair of trends that share at least one entity.
    Each value is stored in both [i][j] and [j][i], where i and j are the positions of the
    two trends in the returned trend list. Cells of pairs that are not returned stay 0.

    :param locationid: Location to restrict both the trends and the tweets to.
    :return: Tuple (distance_matrix, trends_list); distance_matrix is a 15x15 list of
        lists and trends_list holds the trends in order of first appearance. If anything
        fails, the traceback is printed and None is returned instead.
    """
    max_columns = 15
    trend1_column = 0
    trend2_column = 1
    distance_value_column = 2
    cursor = None
    try:
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = """
            WITH TREND_COUNT_TT AS
                (SELECT TREND,COUNT(*) AS TREND_COUNT FROM TRENDS
                 WHERE LOCATIONID = %s GROUP BY TREND),
            TOP_TRENDS_TT AS
                (SELECT TREND FROM TREND_COUNT_TT
                 ORDER BY TREND_COUNT DESC LIMIT 15),
            IDS_FOR_TOP_TRENDS_TT AS
                (SELECT ID FROM ORGANIZED_TWEETS
                 WHERE TREND IN (SELECT TREND FROM TOP_TRENDS_TT)
                 AND LOCATION_ID = %s),
            --SELECT * FROM IDS_FOR_TOP_TRENDS_TT
            ID_ENTITY_TOP_TRENDS_TT AS
                (SELECT TREND,ID,ENTITY FROM ID_ENTITY
                 WHERE ID IN (SELECT ID FROM IDS_FOR_TOP_TRENDS_TT)),
            TREND_ENTITY_TF_IDF_SUM_TT AS
                (SELECT TREND,ENTITY,COUNT(ID) TF_IDF_SUM FROM ID_ENTITY
                 WHERE ID IN (SELECT ID FROM IDS_FOR_TOP_TRENDS_TT)
                 GROUP BY TREND,ENTITY),
            --SELECT * FROM TREND_ENTITY_TF_IDF_SUM_TT
            TREND_TF_IDF_SQ_SUM_TT AS
                (SELECT TREND, SUM(TF_IDF_SUM*TF_IDF_SUM) AS TF_IDF_SQ_SUM
                 FROM TREND_ENTITY_TF_IDF_SUM_TT GROUP BY TREND),
            COSINE_DIST_NUM_TT AS
                (SELECT T1.TREND AS TREND1,T2.TREND AS TREND2,
                        SUM(T1.TF_IDF_SUM*T2.TF_IDF_SUM) AS COSINE_NUM
                 FROM TREND_ENTITY_TF_IDF_SUM_TT AS T1
                 INNER JOIN TREND_ENTITY_TF_IDF_SUM_TT AS T2
                 ON T2.TREND>T1.TREND AND T1.ENTITY = T2.ENTITY
                 GROUP BY T1.TREND,T2.TREND),
            COSINE_DIST_TT AS
                (SELECT TREND1,TREND2,
                        COSINE_NUM/(SQRT(T2.TF_IDF_SQ_SUM)*SQRT(T3.TF_IDF_SQ_SUM)) AS COSIND_DIST
                 FROM COSINE_DIST_NUM_TT AS T1
                 INNER JOIN TREND_TF_IDF_SQ_SUM_TT AS T2 ON T1.TREND1=T2.TREND
                 INNER JOIN TREND_TF_IDF_SQ_SUM_TT AS T3 ON T1.TREND2=T3.TREND)
            SELECT * FROM COSINE_DIST_TT ORDER BY TREND1,TREND2;
        """
        # The tweet filter used to be hard-coded to location '2295420'; it now follows
        # the argument. It is bound as a string so that it is rendered as a quoted
        # literal exactly like the former constant, whatever type the caller passes.
        cursor.execute(query, (locationid, str(locationid)))

        trends_list = []
        trend_position = {}
        distance_matrix = [[0] * max_columns for _ in range(max_columns)]
        for row in cursor:
            positions = []
            for trend in (row[trend1_column], row[trend2_column]):
                if trend not in trend_position:
                    trend_position[trend] = len(trends_list)
                    trends_list.append(trend)
                positions.append(trend_position[trend])
            first, second = positions
            # Address the cells by trend rather than by running counters: pairs without
            # a common entity are missing from the result, so counting rows would shift
            # every following value into the wrong cell.
            distance_matrix[first][second] = row[distance_value_column]
            distance_matrix[second][first] = row[distance_value_column]
        return distance_matrix,trends_list
    except Exception:
        # Best effort, as before: report the problem and fall through to return None.
        print(traceback.format_exc())
    finally:
        # NOTE(review): only the cursor is released here. Whether the connection may be
        # closed as well depends on how PostgresConnector hands out connections -- confirm.
        if cursor is not None:
            cursor.close()