def get_dates_location(self, locationid):
    min_max_date_list = []
    try:
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = """ select max(date),min(date) from trends
                    where trend in
                        (select t1.trend as trend from
                            (select count(*) as c,trend from trends where locationid = %s group by trend) as t1
                         order by c desc limit 15)
                    and locationid = %s """
        cursor.execute(query, (locationid, locationid))
        min_date_column = 1
        max_date_column = 0
        for row in cursor:
            min_max_date_dict = {}
            min_max_date_dict["min_date"] = str(row[min_date_column])
            min_max_date_dict["max_date"] = str(row[max_date_column])
            min_max_date_list.append(min_max_date_dict)
    except Exception:
        # print the traceback instead of discarding it silently
        print traceback.format_exc()
    return min_max_date_list
def build(self):
    conn = PostgresConnector().get_connection()
    cursor = conn.cursor()
    query = 'select id,entities,trend from organized_tweets'
    cursor.execute(query)
    id_column = 0
    entities_column = 1
    trend_column = 2
    with open('copy_from.txt', 'w') as f:
        for row in cursor:
            tweet_id = row[id_column]
            trend = row[trend_column]
            hashtag_array = row[entities_column]
            json_array = json.loads(hashtag_array)
            hashtag_list = [hashtag["text"] for hashtag in json_array]
            hashtag_list_unique = list(set(hashtag_list))
            print 'Writing data to table for the tweet_id ' + tweet_id
            for hashtag in hashtag_list_unique:
                f.write(tweet_id + '\t' + hashtag.encode('utf-8') + '\t' + trend + '\n')
    with open('copy_from.txt') as f:
        cursor.copy_from(f, 'id_entity', columns=('id', 'entity', 'trend'))
        conn.commit()
    os.remove('copy_from.txt')
def get_trends(self, location_id, start_date, end_date):
    trends_list = []
    try:
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = """ select c,trend from
                        (select count(*) as c,trend from trends
                         where locationid = %s and date between %s and %s
                         and id in (select trendid from tweets)
                         group by trend) as t1
                    order by c desc limit 15 """
        cursor.execute(query, (location_id, start_date, end_date))
        trend_column = 1
        count_column = 0
        for row in cursor:
            trend_count = {}
            trend_count["trend"] = row[trend_column]
            trend_count["count"] = row[count_column]
            trends_list.append(trend_count)
    except Exception as e:
        print e
    return trends_list
def get_data(self, locationid):
    entity_trend_dict = {}
    try:
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = """select t1.entity,t2.trend from
                       (select id,entity from id_entity) as t1
                   inner join
                       (select id,trend from organized_tweets where trend in
                           (select trend from
                               (select count(*) as c,trend from trends where locationid = %s group by trend) as t_in
                            order by c desc limit 15)) as t2
                   on t1.id = t2.id"""
        cursor.execute(query, (locationid,))
        entity_column = 0
        trend_column = 1
        for row in cursor:
            # group the entities by trend so the populated dict is what gets returned
            trend = row[trend_column]
            if trend in entity_trend_dict:
                entity_trend_dict[trend].append(row[entity_column])
            else:
                entity_trend_dict[trend] = [row[entity_column]]
    except Exception:
        print traceback.format_exc()
    return entity_trend_dict
def get_total_documents(self):
    conn = PostgresConnector().get_connection()
    cursor = conn.cursor()
    query = 'select count(distinct(id)) from "IdEntity" '
    cursor.execute(query)
    count_of_distinct_id_column = 0
    total_documents_count = 0
    for row in cursor:
        total_documents_count = row[count_of_distinct_id_column]
    return total_documents_count
def get_sentiments(self):
    conn = PostgresConnector().get_connection()
    cursor = conn.cursor()
    query = """ select text from organized_tweets limit 10 """
    cursor.execute(query)
    for row in cursor:
        text = row[0]
        blob = TextBlob(text, analyzer=NaiveBayesAnalyzer())
        print blob.sentiment
def get_locations(self):
    conn = PostgresConnector().get_connection()
    cursor = conn.cursor()
    query = 'SELECT id,city from location'
    cursor.execute(query)
    id_column = 0
    city_column = 1
    locations_list = []
    for row in cursor:
        id_location = {}
        id_location["geoid"] = row[id_column]
        id_location["city"] = row[city_column]
        locations_list.append(id_location)
    return locations_list
def build_tf(self):
    # using group by, first get the tf score for each entity
    conn = PostgresConnector().get_connection()
    cursor = conn.cursor()
    query = 'select count(id),entity from "IdEntity" group by entity'
    cursor.execute(query)
    count_of_id_column = 0
    entities_column = 1
    entity_id_dict = {}
    for row in cursor:
        count_of_id = row[count_of_id_column]
        entity = row[entities_column]
        entity_id_dict[entity] = count_of_id
    return entity_id_dict
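# Minimal illustrative sketch (not part of the original module) of how the
# per-entity counts from build_tf() and the total from get_total_documents()
# could be combined into inverse-document-frequency weights; the helper name
# and the add-one smoothing are assumptions.
import math

def sketch_idf_weights(entity_counts, total_documents):
    # entity_counts: dict mapping entity -> number of tweets it occurs in,
    # as returned by build_tf()
    return {entity: math.log((total_documents + 1.0) / (count + 1.0))
            for entity, count in entity_counts.items()}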
def train_model():
    np.random.seed(123)
    with open(os.path.join("..", "query_pull_1000v3.pkl"), 'rb') as f:
        query_pull = pickle.load(f)
    connector = PostgresConnector()
    env = DatabaseIndexesEnv(n=COLUMNS_AMOUNT,
                             table_name=table_name,
                             query_pull=query_pull,
                             batch_size=BATCH_SIZE,
                             connector=connector,
                             k=3,
                             max_episodes=1000)

    # Get the environment and extract the number of actions.
    env.seed(123)

    # Next, we build a very simple model.
    model = build_model()
    print(model.summary())

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics!
    dqn = initialize_agent(model)

    # Okay, now it's time to learn something! We visualize the training here for show, but this
    # slows down training quite a lot. You can always safely abort the training prematurely using
    # Ctrl + C.
    dqn.fit(env, nb_steps=50000, visualize=False, verbose=0, callbacks=[CustomEpisodeLogger()])

    # After training is done, we save the final weights.
    dqn.save_weights('dqn_{}_weights_6_4_2_1_2000_episodes_estimated.h5f'.format(ENV_NAME), overwrite=True)

    # Finally, evaluate our algorithm for 5 episodes.
    dqn.test(env, nb_episodes=5, visualize=False)
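# build_model() and initialize_agent() are referenced above but not shown in this
# snippet. The following is only one plausible keras-rl implementation, written as
# an assumption for illustration; the layer sizes, policy and optimizer settings are
# guesses, not the project's actual configuration.
from keras.models import Sequential
from keras.layers import Dense, Flatten
from keras.optimizers import Adam
from rl.agents.dqn import DQNAgent
from rl.memory import SequentialMemory
from rl.policy import EpsGreedyQPolicy

def build_model_sketch(nb_columns=COLUMNS_AMOUNT):
    model = Sequential()
    # keras-rl feeds observations with an extra window dimension, hence the Flatten
    model.add(Flatten(input_shape=(1, nb_columns)))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(nb_columns, activation='linear'))
    return model

def initialize_agent_sketch(model, nb_actions=COLUMNS_AMOUNT):
    memory = SequentialMemory(limit=50000, window_length=1)
    dqn = DQNAgent(model=model, nb_actions=nb_actions, memory=memory,
                   nb_steps_warmup=100, target_model_update=1e-2,
                   policy=EpsGreedyQPolicy(eps=0.1))
    dqn.compile(Adam(lr=1e-3), metrics=['mae'])
    return dqn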
def __init__(self, prefix="entity", window_size=2, entities_only=True, port=5436, log_file=os.path.join(os.path.dirname(__file__), "logs/SchemaCreator.log"), log_level=logging.INFO, log_verbose=True ): """ Set up. :param prefix: (str) Prefix to the table names. :param port: (int) Used to connect to the Postgres tables. :param log_file: (os.path) Path to the file containing the logs. :param log_level: (logging.LEVEL) Specifies the level to be logged. :param log_verbose: (boolean) Specifies whether or not to look to stdout as well. """ self.logger = set_up_logger(__name__, log_file, log_level, log_verbose) self.window_size = window_size self.prefix = prefix + "_" + str(self.window_size) self.entities_only = entities_only self.names = self.get_names(self.prefix) self.port = port self.pc = PostgresConnector(port=port) self.logger.info("Successfully registered SchemaGenerator.")
def create_table(port):
    pc = PostgresConnector(port=port)
    with pc as opc:
        # add sentence index column but in separate table
        print("Starting with sentences...")
        pc.cursor.execute("CREATE TABLE sentences_neo4j AS TABLE sentences;")
        pc.cursor.execute("ALTER TABLE sentences_neo4j ADD COLUMN id int;")
        pc.cursor.execute("""WITH numbered (sid, document_id, sentence_id) AS
                                 (select row_number() OVER() sid, * from sentences_neo4j)
                             UPDATE sentences_neo4j
                             SET id = numbered.sid
                             FROM numbered
                             WHERE sentences_neo4j.document_id = numbered.document_id
                               AND sentences_neo4j.sentence_id = numbered.sentence_id;""")

        # add term_occurrence index
        print("Starting with term occurrences...")
        pc.cursor.execute("CREATE TABLE term_occurrence_neo4j AS TABLE term_occurrence;")
        pc.cursor.execute("ALTER TABLE term_occurrence_neo4j ADD COLUMN id int;")
        pc.cursor.execute("""WITH numbered (sid, document_id, sentence_id, term_id) AS
                                 (select row_number() OVER() sid, * from term_occurrence)
                             UPDATE term_occurrence_neo4j
                             SET id = numbered.sid
                             FROM numbered
                             WHERE term_occurrence_neo4j.document_id = numbered.document_id
                               AND term_occurrence_neo4j.sentence_id = numbered.sentence_id
                               AND term_occurrence_neo4j.term_id = numbered.term_id;""")
def get_tfidf(self, locationid, trend): tfidf_list = [] try: conn = PostgresConnector().get_connection() cursor = conn.cursor() tfidf_query = """ select entity,tf_idf_score from (select t4.entity,sum(t4.tf_idf) as tf_idf_score from (select t1.id,t1.entity,t2.count_id,t3.count_entity, (1.0/t3.count_entity)*log(( select count(*) from organized_tweets where trend = %s and location_id = %s )/t2.count_id) as tf_idf from (select id,entity from id_entity where id in (select id from organized_tweets where trend = %s and location_id = %s)) as t1 inner join (select entity,count(id) as count_id from id_entity where id in (select id from organized_tweets where trend = %s and location_id = %s)group by entity) as t2 on t1.entity = t2.entity inner join (select id,count(entity) as count_entity from id_entity where id in(select id from organized_tweets where trend = %s and location_id = %s )group by id) as t3 on t1.id = t3.id) as t4 group by entity)as t5 order by tf_idf_score desc limit 100; """ cursor.execute(tfidf_query, (trend, locationid, trend, locationid, trend, locationid, trend, locationid)) entity_column = 0 tfidf_column = 1 for row in cursor: entity_tfidf_score = {} entity_tfidf_score["entity"] = row[entity_column] entity_tfidf_score["tfidf"] = row[tfidf_column] tfidf_list.append(entity_tfidf_score) return tfidf_list except Exception: print traceback.format_exc()
def get_tfidf(self,locationid,trend): tfidf_list = [] try: conn = PostgresConnector().get_connection() cursor = conn.cursor() tfidf_query = """ select entity,tf_idf_score from (select t4.entity,sum(t4.tf_idf) as tf_idf_score from (select t1.id,t1.entity,t2.count_id,t3.count_entity, (1.0/t3.count_entity)*log(( select count(*) from organized_tweets where trend = %s and location_id = %s )/t2.count_id) as tf_idf from (select id,entity from id_entity where id in (select id from organized_tweets where trend = %s and location_id = %s)) as t1 inner join (select entity,count(id) as count_id from id_entity where id in (select id from organized_tweets where trend = %s and location_id = %s)group by entity) as t2 on t1.entity = t2.entity inner join (select id,count(entity) as count_entity from id_entity where id in(select id from organized_tweets where trend = %s and location_id = %s )group by id) as t3 on t1.id = t3.id) as t4 group by entity)as t5 order by tf_idf_score desc; """ cursor.execute(tfidf_query,(trend,locationid,trend,locationid,trend,locationid,trend,locationid)) entity_column = 0 tfidf_column = 1 for row in cursor: entity_tfidf_score = {} entity_tfidf_score["entity"] = row[entity_column] entity_tfidf_score["tfidf"] = row[tfidf_column] tfidf_list.append(entity_tfidf_score) return tfidf_list except Exception : print traceback.format_exc()
def get_tweets(self, trend, entity):
    conn = PostgresConnector().get_connection()
    cursor = conn.cursor()
    query_tweets = """ select text from organized_tweets where id in
                           (select id from id_entity where entity = %s)
                       limit 50 """
    cursor.execute(query_tweets, (entity, ))
    text_list = []
    for row in cursor:
        text_dict = {}
        text_dict["name"] = row[0]
        text_list.append(text_dict)
    return text_list
def get_sentiments(self): conn = PostgresConnector().get_connection() cursor = conn.cursor() query = """ select id,text from organized_tweets """ cursor.execute(query) id_column = 0 text_column = 1 with open("sentiments.tsv","w") as f: for row in cursor: text = row[text_column] blob = TextBlob(text,analyzer=NaiveBayesAnalyzer()) print 'writing for tweet with id ' +str(row[id_column]) f.write(str(row[id_column])+'\t'+str(blob.sentiment.classification)+'\t'+str(blob.sentiment.p_pos)+'\t'+str(blob.sentiment.p_neg)+'\n')
def validate_data(self, vehicle_type, toll_type, date, price, vehicle_no):
    if (vehicle_type.strip() == '' or toll_type.strip() == '' or price.strip() == ''
            or date.strip() == '' or vehicle_no.strip() == ''):
        # figure out what to return here!
        raise Exception('input data has nulls')
    else:
        try:
            conn = PostgresConnector().get_connection()
            cursor = conn.cursor()
            # parameterized insert; letting the driver quote the values avoids
            # malformed literals and SQL injection
            query = """ INSERT INTO transactions
                        (vehicle_type, toll_type, timestamp, price, vehicle_no)
                        VALUES (%s, %s, %s, %s, %s) """
            #print "Inserting data to table using the query %s" % (query,)
            cursor.execute(query, (vehicle_type, toll_type, date, float(price), vehicle_no))
            conn.commit()
            return 'Success'
        except psycopg2.IntegrityError as e:
            raise Exception(' Unique key constraint failed ')
        except Exception as e:
            print e
            raise Exception(' Something else went wrong')
def validate_data(self,vehicle_no,time): if vehicle_no.strip() == '' and time.strip() == '': # figure out what to return here! raise Exception('input data has nulls') else: try: conn = PostgresConnector().get_connection() cursor = conn.cursor() query = "" if vehicle_no.strip() != '' and time.strip() != '': query = """ SELECT * FROM transactions where vehicle_no = \'%s\' and timestamp = \'%s\' order by timestamp """ % (vehicle_no,time) elif vehicle_no.strip() != '' : query = """ select t2.vehicle_type,t2.toll_type,t1.timestamp,t1.price,t1.vehicle_no from (SELECT * FROM transactions where vehicle_no =\'%s\') as t1 inner join (select * from master_data) as t2 on t2.vehicle_type_id = t1.vehicle_type and t2.toll_type_id = t1.toll_type """ % (vehicle_no,) else: query = """ SELECT * FROM transactions where timestamp = \'%s\' order by timestamp """ % (time,) print "selecting data from the table using the query %s" % (query,) cursor.execute(query) search_list = [] for row in cursor: elements = {} elements['vehicle_type'] = row[0] elements['toll_type'] = row[1] elements['time'] = str(row[2]) elements['price'] = row[3] elements['vehicle_no'] = row[4] search_list.append(elements) return search_list except Exception: print traceback.format_exc()
def build(self):
    conn = PostgresConnector().get_connection()
    cursor = conn.cursor()
    query = 'select id,hashtags from "organizedTweets" '
    cursor.execute(query)
    id_column = 0
    entities_column = 1
    entity_id_dict = {}
    for row in cursor:
        tweet_id = row[id_column]
        hashtag_array = row[entities_column]
        hashtag_list = [hashtag['text'] for hashtag in hashtag_array]
        for entity in hashtag_list:
            if entity in entity_id_dict.keys():
                id_list = entity_id_dict[entity]
                id_list.append(tweet_id)
                entity_id_dict[entity] = id_list
            else:
                id_list = []
                id_list.append(tweet_id)
                entity_id_dict[entity] = id_list
    return entity_id_dict
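# Illustrative helper only (an assumption, not part of the original class): the
# entity -> tweet-id mapping returned by build() above can be reduced to per-hashtag
# document frequencies like this.
def entity_document_frequency(entity_id_dict):
    # number of distinct tweets each hashtag occurs in
    return {entity: len(set(id_list)) for entity, id_list in entity_id_dict.items()}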
def get_data(self): out_list = [] try: conn = PostgresConnector().get_connection() cursor = conn.cursor() query = """ SELECT vehicle_type,vehicle_type_id, toll_type,toll_type_id,price from master_data""" cursor.execute(query) resultset = cursor.fetchall() out_list = [] vehicle_dict = {} for row in resultset: row_data = {} row_value_dict = {} id = row[1] vehicle_name = row[0] journey_list = [] if id not in vehicle_dict.keys(): vehicle_dict_out = {} vehicle_dict[id] = 1 for row_in in resultset: journey_type = {} if row_in[1] == id: journey_type['toll_type'] = row_in[2] journey_type['toll_id'] = str(row_in[3]) journey_type['price'] = str(row_in[4]) journey_list.append(journey_type) vehicle_dict_out['vehicle_type'] = vehicle_name vehicle_dict_out['vehicle_type_id'] = id vehicle_dict_out['journey'] = journey_list out_list.append(vehicle_dict_out) return out_list except Exception as e: print e raise Exception(' Something went wrong while retrieving data')
def get_trends(self, location_id, start_date, end_date):
    trends_list = []
    try:
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = """ select c,trend from
                        (select count(*) as c,trend from trends
                         where locationid = %s and date between %s and %s
                         group by trend) as t1
                    order by c desc limit 15 """
        cursor.execute(query, (location_id, start_date, end_date))
        trend_column = 1
        count_column = 0
        for row in cursor:
            trend_count = {}
            trend_count["trend"] = row[trend_column]
            trend_count["count"] = row[count_column]
            trends_list.append(trend_count)
    except Exception as e:
        print e
    return trends_list
def test_dqn_against_heuristic(self):
    np.random.seed(123)
    with open(path.join("..", "query_pull_1000v3.pkl"), 'rb') as f:
        query_pull = pickle.load(f)[0:5]
    workload = np.random.choice(query_pull, const.BATCH_SIZE)
    env = DatabaseIndexesEnv(n=const.COLUMNS_AMOUNT,
                             table_name=table_name,
                             query_pull=query_pull,
                             batch_size=const.BATCH_SIZE,
                             connector=PostgresConnector(),
                             k=3,
                             max_episodes=1)
    dqn = load_agent(path.join("..", "dqn_specific_{}.h5f".format(ENV_NAME)))
    results = dqn.test(env, nb_episodes=1)
    print(results)
    print(env.state)
    print(predict_on_workload(workload))
def test_cache(self):
    np.random.seed(123)
    with open("..\query_pull_1000v3.pkl", 'rb') as f:
        query_pull = pickle.load(f)
    register(
        id='DatabaseIndexesEnv-v0',
        entry_point='dbenv:DatabaseIndexesEnv',
        kwargs={'n': const.COLUMNS_AMOUNT,
                'table_name': "test_table",
                'query_pull': query_pull,
                'batch_size': 2,
                'connector': PostgresConnector(),
                'k': 3,
                'max_episodes': 1}
    )
    env = gym.make('DatabaseIndexesEnv-v0')
    env.step(0)
    env.step(1)
    env.step(2)
    print(env.cache)
def train_model(): np.random.seed(123) with open("query_pull_1000v2.pkl", 'rb') as f: query_pull = pickle.load(f)[0:5] register(id=ENV_NAME, entry_point='dbenv:DatabaseIndexesEnv', kwargs={ 'n': COLUMNS_AMOUNT, 'table_name': table_name, 'query_pull': query_pull, 'batch_size': BATCH_SIZE, 'connector': PostgresConnector(), 'k': 3, 'max_episodes': episodes }) # Get the environment and extract the number of actions. env = gym.make(ENV_NAME) env.seed(123) # Next, we build a very simple model. model = build_model() print(model.summary()) # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and # even the metrics! dqn = initialize_agent(model) # Okay, now it's time to learn something! We visualize the training here for show, but this # slows down training quite a lot. You can always safely abort the training prematurely using # Ctrl + C. dqn.fit(env, nb_steps=episodes, visualize=False, verbose=2) # After training is done, we save the final weights. dqn.save_weights('dqn_specific_{}.h5f'.format(ENV_NAME), overwrite=True) # Finally, evaluate our algorithm for 5 episodes. dqn.test(env, nb_episodes=5, visualize=False)
def get_number_of_empty_edges(prefixes, pc):
    result = []
    print("Number of empty hyperedges:")
    for i, prefix in enumerate(prefixes):
        with pc as open_pc:
            table = prefix + "hyperedges"
            open_pc.cursor.execute(
                "SELECT (SELECT MAX(edge_id) from {}) - "
                "(SELECT count(distinct edge_id) from {}) as diff".format(table, table))
            result.append(open_pc.cursor.fetchall()[0][0])
            print("Results for {}: {}".format(table, result[i]))


if __name__ == "__main__":
    prefixes = ["", "entity_"]
    pc = PostgresConnector(port=5435)
    print()
    get_document_table_length(prefixes, pc)
    print()
    get_sentence_table_length(prefixes, pc)
    print()
    get_hyperedge_table_length(prefixes, pc)
    print()
    analyze_edge_size(prefixes, pc)
    print()
    analyze_term_frequency(prefixes, pc)
    print()
    get_number_of_empty_edges(prefixes, pc)
def __init__(self):
    self.connector = PostgresConnector(CONFIG_FILE_NAME, CONFIG_SECTION_NAME)
    self.SQL_constructor = PostgresSQLConstructor()
def get_indexes_qagent(index_amount, queries, Log=False): connector = PostgresConnector() if not table_exists(connector, table_name): create_table_2(connector) load_table(connector) # make results repeatable np.random.seed(123) # gym configuration query_batch = list() env = gym.make('DatabaseIndexesEnv-v0') env.set_indices_num(index_amount) current_query_idx = 0 query_batch = list() for workload in range(1): exploration_rate = 1.0 # represents the exploration rate to be decayed by the time initial_lr = 1.0 # Learning rate query_batch = list() Q_table_c.Q_table = {} query_batch = list() workload_selectivity_l = list() # 1. generate the queries per workload # 2. generate the cummlative selectivity per workload start = timer() for i in range(current_query_idx, current_query_idx + num_queries_batch): query_batch.append(queries[i]['query']) workload_selectivity_l.append(list(map(lambda x: x, queries[i]['sf_array']))) current_query_idx += num_queries_batch workload_selectivity = np.prod(workload_selectivity_l, axis=0).tolist() max_workload_selectivity = max(workload_selectivity) env.set_query_batch(query_batch) actions_taken = list() # as a heuristic: the indices with the lowest selectivity selectivity_indices = heapq.nsmallest(3, range(len(workload_selectivity)), workload_selectivity.__getitem__) if Log: print("Entering the q learning ..... the process can take time.") print(workload_selectivity) env.clear_cache() for episode in range(NUM_EPISODES): state = env.reset() actions_taken = list() # decay the exploration as the number of episodes grows, the Q table becomes more mature eps = exploration_rate / np.sqrt(episode + 1) eps = max(eps, min_exp_rate) episode_total_reward = 0 episode_total_qreward = 0 episode_strategy = [] eta = max(min_lr, initial_lr * (0.85 ** (episode // 100))) ## now the learning comes for kk in range(3): # do exploration, i.e., choose a random actions # make sure the last step is exploitation unless the state is new if episode == 0: episode_strategy.append("explore") action = selectivity_indices[kk] Q_table_c.Q_table[state_to_int(state)] = {} Q_table_c.Q_table[state_to_int(state)][action] = 0 elif (is_new_state(state) or (np.random.uniform(0, 1) < eps)) and episode != NUM_EPISODES - 1: episode_strategy.append("explore") # generate only actions that matches something with selectivity. 
action = env.action_space.sample() # high selectivity, not a good option for an index while workload_selectivity[action] >= max_workload_selectivity: action = env.action_space.sample() if is_new_state(state): Q_table_c.Q_table[state_to_int(state)] = {} if action not in Q_table_c.Q_table[state_to_int(state)]: Q_table_c.Q_table[state_to_int(state)][action] = 0 else: # else exploit choose the maximum value from the Q table episode_strategy.append("exploit") action = get_action_maximum_reward(state)[0] actions_taken.append(action) state_old_int = state_to_int(state) state_new, reward, done, _ = env.step(action) episode_total_reward += reward next_action = 0 next_action_q_value = 0 if is_new_state(state_new): next_action = env.action_space.sample() while (action == next_action or workload_selectivity[next_action] >= max_workload_selectivity): next_action = env.action_space.sample() next_action_q_value = 0 else: next_action, next_action_q_value = get_action_maximum_reward(state_new) Q_table_c.Q_table[state_old_int][action] += eta * (reward + GAMMA * next_action_q_value - Q_table_c.Q_table[state_old_int][action]) episode_total_qreward += Q_table_c.Q_table[state_old_int][action] state, action = state_new, next_action actions_taken_s = ','.join(str(e) for e in actions_taken) if Log: print( "episode num = '{0}', episode_total_immediate_rewards = '{1}', episode_total_reward = '{2}', current_state = '{3}', actions_taken = '{4}', strategy = {5}" .format(episode, float(episode_total_reward), float(episode_total_qreward), state_to_string(state), actions_taken_s, episode_strategy)) return actions_taken
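# state_to_int() and is_new_state() are used in the Q-learning loop above but are not
# shown in this excerpt. The following is only a plausible stand-in, written as an
# assumption: it treats the per-column "index built" flags as bits of an integer key
# into the Q table.
def sketch_state_to_int(state):
    return sum(1 << i for i, bit in enumerate(state) if bit)

def sketch_is_new_state(state):
    return sketch_state_to_int(state) not in Q_table_c.Q_table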
def update_organized_tweets(self): tweet_id_dict = {} try: conn = PostgresConnector().get_connection() cursor = conn.cursor() query_location = 'select id from location' cursor.execute(query_location) location_column = 0 for row_location in cursor: query = """ select id,trend from trends where trend in(select trend from (select count(*) as c,trend from trends where locationid = %s group by trend)as t1 order by c desc limit 15) """ cursor = conn.cursor() location_id = row_location[location_column] cursor.execute(query,(location_id,)) trend_id_column = 0 trend_name_column = 1 trend_count = 0 for row in cursor: trend_count = trend_count + 1 trend_id = row[trend_id_column] trend_name = row[trend_name_column] print 'Processing for trend ' +trend_id+' , ' +str(trend_count) query_tweets = 'select tweets from tweets where trendId = \''+str(trend_id)+'\'' cursor_tweets = conn.cursor() cursor_tweets.execute(query_tweets) tweets_column = 0 with open(trend_name+'.txt','w') as f: # rows of tweets array for tweets_row in cursor_tweets: tweets_json_array = tweets_row[tweets_column] # tweets in a tweets array for json_in in tweets_json_array: id = json_in['id'] tweet_id_exists = tweet_id_dict.get(id) if tweet_id_exists is None: #print jsonIn tweet_id_dict[id] = 1 geo = 'none' if json_in['geo'] is None else 'none' #json['geo'] retweeted = json_in['retweeted'] in_reply_to_screen_name = 'none' if json_in['in_reply_to_screen_name'] is None else json_in['in_reply_to_screen_name'] truncated = 'none' if json_in['truncated'] is None else json_in['truncated'] source = json_in['source'] created_at = json_in['created_at'] place = 'none' if json_in['place'] is None else 'none'#json['place'] user_id = json_in['user']['id'] text = json_in['text'].strip() #text = " ".join(str(text).split()) text = str(filter(lambda x: x in string.printable,text)) #text = text.encode('utf-16') text = re.sub('\s+',' ',text) text = text.replace('\\','') entities = json_in['entities']['hashtags'] user_mentions = json_in['entities']['user_mentions'] user_mentions = [] retweet_count = json_in['retweet_count'] favorite_count = json_in['favorite_count'] # if len(entities) > 0: # for entity in entities: # for k,v in entity.items(): # if k in 'text': # entity_list = {} # new_v = entity[k] # new_v = str(new_v.encode('utf-8')) # new_v = filter(lambda x: x in string.printable,new_v) # #print id,check,new_v,len(new_v) # if len(new_v) > 0: # entity[k] = new_v # else: # entity[k] = '' #print id,geo,retweeted ,in_reply_to_screen_name ,truncated ,source ,created_at ,place ,user_id ,text ,entities ,user_mentions,retweet_count,favorite_count f.write(str(id)+'\t'+str(geo)+'\t'+str(retweeted)+'\t'+str(in_reply_to_screen_name.encode('utf-8'))+'\t'+str(truncated)+'\t'+str(source.encode('utf-8'))+'\t'+str(created_at.encode('utf-8'))+'\t'+str(place)+'\t'+str(user_id)+'\t'+text+'\t'+str(json.dumps(entities))+'\t'+str(user_mentions)+'\t'+str(retweet_count)+'\t'+str(favorite_count)+'\t'+str(trend_name)+'\t'+str(location_id)+'\n') else: continue # array of tweets json ends here #break # total number of tweets rows for a given trend ends here #break print 'Writing to table' with open(trend_name+'.txt') as f: cursor_write = conn.cursor() cursor_write.copy_from(f,'organized_tweets',columns=('id','geo','retweeted','in_reply_to_screen_name','truncated','source','created_at','place','user_id','text','entities','user_mentions','retweet_count','favorite_count','trend','location_id')) conn.commit() os.remove(trend_name+'.txt') # all trends finish here #break except Exception 
: print traceback.format_exc()
def run_qlearning():
    connector = PostgresConnector()
    query_pull = generate_query_pull('.query_pull',
                                     queries_amount,
                                     [4, 6],
                                     table_column_types,
                                     table_column_names,
                                     table_name,
                                     connector)
def get_matrix(self, locationid):
    try:
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = """
        WITH TREND_COUNT_TT AS
            (SELECT TREND,COUNT(*) AS TREND_COUNT FROM TRENDS
             WHERE LOCATIONID = %s GROUP BY TREND),
        TOP_TRENDS_TT AS
            (SELECT TREND FROM TREND_COUNT_TT ORDER BY TREND_COUNT DESC LIMIT 15),
        IDS_FOR_TOP_TRENDS_TT AS
            (SELECT ID FROM ORGANIZED_TWEETS
             WHERE TREND IN (SELECT TREND FROM TOP_TRENDS_TT)
               AND LOCATION_ID = %s),
        ID_ENTITY_TOP_TRENDS_TT AS
            (SELECT TREND,ID,ENTITY FROM ID_ENTITY
             WHERE ID IN (SELECT ID FROM IDS_FOR_TOP_TRENDS_TT)),
        TREND_ENTITY_TF_IDF_SUM_TT AS
            (SELECT TREND,ENTITY,COUNT(ID) TF_IDF_SUM FROM ID_ENTITY
             WHERE ID IN (SELECT ID FROM IDS_FOR_TOP_TRENDS_TT)
             GROUP BY TREND,ENTITY),
        TREND_TF_IDF_SQ_SUM_TT AS
            (SELECT TREND, SUM(TF_IDF_SUM*TF_IDF_SUM) AS TF_IDF_SQ_SUM
             FROM TREND_ENTITY_TF_IDF_SUM_TT GROUP BY TREND),
        COSINE_DIST_NUM_TT AS
            (SELECT T1.TREND AS TREND1,T2.TREND AS TREND2,
                    SUM(T1.TF_IDF_SUM*T2.TF_IDF_SUM) AS COSINE_NUM
             FROM TREND_ENTITY_TF_IDF_SUM_TT AS T1
             INNER JOIN TREND_ENTITY_TF_IDF_SUM_TT AS T2
                ON T2.TREND>T1.TREND AND T1.ENTITY = T2.ENTITY
             GROUP BY T1.TREND,T2.TREND),
        COSINE_DIST_TT AS
            (SELECT TREND1,TREND2,
                    COSINE_NUM/(SQRT(T2.TF_IDF_SQ_SUM)*SQRT(T3.TF_IDF_SQ_SUM)) AS COSIND_DIST
             FROM COSINE_DIST_NUM_TT AS T1
             INNER JOIN TREND_TF_IDF_SQ_SUM_TT AS T2 ON T1.TREND1=T2.TREND
             INNER JOIN TREND_TF_IDF_SQ_SUM_TT AS T3 ON T1.TREND2=T3.TREND)
        SELECT * FROM COSINE_DIST_TT ORDER BY TREND1,TREND2;
        """
        # restrict both the trend selection and the tweet selection to the requested location
        cursor.execute(query, (locationid, locationid))
        trend1_column = 0
        trend2_column = 1
        distance_value_column = 2
        trends_list = []
        row_counter = 0
        max_columns = 15
        column_iteration = 1
        distance_matrix = [[0 for x in xrange(max_columns)] for x in xrange(max_columns)]
        for row in cursor:
            trend1 = row[trend1_column]
            trend2 = row[trend2_column]
            if trend1 not in trends_list:
                trends_list.append(trend1)
            if trend2 not in trends_list:
                trends_list.append(trend2)
            # this is to check 0,0 1,1 and so on
            distance_matrix[row_counter][row_counter] = 0
            # this populates 1,2 and 2,1 and so on
            # this avoids 2 loops
            distance_matrix[row_counter][column_iteration] = row[distance_value_column]
            distance_matrix[column_iteration][row_counter] = row[distance_value_column]
            column_iteration = column_iteration + 1
            if column_iteration == max_columns:
                row_counter = row_counter + 1
                column_iteration = row_counter + 1
        return distance_matrix, trends_list
    except Exception:
        print traceback.format_exc()
def __init__(self, num_distinct_documents=5000, replace_entities=True, max_term_length=127, remove_stopwords=True, custom_stopwords=[ ',', '.', '-', '\xa0', '“', '”', '"', '\n', '—', ':', '?', 'I', '(', ')' ], analyze=False, document_tabe_name="documents", sentence_table_name="sentences", sentence_fields=OrderedDict({ "doc_id": "document_id", "sen_id": "sentence_id", "content": "sentence_text" }), term_table_name="terms", term_sql_format=("term_id", "term_text", "is_entity"), term_occurrence_table_name="term_occurrence", term_occurrence_sql_format=("document_id", "sentence_id", "term_id"), entity_table_name="entities", entity_sql_format=("entity_id", "entity_type"), database="postgres", user="******", password="******", host="127.0.0.1", port=5435, log_file=os.path.join(os.path.dirname(__file__), "logs/TermGenerator.log"), log_level=logging.INFO, log_verbose=True): """ Initializes various parameters, registers logger and MongoConnector, and sets up the limit. :param num_distinct_documents: (int) The number of distinct documents retrieved from the queries. For performance reasons, this should be limited during debugging/development. 0 (Zero) represents no limit, in accordance with the MongoDB standard for .limit(). :param replace_entities: (boolean) Whether or not the entities in the text should be replaced/recognised. The reason for this is that single terms might be merged together to one term, i.e. first and last name: "Dennis" "Aumiller" would be two separate terms in the traditional splitting (replace_entities=False), whereas - if set to true - "Dennis Aumiller" would represent only one entity. :param max_term_length: (int) Indicator of how long the terms are supposed to be (varchar property in table). :param remove_stopwords: (boolean) Determines whether or not stop words are removed. Currently, we are still deciding on the final set, but likely either one (or both) of NLTK and SpaCy's stop word lists. :param custom_stopwords: (list of strings) Additional words that will not be considered at adding-time. :param analyze: (boolean) Whether or not to include analytically relevant metrics. :param document_tabe_name: (str) Name of the table where the document information is stored. :param sentence_table_name: (str) Name of the table where the sentence information will be stored. :param sentence_fields: (OrderedDict) Structure of input to output values from MongoDB to postgres for the sentence table and its fields. :param term_table_name: (str) Name of the Postgres tables for the terms. :param term_sql_format: (tuple) Since those are generated locally, only a tuple of the PostgresColumns suffices. :param term_occurrence_table_name: (str) Name of the Postgres table for the term occurrences :param term_occurrence_sql_format: (tuple) Same as term_sql_format, but for the term occurrences. :param entity_table_name: (str) (Not implemented yet) Name of the table for the entity meta information. :param entity_sql_format: (str) Same as term_sql_format, but for entities. :param database: (str) database name. :param user: (str) User name to get access to the Postgres database. :param password: (str) Corresponding user password. :param host: (IP) IP address (in string format) for the host of the postgres database. :param port: (integer) Port at which to access the database. 
""" # set up logger self.logger = set_up_logger(__name__, log_file, log_level, log_verbose) self.logger.info("Successfully registered logger to TermGenerator.") # register a MongoConnector self.mc = MongoConnector() self.logger.info( "Successfully registered MongoConnector to TermGenerator.") # PostgresConnector self.pc = PostgresConnector(database, user, password, host, port) self.logger.info( "Successfully registered PostgresConnector to DocumentGenerator.") self.num_distinct_documents = num_distinct_documents # do this earlier since we need it already for the distinct documents. self.document_table_name = document_tabe_name # get the distinct IDs for the documents so we can match against them later # since we have removed parts of the document collection, we have to make sure to get this from Postgres. self.logger.info("Parsing relevant documents from Postgres...") with self.pc as open_pc: open_pc.cursor.execute("SELECT document_id FROM {}".format( self.document_table_name)) self.first_distinct_documents = list(open_pc.cursor.fetchall()) # extract from the tuple structure self.first_distinct_documents = [ el[0] for el in self.first_distinct_documents ] self.logger.info("Retrieved all relevant documents from Postgres.") # additionally restrict if we want only a number of documents. if self.num_distinct_documents != 0: self.logger.info( "Non-zero limit detected. Limiting to the first N entries.") self.first_distinct_documents = self.first_distinct_documents[:self . num_distinct_documents] self.replace_entities = replace_entities self.analyze = analyze self.max_term_length = max_term_length self.nlp = spacy.load("en") # construct dictionary with the entries per document/sentence id pair. Thus, we can later check whether # there are any entities in the current sentence with higher efficiency. self.occurrence_dict = {} self.occurring_entities = [] # start building the term dictionary/set, as well as an occurence map. Since terms will be "post-processed", # it is first created as a list and later cast to Counter and set. self.terms = [] # cast into a set later on. 
self.term_in_sentence = set() self.term_id = {} self.term_is_entity = {} if self.analyze: self.term_count = Counter() self.entity_count = Counter() self.entities = [] self.sentences = [] self.processed_sentences = [] # Postgres tables if not sentence_fields: self.logger.error("No sentence fields specified!") self.sentence_table_name = sentence_table_name self.sentence_fields = sentence_fields if not term_sql_format: self.logger.error("No term fields specified!") self.term_table_name = term_table_name self.term_sql_format = ", ".join(term_sql_format) if not term_occurrence_sql_format: self.logger.error("No term occurrence fields specified!") self.term_occurrence_table_name = term_occurrence_table_name self.term_occurrence_sql_format = ", ".join(term_occurrence_sql_format) if not entity_sql_format: self.logger.error("No entity fields specified!") self.entity_table_name = entity_table_name self.entity_sql_format = ", ".join(entity_sql_format) # value retrieving parse: self.sentence_values_to_retrieve = { key: 1 for key in self.sentence_fields.keys() } # suppress _id if not present: if "_id" not in self.sentence_values_to_retrieve.keys(): self.sentence_values_to_retrieve["_id"] = 0 self.sentence_sql_format = ", ".join( [value for value in self.sentence_fields.values()]) # create union of stop words, and add potentially custom stop words self.remove_stopwords = remove_stopwords self.removed_counter = 0 self.stopwords = STOP_WORDS.union(set(stopwords.words("english"))) # add custom stopwords. for word in custom_stopwords: self.stopwords.add(word) self.logger.info("Successfully initialized TermGenerator.")
def __init__(self, window_size=2, limit_edges=False, entities_only=False, document_table_name="documents", sentence_table_name="sentences", entity_table_name="entities", term_table_name="terms", term_occurrence_table_name="term_occurrence", hyperedge_table_name="hyperedges", hyperedge_format=("edge_id", "term_id", "pos"), hyperedge_document_table_name="hyperedge_document", hyperedge_document_format=("edge_id", "document_id"), hyperedge_sentence_table_name="hyperedge_sentences", hyperedge_sentence_format=("edge_id", "document_id", "sentence_id", "pos"), database="postgres", user="******", password="******", host="127.0.0.1", port=5435, log_file=os.path.join(os.path.dirname(__file__), "logs/HyperedgeGenerator.log"), log_level=logging.INFO, log_verbose=True): """ Initializes hyper edge generator class. :param window_size: (int) Number of sentences in each direction that will determine the context window size of the algorithm. :param limit_edges: (boolean) Experimental: Should limit the maximum number of terms per hyperedge. This would only be useful in context with other theoretical results. :param entities_only: (boolean) Indicating whether or not we should only take into account entity terms, and not the entirety of all term occurrences for the edges. :param document_table_name: (str) Name of the table where documents are stored. :param sentence_table_name: (str) Name of the table containing the sentences and their content. :param entity_table_name: (str) Name of the table containing the entity information and their properties. :param term_table_name: (str) Name of the table containing the terms and meta data. :param term_occurrence_table_name: (str) Name of the table containing term occurrence data. :param hyperedge_table_name: (str) Name of the table containing the general hyper edge information. :param hyperedge_format: (str) Table structure of hyper edge table. :param hyperedge_document_table_name: (str) Name of the table containing the document classification. :param hyperedge_document_format: (str) Table structure of hyper edge document table. :param hyperedge_sentence_table_name: (str) Name of the tale containing the hyper edge sentence data. :param hyperedge_sentence_format: (str) Table structure of the hyper edge sentence table. :param database: (str) database name. :param user: (str) User name to get access to the Postgres database. :param password: (str) Corresponding user password. :param host: (IP) IP address (in string format) for the host of the postgres database. :param port: (integer) Port at which to access the database. :param log_file: (os.path) Path to the file containing the logs. :param log_level: (logging.LEVEL) Specifies the level to be logged. :param log_verbose: (boolean) Specifies whether or not to look to stdout as well. 
""" self.logger = set_up_logger(__name__, log_file, log_level, log_verbose) self.logger.info( "Successfully registered logger to HyperedgeGenerator.") # important for hyperedges self.window_size = window_size self.limit_edges = limit_edges self.entities_only = entities_only # table names self.document_table_name = document_table_name self.sentence_table_name = sentence_table_name self.entity_table_name = entity_table_name self.term_table_name = term_table_name self.term_occurrence_table_name = term_occurrence_table_name self.hyperedge_table_name = hyperedge_table_name self.hyperedge_document_table_name = hyperedge_document_table_name self.hyperedge_sentence_table_name = hyperedge_sentence_table_name self.hyperedge_format = ", ".join([el for el in hyperedge_format]) self.hyperedge_document_format = ", ".join( [el for el in hyperedge_document_format]) self.hyperedge_sentence_format = ",".join( [el for el in hyperedge_sentence_format]) self.pc = PostgresConnector(database, user, password, host, port) self.logger.info( "Successfully registered PostgresConnector to HyperedgeGenerator.") self.hyperedge = [] self.hyperedge_sentence = [] self.hyperedge_document = [] self.all_hyperedges = [] self.all_hyperedge_sentences = [] # set up the "hyper edge ID counter", which is simply consecutive from 1. with self.pc as open_pc: if not check_table_existence(self.logger, open_pc, self.hyperedge_table_name): return 0 self.logger.info("Retrieving current hyper edge ID key...") open_pc.cursor.execute( "SELECT COUNT(DISTINCT h.edge_id) FROM {} as h".format( self.hyperedge_table_name)) # either start with 1 or get the current maximum self.hyperedge_ID = max(1, open_pc.cursor.fetchone()[0])
def get_action_maximum_reward(state):
    # assert Q_table_c.Q_table[state_to_int(state)], "This state has no corresponding action: %r" % state_to_int(state)
    max_reward = float('-inf')
    int_state = state_to_int(state)
    actions_rewards_dict = Q_table_c.Q_table[int_state]
    for key, val in actions_rewards_dict.items():
        if val > max_reward:
            max_reward = val
            max_key = key
    return max_key, max_reward


register(
    id='DatabaseIndexesEnv-v0',
    entry_point='dbenvm:DatabaseIndexesEnv',
    kwargs={'n': len(table_column_names),
            'table_name': table_name,
            'query_batch': list(),
            'connector': PostgresConnector(),
            'k': 3}
)


def get_indexes_qagent(index_amount, queries, Log=False):
    connector = PostgresConnector()
    if not table_exists(connector, table_name):
        create_table_2(connector)
        load_table(connector)

    # make results repeatable
    np.random.seed(123)

    # gym configuration
    query_batch = list()
    env = gym.make('DatabaseIndexesEnv-v0')
def __init__(self, fields=OrderedDict({ "_id": "document_id", "title": "title", "feedName": "feedName", "category": "category", "feedURL": "feedURL", "published": "published" }), num_distinct_documents=0, document_table_name="documents", database="postgres", user="******", password="******", host="127.0.0.1", port=5435, log_file=os.path.join(os.path.dirname(__file__), "logs/DocumentGenerator.log"), log_level=logging.INFO, log_verbose=True): """ Initializes context, and sets up documents that will be parsed. Also establishes the PostgresConnector that will later be used to push the retrieved documents. :param fields: (OrderedDict) Key-value pairs that indicate a mapping of fields that should be retrieved (key), and the respective field it should be called in the SQL table. Ordered because SQL tables are. :param num_distinct_documents: (int) As the name indicates, the number of distinct articles that should be used. Mainly for debugging purposes. 0 means all documents will be used, in accordance with MongoDB standards. :param document_table_name: (str) Name of the Postgres table that should contain the documents :param database: (str) database name. :param user: (str) User name to get access to the Postgres database. :param password: (str) Corresponding user password. :param host: (IP) IP address (in string format) for the host of the postgres database. :param port: (integer) Port at which to access the database. :param log_file: (os.path) Path to the file containing the logs. :param log_level: (logging.LEVEL) Specifies the level to be logged. :param log_verbose: (boolean) Specifies whether or not to look to stdout as well. """ # set up logger self.logger = set_up_logger(__name__, log_file, log_level, log_verbose) self.logger.info( "Successfully registered logger to DocumentGenerator.") # register a MongoConnector self.mc = MongoConnector() self.logger.info( "Successfully registered MongoConnector to DocumentGenerator.") self.num_distinct_documents = num_distinct_documents # get the distinct IDs for the documents so we can match against them later if self.num_distinct_documents != 0: self.logger.info( "Non-zero limit detected. Fetching first N distinct document IDs now..." ) with self.mc as open_mc: documents = open_mc.client[open_mc.news].articles self.first_documents = list(documents.find().limit( self.num_distinct_documents)) # for small enough number, and large enough document collection, this is more efficient: self.first_documents = [ el["_id"] for el in self.first_documents ] self.logger.info( "Successfully registered relevant document IDs.") else: # needed to avoid later conflicts self.first_documents = [] # set up PostgresConnector. Since we only use these once, I don't see any reason to store the connection # details locally again. self.pc = PostgresConnector(database, user, password, host, port) self.logger.info( "Successfully registered PostgresConnector to DocumentGenerator.") # format them into a reasonable format self.fields = fields if not self.fields: self.logger.error("No fields for MongoDB table specified!") self.values_to_retrieve = {key: 1 for key in self.fields.keys()} # suppress _id if not wanted, as it is returned by default. if "_id" not in self.values_to_retrieve.keys(): self.values_to_retrieve["_id"] = 0 # TODO self.sql_format = ", ".join([value for value in self.fields.values()]) self.document_table_name = document_table_name # preparation for later. According to PEP8 self.data = [] self.logger.info("Successfully set up DocumentGenerator.")
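# Hedged usage sketch for the DocumentGenerator constructor above; it assumes a
# reachable MongoDB instance for MongoConnector and a Postgres instance on port 5435
# with the documents table already created.
if __name__ == "__main__":
    generator = DocumentGenerator(num_distinct_documents=100)
    generator.logger.info("Prepared to fetch fields: {}".format(generator.sql_format))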
import psycopg2 as db
import csv
import os
import sys

sys.path.append(os.path.abspath("../lib/"))
from query_helper import comm_helper
from PostgresConnector import PostgresConnector

# ports = list(range(5435, 5440))
port = 5436
windows = [5, 10]
t = comm_helper("postgres", "", "127.0.0.1", str(port))
pc = PostgresConnector(port=port)


def query_and_write(filename, query, header):
    with pc as opc:
        print("Start querying table {}".format(filename))
        if os.path.isfile(filename):
            os.remove(filename)
        opc.cursor.execute(query)
        # This only happens for documents
        print("Start writing table {}.".format(filename))
        with open(filename, 'w') as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(header)
            while True:
                data = opc.cursor.fetchmany(65536)
def update_organized_tweets(self): tweet_id_dict = {} try: conn = PostgresConnector().get_connection() cursor = conn.cursor() query_location = 'select id from location' cursor.execute(query_location) location_column = 0 for row_location in cursor: query = """ select id,trend from trends where trend in(select trend from (select count(*) as c,trend from trends where locationid = %s group by trend)as t1 order by c desc limit 80) """ cursor = conn.cursor() location_id = row_location[location_column] cursor.execute(query, (location_id, )) trend_id_column = 0 trend_name_column = 1 trend_count = 0 for row in cursor: trend_count = trend_count + 1 trend_id = row[trend_id_column] trend_name = row[trend_name_column] print 'Processing for trend ' + trend_id + ' , ' + str( trend_count) query_tweets = 'select tweets from tweets where trendId = \'' + str( trend_id) + '\'' cursor_tweets = conn.cursor() cursor_tweets.execute(query_tweets) tweets_column = 0 with open(trend_name + '.txt', 'w') as f: # rows of tweets array for tweets_row in cursor_tweets: tweets_json_array = tweets_row[tweets_column] # tweets in a tweets array for json_in in tweets_json_array: id = json_in['id'] tweet_id_exists = tweet_id_dict.get(id) if tweet_id_exists is None: #print jsonIn tweet_id_dict[id] = 1 geo = 'none' if json_in[ 'geo'] is None else 'none' #json['geo'] retweeted = json_in['retweeted'] in_reply_to_screen_name = 'none' if json_in[ 'in_reply_to_screen_name'] is None else json_in[ 'in_reply_to_screen_name'] truncated = 'none' if json_in[ 'truncated'] is None else json_in[ 'truncated'] source = json_in['source'] created_at = json_in['created_at'] place = 'none' if json_in[ 'place'] is None else 'none' #json['place'] user_id = json_in['user']['id'] text = json_in['text'].strip() #text = " ".join(str(text).split()) text = str( filter(lambda x: x in string.printable, text)) #text = text.encode('utf-16') text = re.sub('\s+', ' ', text) text = text.replace('\\', '') entities = json_in['entities']['hashtags'] user_mentions = json_in['entities'][ 'user_mentions'] user_mentions = [] retweet_count = json_in['retweet_count'] favorite_count = json_in['favorite_count'] # if len(entities) > 0: # for entity in entities: # for k,v in entity.items(): # if k in 'text': # entity_list = {} # new_v = entity[k] # new_v = str(new_v.encode('utf-8')) # new_v = filter(lambda x: x in string.printable,new_v) # #print id,check,new_v,len(new_v) # if len(new_v) > 0: # entity[k] = new_v # else: # entity[k] = '' #print id,geo,retweeted ,in_reply_to_screen_name ,truncated ,source ,created_at ,place ,user_id ,text ,entities ,user_mentions,retweet_count,favorite_count f.write( str(id) + '\t' + str(geo) + '\t' + str(retweeted) + '\t' + str( in_reply_to_screen_name.encode( 'utf-8')) + '\t' + str(truncated) + '\t' + str(source.encode('utf-8')) + '\t' + str(created_at.encode('utf-8')) + '\t' + str(place) + '\t' + str(user_id) + '\t' + text + '\t' + str(json.dumps(entities)) + '\t' + str(user_mentions) + '\t' + str(retweet_count) + '\t' + str(favorite_count) + '\t' + str(trend_name) + '\t' + str(location_id) + '\n') else: continue # array of tweets json ends here #break # total number of tweets rows for a given trend ends here #break print 'Writing to table' with open(trend_name + '.txt') as f: cursor_write = conn.cursor() cursor_write.copy_from( f, 'organized_tweets', columns=('id', 'geo', 'retweeted', 'in_reply_to_screen_name', 'truncated', 'source', 'created_at', 'place', 'user_id', 'text', 'entities', 'user_mentions', 'retweet_count', 'favorite_count', 
'trend', 'location_id')) conn.commit() os.remove(trend_name + '.txt') # all trends finish here #break except Exception: print traceback.format_exc()
subjects = [(1, 'Accounting & Finance'), (2, 'Art & Design'), (3, 'Architecture'),
            (4, 'Manufacturing Engineering'), (5, 'Law'), (6, 'Economics & Econometrics'),
            (7, 'Medicine'), (8, 'Business & Management Studies'),
            (9, 'Engineering & Technology'), (10, 'Computer Science')]

# Data to be inserted into the database
subjects_to_teachers = [(1, 1, 1), (2, 2, 2), (3, 3, 3), (4, 4, 4), (5, 5, 5),
                        (6, 6, 6), (7, 7, 7), (8, 8, 8), (9, 9, 9), (10, 10, 10),
                        (11, 11, 1), (12, 12, 2), (13, 13, 3), (14, 14, 4), (15, 15, 5),
                        (16, 16, 6), (17, 17, 7), (18, 18, 8), (19, 19, 9), (20, 20, 10)]

# Database connections
sqlite = SqliteConnector()
Postgres = PostgresConnector()
MySql = MySqlConnector()


# Function that creates database 1
def createDB():
    MySql.dropAllTables()
    MySql.createDatabase()
    MySql.executemany("INSERT INTO faculties VALUES (%s,%s)", faculties)
    MySql.executemany("INSERT INTO department VALUES (%s,%s,%s)", departments)
    MySql.executemany("INSERT INTO teachers VALUES (%s,%s,%s,%s,%s)", teachers)
    MySql.executemany("INSERT INTO subject VALUES (%s,%s)", subjects)
    MySql.executemany("INSERT INTO subjects_to_teachers VALUES (%s,%s,%s)", subjects_to_teachers)
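# Hedged usage sketch: rebuild and seed the MySQL schema defined above. The
# faculties/departments/teachers lists are referenced by createDB() but not shown
# in this excerpt; they are assumed to be defined earlier in the original script.
if __name__ == "__main__":
    createDB()
    print("Database 1 created and seeded.")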
class PostgresQuerier(object): def __init__(self): self.connector = PostgresConnector(CONFIG_FILE_NAME, CONFIG_SECTION_NAME) self.SQL_constructor = PostgresSQLConstructor() def GetVersion(self): SQL = self.SQL_constructor.GetVersion() self.connector.ExecuteSQL(SQL) return self.connector.GetResults(1) def GetCurrentDate(self): SQL = self.SQL_constructor.GetCurrentDate() self.connector.ExecuteSQL(SQL) return self.connector.GetResults(1) def _SetDatestyleGerman(self): SQL = "SET DATESTYLE=%s;"; data = ("German",) self.connector.ExecuteSQL(SQL, data) return "Datestyle set to German" def SelectColumnsFromTable(self, columns, table, limit=100): SQL_select = self.SQL_constructor.GetSELECTColumns(columns) SQL_from = self.SQL_constructor.GetFROMTable(table) SQL_limit = self.SQL_constructor.GetLIMIT(limit) SQL = SQL_select + SQL_from + SQL_limit self.connector.ExecuteSQL(SQL) return self.connector.GetAllResults() def FindWordsInColumnsInTableThenBoldAndRank(self, search_words, columns, table, operation="OR"): search_phrase = self.SQL_constructor.ConstructSearchPhrase(search_words, operation) headline_columns = self.SQL_constructor.GetHeadlineColumnsWithPhrase(columns, search_phrase) rank_column = self.SQL_constructor.Getts_rankColumnByPhraseWithLengthPenaltyWithAlias(columns[0], search_phrase, 1, "rank") headline_columns.append(rank_column) vector_query_pairs = [self.SQL_constructor.GetVectorColumnHasQueryPhrase(x, search_phrase) for x in columns] SQL_select = self.SQL_constructor.GetSELECTColumns(headline_columns) SQL_from = self.SQL_constructor.GetFROMTable(table) SQL_where = self.SQL_constructor.GetWHERE() SQL_conditions = " OR".join(vector_query_pairs) SQL_order = self.SQL_constructor.GetORDERByAliasInDirection("rank", "DESC") SQL = SQL_select + SQL_from + SQL_where + SQL_conditions + SQL_order self.connector.ExecuteSQL(SQL) return self.connector.GetAllResults() def FindWordsInDocumentInTableThenBoldAndRank(self, search_words, document, table, operation="OR"): search_phrase = self.SQL_constructor.ConstructSearchPhrase(search_words, operation) document_columns = ["title", "categories", "summary", "description"] headline_columns = self.SQL_constructor.GetHeadlineColumnsWithPhrase(document_columns, search_phrase) rank_column = self.SQL_constructor.Getts_rankDocumentByPhraseWithLengthPenaltyWithAlias(document, search_phrase, 1, "rank") headline_columns.append(rank_column) SQL_select = self.SQL_constructor.GetSELECTColumns(headline_columns) SQL_from = self.SQL_constructor.GetFROMTable(table) SQL_where = self.SQL_constructor.GetWHERE() SQL_conditions = self.SQL_constructor.GetDocumentHasQueryPhrase(document, search_phrase) SQL_order = self.SQL_constructor.GetORDERByAliasInDirection("rank", "DESC") SQL = SQL_select + SQL_from + SQL_where + SQL_conditions + SQL_order self.connector.ExecuteSQL(SQL) return self.connector.GetAllResults() def SuggestBasedOnPhraseInColumnInTable(self, phrase, column, table): similarity_function = self.SQL_constructor.GetSimilarityOfColumnAndStringWithAlias(column, phrase, "sameness") SQL_select = self.SQL_constructor.GetSELECTColumns([column, similarity_function]) SQL_from = self.SQL_constructor.GetFROMTable(table) SQL_order = self.SQL_constructor.GetORDERByAliasInDirection("sameness", "DESC") SQL = SQL_select + SQL_from + SQL_order self.connector.ExecuteSQL(SQL) return self.connector.GetResults(5) def PivotMovies(self, start_datetime, stop_datetime, granulation): iso_start_datetime = start_datetime.isoformat(" ") iso_end_datetime = stop_datetime.isoformat(" ") diff = 
stop_datetime - start_datetime days, seconds = diff.days, diff.seconds hours = days * 24 + seconds // 3600 if granulation == "hour": format = ["time_interval" + str(x) + " bigint" for x in range(hours + 1)] elif granulation == "day": format = ["time_interval" + str(x) + " bigint" for x in range(days + 1)] else: raise Exception("WARNING: incorrect granulation; possible granulation are (hour, day)") format.insert(0, "query character varying(255)") sub_query = ("select query," " date_trunc(''" + granulation + "'', date_and_time) as periods," " count(*)" " from queries" " where date_and_time >= ''" + iso_start_datetime + "'' and" " date_and_time <= ''" + iso_end_datetime + "''" " group by query, periods" " order by query, periods;") sequence = ("select d" " from generate_series(''" + iso_start_datetime + "''::timestamp," "''" + iso_end_datetime + "''," "''1 " + granulation + "'')" " d;") SQL = ("select * from crosstab('" + sub_query + "'," "'" + sequence + "')" "as ct(" + ", ".join(format) + ");") self.connector.ExecuteSQL(SQL) return self.connector.GetAllResults() def InsertIntoMovies(self, title, categories="No categories.", summary="No summary.", description="No description."): table = "movies" columns = ["title", "categories", "summary", "description"] values = [title, categories, summary, description] SQL_insert = self.SQL_constructor.GetINSERTIntoTableColumns(table, columns) SQL_values = self.SQL_constructor.GetVALUES(values) SQL = SQL_insert + SQL_values self.connector.ExecuteSQL(SQL) self.connector.CommitTransaction() return "Insertion successful." def _GetTableColumnsAfterIndex(self, table, index=0): return [x[0] for x in self.DescribeTable(table)][index:] def Prototype(self): SQL = ("SELECT title, similarity (title, 'The Dencing Master')" "FROM movies" "WHERE title % 'The Dencing Master'" ) self.connector.ExecuteSQL(SQL) return self.connector.GetAllResults() def DescribeTable(self, table): SQL = "SELECT column_name FROM information_schema.columns WHERE table_name = '" + table + "'"; self.connector.ExecuteSQL(SQL) return self.connector.GetAllResults() def Close(self): self.connector.Close()
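# Hedged usage sketch for PostgresQuerier above; it assumes the module-level
# CONFIG_FILE_NAME / CONFIG_SECTION_NAME constants point at a valid Postgres
# configuration and that a "movies" table exists, as used in the class methods.
if __name__ == "__main__":
    querier = PostgresQuerier()
    print(querier.GetVersion())
    print(querier.DescribeTable("movies"))
    print(querier.SelectColumnsFromTable(["title"], "movies", limit=5))
    querier.Close()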