Python PostgresConnector.PostgresConnector 예제들, PostgresConnector.PostgresConnector.PostgresConnector Python 예제들

예제 #1

0

파일 보기

 def __init__(self,
              prefix="entity",
              window_size=2,
              entities_only=True,
              port=5436,
              log_file=os.path.join(os.path.dirname(__file__), "logs/SchemaCreator.log"),
              log_level=logging.INFO,
              log_verbose=True
              ):
     """
     Set up the logger, the table-name prefix and the Postgres connector.
     :param prefix: (str) Prefix to the table names; the window size is appended to it.
     :param window_size: (int) Stored on the instance and appended to the prefix as "_<window_size>".
     :param entities_only: (boolean) Stored on the instance; not used further in this constructor.
     :param port: (int) Used to connect to the Postgres tables.
     :param log_file: (os.path) Path to the file containing the logs.
     :param log_level: (logging.LEVEL) Specifies the level to be logged.
     :param log_verbose: (boolean) Specifies whether or not to log to stdout as well.
     """
     self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
     self.window_size = window_size
     # The effective prefix is "<prefix>_<window_size>", e.g. "entity_2" for the defaults.
     self.prefix = prefix + "_" + str(self.window_size)
     self.entities_only = entities_only
     # Must run after self.prefix is set, because the combined prefix is passed in.
     self.names = self.get_names(self.prefix)
     self.port = port
     self.pc = PostgresConnector(port=port)
     self.logger.info("Successfully registered SchemaGenerator.")

예제 #2

0

파일 보기

파일: Pipeline.py 프로젝트: cchaplin/TweetLyze

    def get_dates_location(self, locationid):
        min_max_date_list = []
        try:
            conn = PostgresConnector().get_connection()
            cursor = conn.cursor()
            query = """
			select max(date),min(date) from trends where trend in 
			(select t1.trend as trend from
			(select count(*) as c,trend from trends where 
				locationid = %s group by trend)as t1 order by c desc limit 15)
				and locationid = %s
			"""
            cursor.execute(query, (locationid, locationid))
            min_date_column = 1
            max_date_column = 0
            for row in cursor:
                min_max_date_dict = {}
                min_max_date_dict["min_date"] = str(row[min_date_column])
                min_max_date_dict["max_date"] = str(row[max_date_column])
                min_max_date_list.append(min_max_date_dict)

        except Exception:
            traceback.format_exc()

        return min_max_date_list

예제 #3

0

파일 보기

파일: Pipeline.py 프로젝트: cchaplin/TweetLyze

    def get_trends(self, location_id, start_date, end_date):
        trends_list = []
        try:
            conn = PostgresConnector().get_connection()
            cursor = conn.cursor()
            query = """
			select c,trend from
			(select count(*) as c,trend from trends where 
				locationid = %s and date between %s and %s 
				and id in(select trendid from tweets)
				group by trend)
			as t1 order by c desc limit 15
			"""
            cursor.execute(query, (location_id, start_date, end_date))
            trend_column = 1
            count_column = 0
            for row in cursor:
                trend_count = {}
                trend_count["trend"] = row[trend_column]
                trend_count["count"] = row[count_column]
                trends_list.append(trend_count)

        except Exception as e:
            print e

        return trends_list

예제 #4

0

파일 보기

def create_table(port):
    """
    Create id-numbered copies of the sentences and term_occurrence tables.

    For each source table a "<name>_neo4j" copy is created, an integer "id"
    column is added, and the column is filled from row_number().

    :param port: Port passed to PostgresConnector.
    """
    connector = PostgresConnector(port=port)

    # Statements for the sentence copy, executed in this order.
    sentence_statements = (
        "CREATE TABLE sentences_neo4j AS TABLE sentences;",
        "ALTER TABLE sentences_neo4j ADD COLUMN id int;",
        """WITH numbered (sid, document_id, sentence_id) AS
                            (select row_number() OVER() sid, * from sentences_neo4j)
                            UPDATE sentences_neo4j
                            SET id = numbered.sid
                            FROM numbered
                            WHERE sentences_neo4j.document_id = numbered.document_id AND
                            sentences_neo4j.sentence_id = numbered.sentence_id;""",
    )
    # Statements for the term occurrence copy, executed in this order.
    term_occurrence_statements = (
        "CREATE TABLE term_occurrence_neo4j AS TABLE term_occurrence;",
        "ALTER TABLE term_occurrence_neo4j ADD COLUMN id int;",
        """WITH numbered (sid, document_id, sentence_id, term_id) AS
                            (select row_number() OVER() sid, * from term_occurrence)
                            UPDATE term_occurrence_neo4j
                            SET  id = numbered.sid                                                
                            FROM numbered
                            WHERE term_occurrence_neo4j.document_id = numbered.document_id AND
                            term_occurrence_neo4j.sentence_id = numbered.sentence_id AND
                            term_occurrence_neo4j.term_id = numbered.term_id;""",
    )

    with connector:
        print("Starting with ")
        for statement in sentence_statements:
            connector.cursor.execute(statement)

        print("Starting with term occurrences...")
        for statement in term_occurrence_statements:
            connector.cursor.execute(statement)

예제 #5

0

파일 보기

파일: dqn.py 프로젝트: mmmohsen/stp-database-technologies

def train_model():
    """
    Train a DQN agent on DatabaseIndexesEnv, save its weights and test it.

    The query pull is unpickled from "../query_pull_1000v3.pkl", the agent is
    fitted for 50000 steps, the weights are written to an .h5f file named
    after ENV_NAME, and the agent is finally tested for 5 episodes.
    """
    np.random.seed(123)
    # Only the unpickling needs the file handle, so it is closed before the
    # long-running training starts.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(os.path.join("..", "query_pull_1000v3.pkl"), 'rb') as f:
        query_pull = pickle.load(f)

    connector = PostgresConnector()
    env = DatabaseIndexesEnv(n=COLUMNS_AMOUNT,
                             table_name=table_name,
                             query_pull=query_pull,
                             batch_size=BATCH_SIZE,
                             connector=connector,
                             k=3,
                             max_episodes=1000)

    # Seed the environment so runs are repeatable.
    env.seed(123)

    # Next, we build a very simple model.
    model = build_model()
    print(model.summary())

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics!
    dqn = initialize_agent(model)

    # Okay, now it's time to learn something! Visualization is disabled here. You can always
    # safely abort the training prematurely using Ctrl + C.
    dqn.fit(env, nb_steps=50000, visualize=False, verbose=0, callbacks=[CustomEpisodeLogger()])

    # After training is done, we save the final weights.
    dqn.save_weights('dqn_{}_weights_6_4_2_1_2000_episodes_estimated.h5f'.format(ENV_NAME), overwrite=True)

    # Finally, evaluate our algorithm for 5 episodes.
    dqn.test(env, nb_episodes=5, visualize=False)

예제 #6

0

파일 보기

	def get_data(self,locationid):
		"""
		Group entities by trend for the 15 most frequent trends of a location.

		:param locationid: Location id bound to the query placeholder.
		:return: Dict mapping each trend to the list of entities joined to it;
			empty (or partial) if the query fails.
		"""
		entity_trend_dict = {}
		try:
			conn = PostgresConnector().get_connection()
			cursor = conn.cursor()
			query = """select t1.entity,t2.trend from
						(select id,entity from id_entity ) as t1
						inner join
						(select id,trend from organized_tweets where trend in 
							(select trend from 
							(select count(*) as c,trend from 
								trends where locationid = %s group by trend)as t_in order 
							by c desc limit 15))as t2
						on
						t1.id = t2.id"""
			cursor.execute(query,(locationid,))
			entity_column = 0
			trend_column = 1
			for row in cursor:
				# Fill the dict that is actually returned; the loop used to
				# write to an undefined name, which raised NameError on the
				# first row and left the result empty.
				trend = row[trend_column]
				entity_trend_dict.setdefault(trend, []).append(row[entity_column])

		except Exception:
			print(traceback.format_exc())

		return entity_trend_dict

예제 #7

0

파일 보기

    def build(self):
        """
        Populate the id_entity table from the hashtags stored in organized_tweets.

        Every (tweet id, unique hashtag text, trend) triple is written as a
        tab-separated line to a scratch file, which is bulk-loaded with
        copy_from(), committed, and then deleted.
        """
        connection = PostgresConnector().get_connection()
        db_cursor = connection.cursor()
        db_cursor.execute('select id,entities,trend from organized_tweets')

        with open('copy_from.txt', 'w') as dump:
            for record in db_cursor:
                # Columns follow the select list: id, entities (JSON text), trend.
                tweet_id, raw_entities, trend = record[0], record[1], record[2]
                unique_hashtags = set(
                    [tag["text"] for tag in json.loads(raw_entities)])
                print('Writing data to table for the tweet_id ' + tweet_id)
                for tag_text in unique_hashtags:
                    line = tweet_id + '\t' + tag_text.encode('utf-8') + '\t' + trend + '\n'
                    dump.write(line)

        with open('copy_from.txt') as dump:
            db_cursor.copy_from(dump, 'id_entity', columns=('id', 'entity', 'trend'))
            connection.commit()

        os.remove('copy_from.txt')

예제 #8

0

파일 보기

파일: EntityIdIndexer.py 프로젝트: cchaplin/TweetLyze

    def get_total_documents(self):
        """
        Count the distinct ids stored in the "IdEntity" table.

        :return: Value of the first column of the last row returned by the
                 count query, or 0 if the cursor yields no rows.
        """
        connection = PostgresConnector().get_connection()
        db_cursor = connection.cursor()
        db_cursor.execute('select count(distinct(id)) from "IdEntity" ')

        distinct_ids = 0
        for record in db_cursor:
            distinct_ids = record[0]
        return distinct_ids

예제 #9

0

파일 보기

파일: Pipeline.py 프로젝트: cchaplin/TweetLyze

    def get_locations(self):
        """
        List all rows of the location table.

        :return: List of {"geoid": <id>, "city": <city>} dicts, one per row.
        """
        connection = PostgresConnector().get_connection()
        db_cursor = connection.cursor()
        db_cursor.execute('SELECT id,city from location')
        # Column 0 is the id, column 1 the city (select-list order).
        return [{"geoid": record[0], "city": record[1]} for record in db_cursor]

예제 #10

0

파일 보기

파일: EntityIdIndexer.py 프로젝트: cchaplin/TweetLyze

    def build_tf(self):
        """
        Count, per entity, the ids recorded in the "IdEntity" table.

        :return: Dict mapping each entity to its count(id) from the group-by query.
        """
        connection = PostgresConnector().get_connection()
        db_cursor = connection.cursor()
        db_cursor.execute(
            'select count(id),entity from "IdEntity" group by entity')
        # Each row is (count, entity); the entity becomes the key.
        return {record[1]: record[0] for record in db_cursor}

예제 #11

0

파일 보기

파일: Pipeline.py 프로젝트: cchaplin/TweetLyze

    def get_tfidf(self, locationid, trend):
        tfidf_list = []
        try:
            conn = PostgresConnector().get_connection()
            cursor = conn.cursor()
            tfidf_query = """
			select entity,tf_idf_score from 
				(select t4.entity,sum(t4.tf_idf) as tf_idf_score
				from
				(select t1.id,t1.entity,t2.count_id,t3.count_entity,
				(1.0/t3.count_entity)*log((
				select count(*) from organized_tweets 
				where trend = %s 
				and location_id = %s )/t2.count_id) as tf_idf  from
					(select id,entity from id_entity where id in
					(select id from organized_tweets 
					where trend = %s 
					and location_id = %s)) as t1
				inner join
					(select entity,count(id) as count_id from id_entity where id in
					(select id from organized_tweets 
					where trend = %s 
					and location_id = %s)group by entity) as t2
				on
					t1.entity = t2.entity
				inner join
					(select id,count(entity) as count_entity from id_entity 
					where id in(select id from organized_tweets 
					where trend = %s 
					and location_id = %s )group by id) as t3
				on 
					t1.id = t3.id) as t4 group by entity)as t5 order by 
					tf_idf_score desc limit 100;
			"""
            cursor.execute(tfidf_query, (trend, locationid, trend, locationid,
                                         trend, locationid, trend, locationid))
            entity_column = 0
            tfidf_column = 1
            for row in cursor:
                entity_tfidf_score = {}
                entity_tfidf_score["entity"] = row[entity_column]
                entity_tfidf_score["tfidf"] = row[tfidf_column]
                tfidf_list.append(entity_tfidf_score)

            return tfidf_list
        except Exception:
            print traceback.format_exc()

예제 #12

0

파일 보기

파일: Pipeline.py 프로젝트: cchaplin/TweetLyze

    def get_tweets(self, trend, entity):
        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query_tweets = """

			select text from organized_tweets
			where id in (select id from id_entity where 
			entity = %s) limit 50
		"""
        cursor.execute(query_tweets, (entity, ))
        text_list = []
        for row in cursor:
            text_dict = {}
            text_dict["name"] = row[0]
            text_list.append(text_dict)

        return text_list

예제 #13

0

파일 보기

파일: test_dqn_specific.py 프로젝트: mmmohsen/stp-database-technologies

 def test_dqn_against_heuristic(self):
     """
     Run a stored DQN agent for one episode and print it next to the heuristic.

     Prints the agent's test results, the final environment state and the
     output of predict_on_workload() for a randomly drawn workload.
     """
     np.random.seed(123)
     pull_path = path.join("..", "query_pull_1000v3.pkl")
     with open(pull_path, 'rb') as pull_file:
         # Only the first five queries of the pull are used.
         query_pull = pickle.load(pull_file)[0:5]
         workload = np.random.choice(query_pull, const.BATCH_SIZE)
         env_options = dict(n=const.COLUMNS_AMOUNT,
                            table_name=table_name,
                            query_pull=query_pull,
                            batch_size=const.BATCH_SIZE,
                            connector=PostgresConnector(),
                            k=3,
                            max_episodes=1)
         env = DatabaseIndexesEnv(**env_options)
         weights_path = path.join("..", "dqn_specific_{}.h5f".format(ENV_NAME))
         dqn = load_agent(weights_path)
         results = dqn.test(env, nb_episodes=1)
         print(results)
         print(env.state)
         print(predict_on_workload(workload))

예제 #14

0

파일 보기

파일: test_databaseIndexesEnv.py 프로젝트: mmmohsen/stp-database-technologies

 def test_cache(self):
     """
     Register DatabaseIndexesEnv, take three steps and print the env cache.
     """
     import os

     np.random.seed(123)
     # Build the path portably; the former literal "..\query_pull_1000v3.pkl"
     # only worked on Windows and contained the invalid escape sequence "\q".
     pull_path = os.path.join("..", "query_pull_1000v3.pkl")
     with open(pull_path, 'rb') as f:
         query_pull = pickle.load(f)
         register(
             id='DatabaseIndexesEnv-v0',
             entry_point='dbenv:DatabaseIndexesEnv',
             kwargs={'n': const.COLUMNS_AMOUNT,
                     'table_name': "test_table",
                     'query_pull': query_pull,
                     'batch_size': 2,
                     'connector': PostgresConnector(),
                     'k': 3,
                     'max_episodes': 1}
         )
         env = gym.make('DatabaseIndexesEnv-v0')
         # Take actions 0, 1 and 2 before printing the cache.
         env.step(0)
         env.step(1)
         env.step(2)
         print(env.cache)

예제 #15

0

파일 보기

    def get_sentiments(self):

        conn = PostgresConnector().get_connection()
        cursor = conn.cursor()
        query = """
		select id,text from organized_tweets 
		"""
        cursor.execute(query)
        id_column = 0
        text_column = 1
        with open("sentiments.tsv", "w") as f:
            for row in cursor:
                text = row[text_column]
                blob = TextBlob(text, analyzer=NaiveBayesAnalyzer())
                print 'writing for tweet with id ' + str(row[id_column])

                f.write(
                    str(row[id_column]) + '\t' +
                    str(blob.sentiment.classification) + '\t' +
                    str(blob.sentiment.p_pos) + '\t' +
                    str(blob.sentiment.p_neg) + '\n')

예제 #16

0

파일 보기

파일: dqn_specific.py 프로젝트: mmmohsen/stp-database-technologies

def train_model():
    """
    Train a DQN agent on the first five queries of the pull and save its weights.

    Registers DatabaseIndexesEnv under ENV_NAME, fits the agent for `episodes`
    steps, writes the weights to 'dqn_specific_<ENV_NAME>.h5f' and finally
    tests the agent for 5 episodes.
    """
    np.random.seed(123)
    # Only the unpickling needs the file handle, so it is closed before the
    # long-running training starts.
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open("query_pull_1000v2.pkl", 'rb') as f:
        query_pull = pickle.load(f)[0:5]

    register(id=ENV_NAME,
             entry_point='dbenv:DatabaseIndexesEnv',
             kwargs={
                 'n': COLUMNS_AMOUNT,
                 'table_name': table_name,
                 'query_pull': query_pull,
                 'batch_size': BATCH_SIZE,
                 'connector': PostgresConnector(),
                 'k': 3,
                 'max_episodes': episodes
             })

    # Create the registered environment and seed it for repeatability.
    env = gym.make(ENV_NAME)
    env.seed(123)

    # Next, we build a very simple model.
    model = build_model()
    print(model.summary())

    # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and
    # even the metrics!
    dqn = initialize_agent(model)

    # Okay, now it's time to learn something! Visualization is disabled here. You can always
    # safely abort the training prematurely using Ctrl + C.
    dqn.fit(env, nb_steps=episodes, visualize=False, verbose=2)

    # After training is done, we save the final weights.
    dqn.save_weights('dqn_specific_{}.h5f'.format(ENV_NAME),
                     overwrite=True)

    # Finally, evaluate our algorithm for 5 episodes.
    dqn.test(env, nb_episodes=5, visualize=False)

예제 #17

0

파일 보기

파일: EntityIdIndexer.py 프로젝트: cchaplin/TweetLyze

 def build(self):
     """
     Build an inverted index from hashtag text to the ids of the tweets using it.

     :return: Dict mapping each hashtag text to the list of tweet ids in which
              it occurs (one entry per occurrence, in cursor order).
     """
     connection = PostgresConnector().get_connection()
     db_cursor = connection.cursor()
     db_cursor.execute('select id,hashtags from "organizedTweets" ')

     tweets_by_entity = {}
     for record in db_cursor:
         # Column 0 is the tweet id, column 1 the list of hashtag objects.
         tweet_id = record[0]
         hashtag_texts = [tag['text'] for tag in record[1]]
         for entity in hashtag_texts:
             tweets_by_entity.setdefault(entity, []).append(tweet_id)
     return tweets_by_entity

예제 #18

0

파일 보기

파일: Pipeline.py 프로젝트: cchaplin/TweetLyze

    def update_organized_tweets(self):
        tweet_id_dict = {}

        try:
            conn = PostgresConnector().get_connection()
            cursor = conn.cursor()
            query_location = 'select id from location'
            cursor.execute(query_location)
            location_column = 0

            for row_location in cursor:

                query = """
				select id,trend from trends 
				where trend in(select trend from (select count(*) as c,trend from 
					trends where locationid = %s group by trend)as t1 order 
					by c desc limit 80)
						"""
                cursor = conn.cursor()
                location_id = row_location[location_column]
                cursor.execute(query, (location_id, ))
                trend_id_column = 0
                trend_name_column = 1
                trend_count = 0

                for row in cursor:
                    trend_count = trend_count + 1
                    trend_id = row[trend_id_column]
                    trend_name = row[trend_name_column]
                    print 'Processing for trend ' + trend_id + ' , ' + str(
                        trend_count)
                    query_tweets = 'select tweets from tweets where trendId = \'' + str(
                        trend_id) + '\''
                    cursor_tweets = conn.cursor()
                    cursor_tweets.execute(query_tweets)
                    tweets_column = 0

                    with open(trend_name + '.txt', 'w') as f:

                        # rows of tweets array
                        for tweets_row in cursor_tweets:
                            tweets_json_array = tweets_row[tweets_column]

                            # tweets in a tweets array
                            for json_in in tweets_json_array:

                                id = json_in['id']
                                tweet_id_exists = tweet_id_dict.get(id)

                                if tweet_id_exists is None:
                                    #print jsonIn
                                    tweet_id_dict[id] = 1
                                    geo = 'none' if json_in[
                                        'geo'] is None else 'none'  #json['geo']
                                    retweeted = json_in['retweeted']
                                    in_reply_to_screen_name = 'none' if json_in[
                                        'in_reply_to_screen_name'] is None else json_in[
                                            'in_reply_to_screen_name']
                                    truncated = 'none' if json_in[
                                        'truncated'] is None else json_in[
                                            'truncated']
                                    source = json_in['source']
                                    created_at = json_in['created_at']
                                    place = 'none' if json_in[
                                        'place'] is None else 'none'  #json['place']
                                    user_id = json_in['user']['id']
                                    text = json_in['text'].strip()
                                    #text = " ".join(str(text).split())
                                    text = str(
                                        filter(lambda x: x in string.printable,
                                               text))
                                    #text = text.encode('utf-16')
                                    text = re.sub('\s+', ' ', text)
                                    text = text.replace('\\', '')
                                    entities = json_in['entities']['hashtags']
                                    user_mentions = json_in['entities'][
                                        'user_mentions']
                                    user_mentions = []
                                    retweet_count = json_in['retweet_count']
                                    favorite_count = json_in['favorite_count']

                                    # if len(entities) > 0:
                                    # 	for entity in entities:
                                    # 		for k,v in entity.items():
                                    # 			if k in 'text':
                                    # 				entity_list = {}
                                    # 				new_v = entity[k]
                                    # 				new_v = str(new_v.encode('utf-8'))
                                    # 				new_v = filter(lambda x: x in string.printable,new_v)
                                    # 				#print id,check,new_v,len(new_v)
                                    # 				if len(new_v) > 0:
                                    # 					entity[k] = new_v
                                    # 				else:
                                    # 					entity[k] = ''

                                    #print id,geo,retweeted ,in_reply_to_screen_name ,truncated ,source ,created_at ,place ,user_id ,text ,entities ,user_mentions,retweet_count,favorite_count
                                    f.write(
                                        str(id) + '\t' + str(geo) + '\t' +
                                        str(retweeted) + '\t' + str(
                                            in_reply_to_screen_name.encode(
                                                'utf-8')) + '\t' +
                                        str(truncated) + '\t' +
                                        str(source.encode('utf-8')) + '\t' +
                                        str(created_at.encode('utf-8')) +
                                        '\t' + str(place) + '\t' +
                                        str(user_id) + '\t' + text + '\t' +
                                        str(json.dumps(entities)) + '\t' +
                                        str(user_mentions) + '\t' +
                                        str(retweet_count) + '\t' +
                                        str(favorite_count) + '\t' +
                                        str(trend_name) + '\t' +
                                        str(location_id) + '\n')

                                else:
                                    continue

                                # array of tweets json ends here
                                #break

                            # total number of tweets rows for a given trend ends here
                            #break

                    print 'Writing to table'

                    with open(trend_name + '.txt') as f:
                        cursor_write = conn.cursor()
                        cursor_write.copy_from(
                            f,
                            'organized_tweets',
                            columns=('id', 'geo', 'retweeted',
                                     'in_reply_to_screen_name', 'truncated',
                                     'source', 'created_at', 'place',
                                     'user_id', 'text', 'entities',
                                     'user_mentions', 'retweet_count',
                                     'favorite_count', 'trend', 'location_id'))

                    conn.commit()
                    os.remove(trend_name + '.txt')

                    # all trends finish here
                    #break

        except Exception:
            print traceback.format_exc()

예제 #19

0

파일 보기

파일: HyperedgeGenerator.py 프로젝트: dennlinger/hypergraph-document-store

    def __init__(self,
                 window_size=2,
                 limit_edges=False,
                 entities_only=False,
                 document_table_name="documents",
                 sentence_table_name="sentences",
                 entity_table_name="entities",
                 term_table_name="terms",
                 term_occurrence_table_name="term_occurrence",
                 hyperedge_table_name="hyperedges",
                 hyperedge_format=("edge_id", "term_id", "pos"),
                 hyperedge_document_table_name="hyperedge_document",
                 hyperedge_document_format=("edge_id", "document_id"),
                 hyperedge_sentence_table_name="hyperedge_sentences",
                 hyperedge_sentence_format=("edge_id", "document_id",
                                            "sentence_id", "pos"),
                 database="postgres",
                 user="******",
                 password="******",
                 host="127.0.0.1",
                 port=5435,
                 log_file=os.path.join(os.path.dirname(__file__),
                                       "logs/HyperedgeGenerator.log"),
                 log_level=logging.INFO,
                 log_verbose=True):
        """
        Initializes hyper edge generator class.
        :param window_size: (int) Number of sentences in each direction that will determine the context window size
               of the algorithm.
        :param limit_edges: (boolean) Experimental: Should limit the maximum number of terms per hyperedge. This would
               only be useful in context with other theoretical results.
        :param entities_only: (boolean) Indicating whether or not we should only take into account entity terms,
               and not the entirety of all term occurrences for the edges.
        :param document_table_name: (str) Name of the table where documents are stored.
        :param sentence_table_name: (str) Name of the table containing the sentences and their content.
        :param entity_table_name: (str) Name of the table containing the entity information and their properties.
        :param term_table_name: (str) Name of the table containing the terms and meta data.
        :param term_occurrence_table_name: (str) Name of the table containing term occurrence data.
        :param hyperedge_table_name: (str) Name of the table containing the general hyper edge information.
        :param hyperedge_format: (iterable of str) Column names of the hyper edge table; joined with ", ".
        :param hyperedge_document_table_name: (str) Name of the table containing the document classification.
        :param hyperedge_document_format: (iterable of str) Column names of the hyper edge document table;
               joined with ", ".
        :param hyperedge_sentence_table_name: (str) Name of the table containing the hyper edge sentence data.
        :param hyperedge_sentence_format: (iterable of str) Column names of the hyper edge sentence table;
               joined with ",".
        :param database: (str) database name.
        :param user: (str) User name to get access to the Postgres database.
        :param password: (str) Corresponding user password.
        :param host: (IP) IP address (in string format) for the host of the postgres database.
        :param port: (integer) Port at which to access the database.
        :param log_file: (os.path) Path to the file containing the logs.
        :param log_level: (logging.LEVEL) Specifies the level to be logged.
        :param log_verbose: (boolean) Specifies whether or not to log to stdout as well.
        """

        self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
        self.logger.info(
            "Successfully registered logger to HyperedgeGenerator.")

        # important for hyperedges
        self.window_size = window_size
        self.limit_edges = limit_edges
        self.entities_only = entities_only

        # table names
        self.document_table_name = document_table_name
        self.sentence_table_name = sentence_table_name
        self.entity_table_name = entity_table_name
        self.term_table_name = term_table_name
        self.term_occurrence_table_name = term_occurrence_table_name
        self.hyperedge_table_name = hyperedge_table_name
        self.hyperedge_document_table_name = hyperedge_document_table_name
        self.hyperedge_sentence_table_name = hyperedge_sentence_table_name

        # column lists as comma-separated strings
        self.hyperedge_format = ", ".join(hyperedge_format)
        self.hyperedge_document_format = ", ".join(hyperedge_document_format)
        self.hyperedge_sentence_format = ",".join(hyperedge_sentence_format)

        self.pc = PostgresConnector(database, user, password, host, port)
        self.logger.info(
            "Successfully registered PostgresConnector to HyperedgeGenerator.")

        self.hyperedge = []
        self.hyperedge_sentence = []
        self.hyperedge_document = []
        self.all_hyperedges = []
        self.all_hyperedge_sentences = []

        # Set up the "hyper edge ID counter". It defaults to 1 so the attribute
        # exists even when the hyperedge table is missing.
        self.hyperedge_ID = 1
        with self.pc as open_pc:
            if not check_table_existence(self.logger, open_pc,
                                         self.hyperedge_table_name):
                # __init__ must return None; returning 0 here raised TypeError
                # whenever the table did not exist.
                return

            self.logger.info("Retrieving current hyper edge ID key...")
            open_pc.cursor.execute(
                "SELECT COUNT(DISTINCT h.edge_id) FROM {} as h".format(
                    self.hyperedge_table_name))
            # Either start with 1 or continue from the number of distinct edge ids.
            # NOTE(review): this uses the distinct count, not MAX(edge_id) -
            # confirm that edge ids are consecutive.
            self.hyperedge_ID = max(1, open_pc.cursor.fetchone()[0])

예제 #20

0

파일 보기

    # assert Q_table_c.Q_table[state_to_int(state)], "This state has no corresponding action: %r" % state_to_int(state)
    max_reward = float('-inf')
    int_state = state_to_int(state)
    actions_rewards_dict = Q_table_c.Q_table[int_state]
    for key, val in actions_rewards_dict.items():
        if val > max_reward:
            max_reward = val
            max_key = key
    return max_key, max_reward


# Register the environment under the id that gym.make() is called with below.
# This runs at import time, so the PostgresConnector() instance and the empty
# query batch in kwargs are created once, when the module is loaded.
register(
    id='DatabaseIndexesEnv-v0',
    entry_point='dbenvm:DatabaseIndexesEnv',
    kwargs={'n': len(table_column_names), 'table_name': table_name, 'query_batch': list(),
            'connector': PostgresConnector(), 'k': 3}
)


def get_indexes_qagent(index_amount, queries, Log=False):
    """
    Prepare the table and create the 'DatabaseIndexesEnv-v0' gym environment.

    NOTE(review): this excerpt ends right after gym.make(); index_amount,
    queries and Log are not used in the lines shown here.
    """
    connector = PostgresConnector()
    # Create and load the table only when it does not exist yet.
    if not table_exists(connector, table_name):
        create_table_2(connector)
        load_table(connector)

    #   make results repeatable
    np.random.seed(123)
    # gym configuration
    query_batch = list()

    env = gym.make('DatabaseIndexesEnv-v0')

예제 #21

0

파일 보기

def get_indexes_qagent(index_amount, queries, Log=False):
    """Run the table-based Q-learning agent and return the actions of its last episode.

    Relies on module-level names defined elsewhere in the file (``table_name``,
    ``num_queries_batch``, ``NUM_EPISODES``, ``min_exp_rate``, ``min_lr``,
    ``GAMMA``, ``Q_table_c`` and the state helper functions).

    :param index_amount: Passed to ``env.set_indices_num``.
    :param queries: Indexable collection whose items provide ``'query'`` and
           ``'sf_array'`` entries; the first ``num_queries_batch`` items are used.
    :param Log: When truthy, progress and a per-episode summary are printed.
    :return: (list) The actions taken during the final episode.
    """
    connector = PostgresConnector()
    # Create and load the table only when it does not exist yet.
    if not table_exists(connector, table_name):
        create_table_2(connector)
        load_table(connector)

    #   make results repeatable
    np.random.seed(123)
    # gym configuration
    query_batch = list()

    env = gym.make('DatabaseIndexesEnv-v0')
    env.set_indices_num(index_amount)

    current_query_idx = 0
    query_batch = list()
    # Only a single workload is processed; the function returns at the end of
    # the first pass through this loop.
    for workload in range(1):

        exploration_rate = 1.0  # represents the exploration rate to be decayed by the time
        initial_lr = 1.0  # Learning rate
        query_batch = list()
        # The Q table is reset for every workload.
        Q_table_c.Q_table = {}
        query_batch = list()
        workload_selectivity_l = list()
        # 1. generate the queries per workload
        # 2. generate the cumulative selectivity per workload
        start = timer()
        for i in range(current_query_idx, current_query_idx + num_queries_batch):
            query_batch.append(queries[i]['query'])
            workload_selectivity_l.append(list(map(lambda x: x, queries[i]['sf_array'])))
        current_query_idx += num_queries_batch
        # Element-wise product of the per-query 'sf_array' values.
        workload_selectivity = np.prod(workload_selectivity_l, axis=0).tolist()
        max_workload_selectivity = max(workload_selectivity)
        env.set_query_batch(query_batch)
        actions_taken = list()
        # as a heuristic: the indices with the lowest selectivity
        selectivity_indices = heapq.nsmallest(3, range(len(workload_selectivity)), workload_selectivity.__getitem__)
        if Log:
            print("Entering the q learning ..... the process can take time.")
            print(workload_selectivity)
        env.clear_cache()
        for episode in range(NUM_EPISODES):
            state = env.reset()
            actions_taken = list()
            # decay the exploration as the number of episodes grows, the Q table becomes more mature
            eps = exploration_rate / np.sqrt(episode + 1)
            eps = max(eps, min_exp_rate)
            episode_total_reward = 0
            episode_total_qreward = 0
            episode_strategy = []
            # Learning rate shrinks by a factor of 0.85 every 100 episodes, bounded below by min_lr.
            eta = max(min_lr, initial_lr * (0.85 ** (episode // 100)))
            ## now the learning comes
            # Each episode consists of exactly three steps.
            for kk in range(3):
                # do exploration, i.e., choose a random actions
                # make sure the last step is exploitation unless the state is new
                if episode == 0:
                    # First episode: follow the lowest-selectivity heuristic and
                    # create the Q-table entry for the visited state.
                    episode_strategy.append("explore")
                    action = selectivity_indices[kk]
                    Q_table_c.Q_table[state_to_int(state)] = {}
                    Q_table_c.Q_table[state_to_int(state)][action] = 0
                elif (is_new_state(state) or (np.random.uniform(0, 1) < eps)) and episode != NUM_EPISODES - 1:
                    episode_strategy.append("explore")
                    # generate only actions that matches something with selectivity.
                    action = env.action_space.sample()
                    # high selectivity, not a good option for an index
                    # NOTE(review): this resamples until an action below the maximum
                    # selectivity is drawn; it would not terminate if every action
                    # has the maximum selectivity.
                    while workload_selectivity[action] >= max_workload_selectivity:
                        action = env.action_space.sample()
                    if is_new_state(state):
                        Q_table_c.Q_table[state_to_int(state)] = {}
                    if action not in Q_table_c.Q_table[state_to_int(state)]:
                        Q_table_c.Q_table[state_to_int(state)][action] = 0
                else:
                    # else exploit choose the maximum value from the Q table
                    # NOTE(review): in the final episode this branch is taken even
                    # when the state is new; confirm the Q table has an entry then.
                    episode_strategy.append("exploit")
                    action = get_action_maximum_reward(state)[0]

                actions_taken.append(action)
                state_old_int = state_to_int(state)
                state_new, reward, done, _ = env.step(action)
                episode_total_reward += reward
                next_action = 0
                next_action_q_value = 0

                if is_new_state(state_new):
                    # Unknown successor state: its Q value is taken as 0.
                    # NOTE(review): same termination caveat as the sampling loop above.
                    next_action = env.action_space.sample()
                    while (action == next_action or workload_selectivity[next_action] >= max_workload_selectivity):
                        next_action = env.action_space.sample()
                    next_action_q_value = 0

                else:
                    next_action, next_action_q_value = get_action_maximum_reward(state_new)

                # Move Q(old state, action) towards reward + GAMMA * Q(next), scaled by eta.
                Q_table_c.Q_table[state_old_int][action] += eta * (reward + GAMMA * next_action_q_value -
                                                                   Q_table_c.Q_table[state_old_int][action])
                episode_total_qreward += Q_table_c.Q_table[state_old_int][action]
                # `action` is reassigned at the top of the next step, so only `state` carries over.
                state, action = state_new, next_action
            actions_taken_s = ','.join(str(e) for e in actions_taken)
            if Log:
                print(
                    "episode num = '{0}', episode_total_immediate_rewards = '{1}', episode_total_reward = '{2}', current_state = '{3}', actions_taken = '{4}', strategy = {5}"
                        .format(episode, float(episode_total_reward), float(episode_total_qreward),
                                state_to_string(state), actions_taken_s,
                                episode_strategy))

        # `actions_taken` is reset at the start of every episode, so this is the last episode's list.
        return actions_taken

예제 #22

0

파일 보기

    result = []
    print("Number of empty hyperedges:")
    for i, prefix in enumerate(prefixes):
        with pc as open_pc:
            table = prefix + "hyperedges"

            open_pc.cursor.execute(
                "SELECT (SELECT MAX(edge_id) from {}) - "
                "(SELECT count(distinct edge_id) from {}) as diff".format(
                    table, table))

            result.append(open_pc.cursor.fetchall()[0][0])
        print("Results for {}: {}".format(table, result[i]))


if __name__ == "__main__":
    prefixes = ["", "entity_"]
    pc = PostgresConnector(port=5435)
    # Every report takes the same (prefixes, connector) arguments and is
    # preceded by an empty line in the output.
    reports = (
        get_document_table_length,
        get_sentence_table_length,
        get_hyperedge_table_length,
        analyze_edge_size,
        analyze_term_frequency,
        get_number_of_empty_edges,
    )
    for report in reports:
        print()
        report(prefixes, pc)

예제 #23

0

파일 보기

파일: PostgresQuerier.py 프로젝트: MislavJaksic/College-Labs

 def __init__(self):
   """Create the PostgresConnector and PostgresSQLConstructor used by this object.

   The connector is built from the module-level CONFIG_FILE_NAME and
   CONFIG_SECTION_NAME values.
   """
   self.connector = PostgresConnector(CONFIG_FILE_NAME, CONFIG_SECTION_NAME)
   self.SQL_constructor = PostgresSQLConstructor()

예제 #24

0

파일 보기

파일: TermGenerator.py 프로젝트: dennlinger/hypergraph-document-store

    def __init__(self,
                 num_distinct_documents=5000,
                 replace_entities=True,
                 max_term_length=127,
                 remove_stopwords=True,
                 custom_stopwords=[
                     ',', '.', '-', '\xa0', '“', '”', '"', '\n', '—', ':', '?',
                     'I', '(', ')'
                 ],
                 analyze=False,
                 document_tabe_name="documents",
                 sentence_table_name="sentences",
                 sentence_fields=OrderedDict({
                     "doc_id": "document_id",
                     "sen_id": "sentence_id",
                     "content": "sentence_text"
                 }),
                 term_table_name="terms",
                 term_sql_format=("term_id", "term_text", "is_entity"),
                 term_occurrence_table_name="term_occurrence",
                 term_occurrence_sql_format=("document_id", "sentence_id",
                                             "term_id"),
                 entity_table_name="entities",
                 entity_sql_format=("entity_id", "entity_type"),
                 database="postgres",
                 user="******",
                 password="******",
                 host="127.0.0.1",
                 port=5435,
                 log_file=os.path.join(os.path.dirname(__file__),
                                       "logs/TermGenerator.log"),
                 log_level=logging.INFO,
                 log_verbose=True):
        """
        Initializes various parameters, registers the logger, a MongoConnector and a PostgresConnector,
        and loads the (optionally limited) list of document IDs from Postgres.
        :param num_distinct_documents: (int) The number of distinct documents retrieved from the queries.
               For performance reasons, this should be limited during debugging/development.
               0 (Zero) represents no limit, in accordance with the MongoDB standard for .limit().
        :param replace_entities: (boolean) Whether or not the entities in the text should be replaced/recognised.
               The reason for this is that single terms might be merged together to one term, i.e. first and last name:
               "Dennis" "Aumiller" would be two separate terms in the traditional splitting (replace_entities=False),
               whereas - if set to true - "Dennis Aumiller" would represent only one entity.
        :param max_term_length: (int) Indicator of how long the terms are supposed to be (varchar property in table).
        :param remove_stopwords: (boolean) Determines whether or not stop words are removed. Currently, we are still
               deciding on the final set, but likely either one (or both) of NLTK and SpaCy's stop word lists.
        :param custom_stopwords: (list of strings) Additional words that will not be considered at adding-time.
               The list is only read here, never modified.
        :param analyze: (boolean) Whether or not to include analytically relevant metrics.
        :param document_tabe_name: (str) Name of the table where the document information is stored.
               (The misspelled parameter name is kept for backward compatibility.)
        :param sentence_table_name: (str) Name of the table where the sentence information will be stored.
        :param sentence_fields: (OrderedDict) Structure of input to output values from MongoDB to postgres for the
               sentence table and its fields.
        :param term_table_name: (str) Name of the Postgres tables for the terms.
        :param term_sql_format: (tuple) Since those are generated locally, only a tuple of the PostgresColumns suffices.
        :param term_occurrence_table_name: (str) Name of the Postgres table for the term occurrences
        :param term_occurrence_sql_format: (tuple) Same as term_sql_format, but for the term occurrences.
        :param entity_table_name: (str) (Not implemented yet) Name of the table for the entity meta information.
        :param entity_sql_format: (tuple) Same as term_sql_format, but for entities.
        :param database: (str) database name.
        :param user: (str) User name to get access to the Postgres database.
        :param password: (str) Corresponding user password.
        :param host: (IP) IP address (in string format) for the host of the postgres database.
        :param port: (integer) Port at which to access the database.
        :param log_file: (os.path) Path to the file containing the logs.
        :param log_level: (logging.LEVEL) Specifies the level to be logged.
        :param log_verbose: (boolean) Forwarded to set_up_logger together with log_file and log_level.
        """
        # set up logger
        self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
        self.logger.info("Successfully registered logger to TermGenerator.")

        # register a MongoConnector
        self.mc = MongoConnector()
        self.logger.info(
            "Successfully registered MongoConnector to TermGenerator.")

        # PostgresConnector
        self.pc = PostgresConnector(database, user, password, host, port)
        # This message used to name DocumentGenerator (copy-paste slip), which
        # made the log misleading.
        self.logger.info(
            "Successfully registered PostgresConnector to TermGenerator.")

        self.num_distinct_documents = num_distinct_documents
        # do this earlier since we need it already for the distinct documents.
        self.document_table_name = document_tabe_name
        # get the distinct IDs for the documents so we can match against them later
        # since we have removed parts of the document collection, we have to make sure to get this from Postgres.
        self.logger.info("Parsing relevant documents from Postgres...")
        with self.pc as open_pc:
            # The table name is an identifier and therefore formatted into the
            # statement; it is expected to come from trusted configuration.
            open_pc.cursor.execute("SELECT document_id FROM {}".format(
                self.document_table_name))
            # each fetched row is a tuple; keep only its first column.
            self.first_distinct_documents = [
                row[0] for row in open_pc.cursor.fetchall()
            ]
            self.logger.info("Retrieved all relevant documents from Postgres.")

        # additionally restrict if we want only a number of documents.
        if self.num_distinct_documents != 0:
            self.logger.info(
                "Non-zero limit detected. Limiting to the first N entries.")
            self.first_distinct_documents = self.first_distinct_documents[
                :self.num_distinct_documents]

        self.replace_entities = replace_entities
        self.analyze = analyze

        self.max_term_length = max_term_length

        # NOTE(review): the "en" shortcut depends on the installed spaCy
        # version/model links - confirm it still resolves in the target environment.
        self.nlp = spacy.load("en")

        # construct dictionary with the entries per document/sentence id pair. Thus, we can later check whether
        # there are any entities in the current sentence with higher efficiency.
        self.occurrence_dict = {}
        self.occurring_entities = []

        # start building the term dictionary/set, as well as an occurrence map. Since terms will be "post-processed",
        # it is first created as a list and later cast to Counter and set.
        self.terms = []  # cast into a set later on.
        self.term_in_sentence = set()
        self.term_id = {}
        self.term_is_entity = {}
        # the counters only exist when analysis is requested.
        if self.analyze:
            self.term_count = Counter()
            self.entity_count = Counter()

        self.entities = []
        self.sentences = []
        self.processed_sentences = []

        # Postgres tables. Missing field specifications are only logged, not rejected.
        if not sentence_fields:
            self.logger.error("No sentence fields specified!")
        self.sentence_table_name = sentence_table_name
        self.sentence_fields = sentence_fields
        if not term_sql_format:
            self.logger.error("No term fields specified!")
        self.term_table_name = term_table_name
        self.term_sql_format = ", ".join(term_sql_format)
        if not term_occurrence_sql_format:
            self.logger.error("No term occurrence fields specified!")
        self.term_occurrence_table_name = term_occurrence_table_name
        self.term_occurrence_sql_format = ", ".join(term_occurrence_sql_format)
        if not entity_sql_format:
            self.logger.error("No entity fields specified!")
        self.entity_table_name = entity_table_name
        self.entity_sql_format = ", ".join(entity_sql_format)

        # value retrieving parse: request every configured source field.
        self.sentence_values_to_retrieve = {
            key: 1
            for key in self.sentence_fields.keys()
        }
        # suppress _id if not present:
        if "_id" not in self.sentence_values_to_retrieve:
            self.sentence_values_to_retrieve["_id"] = 0
        self.sentence_sql_format = ", ".join(self.sentence_fields.values())

        # create union of stop words, and add potentially custom stop words
        self.remove_stopwords = remove_stopwords
        self.removed_counter = 0
        # union() returns a new set, so neither STOP_WORDS nor the default
        # custom_stopwords list is modified below.
        self.stopwords = STOP_WORDS.union(set(stopwords.words("english")))
        # add custom stopwords.
        self.stopwords.update(custom_stopwords)

        self.logger.info("Successfully initialized TermGenerator.")

예제 #25

0

파일 보기

파일: main.py 프로젝트: mmmohsen/stp-database-technologies

def run_qlearning():
    """Create a PostgresConnector and build the query pool for the Q-learning run.

    NOTE(review): only the setup is visible in this excerpt; ``query_pull`` is
    not used in the lines shown.
    """
    connector = PostgresConnector()
    # presumably '.query_pull' is a cache file name and [4, 6] the participating
    # column counts - TODO confirm against generate_query_pull.
    query_pull = generate_query_pull('.query_pull', queries_amount, [4, 6],
                                     table_column_types, table_column_names,
                                     table_name, connector)

예제 #26

0

파일 보기

    def __test_results(self, columns_participating):
        """Compare index-selection methods on freshly generated query pools.

        In every round a query pool is generated, each method ('heuristic',
        'qlearning', 'dqn', 'supervised', 'random') proposes a set of indexes,
        and the summed estimated execution time of the pool under those
        indexes is recorded. Pairwise t-test p-values and the running mean per
        method are printed; the loop ends after five rounds, and the final
        means are printed once more.

        :param columns_participating: Forwarded to generate_query_pull and
               embedded in the name passed as its first argument.
        """
        # np.warnings was only the stdlib module re-exported by NumPy and is
        # not available in newer NumPy releases, so the stdlib module is used
        # directly (local import keeps this block self-contained).
        import warnings

        def get_execution_time_for_indexes_configuration(indexes):
            """Apply `indexes`, sum the estimated time of all queries, then drop the indexes."""
            for index in indexes:
                add_index(connector, index, table_name)
            total_time = 0
            for query in queries:
                total_time += get_estimated_execution_time(
                    connector, query['query'])
            drop_indexes(connector, table_name)
            return total_time

        def add_execution_time_for_method_and_indexes_configuration(
                method, indexes):
            """Append the measured time for `indexes` to the results of `method`."""
            # Measure first so a failing measurement leaves `methods` untouched.
            elapsed = get_execution_time_for_indexes_configuration(indexes)
            methods.setdefault(method, []).append(elapsed)

        def get_indexes_dqn():
            """Run the stored DQN agent for one episode and return the selected column positions."""
            env = DatabaseIndexesEnv(n=const.COLUMNS_AMOUNT,
                                     table_name=table_name,
                                     query_pull=queries,
                                     batch_size=const.BATCH_SIZE,
                                     connector=connector,
                                     k=3,
                                     max_episodes=1)
            dqn = load_agent(
                path.join(
                    "..",
                    "dqn_{}_weights_6_4_2_1_50000_episodes_estimated.h5f".
                    format(ENV_NAME)))
            dqn.test(env, nb_episodes=1)
            # positions whose entry in the final environment state is truthy.
            return [position for position, flag in enumerate(env.state) if flag]

        connector = PostgresConnector()
        drop_indexes(connector, table_name)
        methods = {}
        i = 0
        warnings.filterwarnings('ignore')
        while True:
            queries = generate_query_pull(
                '../.test_query_pull_' + str(columns_participating) + '_' +
                str(i), self.__queries_amount, columns_participating,
                table_column_types, table_column_names, table_name, connector)
            i += 1
            sf_array = np.array([query['sf_array']
                                 for query in queries]).sum(axis=0)

            # heuristic: the columns with the smallest summed 'sf_array' value.
            ranked_columns = sorted(enumerate(sf_array),
                                    key=lambda pair: pair[1])
            indexes_to_add = [
                column for column, _ in ranked_columns[:self.__index_amount]
            ]
            add_execution_time_for_method_and_indexes_configuration(
                'heuristic', indexes_to_add)

            indexes_to_add = get_indexes_qagent(self.__index_amount, queries,
                                                True)
            add_execution_time_for_method_and_indexes_configuration(
                'qlearning', indexes_to_add)
            # #extra clean up to make sure no indices left from the agent
            drop_indexes(connector, table_name)

            # dqn
            indexes_to_add = get_indexes_dqn()
            drop_indexes(connector, table_name)
            add_execution_time_for_method_and_indexes_configuration(
                'dqn', indexes_to_add)
            drop_indexes(connector, table_name)

            indexes_to_add = get_indexes_supervised(self.__index_amount,
                                                    queries)
            add_execution_time_for_method_and_indexes_configuration(
                'supervised', indexes_to_add)
            drop_indexes(connector, table_name)

            indexes_to_add = random.sample(range(COLUMNS_AMOUNT),
                                           self.__index_amount)
            add_execution_time_for_method_and_indexes_configuration(
                'random', indexes_to_add)

            # pairwise significance between all methods measured so far.
            p_values = [
                stats.ttest_ind(first, second)[1]
                for first, second in itertools.combinations(
                    methods.values(), 2)
            ]
            print(p_values)
            # NOTE(review): the previous condition
            # `all(p < 0.01 ...) and i >= 5 or i >= 5` reduces to `i >= 5`, so
            # the p-values are reported but never decide when to stop. Confirm
            # whether a larger round limit was intended for the second operand.
            if i >= 5:
                break
            print('try #' + str(i))
            for method, times in methods.items():
                print('{}: {}'.format(method, np.mean(times)))
        print('')

        for method, times in methods.items():
            print('{}: {}'.format(method, np.mean(times)))

예제 #27

0

파일 보기

import psycopg2 as db
import csv
import os
import sys
sys.path.append(os.path.abspath("../lib/"))
from query_helper import comm_helper
from PostgresConnector import PostgresConnector

# ports = list(range(5435, 5440))
# Port handed to both helpers created below.
port = 5436
windows = [5, 10]

# presumably the positional arguments are (user, password, host, port) -
# TODO confirm against query_helper.comm_helper.
t = comm_helper("postgres", "", "127.0.0.1", str(port))
# Module-level connector; it is opened per call via `with pc as ...`.
pc = PostgresConnector(port=port)


def query_and_write(filename, query, header):
    """Execute `query` and write its result to the CSV file `filename`.

    Any existing file at `filename` is removed first, the query is executed on
    the module-level connector `pc`, and `header` is written as the first CSV
    row before rows are fetched in batches.

    NOTE(review): the body of the fetch loop is cut off in this excerpt, so how
    the batches are written and how the loop ends is not documented here.

    :param filename: Path of the CSV file to (re)create.
    :param query: SQL statement passed to the cursor's execute().
    :param header: Row written first via csv.writer.writerow().
    """
    with pc as opc:
        print("Start querying table {}".format(filename))
        # Start from a clean file.
        if os.path.isfile(filename):
            os.remove(filename)
        opc.cursor.execute(query)
        # This only happens for documents

        print("Start writing table {}.".format(filename))
        with open(filename, 'w') as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(header)

            while True:
                # Fetch at most 65536 rows per round trip.
                data = opc.cursor.fetchmany(65536)

예제 #28

0

파일 보기

    def __init__(self,
                 fields=OrderedDict({
                     "_id": "document_id",
                     "title": "title",
                     "feedName": "feedName",
                     "category": "category",
                     "feedURL": "feedURL",
                     "published": "published"
                 }),
                 num_distinct_documents=0,
                 document_table_name="documents",
                 database="postgres",
                 user="******",
                 password="******",
                 host="127.0.0.1",
                 port=5435,
                 log_file=os.path.join(os.path.dirname(__file__),
                                       "logs/DocumentGenerator.log"),
                 log_level=logging.INFO,
                 log_verbose=True):
        """
        Prepares the generator: logger, MongoDB and Postgres connectors, and the field mapping.
        When a document limit is given, the IDs of the first N MongoDB articles are fetched right away.
        :param fields: (OrderedDict) Maps each MongoDB field to retrieve (key) to the column name it gets in the
               SQL table (value). Ordered because SQL tables are.
        :param num_distinct_documents: (int) Number of distinct articles to use, mainly for debugging.
               0 means all documents will be used, in accordance with MongoDB standards.
        :param document_table_name: (str) Name of the Postgres table that should contain the documents
        :param database: (str) database name.
        :param user: (str) User name to get access to the Postgres database.
        :param password: (str) Corresponding user password.
        :param host: (IP) IP address (in string format) for the host of the postgres database.
        :param port: (integer) Port at which to access the database.
        :param log_file: (os.path) Path to the file containing the logs.
        :param log_level: (logging.LEVEL) Specifies the level to be logged.
        :param log_verbose: (boolean) Forwarded to set_up_logger together with log_file and log_level.
        """

        # Logger comes first so that every following step can report.
        self.logger = set_up_logger(__name__, log_file, log_level, log_verbose)
        self.logger.info(
            "Successfully registered logger to DocumentGenerator.")

        # MongoDB side.
        self.mc = MongoConnector()
        self.logger.info(
            "Successfully registered MongoConnector to DocumentGenerator.")

        self.num_distinct_documents = num_distinct_documents
        if self.num_distinct_documents == 0:
            # No limit: keep an empty list so later code can rely on the attribute.
            self.first_documents = []
        else:
            # A limit was given: remember the IDs of the first N articles for later matching.
            self.logger.info(
                "Non-zero limit detected. Fetching first N distinct document IDs now..."
            )
            with self.mc as open_mc:
                articles = open_mc.client[open_mc.news].articles
                limited_cursor = articles.find().limit(
                    self.num_distinct_documents)
                fetched = list(limited_cursor)
                self.first_documents = [doc["_id"] for doc in fetched]
                self.logger.info(
                    "Successfully registered relevant document IDs.")

        # Postgres side. The connection details are used only once and are not stored on the instance.
        self.pc = PostgresConnector(database, user, password, host, port)
        self.logger.info(
            "Successfully registered PostgresConnector to DocumentGenerator.")

        # Field mapping: an empty mapping is only logged, not rejected.
        self.fields = fields
        if not self.fields:
            self.logger.error("No fields for MongoDB table specified!")
        # Request every configured field; _id is returned by default, so it is
        # switched off explicitly unless it was asked for.
        self.values_to_retrieve = dict.fromkeys(self.fields.keys(), 1)
        self.values_to_retrieve.setdefault("_id", 0)
        # TODO
        self.sql_format = ", ".join(self.fields.values())
        self.document_table_name = document_table_name

        # Declared here so the attribute exists from construction on.
        self.data = []
        self.logger.info("Successfully set up DocumentGenerator.")

예제 #29

0

파일 보기

파일: front.py 프로젝트: Oleks-Y/backend-laba4

# Subject ids are consecutive and start at 1, so they are derived from the
# position of the name in this list.
subjects = list(enumerate(
    [
        'Accounting & Finance',
        'Art & Design',
        'Architecture',
        'Manufacturing Engineering',
        'Law',
        'Economics & Econometrics',
        'Medicine',
        'Business & Management Studies',
        'Engineering & Technology',
        'Computer Science',
    ],
    start=1,
))

# Data to be inserted into the DB: 20 three-value rows in which the first two
# values both count from 1 to 20 and the third cycles through 1..10.
subjects_to_teachers = [(n, n, (n - 1) % 10 + 1) for n in range(1, 21)]

# Connect to the databases: one module-level connector per backend.
sqlite = SqliteConnector()
Postgres = PostgresConnector()
MySql = MySqlConnector()


# Function for creating DB1
def createDB():
    """Recreate the MySQL database and bulk-insert the module-level seed rows.

    Calls MySql.dropAllTables() and MySql.createDatabase(), then runs one
    executemany() per table in the order listed below.
    """
    MySql.dropAllTables()
    MySql.createDatabase()
    seed_inserts = (
        ("INSERT INTO faculties VALUES (%s,%s)", faculties),
        ("INSERT INTO department VALUES (%s,%s,%s)", departments),
        ("INSERT INTO teachers VALUES (%s,%s,%s,%s,%s)", teachers),
        ("INSERT INTO subject VALUES (%s,%s)", subjects),
        ("INSERT INTO subjects_to_teachers VALUES (%s,%s,%s)",
         subjects_to_teachers),
    )
    for statement, rows in seed_inserts:
        MySql.executemany(statement, rows)

예제 #30

0

파일 보기

파일: KMedoid.py 프로젝트: cchaplin/TweetLyze

	def get_matrix(self,locationid):

		try:
			conn = PostgresConnector().get_connection()
			cursor = conn.cursor()
			query = """
			WITH 

			TREND_COUNT_TT AS
			(SELECT TREND,COUNT(*) AS TREND_COUNT 
				FROM TRENDS 
				WHERE LOCATIONID = %s GROUP BY TREND),


			TOP_TRENDS_TT AS
			(SELECT TREND FROM TREND_COUNT_TT ORDER BY TREND_COUNT DESC LIMIT 15),

			IDS_FOR_TOP_TRENDS_TT AS 
			(SELECT ID FROM ORGANIZED_TWEETS 
			WHERE TREND IN (SELECT TREND FROM TOP_TRENDS_TT) AND LOCATION_ID = '2295420'),
			--SELECT * FROM IDS_FOR_TOP_TRENDS_TT

			ID_ENTITY_TOP_TRENDS_TT AS
			(SELECT TREND,ID,ENTITY 
			FROM ID_ENTITY
			WHERE ID IN (SELECT ID FROM IDS_FOR_TOP_TRENDS_TT)),


			TREND_ENTITY_TF_IDF_SUM_TT AS
			(SELECT TREND,ENTITY,COUNT(ID) TF_IDF_SUM 
			FROM ID_ENTITY
			WHERE ID IN (SELECT ID FROM IDS_FOR_TOP_TRENDS_TT)
			GROUP BY TREND,ENTITY),
			--SELECT * FROM TREND_ENTITY_TF_IDF_SUM_TT

			TREND_TF_IDF_SQ_SUM_TT AS
			(SELECT TREND, 
			SUM(TF_IDF_SUM*TF_IDF_SUM) AS TF_IDF_SQ_SUM
			FROM TREND_ENTITY_TF_IDF_SUM_TT
			GROUP BY TREND),

			COSINE_DIST_NUM_TT AS 
			(SELECT T1.TREND AS TREND1,T2.TREND AS TREND2,
			SUM(T1.TF_IDF_SUM*T2.TF_IDF_SUM) AS COSINE_NUM
			FROM TREND_ENTITY_TF_IDF_SUM_TT AS T1
			INNER JOIN TREND_ENTITY_TF_IDF_SUM_TT AS T2 ON T2.TREND>T1.TREND AND T1.ENTITY = T2.ENTITY
			GROUP BY T1.TREND,T2.TREND),

			COSINE_DIST_TT AS
			(SELECT TREND1,TREND2,
			COSINE_NUM/(SQRT(T2.TF_IDF_SQ_SUM)*SQRT(T3.TF_IDF_SQ_SUM)) AS COSIND_DIST
			FROM COSINE_DIST_NUM_TT AS T1
			INNER JOIN TREND_TF_IDF_SQ_SUM_TT AS T2 ON T1.TREND1=T2.TREND
			INNER JOIN TREND_TF_IDF_SQ_SUM_TT AS T3 ON T1.TREND2=T3.TREND)

			SELECT * FROM COSINE_DIST_TT ORDER BY TREND1,TREND2;
			"""

			cursor.execute(query,(locationid,))
			trend1_column = 0
			trend2_column = 1
			distance_value_column = 2
			trends_list = [] 
			row_counter = 0
			max_columns = 15
			column_iteration = 1
			distance_matrix = [[0 for x in xrange(max_columns)] for x in xrange(max_columns)]

			for row in cursor:
				trend1 = row[trend1_column]
				trend2 = row[trend2_column]
				if trend1 not in trends_list:
					trends_list.append(trend1)	
				if trend2 not in trends_list:
					trends_list.append(trend2)

				# this is to check 0,0 1,1 and so on
				distance_matrix[row_counter][row_counter] = 0
				# this populates 1,2 and 2,1 and so on 
				# this avoid 2 loops

				distance_matrix[row_counter][column_iteration] = row[distance_value_column]	
				distance_matrix[column_iteration][row_counter] = row[distance_value_column]	

				column_iteration = column_iteration + 1
				if column_iteration == max_columns:
					row_counter = row_counter + 1
					column_iteration = row_counter + 1 
						

			return distance_matrix,trends_list

		except Exception:
			print traceback.format_exc()