def query(self, criteria, limit=20, offset=0):
    """
    The most general way to query based on a set of criteria.
    """
    if self.connection is None:
        self.connection = get_db_connection(self.dbname)
    cursor = self.connection.cursor()
    if 'node' in criteria:
        # Node criteria must match in either direction, so the query needs
        # both a forward and a backward form of the criteria.
        cursor.execute(
            GIN_QUERY_2WAY,
            {
                'query_forward': jsonify(gin_jsonb_value(criteria, node_forward=True)),
                'query_backward': jsonify(gin_jsonb_value(criteria, node_forward=False)),
                'limit': limit,
                'offset': offset,
            },
        )
    else:
        cursor.execute(
            GIN_QUERY_1WAY,
            {
                'query': jsonify(gin_jsonb_value(criteria)),
                'limit': limit,
                'offset': offset,
            },
        )
    return [
        transform_for_linked_data(edge_data)
        for _uri, edge_data, _weight in cursor.fetchall()
    ]
def random_edges(self, limit=20):
    """
    Get a collection of distinct, randomly-selected edges.
    """
    if self.connection is None:
        self.connection = get_db_connection(self.dbname)
    cursor = self.connection.cursor()
    cursor.execute(RANDOM_QUERY, {'limit': limit})
    rows = cursor.fetchall()
    return [
        transform_for_linked_data(edge_data)
        for _uri, edge_data, _weight in rows
    ]
def lookup_assertion(self, uri):
    """
    Get a single assertion, given its URI starting with /a/.
    """
    # Sanitize URIs to remove control characters such as \x00. The postgres
    # driver would remove \x00 anyway, but this avoids reporting a server
    # error when that happens.
    uri = remove_control_chars(uri)
    if self.connection is None:
        self.connection = get_db_connection(self.dbname)
    cursor = self.connection.cursor()
    cursor.execute("SELECT data FROM edges WHERE uri=%(uri)s", {'uri': uri})
    rows = cursor.fetchall()
    return [transform_for_linked_data(edge_data) for (edge_data,) in rows]
def sample_dataset(self, uri, limit=50, offset=0):
    """
    Get a subsample of edges matching a particular dataset.
    """
    uri = remove_control_chars(uri)
    if self.connection is None:
        self.connection = get_db_connection(self.dbname)
    cursor = self.connection.cursor()
    # The dataset URI is matched against a JSONB column, so it has to be
    # passed as a JSON-encoded string.
    params = {
        'dataset': json.dumps(uri),
        'limit': limit,
        'offset': offset,
    }
    cursor.execute(DATASET_QUERY, params)
    return [
        transform_for_linked_data(edge_data)
        for _uri, edge_data in cursor.fetchall()
    ]
def lookup(self, uri, limit=100, offset=0):
    """
    A query that returns all the edges that include a certain URI.

    Dispatches on the URI prefix: a concept (/c/ or http), a relation (/r/),
    a source (/s/), a dataset (/d/), or a single assertion (/a/, which is
    delegated to lookup_assertion).

    Raises ValueError for a URI that matches none of these patterns.
    """
    if self.connection is None:
        self.connection = get_db_connection(self.dbname)
    if uri.startswith('/c/') or uri.startswith('http'):
        criteria = {'node': uri}
    elif uri.startswith('/r/'):
        criteria = {'rel': uri}
    elif uri.startswith('/s/'):
        criteria = {'source': uri}
    elif uri.startswith('/d/'):
        criteria = {'dataset': uri}
    elif uri.startswith('/a/'):
        return self.lookup_assertion(uri)
    else:
        # Bug fix: the original never applied the % formatting, so the
        # message literally contained "%r" instead of the offending URI.
        raise ValueError("%r isn't a ConceptNet URI that can be looked up" % uri)
    return self.query(criteria, limit, offset)
def lookup_grouped_by_feature(self, uri, limit=20):
    """
    The query used by the browseable interface, which groups its results by
    what 'feature' they describe of the queried node. A feature is defined
    by the relation, the queried node, and the direction (incoming or
    outgoing).
    """
    uri = remove_control_chars(uri)
    if self.connection is None:
        self.connection = get_db_connection(self.dbname)

    def other_node(data):
        # Hacky way to figure out what the 'other' node is, the one that
        # (in most cases) didn't match the URI. If both start with our
        # given URI, take the longer one, which is either a more specific
        # sense or a different, longer word.
        shorter, longer = sorted([data['start'], data['end']], key=len)
        return longer if shorter.startswith(uri) else shorter

    cursor = self.connection.cursor()
    cursor.execute(NODE_TO_FEATURE_QUERY, {'node': uri, 'limit': limit})
    grouped = {}
    # Rows come back as (direction, ..., data); the first two columns
    # identify the feature that the remaining data belongs to.
    for feature, group in itertools.groupby(
        cursor.fetchall(), lambda row: tuple(row[:2])
    ):
        edges = []
        for _direction, _, data in group:
            data['other'] = other_node(data)
            edges.append(transform_for_linked_data(data))
        grouped[feature] = edges
    return grouped
def query(self, criteria, limit=20, offset=0):
    """
    The most general way to query based on a set of criteria.
    """
    if self.connection is None:
        self.connection = get_db_connection(self.dbname)
    cursor = self.connection.cursor()
    if 'node' in criteria:
        # A node can appear at either end of an edge, so build the
        # criteria in both directions and use the two-way query.
        sql = GIN_QUERY_2WAY
        params = {
            'query_forward': jsonify(gin_jsonb_value(criteria, node_forward=True)),
            'query_backward': jsonify(gin_jsonb_value(criteria, node_forward=False)),
        }
    else:
        sql = GIN_QUERY_1WAY
        params = {'query': jsonify(gin_jsonb_value(criteria))}
    params['limit'] = limit
    params['offset'] = offset
    cursor.execute(sql, params)
    return [
        transform_for_linked_data(edge_data)
        for _uri, edge_data, _weight in cursor.fetchall()
    ]
def connection(self):
    """
    Return a usable database connection, creating a new one lazily if we
    have never connected or if the previous connection has been closed.

    See https://www.psycopg.org/docs/connection.html#connection.closed
    """
    # Bug fix: the original was declared as `def connection():` with no
    # `self` parameter, yet it reads self._connection and self.dbname, so
    # calling it raised NameError. `closed > 0` means the connection is
    # closed or broken in psycopg2, so reconnect in that case too.
    if self._connection is None or self._connection.closed > 0:
        self._connection = get_db_connection(self.dbname)
    return self._connection
def complex_concept_load(N=6):
    """
    Collect the nodes that participate in more than N edges (as start or
    end), look up their URIs, and gather every edge of weight >= 1 that
    touches them. Dumps the node-id rows, the URIs, and the edge list to
    pickle files in the working directory.

    N -- minimum edge count for a node to be included (exclusive).

    Returns the set of node-id rows that met the threshold.
    """
    connection = get_db_connection(None)
    cursor = connection.cursor()

    # Use parameterized queries, matching the %(name)s style used by the
    # other queries in this module, instead of building SQL by string
    # concatenation.
    cursor.execute(
        "SELECT edges.start_id FROM edges"
        " GROUP BY edges.start_id HAVING count(*) > %(n)s",
        {'n': N},
    )
    left_ids = set(cursor.fetchall())
    cursor.execute(
        "SELECT edges.end_id FROM edges"
        " GROUP BY edges.end_id HAVING count(*) > %(n)s",
        {'n': N},
    )
    right_ids = set(cursor.fetchall())

    # Rows are one-element tuples of node ids; keep them as-is so the
    # pickled output stays compatible with the original format.
    resset = left_ids | right_ids
    print(len(resset))

    concept_uris = []
    print("Finding concept names")
    for node_row in resset:
        cursor.execute(
            "SELECT uri FROM nodes WHERE nodes.id = %(id)s",
            {'id': node_row[0]},
        )
        concept_uris.append(cursor.fetchall()[0][0])

    print("Finding edges")
    the_ultimate_edge_list = []
    for idx, node_row in enumerate(resset):
        if idx % 10000 == 0:
            print(idx)
        cursor.execute(
            """
            SELECT relations.uri, s.uri, v.uri, t.weight
            FROM (
                (SELECT DISTINCT edges.id FROM edges
                 WHERE edges.weight >= 1
                   AND (edges.start_id = %(id)s OR edges.end_id = %(id)s)) uids
                INNER JOIN edges ON edges.id = uids.id
            ) t
            INNER JOIN relations ON relations.id = t.relation_id
            INNER JOIN nodes AS s ON s.id = t.start_id
            INNER JOIN nodes AS v ON v.id = t.end_id
            """,
            {'id': node_row[0]},
        )
        the_ultimate_edge_list += cursor.fetchall()

    print("Dumping things.")
    # Context managers ensure the pickle files are flushed and closed even
    # if a dump fails. NOTE(review): the doubled ".pickle.pickle" filename
    # is kept as-is because downstream consumers may depend on it.
    with open("resset.pickle", "wb") as out:
        pickle.dump(resset, out)
    with open("concept_uris.pickle.pickle", "wb") as out:
        pickle.dump(concept_uris, out)
    with open("ultimate_edge_list.pickle", "wb") as out:
        pickle.dump(the_ultimate_edge_list, out)
    print("Done")
    return resset