def get(self, request, corpus_id):
    """
    Return the documents of a corpus that contain at least one of the
    requested ngrams, ranked by the sum of their TFIDF scores.

    GET parameters:
        score     (str)  : score type (validated but not otherwise used here)
        ngram_ids (list) : comma-separated ngram ids

    Returns JSON: {'count': <total matching docs>, 'records': [<doc dicts>]}

    Raises ValidationException on malformed ngram ids or when the corpus
    has no TFIDF-CORPUS node.
    """
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    parameters = get_parameters(request)
    parameters = validate(parameters, {'score': str, 'ngram_ids': list})

    try:
        ngram_ids = [int(n) for n in parameters['ngram_ids'].split(',')]
    except ValueError:
        # narrow except: only a malformed id list is a client error
        raise ValidationException(
            '"ngram_ids" needs integers separated by comma.')

    limit = DEFAULT_N_DOCS_HAVING_NGRAM
    nodes_list = []

    corpus = session.query(Node).filter(Node.id == corpus_id).first()

    tfidf_id = (session.query(Node.id)
                       .filter(Node.typename == "TFIDF-CORPUS",
                               Node.parent_id == corpus.id)
                       .first())
    if tfidf_id is None:
        # previously crashed with TypeError on tfidf_id[0]; give the
        # client an explicit error instead
        raise ValidationException(
            'No TFIDF-CORPUS node found for corpus %s' % corpus_id)
    tfidf_id = tfidf_id[0]

    # request data: documents of this corpus containing any requested
    # ngram, scored by the sum of the TFIDF weights of those ngrams
    nodes_query = (session.query(Node, func.sum(NodeNodeNgram.score))
                          .join(NodeNodeNgram,
                                NodeNodeNgram.node2_id == Node.id)
                          .filter(NodeNodeNgram.node1_id == tfidf_id)
                          .filter(Node.typename == 'DOCUMENT',
                                  Node.parent_id == corpus.id)
                          .filter(or_(*[NodeNodeNgram.ngram_id == ngram_id
                                        for ngram_id in ngram_ids]))
                          .group_by(Node))

    # get the total count before applying limit
    nodes_count = nodes_query.count()

    # now the query with the limit
    nodes_results_query = (nodes_query
                           .order_by(func.sum(NodeNodeNgram.score).desc())
                           .limit(limit))

    for node, score in nodes_results_query:
        node_dict = {
            'id': node.id,
            'score': score,
        }
        # copy over the displayable hyperdata fields when present
        for key in ('title', 'publication_date', 'source', 'authors',
                    'fields'):
            if key in node.hyperdata:
                node_dict[key] = node.hyperdata[key]
        nodes_list.append(node_dict)

    return JsonHttpResponse({'count': nodes_count, 'records': nodes_list})
def put(self, request, corpus_id, check_each_doc=True):
    """
    Add documents to the favorites node of a corpus.

    GET parameters:
        docs (list): comma-separated ids of the documents to add

    check_each_doc: when True, verify each id really is a DOCUMENT node
    of this corpus before inserting (slower, but safer).

    Returns JSON with either a 'warning' (no favorites node) or
    'count_added' (currently counts the input, not the actual inserts —
    see todo below).
    """
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)
    # user is ok
    fav_node = self._get_fav_node(corpus_id)
    response = {}

    if fav_node is None:
        response = {
            'warning':
            'No favorites node is defined for this corpus (\'%s\')' %
            self.corpus.name,
            'count_added': 0
        }
    else:
        req_params = validate(get_parameters(request), {
            'docs': list,
            'default': ""
        })
        try:
            nodeids_to_add = [
                int(did) for did in req_params['docs'].split(',')
            ]
        except ValueError:
            # consistent with the other endpoints: malformed ids are a
            # client error, not an unhandled 500
            raise ValidationException(
                '"docs" needs integers separated by comma.')

        if check_each_doc:
            # check that these really are documents of the right corpus
            # (a bit slow => disable by default?)
            known_docs_q = (session.query(Node.id)
                                   .filter(Node.parent_id == corpus_id)
                                   .filter(Node.typename == 'DOCUMENT'))
            # a set gives the same O(1) membership as the former
            # {id: True} dict, without the dummy values
            lookup = {known_doc.id for known_doc in known_docs_q.all()}
            rejected_list = [
                doc_node_id for doc_node_id in nodeids_to_add
                if doc_node_id not in lookup
            ]
            if rejected_list:
                raise ValidationException(
                    "Error on some requested docs: %s (Only nodes of type 'doc' AND belonging to corpus %i can be added to favorites.)"
                    % (str(rejected_list), int(corpus_id)))

        # add them
        bulk_insert(NodeNode, ('node1_id', 'node2_id', 'score'),
                    ((fav_node.id, doc_node_id, 1.0)
                     for doc_node_id in nodeids_to_add))

        # todo count really added (here: counts input param not result)
        response = {'count_added': len(nodeids_to_add)}
    return JsonHttpResponse(response)
def get(self, request, corpus_id):
    """
    2 possibilities with/without param

    1) GET http://localhost:8000/api/nodes/2/favorites
       (returns the full list of fav docs within corpus 2)

    2) GET http://localhost:8000/api/nodes/2/favorites?docs=53,54
       (will test if docs 53 and 54 are among the favorites of corpus 2)
       (returns the intersection of fav docs with [53,54])
    """
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    fav_node = self._get_fav_node(corpus_id)

    req_params = validate(get_parameters(request), {
        'docs': list,
        'default': ""
    })

    response = {}

    if fav_node is None:
        response = {
            'warning':
            'No favorites node is defined for this corpus (\'%s\')' %
            self.corpus.name,
            'favdocs': []
        }
    elif 'docs' not in req_params:
        # each docnode associated to the favnode of this corpusnode
        q = (session.query(NodeNode.node2_id)
                    .filter(NodeNode.node1_id == fav_node.id))
        all_doc_ids = [row.node2_id for row in q.all()]
        response = {'favdocs': all_doc_ids}
    else:
        try:
            nodeids_to_check = [
                int(did) for did in req_params['docs'].split(',')
            ]
        except ValueError:
            # malformed ids are a client error, not an unhandled 500
            raise ValidationException(
                '"docs" needs integers separated by comma.')

        # each docnode from the input list, if it is associated to the favnode
        q = (session.query(NodeNode.node2_id)
                    .filter(NodeNode.node1_id == fav_node.id)
                    .filter(NodeNode.node2_id.in_(nodeids_to_check)))
        present_doc_ids = [row.node2_id for row in q.all()]

        # set lookup: avoids O(n*m) list membership on large inputs
        present_set = set(present_doc_ids)
        absent_doc_ids = [
            did for did in nodeids_to_check if did not in present_set
        ]
        response = {'favdocs': present_doc_ids, 'missing': absent_doc_ids}
    return JsonHttpResponse(response)
def get(self, request, node_id):
    """
    Facet aggregation: count the documents of a corpus grouped by one
    hyperdata subfield (source, publication_year, authors, ...).

    Returns JSON: {'doc_count': <total>, 'by': {<subfield>: <counts>}}
    """
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    # check that the node is a corpus
    # ? faster from cache than: corpus = session.query(Node)...
    corpus = cache.Node[node_id]
    if corpus.typename != 'CORPUS':
        raise ValidationException(
            "Only nodes of type CORPUS can accept facet queries" +
            " (but this node has type %s)..." % corpus.typename)
    self.corpus = corpus

    # the hyperfield parameter must be one of these subfields
    allowed_subfields = [
        'source', 'publication_year', 'rubrique', 'language_iso2',
        'language_iso3', 'language_name', 'authors'
    ]

    # validate() triggers an info message if subfield not in range
    params = validate(
        get_parameters(request), {
            'type': dict,
            'items': {
                'hyperfield': {
                    'type': str,
                    'range': allowed_subfields
                }
            }
        })
    facet_field = params['hyperfield']

    # do the aggregated sum
    (xcounts, total) = self._ndocs_by_facet(facet_field)

    # response
    return JsonHttpResponse({
        'doc_count': total,
        'by': {facet_field: xcounts}
    })
def delete(self, request):
    """Removes the list of nodes corresponding to the query.

    GET parameters:
        ids (list): comma-separated node ids to delete

    Returns JSON: {'deleted': <rowcount>}

    TODO : Should be a delete method!
    """
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    parameters = get_parameters(request)
    parameters = validate(parameters, {'ids': list})
    try:
        node_ids = [int(n) for n in parameters['ids'].split(',')]
    except ValueError:
        # narrow except: the former bare `except:` also swallowed
        # unrelated errors (KeyboardInterrupt included)
        raise ValidationException('"ids" needs integers separated by comma.')

    result = session.execute(delete(Node).where(Node.id.in_(node_ids)))
    session.commit()
    return JsonHttpResponse({'deleted': result.rowcount})
def delete(self, request, corpus_id):
    """
    DELETE http://localhost:8000/api/nodes/2/favorites?docs=53,54
    (will delete docs 53 and 54 from the favorites of corpus 2)

    Removes the NodeNode links only: the documents themselves stay in DB.
    Returns JSON with either a 'warning' (no favorites node) or
    'count_removed' (number of links actually deleted).
    """
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)
    # user is ok
    fav_node = self._get_fav_node(corpus_id)
    response = {}

    if fav_node is None:
        response = {
            'warning':
            'No favorites node is defined for this corpus (\'%s\')' %
            self.corpus.name,
            'count_removed': 0
        }
    else:
        req_params = validate(get_parameters(request), {
            'docs': list,
            'default': ""
        })
        try:
            nodeids_to_delete = [
                int(did) for did in req_params['docs'].split(',')
            ]
        except ValueError:
            # malformed ids are a client error, not an unhandled 500
            raise ValidationException(
                '"docs" needs integers separated by comma.')

        try:
            # it deletes from favourites but not from DB
            result = session.execute(
                delete(NodeNode)
                .where(NodeNode.node1_id == fav_node.id)
                .where(NodeNode.node2_id.in_(nodeids_to_delete)))
            session.commit()
            response = {'count_removed': result.rowcount}
        finally:
            session.close()
    return JsonHttpResponse(response)
def _query_nodes(request, node_id=None):
    """
    Shared node-listing helper for the REST views.

    Builds a query over the authenticated user's nodes, optionally
    restricted to one node id, node types and/or a parent id, ordered by
    publication_date then id, then paginated.

    Returns (parameters, query, count):
        parameters : the validated request parameters
        query      : the paginated result slice
        count      : total number of matches before pagination

    Raises TypeError when the request is not authenticated, and
    ValidationException when hyperdata_filter is used without
    fields[]=hyperdata.
    """
    if request.user.id is None:
        raise TypeError(
            "This API request must come from an authenticated user.")

    # we query among the nodes that belong to this user
    user = cache.User[request.user.id]

    # parameters validation
    # fixme: this validation does not allow custom keys in url (eg '?name=' for rename action)
    parameters = get_parameters(request)
    parameters = validate(
        parameters, {
            'type': dict,
            'items': {
                'formated': {
                    'type': str,
                    'required': False,
                    'default': 'json'
                },
                'pagination_limit': {
                    'type': int,
                    'default': 10
                },
                'pagination_offset': {
                    'type': int,
                    'default': 0
                },
                'fields': {
                    'type': list,
                    'default': _node_default_fields,
                    'items': {
                        'type': str,
                        'range': _node_available_fields,
                    }
                },
                # choice of hyperdata fields
                'hyperdata_filter': {
                    'type': list,
                    'required': False,
                    'items': {
                        'type': str,
                        'range': _hyperdata_available_fields,
                    }
                },
                # optional filtering parameters
                'types': {
                    'type': list,
                    'required': False,
                    'items': {
                        'type': str,
                        'range': _node_available_types,
                    }
                },
                'parent_id': {
                    'type': int,
                    'required': False
                },
            }
        })

    # additional validation for hyperdata_filter
    if (('hyperdata_filter' in parameters)
            and (not ('hyperdata' in parameters['fields']))):
        raise ValidationException(
            "Using the hyperdata_filter filter requires fields[]=hyperdata")

    # start the query
    query = user.nodes()

    # filter by id
    if node_id is not None:
        query = query.filter(Node.id == node_id)
    # filter by type
    if 'types' in parameters:
        query = query.filter(Node.typename.in_(parameters['types']))
    # filter by parent
    if 'parent_id' in parameters:
        query = query.filter(Node.parent_id == parameters['parent_id'])

    # count (before pagination)
    count = query.count()

    # order
    query = query.order_by(Node.hyperdata['publication_date'], Node.id)

    # paginate the query: offset is the start index, limit the page size
    offset = parameters['pagination_offset']
    if parameters['pagination_limit'] == -1:
        query = query[offset:]
    else:
        # bugfix: the slice end must be offset + limit; the previous code
        # sliced query[offset:limit], treating the page size as an absolute
        # end index, so any offset >= limit returned an empty page
        query = query[offset:offset + parameters['pagination_limit']]

    # return the result!
    # (the receiver function does the filtering of fields and hyperdata_filter)
    return parameters, query, count
def post(self, request, project_id):
    """
    Time-series endpoint: counts documents or ngram occurrences per time
    bucket (day/month/year/decade...), optionally filtered by corpora,
    ngrams, dates and hyperdata, optionally normalized, and returned as
    JSON or CSV.

    The request body drives everything; when it is empty/falsy the
    hard-coded example below is used instead.
    """
    # example only
    # NOTE: `input` shadows the builtin; kept as-is (doc-only edit)
    input = request.data or {
        'x': {
            'with_empty': True,
            'resolution': 'decade',
            'value': 'publication_date',
        },
        'y': {
            # 'divided_by': 'total_ngrams_count',
            # 'divided_by': 'total_documents_count',
        },
        'filter': {
            # 'ngrams': ['bees', 'bee', 'honeybee', 'honeybees', 'honey bee', 'honey bees'],
            # 'ngrams': ['insecticide', 'pesticide'],
            # 'corpora': [52633],
            # 'date': {'min': '1995-12-31'}
        },
        # 'format': 'csv',
    }
    print(input)
    # input validation
    input = validate(
        input, {
            'type': dict,
            'default': {},
            'items': {
                'x': {
                    'type': dict,
                    'default': {},
                    'items': {
                        # which hyperdata to choose for the date
                        'value': {
                            'type': str,
                            'default': 'publication_date',
                            'range': {
                                'publication_date',
                            }
                        },
                        # time resolution
                        'resolution': {
                            'type': str,
                            'range': self._resolutions.keys(),
                            'default': 'month'
                        },
                        # should we add zeroes for empty values?
                        'with_empty': {
                            'type': bool,
                            'default': False
                        },
                    }
                },
                'y': {
                    'type': dict,
                    'default': {},
                    'items': {
                        # mesured value
                        'value': {
                            'type': str,
                            'default': 'ngrams_count',
                            'range': {
                                'ngrams_count', 'documents_count',
                                'ngrams_tfidf'
                            }
                        },
                        # value by which we should normalize
                        'divided_by': {
                            'type': str,
                            'range': {
                                'total_documents_count', 'documents_count',
                                'total_ngrams_count'
                            }
                        },
                    }
                },
                # filtering
                'filter': {
                    'type': dict,
                    'default': {},
                    'items': {
                        # filter by metadata
                        # NOTE(review): 'key' is validated against the
                        # operator names and 'operator' is unconstrained —
                        # these two 'range's look swapped; confirm against
                        # _operators / INDEXED_HYPERDATA usage below
                        'hyperdata': {
                            'type': list,
                            'default': [],
                            'items': {
                                'type': dict,
                                'items': {
                                    'key': {
                                        'type': str,
                                        'range': self._operators.keys()
                                    },
                                    'operator': {
                                        'type': str
                                    },
                                    'value': {
                                        'type': str
                                    },
                                }
                            }
                        },
                        # filter by date
                        'date': {
                            'type': dict,
                            'items': {
                                'min': {
                                    'type': datetime.datetime
                                },
                                'max': {
                                    'type': datetime.datetime
                                },
                            },
                            'default': {}
                        },
                        # filter by corpora
                        'corpora': {
                            'type': list,
                            'default': [],
                            'items': {
                                'type': int
                            }
                        },
                        # filter by ngrams
                        'ngrams': {
                            'type': list,
                            'default': [],
                            'items': {
                                'type': str
                            }
                        },
                    }
                },
                # output format
                'format': {
                    'type': str,
                    'default': 'json',
                    'range': {'json', 'csv'}
                },
            }
        })
    # build query: prepare columns
    # X aliases NodeHyperdata for the date axis; column_x truncates the
    # UTC value to the requested resolution
    X = aliased(NodeHyperdata)
    column_x = func.date_trunc(input['x']['resolution'], X.value_utc)
    # y column depends on the requested measure
    column_y = {
        'documents_count': func.count(Node.id.distinct()),
        'ngrams_count': func.sum(NodeNgram.weight),
        # 'ngrams_tfidf': func.sum(NodeNodeNgram.weight),
    }[input['y']['value']]
    # build query: base
    print(input)
    query_base = (
        session.query(column_x).select_from(Node).join(
            NodeNgram, NodeNgram.node_id == Node.id).join(
                X, X.node_id == NodeNgram.node_id)
        #.filter(X.key == input['x']['value'])
        .group_by(column_x).order_by(column_x))
    # build query: base, filter by corpora or project
    if 'corpora' in input['filter'] and input['filter']['corpora']:
        query_base = (query_base.filter(
            Node.parent_id.in_(input['filter']['corpora'])))
    else:
        # no corpora given: take every corpus under this project
        ParentNode = aliased(Node)
        query_base = (query_base.join(
            ParentNode, ParentNode.id == Node.parent_id).filter(
                ParentNode.parent_id == project_id))
    # build query: base, filter by date
    if 'date' in input['filter']:
        if 'min' in input['filter']['date']:
            query_base = query_base.filter(
                X.value >= input['filter']['date']['min'])
        if 'max' in input['filter']['date']:
            query_base = query_base.filter(
                X.value <= input['filter']['date']['max'])
    # build query: filter by ngrams
    query_result = query_base.add_columns(column_y)
    if 'ngrams' in input['filter'] and input['filter']['ngrams']:
        query_result = (query_result.join(
            Ngram, Ngram.id == NodeNgram.ngram_id).filter(
                Ngram.terms.in_(input['filter']['ngrams'])))
    # build query: filter by metadata
    if 'hyperdata' in input['filter']:
        for h, hyperdata in enumerate(input['filter']['hyperdata']):
            print(h, hyperdata)
            # get hyperdata in database
            #if hyperdata_model is None:
            #    continue
            #hyperdata_id, hyperdata_type = hyperdata_model
            # create alias and query it
            # operator is a callable comparator; value is converted to the
            # declared type of this hyperdata key before comparison
            operator = self._operators[hyperdata['operator']]
            type_string = type2string(
                INDEXED_HYPERDATA[hyperdata['key']]['type'])
            value = self._converters[type_string](hyperdata['value'])
            query_result = (query_result.join(
                NodeHyperdata,
                NodeHyperdata.node_id == NodeNgram.node_id).filter(
                    NodeHyperdata.key == hyperdata['key']).filter(
                        operator(NodeHyperdata.value, value)))
    # build result: prepare data
    date_value_list = query_result.all()
    #print(date_value_list)
    # NOTE(review): everywhere below the LAST row of date_value_list is
    # excluded ([-2] here, [0:-1] in both loops) — presumably to drop an
    # incomplete final time bucket; confirm intent. Also note [-2] raises
    # IndexError when the result has exactly one row.
    if date_value_list:
        date_min = date_value_list[0][0].replace(tzinfo=None)
        date_max = date_value_list[-2][0].replace(tzinfo=None)
    # build result: prepare interval (zero-filled buckets when requested)
    result = collections.OrderedDict()
    if input['x']['with_empty'] and date_value_list:
        compute_next_date = self._resolutions[input['x']['resolution']]
        date = date_min
        while date <= date_max:
            result[date] = 0.0
            date = compute_next_date(date)
    # build result: integrate (overwrite zeroes with the measured values)
    for date, value in date_value_list[0:-1]:
        result[date.replace(tzinfo=None)] = value
    # build result: normalize
    query_normalize = None
    if date_value_list and 'divided_by' in input['y'] and input['y'][
            'divided_by']:
        # NOTE(review): add_column (singular) is deprecated in newer
        # SQLAlchemy; add_columns is used above — confirm library version
        if input['y']['divided_by'] == 'total_documents_count':
            query_normalize = query_base.add_column(
                func.count(Node.id.distinct()))
        elif input['y']['divided_by'] == 'total_ngrams_count':
            query_normalize = query_base.add_column(
                func.sum(NodeNgram.weight))
    if query_normalize is not None:
        for date, value in query_normalize[0:-1]:
            date = date.replace(tzinfo=None)
            if date in result:
                result[date] /= value
    # return result with proper formatting (201 in both cases)
    if input['format'] == 'json':
        return JsonHttpResponse(
            {
                'query': input,
                'result': sorted(result.items()),
            }, 201)
    elif input['format'] == 'csv':
        return CsvHttpResponse(sorted(result.items()), ('date', 'value'),
                               201)