def initial(self, request):
    """
    Before dispatching to put(), delete()...

    1) Checks current user authentication to prevent remote DB manipulation
    2) Prepares self.params plus self.base_list / self.change_list from params

    Raises:
        Http404             -- if the user is not authenticated
        ValidationException -- if no usable ngram ids are found in either the
                               url params or the request payload
    """
    if not request.user.is_authenticated():
        raise Http404()
        # can't use return in initial() (although 401 maybe better than 404)
        # can't use @requires_auth because of positional 'self' within class

    # get validated params
    self.params = get_parameters(request)

    (self.base_list, self.change_list) = ListChange._validate(self.params)

    if not len(self.change_list.items):
        # change_list can be in payload too
        payload_ngrams = request.data['ngrams']
        try:
            change_ngram_ids = [int(n) for n in payload_ngrams.split(',')]
        except ValueError:
            # fix: a non-integer (or empty) payload previously raised an
            # uncaught ValueError (500) instead of the ValidationException
            change_ngram_ids = []
        if not len(change_ngram_ids):
            raise ValidationException(
                'The "ngrams" parameter requires one or more ngram_ids separated by comma'
                )
        else:
            self.change_list = UnweightedList(change_ngram_ids)
def get(self, request):
    """
    Mini glance over a maplist: term infos + occurrence scores for each
    map ngram of a corpus.

    GET params (two alternatives):
        corpus=<id>                    -- the MAPLIST and OCCURRENCES nodes
                                          are then looked up as its children
                                          (an explicit scoring=<id> is optional)
        maplist=<id>&scoring=<id>      -- explicit pointers to both nodes

    Returns JSON: {ngraminfos, listmembers, links, nodeids}.
    Raises ValidationException when neither form of params is given.
    """
    parameters = get_parameters(request)
    maplist_id = None
    scores_id = None

    if "corpus" in parameters:
        corpus_id = parameters['corpus']
        corpus = cache.Node[corpus_id]
        maplist_id = corpus.children('MAPLIST').first().id
        # with a corpus_id, the explicit scoring pointer is optional
        if "scoring" in parameters:
            scores_id = parameters['scoring']
        else:
            scores_id = corpus.children('OCCURRENCES').first().id
    elif "maplist" in parameters and "scoring" in parameters:
        # fix: previously read parameters['mainlist'] (a key never checked
        # for) right after testing for 'maplist' => guaranteed KeyError
        maplist_id = int(parameters['maplist'])
        scores_id = int(parameters['scoring'])
    else:
        raise ValidationException("A 'corpus' id or 'maplist' id is required, and a 'scoring' for occurences counts")

    ngraminfo = {}                  # ngram details sorted per ngram id
    listmembers = {'maplist': []}   # ngram ids sorted per list name

    # infos for all ngrams from maplist
    map_ngrams = query_list(maplist_id, details=True,
                            scoring_metric_id=scores_id).all()
    # ex: [(8805, 'mean age', 4.0),
    #      (1632, 'activity', 4.0),
    #      (8423, 'present', 2.0),
    #      (2928, 'objective', 2.0)]

    # shortcut to useful function during loop
    add_to_members = listmembers['maplist'].append

    for ng in map_ngrams:
        ng_id = ng[0]
        ngraminfo[ng_id] = ng[1:]

        # maplist ngrams will already be <=> ngraminfos
        # but the client side expects a membership lookup
        # as when there are multiple lists or some groupings
        add_to_members(ng_id)

    return JsonHttpResponse({
        'ngraminfos': ngraminfo,
        'listmembers': listmembers,
        'links': {},     # no grouping links sent during glance (for speed)
        'nodeids': {
            'mainlist': None,
            'maplist': maplist_id,
            'stoplist': None,
            'groups': None,
            'scores': None,
        }
    })
def delete(self, request):
    """
    Within a groupnode, deletes some group elements from some groups

    Data format just like in POST, everything in the url
    """
    url_params = get_parameters(request)

    # the unique 'node' param identifies the grouping node
    grouping_node_id = url_params.pop('node')

    # every remaining param encodes a (mainform, subform) link to remove
    couples_to_remove = self.links_to_couples(url_params)

    # selectively remove those group couples
    # using IN is correct in this case: list of ids is short and external
    # see stackoverflow.com/questions/444475/
    matching_rows = (
        session.query(NodeNgramNgram)
               .filter(NodeNgramNgram.node_id == grouping_node_id)
               .filter(tuple_(NodeNgramNgram.ngram1_id,
                              NodeNgramNgram.ngram2_id).in_(couples_to_remove))
    )
    removed_count = matching_rows.delete(synchronize_session=False)
    session.commit()

    return JsonHttpResponse({'count_removed': removed_count}, 200)
def patch(self, request):
    """
    A copy of POST (merging list) but with the source == just an internal corpus_id

    params in request.GET:
        onto_corpus: the corpus whose lists are getting patched
        from_corpus: the corpus from which we take the source lists to merge in
                     (fix: docstring previously named this param "from")
        todo:        an array of the list types ("map", "main", "stop") to merge in

    Returns JSON with the merge log (200) or the error message (400);
    401 when the user is anonymous or does not own the target corpus.
    """
    if not request.user.is_authenticated():
        res = HttpResponse("Unauthorized")
        res.status_code = 401
        return res

    params = get_parameters(request)

    # the corpus with the target lists to be patched
    corpus_id = int(params.pop("onto_corpus"))
    corpus_node = cache.Node[corpus_id]

    # only the owner of the target corpus may patch its lists
    if request.user.id != corpus_node.user_id:
        res = HttpResponse("Unauthorized")
        res.status_code = 401
        return res

    list_types = {'map': 'MAPLIST', 'main': 'MAINLIST', 'stop': 'STOPLIST'}

    # internal DB retrieve source_lists
    source_corpus_id = int(params.pop("from_corpus"))
    source_node = cache.Node[source_corpus_id]

    todo_lists = params.pop("todo").split(',')   # ex: ['map', 'stop']
    source_lists = {}
    for key in todo_lists:
        source_lists[key] = UnweightedList(
            source_node.children(list_types[key]).first().id
        )

    # add the groupings too
    source_lists['groupings'] = Translations(
        source_node.children("GROUPLIST").first().id
    )

    # attempt to merge and send response
    try:
        # merge the source_lists onto those of the target corpus
        log_msg = merge_ngramlists(source_lists, onto_corpus=corpus_node)
        return JsonHttpResponse({
            'log': log_msg,
        }, 200)
    except Exception as e:
        return JsonHttpResponse({
            'err': str(e),
        }, 400)
def get(self, request, corpus_id):
    """
    Lists documents of corpus `corpus_id` containing at least one of the
    requested ngrams, with their summed TFIDF score.

    GET params:
        score:     metric name (validated as str)
        ngram_ids: comma-separated ngram ids

    Returns JSON {'count': <total matching docs>,
                  'records': [{'id', 'score', + available hyperdata}, ...]}
    records are limited to DEFAULT_N_DOCS_HAVING_NGRAM, ordered by score desc.
    """
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    parameters = get_parameters(request)
    parameters = validate(parameters, {'score': str, 'ngram_ids': list})

    try:
        ngram_ids = [int(n) for n in parameters['ngram_ids'].split(',')]
    except (ValueError, AttributeError):
        # fix: was a bare `except:` which also masked unrelated errors
        raise ValidationException(
            '"ngram_ids" needs integers separated by comma.')

    limit = DEFAULT_N_DOCS_HAVING_NGRAM
    nodes_list = []

    corpus = session.query(Node).filter(Node.id == corpus_id).first()

    tfidf_row = (session.query(Node.id)
                        .filter(Node.typename == "TFIDF-CORPUS",
                                Node.parent_id == corpus.id)
                        .first())
    if tfidf_row is None:
        # fix: previously crashed with TypeError on `tfidf_id[0]`
        raise ValidationException(
            "No TFIDF-CORPUS node found for corpus %s" % str(corpus_id))
    tfidf_id = tfidf_row[0]

    # request data: DOCUMENT nodes of this corpus matching any of the
    # ngrams, with a per-document sum of tfidf scores
    nodes_query = (session.query(Node, func.sum(NodeNodeNgram.score))
                          .join(NodeNodeNgram,
                                NodeNodeNgram.node2_id == Node.id)
                          .filter(NodeNodeNgram.node1_id == tfidf_id)
                          .filter(Node.typename == 'DOCUMENT',
                                  Node.parent_id == corpus.id)
                          .filter(or_(*[NodeNodeNgram.ngram_id == ngram_id
                                        for ngram_id in ngram_ids]))
                          .group_by(Node))

    # get the total count before applying limit
    nodes_count = nodes_query.count()

    # now the query with the limit
    nodes_results_query = (nodes_query
                           .order_by(func.sum(NodeNodeNgram.score).desc())
                           .limit(limit))

    for node, score in nodes_results_query:
        node_dict = {
            'id': node.id,
            'score': score,
        }
        # copy over only the hyperdata fields that exist on this doc
        for key in ('title', 'publication_date', 'source', 'authors',
                    'fields'):
            if key in node.hyperdata:
                node_dict[key] = node.hyperdata[key]
        nodes_list.append(node_dict)

    return JsonHttpResponse({'count': nodes_count, 'records': nodes_list})
def put(self, request, corpus_id, check_each_doc=True):
    """
    Adds one or more documents to the favorites of a corpus.

    PUT params:
        docs: comma-separated document node ids to mark as favorites

    When check_each_doc is True, verifies that every requested id is a
    DOCUMENT node belonging to this corpus before inserting.

    Returns JSON {'count_added': n} (or a warning when the corpus has no
    favorites node). Raises ValidationException on rejected doc ids.
    """
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    # user is ok
    fav_node = self._get_fav_node(corpus_id)
    response = {}

    if fav_node is None:   # fix: was `== None`
        response = {
            'warning': 'No favorites node is defined for this corpus (\'%s\')'
                       % self.corpus.name,
            'count_added': 0
        }
    else:
        req_params = validate(get_parameters(request), {
            'docs': list,
            'default': ""
        })
        nodeids_to_add = [int(did)
                          for did in req_params['docs'].split(',')]

        if check_each_doc:
            # check that these really are documents of the right corpus
            # (a bit slow => maybe disable by default?)
            known_docs_q = (session.query(Node.id)
                                   .filter(Node.parent_id == corpus_id)
                                   .filter(Node.typename == 'DOCUMENT'))
            # fix: use a real set instead of a dict with True values
            lookup = {known_doc.id for known_doc in known_docs_q.all()}
            rejected_list = [doc_node_id for doc_node_id in nodeids_to_add
                             if doc_node_id not in lookup]
            if len(rejected_list):
                raise ValidationException(
                    "Error on some requested docs: %s (Only nodes of type 'doc' AND belonging to corpus %i can be added to favorites.)"
                    % (str(rejected_list), int(corpus_id)))

        # add them
        bulk_insert(NodeNode, ('node1_id', 'node2_id', 'score'),
                    ((fav_node.id, doc_node_id, 1.0)
                     for doc_node_id in nodeids_to_add))

        # todo count really added (here: counts input param not result)
        response = {'count_added': len(nodeids_to_add)}

    return JsonHttpResponse(response)
def get(self, request, corpus_id):
    """
    2 possibilities with/without param

    1) GET http://localhost:8000/api/nodes/2/favorites
        (returns the full list of fav docs within corpus 2)

    2) GET http://localhost:8000/api/nodes/2/favorites?docs=53,54
        (will test if docs 53 and 54 are among the favorites of corpus 2)
        (returns the intersection of fav docs with [53,54])

    Returns JSON {'favdocs': [...]} plus 'missing': [...] in case (2),
    or a warning when the corpus has no favorites node.
    """
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    fav_node = self._get_fav_node(corpus_id)

    req_params = validate(get_parameters(request), {
        'docs': list,
        'default': ""
    })

    response = {}

    if fav_node is None:   # fix: was `== None`
        response = {
            'warning': 'No favorites node is defined for this corpus (\'%s\')'
                       % self.corpus.name,
            'favdocs': []
        }
    elif 'docs' not in req_params:
        # each docnode associated to the favnode of this corpusnode
        q = (session.query(NodeNode.node2_id)
                    .filter(NodeNode.node1_id == fav_node.id))
        all_doc_ids = [row.node2_id for row in q.all()]
        response = {'favdocs': all_doc_ids}
    else:
        nodeids_to_check = [int(did)
                            for did in req_params['docs'].split(',')]

        # each docnode from the input list, if it is associated to the favnode
        q = (session.query(NodeNode.node2_id)
                    .filter(NodeNode.node1_id == fav_node.id)
                    .filter(NodeNode.node2_id.in_(nodeids_to_check)))
        present_doc_ids = [row.node2_id for row in q.all()]
        # fix: membership test against a set, not the list (was O(n^2))
        present_set = set(present_doc_ids)
        absent_doc_ids = [did for did in nodeids_to_check
                          if did not in present_set]
        response = {'favdocs': present_doc_ids, 'missing': absent_doc_ids}

    return JsonHttpResponse(response)
def get(self, request):
    """
    Exports the term lists of a corpus as a downloadable CSV attachment.
    """
    url_params = get_parameters(request)
    corpus_id = int(url_params.pop("corpus"))
    corpus_node = cache.Node[corpus_id]

    # the HttpResponse doubles as the file-like target for the exporter
    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = (
        'attachment; filename="corpus-%i_gargantext_term_list.csv"'
        % corpus_id
    )

    # stream the list data straight into the response body
    export_ngramlists(corpus_node, fname=response, titles=True)
    return response
def post(self, request):
    """
    Merge the lists of a corpus with other lists
    from a CSV source           (aka import)
    or from another corpus

    params in request.GET:
        onto_corpus: the corpus whose lists are getting patched

    params in request.data:
        csvfile: the csv file

    /!\ We assume we checked the file size client-side before upload
    """
    if not request.user.is_authenticated():
        res = HttpResponse("Unauthorized")
        res.status_code = 401
        return res

    # target corpus whose lists will be patched
    url_params = get_parameters(request)
    target_corpus_id = int(url_params.pop("onto_corpus"))
    target_corpus = cache.Node[target_corpus_id]

    # only the owner of the target corpus may merge into it
    if request.user.id != target_corpus.user_id:
        res = HttpResponse("Unauthorized")
        res.status_code = 401
        return res

    # request also contains the uploaded file
    # (type django.core.files.uploadedfile.InMemoryUploadedFile)
    uploaded = request.data['csvfile']
    csv_contents = uploaded.read().decode("UTF-8").split("\n")
    uploaded.close()
    del uploaded

    # schedule the csv import + merge asynchronously
    log_msg = "Async generation"
    corpus_node_id = target_corpus.id
    scheduled(import_and_merge_ngramlists)(
        csv_contents,
        corpus_node_id,
        overwrite=bool(url_params.get('overwrite'))
    )

    return JsonHttpResponse({
        'log': log_msg,
    }, 200)
def get(self, request, node_id):
    """
    Aggregated document counts for a corpus, faceted on one hyperdata
    subfield (e.g. source, publication_year, authors...).

    Returns JSON {'doc_count': total, 'by': {<subfield>: <counts>}}.
    Raises ValidationException when node_id is not a CORPUS node.
    """
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    # check that the node is a corpus
    # ? faster from cache than: corpus = session.query(Node)...
    corpus = cache.Node[node_id]
    if corpus.typename != 'CORPUS':
        raise ValidationException(
            "Only nodes of type CORPUS can accept facet queries"
            + " (but this node has type %s)..." % corpus.typename)
    self.corpus = corpus

    # subfields the facet query may aggregate on
    allowed_subfields = [
        'source', 'publication_year', 'rubrique',
        'language_iso2', 'language_iso3', 'language_name',
        'authors'
    ]

    # validate() triggers an info message if subfield not in range
    checked_params = validate(get_parameters(request), {
        'type': dict,
        'items': {
            'hyperfield': {'type': str, 'range': allowed_subfields}
        }
    })
    subfield = checked_params['hyperfield']

    # do the aggregated sum
    (xcounts, total) = self._ndocs_by_facet(subfield)

    # response
    return JsonHttpResponse({
        'doc_count': total,
        'by': {subfield: xcounts}
    })
def delete(self, request):
    """Removes the list of nodes corresponding to the query.

    GET params:
        ids: comma-separated node ids to delete

    Returns JSON {'deleted': <row count>}.
    TODO : Should be a delete method!
    """
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    parameters = get_parameters(request)
    parameters = validate(parameters, {'ids': list})

    try:
        node_ids = [int(n) for n in parameters['ids'].split(',')]
    except (ValueError, AttributeError):
        # fix: was a bare `except:` which also masked unrelated errors
        raise ValidationException(
            '"ids" needs integers separated by comma.')

    result = session.execute(delete(Node).where(Node.id.in_(node_ids)))
    session.commit()

    return JsonHttpResponse({'deleted': result.rowcount})
def delete(self, request, corpus_id):
    """
    DELETE http://localhost:8000/api/nodes/2/favorites?docs=53,54
    (will delete docs 53 and 54 from the favorites of corpus 2)

    Returns JSON {'count_removed': n} (or a warning when the corpus has
    no favorites node). Removes only the favorite links, not the docs.
    """
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    # user is ok
    fav_node = self._get_fav_node(corpus_id)
    response = {}

    if fav_node is None:   # fix: was `== None`
        response = {
            'warning': 'No favorites node is defined for this corpus (\'%s\')'
                       % self.corpus.name,
            'count_removed': 0
        }
    else:
        req_params = validate(get_parameters(request), {
            'docs': list,
            'default': ""
        })
        try:
            nodeids_to_delete = [int(did)
                                 for did in req_params['docs'].split(',')]
        except (ValueError, AttributeError):
            # fix: non-integer ids used to bubble up as a 500
            raise ValidationException(
                '"docs" needs integers separated by comma.')

        try:
            # it deletes from favourites but not from DB
            result = session.execute(
                delete(NodeNode)
                .where(NodeNode.node1_id == fav_node.id)
                .where(NodeNode.node2_id.in_(nodeids_to_delete)))
            session.commit()
            response = {'count_removed': result.rowcount}
        finally:
            session.close()

    return JsonHttpResponse(response)
def _query_nodes(request, node_id=None):
    """
    Builds the filtered, ordered, paginated node query for the current user.

    Args:
        request: the API request (must be authenticated)
        node_id: optional single node id to restrict the query to

    Returns:
        (parameters, query, count) -- count is the total BEFORE pagination;
        the receiver function does the filtering of fields/hyperdata_filter.

    Raises:
        TypeError           -- if the request is anonymous
        ValidationException -- if hyperdata_filter is used without
                               fields[]=hyperdata
    """
    if request.user.id is None:
        raise TypeError(
            "This API request must come from an authenticated user.")
    else:
        # we query among the nodes that belong to this user
        user = cache.User[request.user.id]

    # parameters validation
    # fixme: this validation does not allow custom keys in url (eg '?name=' for rename action)
    parameters = get_parameters(request)
    parameters = validate(parameters, {
        'type': dict,
        'items': {
            'formated': {'type': str, 'required': False, 'default': 'json'},
            'pagination_limit': {'type': int, 'default': 10},
            'pagination_offset': {'type': int, 'default': 0},
            'fields': {
                'type': list,
                'default': _node_default_fields,
                'items': {
                    'type': str,
                    'range': _node_available_fields,
                }
            },
            # choice of hyperdata fields
            'hyperdata_filter': {
                'type': list,
                'required': False,
                'items': {
                    'type': str,
                    'range': _hyperdata_available_fields,
                }
            },
            # optional filtering parameters
            'types': {
                'type': list,
                'required': False,
                'items': {
                    'type': str,
                    'range': _node_available_types,
                }
            },
            'parent_id': {'type': int, 'required': False},
        }
    })

    # additional validation for hyperdata_filter
    if (('hyperdata_filter' in parameters)
            and (not ('hyperdata' in parameters['fields']))):
        raise ValidationException(
            "Using the hyperdata_filter filter requires fields[]=hyperdata")

    # start the query
    query = user.nodes()

    # filter by id
    if node_id is not None:
        query = query.filter(Node.id == node_id)

    # filter by type
    if 'types' in parameters:
        query = query.filter(Node.typename.in_(parameters['types']))

    # filter by parent
    if 'parent_id' in parameters:
        query = query.filter(Node.parent_id == parameters['parent_id'])

    # total count before pagination
    count = query.count()

    # order
    query = query.order_by(Node.hyperdata['publication_date'], Node.id)

    # paginate the query
    offset = parameters['pagination_offset']
    limit = parameters['pagination_limit']
    if limit == -1:
        query = query[offset:]
    else:
        # fix: was query[offset:limit], which treated the limit as an
        # absolute end index -- any nonzero offset produced truncated or
        # empty pages (e.g. offset=10, limit=10 => empty slice)
        query = query[offset:offset + limit]

    # return the result!
    # (the receiver function does the filtering of fields and hyperdata_filter)
    return parameters, query, count
def get(self, request):
    """
    Returns the full "terms" data for a corpus: mainlist details plus
    map/stop membership, groupings and scores.

    GET params (two alternatives):
        corpus=<id>                      -- the family of lists (MAINLIST,
                                            GROUPLIST, STOPLIST, MAPLIST) is
                                            looked up among its children
                                            (scoring=<id> optional, defaults
                                            to the OCCURRENCES child)
        mainlist=<id>&scoring=<id>       -- explicit pointers; 'groups',
                                            'stoplist', 'maplist' optional
    Optional:
        head=<k>  -- only the top k mainlist ngrams (simplified form)

    Returns JSON: {ngraminfos, listmembers, links, nodeids}.
    """
    parameters = get_parameters(request)
    glance_limit = None
    mainlist_id = None
    scores_id = None
    groups_id = None
    other_list_ids = {'maplist': None, 'stoplist': None}

    # 1) retrieve a mainlist_id and other lists
    ##########################################

    # simple request: just refers to the parent corpus
    # ------------------------------------------------
    if "corpus" in parameters:
        corpus_id = parameters['corpus']
        corpus = cache.Node[corpus_id]
        # with a corpus_id, the explicit scoring pointer is optional
        if "scoring" in parameters:
            scores_id = parameters['scoring']
        else:
            scores_id = corpus.children('OCCURRENCES').first().id
        # retrieve the family of lists that have corpus as parent
        mainlist_id = corpus.children('MAINLIST').first().id
        groups_id = corpus.children('GROUPLIST').first().id
        other_list_ids['stoplist'] = corpus.children('STOPLIST').first().id
        other_list_ids['maplist'] = corpus.children('MAPLIST').first().id

    # custom request: refers to each list individually
    # -------------------------------------------------
    elif "mainlist" in parameters and "scoring" in parameters:
        mainlist_id = parameters['mainlist']
        scores_id = parameters['scoring']
        groups_id = None
        if 'groups' in parameters:
            # fix: previously read parameters['scoring'] here (copy-paste
            # bug), so an explicit 'groups' param was silently ignored
            groups_id = parameters['groups']
        for k in ['stoplist', 'maplist']:
            if k in parameters:
                other_list_ids[k] = parameters[k]

    # or request has an error
    # -----------------------
    else:
        raise ValidationException(
            "Either a 'corpus' parameter or 'mainlist' & 'scoring' params are required"
            )

    # 2) get the infos for each list
    ################################
    ngraminfo = {}     # ngram details sorted per ngram id
    linkinfo = {}      # ngram groups sorted per ngram id
    listmembers = {}   # ngram ids sorted per list name

    if "head" in parameters:
        # head <=> only mainlist AND only k top ngrams
        glance_limit = int(parameters['head'])
        mainlist_query = query_list(mainlist_id, details=True,
                                    pagination_limit=glance_limit,
                                    scoring_metric_id=scores_id)
    else:
        # infos for all ngrams from mainlist
        mainlist_query = query_list(mainlist_id, details=True,
                                    scoring_metric_id=scores_id)

    # infos for grouped ngrams, absent from mainlist
    hidden_ngrams_query = query_grouped_ngrams(groups_id, details=True)

    # infos for stoplist terms, absent from mainlist
    stop_ngrams_query = query_list(other_list_ids['stoplist'], details=True,
                                   scoring_metric_id=scores_id)

    # and for the other lists (stop and map)
    # no details needed here, just the member ids
    for li in other_list_ids:
        li_elts = query_list(other_list_ids[li], details=False).all()
        # simple array of ngram_ids
        listmembers[li] = [ng[0] for ng in li_elts]

    # and the groupings
    if groups_id:
        links = Translations(groups_id)
        linkinfo = links.groups

    # list of ngrams that need their details in the response
    ngrams_which_need_detailed_info = []
    if "head" in parameters:
        # head triggered simplified form: just the top of the mainlist
        # TODO add maplist membership
        ngrams_which_need_detailed_info = mainlist_query.all()
    else:
        ngrams_which_need_detailed_info = (mainlist_query.all()
                                           + hidden_ngrams_query.all()
                                           + stop_ngrams_query.all())

    # the output form of details is:
    #    ngraminfo[id] => [term, weight]
    for ng in ngrams_which_need_detailed_info:
        ng_id = ng[0]
        ngraminfo[ng_id] = ng[1:]
        # NB the client js will sort mainlist ngs from hidden ngs after ajax
        #    using linkinfo (otherwise needs redundant listmembers for main)

    return JsonHttpResponse({
        'ngraminfos': ngraminfo,
        'listmembers': listmembers,
        'links': linkinfo,
        'nodeids': {
            'mainlist': mainlist_id,
            'maplist': other_list_ids['maplist'],
            'stoplist': other_list_ids['stoplist'],
            'groups': groups_id,
            'scores': scores_id,
        }
    })
def put(self, request):
    """
    Add some group elements to a group node
    => adds new couples from GroupsBuffer._to_add of terms view

    TODO see use of util.lists.Translations

    Parameters are all in the url (for symmetry with DELETE method)
        api/ngramlists/groups?node=783&1228[]=891,1639
                => creates 1228 - 891
                       and 1228 - 1639

    general format is:   mainform_id[]=subform_id1,subform_id2 etc
                => creates mainform_id - subform_id1
                       and mainform_id - subform_id2

    NB: also checks if the couples exist before because the ngram table
        will send the entire group (old existing links + new links)
    """
    url_params = get_parameters(request)

    # the unique 'node' param identifies the grouping node
    grouping_node_id = url_params.pop('node')

    # each remaining param encodes a (mainform, subform) link to create
    requested_couples = self.links_to_couples(url_params)

    # local version of "insert if not exists" -------------------->8--------
    # (1) fetch the couples already stored for this node
    existing_rows = (session.query(NodeNgramNgram)
                            .filter(NodeNgramNgram.node_id == grouping_node_id)
                            .filter(tuple_(NodeNgramNgram.ngram1_id,
                                           NodeNgramNgram.ngram2_id)
                                    .in_(requested_couples)))
    already_stored = set()
    for row in existing_rows.all():
        already_stored.add((row.ngram1_id, row.ngram2_id))

    # (2) compute the difference locally: keep only genuinely new couples
    new_couples = [(mform, sform)
                   for (mform, sform) in requested_couples
                   if (mform, sform) not in already_stored]

    # (3) bulk-insert the new groupings
    bulk_insert(NodeNgramNgram,
                ('node_id', 'ngram1_id', 'ngram2_id', 'weight'),
                ((grouping_node_id, mainform, subform, 1.0)
                 for (mainform, subform) in new_couples))
    # ------------------------------------------------------------>8--------

    return JsonHttpResponse({
        'count_added': len(new_couples),
    }, 200)
def put(self, request):
    """
    Basic external access for *creating an ngram*
    ---------------------------------------------
    1 - checks user authentication before any changes

    2 - checks if the ngram is already in the Ngram table in DB
          if yes returns ngram_id and optionally mainform_id
          otherwise continues

    3 - adds the ngram to the Ngram table in DB

    4 - (if corpus param is present)
        adds the ngram doc counts to the NodeNgram table in DB
        (aka "index the ngram" through the docs of the corpus)

    5 - returns json with:
         'msg'   => a success msg
         'text'  => the initial text content
         'term'  => the normalized text content
         'id'    => the new ngram_id
         'count' => the number of docs with the ngram in the corpus
                    (if corpus param is present)
         'group' => the mainform_id if applicable

    possible inline parameters
    --------------------------
    @param text=<ngram_string>         [required]
    @param corpus=<CORPUS_ID>          [optional]
    @param testgroup (true if present) [optional, requires corpus]
    """
    # 1 - check user authentication
    if not request.user.is_authenticated():
        res = HttpResponse("Unauthorized")
        res.status_code = 401
        return res

    # the params
    params = get_parameters(request)
    print("PARAMS", [(i,v) for (i,v) in params.items()])

    if 'text' in params:
        original_text = str(params.pop('text'))
        # the normalized form is what gets stored and compared in DB
        ngram_str = normalize_forms(normalize_chars(original_text))
    else:
        raise ValidationException('The route PUT /api/ngrams/ is used to create a new ngram\ It requires a "text" parameter,\ for instance /api/ngrams?text=hydrometallurgy')

    # 'testgroup' only makes sense relative to a corpus' GROUPLIST
    if ('testgroup' in params) and (not ('corpus' in params)):
        raise ValidationException("'testgroup' param requires 'corpus' param")

    # if we have a 'corpus' param (to do the indexing)...
    do_indexation = False
    if 'corpus' in params:
        # we retrieve the corpus...
        corpus_id = int(params.pop('corpus'))
        corpus_node = cache.Node[corpus_id]
        # and the user must also have rights on the corpus
        if request.user.id == corpus_node.user_id:
            do_indexation = True
        else:
            res = HttpResponse("Unauthorized")
            res.status_code = 401
            return res

    # number of "words" in the ngram (space-separated tokens)
    ngram_size = len(findall(r' +', ngram_str)) + 1

    # do the additions
    try:
        log_msg = ""
        ngram_id = None
        mainform_id = None

        # 2 - check if the exact normalized form is already stored
        preexisting = session.query(Ngram).filter(Ngram.terms==ngram_str).first()

        if preexisting is not None:
            ngram_id = preexisting.id
            log_msg += "ngram already existed (id %i)\n" % ngram_id

            # in the context of a corpus we can also check if has mainform
            # (useful for)
            if 'testgroup' in params:
                groupings_id = (session.query(Node.id)
                                       .filter(Node.parent_id == corpus_id)
                                       .filter(Node.typename == 'GROUPLIST')
                                       .first()
                                )
                # NOTE(review): .first() on a query of Node.id returns a
                # result row (a 1-tuple), not a bare id — comparing
                # NodeNgramNgram.node_id to it below looks suspicious;
                # confirm SQLAlchemy coerces this as intended.
                had_mainform = (session.query(NodeNgramNgram.ngram1_id)
                                       .filter(NodeNgramNgram.node_id == groupings_id)
                                       .filter(NodeNgramNgram.ngram2_id == preexisting.id)
                                       .first()
                                )
                if had_mainform:
                    mainform_id = had_mainform[0]
                    log_msg += "ngram had mainform (id %i) in this corpus" % mainform_id
                else:
                    log_msg += "ngram was not in any group for this corpus"
        else:
            # 2 - insert into Ngrams
            new_ngram = Ngram(terms=ngram_str, n=ngram_size)
            session.add(new_ngram)
            session.commit()
            ngram_id = new_ngram.id
            log_msg += "ngram was added with new id %i\n" % ngram_id

        # 3 - index the term (count its occurrences in the corpus docs)
        if do_indexation:
            n_added = index_new_ngrams([ngram_id], corpus_node)
            log_msg += 'ngram indexed in corpus %i\n' % corpus_id

        # NB: n_added is only bound when do_indexation is True, and the
        # conditional expression below only evaluates it in that case
        return JsonHttpResponse({
            'msg': log_msg,
            'text': original_text,
            'term': ngram_str,
            'id' : ngram_id,
            'group' : mainform_id,
            'count': n_added if do_indexation else 'no corpus provided for indexation'
            }, 200)

    # just in case
    except Exception as e:
        return JsonHttpResponse({
            'msg': str(e),
            'text': original_text
            }, 400)