def query(request):
    """
    Pubmed year by year results

    # alist = [
    #     {'string': '2011[dp] serendipity', 'queryKey': '1',
    #      'webEnv': 'NCID_1_11...._F_1', 'count': 475, 'retmax': 6},
    #     {'string': '2012[dp] serendipity', 'queryKey': '1',
    #      'webEnv': 'NCID_1_14..._F_1', 'count': 345, 'retmax': 4},
    #     ... ]
    # (reused as thequeries in query_save)
    """
    print(request.method)
    alist = []

    if request.method == "POST":
        query = request.POST["query"]
        if request.POST["N"] == "NaN":
            N = QUERY_SIZE_N_MAX
        else:
            N = int(request.POST["N"])

        if N > QUERY_SIZE_N_MAX:
            msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
            print("ERROR (scrap: pubmed stats):", msg)
            raise ValueError(msg)

        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " query =", query)
        print("LOG::TIME:_ " + datetime.datetime.now().isoformat() + " N =", N)

        instancia = Scraper()
        # serialFetcher (n_last_years, query, query_size)
        alist = instancia.serialFetcher(5, query, N)

    data = alist
    return JsonHttpResponse(data)
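# --- usage sketch (not part of the original source) ---------------------------
# A hedged example of posting to this stats view; the mount point
# "/scrapers/pubmed/query" is an assumption for illustration. Only the
# "query" and "N" POST fields come from the view above ("N" == "NaN" falls
# back to QUERY_SIZE_N_MAX). The returned list is the one later reused as
# `thequeries` by the pubmed save view.
def example_pubmed_stats():
    import requests  # assumed available client-side
    res = requests.post("http://localhost:8000/scrapers/pubmed/query",  # hypothetical URL
                        data={"query": "serendipity", "N": "NaN"})
    # e.g. [{'string': '2011[dp] serendipity', 'count': 475, 'retmax': 6, ...}, ...]
    return res.json()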
def delete(self, request, corpus_id):
    """
    DELETE http://localhost:8000/api/nodes/2/favorites?docs=53,54
    (will delete docs 53 and 54 from the favorites of corpus 2)
    """
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    # user is ok
    fav_node = self._get_fav_node(corpus_id)
    response = {}

    if fav_node is None:
        response = {
            'warning': "No favorites node is defined for this corpus ('%s')"
                       % self.corpus.name,
            'count_removed': 0
        }
    else:
        req_params = validate(get_parameters(request),
                              {'docs': list, 'default': ""})
        nodeids_to_delete = [int(did) for did in req_params['docs'].split(',')]

        # it deletes from favourites but not from DB
        result = session.execute(
            delete(NodeNode)
            .where(NodeNode.node1_id == fav_node.id)
            .where(NodeNode.node2_id.in_(nodeids_to_delete))
        )
        session.commit()
        response = {'count_removed': result.rowcount}

    return JsonHttpResponse(response)
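# --- usage sketch (not part of the original source) ---------------------------
# A minimal client-side call of the DELETE route documented in the docstring
# above; an authenticated session cookie is assumed (401 otherwise).
def example_remove_favorites(cookies=None):
    import requests  # assumed available client-side
    # delete docs 53 and 54 from the favorites of corpus 2 (cf. docstring)
    res = requests.delete("http://localhost:8000/api/nodes/2/favorites",
                          params={"docs": "53,54"},
                          cookies=cookies)  # hypothetical auth cookies
    return res.json()  # e.g. {'count_removed': 2}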
def patch(self, request, corpusnode_id):
    """
    PATCH triggers recount of metrics for the specified corpus.

    ex: PATCH http://localhost:8000/api/metrics/14072
                                                -----
                                                corpus_id
    """
    print("==> update metrics request on ", corpusnode_id)

    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    try:
        corpus = cache.Node[int(corpusnode_id)]
    except Exception:
        corpus = None

    if corpus is None:
        raise ValidationException("%s is not a valid corpus node id."
                                  % corpusnode_id)
    else:
        t_before = datetime.now()
        # =============
        scheduled(recount)(corpus.id)
        # =============
        t_after = datetime.now()

        return JsonHttpResponse({
            'corpus_id': corpusnode_id,
            'took': "%f s." % (t_after - t_before).total_seconds()
        })
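# --- usage sketch (not part of the original source) ---------------------------
# Triggering a metrics recount via the PATCH route shown in the docstring;
# auth cookies are assumed (the view returns 401 without them).
def example_recount_metrics(cookies=None):
    import requests  # assumed available client-side
    res = requests.patch("http://localhost:8000/api/metrics/14072",
                         cookies=cookies)  # hypothetical auth cookies
    return res.json()  # e.g. {'corpus_id': '14072', 'took': '3.140000 s.'}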
def save(request, project_id):
    '''save'''
    if request.method == "POST":
        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except (TypeError, ValueError):
            N = 0
        print(query, N)

        # for next time
        # ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_SCOAP)

        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        # do we have a valid project id?
        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()

        # do we have a valid project?
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()

        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()

        # corpus node instantiation as a Django model
        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={"action": "Scrapping data",
                                 "language_id": "en"})

        # download_file
        crawler_bot = load_crawler(source)()
        # for now, no way to force downloading X records;
        # this is the long-running command:
        filename = crawler_bot.download(query)
        corpus.add_resource(type=source["type"],
                            # name=source["name"],
                            path=crawler_bot.path)

        session.add(corpus)
        session.commit()
        # corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except Exception:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query, N]
    print(data)
    return JsonHttpResponse(data)
def get(self, request):
    parameters = get_parameters(request)
    glance_limit = None
    mainlist_id = None
    scores_id = None
    groups_id = None
    other_list_ids = {'maplist': None, 'stoplist': None}

    # 1) retrieve a mainlist_id and other lists
    ##########################################

    # simple request: just refers to the parent corpus
    # ------------------------------------------------
    if "corpus" in parameters:
        corpus_id = parameters['corpus']
        corpus = cache.Node[corpus_id]
        # with a corpus_id, the explicit scoring pointer is optional
        if "scoring" in parameters:
            scores_id = parameters['scoring']
        else:
            scores_id = corpus.children('OCCURRENCES').first().id
        # retrieve the family of lists that have corpus as parent
        mainlist_id = corpus.children('MAINLIST').first().id
        groups_id = corpus.children('GROUPLIST').first().id
        other_list_ids['stoplist'] = corpus.children('STOPLIST').first().id
        other_list_ids['maplist'] = corpus.children('MAPLIST').first().id

    # custom request: refers to each list individually
    # -------------------------------------------------
    elif "mainlist" in parameters and "scoring" in parameters:
        mainlist_id = parameters['mainlist']
        scores_id = parameters['scoring']
        groups_id = None
        if 'groups' in parameters:
            groups_id = parameters['groups']
        for k in ['stoplist', 'maplist']:
            if k in parameters:
                other_list_ids[k] = parameters[k]

    # or request has an error
    # -----------------------
    else:
        raise ValidationException(
            "Either a 'corpus' parameter or 'mainlist' & 'scoring' params are required"
        )

    # 2) get the infos for each list
    ################################
    ngraminfo = {}      # ngram details sorted per ngram id
    linkinfo = {}       # ngram groups sorted per ngram id
    listmembers = {}    # ngram ids sorted per list name

    if "head" in parameters:
        # head <=> only mainlist AND only k top ngrams
        glance_limit = int(parameters['head'])
        mainlist_query = query_list(mainlist_id, details=True,
                                    pagination_limit=glance_limit,
                                    scoring_metric_id=scores_id)
    else:
        # infos for all ngrams from mainlist
        mainlist_query = query_list(mainlist_id, details=True,
                                    scoring_metric_id=scores_id)

    # infos for grouped ngrams, absent from mainlist
    hidden_ngrams_query = query_grouped_ngrams(groups_id, details=True)

    # infos for stoplist terms, absent from mainlist
    stop_ngrams_query = query_list(other_list_ids['stoplist'], details=True,
                                   scoring_metric_id=scores_id)

    # and for the other lists (stop and map):
    # no details needed here, just the member ids
    for li in other_list_ids:
        li_elts = query_list(other_list_ids[li], details=False).all()
        # simple array of ngram_ids
        listmembers[li] = [ng[0] for ng in li_elts]

    # and the groupings
    if groups_id:
        links = Translations(groups_id)
        linkinfo = links.groups

    # list of ngrams
    ngrams_which_need_detailed_info = []
    if "head" in parameters:
        # head triggered simplified form: just the top of the mainlist
        # TODO add maplist membership
        ngrams_which_need_detailed_info = mainlist_query.all()
    else:
        ngrams_which_need_detailed_info = (mainlist_query.all()
                                           + hidden_ngrams_query.all()
                                           + stop_ngrams_query.all())

    # the output form of details is:
    #   ngraminfo[id] => [term, weight]
    for ng in ngrams_which_need_detailed_info:
        ng_id = ng[0]
        ngraminfo[ng_id] = ng[1:]

    # NB the client js will sort mainlist ngs from hidden ngs after ajax,
    #    using linkinfo (otherwise needs redundant listmembers for main)

    return JsonHttpResponse({
        'ngraminfos': ngraminfo,
        'listmembers': listmembers,
        'links': linkinfo,
        'nodeids': {
            'mainlist': mainlist_id,
            'maplist': other_list_ids['maplist'],
            'stoplist': other_list_ids['stoplist'],
            'groups': groups_id,
            'scores': scores_id,
        }
    })
def get(self, request):
    parameters = get_parameters(request)
    maplist_id = None
    scores_id = None

    if "corpus" in parameters:
        corpus_id = parameters['corpus']
        corpus = cache.Node[corpus_id]
        maplist_id = corpus.children('MAPLIST').first().id
        # with a corpus_id, the explicit scoring pointer is optional
        if "scoring" in parameters:
            scores_id = parameters['scoring']
        else:
            scores_id = corpus.children('OCCURRENCES').first().id
    elif "maplist" in parameters and "scoring" in parameters:
        maplist_id = int(parameters['maplist'])
        scores_id = int(parameters['scoring'])
    else:
        raise ValidationException(
            "A 'corpus' id or 'maplist' id is required, and a 'scoring' for occurrences counts"
        )

    ngraminfo = {}                  # ngram details sorted per ngram id
    listmembers = {'maplist': []}   # ngram ids sorted per list name

    # infos for all ngrams from maplist
    map_ngrams = query_list(maplist_id, details=True,
                            scoring_metric_id=scores_id).all()
    # ex: [(8805, 'mean age', 4.0),
    #      (1632, 'activity', 4.0),
    #      (8423, 'present', 2.0),
    #      (2928, 'objective', 2.0)]

    # shortcut to useful function during loop
    add_to_members = listmembers['maplist'].append

    for ng in map_ngrams:
        ng_id = ng[0]
        ngraminfo[ng_id] = ng[1:]
        # maplist ngrams will already be <=> ngraminfos,
        # but the client side expects a membership lookup
        # as when there are multiple lists or some groupings
        add_to_members(ng_id)

    return JsonHttpResponse({
        'ngraminfos': ngraminfo,
        'listmembers': listmembers,
        'links': {},   # no grouping links sent during glance (for speed)
        'nodeids': {
            'mainlist': None,
            'maplist': maplist_id,
            'stoplist': None,
            'groups': None,
            'scores': None,
        }
    })
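# --- usage sketch (not part of the original source) ---------------------------
# Reading a map-list "glance" with explicit ids; the route name is an
# assumption, while the query params and the response shape ('ngraminfos',
# 'listmembers', ...) come from the view above.
def example_maplist_glance(maplist_id, scoring_id):
    import requests  # assumed available client-side
    res = requests.get("http://localhost:8000/api/ngramlists/glance",  # hypothetical URL
                       params={"maplist": maplist_id, "scoring": scoring_id})
    payload = res.json()
    # e.g. payload['ngraminfos'] == {'8805': ['mean age', 4.0], ...}
    return payload['listmembers']['maplist']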
def put(self, request):
    """
    Add some group elements to a group node
      => adds new couples from GroupsBuffer._to_add of terms view

    TODO see use of util.lists.Translations

    Parameters are all in the url (for symmetry with DELETE method):
        api/ngramlists/groups?node=783&1228[]=891,1639
            => creates 1228 - 891 and 1228 - 1639

    general format is:  mainform_id[]=subform_id1,subform_id2 etc.
            => creates mainform_id - subform_id1 and mainform_id - subform_id2

    NB: also checks if the couples exist before, because the ngram table
        will send the entire group (old existing links + new links)
    """
    # from the url
    params = get_parameters(request)
    # the node param is unique
    group_node = params.pop('node')
    # the other params are links to change
    couples = self.links_to_couples(params)

    # debug
    # print("==couples from url =================================++++=")
    # print(couples)

    # local version of "insert if not exists" -------------------->8--------
    # (1) check already existing elements
    check_query = (session.query(NodeNgramNgram)
                   .filter(NodeNgramNgram.node_id == group_node)
                   .filter(tuple_(NodeNgramNgram.ngram1_id,
                                  NodeNgramNgram.ngram2_id).in_(couples)))
    existing = {}
    for synonyms in check_query.all():
        existing[(synonyms.ngram1_id, synonyms.ngram2_id)] = True

    # debug
    # print("==existing")
    # print(existing)

    # (2) compute difference locally
    couples_to_add = [(mform, sform) for (mform, sform) in couples
                      if (mform, sform) not in existing]

    # debug
    # print("== couples_to_add =================================++++=")
    # print(couples_to_add)

    # (3) add new groupings
    bulk_insert(NodeNgramNgram,
                ('node_id', 'ngram1_id', 'ngram2_id', 'weight'),
                ((group_node, mainform, subform, 1.0)
                 for (mainform, subform) in couples_to_add))
    # ------------------------------------------------------------>8--------

    return JsonHttpResponse({
        'count_added': len(couples_to_add),
    }, 200)
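# --- usage sketch (not part of the original source) ---------------------------
# Creating group couples with the url format documented in the docstring
# (mainform_id[]=subform_id1,subform_id2); the base URL is an assumption.
def example_add_groupings():
    import requests  # assumed available client-side
    # api/ngramlists/groups?node=783&1228[]=891,1639
    #   => creates 1228 - 891 and 1228 - 1639
    res = requests.put("http://localhost:8000/api/ngramlists/groups",
                       params={"node": 783, "1228[]": "891,1639"})
    return res.json()  # e.g. {'count_added': 2}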
def patch(self, request):
    """
    A copy of POST (merging lists) but with the source being just an
    internal corpus_id.

    params in request.GET:
        onto_corpus: the corpus whose lists are getting patched
        from_corpus: the corpus from which we take the source lists to merge in
        todo:        an array of the list types ("map", "main", "stop") to merge in
    """
    if not request.user.is_authenticated():
        res = HttpResponse("Unauthorized")
        res.status_code = 401
        return res

    params = get_parameters(request)
    print(params)

    # the corpus with the target lists to be patched
    corpus_id = int(params.pop("onto_corpus"))
    corpus_node = cache.Node[corpus_id]
    print(params)

    if request.user.id != corpus_node.user_id:
        res = HttpResponse("Unauthorized")
        res.status_code = 401
        return res

    list_types = {'map': 'MAPLIST', 'main': 'MAINLIST', 'stop': 'STOPLIST'}

    # internal DB retrieve source_lists
    source_corpus_id = int(params.pop("from_corpus"))
    source_node = cache.Node[source_corpus_id]

    todo_lists = params.pop("todo").split(',')   # ex: ['map', 'stop']
    source_lists = {}
    for key in todo_lists:
        source_lists[key] = UnweightedList(
            source_node.children(list_types[key]).first().id)

    # add the groupings too
    source_lists['groupings'] = Translations(
        source_node.children("GROUPLIST").first().id)

    # attempt to merge and send response
    try:
        # merge the source_lists onto those of the target corpus
        delete = todo_lists if bool(params.get('overwrite')) else []
        if len(delete) == len(list_types):
            delete.append('groupings')
        log_msg = merge_ngramlists(source_lists,
                                   onto_corpus=corpus_node,
                                   del_originals=delete)
        return JsonHttpResponse({
            'log': log_msg,
        }, 200)
    except Exception as e:
        return JsonHttpResponse({
            'err': str(e),
        }, 400)
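# --- usage sketch (not part of the original source) ---------------------------
# Merging the map and stop lists of one corpus into another; the param names
# follow the view above, while the route path and corpus ids are assumptions.
def example_merge_lists(cookies=None):
    import requests  # assumed available client-side
    res = requests.patch("http://localhost:8000/api/ngramlists/import",  # hypothetical URL
                         params={"onto_corpus": 52634,   # hypothetical ids
                                 "from_corpus": 52633,
                                 "todo": "map,stop"},
                         cookies=cookies)  # auth required (401 otherwise)
    return res.json()  # {'log': ...} on success, {'err': ...} on failure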
def save(request, project_id, return_corpus=False):
    print("testISTEX:")
    print(request.method)
    alist = ["bar", "foo"]

    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (session.query(Node)
               .filter(Node.id == project_id)
               .filter(Node.typename == 'PROJECT')).first()
    if project is None:
        raise Http404()

    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    query_string = ""
    if request.method == "POST":
        query = "-"
        query_string = "-"
        # N = QUERY_SIZE_N_MAX

        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ", "+")   # url-encoded q

        if "N" in request.POST:
            if request.POST["N"] == "NaN":
                N = QUERY_SIZE_N_MAX
            else:
                N = int(request.POST["N"])   # query_size from views_opti

            if N > QUERY_SIZE_N_MAX:
                N = QUERY_SIZE_N_MAX
                # msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                # print("ERROR (scrap: istex d/l ): ", msg)
                # raise ValueError(msg)

        print("Scrapping Istex: '%s' (%i)" % (query_string, N))

        urlreqs = []
        pagesize = 50
        tasks = Scraper()
        chunks = list(tasks.chunks(range(N), pagesize))

        for k in chunks:
            if (k[0] + pagesize) > N:
                pagesize = N - k[0]
            urlreqs.append(
                "http://api.istex.fr/document/?q=" + query_string
                + "&output=id,corpusName,title,genre,language,doi,host,publicationDate,abstract,author&"
                + "from=" + str(k[0]) + "&size=" + str(pagesize))

        # corpus node instantiation as a Django model
        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={"action": "Scrapping data",
                                 "language_id": None})

        tasks = Scraper()
        for i in range(8):
            t = threading.Thread(target=tasks.worker2)   # thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        for url in urlreqs:
            tasks.q.put(url)   # put a task in the queue
        tasks.q.join()         # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename is not False:
                # add the uploaded resource to the corpus
                corpus.add_resource(
                    type=get_resource(RESOURCE_TYPE_ISTEX)["type"],
                    path=filename)
                dwnldsOK += 1

        session.add(corpus)
        session.commit()
        # corpus_id = corpus.id

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])

        ###########################
        ###########################
        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except Exception:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------

        if return_corpus:
            return corpus

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
def save(request, project_id):
    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = session.query(Node).filter(Node.id == project_id).first()
    if project is None:
        raise Http404()

    user = cache.User[request.user.id]
    if not user.owns(project):
        return HttpResponseForbidden()

    if request.method == "POST":
        queries = request.POST["query"]
        name = request.POST["string"]

        # here we just realize queries already prepared by getGlobalStats
        # ===> no need to repeat N parameter like in testISTEX <===
        instancia = Scraper()
        thequeries = json.loads(queries)

        # fyi the sum of our prepared yearly proportional quotas
        sampled_sum = sum([year_q['retmax'] for year_q in thequeries])
        print("Scrapping Pubmed: '%s' (N=%i)" % (name, sampled_sum))

        urlreqs = []
        for yearquery in thequeries:
            urlreqs.append(instancia.medlineEfetchRAW(yearquery))

        alist = ["tudo fixe", "tudo bem"]

        # corpus node instantiation as a Django model
        corpus = project.add_child(name=name, typename="CORPUS")

        # urlreqs: list of urls to query.
        # Then, for each url in urlreqs you do:
        #     eFetchResult = urlopen(url)
        #     eFetchResult.read()
        # this will output the XML... normally you write this to an XML file.

        tasks = Scraper()
        for i in range(8):
            t = threading.Thread(target=tasks.worker2)   # thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        for url in urlreqs:
            tasks.q.put(url)   # put a task in the queue
        tasks.q.join()         # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            print(filename)
            if filename is not False:
                # add the uploaded resource to the corpus
                corpus.add_resource(
                    type=get_resource_by_name('Pubmed [XML]')["type"],
                    path=filename,
                    url=None)
                print("Adding the resource")
                dwnldsOK += 1

        session.add(corpus)
        session.commit()
        corpus_id = corpus.id

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])

        try:
            scheduled(parse_extract_indexhyperdata)(corpus_id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except Exception:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------

        sleep(1)
        return HttpResponseRedirect('/projects/' + str(project_id))

    data = alist
    return JsonHttpResponse(data)
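# --- usage sketch (not part of the original source) ---------------------------
# This save view consumes the yearly quotas prepared by the pubmed stats view
# (json in the "query" field, corpus name in "string"); the mount point is an
# assumption for illustration.
def example_pubmed_save(project_id, thequeries):
    import json
    import requests  # assumed available client-side
    res = requests.post("http://localhost:8000/projects/%d/pubmed/save" % project_id,  # hypothetical URL
                        data={"query": json.dumps(thequeries),
                              "string": "serendipity"},
                        allow_redirects=False)
    return res.status_code  # 302 redirect to /projects/<id> on success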
def put(self, request):
    """
    Basic external access for *creating an ngram*
    ---------------------------------------------
    1 - checks user authentication before any changes
    2 - checks if the ngram already exists in the Ngram table in DB:
          if yes, returns ngram_id and optionally mainform_id,
          otherwise continues
    3 - adds the ngram to the Ngram table in DB
    4 - (if corpus param is present)
        adds the ngram doc counts to the NodeNgram table in DB
        (aka "index the ngram" through the docs of the corpus)
    5 - returns json with:
            'msg'   => a success msg
            'text'  => the initial text content
            'term'  => the normalized text content
            'id'    => the new ngram_id
            'count' => the number of docs with the ngram in the corpus
                       (if corpus param is present)
            'group' => the mainform_id if applicable

    possible inline parameters
    --------------------------
    @param text=<ngram_string>             [required]
    @param corpus=<CORPUS_ID>              [optional]
    @param testgroup (true if present)     [optional, requires corpus]
    """
    # 1 - check user authentication
    if not request.user.is_authenticated():
        res = HttpResponse("Unauthorized")
        res.status_code = 401
        return res

    # the params
    params = get_parameters(request)
    print("PARAMS", [(i, v) for (i, v) in params.items()])

    if 'text' in params:
        original_text = str(params.pop('text'))
        ngram_str = normalize_forms(normalize_chars(original_text))
    else:
        raise ValidationException(
            'The route PUT /api/ngrams/ is used to create a new ngram. '
            'It requires a "text" parameter, '
            'for instance /api/ngrams?text=hydrometallurgy')

    if ('testgroup' in params) and ('corpus' not in params):
        raise ValidationException("'testgroup' param requires 'corpus' param")

    # if we have a 'corpus' param (to do the indexing)...
    do_indexation = False
    if 'corpus' in params:
        # we retrieve the corpus...
        corpus_id = int(params.pop('corpus'))
        corpus_node = cache.Node[corpus_id]
        # and the user must also have rights on the corpus
        if request.user.id == corpus_node.user_id:
            do_indexation = True
        else:
            res = HttpResponse("Unauthorized")
            res.status_code = 401
            return res

    # number of "words" in the ngram
    ngram_size = len(findall(r' +', ngram_str)) + 1

    # do the additions
    try:
        log_msg = ""
        ngram_id = None
        mainform_id = None

        preexisting = session.query(Ngram).filter(Ngram.terms == ngram_str).first()

        if preexisting is not None:
            ngram_id = preexisting.id
            log_msg += "ngram already existed (id %i)\n" % ngram_id

            # in the context of a corpus we can also check if it has a mainform
            if 'testgroup' in params:
                groupings_id = (session.query(Node.id)
                                .filter(Node.parent_id == corpus_id)
                                .filter(Node.typename == 'GROUPLIST')
                                .first())
                had_mainform = (session.query(NodeNgramNgram.ngram1_id)
                                .filter(NodeNgramNgram.node_id == groupings_id)
                                .filter(NodeNgramNgram.ngram2_id == preexisting.id)
                                .first())
                if had_mainform:
                    mainform_id = had_mainform[0]
                    log_msg += "ngram had mainform (id %i) in this corpus" % mainform_id
                else:
                    log_msg += "ngram was not in any group for this corpus"
        else:
            # 2 - insert into Ngrams
            new_ngram = Ngram(terms=ngram_str, n=ngram_size)
            session.add(new_ngram)
            session.commit()
            ngram_id = new_ngram.id
            log_msg += "ngram was added with new id %i\n" % ngram_id

        # 3 - index the term
        if do_indexation:
            n_added = index_new_ngrams([ngram_id], corpus_node)
            log_msg += 'ngram indexed in corpus %i\n' % corpus_id

        return JsonHttpResponse({
            'msg': log_msg,
            'text': original_text,
            'term': ngram_str,
            'id': ngram_id,
            'group': mainform_id,
            'count': n_added if do_indexation else 'no corpus provided for indexation'
        }, 200)

    # just in case
    except Exception as e:
        return JsonHttpResponse({
            'msg': str(e),
            'text': original_text
        }, 400)
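# --- usage sketch (not part of the original source) ---------------------------
# Creating (or looking up) an ngram, with optional indexing in a corpus; the
# PUT /api/ngrams route comes from the view's own error message, the corpus id
# is an assumption.
def example_create_ngram(cookies=None):
    import requests  # assumed available client-side
    res = requests.put("http://localhost:8000/api/ngrams",
                       params={"text": "hydrometallurgy",
                               "corpus": 52633,        # hypothetical corpus id
                               "testgroup": "true"},
                       cookies=cookies)  # auth required (401 otherwise)
    return res.json()  # keys: msg, text, term, id, group, count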
def get(self, request, project_id, corpus_id):
    '''
    Graph.get :: Get graph data as REST api.
    Get all the parameters first:
        graph?field1=ngrams&field2=ngrams&
        graph?field1=ngrams&field2=ngrams&start=''&end=''

    NB: the "save new graph" mode (option saveOnly=True without a cooc_id)
        can return the new cooc id in the json
        before counting + filling data in async
    '''
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    # Get the node we are working with
    corpus = session.query(Node).filter(Node.id == corpus_id).first()

    # TODO Parameters to save in hyperdata of the Node Cooc
    # WARNING: we could factorize the parameters as dict but ...
    # ... it causes a bug in asynchronous function!
    # Check celery upgrades before.
    # Example (for the future):
    #   parameters = dict()
    #   parameters['field1'] = field1
    #   parameters['field2'] = field2

    # Get all the parameters in the URL
    cooc_id      = request.GET.get('cooc_id'   , None)
    saveOnly     = request.GET.get('saveOnly'  , None)
    field1       = str(request.GET.get('field1', 'ngrams'))
    field2       = str(request.GET.get('field2', 'ngrams'))
    start        = request.GET.get('start'     , None)
    end          = request.GET.get('end'       , None)
    mapList_id   = int(request.GET.get('mapList'   , 0))
    groupList_id = int(request.GET.get('groupList' , 0))
    threshold    = int(request.GET.get('threshold' , 1))
    bridgeness   = int(request.GET.get('bridgeness', -1))
    format_      = str(request.GET.get('format'    , 'json'))
    type_        = str(request.GET.get('type'      , 'node_link'))
    distance     = str(request.GET.get('distance'  , 'conditional'))

    # Get default map list of corpus
    if mapList_id == 0:
        mapList_id = (session.query(Node.id)
                      .filter(Node.typename == "MAPLIST",
                              Node.parent_id == corpus.id)
                      .first())
        if mapList_id is None:
            raise ValueError("MAPLIST node needed for cooccurrences")
        mapList_id = mapList_id[0]

    # Get default value if no group list
    if groupList_id == 0:
        groupList_id = (session.query(Node.id)
                        .filter(Node.typename == "GROUPLIST",
                                Node.parent_id == corpus.id)
                        .first())
        if groupList_id is None:
            raise ValueError("GROUPLIST node needed for cooccurrences")
        groupList_id = groupList_id[0]

    # Declare accepted fields
    accepted_field1 = ['ngrams', 'journal', 'source', 'authors']
    accepted_field2 = ['ngrams', ]
    options = ['start', 'end', 'threshold', 'distance', 'cooc_id']

    try:
        # Check if parameters are accepted
        if (field1 in accepted_field1) and (field2 in accepted_field2):
            data = get_graph(corpus=corpus, cooc_id=cooc_id,
                             field1=field1, field2=field2,
                             mapList_id=mapList_id, groupList_id=groupList_id,
                             start=start, end=end,
                             threshold=threshold,
                             distance=distance, bridgeness=bridgeness,
                             saveOnly=saveOnly)

            # data      :: Either (Dic Nodes Links) (Dic State Length)
            # data_test :: Either String Bool
            data_test = data.get("state", True)

            if data_test is True:
                # normal case --------------------------------
                if format_ == 'json':
                    return JsonHttpResponse(compress_graph(data), status=200)
                # --------------------------------------------
            else:
                # All other cases (more probable are higher in the if list)
                if data["state"] == "saveOnly":
                    # async data case
                    link = "http://%s/projects/%d/corpora/%d/myGraphs" % (request.get_host(), corpus.parent_id, corpus.id)
                    return JsonHttpResponse({
                        'id': data["target_id"],
                        'name': data["target_name"],
                        'date': data["target_date"],
                        'msg': '''Your graph is being saved: %s''' % format_html(link),
                    }, status=200)

                elif data["state"] == "corpusMin":
                    # async data case
                    link = "http://%s/projects/%d/" % (request.get_host(), corpus.parent_id)
                    return JsonHttpResponse({
                        'msg': '''Problem: your corpus is too small (only %d documents).

Solution: add more documents (more than %d documents)
in order to get a graph.

You can manage your corpus here: %s''' % (data["length"],
                                          graph_constraints['corpusMin'],
                                          format_html(link)),
                    }, status=400)

                elif data["state"] == "mapListError":
                    # async data case
                    link = 'http://%s/projects/%d/corpora/%d/terms' % (request.get_host(), corpus.parent_id, corpus.id)
                    return JsonHttpResponse({
                        'msg': '''Problem: your map list is too small (currently %d terms).

Solution: add some terms (more than %d terms)
in order to get a graph.

You can manage your map terms here: %s''' % (data["length"],
                                             graph_constraints['mapList'],
                                             format_html(link)),
                    }, status=400)

                elif data["state"] == "corpusMax":
                    # async data case
                    link = 'http://%s/projects/%d/corpora/%d/myGraphs' % (request.get_host(), corpus.parent_id, corpus.id)
                    return JsonHttpResponse({
                        'msg': '''Warning: async graph generation since your corpus is big (about %d documents).

Wait a while and discover your graph very soon.
Click on the link below and see your current graph
processing on top of the list: %s''' % (data["length"], format_html(link)),
                    }, status=200)

                else:
                    return JsonHttpResponse({
                        'msg': '''Programming error.''',
                    }, status=400)

        elif len(data["nodes"]) < 2 and len(data["links"]) < 2:
            # empty data case
            return JsonHttpResponse({
                'msg': '''Empty graph warning
No cooccurrences found in this corpus for the words of this maplist
(maybe add more terms to the maplist or increase the size of your corpus?)''',
            }, status=400)

        else:
            # parameters error case
            return JsonHttpResponse({
                'msg': '''Usage warning
Please choose only one field from each range:
    - "field1": %s
    - "field2": %s
    - "options": %s''' % (accepted_field1, accepted_field2, options),
            }, status=400)

    # for any other errors that we forgot to test
    except Exception as error:
        print(error)
        return JsonHttpResponse({
            'msg': 'Unknown error (showing the trace):\n%s'
                   % "\n".join(format_tb(error.__traceback__))
        }, status=400)
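# --- usage sketch (not part of the original source) ---------------------------
# Fetching a conditional-distance graph for a corpus; the "graph?field1=..."
# query stub comes from the docstring above, while the full path and the
# project/corpus ids are assumptions.
def example_get_graph(cookies=None):
    import requests  # assumed available client-side
    res = requests.get("http://localhost:8000/api/projects/1/corpora/52633/graph",  # hypothetical URL
                       params={"field1": "ngrams", "field2": "ngrams",
                               "distance": "conditional", "threshold": 1},
                       cookies=cookies)  # auth required (401 otherwise)
    return res.json()  # compressed node_link graph, or a {'msg': ...} status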
def get(self, request):
    corpus_id_list = list(map(int, request.GET['corpus_id'].split(',')))
    return JsonHttpResponse({
        'data': get_metadata(corpus_id_list),
    })
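# --- usage sketch (not part of the original source) ---------------------------
# The corpus_id parameter is a comma-separated id list, as parsed above;
# the route path and the ids are assumptions.
def example_get_metadata():
    import requests  # assumed available client-side
    res = requests.get("http://localhost:8000/api/hyperdata",  # hypothetical URL
                       params={"corpus_id": "52633,52634"})
    return res.json()['data']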
def post(self, request, project_id):
    # example only
    input = request.data or {
        'x': {
            'with_empty': True,
            'resolution': 'decade',
            'value': 'publication_date',
        },
        'y': {
            # 'divided_by': 'total_ngrams_count',
            # 'divided_by': 'total_documents_count',
        },
        'filter': {
            # 'ngrams': ['bees', 'bee', 'honeybee', 'honeybees', 'honey bee', 'honey bees'],
            # 'ngrams': ['insecticide', 'pesticide'],
            # 'corpora': [52633],
            # 'date': {'min': '1995-12-31'}
        },
        # 'format': 'csv',
    }
    print(input)

    # input validation
    input = validate(input, {
        'type': dict,
        'default': {},
        'items': {
            'x': {
                'type': dict,
                'default': {},
                'items': {
                    # which hyperdata to choose for the date
                    'value': {'type': str,
                              'default': 'publication_date',
                              'range': {'publication_date', }},
                    # time resolution
                    'resolution': {'type': str,
                                   'range': self._resolutions.keys(),
                                   'default': 'month'},
                    # should we add zeroes for empty values?
                    'with_empty': {'type': bool, 'default': False},
                }
            },
            'y': {
                'type': dict,
                'default': {},
                'items': {
                    # measured value
                    'value': {'type': str,
                              'default': 'ngrams_count',
                              'range': {'ngrams_count',
                                        'documents_count',
                                        'ngrams_tfidf'}},
                    # value by which we should normalize
                    'divided_by': {'type': str,
                                   'range': {'total_documents_count',
                                             'documents_count',
                                             'total_ngrams_count'}},
                }
            },
            # filtering
            'filter': {
                'type': dict,
                'default': {},
                'items': {
                    # filter by metadata
                    'hyperdata': {
                        'type': list,
                        'default': [],
                        'items': {
                            'type': dict,
                            'items': {
                                'key': {'type': str,
                                        'range': self._operators.keys()},
                                'operator': {'type': str},
                                'value': {'type': str},
                            }
                        }
                    },
                    # filter by date
                    'date': {
                        'type': dict,
                        'items': {
                            'min': {'type': datetime.datetime},
                            'max': {'type': datetime.datetime},
                        },
                        'default': {}
                    },
                    # filter by corpora
                    'corpora': {'type': list,
                                'default': [],
                                'items': {'type': int}},
                    # filter by ngrams
                    'ngrams': {'type': list,
                               'default': [],
                               'items': {'type': str}},
                }
            },
            # output format
            'format': {'type': str,
                       'default': 'json',
                       'range': {'json', 'csv'}},
        }
    })

    # build query: prepare columns
    X = aliased(NodeHyperdata)
    column_x = func.date_trunc(input['x']['resolution'], X.value_utc)
    column_y = {
        'documents_count': func.count(Node.id.distinct()),
        'ngrams_count': func.sum(NodeNgram.weight),
        # 'ngrams_tfidf': func.sum(NodeNodeNgram.weight),
    }[input['y']['value']]

    # build query: base
    print(input)
    query_base = (session
                  .query(column_x)
                  .select_from(Node)
                  .join(NodeNgram, NodeNgram.node_id == Node.id)
                  .join(X, X.node_id == NodeNgram.node_id)
                  # .filter(X.key == input['x']['value'])
                  .group_by(column_x)
                  .order_by(column_x))

    # build query: base, filter by corpora or project
    if 'corpora' in input['filter'] and input['filter']['corpora']:
        query_base = (query_base
                      .filter(Node.parent_id.in_(input['filter']['corpora'])))
    else:
        ParentNode = aliased(Node)
        query_base = (query_base
                      .join(ParentNode, ParentNode.id == Node.parent_id)
                      .filter(ParentNode.parent_id == project_id))

    # build query: base, filter by date
    if 'date' in input['filter']:
        if 'min' in input['filter']['date']:
            query_base = query_base.filter(X.value >= input['filter']['date']['min'])
        if 'max' in input['filter']['date']:
            query_base = query_base.filter(X.value <= input['filter']['date']['max'])

    # build query: filter by ngrams
    query_result = query_base.add_columns(column_y)
    if 'ngrams' in input['filter'] and input['filter']['ngrams']:
        query_result = (query_result
                        .join(Ngram, Ngram.id == NodeNgram.ngram_id)
                        .filter(Ngram.terms.in_(input['filter']['ngrams'])))

    # build query: filter by metadata
    if 'hyperdata' in input['filter']:
        for h, hyperdata in enumerate(input['filter']['hyperdata']):
            print(h, hyperdata)
            # get hyperdata in database
            # if hyperdata_model is None:
            #     continue
            # hyperdata_id, hyperdata_type = hyperdata_model
            # create alias and query it
            operator = self._operators[hyperdata['operator']]
            type_string = type2string(INDEXED_HYPERDATA[hyperdata['key']]['type'])
            value = self._converters[type_string](hyperdata['value'])
            query_result = (query_result
                            .join(NodeHyperdata,
                                  NodeHyperdata.node_id == NodeNgram.node_id)
                            .filter(NodeHyperdata.key == hyperdata['key'])
                            .filter(operator(NodeHyperdata.value, value)))

    # build result: prepare data
    date_value_list = query_result.all()
    # print(date_value_list)

    if date_value_list:
        date_min = date_value_list[0][0].replace(tzinfo=None)
        date_max = date_value_list[-2][0].replace(tzinfo=None)

    # build result: prepare interval
    result = collections.OrderedDict()
    if input['x']['with_empty'] and date_value_list:
        compute_next_date = self._resolutions[input['x']['resolution']]
        date = date_min
        while date <= date_max:
            result[date] = 0.0
            date = compute_next_date(date)

    # build result: integrate
    for date, value in date_value_list[0:-1]:
        result[date.replace(tzinfo=None)] = value

    # build result: normalize
    query_normalize = None
    if date_value_list and 'divided_by' in input['y'] and input['y']['divided_by']:
        if input['y']['divided_by'] == 'total_documents_count':
            query_normalize = query_base.add_column(func.count(Node.id.distinct()))
        elif input['y']['divided_by'] == 'total_ngrams_count':
            query_normalize = query_base.add_column(func.sum(NodeNgram.weight))
    if query_normalize is not None:
        for date, value in query_normalize[0:-1]:
            date = date.replace(tzinfo=None)
            if date in result:
                result[date] /= value

    # return result with proper formatting
    if input['format'] == 'json':
        return JsonHttpResponse({
            'query': input,
            'result': sorted(result.items()),
        }, 201)
    elif input['format'] == 'csv':
        return CsvHttpResponse(sorted(result.items()), ('date', 'value'), 201)
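# --- usage sketch (not part of the original source) ---------------------------
# Posting a histogram request mirroring the example `input` default above:
# per-decade documents_count normalized by total_documents_count, filtered by
# ngrams. The JSON body keys come from the validator; the route path and the
# project id are assumptions.
def example_datechart(project_id, cookies=None):
    import requests  # assumed available client-side
    body = {
        'x': {'resolution': 'decade', 'with_empty': True},
        'y': {'value': 'documents_count', 'divided_by': 'total_documents_count'},
        'filter': {'ngrams': ['insecticide', 'pesticide']},
        'format': 'json',
    }
    res = requests.post("http://localhost:8000/api/projects/%d/histories" % project_id,  # hypothetical URL
                        json=body, cookies=cookies)
    return res.json()  # {'query': ..., 'result': [[date, value], ...]}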