def create_corpus(self):
    # create a corpus node attached to the project
    corpus = Node(name=self.query,
                  user_id=self.user_id,
                  parent_id=self.project_id,
                  typename='CORPUS',
                  hyperdata={
                      "action": "Scrapping data",
                      "language_id": self.type["default_language"],
                  })

    if len(self.paths) > 0:
        # add each downloaded file as a resource of the corpus
        for path in self.paths:
            corpus.add_resource(type=self.type["type"],
                                name=self.type["name"],
                                path=path)
        session.add(corpus)
        session.commit()
        # launch the parsing/extraction/indexing workflow asynchronously
        scheduled(parse_extract_indexhyperdata)(corpus.id)
    else:
        # add the single downloaded file as a resource of the corpus
        corpus.add_resource(type=self.type["type"],
                            name=self.type["name"],
                            path=self.path)
        session.add(corpus)
        session.commit()
        # launch the parsing/extraction/indexing workflow asynchronously
        scheduled(parse_extract_indexhyperdata)(corpus.id)

    # the node only gets its database id once the session has been committed
    self.corpus_id = corpus.id
    return corpus
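# -- Illustration (not project code) -------------------------------------------
# A minimal, self-contained sketch of the scheduled(task)(args) convention used
# above: scheduled(...) takes the task function and returns a callable that
# defers its execution (the real project dispatches it to the task queue; here
# the stub simply runs synchronously). Names suffixed with _sketch or _stub are
# assumptions for illustration only.
def scheduled_sketch(task):
    def _defer(*args, **kwargs):
        print("enqueuing %s%r" % (task.__name__, args))
        return task(*args, **kwargs)
    return _defer

def parse_extract_indexhyperdata_stub(corpus_id):
    print("parsing/extraction/indexing would run for corpus", corpus_id)

scheduled_sketch(parse_extract_indexhyperdata_stub)(123)
# -------------------------------------------------------------------------------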
def post(self, request):
    """
    Merge the lists of a corpus with other lists from a CSV source
    or from another corpus

    params in request.GET:
        onto_corpus: the corpus whose lists are getting patched

    params in request.data:
        csvfile: the csv file

    /!\ We assume we checked the file size client-side before upload
    """
    if not request.user.is_authenticated():
        res = HttpResponse("Unauthorized")
        res.status_code = 401
        return res

    # the corpus with the target lists to be patched
    params = get_parameters(request)
    corpus_id = int(params.pop("onto_corpus"))
    corpus_node = cache.Node[corpus_id]

    if request.user.id != corpus_node.user_id:
        res = HttpResponse("Unauthorized")
        res.status_code = 401
        return res

    # request also contains the file
    # csv_file has type django.core.files.uploadedfile.InMemoryUploadedFile
    # ----------------------
    csv_file = request.data['csvfile']

    csv_contents = csv_file.read().decode("UTF-8").split("\n")
    csv_file.close()
    del csv_file

    # import the csv
    # try:
    log_msg = "Async generation"
    corpus_node_id = corpus_node.id
    scheduled(import_and_merge_ngramlists)(csv_contents, corpus_node_id,
                                           overwrite=bool(params.get('overwrite')))

    return JsonHttpResponse({
        'log': log_msg,
    }, 200)
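# -- Illustration (not project code) -------------------------------------------
# Hedged client-side sketch for the CSV import view above, using the `requests`
# library. The URL path, corpus id, file name and session cookie are assumptions;
# adjust them to wherever this APIView is actually routed.
import requests

resp = requests.post(
    "http://localhost:8000/api/ngramlists/import",        # hypothetical route
    params={"onto_corpus": 14072, "overwrite": 1},
    files={"csvfile": open("terms.csv", "rb")},
    cookies={"sessionid": "..."},                          # authenticated session
)
print(resp.json())   # expected: {"log": "Async generation"}
# -------------------------------------------------------------------------------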
def patch(self, request, corpusnode_id):
    """
    PATCH triggers a recount of the metrics for the specified corpus.

    ex: PATCH http://localhost:8000/api/metrics/14072
                                                -----
                                              corpus_id
    """
    print("==> update metrics request on ", corpusnode_id)

    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    try:
        corpus = cache.Node[int(corpusnode_id)]
    except:
        corpus = None

    if corpus is None:
        raise ValidationException("%s is not a valid corpus node id." % corpusnode_id)
    else:
        t_before = datetime.now()
        # =============
        scheduled(recount)(corpus.id)
        # =============
        t_after = datetime.now()

        return JsonHttpResponse({
            'corpus_id': corpusnode_id,
            'took': "%f s." % (t_after - t_before).total_seconds()
        })
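# -- Illustration (not project code) -------------------------------------------
# Usage sketch for the metrics recount endpoint above; the URL comes from the
# docstring, the session cookie is an assumption. Note that 'took' only measures
# the time needed to *schedule* the recount, since the recount itself runs
# asynchronously.
import requests

resp = requests.patch("http://localhost:8000/api/metrics/14072",
                      cookies={"sessionid": "..."})
print(resp.json())   # e.g. {"corpus_id": "14072", "took": "0.000813 s."}
# -------------------------------------------------------------------------------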
def save(request, project_id):
    '''save'''
    # defaults, so the non-POST fallback at the end does not raise a NameError
    query = query_string = ""
    N = 0

    if request.method == "POST":
        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except:
            N = 0
        print(query, N)

        #for next time
        #ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_SCOAP)

        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()

        # do we have a valid project?
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()

        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()

        # corpus node instantiation as a Django model
        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={
                          "action": "Scrapping data",
                          "language_id": "en"
                      })

        #download_file
        crawler_bot = load_crawler(source)()
        #for now no way to force downloading X records
        #the long running command
        filename = crawler_bot.download(query)

        corpus.add_resource(type=source["type"]
                            #, name = source["name"]
                            , path=crawler_bot.path)
        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
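# -- Illustration (not project code) -------------------------------------------
# Minimal sketch of the load_crawler(source)() pattern used above, under the
# assumption that load_crawler maps a resource descriptor to a crawler *class*,
# which is then instantiated, asked to download the query results, and exposes
# the downloaded file as `.path`. Every name below is a hypothetical stand-in.
class ScoapCrawlerStub:
    def download(self, query):
        self.path = "/tmp/%s.json" % query.replace(" ", "_")
        return self.path

def load_crawler_sketch(source):
    return {"scoap": ScoapCrawlerStub}[source["crawler"]]   # hypothetical key

crawler_bot = load_crawler_sketch({"crawler": "scoap"})()
print(crawler_bot.download("higgs boson"), crawler_bot.path)
# -------------------------------------------------------------------------------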
def project(request, project_id):
    # security check
    project = session.query(Node).filter(Node.id == project_id).first()
    user = cache.User[request.user.id]

    if project is None:
        raise Http404()
    if not user.owns(project):
        return HttpResponseForbidden()
    # end of security check

    # new corpus
    if request.method == 'POST':
        corpus = project.add_child(
            name=request.POST['name'],
            typename='CORPUS',
        )
        corpus.add_resource(
            type=int(request.POST['type']),
            path=upload(request.FILES['file']),
        )
        session.add(corpus)
        session.commit()

        # parse_extract: fileparsing -> ngram extraction -> lists
        scheduled(parse_extract_indexhyperdata)(corpus.id)
        wait = True
    else:
        wait = False

    # corpora within this project
    corpora = project.children('CORPUS', order=True).all()
    sourcename2corpora = defaultdict(list)

    for corpus in corpora:
        # we only consider the first resource of the corpus to determine its type
        resources = corpus.resources()
        if len(resources):
            resource = resources[0]
            #resource_type_name = RESOURCETYPES[resource['type']]['name']
            resource_type_name = get_resource(resource["type"])["name"]
        else:
            print("(WARNING) PROJECT view: no listed resource")
            # fallback label so the corpus still shows up (avoids a NameError below)
            resource_type_name = "unknown"

        # add some data for the viewer
        corpus.count = corpus.children('DOCUMENT').count()
        status = corpus.status()

        if status is not None and not status['complete']:
            if not status['error']:
                corpus.status_message = '(in progress: %s, %d complete)' % (
                    status['action'].replace('_', ' '),
                    status['progress'],
                )
            else:
                corpus.status_message = '(aborted: "%s" after %i docs)' % (
                    status['error'][-1],
                    status['progress'],
                )
        else:
            corpus.status_message = ''

        # add
        sourcename2corpora[resource_type_name].append(corpus)

    # sources & their respective document counts
    total_documentscount = 0
    sourcename2documentscount = defaultdict(int)

    # NB: the inner loop variable is named source_corpora so that `corpora`
    #     (all corpora of the project) is not shadowed before the render below
    for sourcename, source_corpora in sourcename2corpora.items():
        sourcename = re.sub(r' \(.*$', '', sourcename)
        for corpus in source_corpora:
            count = corpus.children('DOCUMENT').count()
            sourcename2documentscount[sourcename] += count
            total_documentscount += count

    donut = [{
        'source': sourcename,
        'count': count,
        'part': round(count * 100.0 / total_documentscount, 1)
                if total_documentscount else 0,
    } for sourcename, count in sourcename2documentscount.items()]

    # the wait page and the project page share the same context
    template_name = ('pages/projects/wait.html' if wait
                     else 'pages/projects/project.html')

    # response!
    return render(
        template_name=template_name,
        request=request,
        context={
            'form': NewCorpusForm,
            'user': request.user,
            'date': datetime.now(),
            'project': project,
            'donut': donut,
            'list_corpora': dict(sourcename2corpora),
            'whitelists': [],
            'blacklists': [],
            'cooclists': [],
            'number': len(corpora),
            'query_size': QUERY_SIZE_N_DEFAULT,
            # status refreshing params (when active workflows)
            'status_refresh_initial_interval': PROJECT_VIEW_REFRESH_INTERVAL,
            'status_refresh_max_attempts': PROJECT_VIEW_MAX_REFRESH_ATTEMPTS,
        },
    )
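# -- Illustration (not project code) -------------------------------------------
# Worked example of the "donut" aggregation above, standalone and with made-up
# counts: each source gets its document count and its rounded percentage share.
from collections import defaultdict

sourcename2documentscount = defaultdict(int, {"Pubmed": 120, "Istex": 80})
total_documentscount = sum(sourcename2documentscount.values())

donut = [{'source': sourcename,
          'count': count,
          'part': round(count * 100.0 / total_documentscount, 1)
                  if total_documentscount else 0}
         for sourcename, count in sourcename2documentscount.items()]
print(donut)   # [{'source': 'Pubmed', 'count': 120, 'part': 60.0},
               #  {'source': 'Istex', 'count': 80, 'part': 40.0}]
# -------------------------------------------------------------------------------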
def save(request, project_id, return_corpus=False):
    print("testISTEX:")
    print(request.method)
    alist = ["bar", "foo"]

    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (session.query(Node)
                      .filter(Node.id == project_id)
                      .filter(Node.typename == 'PROJECT')).first()
    if project is None:
        raise Http404()

    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    # defaults (also reused by the non-POST fallback at the end of the view)
    query = query_string = ""
    N = 0

    if request.method == "POST":
        query = "-"
        query_string = "-"
        N = QUERY_SIZE_N_MAX  # default sample size if the form did not send N

        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ", "+")  # url encoded q

        if "N" in request.POST:
            if request.POST["N"] == "NaN":
                N = QUERY_SIZE_N_MAX
            else:
                N = int(request.POST["N"])  # query_size from views_opti

            if N > QUERY_SIZE_N_MAX:
                N = QUERY_SIZE_N_MAX
                #msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                #print("ERROR (scrap: istex d/l ): ",msg)
                #raise ValueError(msg)

        print("Scrapping Istex: '%s' (%i)" % (query_string, N))

        urlreqs = []
        pagesize = 50
        tasks = Scraper()
        chunks = list(tasks.chunks(range(N), pagesize))

        for k in chunks:
            if (k[0] + pagesize) > N:
                pagesize = N - k[0]
            urlreqs.append(
                "http://api.istex.fr/document/?q=" + query_string +
                "&output=id,corpusName,title,genre,language,doi,host,publicationDate,abstract,author&" +
                "from=" + str(k[0]) + "&size=" + str(pagesize))

        # corpus node instantiation as a Django model
        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={
                          "action": "Scrapping data",
                          "language_id": None
                      })

        tasks = Scraper()

        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  # thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        for url in urlreqs:
            tasks.q.put(url)  # put a task in the queue
        tasks.q.join()        # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename != False:
                # add the downloaded resource to the corpus
                corpus.add_resource(type=get_resource(RESOURCE_TYPE_ISTEX)["type"],
                                    path=filename)
                dwnldsOK += 1

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])

        ###########################
        ###########################
        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------

        if return_corpus:
            return corpus

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
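# -- Illustration (not project code) -------------------------------------------
# Standalone sketch of the Istex pagination above, with made-up numbers: the API
# is queried in pages of 50 documents and the last page is shrunk so that
# exactly N documents are requested overall. chunks_sketch is a plausible
# stand-in for Scraper.chunks, not its actual implementation.
def chunks_sketch(seq, size):
    for i in range(0, len(seq), size):
        yield seq[i:i + size]

N, pagesize = 120, 50
for k in chunks_sketch(range(N), pagesize):
    if (k[0] + pagesize) > N:
        pagesize = N - k[0]
    print("from=%d&size=%d" % (k[0], pagesize))
# from=0&size=50 / from=50&size=50 / from=100&size=20
# -------------------------------------------------------------------------------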
def save(request, project_id):
    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = session.query(Node).filter(Node.id == project_id).first()
    if project is None:
        raise Http404()

    user = cache.User[request.user.id]
    if not user.owns(project):
        return HttpResponseForbidden()

    # default response payload for non-POST requests (avoids a NameError below)
    alist = ["tudo fixe", "tudo bem"]

    if request.method == "POST":
        queries = request.POST["query"]
        name = request.POST["string"]

        # here we just realize queries already prepared by getGlobalStats
        # ===> no need to repeat N parameter like in testISTEX <===

        instancia = Scraper()
        thequeries = json.loads(queries)

        # fyi the sum of our prepared yearly proportional quotas
        sampled_sum = sum([year_q['retmax'] for year_q in thequeries])
        print("Scrapping Pubmed: '%s' (N=%i)" % (name, sampled_sum))

        urlreqs = []
        for yearquery in thequeries:
            urlreqs.append(instancia.medlineEfetchRAW(yearquery))

        # corpus node instantiation as a Django model
        corpus = project.add_child(name=name, typename="CORPUS")

        # """
        # urlreqs: List of urls to query.
        # - Then, to each url in urlreqs you do:
        #     eFetchResult = urlopen(url)
        #     eFetchResult.read()  # this will output the XML... normally you write this to a XML-file.
        # """

        tasks = Scraper()

        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  # thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()

        for url in urlreqs:
            tasks.q.put(url)  # put a task in the queue
        tasks.q.join()        # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            print(filename)
            if filename != False:
                # add the downloaded resource to the corpus
                corpus.add_resource(type=get_resource_by_name('Pubmed [XML]')["type"],
                                    path=filename,
                                    url=None)
                print("Adding the resource")
                dwnldsOK += 1

        session.add(corpus)
        session.commit()
        corpus_id = corpus.id

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])

        try:
            scheduled(parse_extract_indexhyperdata)(corpus_id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------

        sleep(1)
        return HttpResponseRedirect('/projects/' + str(project_id))

    data = alist
    return JsonHttpResponse(data)
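# -- Illustration (not project code) -------------------------------------------
# Self-contained sketch of the producer/consumer pattern used above (an
# assumption about what Scraper.worker2 / Scraper.q do, not their actual code):
# 8 daemon threads consume URLs from a queue, and q.join() blocks until every
# queued item has been processed.
import queue
import threading

q = queue.Queue()
results = []

def worker():
    while True:
        url = q.get()
        results.append("downloaded:" + url)   # the real worker fetches the URL
        q.task_done()

for _ in range(8):
    t = threading.Thread(target=worker, daemon=True)
    t.start()

for url in ["http://example.org/a", "http://example.org/b"]:
    q.put(url)      # put a task in the queue
q.join()            # wait until everything is finished
print(results)
# -------------------------------------------------------------------------------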
def get_graph(request=None, corpus=None,
              field1='ngrams', field2='ngrams',
              mapList_id=None, groupList_id=None,
              cooc_id=None, type='node_link',
              start=None, end=None,
              distance='conditional', bridgeness=5,
              threshold=1, isMonopartite=True,
              saveOnly=True):
    '''
    Get_graph: main steps:
    0) Check the parameters

       get_graph :: GraphParameters -> Either (Dic Nodes Links) (Dic State Length)
           where type Length = Int

       get_graph first checks the parameters and returns either graph data or a
       dict with state "type" and an integer indicating the size of the parameter
       (maybe we could add a String in that step to factor and give here the
       error message)

    1) compute_graph (see function above)
    2) return graph
    '''
    overwrite_node_contents = False

    # Case where the graph has already been computed
    if cooc_id is not None:
        print("GRAPH#%d ... Loading data already computed." % int(cooc_id))
        node = session.query(Node).filter(Node.id == cooc_id).first()

        # Structure of the Node.hyperdata[distance][bridgeness]
        # All parameters (but distance and bridgeness)
        # are in Node.hyperdata["parameters"]

        # Check distance of the graph
        if node.hyperdata.get(distance, None) is not None:
            graph = node.hyperdata[distance]

            # Check bridgeness of the graph
            if graph.get(str(bridgeness), None) is not None:
                return graph[str(bridgeness)]

        # keep the node's metadata at hand in case we fall through to the
        # saveOnly branch below, which expects these names to be defined
        cooc_name = node.name
        cooc_date = node.date

    # new graph: we give it an empty node with new id and status
    elif saveOnly:
        # NB: we do creation already here (instead of same in countCooccurrences)
        #     to guarantee a unique ref id to the saveOnly graph (async generation)
        new_node = corpus.add_child(typename="COOCCURRENCES",
                                    name="GRAPH (in corpus %s)" % corpus.id)
        session.add(new_node)
        session.commit()

        cooc_id = new_node.id
        cooc_name = new_node.name
        cooc_date = new_node.date

        # and the empty content will need redoing by countCooccurrences
        overwrite_node_contents = True
        print("GRAPH #%d ... Created new empty data node for saveOnly" % int(cooc_id))

    # Case where the graph has not been computed yet
    # First, check the parameters

    # Case of mapList not big enough
    # ==============================

    # if we do not have any mapList_id already
    if mapList_id is None:
        mapList_id = session.query(Node.id).filter(Node.typename == "MAPLIST").first()[0]

    mapList_size = session.query(NodeNgram).filter(NodeNgram.node_id == mapList_id).count()

    if mapList_size < graph_constraints['mapList']:
        # Do not compute the graph if mapList is not big enough
        return {'state': "mapListError", "length": mapList_size}

    # Instantiate query for case of corpus not big enough
    # ===================================================
    corpus_size_query = (session.query(Node)
                                .filter(Node.typename == "DOCUMENT")
                                .filter(Node.parent_id == corpus.id))

    # Filter corpus by date if any start date
    # ---------------------------------------
    if start is not None:
        #date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
        date_start = datetime.strptime(str(start), "%Y-%m-%d")
        date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")

        Start = aliased(NodeHyperdata)
        corpus_size_query = (corpus_size_query.join(Start, Start.node_id == Node.id)
                                              .filter(Start.key == 'publication_date')
                                              .filter(Start.value_utc >= date_start_utc))

    # Filter corpus by date if any end date
    # -------------------------------------
    if end is not None:
        date_end = datetime.strptime(str(end), "%Y-%m-%d")
        date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")

        End = aliased(NodeHyperdata)
        corpus_size_query = (corpus_size_query.join(End, End.node_id == Node.id)
                                              .filter(End.key == 'publication_date')
                                              .filter(End.value_utc <= date_end_utc))

    # Finally test if the size of the corpus is big enough
    # -----------------------------------------------------
    corpus_size = corpus_size_query.count()

    if saveOnly is not None and saveOnly == "True":
        scheduled(compute_graph)(corpus_id=corpus.id, cooc_id=cooc_id
                                 #, field1="ngrams", field2="ngrams"
                                 , start=start, end=end,
                                 mapList_id=mapList_id, groupList_id=groupList_id,
                                 isMonopartite=True, threshold=threshold,
                                 distance=distance, bridgeness=bridgeness,
                                 save_on_db=True, reset=overwrite_node_contents
                                 #, limit=size
                                 )
        return {
            "state": "saveOnly",
            "target_id": cooc_id,
            "target_name": cooc_name,
            "target_date": cooc_date
        }

    elif corpus_size > graph_constraints['corpusMax']:
        # Then compute cooc asynchronously with celery
        scheduled(compute_graph)(corpus_id=corpus.id, cooc_id=cooc_id
                                 #, field1="ngrams", field2="ngrams"
                                 , start=start, end=end,
                                 mapList_id=mapList_id, groupList_id=groupList_id,
                                 isMonopartite=True, threshold=threshold,
                                 distance=distance, bridgeness=bridgeness,
                                 save_on_db=True, reset=overwrite_node_contents
                                 #, limit=size
                                 )
        # Dict to inform user that corpus maximum is reached
        # then graph is computed asynchronously
        return {"state": "corpusMax", "length": corpus_size}

    elif corpus_size <= graph_constraints['corpusMin']:
        # Do not compute the graph if corpus is not big enough
        return {"state": "corpusMin", "length": corpus_size}

    else:
        # If graph_constraints are ok then compute the graph live
        data = compute_graph(corpus_id=corpus.id, cooc_id=cooc_id
                             #, field1="ngrams", field2="ngrams"
                             , start=start, end=end,
                             mapList_id=mapList_id, groupList_id=groupList_id,
                             isMonopartite=True, threshold=threshold,
                             distance=distance, bridgeness=bridgeness,
                             save_on_db=True, reset=overwrite_node_contents
                             #, limit=size
                             )

    # case when 0 coocs are observed (usually b/c not enough ngrams in maplist)
    if len(data) == 0:
        print("GRAPH # ... GET_GRAPH: 0 coocs in matrix")
        data = {'nodes': [], 'links': []}  # empty data

    return data
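# -- Illustration (not project code) -------------------------------------------
# Hedged sketch of how a caller might interpret get_graph's return value: it is
# either the graph itself ({'nodes': ..., 'links': ...}) or a status dict whose
# 'state' key is one of the values returned above. The handler below is an
# assumption for illustration, not the project's actual API layer.
def handle_get_graph_result(result):
    state = result.get("state")
    if state is None:
        return "graph ready: %d nodes, %d links" % (len(result["nodes"]),
                                                    len(result["links"]))
    if state == "saveOnly":
        return "graph #%s is being generated asynchronously" % result["target_id"]
    if state == "corpusMax":
        return "corpus too big (%d docs): computing asynchronously" % result["length"]
    if state == "corpusMin":
        return "corpus too small (%d docs)" % result["length"]
    if state == "mapListError":
        return "map list too small (%d terms)" % result["length"]
    return "unknown state: %r" % state

print(handle_get_graph_result({"state": "corpusMin", "length": 12}))
print(handle_get_graph_result({"nodes": [], "links": []}))
# -------------------------------------------------------------------------------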