from urllib.parse import urlencode

from django.shortcuts import redirect
from sqlalchemy.orm.exc import DetachedInstanceError


def requires_auth(func):
    """Check that the user is authenticated before calling `func`.

    NB: the outer decorator is reconstructed here; the excerpt only showed
    the inner `_requires_auth`, which references `func`.
    """
    def _requires_auth(request, *args, **kwargs):
        if not request.user.is_authenticated():
            url = '/auth/login/?next=%s' % urlencode(request.path)
            return redirect(url)
        try:
            # normal case: return the wrapped view when the user is OK
            return func(request, *args, **kwargs)
        # user was authenticated but something made the session expire
        except DetachedInstanceError as die:
            print("===\n Detached instance error: trying to roll back session")
            print(die)
            from gargantext.util.db import session
            session.rollback()
            print("=== session rollback ok!")
            # re-init the global cache (it may still hold detached instances)
            from gargantext.util.db_cache import cache
            cache.clean_all()
            print("=== cache reinit ok!")
            # and re-login for safety
            url = '/auth/login/?next=%s' % urlencode(request.path)
            return redirect(url)
    return _requires_auth
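
# Usage sketch (illustrative, not from the original source): the
# reconstructed decorator above wraps any Django view; `my_view` and its
# response payload are hypothetical. JsonHttpResponse is the project helper
# used by the views below.
@requires_auth
def my_view(request):
    return JsonHttpResponse(["only reachable when request.user is logged in"])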
def save(request, project_id):
    '''save a SCOAP query as a new CORPUS node under the given project'''
    # defaults so the JSON fallback at the end is well-defined on non-POST
    # requests (these names were referenced but never bound in the original)
    query = query_string = ""
    N = 0
    if request.method == "POST":
        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except (TypeError, ValueError):
            N = 0
        print(query, N)

        #for next time
        #ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_SCOAP)
        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        # do we have a valid project id?
        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()

        # do we have a valid project?
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()

        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()

        # corpus node instantiation as a Django model
        corpus = Node(
            name=query,
            user_id=request.user.id,
            parent_id=project_id,
            typename='CORPUS',
            hyperdata={
                "action": "Scraping data",
                "language_id": "en"
            }
        )

        #download_file
        crawler_bot = load_crawler(source)()
        #for now no way to force downloading X records
        #the long-running command
        filename = crawler_bot.download(query)
        corpus.add_resource(
            type=source["type"]
            #, name=source["name"]
            , path=crawler_bot.path
        )

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except Exception:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
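
# Sketch (assumption, not the project's actual code): `scheduled(...)` is
# used above as a dispatcher that takes a long task such as
# parse_extract_indexhyperdata and runs it outside the request cycle. A
# minimal illustration of such a wrapper; `scheduled_sketch` is a
# hypothetical name:
import threading

def scheduled_sketch(task):
    """Return a callable that runs `task` in a background daemon thread."""
    def run(*args, **kwargs):
        threading.Thread(target=task, args=args, kwargs=kwargs,
                         daemon=True).start()
    return run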
def save(request, project_id, return_corpus=False):
    print("testISTEX:")
    print(request.method)
    alist = ["bar", "foo"]
    # implicit global session

    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (session.query(Node)
                      .filter(Node.id == project_id)
                      .filter(Node.typename == 'PROJECT')).first()
    if project is None:
        raise Http404()

    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    # defaults so the JSON fallback at the end is well-defined on non-POST
    # requests (`query` and `N` were unbound in the original on that path)
    query = ""
    query_string = ""
    N = 0
    if request.method == "POST":
        query = "-"
        query_string = "-"
        #N = QUERY_SIZE_N_MAX

        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ", "+")  # url-encoded q

        if "N" in request.POST:
            if request.POST["N"] == "NaN":
                N = QUERY_SIZE_N_MAX
            else:
                N = int(request.POST["N"])  # query_size from views_opti

            if N > QUERY_SIZE_N_MAX:
                N = QUERY_SIZE_N_MAX
                #msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                #print("ERROR (scrap: istex d/l ): ",msg)
                #raise ValueError(msg)

        print("Scraping Istex: '%s' (%i)" % (query_string, N))

        urlreqs = []
        pagesize = 50
        tasks = Scraper()
        chunks = list(tasks.chunks(range(N), pagesize))
        for k in chunks:
            if (k[0] + pagesize) > N:
                pagesize = N - k[0]
            urlreqs.append(
                "http://api.istex.fr/document/?q=" + query_string
                + "&output=id,corpusName,title,genre,language,doi,host,publicationDate,abstract,author&"
                + "from=" + str(k[0]) + "&size=" + str(pagesize))

        # corpus node instantiation as a Django model
        corpus = Node(
            name=query,
            user_id=request.user.id,
            parent_id=project_id,
            typename='CORPUS',
            hyperdata={
                "action": "Scraping data",
                "language_id": None
            }
        )

        tasks = Scraper()
        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  # thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()
        for url in urlreqs:
            tasks.q.put(url)  # put a task in the queue
        tasks.q.join()  # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename:
                # add the uploaded resource to the corpus
                corpus.add_resource(
                    type=get_resource(RESOURCE_TYPE_ISTEX)["type"],
                    path=filename)
                dwnldsOK += 1

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id
        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])

        ###########################
        ###########################
        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except Exception:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        if return_corpus:
            return corpus

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
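
# Pagination sketch: the chunking loop above slices range(N) into windows
# of `pagesize`, shrinking the last window so from+size never exceeds N.
# A standalone version of that arithmetic (`istex_windows` is an
# illustrative helper name, not from the source):
def istex_windows(N, pagesize=50):
    """Yield (from_, size) pairs covering 0..N-1 in `pagesize` steps."""
    for start in range(0, N, pagesize):
        yield start, min(pagesize, N - start)

# e.g. list(istex_windows(120)) == [(0, 50), (50, 50), (100, 20)]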
def save(request, project_id):
    # implicit global session

    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = session.query(Node).filter(Node.id == project_id).first()
    if project is None:
        raise Http404()

    user = cache.User[request.user.id]
    if not user.owns(project):
        return HttpResponseForbidden()

    # fallback payload for non-POST requests, roughly "all cool, all good"
    # in Portuguese (was defined only inside the POST branch in the
    # original, so the GET path raised NameError)
    alist = ["tudo fixe", "tudo bem"]

    if request.method == "POST":
        queries = request.POST["query"]
        name = request.POST["string"]

        # here we just realize queries already prepared by getGlobalStats
        # ===> no need to repeat N parameter like in testISTEX <===

        instancia = Scraper()
        thequeries = json.loads(queries)

        # fyi the sum of our prepared yearly proportional quotas
        sampled_sum = sum([year_q['retmax'] for year_q in thequeries])
        print("Scraping Pubmed: '%s' (N=%i)" % (name, sampled_sum))

        urlreqs = []
        for yearquery in thequeries:
            urlreqs.append(instancia.medlineEfetchRAW(yearquery))

        # corpus node instantiation as a Django model
        corpus = project.add_child(name=name, typename="CORPUS")

        # """
        # urlreqs: list of urls to query.
        # Then, for each url in urlreqs you do:
        #     eFetchResult = urlopen(url)
        #     eFetchResult.read()
        # this will output the XML... normally you write this to an XML file.
        # """

        tasks = Scraper()
        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  # thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()
        for url in urlreqs:
            tasks.q.put(url)  # put a task in the queue
        tasks.q.join()  # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            print(filename)
            if filename:
                # add the uploaded resource to the corpus
                corpus.add_resource(
                    type=get_resource_by_name('Pubmed [XML]')["type"],
                    path=filename,
                    url=None)
                print("Adding the resource")
                dwnldsOK += 1

        session.add(corpus)
        session.commit()
        corpus_id = corpus.id
        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])

        try:
            scheduled(parse_extract_indexhyperdata)(corpus_id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except Exception:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        sleep(1)
        return HttpResponseRedirect('/projects/' + str(project_id))

    data = alist
    return JsonHttpResponse(data)
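
# All three views above drive downloads through the same pattern: a Scraper
# holding a task queue `q`, a `worker2` method consumed by 8 daemon
# threads, and a `firstResults` list of downloaded filenames (False on
# failure). A minimal sketch of that shape, assuming each task is a URL
# fetched to a temp file; the project's real Scraper class may differ:
import queue
import tempfile
import threading
from urllib.request import urlopen

class MiniScraper:
    def __init__(self):
        self.q = queue.Queue()
        self.firstResults = []

    def worker2(self):
        while True:
            url = self.q.get()
            try:
                with urlopen(url) as resp:
                    with tempfile.NamedTemporaryFile(delete=False) as out:
                        out.write(resp.read())
                self.firstResults.append(out.name)
            except Exception:
                self.firstResults.append(False)  # mirrors the falsy filter above
            finally:
                self.q.task_done()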