def _run_recipe(self, source_type, expected_results):
    """
    Common recipe base that each resource-input test can follow.

    @param source_type: int (cf. constants.py RESOURCETYPES)
    @param expected_results: int[] (number of docs for each sample corpus of this source)
    """
    source = get_resource(source_type)
    source_name = source["name"].split("[")[0].lower().strip().replace(" ", "_")
    self.test_name = ">> " + sys._getframe().f_code.co_name + "_" + str(source_name) + ":"
    self.log.debug("\n" + self.test_name)

    for i, sample_file in enumerate(self.sample_files[source_name]):
        print("... sample_file:", sample_file)
        expected_ndocs = expected_results[i]
        name = "test_" + source_name + str(i)

        self.log.debug("\t- Checking creation of corpus %s" % name)
        self.corpus = self._create_corpus(name, source_type, sample_file)
        db_corpus = self._get_corpus(name)

        # corpus check
        self.assertEqual(self.corpus.name, db_corpus.name)

        self.log.debug("\t- Checking creation of resource type '%s'"
                       % get_resource(source_type)["name"])
        self.assertEqual(self.corpus.resources()[0]["type"],
                         db_corpus.resources()[0]["type"])

        self.log.debug("\t- Parsing and indexing corpus")
        parse(self.corpus)
        real_ndocs = self.__count_node_children__(self.corpus, "DOCUMENT")
        # print('==>\t'+str(source_type)+'\t'+str(i)+'\t'+sample_file+'\t'+str(real_ndocs))
        self.assertEqual(real_ndocs, expected_ndocs)

        self.log.debug("\t- Extracting ngrams")
        extract_ngrams(self.corpus)
        # ngrams = self.__count_node_children__(self.corpus, "NGRAMS")
        status = self.corpus.status()
        self.assertTrue(status["complete"])
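# A minimal sketch of how a concrete test can build on _run_recipe. The
# RESOURCETYPES id and the expected document counts below are hypothetical
# placeholders, not values taken from the real constants.py.
def test_pubmed(self):
    # one expected count per sample file registered for this source
    self._run_recipe(source_type=3, expected_results=[10, 100])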
def newCorpus(project, source, name=None, query=None):
    error = False

    if name is None:
        name = query

    # validate arguments one by one so the error message names the first
    # missing piece instead of being masked by a later check
    if not isinstance(project, ProjectNode):
        error = "a valid project"
    elif not isinstance(source, (int, str)):
        error = "a valid source identifier: id or name"
    elif not isinstance(query, str):
        error = "a valid query"
    elif not isinstance(name, str):
        error = "a valid name"

    if error:
        raise NotebookError("Please provide %s." % error)

    resource = get_resource(source) if isinstance(source, int) else \
               get_resource_by_name(source)
    moissonneur_name = get_moissonneur_name(resource) if resource else \
                       source.lower()

    try:
        moissonneur = get_moissonneur(moissonneur_name)
    except ImportError:
        raise NotebookError("Invalid source identifier: %r" % source)

    return run_moissonneur(moissonneur, project, name, query)
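# A hedged usage sketch from a notebook session. The project variable and
# the "pubmed" source name are assumptions for illustration; valid names
# depend on what get_resource_by_name() actually knows about.
corpus = newCorpus(project, "pubmed", query="graphene oxide")
# equivalent call with a numeric RESOURCETYPES id instead of a name:
# corpus = newCorpus(project, 3, name="my graphene corpus", query="graphene oxide")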
def query(request):
    '''get GlobalResults()'''
    if request.method == "POST":
        query = request.POST["query"]
        source = get_resource(RESOURCE_TYPE_SCOAP)
        if source["crawler"] is not None:
            crawlerbot = load_crawler(source)()
            # old raw way to get results_nb
            results = crawlerbot.scan_results(query)
            # ids = crawlerbot.get_ids(query)
            return JsonHttpResponse({"results_nb": crawlerbot.results_nb})
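# Sketch of the crawler contract this view relies on, assuming load_crawler()
# returns a class whose scan_results() sets results_nb as a side effect.
# SampleCrawler and its internals are illustrative, not the real SCOAP bot.
class SampleCrawler:
    def __init__(self):
        self.results_nb = 0

    def scan_results(self, query):
        # a real bot would hit the remote API here and count the hits
        self.results_nb = 42
        return self.results_nb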
def ngramtable(request, project_id, corpus_id):
    '''
    Browse and modify all lists together.

    => maplist and mainlist terms in a table with groupings as '+' nodes
    => uses API GET batch of lists
    => uses API PUT/DEL for list modifications
    => uses frontend AJAX through Ngrams_dyna_charts_and_table.js

    # TODO refactor Ngrams_dyna_charts_and_table.js
    '''
    # corpus still necessary to find all lists
    corpus = cache.Node[corpus_id]

    # and the project just for project.id in corpusBannerTop
    project = cache.Node[project_id]

    # retrieve all corpora of this user for the list import option
    # POSSIBILITY: could do the same task in ajax "only if needed"
    # (use the api for that when merged)
    corpora_infos_q = (session.query(Node.id, Node.name)
                              .filter(Node.typename == "CORPUS")
                              .filter(Node.user_id == project.user_id))
    # .filter(Node.id != corpus_id)
    corpora_infos = corpora_infos_q.all()

    source_type = corpus.resources()[0]['type']

    # rendered page: terms.html
    return render(
        template_name='pages/corpora/terms.html',
        request=request,
        context={
            'debug': settings.DEBUG,
            'user': request.user,
            'date': datetime.now(),
            'project': project,
            'corpus': corpus,
            'resourcename': get_resource(source_type)['name'],
            'view': 'terms',

            # for the CSV import modal
            'importroute': "/api/ngramlists/import?onto_corpus=%i" % corpus.id,
            'corporainfos': corpora_infos,

            # user params
            'user_parameters': get_user_params(request.user),
            'languages': USER_LANG
        },
    )
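# How this view could be wired up in a pre-2.0 Django urls.py; the exact
# regex and pattern are assumptions for illustration, the real routing of
# the project may differ.
from django.conf.urls import url

urlpatterns = [
    # the two positional groups map to project_id and corpus_id
    url(r'^projects/(\d+)/corpora/(\d+)/terms/?$', ngramtable),
]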
def __init__(self, record):
    # the name of the corpus that will be built
    # in case of internal file parsing
    self.record = record
    self.name = record["corpus_name"]
    self.project_id = record["project_id"]
    self.user_id = record["user_id"]
    self.resource = record["source"]
    self.type = get_resource(self.resource)
    self.query = record["query"]

    # format the sampling
    self.n_last_years = 5
    self.YEAR = date.today().year
    # not pretty, but the easy version
    self.MONTH = str(date.today().month)
    if len(self.MONTH) == 1:
        self.MONTH = "0" + self.MONTH

    self.MAX_RESULTS = QUERY_SIZE_N_MAX

    try:
        self.results_nb = int(record["count"])
    except KeyError:
        # does not exist yet
        self.results_nb = 0

    try:
        self.webEnv = record["webEnv"]
        self.queryKey = record["queryKey"]
        self.retMax = record["retMax"]
    except KeyError:
        # do not exist yet
        self.queryKey = None
        self.webEnv = None
        self.retMax = 1

    self.status = [None]
    self.path = "/tmp/results.txt"
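# The record dict this constructor expects, as a hedged example: every value
# below is a made-up illustration of the required keys, with the optional
# count / webEnv / queryKey / retMax keys omitted as on a first query.
sample_record = {
    "corpus_name": "pubmed_sample",
    "project_id": 1,
    "user_id": 1,
    "source": 3,          # a RESOURCETYPES id understood by get_resource()
    "query": "graphene",
}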
def save(request, project_id):
    '''save'''
    if request.method == "POST":
        query = request.POST.get("query")
        try:
            N = int(request.POST.get("N"))
        except (TypeError, ValueError):
            N = 0
        print(query, N)

        # for next time
        # ids = request.POST["ids"]
        source = get_resource(RESOURCE_TYPE_SCOAP)

        if N == 0:
            raise Http404()
        if N > QUERY_SIZE_N_MAX:
            N = QUERY_SIZE_N_MAX

        try:
            project_id = int(project_id)
        except ValueError:
            raise Http404()

        # do we have a valid project?
        project = session.query(Node).filter(Node.id == project_id).first()
        if project is None:
            raise Http404()

        user = cache.User[request.user.id]
        if not user.owns(project):
            return HttpResponseForbidden()

        # corpus node instantiation as a Django model
        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={"action": "Scrapping data",
                                 "language_id": "en"})

        # download_file
        crawler_bot = load_crawler(source)()
        # for now no way to force downloading X records
        # the long running command
        filename = crawler_bot.download(query)
        corpus.add_resource(type=source["type"],
                            # name=source["name"],
                            path=crawler_bot.path)
        session.add(corpus)
        session.commit()
        # corpus_id = corpus.id

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except Exception:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    # non-POST requests carry no query to echo back
    raise Http404()
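# Sketch of what this view assumes about the bot returned by load_crawler():
# a download() that fetches results for the query and a path attribute
# pointing at the file it wrote. Illustrative only; the real SCOAP bot and
# the file location below are assumptions.
class SampleDownloadBot:
    def __init__(self):
        self.path = None

    def download(self, query):
        # a real bot would stream the remote API response to disk here
        self.path = "/tmp/scoap_results.xml"  # hypothetical location
        return self.path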
def save(request, project_id, return_corpus=False):
    print("testISTEX:")
    print(request.method)

    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (session.query(Node)
                      .filter(Node.id == project_id)
                      .filter(Node.typename == 'PROJECT')).first()
    if project is None:
        raise Http404()

    # do we have a valid user?
    user = request.user
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    query_string = ""
    if request.method == "POST":
        query = "-"
        query_string = "-"
        N = QUERY_SIZE_N_MAX  # default when no sample size is posted

        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ", "+")  # url encoded q

        if "N" in request.POST:
            if request.POST["N"] == "NaN":
                N = QUERY_SIZE_N_MAX
            else:
                N = int(request.POST["N"])  # query_size from views_opti
            if N > QUERY_SIZE_N_MAX:
                N = QUERY_SIZE_N_MAX
                # msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                # print("ERROR (scrap: istex d/l ): ", msg)
                # raise ValueError(msg)

        print("Scraping Istex: '%s' (%i)" % (query_string, N))

        urlreqs = []
        pagesize = 50
        tasks = Scraper()
        chunks = list(tasks.chunks(range(N), pagesize))
        for k in chunks:
            # shrink the last page so the request never overshoots N
            if (k[0] + pagesize) > N:
                pagesize = N - k[0]
            urlreqs.append("http://api.istex.fr/document/?q=" + query_string
                           + "&output=id,corpusName,title,genre,language,doi,host,publicationDate,abstract,author&"
                           + "from=" + str(k[0]) + "&size=" + str(pagesize))

        # corpus node instantiation as a Django model
        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={"action": "Scrapping data",
                                 "language_id": None})

        tasks = Scraper()
        for i in range(8):
            t = threading.Thread(target=tasks.worker2)  # thing to do
            t.daemon = True  # thread dies when the main (only non-daemon) thread exits
            t.start()
        for url in urlreqs:
            tasks.q.put(url)  # put a task in the queue
        tasks.q.join()  # wait until everything is finished

        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename is not False:
                # add the downloaded resource to the corpus
                corpus.add_resource(type=get_resource(RESOURCE_TYPE_ISTEX)["type"],
                                    path=filename)
                dwnldsOK += 1

        session.add(corpus)
        session.commit()
        # corpus_id = corpus.id

        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])

        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except Exception:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------

        if return_corpus:
            return corpus

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    # non-POST requests carry no query to echo back
    raise Http404()
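# A quick standalone check of the pagination above, assuming Scraper.chunks()
# slices range(N) into consecutive runs of pagesize. istex_pages() is a
# hypothetical helper that reproduces the from/size pairs the loop generates.
def istex_pages(N, pagesize=50):
    """Return the (from, size) pairs requested for a sample of N documents."""
    pages = []
    for start in range(0, N, pagesize):
        pages.append((start, min(pagesize, N - start)))
    return pages

# istex_pages(120) -> [(0, 50), (50, 50), (100, 20)]
# the last page shrinks to N - start instead of overshooting N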