def put(self, request):
    """Merge request.data into the hyperdata of this user's USER node.

    Raises TypeError for anonymous requests; returns 401 when no USER
    node exists, otherwise 202 with the persisted hyperdata echoed back.
    """
    if request.user.id is None:
        raise TypeError(
            "This API request must come from an authenticated user.")
    # we query among the nodes that belong to this user
    user = cache.User[request.user.id]
    node_user = session.query(Node).filter(
        Node.user_id == user.id, Node.typename == "USER").first()
    if node_user is None:
        return Response({"detail": "Not Allowed"},
                        status=HTTP_401_UNAUTHORIZED)
    # merge every submitted key/value pair into the node's hyperdata
    for key, value in request.data.items():
        node_user.hyperdata[key] = value
    node_user.save_hyperdata()
    session.add(node_user)
    session.commit()
    # re-read the node so the response reflects what was persisted
    node_user = session.query(Node).filter(
        Node.user_id == user.id, Node.typename == "USER").first()
    print(node_user.hyperdata)
    return Response(
        {
            "detail": "Updated user parameters",
            "hyperdata": node_user.hyperdata
        },
        status=HTTP_202_ACCEPTED)
def create_corpus(self):
    """Create a CORPUS node for self.query and schedule its parsing.

    Attaches one resource per entry of self.paths (falling back to
    self.path when the list is empty), commits once, records the new
    node id on self.corpus_id and schedules the parse/extract workflow.

    Returns the created corpus Node.
    """
    corpus = Node(name=self.query,
                  user_id=self.user_id,
                  parent_id=self.project_id,
                  typename='CORPUS',
                  hyperdata={
                      "action": "Scrapping data",
                      "language_id": self.type["default_language"],
                  })
    # attach all resources before committing (the original committed and
    # re-scheduled the workflow once per path, which was redundant)
    if len(self.paths) > 0:
        for path in self.paths:
            corpus.add_resource(type=self.type["type"],
                                name=self.type["name"],
                                path=path)
    else:
        corpus.add_resource(type=self.type["type"],
                            name=self.type["name"],
                            path=self.path)
    session.add(corpus)
    session.commit()
    # BUGFIX: corpus.id is only populated after the session flush, so it
    # must be captured after commit (it used to be recorded while still None)
    self.corpus_id = corpus.id
    scheduled(parse_extract_indexhyperdata(corpus.id))
    return corpus
def _parse(self, corpus, form):
    '''Internal method to parse a corpus.

    corpus >> resource (method + file params + parser)
           >> docs     (resource.defaultlang)
           >> ngrams   -- everything gathered back under the corpus

    Creates a RESOURCE node under `corpus` from the submitted form, maps
    the defaults of the chosen source onto it, then runs the workflow.
    Returns True (even when the workflow raised; errors are printed).
    '''
    # 1. create the RESOURCE node
    # (BUGFIX: removed a dead `resource = {}` that was immediately overwritten)
    resource = Node(
        user_id=corpus.user_id,
        parent_id=corpus.id,
        typename="RESOURCE",
        #corpus_name = form["name"],
    )
    resource.method = form["method"]
    resource.path = upload(form['file'])
    # map the default attributes of the given source from constant RESOURCETYPE
    for k, v in get_resource(int(form["source"])).items():
        setattr(resource, k, v)
    resource.status(action="parse", progress=1, complete=False)
    session.add(resource)
    session.commit()
    try:
        workflow(resource)
    except Exception as e:
        print("=======except dans _parse===========")
        print(e)
        from traceback import print_tb
        print_tb(e.__traceback__)
        print("====================================")
    return True
def form_valid(self, form):
    """Authenticate the user and lazily mirror them into the Node table.

    On a successful, active login: ensure a USER-typed Node exists for
    this auth user (creating it with default hyperdata when missing),
    then defer to the parent LoginView. Otherwise re-render the form.
    """
    credentials = form.cleaned_data
    user = authenticate(username=credentials['username'],
                        password=credentials['password'])
    if user is None or not user.is_active:
        return self.form_invalid(form)
    login(self.request, user)
    node_user = session.query(Node).filter(
        Node.user_id == user.id,
        Node.typename == "USER").first()
    # user hasn't been found inside Node table:
    # create it from the auth table => node table
    if node_user is None:
        node_user = Node(
            typename='USER',
            # node.name mirrors auth user.username
            name=user.username,
            user_id=user.id,
        )
        node_user.hyperdata = {"language": "fr"}
        session.add(node_user)
        session.commit()
    return super(LoginView, self).form_valid(form)
def put(self, request, project_id, corpus_id, view="DOCUMENT"):
    '''UPDATE corpus

    Applies whitelisted fields from request.data onto the CORPUS node.
    username/hyperdata handling is not implemented yet (no-ops).
    '''
    project = session.query(Node).filter(Node.id == project_id,
                                         Node.typename == "PROJECT").first()
    # NOTE(review): if the query above returns None, project.id below will
    # raise AttributeError before the 404 check — confirm intended ordering
    project = check_rights(request, project.id)
    if project is None:
        return Response(
            {'detail': "PROJECT Node #%s not found" % (project_id)},
            status=status.HTTP_404_NOT_FOUND)
    corpus = session.query(Node).filter(Node.id == corpus_id,
                                        Node.typename == "CORPUS").first()
    if corpus is None:
        return Response(
            {'detail': "CORPUS Node #%s not found" % (corpus_id)},
            status=status.HTTP_404_NOT_FOUND)
    for key, val in request.data.items():
        if key in ["name", "date", "username", "hyperdata"]:
            if key == "username":
                # change of owner: not implemented yet
                pass
            elif key == "hyperdata":
                # updating some contextual values of the corpus: not implemented
                pass
            else:
                # BUGFIX: this used to assign to an undefined name `node`
                setattr(corpus, key, val)
                session.add(corpus)
                session.commit()
    # BUGFIX: the format string was missing its %s placeholder
    # (TypeError: not all arguments converted)
    return Response({"detail": "Updated corpus #%s" % str(corpus.id)},
                    status=HTTP_202_ACCEPTED)
def _copy(self, corpus, form):
    """Clone an existing CORPUS node (and its children) into `corpus`.

    Looks up the source corpus designated by form["corpus_id"], copies
    its resource, re-parents its documents and their ngrams under
    `corpus`, then marks the copy as complete.
    """
    # find the target corpus
    new_corpus = session.query(Node).filter(
        Node.typename == "CORPUS",
        Node.corpus_id == form["corpus_id"]).first()
    # get the resource of this corpus and copy it too
    new_resource = self._find_resource_hyperdata(new_corpus, form)
    # BUGFIX: `new_resouce` was a typo (NameError at runtime)
    new_resource.method = "cloned CORPUS #%i" % (new_corpus.id)
    # change new_corpus ownership
    new_corpus.id = corpus.id
    new_corpus.parent_id = corpus.parent_id
    new_corpus.user_id = corpus.user_id
    # re-parent the documents of the existing corpus
    for doc in new_corpus.get_children():
        doc.parent_id = new_corpus.parent_id
        # NOTE(review): a node id is assigned to user_id here — looks like
        # it should be new_corpus.user_id; kept as-is pending confirmation
        doc.user_id = new_corpus.id
        # store it into corpus
        new_doc = corpus.add_child(doc)
        for ngrams in doc.get_children():
            # BUGFIX: the loop body referenced an undefined `new_ngrams`;
            # re-parent the actual child being iterated
            ngrams.parent_id = new_doc.id
            ngrams.user_id = new_corpus.user_id
            # store it into corpus
            new_doc.add_child(ngrams)
    # save the corpus
    corpus.status(action="copy", progress=1, complete=True)
    session.add(corpus)
    session.commit()
    return Response({"log": "Corpus created", "uids": [corpus.id]}, 202)
def put(self, request):
    '''UPDATE EVERY project of a given user.

    Applies the whitelisted fields of request.data to each PROJECT node
    owned by the requesting user; returns the number of projects touched.
    '''
    user = cache.User[request.user.id]
    query = session.query(Node).filter(
        Node.typename == "PROJECT",
        Node.user_id == request.user.id).all()
    uids = []
    for node in query:
        for key, val in request.data.items():
            # only a whitelist of keys may be modified
            if key in ["name", "date", "username"]:
                if key == "username":
                    # change of owner: look up the new owner by the
                    # submitted value.
                    # BUGFIX: `username` was an undefined name (use the
                    # request value), and builtin set() was being called
                    # with two more undefined names instead of setattr()
                    new_owner = session.query(Node).filter(
                        Node.typename == "PROJECT",
                        Node.username == val).first()
                    setattr(node, "user_id", new_owner.id)
                else:
                    setattr(node, key, val)
        session.add(node)
        session.commit()
        uids.append(node.id)
    return Response({"detail": "Updated %s projects" % len(uids)},
                    status=HTTP_202_ACCEPTED)
def test_011_node_write(self):
    '''write new_node to DB and commit'''
    from gargantext.util.db import session
    # a freshly built node is not yet attached to any session
    before = self.new_node._sa_instance_state._attached
    self.assertFalse(before)
    session.add(self.new_node)
    session.commit()
    # once persisted, the instance state reports itself attached
    after = self.new_node._sa_instance_state._attached
    self.assertTrue(after)
def create_project(self):
    """Create and persist a default PROJECT node, stored on self.project."""
    project = Node(
        typename='PROJECT',
        name="My project",
    )
    session.add(project)
    session.commit()
    self.project = project
def create_corpus(self):
    """Create a default CORPUS node as a child of self.project."""
    corpus = self.project.add_child(
        name="My Corpus",
        typename='CORPUS',
    )
    self.corpus = corpus
    session.add(corpus)
    session.commit()
def _sample(self, corpus, form):
    """Internal method: ask the crawler for a sample of results.

    NOTE(review): the original signature was (self, resource) but the body
    referenced `corpus` and `form` (a guaranteed NameError); the signature
    now matches the sibling methods _scan/_fetch(corpus, form) — confirm
    against the callers.
    """
    resource = self._find_resource_hyperdata(corpus, form)
    # SECURITY: eval() on a stored crawler name — ensure resource.crawler
    # can only come from trusted, server-side configuration
    crawlbot = eval(resource.crawler)(resource)
    records = crawlbot.sample()
    resource.ids = records
    corpus.status(action="sample", progress=1, complete=True)
    session.add(corpus)
    session.commit()
    return Response({"uids": [corpus.id]}, status=HTTP_200_OK)
def create_gargantua_resources():
    """Create the shared "Resources" PROJECT (plus its STOPLIST child)
    owned by the gargantua super user."""
    # BUGFIX: .first() returns a Row tuple; .scalar() yields the bare id
    gargantua_id = session.query(
        User.id).filter(User.username == "gargantua").scalar()
    project = Node(name="Resources",
                   user_id=gargantua_id,
                   typename="PROJECT")
    # BUGFIX: commit the project first so project.id is populated before
    # it is used as the stoplist's parent_id (it used to be None)
    session.add(project)
    session.commit()
    stopList = Node(name="STOPLIST",
                    parent_id=project.id,
                    user_id=gargantua_id,
                    typename="STOPLIST")
    session.add(stopList)
    session.commit()
def _fetch(self, corpus, form):
    '''Internal method to fetch the resource.urls of a corpus
    >>> resource._parser(urls)

    NOTE(review): the original signature was (self, resource) but the body
    referenced `corpus` and `form` (a guaranteed NameError); the signature
    now matches the sibling methods _scan/_sample(corpus, form) — confirm
    against the callers.
    '''
    resource = self._find_resource_hyperdata(corpus, form)
    resource.status(action="fetch", progress=1, complete=False)
    # SECURITY: eval() on a stored crawler name — trusted config only
    crawlbot = eval(resource.typecrawler)(resource)
    # send job to celery
    scheduled(crawlbot.fetch())
    corpus.status(action="fetch", progress=1, complete=True)
    session.add(corpus)
    session.commit()
    return Response({"uids": [corpus.id]}, 200)
def do_stoplist(corpus, overwrite_id=None):
    '''
    Create list of stop words.

    TODO do a function to get all stop words with social scores

    Parameters:
        - overwrite_id: optional preexisting STOPLIST node to overwrite

    Returns the id of the STOPLIST node that was filled.
    '''
    # Get preexisting StopList if provided in overwrite_id param
    if overwrite_id:
        stoplist_id = overwrite_id
    # At this step of development, a new StopList should be created
    else:
        stoplist = corpus.add_child(name="Stoplist (in:%s)" % corpus.id,
                                    typename="STOPLIST")
        session.add(stoplist)
        session.commit()
        stoplist_id = stoplist.id
    # Get common resources, all common StopWords on the platform
    ## First get the id of the StopList of Gargantua super user
    # BUGFIX: .first() returns a Row tuple which was then compared against
    # integer id columns below; .scalar() yields the bare id
    gargantua_id = session.query(
        User.id).filter(User.username == "gargantua").scalar()
    rootStopList_id = session.query(Node.id).filter(
        Node.user_id == gargantua_id,
        Node.typename == "STOPLIST").scalar()
    ## Then get all the stop words
    ## stop_words :: [String]
    stop_words = (session.query(Ngram.terms)
                  .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
                  .filter(NodeNgram.node_id == rootStopList_id)
                  .all())
    ## Get the ngrams of this corpus' documents
    ## ngrams :: [(Int, String, Int)]
    ngrams = (session.query(Ngram.id, Ngram.terms)
              .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
              .join(Node, Node.id == NodeNgram.node_id)
              .filter(Node.parent_id == corpus.id,
                      Node.typename == "DOCUMENT")
              .group_by(Ngram.id)
              #.limit(limit)
              .all())
    ngrams_to_stop = filter(
        lambda x: is_stop_word(x, stop_words=stop_words), ngrams)
    # weight -1 marks the ngram as stopped
    stop = LISTTYPES["STOPLIST"]({n[0]: -1 for n in ngrams_to_stop})
    # stop = LISTTYPES["STOPLIST"]([n[0] for n in ngrams_to_stop])
    stop.save(stoplist_id)
    return stoplist_id
def post(self, request):
    '''CREATE a new project for a given user.

    Validates the "name" field, refuses duplicates, attaches the new
    PROJECT node under the requester's UserNode when one exists.
    '''
    user = cache.User[request.user.id]
    try:
        # project name
        # BUGFIX: a missing dict key raises KeyError, not AttributeError,
        # so the guard below never fired
        name = request.data["name"]
    except KeyError:
        return Response(
            {"detail": "Invalid POST method: \"name\" field is required "},
            status=HTTP_406_NOT_ACCEPTABLE)
    if name == "":
        return Response(
            {"detail": "Invalid POST method: \"name\" field is empty "},
            status=HTTP_406_NOT_ACCEPTABLE)
    project = session.query(Node).filter(Node.typename == "PROJECT",
                                         Node.name == name).first()
    if project is not None:
        return Response(
            {
                "detail": "Project with this name already exists",
                "url": "/projects/%s" % str(project.id)
            },
            status=HTTP_409_CONFLICT)
    user_node = session.query(UserNode).filter_by(
        user_id=request.user.id).one_or_none()
    if user_node is None:
        print(
            "??? Can't find UserNode for %r to create ProjectNode with name %r ???"
            % (request.user, name))
    new_project = Node(
        user_id=request.user.id,
        typename='PROJECT',
        name=name,
        # None when the UserNode is missing (project becomes parentless)
        parent_id=user_node and user_node.id,
    )
    session.add(new_project)
    session.commit()
    return Response(
        {
            "detail": "Created",
            "url": "/projects/%s" % str(new_project.id)
        },
        status=HTTP_201_CREATED)
def put(self, request, project_id):
    '''UPDATE project

    Applies whitelisted query parameters to the PROJECT node. Unknown
    keys are silently ignored; "username" handling is not implemented.
    '''
    project = session.query(Node).filter(Node.id == project_id).first()
    if project is None:
        return Response(
            {'detail': "PROJECT Node #%s not found" % (project_id)},
            status=HTTP_404_NOT_FOUND)
    check_rights(request, project_id)
    params = get_parameters(request)
    for key, val in params.items():
        # refuse any empty value outright
        if len(val) == 0:
            return Response(
                {
                    "detail":
                    "Invalid POST method: \"%s\" field is empty " % key
                },
                status=HTTP_406_NOT_ACCEPTABLE)
        if key not in ["name", "date", "username"]:
            # not a whitelisted key: ignore it
            continue
        if key == "username":
            # change ownership: not implemented yet
            pass
        elif key == "name":
            # names must stay unique across all nodes
            duplicates = session.query(Node).filter(
                Node.name == val).count()
            if duplicates != 0:
                return Response(
                    {"detail": "Project with this name already exists"},
                    status=HTTP_409_CONFLICT)
            setattr(project, key, val)
        else:
            # "date"
            setattr(project, key, val)
    session.add(project)
    session.commit()
    return Response({"detail": "Updated PROJECT #%s" % str(project_id)},
                    status=HTTP_206_PARTIAL_CONTENT)
def create_user(username,
                email,
                user=None,
                password=None,
                group=None,
                notify=False):
    '''
    create_user :
        - create user
        - create its group if needed
        - create relation between user and its group

    Returns the persisted User.
    '''
    if user is None:
        user = User()
    user.username = username
    user.email = email
    user.is_active = True
    # Creating the password
    if password is None or len(password) < 3:
        password = make_random_password()
    user.password = make_password(password)
    session.add(user)
    session.commit()
    if group is not None:
        # get or create group_iscpif
        # BUGFIX: .first() returns a Row tuple; .scalar() yields the bare id
        group_iscpif_id = session.query(User.id).filter(
            User.username == "group_iscpif").scalar()
        if group_iscpif_id is None:
            group_iscpif = create_user("group_iscpif",
                                       "*****@*****.**",
                                       group=None,
                                       notify=False)
            group_iscpif_id = group_iscpif.id
        # iscpif members get an unblocked link, everyone else is blocked
        if group == "group_iscpif":
            block(user.id, group_iscpif_id, False)
        else:
            block(user.id, group_iscpif_id, True)
    if notify == True and group == "group_iscpif":
        mail2user(username, email, password, drafts['partnerOk'])
    elif notify == True:
        mail2user(username, email, password, drafts['partnerKo'])
    else:
        print("User %s created, no notification" % username)
    return user
def delete(self, request, node_id):
    '''delete status for node

    NOTE(review): the original signature did not accept node_id although
    the body referenced it (a guaranteed NameError); it is now a view
    parameter — confirm the URL route actually passes it.
    '''
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)
    user = cache.User[request.user.id]
    # check_rights(request, node_id)
    node = session.query(Node).filter(Node.id == node_id,
                                      Node.user_id == user.id).first()
    if node is None:
        return Response({"detail": "Node not Found"},
                        status=HTTP_404_NOT_FOUND)
    node.hyperdata["status"] = []
    session.add(node)
    session.commit()
    return Response({"detail": "Deleted status for NODE #%i " % node.id},
                    status=HTTP_204_NO_CONTENT)
def block(user1_id, user2_id, bool_):
    '''
    user_group :: Int -> Int -> Bool
    Link between user1 and user2
    If False: link blocked
    else: link not blocked

    Persists and returns the Contact row.
    '''
    link = Contact()
    link.user1_id = user1_id
    link.user2_id = user2_id
    link.is_blocked = bool_
    session.add(link)
    session.commit()
    return link
def create_user(username, email, user=None, password=None, active=False,
                notify=True):
    """Create (or update) a User row and optionally e-mail the credentials.

    NOTE(review): the `active` parameter is currently ignored — the account
    is always activated. Confirm whether is_active should honour it.
    """
    if user is None:
        user = User()
    user.username = username
    user.email = email
    user.is_active = True
    # generate a password when none was supplied
    if password is None or password == "":
        password = make_random_password()
    user.password = make_password(password)
    session.add(user)
    session.commit()
    if notify == True:
        notify_user(username, email, password)
    return user
def setUp(self):
    """
    Will be run before *each* test
    """
    self.client = Client()
    # login with our fake user
    response = self.client.post('/auth/login/', {
        'username': '******',
        'password': '******'
    })
    # print(response.status_code) # expected: 302 FOUND
    project = Node(
        typename='PROJECT',
        name="hello i'm a project",
        user_id=1  # todo make sure it's the same user as login
    )
    session.add(project)
    session.commit()
    # remember the node id so individual tests can fetch it back
    self.a_node_id = project.id
    print("created a project with id: %i" % project.id)
def _scan(self, corpus, form):
    '''Internal method to scan a query
    >> add results_nb to resource as a corpus hyperdata.
    '''
    resource = self._find_resource_hyperdata(corpus, form)
    #corpus_query = check_query(form["query")
    # BUGFIX: `ressource` was a typo (NameError at runtime)
    resource.query = form["query"]
    corpus.status(action="scan", progress=1, complete=False)
    session.add(corpus)
    session.commit()
    # SECURITY: eval() on a stored crawler name — trusted config only
    crawlbot = eval(resource.crawler)(corpus.id)
    corpus.status(action="scan", progress=2, complete=False)
    session.add(corpus)
    session.commit()
    results_nb = crawlbot.scan_results()
    resource.results_nb = results_nb
    corpus.status(action="scan", progress=2, complete=True)
    session.add(corpus)
    session.commit()
    return Response({"log": "Corpus created", "uids": [corpus.id]}, 200)
def post(self, request, project_id):
    '''CREATE corpus

    Validates name/source/method, then dispatches to one of three
    creation strategies: "copy" (clone an existing corpus), "parse"
    (upload a file) or "scan" (crawler query).
    '''
    project = session.query(Node).filter(Node.id == project_id).first()
    if project is None:
        return Response(
            {'detail': "PROJECT Node #%s not found" % (project_id)},
            status=HTTP_404_NOT_FOUND)
    project = check_rights(request, project_id)
    # controlling form data
    if not "name" in request.data.keys():
        return Response({'detail': "CORPUS Node: field name is mandatory"},
                        status=HTTP_406_NOT_ACCEPTABLE)
    if not "source" in request.data.keys():
        return Response(
            {'detail': "CORPUS Node: field source is mandatory"},
            status=HTTP_406_NOT_ACCEPTABLE)
    corpus_name = request.data["name"]
    corpus_source = request.data["source"]
    if corpus_name == "":
        return Response({'detail': "CORPUS Node name can't be empty"},
                        status=HTTP_406_NOT_ACCEPTABLE)
    corpus = session.query(Node).filter(Node.name == corpus_name,
                                        Node.typename == "CORPUS").first()
    if corpus is not None:
        return Response(
            {
                'detail':
                "CORPUS Node with name '%s' already exists" % (corpus_name)
            },
            status=HTTP_409_CONFLICT)
    if corpus_source == "" or corpus_source == 0 or corpus_source == None:
        return Response({'detail': "CORPUS Node source can't be empty"},
                        status=HTTP_406_NOT_ACCEPTABLE)
    params = get_parameters(request)
    if "method" not in params.keys():
        return Response(
            {'detail': "CORPUS Node has not 'method' parameter"},
            status=HTTP_405_METHOD_NOT_ALLOWED)
    method = params["method"]
    if method not in ["parse", "scan", "copy"]:
        return Response(
            {
                'detail':
                "CORPUS Node only parse, scan and copy 'method' are allowed"
            },
            status=HTTP_405_METHOD_NOT_ALLOWED)
    if method == "copy":
        # in copy mode, `source` designates an existing corpus node id
        source_corpus = session.query(Node).filter(
            Node.id == corpus_source, Node.typename == "CORPUS").first()
        if source_corpus is None:
            return Response(
                {
                    'detail': "CORPUS Node #%s doesn't exist. Fail to copy"
                              % (str(corpus_source))
                },
                status=HTTP_404_NOT_FOUND)
        # BUGFIX: the clone code used dict-style indexing on Node objects
        # and referenced an undefined `new_corpus`; use attribute access
        cloned_corpus = copy.deepcopy(source_corpus)
        del cloned_corpus.id  # force an INSERT of a new row
        cloned_corpus.parent_id = project_id
        cloned_corpus.user_id = request.user.id
        for child in source_corpus.get_children():
            cloned_child = copy.deepcopy(child)
            del cloned_child.id
            cloned_child.user_id = request.user.id
            # add_child re-parents the clone under cloned_corpus
            cloned_corpus.add_child(cloned_child)
        session.add(cloned_corpus)
        session.commit()
        # BUGFIX: the copy branch used to fall through into the resource
        # type lookup below (which expects a RESOURCETYPE id, not a corpus
        # id) and wrongly answered 406 even after a successful copy
        return Response(
            {'detail': "CORPUS #%s copied" % str(cloned_corpus.id)},
            status=HTTP_201_CREATED)
    # RESOURCE
    source = get_resource(int(corpus_source))
    if source is None:
        return Response({'detail': "CORPUS Node sourcetype unknown"},
                        status=HTTP_406_NOT_ACCEPTABLE)
    if method == "parse":
        print('PARSING')
        if not "file" in request.FILES.keys():
            return Response({'detail': "CORPUS Node need a file to parse"},
                            status=HTTP_405_METHOD_NOT_ALLOWED)
        corpus_file = request.FILES['file']
        if "parser" in source.keys():
            corpus = project.add_child(
                name=request.data["name"],
                typename='CORPUS',
            )
            print("CORPUS #", corpus.id)
            session.add(corpus)
            session.commit()
            resource = Node(name=source["name"],
                            typename='RESOURCE',
                            parent_id=corpus.id,
                            hyperdata={
                                "type": source["type"],
                                "method": method,
                                "file": upload(corpus_file),
                                "query": None
                            })
            session.add(resource)
            session.commit()
            return Response(
                {
                    "detail": "Parsing corpus #%s of type #%s" %
                              (str(corpus.id), resource.name)
                }, 200)
        else:
            # BUGFIX: this branch used to reference `corpus.id` (None here)
            # and an unset `resource` (NameError)
            return Response(
                {
                    "detail": "No Parser found for source type %s" %
                              source["name"]
                }, 405)
    elif method == "scan":
        if "crawler" in source.keys():
            if not "query" in request.data.keys():
                return Response(
                    {'detail': "CORPUS Node need a query to scan"},
                    status=HTTP_405_METHOD_NOT_ALLOWED)
            query = request.data['query']
            corpus = project.add_child(
                name=request.data["name"],
                typename='CORPUS',
            )
            resource = Node(name=source["name"],
                            typename='RESOURCE',
                            parent_id=corpus.id,
                            # BUGFIX: was request.user_id (AttributeError)
                            user_id=request.user.id,
                            hyperdata={
                                "type": source["type"],
                                "method": method,
                                "file": None,
                                "query": query
                            })
            session.add(resource)
            session.commit()
            return Response({'detail': "CORPUS #%s created" % corpus.id},
                            status=HTTP_201_CREATED)
        else:
            return Response(
                {
                    'detail':
                    "CORPUS Node only parse, scan and copy 'method' are allowed"
                },
                status=HTTP_405_METHOD_NOT_ALLOWED)
def do_maplist(corpus,
               overwrite_id=None,
               mainlist_id=None,
               specclusion_id=None,
               genclusion_id=None,
               grouplist_id=None,
               limit=DEFAULT_MAPLIST_MAX,
               genclusion_part=DEFAULT_MAPLIST_GENCLUSION_RATIO,
               monograms_part=DEFAULT_MAPLIST_MONOGRAMS_RATIO):
    '''
    According to Genericity/Specificity and mainlist

    Parameters:
      - mainlist_id (starting point, already cleaned of stoplist terms)
      - specclusion_id (ngram inclusion by cooc specificity -- ranking factor)
      - genclusion_id (ngram inclusion by cooc genericity -- ranking factor)
      - grouplist_id (filtering grouped ones)
      - overwrite_id: optional if preexisting MAPLIST node to overwrite

    + 3 params to modulate the terms choice
      - limit for the amount of picked terms
      - monograms_part: a ratio of terms with only one lexical unit to keep
            (multigrams quota = limit * (1-monograms_part))
      - genclusion_part: a ratio of terms with only one lexical unit to keep
            (speclusion quota = limit * (1-genclusion_part))

    Returns the id of the MAPLIST node that was written.
    '''
    if not (mainlist_id and specclusion_id and genclusion_id and grouplist_id):
        raise ValueError(
            "Please provide mainlist_id, specclusion_id, genclusion_id and grouplist_id"
        )
    # split the global `limit` into 4 quotas:
    # {topgen,topspec} x {monograms,multigrams}
    quotas = {'topgen': {}, 'topspec': {}}
    genclusion_limit = round(limit * genclusion_part)
    speclusion_limit = limit - genclusion_limit
    quotas['topgen']['monograms'] = round(genclusion_limit * monograms_part)
    quotas['topgen'][
        'multigrams'] = genclusion_limit - quotas['topgen']['monograms']
    quotas['topspec']['monograms'] = round(speclusion_limit * monograms_part)
    quotas['topspec'][
        'multigrams'] = speclusion_limit - quotas['topspec']['monograms']
    print("MAPLIST quotas:", quotas)
    #dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)
    MainlistTable = aliased(NodeNgram)
    IsSubform = (session
                 # we want only secondary terms (ngram2)
                 # to be able to filter them out
                 .query(NodeNgramNgram.ngram2_id)
                 .filter(NodeNgramNgram.node_id == grouplist_id)
                 .subquery()
                 )
    ScoreSpec = aliased(NodeNgram)
    ScoreGen = aliased(NodeNgram)
    # ngram with both ranking factors spec and gen
    query = (
        session.query(ScoreSpec.ngram_id, ScoreSpec.weight, ScoreGen.weight,
                      Ngram.n).join(
                          Ngram, Ngram.id == ScoreSpec.ngram_id).join(
                              ScoreGen,
                              ScoreGen.ngram_id == ScoreSpec.ngram_id).filter(
                                  ScoreSpec.node_id == specclusion_id).filter(
                                      ScoreGen.node_id == genclusion_id)
        # we want only terms within mainlist
        .join(MainlistTable, Ngram.id == MainlistTable.ngram_id).filter(
            MainlistTable.node_id == mainlist_id)
        # we remove all ngrams matching an ngram2_id from the synonyms
        .outerjoin(IsSubform,
                   IsSubform.c.ngram2_id == ScoreSpec.ngram_id).filter(
                       IsSubform.c.ngram2_id == None)
        # specificity-ranked
        .order_by(desc(ScoreSpec.weight)))
    # format in scored_ngrams array:
    # -------------------------------
    # [(37723,    8.428, 14.239,  3     ), etc]
    #   ngramid   wspec  wgen     nwords
    scored_ngrams = query.all()
    n_ngrams = len(scored_ngrams)
    if n_ngrams == 0:
        raise ValueError("No ngrams in cooc table ?")
        #return
    # results, with same structure as quotas
    chosen_ngrams = {
        'topgen': {
            'monograms': [],
            'multigrams': []
        },
        'topspec': {
            'monograms': [],
            'multigrams': []
        }
    }
    # specificity and genericity are rather reverse-correlated
    # but occasionally they can have common ngrams (same ngram well ranked in both)
    # => we'll use a lookup table to check if we didn't already get it
    already_gotten_ngramids = {}
    # 2 loops to fill spec-clusion then gen-clusion quotas
    # (1st loop uses order from DB, 2nd loop uses our own sort at end of 1st)
    for rkr in ['topspec', 'topgen']:
        got_enough_mono = False
        got_enough_multi = False
        all_done = False
        i = -1
        # walk the ranked list until both quotas are filled or it is exhausted
        while ((not all_done)
               and (not (got_enough_mono and got_enough_multi))):
            # retrieve sorted ngram n° i
            i += 1
            (ng_id, wspec, wgen, nwords) = scored_ngrams[i]
            # before any continue case, we check the next i for max reached
            all_done = (i + 1 >= n_ngrams)
            if ng_id in already_gotten_ngramids:
                continue
            # NB: nwords could be replaced by a simple search on r' '
            if nwords == 1:
                if got_enough_mono:
                    continue
                else:
                    # add ngram to results and lookup
                    chosen_ngrams[rkr]['monograms'].append(ng_id)
                    already_gotten_ngramids[ng_id] = True
            # multi
            else:
                if got_enough_multi:
                    continue
                else:
                    # add ngram to results and lookup
                    chosen_ngrams[rkr]['multigrams'].append(ng_id)
                    already_gotten_ngramids[ng_id] = True
            got_enough_mono = (len(chosen_ngrams[rkr]['monograms']) >=
                               quotas[rkr]['monograms'])
            got_enough_multi = (len(chosen_ngrams[rkr]['multigrams']) >=
                                quotas[rkr]['multigrams'])
        # at the end of the first loop we just need to sort all by the second ranker (gen)
        scored_ngrams = sorted(scored_ngrams,
                               key=lambda ng_infos: ng_infos[2],
                               reverse=True)
    obtained_spec_mono = len(chosen_ngrams['topspec']['monograms'])
    obtained_spec_multi = len(chosen_ngrams['topspec']['multigrams'])
    obtained_gen_mono = len(chosen_ngrams['topgen']['monograms'])
    obtained_gen_multi = len(chosen_ngrams['topgen']['multigrams'])
    obtained_total = obtained_spec_mono \
                     + obtained_spec_multi \
                     + obtained_gen_mono \
                     + obtained_gen_multi
    print("MAPLIST: top_spec_monograms =", obtained_spec_mono)
    print("MAPLIST: top_spec_multigrams =", obtained_spec_multi)
    print("MAPLIST: top_gen_monograms =", obtained_gen_mono)
    print("MAPLIST: top_gen_multigrams =", obtained_gen_multi)
    print("MAPLIST: kept %i ngrams in total " % obtained_total)
    obtained_data = chosen_ngrams['topspec']['monograms'] \
                    + chosen_ngrams['topspec']['multigrams'] \
                    + chosen_ngrams['topgen']['monograms'] \
                    + chosen_ngrams['topgen']['multigrams']
    # NEW MAPLIST NODE
    # -----------------
    # saving the parameters of the analysis in the Node JSON
    new_hyperdata = {
        'corpus': corpus.id,
        'limit': limit,
        'monograms_part': monograms_part,
        'genclusion_part': genclusion_part,
    }
    if overwrite_id:
        # overwrite pre-existing node
        the_maplist = cache.Node[overwrite_id]
        the_maplist.hyperdata = new_hyperdata
        the_maplist.save_hyperdata()
        session.commit()
        the_id = overwrite_id
    else:
        # create a new maplist node
        the_maplist = corpus.add_child(name="Maplist (in %i)" % corpus.id,
                                       typename="MAPLIST",
                                       hyperdata=new_hyperdata)
        session.add(the_maplist)
        session.commit()
        the_id = the_maplist.id
    # create UnweightedList object and save (=> new NodeNgram rows)
    datalist = UnweightedList(obtained_data)
    # save
    datalist.save(the_id)
    # dbg.show('MapList computed')
    return the_id
def save(request, project_id):
    '''Create a SCOAP corpus from a POSTed query and schedule its workflow.

    Validates N and the project, downloads the data through the crawler,
    attaches it as a resource and renders the "wait" page.
    '''
    if request.method != "POST":
        # BUGFIX: the non-POST fallthrough referenced undefined names
        # (query_string, query, N) and always crashed with a NameError
        raise Http404()
    query = request.POST.get("query")
    try:
        N = int(request.POST.get("N"))
    except (TypeError, ValueError):
        # missing or non-numeric N
        N = 0
    print(query, N)
    #for next time
    #ids = request.POST["ids"]
    source = get_resource(RESOURCE_TYPE_SCOAP)
    if N == 0:
        raise Http404()
    if N > QUERY_SIZE_N_MAX:
        N = QUERY_SIZE_N_MAX
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()
    # do we have a valid project?
    project = session.query(Node).filter(Node.id == project_id).first()
    if project is None:
        raise Http404()
    user = cache.User[request.user.id]
    if not user.owns(project):
        return HttpResponseForbidden()
    # corpus node instanciation as a Django model
    corpus = Node(name=query,
                  user_id=request.user.id,
                  parent_id=project_id,
                  typename='CORPUS',
                  hyperdata={
                      "action": "Scrapping data",
                      "language_id": "en"
                  })
    # download_file
    crawler_bot = load_crawler(source)()
    # for now no way to force downloading X records
    # the long running command
    crawler_bot.download(query)
    corpus.add_resource(
        type=source["type"]
        #, name = source["name"]
        , path=crawler_bot.path)
    session.add(corpus)
    session.commit()
    #corpus_id = corpus.id
    try:
        scheduled(parse_extract_indexhyperdata)(corpus.id)
    except Exception as error:
        print('WORKFLOW ERROR')
        print(error)
        try:
            print_tb(error.__traceback__)
        except Exception:
            pass
        # IMPORTANT ---------------------------------
        # sanitize session after interrupted transact
        session.rollback()
        # --------------------------------------------
    return render(
        template_name='pages/projects/wait.html',
        request=request,
        context={
            'user': request.user,
            'project': project,
        },
    )
def save( request , project_id ) :
    # implicit global session
    #
    # Create a Pubmed corpus from queries prepared by getGlobalStats:
    # download each yearly url with a small thread pool, attach the
    # fetched XML files as resources, then schedule the parsing workflow.
    #
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()
    # do we have a valid project?
    project = session.query( Node ).filter(Node.id == project_id).first()
    if project is None:
        raise Http404()
    user = cache.User[request.user.id]
    if not user.owns(project):
        return HttpResponseForbidden()
    if request.method == "POST":
        queries = request.POST["query"]
        name = request.POST["string"]
        # here we just realize queries already prepared by getGlobalStats
        # ===> no need to repeat N parameter like in testISTEX <===
        instancia = Scraper()
        thequeries = json.loads(queries)
        # fyi the sum of our prepared yearly proportional quotas
        sampled_sum = sum([year_q['retmax'] for year_q in thequeries])
        print("Scrapping Pubmed: '%s' (N=%i)" % (name,sampled_sum))
        urlreqs = []
        for yearquery in thequeries:
            urlreqs.append( instancia.medlineEfetchRAW( yearquery ) )
        alist = ["tudo fixe" , "tudo bem"]
        # corpus node instanciation as a Django model
        corpus = project.add_child( name=name , typename = "CORPUS" )
        # """
        # urlreqs: List of urls to query.
        # - Then, to each url in urlreqs you do:
        #     eFetchResult = urlopen(url)
        #     eFetchResult.read()
        #     # this will output the XML... normally you write this to a XML-file.
        # """
        # spin up 8 worker threads consuming the task queue
        tasks = Scraper()
        for i in range(8):
            t = threading.Thread(target=tasks.worker2) #thing to do
            # thread dies when main thread (only non-daemon thread) exits.
            t.daemon = True
            t.start()
        for url in urlreqs:
            tasks.q.put( url ) #put a task in the queue
        tasks.q.join() # wait until everything is finished
        dwnldsOK = 0
        for filename in tasks.firstResults :
            print(filename)
            if filename != False:
                # add the uploaded resource to the corpus
                corpus.add_resource(
                    type = get_resource_by_name('Pubmed [XML]')["type"]
                    , path = filename
                    , url = None )
                print("Adding the resource")
                dwnldsOK+=1
        session.add(corpus)
        session.commit()
        corpus_id = corpus.id
        if dwnldsOK == 0 :
            return JsonHttpResponse(["fail"])
        try:
            scheduled(parse_extract_indexhyperdata)(corpus_id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        sleep(1)
        return HttpResponseRedirect('/projects/' + str(project_id))
    # NOTE(review): on non-POST requests `alist` is undefined here and
    # this raises NameError — confirm the intended fallback response
    data = alist
    return JsonHttpResponse(data)
def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
    """
    Group ngram forms that share the same stem/lemma.

    1) Use a stemmer/lemmatizer to group forms if they have same stem/lemma
    2) Create an empty GROUPLIST node (for a list of "synonym" ngrams),
       or reuse `overwrite_id` if provided
    3) Save the list to DB (list node + each grouping as
       listnode - ngram1 - ngram2)

    Parameters:
        corpus       -- CORPUS Node whose DOCUMENT children are scanned
        stoplist_id  -- optional STOPLIST node id; its ngrams are excluded
        overwrite_id -- optional pre-existing GROUPLIST node id to write into
                        (otherwise a new GROUPLIST child node is created)

    Returns:
        the id of the GROUPLIST node that received the groupings
    """
    # ngram ids of the stoplist: membership test per ngram, so use a set
    stop_ngram_id_set = set()
    if stoplist_id is not None:
        for (stop_ngram_id,) in (session.query(NodeNgram.ngram_id)
                                        .filter(NodeNgram.node_id == stoplist_id)
                                        .all()):
            stop_ngram_id_set.add(stop_ngram_id)

    # 1) compute stems/lemmas and group if same stem/lemma
    stemmers = prepare_stemmers(corpus)
    print("# STEMMERS LOADED", stemmers)
    supported_stemmers_lang = [lang for lang in corpus.hyperdata["languages"]
                               if lang != "__unknown__"]
    print("#SUPPORTED STEMMERS LANGS", supported_stemmers_lang)

    # todo dict {lg => {(weight, Ngram) pairs to process}}
    todo_ngrams_per_lg = defaultdict(set)

    # res dict { commonstem: {ngram_1:freq_1, ngram_2:freq_2, ngram_3:freq_3} }
    my_groups = defaultdict(Counter)

    # preloop per doc to sort ngrams by language
    for doc in corpus.children('DOCUMENT'):
        if doc.id not in corpus.hyperdata['skipped_docs']:
            if (('language_iso2' in doc.hyperdata)
                    and doc.hyperdata['language_iso2'] in supported_stemmers_lang):
                lgid = doc.hyperdata['language_iso2']
            else:
                # unsupported language: record the error on the doc but still
                # queue its ngrams under the "__unknown__" stemmer
                lgid = "__unknown__"
                doc.status("NGRAMS_GROUPS",
                           error="Error: unsupported language for stemming")
                doc.save_hyperdata()

            # doc.ngrams is an sql query (ugly but useful intermediate step)
            # FIXME: move the counting and stoplist filtering up here
            for ngram_pack in doc.ngrams.all():
                todo_ngrams_per_lg[lgid].add(ngram_pack)

    # --------------------
    # long loop per ngrams
    for (lgid, todo_ngs) in todo_ngrams_per_lg.items():
        # fun: word::str => stem::str
        stem_it = stemmers[lgid].stem

        for (doc_wei, ngram) in todo_ngs:
            # skip ngrams present in the STOPLIST
            # (bugfix: this was a bare `next`, a no-op expression statement,
            #  so stopwords were never actually filtered)
            if ngram.id in stop_ngram_id_set:
                continue

            # STEM IT: this term's joined word-stems become the grouping key
            lexforms = resplit(r'\W+', ngram.terms)
            stemseq = " ".join(stem_it(lexform) for lexform in lexforms)

            # ex:
            # groups['post'] = {'poste':3, 'poster':5, 'postés':2...}
            # groups['copper engrav'] = {'copper engraving':3, 'coppers engraver':1...}
            my_groups[stemseq][ngram.id] += doc_wei

    del todo_ngrams_per_lg

    # now serializing all groups to a list of (mainform, subform) couples
    ng_couples = []
    addcouple = ng_couples.append
    for grped_ngramids in my_groups.values():
        if len(grped_ngramids) > 1:
            # first find most frequent term in the counter: it is the mainform
            winner_id = grped_ngramids.most_common(1)[0][0]
            for ngram_id in grped_ngramids:
                if ngram_id != winner_id:
                    addcouple((winner_id, ngram_id))
    del my_groups

    # 2) the list node
    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
    # or create the new id
    else:
        the_group = corpus.add_child(
            typename = "GROUPLIST",
            name = "Group (src:%s)" % corpus.name[0:10]
        )
        # and save the node
        session.add(the_group)
        session.commit()
        the_id = the_group.id

    # 3) Save each grouping couple to DB thanks to Translations.save() table
    ndngng_list = Translations(
        [(sec, prim) for (prim, sec) in ng_couples],
        just_items=True
    )

    # ...referring to the list node we just got
    ndngng_list.save(the_id)

    return the_id
def parse_extract_indexhyperdata(corpus):
    """Full corpus workflow: parse docs, extract ngrams, index hyperdata,
    then compute all the ngram list nodes (stoplist, groups, occurrences,
    ti-ranking, mainlist, local tfidf, coocs, spec/gen clusion, maplist).

    `corpus` may be a Node or a corpus id (int); progress is tracked via
    corpus.status(...) + save_hyperdata() + session.commit() after each step.
    """
    # retrieve corpus from database from id
    if isinstance(corpus, int):
        corpus_id = corpus
        corpus = session.query(Node).filter(Node.id == corpus_id).first()
        if corpus is None:
            print('NO SUCH CORPUS: #%d' % corpus_id)
            return

    # Instantiate status
    corpus.status('Workflow', progress=1)
    corpus.save_hyperdata()
    session.commit()

    # FIXME: 'Workflow' will still be uncomplete when 'Index' and 'Lists' will
    #        get stacked into hyperdata['statuses'], but doing corpus.status()
    #        will return only the 1st uncomplete action (corpus.status() doesn't
    #        understand "subactions")

    # apply actions
    print('CORPUS #%d' % (corpus.id))

    corpus.status('Docs', progress=1)
    corpus.save_hyperdata()
    session.commit()

    parse(corpus)
    docs = corpus.children("DOCUMENT").count()
    print('CORPUS #%d: parsed %d' % (corpus.id, docs))

    extract_ngrams(corpus)

    # Preparing Databse
    # Indexing
    #
    corpus.status('Index', progress=0)
    corpus.save_hyperdata()
    session.commit()

    print('CORPUS #%d: extracted ngrams' % (corpus.id))
    index_hyperdata(corpus)
    print('CORPUS #%d: indexed hyperdata' % (corpus.id))

    # -> 'favorites' node
    favs = corpus.add_child(typename='FAVORITES',
                            name='favorite docs in "%s"' % corpus.name)
    session.add(favs)
    session.commit()
    print('CORPUS #%d: [%s] new favorites node #%i' % (corpus.id, t(), favs.id))

    corpus.status('Index', progress=1, complete=True)
    corpus.save_hyperdata()
    session.commit()

    # -------------------------------
    # temporary ngram lists workflow
    # -------------------------------

    corpus.status('Lists', progress=0)
    corpus.save_hyperdata()
    session.commit()

    print('CORPUS #%d: [%s] starting ngram lists computation' % (corpus.id, t()))

    # -> stoplist: filter + write (to Node and NodeNgram)
    stop_id = do_stoplist(corpus)
    print('CORPUS #%d: [%s] new stoplist node #%i' % (corpus.id, t(), stop_id))

    # -> write groups to Node and NodeNgramNgram
    # NOTE(review): the fresh stoplist is computed just above but NOT passed
    # here (stoplist_id=None) — confirm whether stop_id was intended.
    group_id = compute_groups(corpus, stoplist_id=None)
    print('CORPUS #%d: [%s] new grouplist node #%i' % (corpus.id, t(),
                                                       group_id))

    # ------------
    # -> write occurrences to Node and NodeNodeNgram
    occ_id = compute_occs(corpus, groupings_id=group_id)
    print('CORPUS #%d: [%s] new occs node #%i' % (corpus.id, t(), occ_id))

    # -> write cumulated ti_ranking (tfidf ranking vector) to Node and NodeNodeNgram
    tirank_id = compute_ti_ranking(corpus,
                                   groupings_id=group_id,
                                   count_scope="global")
    print('CORPUS #%d: [%s] new ti ranking node #%i' % (corpus.id, t(),
                                                        tirank_id))

    # -> mainlist: filter + write (to Node and NodeNgram)
    mainlist_id = do_mainlist(corpus,
                              ranking_scores_id=tirank_id,
                              stoplist_id=stop_id)
    print('CORPUS #%d: [%s] new mainlist node #%i' % (corpus.id, t(),
                                                      mainlist_id))

    # -> write local tfidf similarities to Node and NodeNodeNgram
    ltfidf_id = compute_tfidf_local(corpus,
                                    on_list_id=mainlist_id,
                                    groupings_id=group_id)
    print('CORPUS #%d: [%s] new localtfidf node #%i' % (corpus.id, t(),
                                                        ltfidf_id))
    # => used for doc <=> ngram association

    # ------------
    # -> cooccurrences on mainlist: compute + write (=> Node and NodeNgramNgram)*
    coocs = compute_coocs(corpus,
                          on_list_id=mainlist_id,
                          groupings_id=group_id,
                          just_pass_result=True,
                          diagonal_filter=False)  # preserving the diagonal
                                                  # (useful for spec/gen)
    print('CORPUS #%d: [%s] computed mainlist coocs for specif rank'
          % (corpus.id, t()))

    # -> specclusion/genclusion: compute + write (2 Nodes + 2 lists in NodeNgram)
    (spec_id, gen_id) = compute_specgen(corpus, cooc_matrix=coocs)
    # no need here for subforms because cooc already counted them in mainform
    print('CORPUS #%d: [%s] new spec-clusion node #%i' % (corpus.id, t(),
                                                          spec_id))
    print('CORPUS #%d: [%s] new gen-clusion node #%i' % (corpus.id, t(),
                                                         gen_id))

    # maplist: compute + write (to Node and NodeNgram)
    map_id = do_maplist(corpus,
                        mainlist_id=mainlist_id,
                        specclusion_id=spec_id,
                        genclusion_id=gen_id,
                        grouplist_id=group_id)
    print('CORPUS #%d: [%s] new maplist node #%i' % (corpus.id, t(), map_id))

    print('CORPUS #%d: [%s] FINISHED ngram lists computation' % (corpus.id, t()))

    # NOTE(review): progress=0 together with complete=True looks inconsistent
    # with the 'Index' step above (progress=1, complete=True) — confirm.
    corpus.status('Lists', progress=0, complete=True)
    corpus.save_hyperdata()
    session.commit()

    if DEBUG is False:
        print('CORPUS #%d: [%s] FINISHED Sending email notification'
              % (corpus.id, t()))
        notify_owner(corpus)

    corpus.status('Workflow', progress=10, complete=True)
    corpus.save_hyperdata()
    session.commit()
def save(request, project_id, return_corpus=False):
    """Django view: launch an ISTEX scraping for the given project.

    Expects a POST with "query" (search string) and optionally "N" (sample
    size, capped at QUERY_SIZE_N_MAX).  Builds paginated ISTEX API urls,
    downloads them with a pool of worker threads, attaches each downloaded
    file as a resource of a new CORPUS node, then schedules the
    parse/extract/index workflow.  If `return_corpus` is True, returns the
    corpus Node instead of an HTTP response (used by internal callers).
    """
    print("testISTEX:")
    print(request.method)
    alist = ["bar", "foo"]

    # implicit global session
    # do we have a valid project id?
    try:
        project_id = int(project_id)
    except ValueError:
        raise Http404()

    # do we have a valid project?
    project = (session.query(Node).filter(Node.id == project_id).filter(
        Node.typename == 'PROJECT')).first()
    if project is None:
        raise Http404()

    # do we have a valid user?
    user = request.user
    # NOTE(review): callable is_authenticated() was removed in Django 1.10+
    # (became a property) — confirm the Django version pinned by the project.
    if not user.is_authenticated():
        return redirect('/auth/?next=%s' % request.path)
    if project.user_id != user.id:
        return HttpResponseForbidden()

    query_string = ""
    if request.method == "POST":
        query = "-"
        query_string = "-"

        #N = QUERY_SIZE_N_MAX
        if "query" in request.POST:
            query = request.POST["query"]
            query_string = query.replace(" ", "+")  # url encoded q

        # NOTE(review): if "N" is absent from the POST, N is never bound and
        # the uses below raise NameError — confirm callers always send it.
        if "N" in request.POST:
            if request.POST["N"] == "NaN":
                N = QUERY_SIZE_N_MAX
            else:
                N = int(request.POST["N"])  # query_size from views_opti

            if N > QUERY_SIZE_N_MAX:
                N = QUERY_SIZE_N_MAX
                #msg = "Invalid sample size N = %i (max = %i)" % (N, QUERY_SIZE_N_MAX)
                #print("ERROR (scrap: istex d/l ): ",msg)
                #raise ValueError(msg)

        print("Scrapping Istex: '%s' (%i)" % (query_string, N))

        # build one paginated API url per chunk of `pagesize` results
        urlreqs = []
        pagesize = 50
        tasks = Scraper()
        chunks = list(tasks.chunks(range(N), pagesize))
        for k in chunks:
            if (k[0] + pagesize) > N:
                pagesize = N - k[0]  # last page may be shorter
            urlreqs.append(
                "http://api.istex.fr/document/?q=" + query_string +
                "&output=id,corpusName,title,genre,language,doi,host,publicationDate,abstract,author&"
                + "from=" + str(k[0]) + "&size=" + str(pagesize))

        # corpus node instanciation as a Django model
        corpus = Node(name=query,
                      user_id=request.user.id,
                      parent_id=project_id,
                      typename='CORPUS',
                      hyperdata={
                          "action": "Scrapping data",
                          "language_id": None
                      })

        # worker pool consuming the url queue (daemon threads die with main)
        tasks = Scraper()
        for i in range(8):
            t = threading.Thread(target=tasks.worker2) #thing to do
            t.daemon = True  # thread dies when main thread (only non-daemon thread) exits.
            t.start()
        for url in urlreqs:
            tasks.q.put(url) #put a task in th queue
        tasks.q.join() # wait until everything is finished

        # attach every successfully downloaded file to the corpus
        dwnldsOK = 0
        for filename in tasks.firstResults:
            if filename != False:
                # add the uploaded resource to the corpus
                corpus.add_resource(
                    type=get_resource(RESOURCE_TYPE_ISTEX)["type"],
                    path=filename)
                dwnldsOK += 1

        session.add(corpus)
        session.commit()
        #corpus_id = corpus.id
        if dwnldsOK == 0:
            return JsonHttpResponse(["fail"])

        ###########################
        ###########################
        try:
            scheduled(parse_extract_indexhyperdata)(corpus.id)
        except Exception as error:
            print('WORKFLOW ERROR')
            print(error)
            try:
                print_tb(error.__traceback__)
            except:
                pass
            # IMPORTANT ---------------------------------
            # sanitize session after interrupted transact
            session.rollback()
            # --------------------------------------------
        if return_corpus:
            return corpus

        return render(
            template_name='pages/projects/wait.html',
            request=request,
            context={
                'user': request.user,
                'project': project,
            },
        )

    # NOTE(review): on a non-POST request `query` and `N` are unbound here,
    # so this tail raises NameError — presumably this view is POST-only.
    data = [query_string, query, N]
    print(data)
    return JsonHttpResponse(data)
def old_post(self, request, project_id):
    """Legacy corpus-creation/update handler.

    For methods "parse"/"scan"/"copy": creates a new CORPUS node and runs the
    corresponding "_<method>" action on it.  For any other method: looks up an
    existing CORPUS + RESOURCE pair and updates it.

    NOTE(review): this legacy code references several names that are never
    defined in this scope (`project`, `user`, `resource` in the first branch,
    and `request.user_id` instead of `request.user.id`) — it would raise
    NameError/AttributeError if reached; confirm it is dead code before reuse.
    """
    form = self._validate_form(request)
    #get params
    method = form["method"]
    if method in ["parse", "scan", "copy"]:
        #Le corpus et la resource n'existent pas
        # [HACK]
        # creation d'un corpus
        # NOTE(review): `project` is undefined here and `request.user_id`
        # looks wrong (probably request.user.id).
        corpus = Node(
            typename='CORPUS',
            user_id=request.user_id,
            parent_id=project.id,
            name=form["name"],
        )
        session.add(corpus)
        session.commit()
        # creation d'une resource
        try:
            if method == "parse":
                form["file"] = request.FILES['file']
            action = getattr(self, "_" + method)
            # toutes les actions sauf scan suppriment la resource?
            # et remontent l'info dans corpus
            if action(corpus, form):
                # transferer les infos resource dans le corpus
                # NOTE(review): `user` and `resource` are undefined in this
                # branch — the resource is created inside the action.
                documents = session.query(Node).filter(
                    Node.typename == "DOCUMENT",
                    Node.user_id == user.id,
                    Node.parent_id == corpus.id).all()
                response_data = {
                    "records": format_records(documents),
                    "resource": format_records([resource]),
                    "parent": format_parent(project),
                    "count": len(documents)
                }
                return Response(response_data, 200)
            else:
                raise APIException("Error with ", method)
        except Exception as e:
            raise APIException(e)
    else:
        #Le corpus existe et la resource doit être mise à jour
        corpus = session.query(Node).filter(
            Node.typename == "CORPUS",
            Node.parent_id == project.id,
            Node.name == form["corpus_name"]).first()
        source = get_resource(form["source"])
        if corpus is None:
            return Response("CORPUS not found", 404)
        #[HACK] one corpus one resource by Resourcetype_name
        resource = session.query(Node).filter(
            Node.typename == "RESOURCE",
            Node.parent_id == corpus.id,
            Node.corpus_name == form["corpus_name"],
            Node.name == source["name"]).first()
        action = getattr(self, "_" + method)
        if action(resource):
            # transferer les infos resource dans le corpus
            if method == "fetch":
                # fold the fetched resource into the corpus then delete it
                corpus.sources[resource["name"]].append(resource)
                session.delete(resource)
                session.add(corpus)
                session.commit()
            else:
                session.add(resource)
                session.commit()
            return Response({"log": "Created", "uids": [corpus.id]}, 200)
        else:
            # action failed: roll back both nodes
            session.delete(resource)
            session.delete(corpus)
            session.commit()
            return Response({"log": method + ": Error"}, 500)