def index_hyperdata(corpus):
    """Bulk-index every hyperdata key/value row for the given corpus node."""
    columns = ('node_id', 'key', 'value_int', 'value_flt',
               'value_utc', 'value_str', 'value_txt')
    bulk_insert(
        table=NodeHyperdata,
        fields=columns,
        data=_nodes_hyperdata_generator(corpus),
    )
def put(self, request, corpus_id, check_each_doc=True):
    """
    Add documents (by id) to the favorites node of a corpus.

    Expects a 'docs' URL parameter: comma-separated document node ids.

    @param request: the HTTP request (must be authenticated)
    @param corpus_id: id of the corpus whose favorites node receives the docs
    @param check_each_doc: when True, verify each id really is a DOCUMENT
                           node belonging to this corpus before inserting
    Returns a JsonHttpResponse with a 'count_added' field (counts the input
    ids, not the rows actually inserted — see todo below).
    Raises ValidationException when check_each_doc rejects some ids.
    """
    # can't use @requires_auth because of positional 'self' within class
    if not request.user.is_authenticated():
        return HttpResponse('Unauthorized', status=401)
    # user is ok
    fav_node = self._get_fav_node(corpus_id)
    response = {}
    if fav_node is None:  # FIX: identity test instead of '== None'
        # NOTE(review): uses self.corpus.name here but corpus_id elsewhere —
        # confirm self.corpus is always set when this branch is reached
        response = {
            'warning': 'No favorites node is defined for this corpus (\'%s\')'
                       % self.corpus.name,
            'count_added': 0
        }
    else:
        req_params = validate(get_parameters(request),
                              {'docs': list, 'default': ""})
        # FIX: skip empty fragments so an empty/default 'docs' param
        #      no longer crashes on int('')
        nodeids_to_add = [int(did)
                          for did in req_params['docs'].split(',')
                          if did.strip()]
        if check_each_doc:
            # verify these really are DOCUMENT nodes of the right corpus
            # (a bit slow => maybe disable by default?)
            known_docs_q = (session.query(Node.id)
                            .filter(Node.parent_id == corpus_id)
                            .filter(Node.typename == 'DOCUMENT'))
            # FIX: a set is the natural structure for membership tests
            #      (was a dict mapping every id to True)
            lookup = {known_doc.id for known_doc in known_docs_q.all()}
            rejected_list = [doc_node_id for doc_node_id in nodeids_to_add
                             if doc_node_id not in lookup]
            if rejected_list:
                raise ValidationException(
                    "Error on some requested docs: %s (Only nodes of type 'doc' AND belonging to corpus %i can be added to favorites.)"
                    % (str(rejected_list), int(corpus_id)))
        # add them
        bulk_insert(NodeNode, ('node1_id', 'node2_id', 'score'),
                    ((fav_node.id, doc_node_id, 1.0)
                     for doc_node_id in nodeids_to_add))
        # todo count really added (here: counts input param not result)
        response = {'count_added': len(nodeids_to_add)}
    return JsonHttpResponse(response)
def save(self, node_id=None):
    """
    Persist this weighted list as NodeNgram rows under node_id,
    replacing any rows previously stored for that node.

    @param node_id: target node id; defaults to self.id when present.
    Raises ValueError when no id can be determined.
    """
    from gargantext.models import NodeNgram
    if node_id is None:
        if not hasattr(self, 'id'):
            raise ValueError('Please mention an ID to save the node.')
        node_id = self.id
    # replace-all semantics: wipe the previous rows for this node first
    session.query(NodeNgram).filter(NodeNgram.node_id == node_id).delete()
    session.commit()
    # then insert the fresh data, weight fixed at 1.0 per item
    rows = ((node_id, key, 1.0) for key in self.items)
    bulk_insert(NodeNgram, ('node_id', 'ngram_id', 'weight'), rows)
def save(self, node_id=None):
    """
    Persist this weighted matrix as NodeNgramNgram rows under node_id,
    replacing any rows previously stored for that node.

    @param node_id: target node id; defaults to self.id when present.
    Raises ValueError when no id can be determined.
    """
    from gargantext.models import NodeNgramNgram
    if node_id is None:
        if hasattr(self, 'id'):
            node_id = self.id
        else:
            raise ValueError('Please mention an ID to save the node.')
    # replace-all semantics: delete previous data for this node
    session.query(NodeNgramNgram).filter(
        NodeNgramNgram.node_id == node_id).delete()
    session.commit()
    # insert new data
    # FIX: dropped leftover debug prints ("WeightedMatrix bulk_insert
    #      start/stop") for consistency with the list-flavoured save()
    bulk_insert(NodeNgramNgram,
                ('node_id', 'ngram1_id', 'ngram2_id', 'weight'),
                ((node_id, key1, key2, value)
                 for key1, key2, value in self))
def put(self, request):
    """
    Add some group elements to a group node
    => adds new couples from GroupsBuffer._to_add of terms view

    TODO see use of util.lists.Translations

    Parameters are all in the url (for symmetry with DELETE method)
       api/ngramlists/groups?node=783&1228[]=891,1639
           => creates 1228 - 891
                  and 1228 - 1639

    general format is:   mainform_id[]=subform_id1,subform_id2 etc
           => creates mainform_id - subform_id1
                  and mainform_id - subform_id2

    NB: also checks if the couples exist before because the ngram table
        will send the entire group (old existing links + new links)
    """
    # all parameters come from the url
    params = get_parameters(request)
    # 'node' is the unique target group node...
    group_node = params.pop('node')
    # ...every remaining param describes links to change
    couples = self.links_to_couples(params)

    # local version of "insert if not exists" -------------------->8--------
    # (1) fetch the couples already stored for this group node
    check_query = (session.query(NodeNgramNgram)
                   .filter(NodeNgramNgram.node_id == group_node)
                   .filter(tuple_(NodeNgramNgram.ngram1_id,
                                  NodeNgramNgram.ngram2_id).in_(couples)))
    existing = {(row.ngram1_id, row.ngram2_id): True
                for row in check_query.all()}

    # (2) keep only the couples not already present
    couples_to_add = [(mform, sform)
                      for (mform, sform) in couples
                      if (mform, sform) not in existing]

    # (3) insert the new groupings, weight fixed at 1.0
    bulk_insert(NodeNgramNgram,
                ('node_id', 'ngram1_id', 'ngram2_id', 'weight'),
                ((group_node, mainform, subform, 1.0)
                 for (mainform, subform) in couples_to_add))
    # ------------------------------------------------------------>8--------

    return JsonHttpResponse({
        'count_added': len(couples_to_add),
    }, 200)
def index_new_ngrams(ngram_ids, corpus, keys=(
        'title',
        'abstract',
)):
    """
    Find occurrences of some ngrams for every document of the given corpus
    and insert them in the NodeNgram table.

    @param ngram_ids: a list of ids for Ngram objects
                      (we assume they already went through normalizations
                       and were already added to the Ngrams table and
                       optionally to some of the lists like MAPLIST)
                      (but we can't know if they were previously indexed
                       in the corpus => hence the already_indexed filter)
    @param corpus: the CORPUS node
    @param keys: the hyperdata fields to index

    Returns the number of NodeNgram rows actually added.

    # FIXME too slow: index_new_ngrams should be faster via tsvector on DB
    """
    import re

    # retrieve *all* the ngrams from our list
    # (even if some relations may be already indexed
    #  b/c they were perhaps not extracted in all docs
    #  => we'll use already_indexed later)
    todo_ngrams = session.query(Ngram).filter(Ngram.id.in_(ngram_ids)).all()

    # FIX: compile each pattern once instead of rebuilding it per doc per key,
    #      and escape the term so regex metacharacters in it (e.g. "c++",
    #      "a.b") can no longer crash findall or distort the match
    compiled_patterns = [
        (ngram.id, re.compile(r'\b%s\b' % re.escape(ngram.terms),
                              re.IGNORECASE))
        for ngram in todo_ngrams
    ]

    # result dict: {doc_id: {ngram_id: occurrence_count}}
    node_ngram_to_write = {}

    # loop through the docs and their text fields
    for (i, doc) in enumerate(corpus.children('DOCUMENT')):
        if (i % 100 == 0):
            # progress logging every 100 docs
            print('CORPUS #%d: [%s] ngrams_addition: doc %i'
                  % (corpus.id, t(), i))
            print()

        # a new empty counting subdict
        node_ngram_to_write[doc.id] = {}
        for key in keys:
            # a text field
            text = doc.hyperdata.get(key, None)
            if not isinstance(text, str):
                # print("WARN: doc %i has no text in field %s" % (doc.id, key))
                continue
            counts = node_ngram_to_write[doc.id]
            for (ngram_id, pattern) in compiled_patterns:
                n_occs = len(pattern.findall(text))
                if n_occs > 0:
                    # accumulate across the doc's text fields
                    counts[ngram_id] = counts.get(ngram_id, 0) + n_occs

    # check the relations we won't insert (those that were already indexed)
    # NB costly but currently impossible with bulk_insert_ifnotexists
    #    b/c double uniquekey
    already_indexed = (session.query(NodeNgram.node_id, NodeNgram.ngram_id)
                       .join(Node, Node.id == NodeNgram.node_id)
                       .filter(Node.parent_id == corpus.id)
                       .filter(Node.typename == 'DOCUMENT').all())
    filter_out = {(nd_id, ng_id) for (nd_id, ng_id) in already_indexed}
    # POSSIBLE update those that are filtered out if wei_previous != wei

    # integrate all at the end
    my_new_rows = []
    for doc_id, ngram_counts in node_ngram_to_write.items():
        for ngram_id, wei in ngram_counts.items():
            if (doc_id, ngram_id) not in filter_out:
                my_new_rows.append([doc_id, ngram_id, wei])
    del node_ngram_to_write

    bulk_insert(table=NodeNgram,
                fields=('node_id', 'ngram_id', 'weight'),
                data=my_new_rows)
    # bulk_insert_ifnotexists(
    #     model = NodeNgram,
    #     uniquekey = ('node_id','ngram_id'),   <= currently impossible
    #     fields = ('node_id', 'ngram_id', 'weight'),
    #     data = my_new_rows
    # )

    n_added = len(my_new_rows)
    print("index_new_ngrams: added %i new NodeNgram rows" % n_added)
    return n_added
def compute_tfidf_local(corpus, on_list_id=None, groupings_id=None, overwrite_id=None): """ Calculates tfidf similarity of each (doc, ngram) couple, within the current corpus Parameters: - the corpus itself - groupings_id: optional synonym relations to add all subform counts with their mainform's counts - on_list_id: mainlist or maplist type, to constrain the input ngrams - overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this corpus (the Node and its previous NodeNodeNgram rows will be replaced) """ # All docs of this corpus docids_subquery = (session.query( Node.id).filter(Node.parent_id == corpus.id).filter( Node.typename == "DOCUMENT").subquery()) # N total_docs = session.query(docids_subquery).count() # define the counted form if not groupings_id: ngform_id = NodeNgram.ngram_id else: Syno = (session.query( NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id).filter( NodeNgramNgram.node_id == groupings_id).subquery()) ngform_id = case([(Syno.c.ngram1_id != None, Syno.c.ngram1_id), (Syno.c.ngram1_id == None, NodeNgram.ngram_id)]) # tf for each couple (number of rows = N docs X M ngrams) tf_doc_query = ( session.query( ngform_id, NodeNgram.node_id, func.sum(NodeNgram.weight).label("tf"), # tf: occurrences ) # select within docs of current corpus .join(docids_subquery, docids_subquery.c.id == NodeNgram.node_id)) if groupings_id: tf_doc_query = (tf_doc_query.outerjoin( Syno, Syno.c.ngram2_id == NodeNgram.ngram_id)) # now when we'll group_by the ngram2 freqs will be added to ngram1 if on_list_id: Miamlist = aliased(NodeNgram) tf_doc_query = (tf_doc_query.join( Miamlist, Miamlist.ngram_id == ngform_id).filter( Miamlist.node_id == on_list_id)) # execute query to do our tf sum tf_per_doc = tf_doc_query.group_by(NodeNgram.node_id, ngform_id).all() # ex: [(128371, 9732, 1.0), # (128383, 9740, 1.0), # (128373, 9731, 1.0), # (128376, 9734, 1.0), # (128372, 9731, 1.0), # (128383, 9733, 1.0), # (128383, 9735, 1.0), # (128389, 9734, 1.0), # (8624, 9731, 1.0), # 
(128382, 9740, 1.0), # (128383, 9739, 1.0), # (128383, 9736, 1.0), # (128378, 9735, 1.0), # (128375, 9733, 4.0), # (128383, 9732, 1.0)] # ^ ^ ^^ ^^ # ngram doc freq in this doc # simultaneously count docs with given term (number of rows = M ngrams) ndocswithngram = {} for triple in tf_per_doc: ng = triple[0] doc = triple[1] if ng in ndocswithngram: ndocswithngram[ng] += 1 else: ndocswithngram[ng] = 1 # print(ndocswithngram) # store for use in formula # { ngram_id => log(nd) } log_nd_lookup = { ng: log(nd_count) for (ng, nd_count) in ndocswithngram.items() } # --------------------------------------------------------- tfidfs = {} log_tot_docs = log(total_docs) for (ngram_id, node_id, tf) in tf_per_doc: log_nd = log_nd_lookup[ngram_id] # tfidfs[ngram_id] = tf * log(total_docs/nd) tfidfs[node_id, ngram_id] = tf * (log_tot_docs - log_nd) # --------------------------------------------------------- if overwrite_id: the_id = overwrite_id session.query(NodeNodeNgram).filter( NodeNodeNgram.node1_id == the_id).delete() session.commit() else: # create the new TFIDF-CORPUS node tfidf_node = corpus.add_child() tfidf_node.typename = "TFIDF-CORPUS" tfidf_node.name = "tfidf-sims-corpus (in:%s)" % corpus.id session.add(tfidf_node) session.commit() the_id = tfidf_node.id # reflect that in NodeNodeNgrams # £TODO replace bulk_insert by something like WeightedIndex.save() bulk_insert(NodeNodeNgram, ('node1_id', 'node2_id', 'ngram_id', 'score'), ((the_id, node_id, ngram_id, tfidfs[node_id, ngram_id]) for (node_id, ngram_id) in tfidfs)) return the_id
def compute_occs(
        corpus,
        overwrite_id=None,
        groupings_id=None,
):
    """
    Calculates sum of occs per ngram (or per mainform if groups) within
    corpus (used as info in the ngrams table view).

    ? optimize ?  OCCS here could be calculated simultaneously within
                  TFIDF-CORPUS loop
    ? use cases ? => not the main score for users (their intuition is the
                     nb of docs having the word)
                  => but is the main weighting value for any NLP task

    Parameters:
        - overwrite_id: optional id of a pre-existing OCCURRENCES node for
                        this corpus (the Node and its previous NodeNodeNgram
                        rows will be replaced)
        - groupings_id: optional id of a GROUPLIST node for these ngrams
            IF absent the occurrences are the sums for each ngram
            IF present they're the sums for each ngram's mainform

    Returns the id of the node under which the sums were stored.
    """
    # simple case : no groups
    # ---------
    # (the occurrences are the sums for each ngram)
    if not groupings_id:
        # NodeNgram index
        occs_q = (
            session.query(
                NodeNgram.ngram_id,
                func.sum(NodeNgram.weight)  # <== OCCURRENCES
            )
            # filter docs within corpus
            .join(Node).filter(Node.parent_id == corpus.id).filter(
                Node.typename == "DOCUMENT")

            # for the sum
            .group_by(NodeNgram.ngram_id))

    # difficult case: with groups
    # ------------
    # (the occurrences are the sums for each ngram's mainform)
    else:
        # sub-SELECT the synonyms of this GROUPLIST id (for OUTER JOIN later)
        syn = (session.query(
            NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id).filter(
                NodeNgramNgram.node_id == groupings_id).subquery())

        # NodeNgram index with additional subform => mainform replacement
        occs_q = (
            session.query(
                # ngram to count aka counted_form:
                # either NodeNgram.ngram_id as before,
                # or the mainform (syn.ngram1) if the outer join found one
                case([(syn.c.ngram1_id != None, syn.c.ngram1_id)],
                     else_=NodeNgram.ngram_id).label("counted_form"),

                # the sum itself
                # --------------
                func.sum(NodeNgram.weight)  # <== OCCURRENCES
            )
            # this brings the mainform if NodeNgram.ngram_id has one in syn
            .outerjoin(syn, syn.c.ngram2_id == NodeNgram.ngram_id)

            # filter docs within corpus
            .join(Node).filter(Node.parent_id == corpus.id).filter(
                Node.typename == "DOCUMENT")

            # for the sum
            .group_by("counted_form"))

    occ_sums = occs_q.all()
    # example result = [(1970, 1.0), (2024, 2.0), (259, 2.0), ... ]
    #                     ^^^^        ^^^
    #                   ngram_id    sum_wei
    #                      OR
    #                  counted_form

    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
        session.query(NodeNodeNgram).filter(
            NodeNodeNgram.node1_id == the_id).delete()
        session.commit()
    else:
        # create the new OCCURRENCES node
        occnode = corpus.add_child(typename="OCCURRENCES",
                                   name="occ_sums (in:%s)" % corpus.id)
        session.add(occnode)
        session.commit()
        the_id = occnode.id

    # £TODO make it NodeNgram instead NodeNodeNgram ! and rebase :/
    #       (idem ti_ranking)
    bulk_insert(NodeNodeNgram, ('node1_id', 'node2_id', 'ngram_id', 'score'),
                ((the_id, corpus.id, res[0], res[1]) for res in occ_sums))

    return the_id
def compute_ti_ranking(corpus,
                       groupings_id=None,
                       count_scope="local",
                       termset_scope="local",
                       overwrite_id=None):
    """
    Calculates tfidf ranking within given scope
                                ----------
                                   |
            via weighting of       |
             cumulated tfidf  ---------
                 Sum{i}(tf_ij) * ln(N/|U{i}(docs{mot€d})|)

        per ngram ng_i (or per mainform ng_i' if groups)
        across some docs d_j

    Parameters:
      - the corpus itself (or corpus_id)
      - groupings_id: optional id of a GROUPLIST node for these ngrams
            IF absent the ti weights are the sums for each ngram
            IF present they're the sums for each ngram's mainform
      - count_scope: {"local" or "global"}
         - local  <=> frequencies counted in the current corpus
         - global <=> frequencies counted in all corpora of this type
        when the count_scope is global, there is another parameter:
          - termset_scope: {"local" or "global"}
             - local  <=> output list of terms limited to the current corpus
               (SELECT DISTINCT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
             - global <=> output list of terms found in global doc scope
                                                !!!! (many more terms)
      - overwrite_id: optional id of a pre-existing XXXX node for this
        corpus (the Node and its previous Node NodeNgram rows will be
        replaced)

    Returns the id of the node under which the scores were stored.
    Raises ValueError on invalid scope params or corpus argument.
    """
    # validate string params
    if count_scope not in ["local", "global"]:
        raise ValueError(
            "compute_ti_ranking: count_scope param allowed values: 'local', 'global'"
        )
    if termset_scope not in ["local", "global"]:
        raise ValueError(
            "compute_ti_ranking: termset_scope param allowed values: 'local', 'global'"
        )
    if count_scope == "local" and termset_scope == "global":
        raise ValueError(
            "compute_ti_ranking: the termset_scope param can be 'global' iff count_scope param is 'global' too."
        )

    # get corpus: accepts a Node, an int id, or a numeric string id
    if type(corpus) == int:
        corpus_id = corpus
        corpus = cache.Node[corpus_id]
    elif type(corpus) == str and match(r'\d+$', corpus):
        corpus_id = int(corpus)
        corpus = cache.Node[corpus_id]
    else:
        # assuming Node class
        corpus_id = corpus.id

    # prepare sqla mainform vs ngram selector
    ngform_i = None

    if not groupings_id:
        ngform_i = NodeNgram.ngram_id
    else:
        # prepare translations (subform ngram2 -> mainform ngram1)
        syno = (session.query(
            NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id).filter(
                NodeNgramNgram.node_id == groupings_id).subquery())

        # cf detailed comment in compute_occs() + todo refactor
        ngform_i = case([(syno.c.ngram1_id != None, syno.c.ngram1_id),
                         (syno.c.ngram1_id == None, NodeNgram.ngram_id)
                         # condition    value
                         ])

    # MAIN QUERY SKELETON: one row per counted form, with its tf and nd;
    # the joins restricting docs/terms are attached below depending on scope
    tf_nd_query = (
        session.query(
            # NodeNgram.ngram_id
            # or similar if grouping ngrams under their mainform
            ngform_i.label("counted_ngform"),

            # the tfidf elements
            # ------------------
            func.sum(NodeNgram.weight),  # tf: same as occurrences
            # -----------------------
            func.count(NodeNgram.node_id)  # nd: n docs with term
            # --------------------
        ).group_by("counted_ngform")

        # count_scope to specify in which doc nodes to count
        # -----------
        # .join(countdocs_subquery,
        #       countdocs_subquery.c.id == NodeNgram.node_id)

        # optional termset_scope: if we'll restrict the ngrams
        # -------------
        # .join(termset_subquery,
        #       termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)

        # optional translations to bring the subform's replacement
        # ------------
        # .outerjoin(syno,
        #            syno.c.ngram2_id == NodeNgram.ngram_id)
    )

    # TUNING THE QUERY
    if groupings_id:
        tf_nd_query = tf_nd_query.outerjoin(
            syno, syno.c.ngram2_id == NodeNgram.ngram_id)

    # local <=> within this corpus
    if count_scope == "local":
        # All docs of this corpus
        countdocs_subquery = (session.query(
            Node.id).filter(Node.typename == "DOCUMENT").filter(
                Node.parent_id == corpus_id).subquery())

        # no need to independantly restrict the ngrams
        tf_nd_query = tf_nd_query.join(
            countdocs_subquery, countdocs_subquery.c.id == NodeNgram.node_id)
        # ---

    # global <=> within all corpora of this source
    elif count_scope == "global":
        this_source_type = corpus.resources()[0]['type']

        CorpusNode = aliased(Node)

        # All docs **in all corpora of the same source**
        countdocs_subquery = (
            session.query(Node.id).filter(Node.typename == "DOCUMENT")

            # join on parent_id with selected corpora nodes
            .join(CorpusNode, CorpusNode.id == Node.parent_id).filter(
                CorpusNode.typename == "CORPUS")
            # TODO index corpus_sourcetype in DB
            .filter(CorpusNode.hyperdata['resources'][0]['type'].astext == str(
                this_source_type)).subquery())

        if termset_scope == "global":
            # both scopes are the same: no need to independantly restrict
            # the ngrams
            tf_nd_query = tf_nd_query.join(
                countdocs_subquery,
                countdocs_subquery.c.id == NodeNgram.node_id)
            # ---

        elif termset_scope == "local":
            # All unique terms...
            termset_subquery = (
                session.query(distinct(NodeNgram.ngram_id).label("uniq_ngid"))

                # ... in the original corpus
                .join(Node).filter(Node.typename == "DOCUMENT").filter(
                    Node.parent_id == corpus_id).subquery())

            # only case of independant restrictions on docs and terms
            tf_nd_query = (tf_nd_query.join(
                countdocs_subquery,
                countdocs_subquery.c.id == NodeNgram.node_id).join(
                    termset_subquery,
                    termset_subquery.c.uniq_ngid == NodeNgram.ngram_id))
            # ---

    # M: total number of docs in the counting scope
    total_docs = session.query(countdocs_subquery).count()
    log_tot_docs = log(total_docs)

    # result
    tf_nd = tf_nd_query.all()

    # -------------- "sommatoire" over each word i ----------------
    tfidfsum = {}
    for (ngram_i, tf_i, nd_i) in tf_nd:
        # tfidfsum[ngram_i] = tf_i * log(total_docs/nd_i),
        # computed as a log difference
        tfidfsum[ngram_i] = tf_i * (log_tot_docs - log(nd_i))
    # ------------------------------------------------------

    # N for info
    total_ngramforms = len(tfidfsum)

    if overwrite_id:
        # reuse the pre-existing result node, wiping its previous scores
        the_id = overwrite_id
        session.query(NodeNodeNgram).filter(
            NodeNodeNgram.node1_id == the_id).delete()
        session.commit()
    else:
        # create the new TFIDF-XXXX node to get an id
        tir_nd = corpus.add_child()
        if count_scope == "local":
            tir_nd.typename = "TIRANK-CORPUS"
            tir_nd.name = "ti rank (%i ngforms in corpus:%s)" % (
                total_ngramforms, corpus_id)
        elif count_scope == "global":
            tir_nd.typename = "TIRANK-GLOBAL"
            tir_nd.name = "ti rank (%i ngforms %s in corpora of sourcetype:%s)" % (
                total_ngramforms,
                ("from corpus %i" % corpus_id) if
                (termset_scope == "local") else "", this_source_type)

        session.add(tir_nd)
        session.commit()
        the_id = tir_nd.id

    # TODO 1 discuss use and find new typename
    # TODO 2 release these 2 typenames TFIDF-CORPUS and TFIDF-GLOBAL
    # TODO 3 recreate them elsewhere in their sims (WeightedIndex) version
    # TODO 4 requalify this here as a NodeNgram
    # then TODO 5 use WeightedList.save() !

    # reflect that in NodeNodeNgrams
    bulk_insert(NodeNodeNgram, ('node1_id', 'node2_id', 'ngram_id', 'score'),
                ((the_id, corpus_id, ng, tfidfsum[ng]) for ng in tfidfsum))

    return the_id