def get(self, request, corpus_id):
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    parameters = get_parameters(request)
    parameters = validate(parameters, {'score': str, 'ngram_ids': list})

    try:
        ngram_ids = [int(n) for n in parameters['ngram_ids'].split(',')]
    except (KeyError, ValueError, AttributeError):
        # missing param, non-integer item, or unexpected type
        raise ValidationException(
            '"ngram_ids" needs integers separated by comma.')

    limit = DEFAULT_N_DOCS_HAVING_NGRAM
    nodes_list = []

    corpus = session.query(Node).filter(Node.id == corpus_id).first()

    tfidf_id = (session.query(Node.id)
                       .filter(Node.typename == "TFIDF-CORPUS",
                               Node.parent_id == corpus.id)
                       .first())
    tfidf_id = tfidf_id[0]

    # request data
    nodes_query = (session
                   .query(Node, func.sum(NodeNodeNgram.score))
                   .join(NodeNodeNgram, NodeNodeNgram.node2_id == Node.id)
                   .filter(NodeNodeNgram.node1_id == tfidf_id)
                   .filter(Node.typename == 'DOCUMENT',
                           Node.parent_id == corpus.id)
                   .filter(or_(*[NodeNodeNgram.ngram_id == ngram_id
                                 for ngram_id in ngram_ids]))
                   .group_by(Node))

    # get the total count before applying limit
    nodes_count = nodes_query.count()

    # now the query with the limit
    nodes_results_query = (nodes_query
                           .order_by(func.sum(NodeNodeNgram.score).desc())
                           .limit(limit))

    for node, score in nodes_results_query:
        node_dict = {
            'id': node.id,
            'score': score,
        }
        for key in ('title', 'publication_date', 'source',
                    'authors', 'fields'):
            if key in node.hyperdata:
                node_dict[key] = node.hyperdata[key]
        nodes_list.append(node_dict)

    return JsonHttpResponse({'count': nodes_count, 'records': nodes_list})
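
# Illustrative request/response for the view above. The URL path and all ids
# are hypothetical; only the query-string keys and the JSON shape come from
# the code:
#
#   GET .../<corpus_id>?score=tfidf&ngram_ids=9731,9733
#
#   {"count": 2,
#    "records": [{"id": 128375, "score": 4.0,
#                 "title": "...", "publication_date": "...",
#                 "source": "...", "authors": "..."},
#                ...]}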
def get(self, request):
    """
    Used for analytics
    ------------------
    Get ngram listing + counts in a given scope
    """
    # query ngrams
    ngrams_query = (session
                    .query(Ngram.id, Ngram.terms,
                           func.sum(NodeNgram.weight).label('count'))
                    .join(NodeNgram, NodeNgram.ngram_id == Ngram.id)
                    .join(Node, Node.id == NodeNgram.node_id)
                    .group_by(Ngram.id, Ngram.terms)
                    .order_by(func.sum(NodeNgram.weight).desc(), Ngram.terms))

    # filters
    if 'startwith' in request.GET:
        ngrams_query = ngrams_query.filter(
            Ngram.terms.startswith(request.GET['startwith']))
    if 'contain' in request.GET:
        ngrams_query = ngrams_query.filter(
            Ngram.terms.contains(request.GET['contain']))
    if 'corpus_id' in request.GET:
        corpus_id_list = list(map(int, request.GET.get('corpus_id', '').split(',')))
        if corpus_id_list and corpus_id_list[0]:
            ngrams_query = ngrams_query.filter(Node.parent_id.in_(corpus_id_list))
    if 'ngram_id' in request.GET:
        ngram_id_list = list(map(int, request.GET.get('ngram_id', '').split(',')))
        if ngram_id_list and ngram_id_list[0]:
            ngrams_query = ngrams_query.filter(Ngram.id.in_(ngram_id_list))

    # pagination
    offset = int(request.GET.get('offset', 0))
    limit = int(request.GET.get('limit', 20))
    total = ngrams_query.count()

    # return formatted result
    return JsonHttpResponse({
        'pagination': {
            'offset': offset,
            'limit': limit,
            'total': total,
        },
        'data': [
            {
                'id': ngram.id,
                'terms': ngram.terms,
                'count': ngram.count,
            }
            for ngram in ngrams_query[offset:offset + limit]
        ],
    })
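
# Illustrative call for the listing above (hypothetical path, ids and counts;
# the parameter names and the JSON envelope come from the code):
#
#   GET ...?corpus_id=52633&startwith=bee&offset=0&limit=2
#
#   {"pagination": {"offset": 0, "limit": 2, "total": 37},
#    "data": [{"id": 9731, "terms": "bee",        "count": 42.0},
#             {"id": 9733, "terms": "bee colony", "count": 17.0}]}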
def compute_tfidf_local(corpus, on_list_id=None, groupings_id=None,
                        overwrite_id=None):
    """
    Calculates tfidf similarity of each (doc, ngram) couple,
    within the current corpus

    Parameters:
      - the corpus itself
      - groupings_id: optional synonym relations to add all subform counts
                      with their mainform's counts
      - on_list_id: mainlist or maplist type, to constrain the input ngrams
      - overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this
                      corpus (the Node and its previous NodeNodeNgram rows
                      will be replaced)
    """
    # All docs of this corpus
    docids_subquery = (session.query(Node.id)
                              .filter(Node.parent_id == corpus.id)
                              .filter(Node.typename == "DOCUMENT")
                              .subquery())

    # N
    total_docs = session.query(docids_subquery).count()

    # define the counted form
    if not groupings_id:
        ngform_id = NodeNgram.ngram_id
    else:
        Syno = (session.query(NodeNgramNgram.ngram1_id,
                              NodeNgramNgram.ngram2_id)
                       .filter(NodeNgramNgram.node_id == groupings_id)
                       .subquery())
        ngform_id = case([(Syno.c.ngram1_id != None, Syno.c.ngram1_id),
                          (Syno.c.ngram1_id == None, NodeNgram.ngram_id)])

    # tf for each couple (number of rows = N docs X M ngrams)
    tf_doc_query = (session
                    .query(
                        ngform_id,
                        NodeNgram.node_id,
                        func.sum(NodeNgram.weight).label("tf"),  # tf: occurrences
                    )
                    # select within docs of current corpus
                    .join(docids_subquery,
                          docids_subquery.c.id == NodeNgram.node_id))

    if groupings_id:
        tf_doc_query = (tf_doc_query
                        .outerjoin(Syno, Syno.c.ngram2_id == NodeNgram.ngram_id))
        # now when we'll group_by, the ngram2 freqs will be added to ngram1

    if on_list_id:
        Miamlist = aliased(NodeNgram)
        tf_doc_query = (tf_doc_query
                        .join(Miamlist, Miamlist.ngram_id == ngform_id)
                        .filter(Miamlist.node_id == on_list_id))

    # execute query to do our tf sum
    tf_per_doc = tf_doc_query.group_by(NodeNgram.node_id, ngform_id).all()

    # ex: [(128371, 9732, 1.0),
    #      (128383, 9740, 1.0),
    #      (128373, 9731, 1.0),
    #      (128376, 9734, 1.0),
    #      (128372, 9731, 1.0),
    #      (128383, 9733, 1.0),
    #      (128383, 9735, 1.0),
    #      (128389, 9734, 1.0),
    #      (8624,   9731, 1.0),
    #      (128382, 9740, 1.0),
    #      (128383, 9739, 1.0),
    #      (128383, 9736, 1.0),
    #      (128378, 9735, 1.0),
    #      (128375, 9733, 4.0),
    #      (128383, 9732, 1.0)]
    #        ^^^     ^^    ^^
    #      ngram    doc   freq in this doc

    # simultaneously count docs with given term (number of rows = M ngrams)
    ndocswithngram = {}
    for (ng, doc, freq) in tf_per_doc:
        if ng in ndocswithngram:
            ndocswithngram[ng] += 1
        else:
            ndocswithngram[ng] = 1

    # store for use in formula
    # { ngram_id => log(nd) }
    log_nd_lookup = {ng: log(nd_count)
                     for (ng, nd_count) in ndocswithngram.items()}

    # ---------------------------------------------------------
    tfidfs = {}
    log_tot_docs = log(total_docs)
    for (ngram_id, node_id, tf) in tf_per_doc:
        log_nd = log_nd_lookup[ngram_id]
        # tfidf = tf * log(total_docs / nd)
        tfidfs[node_id, ngram_id] = tf * (log_tot_docs - log_nd)
    # ---------------------------------------------------------

    if overwrite_id:
        the_id = overwrite_id
        session.query(NodeNodeNgram).filter(
            NodeNodeNgram.node1_id == the_id).delete()
        session.commit()
    else:
        # create the new TFIDF-CORPUS node
        tfidf_node = corpus.add_child()
        tfidf_node.typename = "TFIDF-CORPUS"
        tfidf_node.name = "tfidf-sims-corpus (in:%s)" % corpus.id
        session.add(tfidf_node)
        session.commit()
        the_id = tfidf_node.id

    # reflect that in NodeNodeNgrams
    # £TODO replace bulk_insert by something like WeightedIndex.save()
    bulk_insert(NodeNodeNgram,
                ('node1_id', 'node2_id', 'ngram_id', 'score'),
                ((the_id, node_id, ngram_id, tfidfs[node_id, ngram_id])
                 for (node_id, ngram_id) in tfidfs))

    return the_id
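
# A minimal, DB-free sketch of the same tf.idf arithmetic as in
# compute_tfidf_local() above, handy for checking the formula by hand.
# The helper name and the sample triples are invented for illustration only.
from math import log

def _tfidf_from_triples(tf_per_doc, total_docs):
    """tf_per_doc: [(ngram_id, node_id, tf), ...] as returned by the tf query."""
    n_docs_with_ngram = {}                        # ngram_id => nd
    for ngram_id, _node_id, _tf in tf_per_doc:
        n_docs_with_ngram[ngram_id] = n_docs_with_ngram.get(ngram_id, 0) + 1
    log_n = log(total_docs)
    # tfidf = tf * log(total_docs / nd), keyed like the tfidfs dict above
    return {(node_id, ngram_id): tf * (log_n - log(n_docs_with_ngram[ngram_id]))
            for ngram_id, node_id, tf in tf_per_doc}

# ex: _tfidf_from_triples([(9731, 128372, 1.0), (9731, 128373, 1.0),
#                          (9733, 128375, 4.0)], total_docs=10)
# => {(128372, 9731): log(10/2), (128373, 9731): log(10/2),
#     (128375, 9733): 4.0 * log(10/1)}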
def compute_occs(corpus, overwrite_id=None, groupings_id=None):
    """
    Calculates sum of occs per ngram (or per mainform if groups) within corpus
    (used as info in the ngrams table view)

    ? optimize ? OCCS here could be calculated simultaneously within TFIDF-CORPUS loop

    ? use cases ?
      => not the main score for users (their intuition is the nb of docs having the word)
      => but is the main weighting value for any NLP task

    Parameters:
      - overwrite_id: optional id of a pre-existing OCCURRENCES node for this
                      corpus (the Node and its previous NodeNodeNgram rows
                      will be replaced)
      - groupings_id: optional id of a GROUPLIST node for these ngrams
            IF absent the occurrences are the sums for each ngram
            IF present they're the sums for each ngram's mainform
    """
    # simple case : no groups
    # ---------
    # (the occurrences are the sums for each ngram)
    if not groupings_id:

        # NodeNgram index
        occs_q = (session
                  .query(
                      NodeNgram.ngram_id,
                      func.sum(NodeNgram.weight)   # <== OCCURRENCES
                  )
                  # filter docs within corpus
                  .join(Node)
                  .filter(Node.parent_id == corpus.id)
                  .filter(Node.typename == "DOCUMENT")
                  # for the sum
                  .group_by(NodeNgram.ngram_id))

    # difficult case: with groups
    # ------------
    # (the occurrences are the sums for each ngram's mainform)
    else:
        # sub-SELECT the synonyms of this GROUPLIST id (for OUTER JOIN later)
        syn = (session.query(NodeNgramNgram.ngram1_id,
                             NodeNgramNgram.ngram2_id)
                      .filter(NodeNgramNgram.node_id == groupings_id)
                      .subquery())

        # NodeNgram index with additional subform => mainform replacement
        occs_q = (session
                  .query(
                      # intermediate columns for debug
                      # -------------------------------
                      # NodeNgram.node_id,   # document
                      # NodeNgram.ngram_id,  # <= the occurring ngram
                      # NodeNgram.weight,    # <= its frequency in doc
                      # syn.c.ngram1_id      # mainform
                      # syn.c.ngram2_id,     # subform

                      # ngram to count aka counted_form
                      # ----------------------------------
                      # either NodeNgram.ngram_id as before
                      # or mainform if it exists
                      case([(syn.c.ngram1_id != None, syn.c.ngram1_id)],
                           else_=NodeNgram.ngram_id).label("counted_form"),

                      # the sum itself
                      # --------------
                      func.sum(NodeNgram.weight)   # <== OCCURRENCES
                  )
                  # this brings the mainform if NodeNgram.ngram_id has one in syn
                  .outerjoin(syn, syn.c.ngram2_id == NodeNgram.ngram_id)
                  # filter docs within corpus
                  .join(Node)
                  .filter(Node.parent_id == corpus.id)
                  .filter(Node.typename == "DOCUMENT")
                  # for the sum
                  .group_by("counted_form"))

    occ_sums = occs_q.all()
    # example result = [(1970, 1.0), (2024, 2.0), (259, 2.0), (302, 1.0), ... ]
    #                     ^^^^  ^^^
    #                 ngram_id  sum_wei
    #                    OR
    #               counted_form

    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
        session.query(NodeNodeNgram).filter(
            NodeNodeNgram.node1_id == the_id).delete()
        session.commit()
    else:
        # create the new OCCURRENCES node
        occnode = corpus.add_child(typename="OCCURRENCES",
                                   name="occ_sums (in:%s)" % corpus.id)
        session.add(occnode)
        session.commit()
        the_id = occnode.id

    # £TODO make it NodeNgram instead of NodeNodeNgram ! and rebase :/
    #       (idem ti_ranking)
    bulk_insert(NodeNodeNgram,
                ('node1_id', 'node2_id', 'ngram_id', 'score'),
                ((the_id, corpus.id, res[0], res[1]) for res in occ_sums))

    return the_id
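
# A DB-free sketch of the grouped branch of compute_occs() above: subform
# occurrences are credited to their mainform whenever a synonym pair exists.
# The helper name and sample data are invented for illustration only.
def _occ_sums(node_ngram_rows, synonyms):
    """node_ngram_rows: [(node_id, ngram_id, weight), ...]
       synonyms: {subform_id: mainform_id}  (the GROUPLIST relation)"""
    sums = {}
    for _node_id, ngram_id, weight in node_ngram_rows:
        counted_form = synonyms.get(ngram_id, ngram_id)  # mainform if it exists
        sums[counted_form] = sums.get(counted_form, 0.0) + weight
    return sums

# ex: _occ_sums([(1, 259, 1.0), (2, 259, 1.0), (2, 302, 1.0)], {302: 259})
# => {259: 3.0}   (the occurrence of subform 302 is counted under 259)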
def compute_ti_ranking(corpus,
                       groupings_id=None,
                       count_scope="local",
                       termset_scope="local",
                       overwrite_id=None):
    """
    Calculates a tfidf ranking within a given scope, via weighting of
    cumulated tfidf. For each ngram ng_i (or its mainform ng_i' if groups),
    across some docs d_j:

        rank(ng_i) = ( Sum_j tf_ij ) * ln( N / nd_i )

    where tf_ij is the frequency of ng_i in doc d_j, N the number of docs in
    scope and nd_i = |{ d_j : ng_i in d_j }| the number of docs containing ng_i.

    Parameters:
      - the corpus itself (or corpus_id)
      - groupings_id: optional id of a GROUPLIST node for these ngrams
            IF absent the ti weights are the sums for each ngram
            IF present they're the sums for each ngram's mainform
      - count_scope: {"local" or "global"}
         - local  <=> frequencies counted in the current corpus
         - global <=> frequencies counted in all corpora of this type

        when the count_scope is global, there is another parameter:
          - termset_scope: {"local" or "global"}
             - local  <=> output list of terms limited to the current corpus
               (SELECT DISTINCT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
             - global <=> output list of terms found in global doc scope
                          !!!! (many more terms)

      - overwrite_id: optional id of a pre-existing XXXX node for this corpus
                      (the Node and its previous NodeNodeNgram rows will be
                      replaced)
    """
    # validate string params
    if count_scope not in ["local", "global"]:
        raise ValueError(
            "compute_ti_ranking: count_scope param allowed values: 'local', 'global'")
    if termset_scope not in ["local", "global"]:
        raise ValueError(
            "compute_ti_ranking: termset_scope param allowed values: 'local', 'global'")
    if count_scope == "local" and termset_scope == "global":
        raise ValueError(
            "compute_ti_ranking: the termset_scope param can be 'global' "
            "iff count_scope param is 'global' too.")

    # get corpus
    if type(corpus) == int:
        corpus_id = corpus
        corpus = cache.Node[corpus_id]
    elif type(corpus) == str and match(r'\d+$', corpus):
        corpus_id = int(corpus)
        corpus = cache.Node[corpus_id]
    else:
        # assuming Node class
        corpus_id = corpus.id

    # prepare sqla mainform vs ngram selector
    ngform_i = None

    if not groupings_id:
        ngform_i = NodeNgram.ngram_id
    else:
        # prepare translations
        syno = (session.query(NodeNgramNgram.ngram1_id,
                              NodeNgramNgram.ngram2_id)
                       .filter(NodeNgramNgram.node_id == groupings_id)
                       .subquery())
        # see detailed comment in compute_occs() + TODO factor out
        ngform_i = case([(syno.c.ngram1_id != None, syno.c.ngram1_id),
                         (syno.c.ngram1_id == None, NodeNgram.ngram_id)
                         #     condition               value
                         ])

    # MAIN QUERY SKELETON
    tf_nd_query = (session
                   .query(
                       # NodeNgram.ngram_id
                       # or similar if grouping ngrams under their mainform
                       ngform_i.label("counted_ngform"),

                       # the tfidf elements
                       # ------------------
                       func.sum(NodeNgram.weight),    # tf: same as occurrences
                                                      # -----------------------
                       func.count(NodeNgram.node_id)  # nd: n docs with term
                                                      # --------------------
                   )
                   .group_by("counted_ngform")

                   # count_scope to specify in which doc nodes to count
                   # -----------
                   # .join(countdocs_subquery,
                   #       countdocs_subquery.c.id == NodeNgram.node_id)

                   # optional termset_scope: if we'll restrict the ngrams
                   # -------------
                   # .join(termset_subquery,
                   #       termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)

                   # optional translations to bring the subform's replacement
                   # ------------
                   # .outerjoin(syno,
                   #            syno.c.ngram2_id == NodeNgram.ngram_id)
                   )

    # TUNING THE QUERY
    if groupings_id:
        tf_nd_query = tf_nd_query.outerjoin(
            syno, syno.c.ngram2_id == NodeNgram.ngram_id)

    # local <=> within this corpus
    if count_scope == "local":
        # All docs of this corpus
        countdocs_subquery = (session.query(Node.id)
                                     .filter(Node.typename == "DOCUMENT")
                                     .filter(Node.parent_id == corpus_id)
                                     .subquery())

        # no need to independently restrict the ngrams
        tf_nd_query = tf_nd_query.join(
            countdocs_subquery,
            countdocs_subquery.c.id == NodeNgram.node_id)
        # ---

    # global <=> within all corpora of this source
    elif count_scope == "global":
        this_source_type = corpus.resources()[0]['type']

        CorpusNode = aliased(Node)

        # All docs **in all corpora of the same source**
        countdocs_subquery = (session.query(Node.id)
                              .filter(Node.typename == "DOCUMENT")
                              # join on parent_id with selected corpora nodes
                              .join(CorpusNode, CorpusNode.id == Node.parent_id)
                              .filter(CorpusNode.typename == "CORPUS")
                              # TODO index corpus_sourcetype in DB
                              .filter(CorpusNode.hyperdata['resources'][0]['type']
                                      .astext == str(this_source_type))
                              .subquery())

        if termset_scope == "global":
            # both scopes are the same:
            # no need to independently restrict the ngrams
            tf_nd_query = tf_nd_query.join(
                countdocs_subquery,
                countdocs_subquery.c.id == NodeNgram.node_id)
            # ---

        elif termset_scope == "local":
            # All unique terms...
            termset_subquery = (session
                                .query(distinct(NodeNgram.ngram_id)
                                       .label("uniq_ngid"))
                                # ... in the original corpus
                                .join(Node)
                                .filter(Node.typename == "DOCUMENT")
                                .filter(Node.parent_id == corpus_id)
                                .subquery())

            # only case of independent restrictions on docs and terms
            tf_nd_query = (tf_nd_query
                           .join(countdocs_subquery,
                                 countdocs_subquery.c.id == NodeNgram.node_id)
                           .join(termset_subquery,
                                 termset_subquery.c.uniq_ngid == NodeNgram.ngram_id))
            # ---

    # number of docs in the count scope
    total_docs = session.query(countdocs_subquery).count()
    log_tot_docs = log(total_docs)

    # result
    tf_nd = tf_nd_query.all()

    # -------------- summation over ngram i ------------------
    tfidfsum = {}
    for (ngram_i, tf_i, nd_i) in tf_nd:
        # tfidfsum[ngram_i] = tf_i * log(total_docs / nd_i)
        tfidfsum[ngram_i] = tf_i * (log_tot_docs - log(nd_i))
    # ---------------------------------------------------------

    # number of ngram forms, for info
    total_ngramforms = len(tfidfsum)

    if overwrite_id:
        the_id = overwrite_id
        session.query(NodeNodeNgram).filter(
            NodeNodeNgram.node1_id == the_id).delete()
        session.commit()
    else:
        # create the new TFIDF-XXXX node to get an id
        tir_nd = corpus.add_child()
        if count_scope == "local":
            tir_nd.typename = "TIRANK-CORPUS"
            tir_nd.name = "ti rank (%i ngforms in corpus:%s)" % (
                total_ngramforms, corpus_id)
        elif count_scope == "global":
            tir_nd.typename = "TIRANK-GLOBAL"
            tir_nd.name = "ti rank (%i ngforms %s in corpora of sourcetype:%s)" % (
                total_ngramforms,
                ("from corpus %i" % corpus_id) if (termset_scope == "local") else "",
                this_source_type)

        session.add(tir_nd)
        session.commit()
        the_id = tir_nd.id

    # TODO 1 discuss use and find new typename
    # TODO 2 release these 2 typenames TFIDF-CORPUS and TFIDF-GLOBAL
    # TODO 3 recreate them elsewhere in their sims (WeightedIndex) version
    # TODO 4 requalify this here as a NodeNgram
    # then TODO 5 use WeightedList.save() !

    # reflect that in NodeNodeNgrams
    bulk_insert(NodeNodeNgram,
                ('node1_id', 'node2_id', 'ngram_id', 'score'),
                ((the_id, corpus_id, ng, tfidfsum[ng]) for ng in tfidfsum))

    return the_id
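
# A DB-free sketch of the final summation in compute_ti_ranking() above: one
# cumulated score tf_i * ln(N / nd_i) per (main)form, computed from rows shaped
# like the tf_nd result. The helper name and sample values are invented.
from math import log

def _ti_rank(tf_nd_rows, total_docs):
    """tf_nd_rows: [(ngram_id, tf_i, nd_i), ...] as returned by tf_nd_query."""
    log_n = log(total_docs)
    return {ngram_id: tf_i * (log_n - log(nd_i))
            for ngram_id, tf_i, nd_i in tf_nd_rows}

# ex: _ti_rank([(9731, 12.0, 3), (9733, 4.0, 1)], total_docs=100)
# => {9731: 12.0 * log(100/3), 9733: 4.0 * log(100)}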
def post(self, request, project_id):

    # example only
    input = request.data or {
        'x': {
            'with_empty': True,
            'resolution': 'decade',
            'value': 'publication_date',
        },
        'y': {
            # 'divided_by': 'total_ngrams_count',
            # 'divided_by': 'total_documents_count',
        },
        'filter': {
            # 'ngrams': ['bees', 'bee', 'honeybee', 'honeybees', 'honey bee', 'honey bees'],
            # 'ngrams': ['insecticide', 'pesticide'],
            # 'corpora': [52633],
            # 'date': {'min': '1995-12-31'}
        },
        # 'format': 'csv',
    }

    # input validation
    input = validate(input, {
        'type': dict,
        'default': {},
        'items': {
            'x': {
                'type': dict,
                'default': {},
                'items': {
                    # which hyperdata to choose for the date
                    'value': {
                        'type': str,
                        'default': 'publication_date',
                        'range': {'publication_date', },
                    },
                    # time resolution
                    'resolution': {
                        'type': str,
                        'range': self._resolutions.keys(),
                        'default': 'month',
                    },
                    # should we add zeroes for empty values?
                    'with_empty': {'type': bool, 'default': False},
                },
            },
            'y': {
                'type': dict,
                'default': {},
                'items': {
                    # measured value
                    'value': {
                        'type': str,
                        'default': 'ngrams_count',
                        'range': {'ngrams_count', 'documents_count', 'ngrams_tfidf'},
                    },
                    # value by which we should normalize
                    'divided_by': {
                        'type': str,
                        'range': {'total_documents_count',
                                  'documents_count',
                                  'total_ngrams_count'},
                    },
                },
            },
            # filtering
            'filter': {
                'type': dict,
                'default': {},
                'items': {
                    # filter by metadata
                    'hyperdata': {
                        'type': list,
                        'default': [],
                        'items': {
                            'type': dict,
                            'items': {
                                'key': {'type': str},
                                # must be one of self._operators
                                'operator': {
                                    'type': str,
                                    'range': self._operators.keys(),
                                },
                                'value': {'type': str},
                            },
                        },
                    },
                    # filter by date
                    'date': {
                        'type': dict,
                        'default': {},
                        'items': {
                            'min': {'type': datetime.datetime},
                            'max': {'type': datetime.datetime},
                        },
                    },
                    # filter by corpora
                    'corpora': {'type': list, 'default': [], 'items': {'type': int}},
                    # filter by ngrams
                    'ngrams': {'type': list, 'default': [], 'items': {'type': str}},
                },
            },
            # output format
            'format': {
                'type': str,
                'default': 'json',
                'range': {'json', 'csv'},
            },
        },
    })

    # build query: prepare columns
    X = aliased(NodeHyperdata)
    column_x = func.date_trunc(input['x']['resolution'], X.value_utc)
    column_y = {
        'documents_count': func.count(Node.id.distinct()),
        'ngrams_count': func.sum(NodeNgram.weight),
        # 'ngrams_tfidf': func.sum(NodeNodeNgram.weight),
    }[input['y']['value']]

    # build query: base
    query_base = (session
                  .query(column_x)
                  .select_from(Node)
                  .join(NodeNgram, NodeNgram.node_id == Node.id)
                  .join(X, X.node_id == NodeNgram.node_id)
                  # .filter(X.key == input['x']['value'])
                  .group_by(column_x)
                  .order_by(column_x))

    # build query: base, filter by corpora or project
    if 'corpora' in input['filter'] and input['filter']['corpora']:
        query_base = (query_base
                      .filter(Node.parent_id.in_(input['filter']['corpora'])))
    else:
        ParentNode = aliased(Node)
        query_base = (query_base
                      .join(ParentNode, ParentNode.id == Node.parent_id)
                      .filter(ParentNode.parent_id == project_id))

    # build query: base, filter by date
    if 'date' in input['filter']:
        if 'min' in input['filter']['date']:
            query_base = query_base.filter(X.value >= input['filter']['date']['min'])
        if 'max' in input['filter']['date']:
            query_base = query_base.filter(X.value <= input['filter']['date']['max'])

    # build query: filter by ngrams
    query_result = query_base.add_columns(column_y)
    if 'ngrams' in input['filter'] and input['filter']['ngrams']:
        query_result = (query_result
                        .join(Ngram, Ngram.id == NodeNgram.ngram_id)
                        .filter(Ngram.terms.in_(input['filter']['ngrams'])))

    # build query: filter by metadata
    if 'hyperdata' in input['filter']:
        for hyperdata in input['filter']['hyperdata']:
            # create alias and query it
            operator = self._operators[hyperdata['operator']]
            type_string = type2string(INDEXED_HYPERDATA[hyperdata['key']]['type'])
            value = self._converters[type_string](hyperdata['value'])
            query_result = (query_result
                            .join(NodeHyperdata,
                                  NodeHyperdata.node_id == NodeNgram.node_id)
                            .filter(NodeHyperdata.key == hyperdata['key'])
                            .filter(operator(NodeHyperdata.value, value)))

    # build result: prepare data
    date_value_list = query_result.all()
    if date_value_list:
        date_min = date_value_list[0][0].replace(tzinfo=None)
        # (the last result row is excluded below, hence [-2] here)
        date_max = date_value_list[-2][0].replace(tzinfo=None)

    # build result: prepare interval
    result = collections.OrderedDict()
    if input['x']['with_empty'] and date_value_list:
        compute_next_date = self._resolutions[input['x']['resolution']]
        date = date_min
        while date <= date_max:
            result[date] = 0.0
            date = compute_next_date(date)

    # build result: integrate
    for date, value in date_value_list[0:-1]:
        result[date.replace(tzinfo=None)] = value

    # build result: normalize
    query_normalize = None
    if date_value_list and 'divided_by' in input['y'] and input['y']['divided_by']:
        if input['y']['divided_by'] == 'total_documents_count':
            query_normalize = query_base.add_column(func.count(Node.id.distinct()))
        elif input['y']['divided_by'] == 'total_ngrams_count':
            query_normalize = query_base.add_column(func.sum(NodeNgram.weight))
    if query_normalize is not None:
        for date, value in query_normalize[0:-1]:
            date = date.replace(tzinfo=None)
            if date in result:
                result[date] /= value

    # return result with proper formatting
    if input['format'] == 'json':
        return JsonHttpResponse({
            'query': input,
            'result': sorted(result.items()),
        }, 201)
    elif input['format'] == 'csv':
        return CsvHttpResponse(sorted(result.items()), ('date', 'value'), 201)
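
# Illustrative request body for the endpoint above (values are made up; the
# keys and allowed ranges come from the validate() spec): yearly document
# counts for one corpus, normalized by the total number of documents.
#
#   POST .../projects/<project_id>/...
#   {"x": {"value": "publication_date", "resolution": "year", "with_empty": true},
#    "y": {"value": "documents_count", "divided_by": "total_documents_count"},
#    "filter": {"corpora": [52633], "date": {"min": "1995-12-31"}},
#    "format": "json"}
#
# With 'format': 'json' the answer echoes the validated query and returns
# sorted (date, value) pairs:
#
#   {"query": {...}, "result": [["1996-01-01T00:00:00", 0.12], ...]}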