def nodes(parent=None, group_by='typename', order_by='typename', has_child='check'):
    '''
    Build a query over the child nodes of `parent` (a Node or an id):
    grouped by `group_by` when given (aggregated columns plus a `cnt` count)
    and, when `has_child` is not None, annotated with a `children` count
    via an outer join on each node's own children.
    '''
    if group_by or has_child is not None:
        select = [
            func.min(Node.id).label('id'),
            func.min(Node.name).label('name'),
            func.min(Node.typename).label('typename'),
            func.count(Node.id).label('cnt')
        ]
    else:
        select = [
            Node.id.label('id'),
            Node.name.label('name'),
            Node.typename.label('typename'),
            literal_column('1').label('cnt')
        ]

    if has_child is not None:
        N = aliased(Node)
        select.append(func.count(N.id).label('children'))
    else:
        select.append(literal_column('NULL').label('children'))

    parent_id = getattr(parent, 'id', parent)

    q = session.query(*select).filter_by(parent_id=parent_id) \
               .group_by(getattr(Node, group_by if group_by else 'id'))

    if has_child is not None:
        q = q.outerjoin(N, N.parent_id == Node.id).group_by(N.parent_id)

    return q.order_by(order_by)
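# A minimal usage sketch (hypothetical parent id; assumes the active `session`
# and the `Node` model used above). Each row carries the selected columns:
#
#     for row in nodes(parent=1, group_by='typename', has_child='check'):
#         print(row.typename, row.cnt, row.children)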
def scan_gargantext(corpus_id, request, fast=False, documents=False):
    query = _search_docs(corpus_id, request, fast)

    if documents:
        return query.all()

    return query.with_entities(func.count(DocumentNode.id.distinct())).one()[0]
def doc_freq(corpus_id, node_ids):
    '''
    doc_freq :: Corpus_id -> [(Ngram_id, Int)]
    Given a corpus, compute the number of documents containing each ngram.
    '''
    return (session.query(NodeNgram.ngram_id, func.count(NodeNgram.node_id))
                   .join(Node, NodeNgram.node_id == Node.id)
                   .filter(Node.parent_id == corpus_id,
                           Node.typename == 'DOCUMENT')
                   .filter(NodeNgram.weight > 0,
                           NodeNgram.ngram_id.in_(node_ids))
                   .group_by(NodeNgram.ngram_id)
                   .all())
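# Usage sketch (hypothetical ids, illustrative output only):
#
#     freqs = dict(doc_freq(corpus_id=42, node_ids=[487, 294]))
#     # e.g. {487: 12, 294: 3}   # ngram_id -> number of documents containing it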
def countCooccurrences(corpus_id=None, cooc_id=None,
                       field1='ngrams', field2='ngrams',
                       start=None, end=None,
                       mapList_id=None, groupList_id=None,
                       distance=None, bridgeness=None,
                       n_min=1, n_max=None, limit=1000,
                       isMonopartite=True, threshold=3,
                       save_on_db=True, reset=True):
    '''
    Compute the cooccurrence matrix and save it, returning NodeNgramNgram.node_id.

    For the moment, lists of parameters are not supported because lists
    need to be merged beforehand.

    corpus       :: Corpus
    mapList_id   :: Int
    groupList_id :: Int
    start        :: TimeStamp -- example: '2010-05-30 02:00:00+02'
    end          :: TimeStamp
    limit        :: Int
    '''
    # FIXME remove the lines below after factorization of parameters
    parameters = dict()
    parameters['field1'] = field1
    parameters['field2'] = field2

    # Get corpus as a Python object
    corpus = session.query(Node).filter(Node.id == corpus_id).first()

    # Get the node of the graph
    if not cooc_id:
        cooc_id = (session.query(Node.id)
                          .filter(Node.typename == "COOCCURRENCES",
                                  Node.name == "GRAPH EXPLORER",
                                  Node.parent_id == corpus.id)
                          .first())
        if not cooc_id:
            coocNode = corpus.add_child(typename="COOCCURRENCES",
                                        name="GRAPH (in corpus %s)" % corpus.id)
            session.add(coocNode)
            session.commit()
            cooc_id = coocNode.id
        else:
            cooc_id = int(cooc_id[0])

    # when cooc_id preexisted, but we want to recompute it (reset = True)
    # (to give new contents to this cooc_id)
    elif reset:
        print("GRAPH #%s ... Counting new cooccurrences data." % cooc_id)
        session.query(NodeNgramNgram).filter(NodeNgramNgram.node_id == cooc_id).delete()
        session.commit()

    # when cooc_id preexisted and we just want to load it (reset = False)
    else:
        print("GRAPH #%s ... Loading cooccurrences computed already." % cooc_id)
        cooc = (session.query(NodeNgramNgram.ngram1_id,
                              NodeNgramNgram.ngram2_id,
                              NodeNgramNgram.weight)
                       .filter(NodeNgramNgram.node_id == cooc_id)
                       .all())
        return (int(cooc_id), WeightedMatrix(cooc))

    NodeNgramX = aliased(NodeNgram)

    # Simple cooccurrence count
    cooc_score = func.count(NodeNgramX.node_id).label('cooc_score')
    # A kind of Euclidean-distance cooccurrence (alternative):
    #cooc_score = func.sqrt(func.sum(NodeNgramX.weight * NodeNgramY.weight)).label('cooc_score')

    if isMonopartite:
        NodeNgramY = aliased(NodeNgram)
        cooc_query = (session.query(NodeNgramX.ngram_id,
                                    NodeNgramY.ngram_id,
                                    cooc_score)
                             .join(Node, Node.id == NodeNgramX.node_id)
                             .join(NodeNgramY, NodeNgramY.node_id == Node.id)
                             .filter(Node.parent_id == corpus.id,
                                     Node.typename == "DOCUMENT"))
    else:
        NodeNgramY = aliased(NodeNgram)
        cooc_query = (session.query(NodeHyperdataNgram.ngram_id,
                                    NodeNgramY.ngram_id,
                                    cooc_score)
                             .join(Node, Node.id == NodeHyperdataNgram.node_id)
                             .join(NodeNgramY, NodeNgramY.node_id == Node.id)
                             .join(Hyperdata, Hyperdata.id == NodeHyperdataNgram.hyperdata_id)
                             .filter(Node.parent_id == corpus.id,
                                     Node.typename == "DOCUMENT")
                             .filter(Hyperdata.name == field1))

    # Keep only ngrams whose size is between n_min and n_max
    if n_min is not None or n_max is not None:
        if isMonopartite:
            NgramX = aliased(Ngram)
            cooc_query = cooc_query.join(NgramX, NgramX.id == NodeNgramX.ngram_id)

        NgramY = aliased(Ngram)
        cooc_query = cooc_query.join(NgramY, NgramY.id == NodeNgramY.ngram_id)

    if n_min is not None:
        cooc_query = cooc_query.filter(NgramY.n >= n_min)
        if isMonopartite:
            cooc_query = cooc_query.filter(NgramX.n >= n_min)

    if n_max is not None:
        cooc_query = cooc_query.filter(NgramY.n <= n_max)
        if isMonopartite:
            cooc_query = cooc_query.filter(NgramX.n <= n_max)

    # Cooccurrences between the dates start and end
    if start is not None:
        #date_start = datetime.datetime.strptime("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
        # TODO: more precise date format here (day is the smallest grain actually).
        date_start = datetime.strptime(str(start), "%Y-%m-%d")
        date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")

        Start = aliased(NodeHyperdata)
        cooc_query = (cooc_query.join(Start, Start.node_id == Node.id)
                                .filter(Start.key == 'publication_date')
                                .filter(Start.value_utc >= date_start_utc))
        parameters['start'] = date_start_utc

    if end is not None:
        # TODO: more precise date format here (day is the smallest grain actually).
        date_end = datetime.strptime(str(end), "%Y-%m-%d")
        date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")

        End = aliased(NodeHyperdata)
        cooc_query = (cooc_query.join(End, End.node_id == Node.id)
                                .filter(End.key == 'publication_date')
                                .filter(End.value_utc <= date_end_utc))
        parameters['end'] = date_end_utc

    if isMonopartite:
        # Cooc is symmetric: keep only the main cooccurrences and cut at the limit
        cooc_query = cooc_query.filter(NodeNgramX.ngram_id < NodeNgramY.ngram_id)

    cooc_query = cooc_query.having(cooc_score >= threshold)

    if isMonopartite:
        cooc_query = cooc_query.group_by(NodeNgramX.ngram_id, NodeNgramY.ngram_id)
    else:
        cooc_query = cooc_query.group_by(NodeHyperdataNgram.ngram_id, NodeNgramY.ngram_id)

    # Order according to some score
    # If ordering is really needed, use an ordered index (faster)
    #cooc_query = cooc_query.order_by(desc('cooc_score'))

    matrix = WeightedMatrix(cooc_query)

    print("GRAPH #%s Filtering the matrix with Map and Group Lists." % cooc_id)
    cooc = filterMatrix(matrix, mapList_id, groupList_id)

    parameters['MapList_id'] = str(mapList_id)
    parameters['GroupList_id'] = str(groupList_id)

    # TODO factorize savings on db
    if save_on_db:
        # Saving the cooccurrences
        cooc.save(cooc_id)
        print("GRAPH #%s ... Node Cooccurrence Matrix saved" % cooc_id)

        # Saving the parameters
        print("GRAPH #%s ... Parameters saved in Node." % cooc_id)
        coocNode = session.query(Node).filter(Node.id == cooc_id).first()

        coocNode.hyperdata["parameters"] = parameters
        coocNode.save_hyperdata()
        session.commit()

        #data = cooc2graph(coocNode.id, cooc, distance=distance, bridgeness=bridgeness)
    else:
        return cooc

    return (coocNode.id, cooc)
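# Usage sketch (hypothetical ids; assumes map and group lists already exist for
# the corpus). Returns the COOCCURRENCES node id plus the filtered WeightedMatrix:
#
#     cooc_node_id, weighted_matrix = countCooccurrences(
#         corpus_id=42, mapList_id=101, groupList_id=102,
#         start='2010-01-01', end='2015-12-31', threshold=3)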
def compute_ti_ranking(corpus,
                       groupings_id=None,
                       count_scope="local",
                       termset_scope="local",
                       overwrite_id=None):
    """
    Calculates tfidf ranking within the given scope
                                        -----------
        via weighting of cumulated tfidf
                         ----------------

        Sum_j(tf_ij) * ln(N / |{d_j : ng_i in d_j}|)

            per ngram ng_i (or per mainform ng_i' if groups are provided)
            across some docs d_j

    Parameters:
      - the corpus itself (or corpus_id)
      - groupings_id: optional id of a GROUPLIST node for these ngrams
            IF absent the ti weights are the sums for each ngram
            IF present they're the sums for each ngram's mainform

      - count_scope: {"local" or "global"}
         - local  <=> frequencies counted in the current corpus
         - global <=> frequencies counted in all corpora of this type

        when the count_scope is global, there is another parameter:
          - termset_scope: {"local" or "global"}
             - local  <=> output list of terms limited to the current corpus
               (SELECT DISTINCT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
             - global <=> output list of terms found in the global doc scope
                          !!!! (many more terms)

      - overwrite_id: optional id of a pre-existing XXXX node for this corpus
                      (the Node and its previous NodeNodeNgram rows will be replaced)
    """
    # validate string params
    if count_scope not in ["local", "global"]:
        raise ValueError(
            "compute_ti_ranking: count_scope param allowed values: 'local', 'global'")

    if termset_scope not in ["local", "global"]:
        raise ValueError(
            "compute_ti_ranking: termset_scope param allowed values: 'local', 'global'")

    if count_scope == "local" and termset_scope == "global":
        raise ValueError(
            "compute_ti_ranking: the termset_scope param can be 'global' only if the count_scope param is 'global' too.")

    # get corpus
    if type(corpus) == int:
        corpus_id = corpus
        corpus = cache.Node[corpus_id]
    elif type(corpus) == str and match(r'\d+$', corpus):
        corpus_id = int(corpus)
        corpus = cache.Node[corpus_id]
    else:
        # assuming Node class
        corpus_id = corpus.id

    # prepare sqla mainform vs ngram selector
    ngform_i = None

    if not groupings_id:
        ngform_i = NodeNgram.ngram_id
    else:
        # prepare translations
        syno = (session.query(NodeNgramNgram.ngram1_id,
                              NodeNgramNgram.ngram2_id)
                       .filter(NodeNgramNgram.node_id == groupings_id)
                       .subquery())

        # see the detailed comment in compute_occs() + TODO factorize
        ngform_i = case([
            (syno.c.ngram1_id != None, syno.c.ngram1_id),
            (syno.c.ngram1_id == None, NodeNgram.ngram_id)
            #     condition               value
        ])

    # MAIN QUERY SKELETON
    tf_nd_query = (session.query(
                        # NodeNgram.ngram_id
                        # or similar if grouping ngrams under their mainform
                        ngform_i.label("counted_ngform"),

                        # the tfidf elements
                        # ------------------
                        func.sum(NodeNgram.weight),     # tf: same as occurrences
                                                        # -----------------------
                        func.count(NodeNgram.node_id)   # nd: n docs with term
                                                        # --------------------
                    )
                    .group_by("counted_ngform")

                    # count_scope to specify in which doc nodes to count
                    # -----------
                    # .join(countdocs_subquery,
                    #       countdocs_subquery.c.id == NodeNgram.node_id)

                    # optional termset_scope: if we'll restrict the ngrams
                    # -------------
                    # .join(termset_subquery,
                    #       termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)

                    # optional translations to bring the subform's replacement
                    # ------------
                    # .outerjoin(syno,
                    #            syno.c.ngram2_id == NodeNgram.ngram_id)
                   )

    # TUNING THE QUERY
    if groupings_id:
        tf_nd_query = tf_nd_query.outerjoin(syno, syno.c.ngram2_id == NodeNgram.ngram_id)

    # local <=> within this corpus
    if count_scope == "local":
        # All docs of this corpus
        countdocs_subquery = (session.query(Node.id)
                                     .filter(Node.typename == "DOCUMENT")
                                     .filter(Node.parent_id == corpus_id)
                                     .subquery())

        # no need to independently restrict the ngrams
        tf_nd_query = tf_nd_query.join(countdocs_subquery,
                                       countdocs_subquery.c.id == NodeNgram.node_id)
        # ---

    # global <=> within all corpora of this source
    elif count_scope == "global":
        this_source_type = corpus.resources()[0]['type']

        CorpusNode = aliased(Node)

        # All docs **in all corpora of the same source**
        countdocs_subquery = (session.query(Node.id)
                                     .filter(Node.typename == "DOCUMENT")
                                     # join on parent_id with selected corpora nodes
                                     .join(CorpusNode, CorpusNode.id == Node.parent_id)
                                     .filter(CorpusNode.typename == "CORPUS")
                                     # TODO index corpus_sourcetype in DB
                                     .filter(CorpusNode.hyperdata['resources'][0]['type']
                                             .astext == str(this_source_type))
                                     .subquery())

        if termset_scope == "global":
            # both scopes are the same: no need to independently restrict the ngrams
            tf_nd_query = tf_nd_query.join(countdocs_subquery,
                                           countdocs_subquery.c.id == NodeNgram.node_id)
            # ---

        elif termset_scope == "local":
            # All unique terms...
            termset_subquery = (session.query(distinct(NodeNgram.ngram_id).label("uniq_ngid"))
                                       # ... in the original corpus
                                       .join(Node)
                                       .filter(Node.typename == "DOCUMENT")
                                       .filter(Node.parent_id == corpus_id)
                                       .subquery())

            # only case of independent restrictions on docs and terms
            tf_nd_query = (tf_nd_query
                           .join(countdocs_subquery,
                                 countdocs_subquery.c.id == NodeNgram.node_id)
                           .join(termset_subquery,
                                 termset_subquery.c.uniq_ngid == NodeNgram.ngram_id))
            # ---

    # M
    total_docs = session.query(countdocs_subquery).count()
    log_tot_docs = log(total_docs)

    # result
    tf_nd = tf_nd_query.all()

    # -------------- summation over each word i ----------------
    tfidfsum = {}
    for (ngram_i, tf_i, nd_i) in tf_nd:
        # tfidfsum[ngram_i] = tf_i * log(total_docs / nd_i)
        tfidfsum[ngram_i] = tf_i * (log_tot_docs - log(nd_i))
    # -----------------------------------------------------------

    # N for info
    total_ngramforms = len(tfidfsum)

    if overwrite_id:
        the_id = overwrite_id
        session.query(NodeNodeNgram).filter(NodeNodeNgram.node1_id == the_id).delete()
        session.commit()
    else:
        # create the new TFIDF-XXXX node to get an id
        tir_nd = corpus.add_child()
        if count_scope == "local":
            tir_nd.typename = "TIRANK-CORPUS"
            tir_nd.name = "ti rank (%i ngforms in corpus:%s)" % (
                total_ngramforms, corpus_id)
        elif count_scope == "global":
            tir_nd.typename = "TIRANK-GLOBAL"
            tir_nd.name = "ti rank (%i ngforms %s in corpora of sourcetype:%s)" % (
                total_ngramforms,
                ("from corpus %i" % corpus_id) if (termset_scope == "local") else "",
                this_source_type)

        session.add(tir_nd)
        session.commit()
        the_id = tir_nd.id

    # TODO 1 discuss use and find new typename
    # TODO 2 release these 2 typenames TFIDF-CORPUS and TFIDF-GLOBAL
    # TODO 3 recreate them elsewhere in their sims (WeightedIndex) version
    # TODO 4 requalify this here as a NodeNgram
    # then TODO 5 use WeightedList.save() !

    # reflect that in NodeNodeNgrams
    bulk_insert(NodeNodeNgram,
                ('node1_id', 'node2_id', 'ngram_id', 'score'),
                ((the_id, corpus_id, ng, tfidfsum[ng]) for ng in tfidfsum))

    return the_id
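# For reference, the score computed above for one counted form i (this restates
# the docstring formula, it adds no new behaviour): with N documents in the
# counting scope, tf_i the summed weights of form i and nd_i the number of
# documents containing it,
#
#     score_i = tf_i * ln(N / nd_i) = tf_i * (ln(N) - ln(nd_i))
#
# e.g. N = 1000, tf_i = 12, nd_i = 40  =>  12 * ln(25) ~= 38.6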
def compute_coocs(corpus,
                  overwrite_id=None,
                  just_pass_result=True,   # just return the WeightedMatrix
                                           # (don't write to DB)
                  threshold=DEFAULT_COOC_THRESHOLD,
                  groupings_id=None,
                  on_list_id=None,
                  stoplist_id=None,
                  start=None,
                  end=None,
                  symmetry_filter=False,
                  diagonal_filter=True):
    """
    Count how often some extracted terms appear together in a small context
    (document) throughout a larger context (corpus).

             [NodeNgram]                        [NodeNgramNgram]

    node_id | ngram_id | weight       ngram1_id | ngram2_id | score |
    --------+----------+--------      ----------+-----------+-------+
     MyDocA |      487 |      1   =>        487 |       294 |     2 |
     MyDocA |      294 |      3
     MyDocB |      487 |      1
     MyDocB |      294 |      4

    Fill that info in DB:
      - a *new* COOCCURRENCES node
      - and all corresponding NodeNgramNgram rows

    worst-case complexity ~ O(N²/2) with N = number of ngrams

    If a mainlist is provided, we filter doc ngrams to those also in the list.

    Parameters:
      - the corpus node
      - overwrite_id: id of a pre-existing COOCCURRENCES node for this corpus
                      (all hyperdata and previous NodeNgramNgram rows will be replaced)
      - threshold: on output cooc count (previously called hapax)
      - groupings_id: optional synonym relations to add all subform counts
                      with their mainform's counts
      - on_list_id: mainlist or maplist type, to constrain the input ngrams
      - stoplist_id: stoplist for filtering input ngrams
                     (normally unnecessary if a mainlist is already provided)
      - start, end: provide one or both temporal limits to filter on doc date
                    NB the expected type of parameter value is datetime.datetime
                       (a string is also possible but its format must follow
                        this convention: "2001-01-01" aka "%Y-%m-%d")
      - symmetry_filter: prevent calculating where ngram1_id > ngram2_id
      - diagonal_filter: prevent calculating where ngram1_id == ngram2_id

    (deprecated parameters)
      - field1,2: allowed counting other things than ngrams (eg tags) but no use case at present
      - isMonopartite: ?? used a nodes_hyperdata_ngrams table ???

    basic idea for one doc
    ======================
    each pair of ngrams sharing the same doc (node_id)
        SELECT idxa.ngram_id, idxb.ngram_id
        FROM nodes_ngrams AS idxa
        ---------------------------------
        JOIN nodes_ngrams AS idxb
        ON idxa.node_id = idxb.node_id      <== that's cooc
        ---------------------------------
        AND idxa.ngram_id <> idxb.ngram_id     (diagonal_filter)
        AND idxa.node_id = MY_DOC ;

    on the entire corpus
    ====================
    coocs for each doc:
      - each given pair like (termA, termB) will likely appear several times
        => we do GROUP BY (Xindex.ngram_id, Yindex.ngram_id)
      - we count unique appearances of the pair (cooc)
    """
    # - TODO cvalue_id: allow a metric as an additional input filter
    # - TODO n_min, n_max: filter on Ngram.n (aka length of ngram)
    # - TODO weighted: if False, plain cooc is saved as the result
    #                  if True, weighted cooc (experimental)

    # /!\ big combinatorial complexity /!\
    # for 8439 rows in the nodes_ngrams index, of which 1442 have occ > 1:
    #   - 1,859,408 rows for the plain cooc query
    #   -    71,134 rows when restricted to ngrams with occ > 1 (weight)

    # 2 x the occurrence index table
    Xindex = aliased(NodeNgram)
    Yindex = aliased(NodeNgram)

    # for debug (1/4)
    # Xngram = aliased(Ngram)
    # Yngram = aliased(Ngram)

    # 1) prepare definition of counted forms
    if not groupings_id:
        # no groupings => the counted forms are the ngrams
        Xindex_ngform_id = Xindex.ngram_id
        Yindex_ngform_id = Yindex.ngram_id

    # groupings: see the detailed comment in compute_occs() + TODO factorize
    else:
        # prepare translations
        Xsyno = (session.query(NodeNgramNgram.ngram1_id,
                               NodeNgramNgram.ngram2_id)
                        .filter(NodeNgramNgram.node_id == groupings_id)
                        .subquery())

        # further use as anonymous tables prevents simply doing Ysyno = Xsyno
        Ysyno = (session.query(NodeNgramNgram.ngram1_id,
                               NodeNgramNgram.ngram2_id)
                        .filter(NodeNgramNgram.node_id == groupings_id)
                        .subquery())

        # groupings => define the counted form depending on the existence of a synonym
        Xindex_ngform_id = case([
            (Xsyno.c.ngram1_id != None, Xsyno.c.ngram1_id),
            (Xsyno.c.ngram1_id == None, Xindex.ngram_id)
            #     condition               value
        ])

        Yindex_ngform_id = case([
            (Ysyno.c.ngram1_id != None, Ysyno.c.ngram1_id),
            (Ysyno.c.ngram1_id == None, Yindex.ngram_id)
        ])
        # ---

    # 2) BASE DB QUERY

    # cooccurrences columns definition ----------------
    ucooc = func.count(Xindex_ngform_id).label("ucooc")
    # NB could be X or Y in this line
    #    (we're counting grouped rows and just happen to do it on this column)

    base_query = (session.query(Xindex_ngform_id,
                                Yindex_ngform_id,
                                ucooc

                                # for debug (2/4)
                                # , Xngram.terms.label("w_x")
                                # , Yngram.terms.label("w_y")
                                )
                         .join(Yindex, Xindex.node_id == Yindex.node_id)  # <- by definition of cooc
                         .join(Node, Node.id == Xindex.node_id)           # <- b/c within corpus
                         .filter(Node.parent_id == corpus.id)             # <- b/c within corpus
                         .filter(Node.typename == "DOCUMENT"))            # <- b/c within corpus

    # outerjoin the synonyms if needed
    if groupings_id:
        base_query = (base_query
                      .outerjoin(Xsyno,                    # <- synonyms for Xindex.ngrams
                                 Xsyno.c.ngram2_id == Xindex.ngram_id)
                      .outerjoin(Ysyno,                    # <- synonyms for Yindex.ngrams
                                 Ysyno.c.ngram2_id == Yindex.ngram_id))

    # 3) counting clause in any case
    coocs_query = (base_query
                   .group_by(Xindex_ngform_id, Yindex_ngform_id  # <- what we're counting
                             # for debug (3/4)
                             # , "w_x", "w_y"
                             )

                   # for debug (4/4)
                   # .join(Xngram, Xngram.id == Xindex_ngform_id)
                   # .join(Yngram, Yngram.id == Yindex_ngform_id)

                   .order_by(ucooc))

    # 4) INPUT FILTERS (reduce N before O(N²))
    if on_list_id:
        # TODO: different lists, or one list for x and all ngrams for y,
        #       would allow expanding the list to its nearest neighbours (MacLachlan)
        #       (with a rectangular matrix)

        m1 = aliased(NodeNgram)
        m2 = aliased(NodeNgram)

        coocs_query = (coocs_query
                       .join(m1, m1.ngram_id == Xindex_ngform_id)
                       .join(m2, m2.ngram_id == Yindex_ngform_id)
                       .filter(m1.node_id == on_list_id)
                       .filter(m2.node_id == on_list_id))

    if stoplist_id:
        s1 = (session.query(NodeNgram.ngram_id)
                     .filter(NodeNgram.node_id == stoplist_id)
                     .subquery())

        # further use as anonymous tables prevents simply doing s2 = s1
        s2 = (session.query(NodeNgram.ngram_id)
                     .filter(NodeNgram.node_id == stoplist_id)
                     .subquery())

        coocs_query = (coocs_query
                       .outerjoin(s1, s1.c.ngram_id == Xindex_ngform_id)
                       .outerjoin(s2, s2.c.ngram_id == Yindex_ngform_id)
                       # equivalent to NOT IN stoplist
                       .filter(s1.c.ngram_id == None)
                       .filter(s2.c.ngram_id == None))

    if diagonal_filter:
        # don't compute an ngram's cooccurrence with itself
        coocs_query = coocs_query.filter(Xindex_ngform_id != Yindex_ngform_id)

    if start or end:
        Time = aliased(NodeHyperdata)

        coocs_query = (coocs_query
                       .join(Time, Time.node_id == Xindex.node_id)
                       .filter(Time.key == "publication_date"))

        if start:
            if not isinstance(start, datetime):
                try:
                    start = datetime.strptime(start, '%Y-%m-%d')
                except:
                    raise TypeError("'start' param expects datetime object or %Y-%m-%d string")

            # the filtering by start limit
            coocs_query = coocs_query.filter(Time.value_utc >= start)

        if end:
            if not isinstance(end, datetime):
                try:
                    end = datetime.strptime(end, '%Y-%m-%d')
                except:
                    raise TypeError("'end' param expects datetime object or %Y-%m-%d string")

            # the filtering by end limit
            coocs_query = coocs_query.filter(Time.value_utc <= end)

    if symmetry_filter:
        # one filter taking the symmetry into account
        #  -> halves the work !!
        #  -> but retrieval will be more expensive, via OR queries such as:
        #         WHERE ngram1 = my_ngram OR ngram2 = my_ngram
        coocs_query = coocs_query.filter(Xindex_ngform_id < Yindex_ngform_id)

    # 5) OUTPUT FILTERS
    # ------------------
    # threshold
    # TODO: adjust COOC_THRESHOLD a posteriori:
    #       ex: sometimes 2, sometimes 4, depending on sparsity
    print("COOCS: filtering pairs under threshold:", threshold)
    coocs_query = coocs_query.having(ucooc >= threshold)

    # 6) EXECUTE QUERY
    # ----------------
    #  => storage in our matrix structure
    matrix = WeightedMatrix(coocs_query.all())

    # fyi
    shape_0 = len({pair[0] for pair in matrix.items})
    shape_1 = len({pair[1] for pair in matrix.items})
    print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))

    if just_pass_result:
        return matrix
    else:
        # 7) SAVE
        # --------
        # saving the parameters of the analysis in the Node JSON
        new_hyperdata = {'corpus': corpus.id, 'threshold': threshold}

        if overwrite_id:
            # overwrite pre-existing id
            the_cooc = cache.Node[overwrite_id]
            the_cooc.hyperdata = new_hyperdata
            the_cooc.save_hyperdata()
            session.commit()
            the_id = overwrite_id
        else:
            # create the new cooc node
            the_cooc = corpus.add_child(
                typename="COOCCURRENCES",
                name="Coocs (in:%s)" % corpus.name[0:10],
                hyperdata=new_hyperdata,
            )
            session.add(the_cooc)
            session.commit()
            the_id = the_cooc.id

        # ==> save all NodeNgramNgram with link to new cooc node id
        matrix.save(the_id)

        return the_id
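# Usage sketch (hypothetical ids; `corpus` is a CORPUS Node as elsewhere here):
#
#     matrix = compute_coocs(corpus, on_list_id=maplist_id, threshold=2)
#     # or, to persist a COOCCURRENCES node with its NodeNgramNgram rows:
#     cooc_node_id = compute_coocs(corpus, just_pass_result=False, threshold=2)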
def post(self, request, project_id):

    # example only
    input = request.data or {
        'x': {
            'with_empty': True,
            'resolution': 'decade',
            'value': 'publication_date',
        },
        'y': {
            # 'divided_by': 'total_ngrams_count',
            # 'divided_by': 'total_documents_count',
        },
        'filter': {
            # 'ngrams': ['bees', 'bee', 'honeybee', 'honeybees', 'honey bee', 'honey bees'],
            # 'ngrams': ['insecticide', 'pesticide'],
            # 'corpora': [52633],
            # 'date': {'min': '1995-12-31'}
        },
        # 'format': 'csv',
    }

    print(input)

    # input validation
    input = validate(input, {
        'type': dict,
        'default': {},
        'items': {
            'x': {
                'type': dict,
                'default': {},
                'items': {
                    # which hyperdata to choose for the date
                    'value': {
                        'type': str,
                        'default': 'publication_date',
                        'range': {'publication_date', }
                    },
                    # time resolution
                    'resolution': {
                        'type': str,
                        'range': self._resolutions.keys(),
                        'default': 'month'
                    },
                    # should we add zeroes for empty values?
                    'with_empty': {
                        'type': bool,
                        'default': False
                    },
                }
            },
            'y': {
                'type': dict,
                'default': {},
                'items': {
                    # measured value
                    'value': {
                        'type': str,
                        'default': 'ngrams_count',
                        'range': {'ngrams_count', 'documents_count', 'ngrams_tfidf'}
                    },
                    # value by which we should normalize
                    'divided_by': {
                        'type': str,
                        'range': {'total_documents_count',
                                  'documents_count', 'total_ngrams_count'}
                    },
                }
            },
            # filtering
            'filter': {
                'type': dict,
                'default': {},
                'items': {
                    # filter by metadata
                    'hyperdata': {
                        'type': list,
                        'default': [],
                        'items': {
                            'type': dict,
                            'items': {
                                'key': {'type': str, 'range': self._operators.keys()},
                                'operator': {'type': str},
                                'value': {'type': str},
                            }
                        }
                    },
                    # filter by date
                    'date': {
                        'type': dict,
                        'items': {
                            'min': {'type': datetime.datetime},
                            'max': {'type': datetime.datetime},
                        },
                        'default': {}
                    },
                    # filter by corpora
                    'corpora': {
                        'type': list,
                        'default': [],
                        'items': {'type': int}
                    },
                    # filter by ngrams
                    'ngrams': {
                        'type': list,
                        'default': [],
                        'items': {'type': str}
                    },
                }
            },
            # output format
            'format': {
                'type': str,
                'default': 'json',
                'range': {'json', 'csv'}
            },
        }
    })

    # build query: prepare columns
    X = aliased(NodeHyperdata)
    column_x = func.date_trunc(input['x']['resolution'], X.value_utc)
    column_y = {
        'documents_count': func.count(Node.id.distinct()),
        'ngrams_count': func.sum(NodeNgram.weight),
        # 'ngrams_tfidf': func.sum(NodeNodeNgram.weight),
    }[input['y']['value']]

    # build query: base
    print(input)
    query_base = (session
                  .query(column_x)
                  .select_from(Node)
                  .join(NodeNgram, NodeNgram.node_id == Node.id)
                  .join(X, X.node_id == NodeNgram.node_id)
                  #.filter(X.key == input['x']['value'])
                  .group_by(column_x)
                  .order_by(column_x))

    # build query: base, filter by corpora or project
    if 'corpora' in input['filter'] and input['filter']['corpora']:
        query_base = (query_base
                      .filter(Node.parent_id.in_(input['filter']['corpora'])))
    else:
        ParentNode = aliased(Node)
        query_base = (query_base
                      .join(ParentNode, ParentNode.id == Node.parent_id)
                      .filter(ParentNode.parent_id == project_id))

    # build query: base, filter by date
    if 'date' in input['filter']:
        if 'min' in input['filter']['date']:
            query_base = query_base.filter(X.value >= input['filter']['date']['min'])
        if 'max' in input['filter']['date']:
            query_base = query_base.filter(X.value <= input['filter']['date']['max'])

    # build query: filter by ngrams
    query_result = query_base.add_columns(column_y)
    if 'ngrams' in input['filter'] and input['filter']['ngrams']:
        query_result = (query_result
                        .join(Ngram, Ngram.id == NodeNgram.ngram_id)
                        .filter(Ngram.terms.in_(input['filter']['ngrams'])))

    # build query: filter by metadata
    if 'hyperdata' in input['filter']:
        for h, hyperdata in enumerate(input['filter']['hyperdata']):
            print(h, hyperdata)
            # get hyperdata in database
            #if hyperdata_model is None:
            #    continue
            #hyperdata_id, hyperdata_type = hyperdata_model
            # create an alias and query it
            operator = self._operators[hyperdata['operator']]
            type_string = type2string(INDEXED_HYPERDATA[hyperdata['key']]['type'])
            value = self._converters[type_string](hyperdata['value'])
            query_result = (query_result
                            .join(NodeHyperdata,
                                  NodeHyperdata.node_id == NodeNgram.node_id)
                            .filter(NodeHyperdata.key == hyperdata['key'])
                            .filter(operator(NodeHyperdata.value, value)))

    # build result: prepare data
    date_value_list = query_result.all()
    #print(date_value_list)

    if date_value_list:
        date_min = date_value_list[0][0].replace(tzinfo=None)
        date_max = date_value_list[-2][0].replace(tzinfo=None)

    # build result: prepare interval
    result = collections.OrderedDict()
    if input['x']['with_empty'] and date_value_list:
        compute_next_date = self._resolutions[input['x']['resolution']]
        date = date_min
        while date <= date_max:
            result[date] = 0.0
            date = compute_next_date(date)

    # build result: integrate
    for date, value in date_value_list[0:-1]:
        result[date.replace(tzinfo=None)] = value

    # build result: normalize
    query_normalize = None
    if date_value_list and 'divided_by' in input['y'] and input['y']['divided_by']:
        if input['y']['divided_by'] == 'total_documents_count':
            query_normalize = query_base.add_column(func.count(Node.id.distinct()))
        elif input['y']['divided_by'] == 'total_ngrams_count':
            query_normalize = query_base.add_column(func.sum(NodeNgram.weight))

    if query_normalize is not None:
        for date, value in query_normalize[0:-1]:
            date = date.replace(tzinfo=None)
            if date in result:
                result[date] /= value

    # return result with proper formatting
    if input['format'] == 'json':
        return JsonHttpResponse({
            'query': input,
            'result': sorted(result.items()),
        }, 201)
    elif input['format'] == 'csv':
        return CsvHttpResponse(sorted(result.items()), ('date', 'value'), 201)