def corpus_list(corpus_id, list_types=ALL_LIST_TYPES, with_synonyms=False,
                with_count=False):
    # Link between a GROUPLIST, a normal form (ngram1), and a synonym (ngram2)
    NNN = NodeNgramNgram

    # Get the list type from the Node type -- as in CSV export
    list_type = (case([(Node.typename == 'MAINLIST', 'main'),
                       (Node.typename == 'MAPLIST', 'map'),
                       (Node.typename == 'STOPLIST', 'stop')]).label('type'))

    # We will retrieve each ngram as the following tuple:
    entities = (list_type, Ngram.terms.label('ng'))

    if with_count:
        entities += (Ngram.id.label('id'),)

    # First, get ngrams from the wanted lists
    ngrams = _ngrams(corpus_id, list_types, entities)

    # Second, exclude "synonyms" (grouped ngrams that are not normal forms).
    # We have to exclude synonyms first because data is inconsistent and some
    # of them can be both in GROUPLIST and in MAIN/MAP/STOP lists. We want to
    # take synonyms from GROUPLIST only -- see below.
    Groups = aliased(Node, name='groups')
    query = (ngrams
             .outerjoin(Groups, (Groups.parent_id == corpus_id) &
                                (Groups.typename == 'GROUPLIST'))
             .outerjoin(NNN, (NNN.node_id == Groups.id) &
                             (NNN.ngram2_id == Ngram.id))
             .filter(NNN.ngram1_id == None))

    # If `with_synonyms` is True, add them from GROUPLIST: this is the
    # reliable source for them
    if with_synonyms:
        Synonym = aliased(Ngram)
        ent = (list_type, Synonym.terms.label('ng'), Synonym.id.label('id'))
        synonyms = (ngrams.with_entities(*ent)
                          .filter(NNN.ngram1_id == Ngram.id,
                                  NNN.ngram2_id == Synonym.id,
                                  NNN.node_id == Groups.id,
                                  Groups.parent_id == corpus_id,
                                  Groups.typename == 'GROUPLIST'))
        query = query.union(synonyms)

    # Again, data is inconsistent: MAINLIST may intersect with MAPLIST and
    # we don't want that
    if 'main' in list_types and 'map' not in list_types:
        # Exclude MAPLIST ngrams from MAINLIST
        query = query.except_(_ngrams(corpus_id, 'map', entities))

    if with_count:
        N = query.subquery()
        return (session.query(N.c.type, N.c.ng, NodeNodeNgram.score)
                       .join(Node, (Node.parent_id == corpus_id) &
                                   (Node.typename == 'OCCURRENCES'))
                       .outerjoin(NodeNodeNgram,
                                  (NodeNodeNgram.ngram_id == N.c.id) &
                                  (NodeNodeNgram.node1_id == Node.id) &
                                  (NodeNodeNgram.node2_id == corpus_id)))

    # Return found ngrams sorted by list type, then alphabetically
    return query.order_by('type', 'ng')
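# --- usage sketch (hypothetical corpus id; assumes the session, the model
# --- classes and the _ngrams() helper above are importable from this module)

# lists come back as (type, terms) rows, sorted by list type then alphabetically
for list_type, terms in corpus_list(corpus_id=42, with_synonyms=True):
    print(list_type, terms)

# with_count adds an occurrence score per row (possibly None when the ngram
# has no OCCURRENCES entry)
for list_type, terms, score in corpus_list(42, with_count=True):
    print(list_type, terms, score or 0)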
def query_groups(groupings_id, details=False):
    """
    Listing of couples (mainform, subform)
    aka (ngram1_id, ngram2_id)

    Parameter:
        - details: if False, just send the array of couples
                   if True, send quadruplets with
                   (ngram1_id, term1, ngram2_id, term2)
    """
    if not details:
        # simple contents
        query = session.query(NodeNgramNgram.ngram1_id,
                              NodeNgramNgram.ngram2_id)
    else:
        # detailed contents (id + terms)
        Ngram1 = aliased(Ngram)
        Ngram2 = aliased(Ngram)
        query = (session.query(NodeNgramNgram.ngram1_id,
                               Ngram1.terms,
                               NodeNgramNgram.ngram2_id,
                               Ngram2.terms)
                        .join(Ngram1, NodeNgramNgram.ngram1_id == Ngram1.id)
                        .join(Ngram2, NodeNgramNgram.ngram2_id == Ngram2.id))

    # main filter
    # -----------
    query = query.filter(NodeNgramNgram.node_id == groupings_id)

    return query
def nodes(parent=None, group_by='typename', order_by='typename',
          has_child='check'):
    if group_by or has_child is not None:
        select = [
            func.min(Node.id).label('id'),
            func.min(Node.name).label('name'),
            func.min(Node.typename).label('typename'),
            func.count(Node.id).label('cnt')
        ]
    else:
        select = [
            Node.id.label('id'),
            Node.name.label('name'),
            Node.typename.label('typename'),
            literal_column('1').label('cnt')
        ]

    if has_child is not None:
        N = aliased(Node)
        select.append(func.count(N.id).label('children'))
    else:
        select.append(literal_column('NULL').label('children'))

    parent_id = getattr(parent, 'id', parent)

    q = session.query(*select).filter_by(parent_id=parent_id) \
               .group_by(getattr(Node, group_by if group_by else 'id'))

    if has_child is not None:
        q = q.outerjoin(N, N.parent_id == Node.id).group_by(N.parent_id)

    return q.order_by(order_by)
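# --- usage sketch (hypothetical parent id): group the children of node 42 by
# --- typename, counting rows (cnt) and their own children (children)
for row in nodes(parent=42):
    print(row.typename, row.cnt, row.children)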
def query_groups(groupings_id, details=False, sort=False):
    """
    Listing of couples (mainform, subform)
    aka (ngram1_id, ngram2_id)

    Parameters:
        - details: if False, just send the array of couples
                   if True, send quadruplets with
                   (ngram1_id, term1, ngram2_id, term2)
        - sort: order results by terms of ngram1 then ngram2
    """
    if details or sort:
        Ngram1, Ngram2 = Ngram, aliased(Ngram)

    if not details:
        # simple contents
        columns = (NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id)
    else:
        # detailed contents (id + terms)
        columns = (Ngram1.id, Ngram1.terms, Ngram2.id, Ngram2.terms)

    query = session.query(*columns)

    if details or sort:
        query = (query.join(Ngram1, NodeNgramNgram.ngram1_id == Ngram1.id)
                      .join(Ngram2, NodeNgramNgram.ngram2_id == Ngram2.id))

    if sort:
        query = query.order_by(Ngram1.terms, Ngram2.terms)

    # main filter
    # -----------
    query = query.filter(NodeNgramNgram.node_id == groupings_id)

    return query
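# --- usage sketch (hypothetical grouping node id)

# all (mainform_id, subform_id) couples of a grouping node
couples = query_groups(groupings_id=1234).all()

# same listing with the terms, sorted alphabetically by mainform then subform
for ng1_id, term1, ng2_id, term2 in query_groups(1234, details=True, sort=True):
    print("%s <- %s" % (term1, term2))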
def contacts(self):
    """get all contacts in relation with the user"""
    Friend = aliased(User)
    query = (session.query(Friend)
                    .join(Contact, Contact.user2_id == Friend.id)
                    .filter(Contact.user1_id == self.id))
    return query.all()
def get(self, request, corpus_id, doc_id):
    """
    Get all ngrams for a doc id, sorted by list
    usual route: /annotations/documents/<docid>

    NB1: we are within a doc only
    NB2: MAINLIST items are actually MAINLIST without MAP items
    NB3: mostly the mainforms are in lists, but a doc can have subforms
         => if we simply join on ngram_id, we'll filter out the subforms
         => join on a value filled by a case switch:
            (the ngram itself, or its mainform if one exists)
    """
    corpus_id = int(corpus_id)
    doc_id = int(doc_id)

    # our results: ngrams within a doc and a list + weights in the doc
    doc_ngram_list = []
    doc_ngram_list_add = doc_ngram_list.append
    lists = {}

    corpus_nod = cache.Node[corpus_id]
    doc_nod = cache.Node[doc_id]
    # scores_nod = corpus_nod.children(typename="OCCURRENCES").first()
    groups_nod = corpus_nod.children(typename="GROUPLIST").first()

    # synonyms sub table for outerjoins
    Syno = (session.query(NodeNgramNgram.ngram1_id,
                          NodeNgramNgram.ngram2_id)
                   .filter(NodeNgramNgram.node_id == groups_nod.id)
                   .subquery())

    # maplist_ids to filter map ngrams from mainlist
    maplist_ids = {}

    # NB must do mainlist after map for filtering map items out of main
    for list_type in ['MAPLIST', 'STOPLIST', 'MAINLIST']:
        list_nod = corpus_nod.children(typename=list_type).first()
        list_id = list_nod.id
        lists["%s" % list_id] = list_type

        ListsTable = aliased(NodeNgram)

        mainform_id = case([(Syno.c.ngram1_id != None, Syno.c.ngram1_id),
                            (Syno.c.ngram1_id == None, Ngram.id)])

        q = (session
             # ngrams from the doc_id
             .query(NodeNgram.weight, Ngram, mainform_id)
             # debug
             # .query(NodeNgram.weight, Ngram.terms, Ngram.id,
             #        Syno.c.ngram1_id, mainform_id)
             .select_from(NodeNgram)
             .join(Ngram)
             .filter(NodeNgram.node_id == doc_id)
             # add mainforms next to their subforms
             .outerjoin(Syno, Syno.c.ngram2_id == Ngram.id)
             # filter mainforms on the list we want
             .join(ListsTable,
                   # possible that the mainform is in the list
                   # and not the subform
                   ListsTable.ngram_id == mainform_id)
             .filter(ListsTable.node_id == list_id))

        # add to results (and optional filtering)
        for (w, obj, mainform_id) in q.all():
            ngram_id = obj.id
            # boolean if needed
            # is_subform = (ngram_id == mainform_id)

            # special filtering case: when MAINLIST is requested we
            # actually want MAIN without MAP
            if list_type == "MAPLIST":
                maplist_ids[ngram_id] = True
            if list_type == "MAINLIST":
                if ngram_id in maplist_ids:
                    # skip object
                    continue

            if mainform_id == ngram_id:
                group = None
            else:
                group = mainform_id

            # normal case
            doc_ngram_list_add((ngram_id, obj.terms, group, w, list_id))

    # debug
    # print("annotations.views.NgramList.doc_ngram_list: ", doc_ngram_list)

    data = {
        '%s' % corpus_id: {
            '%s' % doc_id: [
                {
                    'uuid': ngram_id,
                    'group': group,  # the mainform if there is a group
                    'text': ngram_text,
                    'occs': ngram_occs,
                    'list_id': list_id,
                }
                for (ngram_id, ngram_text, group, ngram_occs, list_id)
                in doc_ngram_list
            ],
            'lists': lists
        }
    }

    # alternative format for sending the "annotations", keyed by list then
    # ngram_id:
    # { 'corpus_id' : {
    #       list_id_stop: {term_stop1: {term_data}, term_stop2: {term_data}..},
    #       list_id_miam: {term_miam1: {term_data}, term_miam2: {term_data}..},
    #       list_id_map:  {term_map1:  {term_data}, term_map2:  {term_data}..},
    #   }
    #   'lists' : {"list_id" : "list_type" ... }
    # }

    # NB 3rd possibility: the unicity of ngram_text could also allow us to
    # use it as key, and could enhance lookups later (frequent checks whether
    # a term exists)

    return Response(data)
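# --- for reference, a response payload for a hypothetical corpus 42 and doc
# --- 1337 (ids, terms and weights invented for illustration)
example_response_data = {
    '42': {
        '1337': [
            {'uuid': 294, 'group': None, 'text': 'climate',
             'occs': 3.0, 'list_id': 8001},
            # a subform: its 'group' points to the mainform's id
            {'uuid': 487, 'group': 294, 'text': 'climates',
             'occs': 1.0, 'list_id': 8001},
        ],
        'lists': {'8001': 'MAPLIST', '8002': 'STOPLIST', '8003': 'MAINLIST'}
    }
}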
def do_maplist(corpus,
               overwrite_id=None,
               mainlist_id=None,
               specclusion_id=None,
               genclusion_id=None,
               grouplist_id=None,
               limit=DEFAULT_MAPLIST_MAX,
               genclusion_part=DEFAULT_MAPLIST_GENCLUSION_RATIO,
               monograms_part=DEFAULT_MAPLIST_MONOGRAMS_RATIO):
    '''
    According to Genericity/Specificity and mainlist

    Parameters:
      - mainlist_id (starting point, already cleaned of stoplist terms)
      - specclusion_id (ngram inclusion by cooc specificity -- ranking factor)
      - genclusion_id (ngram inclusion by cooc genericity -- ranking factor)
      - grouplist_id (filtering grouped ones)
      - overwrite_id: optional id of a preexisting MAPLIST node to overwrite

      + 3 params to modulate the terms choice
      - limit: the amount of picked terms
      - monograms_part: the ratio of terms with only one lexical unit to keep
                        (multigrams quota = limit * (1-monograms_part))
      - genclusion_part: the ratio of terms picked by genericity ranking
                         (speclusion quota = limit * (1-genclusion_part))
    '''
    if not (mainlist_id and specclusion_id and genclusion_id and grouplist_id):
        raise ValueError("Please provide mainlist_id, specclusion_id, "
                         "genclusion_id and grouplist_id")

    quotas = {'topgen': {}, 'topspec': {}}
    genclusion_limit = round(limit * genclusion_part)
    speclusion_limit = limit - genclusion_limit
    quotas['topgen']['monograms'] = round(genclusion_limit * monograms_part)
    quotas['topgen']['multigrams'] = (genclusion_limit
                                      - quotas['topgen']['monograms'])
    quotas['topspec']['monograms'] = round(speclusion_limit * monograms_part)
    quotas['topspec']['multigrams'] = (speclusion_limit
                                       - quotas['topspec']['monograms'])

    print("MAPLIST quotas:", quotas)

    # dbg = DebugTime('Corpus #%d - computing Miam' % corpus.id)

    MainlistTable = aliased(NodeNgram)

    IsSubform = (session
                 # we want only secondary terms (ngram2)
                 # to be able to filter them out
                 .query(NodeNgramNgram.ngram2_id)
                 .filter(NodeNgramNgram.node_id == grouplist_id)
                 .subquery())

    ScoreSpec = aliased(NodeNgram)
    ScoreGen = aliased(NodeNgram)

    # ngrams with both ranking factors, spec and gen
    query = (session.query(ScoreSpec.ngram_id,
                           ScoreSpec.weight,
                           ScoreGen.weight,
                           Ngram.n)
                    .join(Ngram, Ngram.id == ScoreSpec.ngram_id)
                    .join(ScoreGen, ScoreGen.ngram_id == ScoreSpec.ngram_id)
                    .filter(ScoreSpec.node_id == specclusion_id)
                    .filter(ScoreGen.node_id == genclusion_id)
                    # we want only terms within mainlist
                    .join(MainlistTable, Ngram.id == MainlistTable.ngram_id)
                    .filter(MainlistTable.node_id == mainlist_id)
                    # we remove all ngrams matching an ngram2_id from the synonyms
                    .outerjoin(IsSubform,
                               IsSubform.c.ngram2_id == ScoreSpec.ngram_id)
                    .filter(IsSubform.c.ngram2_id == None)
                    # specificity-ranked
                    .order_by(desc(ScoreSpec.weight)))

    # format in scored_ngrams array:
    # -------------------------------
    # [(37723,   8.428,  14.239,   3   ), etc]
    #  ngramid   wspec   wgen    nwords
    scored_ngrams = query.all()
    n_ngrams = len(scored_ngrams)

    if n_ngrams == 0:
        raise ValueError("No ngrams in cooc table ?")
        # return

    # results, with same structure as quotas
    chosen_ngrams = {
        'topgen': {'monograms': [], 'multigrams': []},
        'topspec': {'monograms': [], 'multigrams': []}
    }

    # specificity and genericity are rather reverse-correlated,
    # but occasionally they can have common ngrams (same ngram well ranked
    # in both) => we'll use a lookup table to check whether we already
    # got an ngram
    already_gotten_ngramids = {}

    # 2 loops to fill the spec-clusion then the gen-clusion quotas
    # (1st loop uses order from DB, 2nd loop uses our own sort at end of 1st)
    for rkr in ['topspec', 'topgen']:
        got_enough_mono = False
        got_enough_multi = False
        all_done = False
        i = -1
        while ((not all_done) and
               (not (got_enough_mono and got_enough_multi))):
            # retrieve sorted ngram n° i
            i += 1
            (ng_id, wspec, wgen, nwords) = scored_ngrams[i]

            # before any continue case, we check the next i for max reached
            all_done = (i + 1 >= n_ngrams)

            if ng_id in already_gotten_ngramids:
                continue

            # NB: nwords could be replaced by a simple search on r' '
            if nwords == 1:
                if got_enough_mono:
                    continue
                else:
                    # add ngram to results and lookup
                    chosen_ngrams[rkr]['monograms'].append(ng_id)
                    already_gotten_ngramids[ng_id] = True
            # multi
            else:
                if got_enough_multi:
                    continue
                else:
                    # add ngram to results and lookup
                    chosen_ngrams[rkr]['multigrams'].append(ng_id)
                    already_gotten_ngramids[ng_id] = True

            got_enough_mono = (len(chosen_ngrams[rkr]['monograms'])
                               >= quotas[rkr]['monograms'])
            got_enough_multi = (len(chosen_ngrams[rkr]['multigrams'])
                                >= quotas[rkr]['multigrams'])

        # at the end of the first loop we just need to sort all
        # by the second ranker (gen)
        scored_ngrams = sorted(scored_ngrams,
                               key=lambda ng_infos: ng_infos[2],
                               reverse=True)

    obtained_spec_mono = len(chosen_ngrams['topspec']['monograms'])
    obtained_spec_multi = len(chosen_ngrams['topspec']['multigrams'])
    obtained_gen_mono = len(chosen_ngrams['topgen']['monograms'])
    obtained_gen_multi = len(chosen_ngrams['topgen']['multigrams'])
    obtained_total = obtained_spec_mono \
                     + obtained_spec_multi \
                     + obtained_gen_mono \
                     + obtained_gen_multi
    print("MAPLIST: top_spec_monograms =", obtained_spec_mono)
    print("MAPLIST: top_spec_multigrams =", obtained_spec_multi)
    print("MAPLIST: top_gen_monograms =", obtained_gen_mono)
    print("MAPLIST: top_gen_multigrams =", obtained_gen_multi)
    print("MAPLIST: kept %i ngrams in total " % obtained_total)

    obtained_data = chosen_ngrams['topspec']['monograms'] \
                    + chosen_ngrams['topspec']['multigrams'] \
                    + chosen_ngrams['topgen']['monograms'] \
                    + chosen_ngrams['topgen']['multigrams']

    # NEW MAPLIST NODE
    # -----------------
    # saving the parameters of the analysis in the Node JSON
    new_hyperdata = {
        'corpus': corpus.id,
        'limit': limit,
        'monograms_part': monograms_part,
        'genclusion_part': genclusion_part,
    }
    if overwrite_id:
        # overwrite pre-existing node
        the_maplist = cache.Node[overwrite_id]
        the_maplist.hyperdata = new_hyperdata
        the_maplist.save_hyperdata()
        session.commit()
        the_id = overwrite_id
    else:
        # create a new maplist node
        the_maplist = corpus.add_child(name="Maplist (in %i)" % corpus.id,
                                       typename="MAPLIST",
                                       hyperdata=new_hyperdata)
        session.add(the_maplist)
        session.commit()
        the_id = the_maplist.id

    # create UnweightedList object and save (=> new NodeNgram rows)
    datalist = UnweightedList(obtained_data)

    # save
    datalist.save(the_id)

    # dbg.show('MapList computed')
    return the_id
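# --- a worked example of the quota arithmetic above (values invented):
limit, genclusion_part, monograms_part = 350, 0.6, 0.2

genclusion_limit = round(limit * genclusion_part)  # 210 terms by genericity
speclusion_limit = limit - genclusion_limit        # 140 terms by specificity

quotas = {
    'topgen':  {'monograms': round(genclusion_limit * monograms_part)},  # 42
    'topspec': {'monograms': round(speclusion_limit * monograms_part)},  # 28
}
quotas['topgen']['multigrams'] = genclusion_limit - quotas['topgen']['monograms']    # 168
quotas['topspec']['multigrams'] = speclusion_limit - quotas['topspec']['monograms']  # 112
# 42 + 168 + 28 + 112 == 350 == limit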
def countCooccurrences(corpus_id=None, cooc_id=None,
                       field1='ngrams', field2='ngrams',
                       start=None, end=None,
                       mapList_id=None, groupList_id=None,
                       distance=None, bridgeness=None,
                       n_min=1, n_max=None, limit=1000,
                       isMonopartite=True, threshold=3,
                       save_on_db=True, reset=True):
    '''
    Compute the cooccurrence matrix and save it, returning
    NodeNgramNgram.node_id

    For the moment, lists of parameters are not supported because
    lists need to be merged beforehand.

    corpus       :: Corpus
    mapList_id   :: Int
    groupList_id :: Int
    start :: TimeStamp -- example: '2010-05-30 02:00:00+02'
    end   :: TimeStamp
    limit :: Int
    '''
    # FIXME remove the lines below after factorization of parameters
    parameters = dict()
    parameters['field1'] = field1
    parameters['field2'] = field2

    # Get corpus as a Python object
    corpus = session.query(Node).filter(Node.id == corpus_id).first()

    # Get node of the Graph
    if not cooc_id:
        cooc_id = (session.query(Node.id)
                          .filter(Node.typename == "COOCCURRENCES",
                                  Node.name == "GRAPH EXPLORER",
                                  Node.parent_id == corpus.id)
                          .first())
        if not cooc_id:
            coocNode = corpus.add_child(
                typename="COOCCURRENCES",
                name="GRAPH (in corpus %s)" % corpus.id)
            session.add(coocNode)
            session.commit()
            cooc_id = coocNode.id
        else:
            cooc_id = int(cooc_id[0])

    # when cooc_id preexisted, but we want to continue (reset = True)
    # (to give new contents to this cooc_id)
    elif reset:
        print("GRAPH #%s ... Counting new cooccurrences data." % cooc_id)
        session.query(NodeNgramNgram).filter(
            NodeNgramNgram.node_id == cooc_id).delete()
        session.commit()

    # when cooc_id preexisted and we just want to load it (reset = False)
    else:
        print("GRAPH #%s ... Loading cooccurrences computed already." % cooc_id)
        cooc = (session.query(NodeNgramNgram.ngram1_id,
                              NodeNgramNgram.ngram2_id,
                              NodeNgramNgram.weight)
                       .filter(NodeNgramNgram.node_id == cooc_id)
                       .all())
        return (int(cooc_id), WeightedMatrix(cooc))

    NodeNgramX = aliased(NodeNgram)

    # Simple cooccurrences
    cooc_score = func.count(NodeNgramX.node_id).label('cooc_score')

    # A kind of Euclidean-distance cooccurrences
    # cooc_score = func.sqrt(func.sum(NodeNgramX.weight * NodeNgramY.weight)).label('cooc_score')

    if isMonopartite:
        NodeNgramY = aliased(NodeNgram)
        cooc_query = (session.query(NodeNgramX.ngram_id,
                                    NodeNgramY.ngram_id,
                                    cooc_score)
                             .join(Node, Node.id == NodeNgramX.node_id)
                             .join(NodeNgramY, NodeNgramY.node_id == Node.id)
                             .filter(Node.parent_id == corpus.id,
                                     Node.typename == "DOCUMENT"))
    else:
        NodeNgramY = aliased(NodeNgram)
        cooc_query = (session.query(NodeHyperdataNgram.ngram_id,
                                    NodeNgramY.ngram_id,
                                    cooc_score)
                             .join(Node,
                                   Node.id == NodeHyperdataNgram.node_id)
                             .join(NodeNgramY, NodeNgramY.node_id == Node.id)
                             .join(Hyperdata,
                                   Hyperdata.id == NodeHyperdataNgram.hyperdata_id)
                             .filter(Node.parent_id == corpus.id,
                                     Node.typename == "DOCUMENT")
                             .filter(Hyperdata.name == field1))

    # Size of the ngrams between n_min and n_max
    if n_min is not None or n_max is not None:
        if isMonopartite:
            NgramX = aliased(Ngram)
            cooc_query = cooc_query.join(NgramX,
                                         NgramX.id == NodeNgramX.ngram_id)

        NgramY = aliased(Ngram)
        cooc_query = cooc_query.join(NgramY,
                                     NgramY.id == NodeNgramY.ngram_id)

        if n_min is not None:
            cooc_query = cooc_query.filter(NgramY.n >= n_min)
            if isMonopartite:
                cooc_query = cooc_query.filter(NgramX.n >= n_min)

        if n_max is not None:
            cooc_query = cooc_query.filter(NgramY.n <= n_max)
            if isMonopartite:
                cooc_query = cooc_query.filter(NgramX.n <= n_max)

    # Cooc between the dates start and end
    if start is not None:
        # date_start = datetime.datetime.strptime("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
        # TODO: more precise date format here (day is the smallest grain actually)
        date_start = datetime.strptime(str(start), "%Y-%m-%d")
        date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")
        Start = aliased(NodeHyperdata)
        cooc_query = (cooc_query.join(Start, Start.node_id == Node.id)
                                .filter(Start.key == 'publication_date')
                                .filter(Start.value_utc >= date_start_utc))
        parameters['start'] = date_start_utc

    if end is not None:
        # TODO: more precise date format here (day is the smallest grain actually)
        date_end = datetime.strptime(str(end), "%Y-%m-%d")
        date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")
        End = aliased(NodeHyperdata)
        cooc_query = (cooc_query.join(End, End.node_id == Node.id)
                                .filter(End.key == 'publication_date')
                                .filter(End.value_utc <= date_end_utc))
        parameters['end'] = date_end_utc

    if isMonopartite:
        # Cooc is symmetric: take only the main cooccurrences and cut at the limit
        cooc_query = cooc_query.filter(
            NodeNgramX.ngram_id < NodeNgramY.ngram_id)

    cooc_query = cooc_query.having(cooc_score >= threshold)

    if isMonopartite:
        cooc_query = cooc_query.group_by(NodeNgramX.ngram_id,
                                         NodeNgramY.ngram_id)
    else:
        cooc_query = cooc_query.group_by(NodeHyperdataNgram.ngram_id,
                                         NodeNgramY.ngram_id)

    # Order according to some scores
    # If ordering is really needed, use an ordered index (faster)
    # cooc_query = cooc_query.order_by(desc('cooc_score'))

    matrix = WeightedMatrix(cooc_query)

    print("GRAPH #%s Filtering the matrix with Map and Group Lists." % cooc_id)
    cooc = filterMatrix(matrix, mapList_id, groupList_id)

    parameters['MapList_id'] = str(mapList_id)
    parameters['GroupList_id'] = str(groupList_id)

    # TODO factorize savings on db
    if save_on_db:
        # Saving the cooccurrences
        cooc.save(cooc_id)
        print("GRAPH #%s ... Node Cooccurrence Matrix saved" % cooc_id)

        # Saving the parameters
        print("GRAPH #%s ... Parameters saved in Node." % cooc_id)
        coocNode = session.query(Node).filter(Node.id == cooc_id).first()
        coocNode.hyperdata["parameters"] = parameters
        coocNode.save_hyperdata()
        session.commit()

        # data = cooc2graph(coocNode.id, cooc, distance=distance, bridgeness=bridgeness)
    else:
        return cooc

    return (coocNode.id, cooc)
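# --- usage sketch (hypothetical ids; assumes session and models are set up)

# compute and save the matrix, returning (cooc node id, filtered matrix)
cooc_node_id, matrix = countCooccurrences(corpus_id=42,
                                          mapList_id=8001, groupList_id=8004,
                                          start='2010-01-01', end='2012-12-31',
                                          threshold=3)

# with reset=False and an existing cooc_id, the saved matrix is simply reloaded
cooc_node_id, matrix = countCooccurrences(corpus_id=42, cooc_id=cooc_node_id,
                                          reset=False)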
def get_graph(request=None, corpus=None,
              field1='ngrams', field2='ngrams',
              mapList_id=None, groupList_id=None,
              cooc_id=None, type='node_link',
              start=None, end=None,
              distance='conditional', bridgeness=5,
              threshold=1, isMonopartite=True,
              saveOnly=True):
    '''
    get_graph: main steps:
    0) Check the parameters

    get_graph :: GraphParameters -> Either (Dic Nodes Links) (Dic State Length)
        where type Length = Int

    get_graph first checks the parameters and returns either graph data or a
    dict with a "state" and an integer indicating the size of the offending
    parameter (maybe we could add a String in that step to factorize and
    carry the error message)

    1) compute_graph (see function above)
    2) return graph
    '''
    overwrite_node_contents = False

    # Case when the graph has been computed already
    if cooc_id is not None:
        print("GRAPH#%d ... Loading data already computed." % int(cooc_id))
        node = session.query(Node).filter(Node.id == cooc_id).first()

        # Structure of the Node.hyperdata[distance][bridgeness]
        # All parameters (but distance and bridgeness)
        # are in Node.hyperdata["parameters"]

        # Check distance of the graph
        if node.hyperdata.get(distance, None) is not None:
            graph = node.hyperdata[distance]

            # Check bridgeness of the graph
            if graph.get(str(bridgeness), None) is not None:
                return graph[str(bridgeness)]

    # new graph: we give it an empty node with new id and status
    elif saveOnly:
        # NB: we do the creation already here (instead of the same in
        # countCooccurrences) to guarantee a unique ref id to the saveOnly
        # graph (async generation)
        new_node = corpus.add_child(typename="COOCCURRENCES",
                                    name="GRAPH (in corpus %s)" % corpus.id)
        session.add(new_node)
        session.commit()
        cooc_id = new_node.id
        cooc_name = new_node.name
        cooc_date = new_node.date

        # and the empty content will need redoing by countCooccurrences
        overwrite_node_contents = True
        print("GRAPH #%d ... Created new empty data node for saveOnly"
              % int(cooc_id))

    # Case when the graph has not been computed already
    # First, check the parameters

    # Case of mapList not big enough
    # ==============================
    # if we do not have any mapList_id already
    if mapList_id is None:
        mapList_id = session.query(Node.id) \
                            .filter(Node.typename == "MAPLIST").first()[0]

    mapList_size = session.query(NodeNgram) \
                          .filter(NodeNgram.node_id == mapList_id).count()

    if mapList_size < graph_constraints['mapList']:
        # Do not compute the graph if mapList is not big enough
        return {'state': "mapListError", "length": mapList_size}

    # Instantiate query for case of corpus not big enough
    # ===================================================
    corpus_size_query = (session.query(Node)
                                .filter(Node.typename == "DOCUMENT")
                                .filter(Node.parent_id == corpus.id))

    # Filter corpus by date if any start date
    # ---------------------------------------
    if start is not None:
        # date_start = datetime.datetime.strptime("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
        date_start = datetime.strptime(str(start), "%Y-%m-%d")
        date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")
        Start = aliased(NodeHyperdata)
        corpus_size_query = (corpus_size_query
                             .join(Start, Start.node_id == Node.id)
                             .filter(Start.key == 'publication_date')
                             .filter(Start.value_utc >= date_start_utc))

    # Filter corpus by date if any end date
    # -------------------------------------
    if end is not None:
        date_end = datetime.strptime(str(end), "%Y-%m-%d")
        date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")
        End = aliased(NodeHyperdata)
        corpus_size_query = (corpus_size_query
                             .join(End, End.node_id == Node.id)
                             .filter(End.key == 'publication_date')
                             .filter(End.value_utc <= date_end_utc))

    # Finally, test whether the corpus is big enough
    # ----------------------------------------------
    corpus_size = corpus_size_query.count()

    if saveOnly is not None and saveOnly == "True":
        scheduled(compute_graph)(corpus_id=corpus.id, cooc_id=cooc_id,
                                 # field1="ngrams", field2="ngrams",
                                 start=start, end=end,
                                 mapList_id=mapList_id,
                                 groupList_id=groupList_id,
                                 isMonopartite=True, threshold=threshold,
                                 distance=distance, bridgeness=bridgeness,
                                 save_on_db=True,
                                 reset=overwrite_node_contents
                                 # , limit=size
                                 )
        return {"state": "saveOnly",
                "target_id": cooc_id,
                "target_name": cooc_name,
                "target_date": cooc_date}

    elif corpus_size > graph_constraints['corpusMax']:
        # Then compute cooc asynchronously with celery
        scheduled(compute_graph)(corpus_id=corpus.id, cooc_id=cooc_id,
                                 # field1="ngrams", field2="ngrams",
                                 start=start, end=end,
                                 mapList_id=mapList_id,
                                 groupList_id=groupList_id,
                                 isMonopartite=True, threshold=threshold,
                                 distance=distance, bridgeness=bridgeness,
                                 save_on_db=True,
                                 reset=overwrite_node_contents
                                 # , limit=size
                                 )
        # Dict to inform the user that the corpus maximum is reached,
        # so the graph is computed asynchronously
        return {"state": "corpusMax", "length": corpus_size}

    elif corpus_size <= graph_constraints['corpusMin']:
        # Do not compute the graph if corpus is not big enough
        return {"state": "corpusMin", "length": corpus_size}

    else:
        # If graph_constraints are ok then compute the graph live
        data = compute_graph(corpus_id=corpus.id, cooc_id=cooc_id,
                             # field1="ngrams", field2="ngrams",
                             start=start, end=end,
                             mapList_id=mapList_id,
                             groupList_id=groupList_id,
                             isMonopartite=True, threshold=threshold,
                             distance=distance, bridgeness=bridgeness,
                             save_on_db=True,
                             reset=overwrite_node_contents
                             # , limit=size
                             )

    # case when 0 coocs are observed (usually b/c not enough ngrams in maplist)
    if len(data) == 0:
        print("GRAPH # ... GET_GRAPH: 0 coocs in matrix")
        data = {'nodes': [], 'links': []}  # empty data

    return data
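# --- the possible return shapes of get_graph, for reference (values invented):
# either the graph data itself...
#     {'nodes': [...], 'links': [...]}
# ...or a state dict telling the caller why the graph is not ready yet:
#     {'state': 'mapListError', 'length': 12}     # maplist too small
#     {'state': 'corpusMin', 'length': 7}         # corpus too small
#     {'state': 'corpusMax', 'length': 25000}     # too big => computed async
#     {'state': 'saveOnly', 'target_id': 123,
#      'target_name': 'GRAPH (in corpus 42)', 'target_date': '...'}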
def compute_tfidf_local(corpus, on_list_id=None, groupings_id=None,
                        overwrite_id=None):
    """
    Calculates the tfidf similarity of each (doc, ngram) couple,
    within the current corpus

    Parameters:
      - the corpus itself
      - groupings_id: optional synonym relations to add all subform counts
                      with their mainform's counts
      - on_list_id: mainlist or maplist type, to constrain the input ngrams
      - overwrite_id: optional id of a pre-existing TFIDF-XXXX node for this
                      corpus (the Node and its previous NodeNodeNgram rows
                      will be replaced)
    """
    # All docs of this corpus
    docids_subquery = (session.query(Node.id)
                              .filter(Node.parent_id == corpus.id)
                              .filter(Node.typename == "DOCUMENT")
                              .subquery())

    # N
    total_docs = session.query(docids_subquery).count()

    # define the counted form
    if not groupings_id:
        ngform_id = NodeNgram.ngram_id
    else:
        Syno = (session.query(NodeNgramNgram.ngram1_id,
                              NodeNgramNgram.ngram2_id)
                       .filter(NodeNgramNgram.node_id == groupings_id)
                       .subquery())

        ngform_id = case([(Syno.c.ngram1_id != None, Syno.c.ngram1_id),
                          (Syno.c.ngram1_id == None, NodeNgram.ngram_id)])

    # tf for each couple (number of rows = N docs X M ngrams)
    tf_doc_query = (session.query(
                        ngform_id,
                        NodeNgram.node_id,
                        func.sum(NodeNgram.weight).label("tf"),  # tf: occurrences
                    )
                    # select within docs of current corpus
                    .join(docids_subquery,
                          docids_subquery.c.id == NodeNgram.node_id))

    if groupings_id:
        tf_doc_query = (tf_doc_query
                        .outerjoin(Syno, Syno.c.ngram2_id == NodeNgram.ngram_id))
        # now when we group_by, the ngram2 freqs will be added to ngram1

    if on_list_id:
        Miamlist = aliased(NodeNgram)
        tf_doc_query = (tf_doc_query
                        .join(Miamlist, Miamlist.ngram_id == ngform_id)
                        .filter(Miamlist.node_id == on_list_id))

    # execute the query to do our tf sum
    tf_per_doc = tf_doc_query.group_by(NodeNgram.node_id, ngform_id).all()

    # ex: [(128371, 9732, 1.0),
    #      (128383, 9740, 1.0),
    #      (128373, 9731, 1.0),
    #      (128376, 9734, 1.0),
    #      (128372, 9731, 1.0),
    #      (128383, 9733, 1.0),
    #      (128383, 9735, 1.0),
    #      (128389, 9734, 1.0),
    #      (8624,   9731, 1.0),
    #      (128382, 9740, 1.0),
    #      (128383, 9739, 1.0),
    #      (128383, 9736, 1.0),
    #      (128378, 9735, 1.0),
    #      (128375, 9733, 4.0),
    #      (128383, 9732, 1.0)]
    #         ^      ^     ^^
    #       ngram   doc   freq in this doc

    # simultaneously count docs with a given term (number of rows = M ngrams)
    ndocswithngram = {}
    for triple in tf_per_doc:
        ng = triple[0]
        doc = triple[1]
        if ng in ndocswithngram:
            ndocswithngram[ng] += 1
        else:
            ndocswithngram[ng] = 1
    # print(ndocswithngram)

    # store for use in the formula
    # { ngram_id => log(nd) }
    log_nd_lookup = {ng: log(nd_count)
                     for (ng, nd_count) in ndocswithngram.items()}

    # ---------------------------------------------------------
    tfidfs = {}
    log_tot_docs = log(total_docs)
    for (ngram_id, node_id, tf) in tf_per_doc:
        log_nd = log_nd_lookup[ngram_id]
        # tfidfs[ngram_id] = tf * log(total_docs/nd)
        tfidfs[node_id, ngram_id] = tf * (log_tot_docs - log_nd)
    # ---------------------------------------------------------

    if overwrite_id:
        the_id = overwrite_id
        session.query(NodeNodeNgram).filter(
            NodeNodeNgram.node1_id == the_id).delete()
        session.commit()
    else:
        # create the new TFIDF-CORPUS node
        tfidf_node = corpus.add_child()
        tfidf_node.typename = "TFIDF-CORPUS"
        tfidf_node.name = "tfidf-sims-corpus (in:%s)" % corpus.id
        session.add(tfidf_node)
        session.commit()
        the_id = tfidf_node.id

    # reflect that in NodeNodeNgrams
    # £TODO replace bulk_insert by something like WeightedIndex.save()
    bulk_insert(NodeNodeNgram,
                ('node1_id', 'node2_id', 'ngram_id', 'score'),
                ((the_id, node_id, ngram_id, tfidfs[node_id, ngram_id])
                 for (node_id, ngram_id) in tfidfs))

    return the_id
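# --- a worked instance of the tfidf formula above (numbers invented):
# --- tf * (log(N) - log(nd)) is just tf * log(N/nd) expanded by log rules
from math import log

total_docs = 1000  # N: docs in the corpus
tf = 4             # occurrences of the term in this doc
nd = 50            # docs containing the term

tfidf = tf * (log(total_docs) - log(nd))  # = 4 * log(20), about 11.98
assert abs(tfidf - tf * log(total_docs / nd)) < 1e-9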
def compute_ti_ranking(corpus, groupings_id=None,
                       count_scope="local", termset_scope="local",
                       overwrite_id=None):
    """
    Calculates a tfidf ranking within the given scope, via a weighting of
    cumulated tfidf, per ngram ng_i (or per mainform ng_i' if groups)
    across some docs d_j:

        rank(ng_i) = Sum_j(tf_ij) * ln(N / |{d_j : ng_i ∈ d_j}|)

    Parameters:
      - the corpus itself (or corpus_id)
      - groupings_id: optional id of a GROUPLIST node for these ngrams
            IF absent the ti weights are the sums for each ngram
            IF present they're the sums for each ngram's mainform
      - count_scope: {"local" or "global"}
            - local  <=> frequencies counted in the current corpus
            - global <=> frequencies counted in all corpora of this type

        when the count_scope is global, there is another parameter:
          - termset_scope: {"local" or "global"}
            - local  <=> output list of terms limited to the current corpus
              (SELECT DISTINCT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
            - global <=> output list of terms found in the global doc scope
              !!!! (many more terms)

      - overwrite_id: optional id of a pre-existing XXXX node for this corpus
                      (the Node and its previous NodeNodeNgram rows will be
                      replaced)
    """
    # validate string params
    if count_scope not in ["local", "global"]:
        raise ValueError("compute_ti_ranking: count_scope param allowed "
                         "values: 'local', 'global'")
    if termset_scope not in ["local", "global"]:
        raise ValueError("compute_ti_ranking: termset_scope param allowed "
                         "values: 'local', 'global'")
    if count_scope == "local" and termset_scope == "global":
        raise ValueError("compute_ti_ranking: the termset_scope param can be "
                         "'global' iff the count_scope param is 'global' too.")

    # get corpus
    if type(corpus) == int:
        corpus_id = corpus
        corpus = cache.Node[corpus_id]
    elif type(corpus) == str and match(r'\d+$', corpus):
        corpus_id = int(corpus)
        corpus = cache.Node[corpus_id]
    else:
        # assuming Node class
        corpus_id = corpus.id

    # prepare the sqla mainform vs ngram selector
    ngform_i = None

    if not groupings_id:
        ngform_i = NodeNgram.ngram_id
    else:
        # prepare translations
        syno = (session.query(NodeNgramNgram.ngram1_id,
                              NodeNgramNgram.ngram2_id)
                       .filter(NodeNgramNgram.node_id == groupings_id)
                       .subquery())

        # see the detailed comment in compute_occs() + TODO factorize
        ngform_i = case([(syno.c.ngram1_id != None, syno.c.ngram1_id),
                         (syno.c.ngram1_id == None, NodeNgram.ngram_id)
                         #      condition                value
                         ])

    # MAIN QUERY SKELETON
    tf_nd_query = (session.query(
                       # NodeNgram.ngram_id
                       # or similar if grouping ngrams under their mainform
                       ngform_i.label("counted_ngform"),

                       # the tfidf elements
                       # ------------------
                       func.sum(NodeNgram.weight),    # tf: same as occurrences
                                                      # -----------------------
                       func.count(NodeNgram.node_id)  # nd: n docs with term
                                                      # --------------------
                   )
                   .group_by("counted_ngform")

                   # count_scope to specify in which doc nodes to count
                   # -----------
                   # .join(countdocs_subquery,
                   #       countdocs_subquery.c.id == NodeNgram.node_id)

                   # optional termset_scope: if we'll restrict the ngrams
                   # -------------
                   # .join(termset_subquery,
                   #       termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)

                   # optional translations to bring the subform's replacement
                   # ------------
                   # .outerjoin(syno,
                   #            syno.c.ngram2_id == NodeNgram.ngram_id)
                   )

    # TUNING THE QUERY
    if groupings_id:
        tf_nd_query = tf_nd_query.outerjoin(
            syno, syno.c.ngram2_id == NodeNgram.ngram_id)

    # local <=> within this corpus
    if count_scope == "local":
        # All docs of this corpus
        countdocs_subquery = (session.query(Node.id)
                                     .filter(Node.typename == "DOCUMENT")
                                     .filter(Node.parent_id == corpus_id)
                                     .subquery())

        # no need to independently restrict the ngrams
        tf_nd_query = tf_nd_query.join(
            countdocs_subquery,
            countdocs_subquery.c.id == NodeNgram.node_id)
        # ---

    # global <=> within all corpora of this source
    elif count_scope == "global":
        this_source_type = corpus.resources()[0]['type']

        CorpusNode = aliased(Node)

        # All docs **in all corpora of the same source**
        countdocs_subquery = (session.query(Node.id)
                              .filter(Node.typename == "DOCUMENT")
                              # join on parent_id with selected corpora nodes
                              .join(CorpusNode, CorpusNode.id == Node.parent_id)
                              .filter(CorpusNode.typename == "CORPUS")
                              # TODO index corpus_sourcetype in DB
                              .filter(CorpusNode.hyperdata['resources'][0]['type']
                                      .astext == str(this_source_type))
                              .subquery())

        if termset_scope == "global":
            # both scopes are the same: no need to independently restrict
            # the ngrams
            tf_nd_query = tf_nd_query.join(
                countdocs_subquery,
                countdocs_subquery.c.id == NodeNgram.node_id)
            # ---

        elif termset_scope == "local":
            # All unique terms...
            termset_subquery = (session
                                .query(distinct(NodeNgram.ngram_id)
                                       .label("uniq_ngid"))
                                # ... in the original corpus
                                .join(Node)
                                .filter(Node.typename == "DOCUMENT")
                                .filter(Node.parent_id == corpus_id)
                                .subquery())

            # only case of independent restrictions on docs and terms
            tf_nd_query = (tf_nd_query
                           .join(countdocs_subquery,
                                 countdocs_subquery.c.id == NodeNgram.node_id)
                           .join(termset_subquery,
                                 termset_subquery.c.uniq_ngid == NodeNgram.ngram_id))
            # ---

    # M
    total_docs = session.query(countdocs_subquery).count()
    log_tot_docs = log(total_docs)

    # result
    tf_nd = tf_nd_query.all()

    # ------------------ summation over each term i ------------------
    tfidfsum = {}
    for (ngram_i, tf_i, nd_i) in tf_nd:
        # tfidfsum[ngram_i] = tf_i * log(total_docs/nd_i)
        tfidfsum[ngram_i] = tf_i * (log_tot_docs - log(nd_i))
    # -----------------------------------------------------------------

    # N, for information
    total_ngramforms = len(tfidfsum)

    if overwrite_id:
        the_id = overwrite_id
        session.query(NodeNodeNgram).filter(
            NodeNodeNgram.node1_id == the_id).delete()
        session.commit()
    else:
        # create the new TFIDF-XXXX node to get an id
        tir_nd = corpus.add_child()
        if count_scope == "local":
            tir_nd.typename = "TIRANK-CORPUS"
            tir_nd.name = "ti rank (%i ngforms in corpus:%s)" % (
                total_ngramforms, corpus_id)
        elif count_scope == "global":
            tir_nd.typename = "TIRANK-GLOBAL"
            tir_nd.name = "ti rank (%i ngforms %s in corpora of sourcetype:%s)" % (
                total_ngramforms,
                ("from corpus %i" % corpus_id) if (termset_scope == "local") else "",
                this_source_type)

        session.add(tir_nd)
        session.commit()
        the_id = tir_nd.id

    # TODO 1 discuss use and find new typename
    # TODO 2 release these 2 typenames TFIDF-CORPUS and TFIDF-GLOBAL
    # TODO 3 recreate them elsewhere in their sims (WeightedIndex) version
    # TODO 4 requalify this here as a NodeNgram
    # then TODO 5 use WeightedList.save() !

    # reflect that in NodeNodeNgrams
    bulk_insert(NodeNodeNgram,
                ('node1_id', 'node2_id', 'ngram_id', 'score'),
                ((the_id, corpus_id, ng, tfidfsum[ng]) for ng in tfidfsum))

    return the_id
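# --- usage sketch (hypothetical grouping id; corpus is a Node or an id)

# rank mainforms using counts from the current corpus only
tirank_corpus_id = compute_ti_ranking(corpus, groupings_id=8004)

# rank the corpus terms against counts from all corpora of the same source
tirank_global_id = compute_ti_ranking(corpus,
                                      count_scope="global",
                                      termset_scope="local")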
def compute_coocs(corpus,
                  overwrite_id=None,
                  just_pass_result=True,  # just return the WeightedMatrix
                                          # (don't write to DB)
                  threshold=DEFAULT_COOC_THRESHOLD,
                  groupings_id=None,
                  on_list_id=None,
                  stoplist_id=None,
                  start=None,
                  end=None,
                  symmetry_filter=False,
                  diagonal_filter=True):
    """
    Count how often some extracted terms appear together in a small context
    (document) throughout a larger context (corpus).

     [NodeNgram]                   [NodeNgramNgram]

     node_id | ngram_id | weight   ngram1_id | ngram2_id | score |
     --------+----------+--------  ----------+-----------+-------+
      MyDocA |   487    |   1   =>    487    |    294    |   2   |
      MyDocA |   294    |   3
      MyDocB |   487    |   1
      MyDocB |   294    |   4

    Fill that info in DB:
      - a *new* COOCCURRENCES node
      - and all corresponding NodeNgramNgram rows

    worst-case complexity ~ O(N²/2) with N = number of ngrams

    If a mainlist is provided, we filter doc ngrams to those also in the list.

    Parameters:
      - the corpus node
      - overwrite_id: id of a pre-existing COOCCURRENCES node for this corpus
                      (all hyperdata and previous NodeNgramNgram rows will be
                      replaced)
      - threshold: on output cooc count (previously called hapax)
      - groupings_id: optional synonym relations to add all subform counts
                      with their mainform's counts
      - on_list_id: mainlist or maplist type, to constrain the input ngrams
      - stoplist_id: stoplist for filtering input ngrams (normally unnecessary
                     if a mainlist is already provided)
      - start, end: provide one or both temporal limits to filter on doc date
                    NB the expected type of parameter value is
                    datetime.datetime (a string is also possible but its
                    format must follow this convention: "2001-01-01"
                    aka "%Y-%m-%d")
      - symmetry_filter: prevent calculating where ngram1_id > ngram2_id
      - diagonal_filter: prevent calculating where ngram1_id == ngram2_id

    (deprecated parameters)
      - field1,2: allowed counting other things than ngrams (eg tags) but
                  no use case at present
      - isMonopartite: ?? used a nodes_hyperdata_ngrams table ???

    basic idea for one doc
    ======================
    each pair of ngrams sharing the same doc (node_id)

        SELECT idxa.ngram_id, idxb.ngram_id
        FROM nodes_ngrams AS idxa
        ---------------------------------
        JOIN nodes_ngrams AS idxb
        ON idxa.node_id = idxb.node_id      <== that's cooc
        ---------------------------------
        AND idxa.ngram_id <> idxb.ngram_id     (diagonal_filter)
        AND idxa.node_id = MY_DOC ;

    on the entire corpus
    ====================
    coocs for each doc:
      - each given pair like (termA, termB) will likely appear several times
        => we do GROUP BY (Xindex.ngram_id, Yindex.ngram_id)
      - we count unique appearances of the pair (cooc)
    """
    # - TODO cvalue_id: allow a metric as an additional input filter
    # - TODO n_min, n_max: filter on Ngram.n (aka length of ngram)
    # - TODO weighted: if False, plain cooc to be saved as result
    #                  if True, weighted cooc (experimental)

    # /!\ big combinatorial complexity /!\
    # for 8439 rows in the nodes_ngrams index, of which 1442 have occ > 1:
    #   1,859,408 rows for the simple cooc query
    #      71,134 rows when restricted to ngrams with occ > 1 (weight)

    # 2 x the occurrence index table
    Xindex = aliased(NodeNgram)
    Yindex = aliased(NodeNgram)

    # for debug (1/4)
    # Xngram = aliased(Ngram)
    # Yngram = aliased(Ngram)

    # 1) prepare the definition of the counted forms
    if not groupings_id:
        # no groupings => the counted forms are the ngrams
        Xindex_ngform_id = Xindex.ngram_id
        Yindex_ngform_id = Yindex.ngram_id

    # groupings: see the detailed comment in compute_occs() + TODO factorize
    else:
        # prepare translations
        Xsyno = (session.query(NodeNgramNgram.ngram1_id,
                               NodeNgramNgram.ngram2_id)
                        .filter(NodeNgramNgram.node_id == groupings_id)
                        .subquery())

        # further use as anon tables prevents doing Ysyno = Xsyno
        Ysyno = (session.query(NodeNgramNgram.ngram1_id,
                               NodeNgramNgram.ngram2_id)
                        .filter(NodeNgramNgram.node_id == groupings_id)
                        .subquery())

        # groupings => define the counted form depending on the existence
        # of a synonym
        Xindex_ngform_id = case([(Xsyno.c.ngram1_id != None, Xsyno.c.ngram1_id),
                                 (Xsyno.c.ngram1_id == None, Xindex.ngram_id)
                                 #      condition                value
                                 ])
        Yindex_ngform_id = case([(Ysyno.c.ngram1_id != None, Ysyno.c.ngram1_id),
                                 (Ysyno.c.ngram1_id == None, Yindex.ngram_id)])
        # ---

    # 2) BASE DB QUERY

    # cooccurrences columns definition ----------------
    ucooc = func.count(Xindex_ngform_id).label("ucooc")
    # NB could be X or Y in this line
    #    (we're counting grouped rows and just happen to do it on this column)

    base_query = (session.query(Xindex_ngform_id,
                                Yindex_ngform_id,
                                ucooc
                                # for debug (2/4)
                                # , Xngram.terms.label("w_x")
                                # , Yngram.terms.label("w_y")
                                )
                  .join(Yindex, Xindex.node_id == Yindex.node_id)  # <- by definition of cooc
                  .join(Node, Node.id == Xindex.node_id)           # <- b/c within corpus
                  .filter(Node.parent_id == corpus.id)             # <- b/c within corpus
                  .filter(Node.typename == "DOCUMENT"))            # <- b/c within corpus

    # outerjoin the synonyms if needed
    if groupings_id:
        base_query = (base_query
                      .outerjoin(Xsyno,  # <- synonyms for Xindex.ngrams
                                 Xsyno.c.ngram2_id == Xindex.ngram_id)
                      .outerjoin(Ysyno,  # <- synonyms for Yindex.ngrams
                                 Ysyno.c.ngram2_id == Yindex.ngram_id))

    # 3) counting clause in any case
    coocs_query = (base_query
                   .group_by(Xindex_ngform_id,
                             Yindex_ngform_id  # <- what we're counting
                             # for debug (3/4)
                             # , "w_x", "w_y"
                             )
                   # for debug (4/4)
                   # .join(Xngram, Xngram.id == Xindex_ngform_id)
                   # .join(Yngram, Yngram.id == Yindex_ngform_id)
                   .order_by(ucooc))

    # 4) INPUT FILTERS (reduce N before O(N²))
    if on_list_id:
        # £TODO different lists, or one list for x and all the ngrams for y,
        #       which would allow list expansion to the nearest neighbors
        #       (MacLachlan) (with a rectangular matrix)
        m1 = aliased(NodeNgram)
        m2 = aliased(NodeNgram)

        coocs_query = (coocs_query
                       .join(m1, m1.ngram_id == Xindex_ngform_id)
                       .join(m2, m2.ngram_id == Yindex_ngform_id)
                       .filter(m1.node_id == on_list_id)
                       .filter(m2.node_id == on_list_id))

    if stoplist_id:
        s1 = (session.query(NodeNgram.ngram_id)
                     .filter(NodeNgram.node_id == stoplist_id)
                     .subquery())

        # further use as anon tables prevents doing s2 = s1
        s2 = (session.query(NodeNgram.ngram_id)
                     .filter(NodeNgram.node_id == stoplist_id)
                     .subquery())

        coocs_query = (coocs_query
                       .outerjoin(s1, s1.c.ngram_id == Xindex_ngform_id)
                       .outerjoin(s2, s2.c.ngram_id == Yindex_ngform_id)
                       # equivalent to NOT IN stoplist
                       .filter(s1.c.ngram_id == None)
                       .filter(s2.c.ngram_id == None))

    if diagonal_filter:
        # don't compute an ngram with itself
        coocs_query = coocs_query.filter(Xindex_ngform_id != Yindex_ngform_id)

    if start or end:
        Time = aliased(NodeHyperdata)
        coocs_query = (coocs_query
                       .join(Time, Time.node_id == Xindex.node_id)
                       .filter(Time.key == "publication_date"))

    if start:
        if not isinstance(start, datetime):
            try:
                start = datetime.strptime(start, '%Y-%m-%d')
            except:
                raise TypeError("'start' param expects a datetime object "
                                "or a %%Y-%%m-%%d string")

        # the filtering by start limit
        coocs_query = coocs_query.filter(Time.value_utc >= start)

    if end:
        if not isinstance(end, datetime):
            try:
                end = datetime.strptime(end, '%Y-%m-%d')
            except:
                raise TypeError("'end' param expects a datetime object "
                                "or a %%Y-%%m-%%d string")

        # the filtering by end limit
        coocs_query = coocs_query.filter(Time.value_utc <= end)

    if symmetry_filter:
        # a filter taking the symmetry into account
        # -> halves the work !!
        # -> but retrieval will be more expensive via OR queries like:
        #    WHERE ngram1 = mon_ngram OR ngram2 = mon_ngram
        coocs_query = coocs_query.filter(Xindex_ngform_id < Yindex_ngform_id)

    # 5) OUTPUT FILTERS
    # ------------------
    # threshold
    # £TODO adjust COOC_THRESHOLD a posteriori:
    #       ex: sometimes 2, sometimes 4, depending on sparsity
    print("COOCS: filtering pairs under threshold:", threshold)
    coocs_query = coocs_query.having(ucooc >= threshold)

    # 6) EXECUTE QUERY
    # ----------------
    # => storage in our matrix structure
    matrix = WeightedMatrix(coocs_query.all())

    # fyi
    shape_0 = len({pair[0] for pair in matrix.items})
    shape_1 = len({pair[1] for pair in matrix.items})
    print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))

    if just_pass_result:
        return matrix
    else:
        # 7) SAVE
        # --------
        # saving the parameters of the analysis in the Node JSON
        new_hyperdata = {'corpus': corpus.id, 'threshold': threshold}

        if overwrite_id:
            # overwrite the pre-existing id
            the_cooc = cache.Node[overwrite_id]
            the_cooc.hyperdata = new_hyperdata
            the_cooc.save_hyperdata()
            session.commit()
            the_id = overwrite_id
        else:
            # create the new cooc node
            the_cooc = corpus.add_child(
                typename="COOCCURRENCES",
                name="Coocs (in:%s)" % corpus.name[0:10],
                hyperdata=new_hyperdata,
            )
            session.add(the_cooc)
            session.commit()
            the_id = the_cooc.id

        # ==> save all NodeNgramNgram with a link to the new cooc node id
        matrix.save(the_id)

        return the_id
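# --- usage sketch (hypothetical list/group ids)

# in-memory matrix only (just_pass_result defaults to True)
matrix = compute_coocs(corpus, on_list_id=8001, threshold=2)

# or persisted as a COOCCURRENCES node (returns the new node id)
cooc_node_id = compute_coocs(corpus, just_pass_result=False,
                             groupings_id=8004, on_list_id=8001,
                             start="2010-01-01", end="2012-12-31")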