def nodes(parent=None,
          group_by='typename',
          order_by='typename',
          has_child='check'):
    '''
    List child nodes of `parent` (a Node, a node id, or None for root nodes),
    grouped by default by typename with a count per group (`cnt`) and, when
    `has_child` is not None, a `children` count per row.
    '''
    # aggregate columns when grouping or when child counts are requested
    if group_by or (has_child is not None):
        select = [
            func.min(Node.id).label('id'),
            func.min(Node.name).label('name'),
            func.min(Node.typename).label('typename'),
            func.count(Node.id).label('cnt')
        ]
    else:
        select = [
            Node.id.label('id'),
            Node.name.label('name'),
            Node.typename.label('typename'),
            literal_column('1').label('cnt')
        ]

    if has_child is not None:
        N = aliased(Node)
        select.append(func.count(N.id).label('children'))
    else:
        select.append(literal_column('NULL').label('children'))

    parent_id = getattr(parent, 'id', parent)
    q = session.query(*select).filter_by(parent_id=parent_id) \
               .group_by(getattr(Node, group_by if group_by else 'id'))

    if has_child is not None:
        q = q.outerjoin(N, N.parent_id == Node.id).group_by(N.parent_id)

    return q.order_by(order_by)
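
# A minimal usage sketch for nodes(), assuming the gargantext `session` and
# `Node` globals used above; the parent id 42 is purely illustrative.
def _demo_nodes(parent_id=42):
    # one aggregated row per typename under the given parent, with a row count
    # (`cnt`) and a child count (`children`) per group
    for row in nodes(parent=parent_id, group_by='typename', order_by='typename'):
        print(row.typename, 'rows:', row.cnt, 'children:', row.children)
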
def scan_gargantext(corpus_id, request, fast=False, documents=False):
    query = _search_docs(corpus_id, request, fast)

    if documents:
        return query.all()

    # otherwise return only the number of distinct matching documents
    return query.with_entities(func.count(DocumentNode.id.distinct())).one()[0]
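
# Hedged usage sketch for scan_gargantext(), assuming _search_docs() accepts a
# plain query string; the corpus id 42 and the query text are illustrative.
def _demo_scan(corpus_id=42, query_text='bees'):
    n_hits = scan_gargantext(corpus_id, query_text)                # distinct doc count
    docs = scan_gargantext(corpus_id, query_text, documents=True)  # full document rows
    print(n_hits, len(docs))
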
def doc_freq(corpus_id, node_ids):
    '''
    doc_freq :: Corpus_id -> [Ngram_id] -> [(Ngram_id, Int)]
    Given a corpus and a list of ngram ids, compute for each ngram the number
    of documents of the corpus that contain it.
    '''
    return ( session.query(NodeNgram.ngram_id, func.count(NodeNgram.node_id))
                    .join(Node, NodeNgram.node_id == Node.id)
                    .filter( Node.parent_id == corpus_id
                           , Node.typename== 'DOCUMENT')
                    .filter( NodeNgram.weight > 0 
                           , NodeNgram.ngram_id.in_(node_ids) )
                    .group_by(NodeNgram.ngram_id)
                    .all()
                  )
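
# Minimal sketch of doc_freq(); the corpus and ngram ids are illustrative. The
# result pairs each requested ngram id with the number of documents of the
# corpus in which it occurs.
def _demo_doc_freq(corpus_id=42, ngram_ids=(101, 102)):
    for ngram_id, n_docs in doc_freq(corpus_id, ngram_ids):
        print(ngram_id, '->', n_docs, 'documents')
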
def countCooccurrences(corpus_id=None,
                       cooc_id=None,
                       field1='ngrams',
                       field2='ngrams',
                       start=None,
                       end=None,
                       mapList_id=None,
                       groupList_id=None,
                       distance=None,
                       bridgeness=None,
                       n_min=1,
                       n_max=None,
                       limit=1000,
                       isMonopartite=True,
                       threshold=3,
                       save_on_db=True,
                       reset=True):
    '''
    Compute the cooccurrence matrix and save it, returning (cooc_node_id, WeightedMatrix)
    (or just the WeightedMatrix when save_on_db is False).
    For the moment, lists of parameters are not supported because lists need to
    be merged beforehand.
    corpus           :: Corpus

    mapList_id       :: Int
    groupList_id     :: Int

    start :: TimeStamp -- example: '2010-05-30 02:00:00+02'
    end   :: TimeStamp
    limit :: Int

    '''
    # FIXME remove the lines below after factorization of parameters
    parameters = dict()
    parameters['field1'] = field1
    parameters['field2'] = field2

    # Get corpus as Python object
    corpus = session.query(Node).filter(Node.id == corpus_id).first()

    # Get node of the Graph
    if not cooc_id:

        cooc_id = (session.query(Node.id).filter(
            Node.typename == "COOCCURRENCES", Node.name == "GRAPH EXPLORER",
            Node.parent_id == corpus.id).first())
        if not cooc_id:
            coocNode = corpus.add_child(typename="COOCCURRENCES",
                                        name="GRAPH (in corpus %s)" %
                                        corpus.id)

            session.add(coocNode)
            session.commit()
            cooc_id = coocNode.id
        else:
            cooc_id = int(cooc_id[0])

    # when cooc_id preexisted, but we want to continue  (reset = True)
    #    (to give new contents to this cooc_id)
    elif reset:
        print("GRAPH #%s ... Counting new cooccurrences data." % cooc_id)
        session.query(NodeNgramNgram).filter(
            NodeNgramNgram.node_id == cooc_id).delete()
        session.commit()

    # when cooc_id preexisted and we just want to load it (reset = False)
    else:
        print("GRAPH #%s ... Loading cooccurrences computed already." %
              cooc_id)
        cooc = session.query(NodeNgramNgram.ngram1_id,
                             NodeNgramNgram.ngram2_id,
                             NodeNgramNgram.weight).filter(
                                 NodeNgramNgram.node_id == cooc_id).all()
        return (int(cooc_id), WeightedMatrix(cooc))

    NodeNgramX = aliased(NodeNgram)

    # Simple Cooccurrences
    cooc_score = func.count(NodeNgramX.node_id).label('cooc_score')

    # A kind of Euclidean distance cooccurrences
    #cooc_score = func.sqrt(func.sum(NodeNgramX.weight * NodeNgramY.weight)).label('cooc_score')

    if isMonopartite:
        NodeNgramY = aliased(NodeNgram)

        cooc_query = (session.query(
            NodeNgramX.ngram_id, NodeNgramY.ngram_id,
            cooc_score).join(Node, Node.id == NodeNgramX.node_id).join(
                NodeNgramY, NodeNgramY.node_id == Node.id).filter(
                    Node.parent_id == corpus.id, Node.typename == "DOCUMENT"))
    else:
        NodeNgramY = aliased(NodeNgram)

        cooc_query = (session.query(
            NodeHyperdataNgram.ngram_id, NodeNgramY.ngram_id,
            cooc_score).join(Node, Node.id == NodeHyperdataNgram.node_id).join(
                NodeNgramY, NodeNgramY.node_id == Node.id).join(
                    Hyperdata,
                    Hyperdata.id == NodeHyperdataNgram.hyperdata_id).filter(
                        Node.parent_id == corpus.id,
                        Node.typename == "DOCUMENT").filter(
                            Hyperdata.name == field1))

    # Size of the ngrams between n_min and n_max
    if n_min is not None or n_max is not None:
        if isMonopartite:
            NgramX = aliased(Ngram)
            cooc_query = cooc_query.join(NgramX,
                                         NgramX.id == NodeNgramX.ngram_id)

        NgramY = aliased(Ngram)
        cooc_query = cooc_query.join(NgramY, NgramY.id == NodeNgramY.ngram_id)

    if n_min is not None:
        cooc_query = cooc_query.filter(NgramY.n >= n_min)
        if isMonopartite:
            cooc_query = cooc_query.filter(NgramX.n >= n_min)

    if n_max is not None:
        cooc_query = cooc_query.filter(NgramY.n <= n_max)
        if isMonopartite:
            cooc_query = cooc_query.filter(NgramX.n <= n_max)

    # Cooc between the dates start and end
    if start is not None:
        #date_start = datetime.datetime.strptime ("2001-2-3 10:11:12", "%Y-%m-%d %H:%M:%S")
        # TODO: a more precise date format here (currently the finest grain is the day).
        date_start = datetime.strptime(str(start), "%Y-%m-%d")
        date_start_utc = date_start.strftime("%Y-%m-%d %H:%M:%S")

        Start = aliased(NodeHyperdata)
        cooc_query = (cooc_query.join(Start, Start.node_id == Node.id).filter(
            Start.key == 'publication_date').filter(
                Start.value_utc >= date_start_utc))

        parameters['start'] = date_start_utc

    if end is not None:
        # TODO: a more precise date format here (currently the finest grain is the day).
        date_end = datetime.strptime(str(end), "%Y-%m-%d")
        date_end_utc = date_end.strftime("%Y-%m-%d %H:%M:%S")

        End = aliased(NodeHyperdata)

        cooc_query = (cooc_query.join(End, End.node_id == Node.id).filter(
            End.key == 'publication_date').filter(
                End.value_utc <= date_end_utc))

        parameters['end'] = date_end_utc

    if isMonopartite:
        # Cooc is symmetric: take only the main cooccurrences and cut at the limit
        cooc_query = cooc_query.filter(
            NodeNgramX.ngram_id < NodeNgramY.ngram_id)

    cooc_query = cooc_query.having(cooc_score >= threshold)

    if isMonopartite:
        cooc_query = cooc_query.group_by(NodeNgramX.ngram_id,
                                         NodeNgramY.ngram_id)
    else:
        cooc_query = cooc_query.group_by(NodeHyperdataNgram.ngram_id,
                                         NodeNgramY.ngram_id)

    # Order according to some score
    # If ordering is really needed, use an ordered index (faster)
    #cooc_query = cooc_query.order_by(desc('cooc_score'))

    matrix = WeightedMatrix(cooc_query)

    print("GRAPH #%s Filtering the matrix with Map and Group Lists." % cooc_id)
    cooc = filterMatrix(matrix, mapList_id, groupList_id)

    parameters['MapList_id'] = str(mapList_id)
    parameters['GroupList_id'] = str(groupList_id)

    # TODO factorize savings on db
    if save_on_db:
        # Saving the cooccurrences
        cooc.save(cooc_id)
        print("GRAPH #%s ... Node Cooccurrence Matrix saved" % cooc_id)

        # Saving the parameters
        print("GRAPH #%s ... Parameters saved in Node." % cooc_id)
        coocNode = session.query(Node).filter(Node.id == cooc_id).first()

        coocNode.hyperdata["parameters"] = dict()
        coocNode.hyperdata["parameters"] = parameters
        coocNode.save_hyperdata()
        session.commit()

        #data = cooc2graph(coocNode.id, cooc, distance=distance, bridgeness=bridgeness)
    else:
        return cooc

    return (coocNode.id, cooc)
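
# Hedged usage sketch for countCooccurrences(): the corpus, map list and group
# list ids are illustrative and must point to existing nodes. With the default
# save_on_db=True the matrix is persisted under a COOCCURRENCES node and the
# pair (cooc_node_id, WeightedMatrix) is returned.
def _demo_count_cooccurrences(corpus_id=42, maplist_id=43, grouplist_id=44):
    cooc_node_id, matrix = countCooccurrences(corpus_id=corpus_id,
                                              mapList_id=maplist_id,
                                              groupList_id=grouplist_id,
                                              start='2010-01-01',
                                              end='2015-12-31',
                                              threshold=3)
    return cooc_node_id, matrix
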
def compute_ti_ranking(corpus,
                       groupings_id=None,
                       count_scope="local",
                       termset_scope="local",
                       overwrite_id=None):
    """
    Calculates tfidf ranking within given scope
                ----------
                   |
            via weighting of
            cumulated tfidf  --------- Sum{j}(tf_ij) * ln(N / |{docs d : ng_i ∈ d}|)
             per ngram ng_i
         (or per mainform ng_i' if groups)
           across some docs d_j

    Parameters:
      - the corpus itself (or corpus_id)
      - groupings_id: optional id of a GROUPLIST node for these ngrams
                        IF absent the ti weights are the sums for each ngram
                        IF present they're the sums for each ngram's mainform

      - count_scope: {"local" or "global"}
         - local  <=> frequencies counted in the current corpus
         - global <=> frequencies counted in all corpora of this type

        when the count_scope is global, there is another parameter:
          - termset_scope: {"local" or "global"}
             - local <=> output list of terms limited to the current corpus
               (SELECT DISTINCT ngram_id FROM nodes_ngrams WHERE node_id IN <docs>)
             - global <=> output list of terms found in global doc scope
                                                    !!!! (many more terms)

      - overwrite_id: optional id of a pre-existing XXXX node for this corpus
                   (the Node and its previous Node NodeNgram rows will be replaced)
    """
    # validate string params
    if count_scope not in ["local", "global"]:
        raise ValueError(
            "compute_ti_ranking: count_scope param allowed values: 'local', 'global'"
        )
    if termset_scope not in ["local", "global"]:
        raise ValueError(
            "compute_ti_ranking: termset_scope param allowed values: 'local', 'global'"
        )
    if count_scope == "local" and termset_scope == "global":
        raise ValueError(
            "compute_ti_ranking: the termset_scope param can be 'global' iff count_scope param is 'global' too."
        )

    # get corpus
    if type(corpus) == int:
        corpus_id = corpus
        corpus = cache.Node[corpus_id]
    elif type(corpus) == str and match(r'\d+$', corpus):
        corpus_id = int(corpus)
        corpus = cache.Node[corpus_id]
    else:
        # assuming Node class
        corpus_id = corpus.id

    # prepare sqla mainform vs ngram selector
    ngform_i = None

    if not groupings_id:
        ngform_i = NodeNgram.ngram_id

    else:
        # prepare translations
        syno = (session.query(
            NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id).filter(
                NodeNgramNgram.node_id == groupings_id).subquery())
        # see the detailed comment in compute_occs() + TODO factorize

        ngform_i = case([(syno.c.ngram1_id != None, syno.c.ngram1_id),
                         (syno.c.ngram1_id == None, NodeNgram.ngram_id)
                         #     condition               value
                         ])

    # MAIN QUERY SKELETON
    tf_nd_query = (
        session.query(
            # NodeNgram.ngram_id
            # or similar if grouping ngrams under their mainform
            ngform_i.label("counted_ngform"),

            # the tfidf elements
            # ------------------
            func.sum(NodeNgram.weight),  # tf: same as occurrences
            # -----------------------
            func.count(NodeNgram.node_id)  # nd: n docs with term
            # --------------------
        ).group_by("counted_ngform")

        # count_scope to specify in which doc nodes to count
        # -----------
        # .join(countdocs_subquery,
        #       countdocs_subquery.c.id == NodeNgram.node_id)

        # optional termset_scope: if we'll restrict the ngrams
        #          -------------
        # .join(termset_subquery,
        #       termset_subquery.c.uniq_ngid == NodeNgram.ngram_id)

        # optional translations to bring the subform's replacement
        #          ------------
        # .outerjoin(syno,
        #           syno.c.ngram2_id == NodeNgram.ngram_id)
    )

    # TUNING THE QUERY

    if groupings_id:
        tf_nd_query = tf_nd_query.outerjoin(
            syno, syno.c.ngram2_id == NodeNgram.ngram_id)

    # local <=> within this corpus
    if count_scope == "local":
        # All docs of this corpus
        countdocs_subquery = (session.query(
            Node.id).filter(Node.typename == "DOCUMENT").filter(
                Node.parent_id == corpus_id).subquery())

        # no need to independently restrict the ngrams
        tf_nd_query = tf_nd_query.join(
            countdocs_subquery, countdocs_subquery.c.id == NodeNgram.node_id)
        # ---

    # global <=> within all corpora of this source
    elif count_scope == "global":
        this_source_type = corpus.resources()[0]['type']

        CorpusNode = aliased(Node)

        # All docs **in all corpora of the same source**
        countdocs_subquery = (
            session.query(Node.id).filter(Node.typename == "DOCUMENT")

            # join on parent_id with selected corpora nodes
            .join(CorpusNode, CorpusNode.id == Node.parent_id).filter(
                CorpusNode.typename == "CORPUS")
            # TODO index corpus_sourcetype in DB
            .filter(CorpusNode.hyperdata['resources'][0]['type'].astext == str(
                this_source_type)).subquery())

        if termset_scope == "global":
            # both scopes are the same: no need to independently restrict the ngrams
            tf_nd_query = tf_nd_query.join(
                countdocs_subquery,
                countdocs_subquery.c.id == NodeNgram.node_id)
            # ---

        elif termset_scope == "local":

            # All unique terms...
            termset_subquery = (
                session.query(distinct(NodeNgram.ngram_id).label("uniq_ngid"))
                # ... in the original corpus
                .join(Node).filter(Node.typename == "DOCUMENT").filter(
                    Node.parent_id == corpus_id).subquery())

            # only case of independent restrictions on docs and terms
            tf_nd_query = (tf_nd_query.join(
                countdocs_subquery,
                countdocs_subquery.c.id == NodeNgram.node_id).join(
                    termset_subquery,
                    termset_subquery.c.uniq_ngid == NodeNgram.ngram_id))
            # ---

    # M: total number of documents in the counting scope
    total_docs = session.query(countdocs_subquery).count()
    log_tot_docs = log(total_docs)

    # result
    tf_nd = tf_nd_query.all()

    # -------------- "sommatoire" sur mot i ----------------
    tfidfsum = {}
    for (ngram_i, tf_i, nd_i) in tf_nd:
        # tfidfsum[ngram_i] = tf_i * log(total_docs/nd_i)
        tfidfsum[ngram_i] = tf_i * (log_tot_docs - log(nd_i))
    # ------------------------------------------------------

    # N, for information
    total_ngramforms = len(tfidfsum)

    if overwrite_id:
        the_id = overwrite_id
        session.query(NodeNodeNgram).filter(
            NodeNodeNgram.node1_id == the_id).delete()
        session.commit()
    else:
        # create the new TFIDF-XXXX node to get an id
        tir_nd = corpus.add_child()
        if count_scope == "local":
            tir_nd.typename = "TIRANK-CORPUS"
            tir_nd.name = "ti rank (%i ngforms in corpus:%s)" % (
                total_ngramforms, corpus_id)
        elif count_scope == "global":
            tir_nd.typename = "TIRANK-GLOBAL"
            tir_nd.name = "ti rank (%i ngforms %s in corpora of sourcetype:%s)" % (
                total_ngramforms, ("from corpus %i" % corpus_id) if
                (termset_scope == "local") else "", this_source_type)

        session.add(tir_nd)
        session.commit()
        the_id = tir_nd.id

    # TODO 1 discuss use and find new typename
    # TODO 2 release these 2 typenames TFIDF-CORPUS and TFIDF-GLOBAL
    # TODO 3 recreate them elsewhere in their sims (WeightedIndex) version
    # TODO 4 requalify this here as a NodeNgram
    # then TODO 5 use WeightedList.save() !

    # reflect that in NodeNodeNgrams
    bulk_insert(NodeNodeNgram, ('node1_id', 'node2_id', 'ngram_id', 'score'),
                ((the_id, corpus_id, ng, tfidfsum[ng]) for ng in tfidfsum))

    return the_id
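
# Tiny numeric check of the weighting used in compute_ti_ranking() (illustrative
# numbers only): with total_docs = 100, an ngram with tf_i = 7 occurrences spread
# over nd_i = 5 documents gets tf_i * (log(total_docs) - log(nd_i))
# = 7 * log(100 / 5) ≈ 20.97.
def _demo_ti_weight(tf_i=7, nd_i=5, total_docs=100):
    from math import log
    return tf_i * (log(total_docs) - log(nd_i))
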
def compute_coocs(corpus,
                  overwrite_id=None,
                  just_pass_result=True,  # just return the WeightedMatrix
                                          #    (don't write to DB)
                  threshold=DEFAULT_COOC_THRESHOLD,
                  groupings_id=None,
                  on_list_id=None,
                  stoplist_id=None,
                  start=None,
                  end=None,
                  symmetry_filter=False,
                  diagonal_filter=True):
    """
    Count how often some extracted terms appear
    together in a small context (document)
    throughout a larger context (corpus).

             [NodeNgram]                       [NodeNgramNgram]

    node_id | ngram_id | weight       ngram1_id | ngram2_id | score |
    --------+----------+--------      ----------+-----------+-------+
     MyDocA |      487 |      1   =>        487 |       294 |     2 |
     MyDocA |      294 |      3
     MyDocB |      487 |      1
     MyDocB |      294 |      4

    Fill that info in DB:
      - a *new* COOCCURRENCES node
      - and all corresponding NodeNgramNgram rows

    worst-case complexity ~ O(N²/2) with N = number of ngrams

    If a mainlist is provided, we filter doc ngrams to those also in the list.

    Parameters:
      - the corpus node
      - overwrite_id: id of a pre-existing COOCCURRENCES node for this corpus
                     (all hyperdata and previous NodeNgramNgram rows will be replaced)
      - threshold: on output cooc count (previously called hapax)
      - groupings_id: optional synonym relations to add all subform counts
                      with their mainform's counts
      - on_list_id: mainlist or maplist type, to constrain the input ngrams
      - stoplist_id: stoplist for filtering input ngrams
                     (normally unnecessary if a mainlist is already provided)
      - start, end: provide one or both temporal limits to filter on doc date
                    NB the expected type of parameter value is datetime.datetime
                        (string is also possible but format must follow
                          this convention: "2001-01-01" aka "%Y-%m-%d")
      - symmetry_filter: prevent calculating where ngram1_id  > ngram2_id
      - diagonal_filter: prevent calculating where ngram1_id == ngram2_id


     (deprecated parameters)
      - field1,2: allowed to count other things than ngrams (eg tags) but no use case at present
      - isMonopartite: ?? used a nodes_hyperdata_ngrams table ???

    basic idea for one doc
    ======================
    each pair of ngrams sharing same doc (node_id)
        SELECT idxa.ngram_id, idxb.ngram_id
        FROM nodes_ngrams AS idxa
        ---------------------------------
        JOIN nodes_ngrams AS idxb
        ON idxa.node_id = idxb.node_id      <== that's cooc
        ---------------------------------
        AND idxa.ngram_id <> idxb.ngram_id   (diagonal_filter)
        AND idxa.node_id = MY_DOC ;

    on entire corpus
    =================
    coocs for each doc :
      - each given pair like (termA, termB) will likely appear several times
        => we do GROUP BY (Xindex.ngram_id, Yindex.ngram_id)
      - we count unique appearances of the pair (cooc)


    """

    #   - TODO cvalue_id: allow a metric as additional  input filter
    #   - TODO n_min, n_max : filter on Ngram.n (aka length of ngram)
    #   - TODO weighted: if False normal cooc to be saved as result
    #                    if True  weighted cooc (experimental)

    # /!\ big combinatorial complexity /!\
    # for 8,439 rows in the nodes_ngrams index, of which 1,442 with occ > 1:
    #  1,859,408 rows for the simple cooc query
    #     71,134 rows when restricting to ngrams with occ > 1 (weight)

    # 2 x the occurrence index table
    Xindex = aliased(NodeNgram)
    Yindex = aliased(NodeNgram)

    # for debug (1/4)
    # Xngram = aliased(Ngram)
    # Yngram = aliased(Ngram)

    # 1) prepare definition of counted forms
    if not groupings_id:

        # no groupings => the counted forms are the ngrams
        Xindex_ngform_id = Xindex.ngram_id
        Yindex_ngform_id = Yindex.ngram_id

    # groupings: see the detailed comment in compute_occs() + TODO factorize
    else:
        # prepare translations
        Xsyno = (session.query(
            NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id).filter(
                NodeNgramNgram.node_id == groupings_id).subquery())

        # later use as anonymous tables prevents simply doing Ysyno = Xsyno
        Ysyno = (session.query(
            NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id).filter(
                NodeNgramNgram.node_id == groupings_id).subquery())

        # groupings => define the counted form depending on the existence of a synonym
        Xindex_ngform_id = case([
            (Xsyno.c.ngram1_id != None, Xsyno.c.ngram1_id),
            (Xsyno.c.ngram1_id == None, Xindex.ngram_id)
            #     condition               value
        ])

        Yindex_ngform_id = case([(Ysyno.c.ngram1_id != None,
                                  Ysyno.c.ngram1_id),
                                 (Ysyno.c.ngram1_id == None, Yindex.ngram_id)])
        # ---

    # 2) BASE DB QUERY

    # cooccurrences columns definition ----------------
    ucooc = func.count(Xindex_ngform_id).label("ucooc")
    # NB could be X or Y in this line
    #    (we're counting grouped rows and just happen to do it on this column)
    base_query = (
        session.query(
            Xindex_ngform_id, Yindex_ngform_id, ucooc

            # for debug (2/4)
            # , Xngram.terms.label("w_x")
            # , Yngram.terms.label("w_y")
        ).join(Yindex,
               Xindex.node_id == Yindex.node_id)  # <- by definition of cooc
        .join(Node, Node.id == Xindex.node_id)  # <- b/c within corpus
        .filter(Node.parent_id == corpus.id)  # <- b/c within corpus
        .filter(Node.typename == "DOCUMENT")  # <- b/c within corpus
    )

    # outerjoin the synonyms if needed
    if groupings_id:
        base_query = (
            base_query.outerjoin(
                Xsyno,  # <- synonyms for Xindex.ngrams
                Xsyno.c.ngram2_id == Xindex.ngram_id).outerjoin(
                    Ysyno,  # <- synonyms for Yindex.ngrams
                    Ysyno.c.ngram2_id == Yindex.ngram_id))

    # 3) counting clause in any case
    coocs_query = (
        base_query.group_by(
            Xindex_ngform_id,
            Yindex_ngform_id  # <- what we're counting
            # for debug (3/4)
            # ,"w_x", "w_y"
        )

        # for debug (4/4)
        # .join(Xngram, Xngram.id == Xindex_ngform_id)
        # .join(Yngram, Yngram.id == Yindex_ngform_id)
        .order_by(ucooc))

    # 4) INPUT FILTERS (reduce N before O(N²))
    if on_list_id:
        # £TODO different lists, or one list for x and all ngrams for y,
        #       since that would allow expanding the list to its nearest neighbours (MacLachlan)
        #       (with a rectangular matrix)

        m1 = aliased(NodeNgram)
        m2 = aliased(NodeNgram)

        coocs_query = (coocs_query.join(
            m1, m1.ngram_id == Xindex_ngform_id).join(
                m2, m2.ngram_id == Yindex_ngform_id).filter(
                    m1.node_id == on_list_id).filter(m2.node_id == on_list_id))

    if stoplist_id:
        s1 = (session.query(NodeNgram.ngram_id).filter(
            NodeNgram.node_id == stoplist_id).subquery())

        # later use as anonymous tables prevents simply doing s2 = s1
        s2 = (session.query(NodeNgram.ngram_id).filter(
            NodeNgram.node_id == stoplist_id).subquery())

        coocs_query = (
            coocs_query.outerjoin(s1,
                                  s1.c.ngram_id == Xindex_ngform_id).outerjoin(
                                      s2, s2.c.ngram_id == Yindex_ngform_id)

            # equivalent to NOT IN stoplist
            .filter(s1.c.ngram_id == None).filter(s2.c.ngram_id == None))

    if diagonal_filter:
        # don't compute ngram with itself
        coocs_query = coocs_query.filter(Xindex_ngform_id != Yindex_ngform_id)

    if start or end:
        Time = aliased(NodeHyperdata)

        coocs_query = (coocs_query.join(Time,
                                        Time.node_id == Xindex.node_id).filter(
                                            Time.key == "publication_date"))

        if start:
            if not isinstance(start, datetime):
                try:
                    start = datetime.strptime(start, '%Y-%m-%d')
                except (TypeError, ValueError):
                    raise TypeError(
                        "'start' param expects a datetime object or a '%Y-%m-%d' string"
                    )

            # the filtering by start limit
            coocs_query = coocs_query.filter(Time.value_utc >= start)

        if end:
            if not isinstance(end, datetime):
                try:
                    end = datetime.strptime(end, '%Y-%m-%d')
                except (TypeError, ValueError):
                    raise TypeError(
                        "'end' param expects a datetime object or a '%Y-%m-%d' string"
                    )

            # the filtering by end limit
            coocs_query = coocs_query.filter(Time.value_utc <= end)

    if symmetry_filter:
        # one filter taking the symmetry into account
        #  -> halves the work!!
        #  -> but retrieval will be more expensive, via OR queries like:
        #       WHERE ngram1 = my_ngram OR ngram2 = my_ngram
        coocs_query = coocs_query.filter(Xindex_ngform_id < Yindex_ngform_id)

    # 5) OUTPUT FILTERS
    # ------------------
    # threshold
    # £TODO adjust COOC_THRESHOLD a posteriori:
    # ex: sometimes 2 sometimes 4 depending on sparsity
    print("COOCS: filtering pairs under threshold:", threshold)
    coocs_query = coocs_query.having(ucooc >= threshold)

    # 6) EXECUTE QUERY
    # ----------------
    #  => storage in our matrix structure
    matrix = WeightedMatrix(coocs_query.all())
    #                      -------------------

    # fyi
    shape_0 = len({pair[0] for pair in matrix.items})
    shape_1 = len({pair[1] for pair in matrix.items})
    print("COOCS: NEW matrix shape [%ix%i]" % (shape_0, shape_1))

    if just_pass_result:
        return matrix
    else:
        # 7) SAVE
        # --------
        # saving the parameters of the analysis in the Node JSON
        new_hyperdata = {'corpus': corpus.id, 'threshold': threshold}

        if overwrite_id:
            # overwrite pre-existing id
            the_cooc = cache.Node[overwrite_id]
            the_cooc.hyperdata = new_hyperdata
            the_cooc.save_hyperdata()
            session.commit()
            the_id = overwrite_id
        else:
            # create the new cooc node
            the_cooc = corpus.add_child(
                typename="COOCCURRENCES",
                name="Coocs (in:%s)" % corpus.name[0:10],
                hyperdata=new_hyperdata,
            )
            session.add(the_cooc)
            session.commit()

            the_id = the_cooc.id

        # ==> save all NodeNgramNgram with link to new cooc node id
        matrix.save(the_id)

        return the_id
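
# Hedged usage sketch for compute_coocs(): with the default just_pass_result=True
# the WeightedMatrix is returned without writing to the DB; the list ids and the
# date bounds are purely illustrative.
def _demo_compute_coocs(corpus, maplist_id=43, stoplist_id=45):
    return compute_coocs(corpus,
                         threshold=2,
                         on_list_id=maplist_id,
                         stoplist_id=stoplist_id,
                         start='2010-01-01',
                         end='2015-12-31',
                         symmetry_filter=True)
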
    def post(self, request, project_id):

        # example only

        input = request.data or {
            'x': {
                'with_empty': True,
                'resolution': 'decade',
                'value': 'publication_date',
            },
            'y': {
                # 'divided_by': 'total_ngrams_count',
                # 'divided_by': 'total_documents_count',
            },
            'filter': {
                # 'ngrams': ['bees', 'bee', 'honeybee', 'honeybees', 'honey bee', 'honey bees'],
                # 'ngrams': ['insecticide', 'pesticide'],
                # 'corpora': [52633],
                # 'date': {'min': '1995-12-31'}
            },
            # 'format': 'csv',
        }
        print(input)
        # input validation
        input = validate(
            input,
            {
                'type': dict,
                'default': {},
                'items': {
                    'x': {
                        'type': dict,
                        'default': {},
                        'items': {
                            # which hyperdata to choose for the date
                            'value': {
                                'type': str,
                                'default': 'publication_date',
                                'range': {
                                    'publication_date',
                                }
                            },
                            # time resolution
                            'resolution': {
                                'type': str,
                                'range': self._resolutions.keys(),
                                'default': 'month'
                            },
                            # should we add zeroes for empty values?
                            'with_empty': {
                                'type': bool,
                                'default': False
                            },
                        }
                    },
                    'y': {
                        'type': dict,
                        'default': {},
                        'items': {
                            # measured value
                            'value': {
                                'type': str,
                                'default': 'ngrams_count',
                                'range': {
                                    'ngrams_count', 'documents_count',
                                    'ngrams_tfidf'
                                }
                            },
                            # value by which we should normalize
                            'divided_by': {
                                'type': str,
                                'range': {
                                    'total_documents_count', 'documents_count',
                                    'total_ngrams_count'
                                }
                            },
                        }
                    },
                    # filtering
                    'filter': {
                        'type': dict,
                        'default': {},
                        'items': {
                            # filter by metadata
                            'hyperdata': {
                                'type': list,
                                'default': [],
                                'items': {
                                    'type': dict,
                                    'items': {
                                        'key': {
                                            'type': str,
                                            'range': self._operators.keys()
                                        },
                                        'operator': {
                                            'type': str
                                        },
                                        'value': {
                                            'type': str
                                        },
                                    }
                                }
                            },
                            # filter by date
                            'date': {
                                'type': dict,
                                'items': {
                                    'min': {
                                        'type': datetime.datetime
                                    },
                                    'max': {
                                        'type': datetime.datetime
                                    },
                                },
                                'default': {}
                            },
                            # filter by corpora
                            'corpora': {
                                'type': list,
                                'default': [],
                                'items': {
                                    'type': int
                                }
                            },
                            # filter by ngrams
                            'ngrams': {
                                'type': list,
                                'default': [],
                                'items': {
                                    'type': str
                                }
                            },
                        }
                    },
                    # output format
                    'format': {
                        'type': str,
                        'default': 'json',
                        'range': {'json', 'csv'}
                    },
                }
            })
        # build query: prepare columns
        X = aliased(NodeHyperdata)
        column_x = func.date_trunc(input['x']['resolution'], X.value_utc)
        column_y = {
            'documents_count': func.count(Node.id.distinct()),
            'ngrams_count': func.sum(NodeNgram.weight),
            # 'ngrams_tfidf':     func.sum(NodeNodeNgram.weight),
        }[input['y']['value']]
        # build query: base
        print(input)
        query_base = (
            session.query(column_x).select_from(Node).join(
                NodeNgram, NodeNgram.node_id == Node.id).join(
                    X, X.node_id == NodeNgram.node_id)
            #.filter(X.key == input['x']['value'])
            .group_by(column_x).order_by(column_x))
        # build query: base, filter by corpora or project
        if 'corpora' in input['filter'] and input['filter']['corpora']:
            query_base = (query_base.filter(
                Node.parent_id.in_(input['filter']['corpora'])))
        else:
            ParentNode = aliased(Node)
            query_base = (query_base.join(
                ParentNode, ParentNode.id == Node.parent_id).filter(
                    ParentNode.parent_id == project_id))
        # build query: base, filter by date
        if 'date' in input['filter']:
            if 'min' in input['filter']['date']:
                query_base = query_base.filter(
                    X.value >= input['filter']['date']['min'])
            if 'max' in input['filter']['date']:
                query_base = query_base.filter(
                    X.value <= input['filter']['date']['max'])
        # build query: filter by ngrams
        query_result = query_base.add_columns(column_y)
        if 'ngrams' in input['filter'] and input['filter']['ngrams']:
            query_result = (query_result.join(
                Ngram, Ngram.id == NodeNgram.ngram_id).filter(
                    Ngram.terms.in_(input['filter']['ngrams'])))
        # build query: filter by metadata
        if 'hyperdata' in input['filter']:
            for h, hyperdata in enumerate(input['filter']['hyperdata']):
                print(h, hyperdata)
                # get hyperdata in database
                #if hyperdata_model is None:
                #    continue
                #hyperdata_id, hyperdata_type = hyperdata_model
                # create alias and query it
                operator = self._operators[hyperdata['operator']]
                type_string = type2string(
                    INDEXED_HYPERDATA[hyperdata['key']]['type'])
                value = self._converters[type_string](hyperdata['value'])
                query_result = (query_result.join(
                    NodeHyperdata,
                    NodeHyperdata.node_id == NodeNgram.node_id).filter(
                        NodeHyperdata.key == hyperdata['key']).filter(
                            operator(NodeHyperdata.value, value)))
        # build result: prepare data
        date_value_list = query_result.all()
        #print(date_value_list)

        if date_value_list:
            date_min = date_value_list[0][0].replace(tzinfo=None)
            date_max = date_value_list[-2][0].replace(tzinfo=None)
        # build result: prepare interval
        result = collections.OrderedDict()
        if input['x']['with_empty'] and date_value_list:
            compute_next_date = self._resolutions[input['x']['resolution']]
            date = date_min
            while date <= date_max:
                result[date] = 0.0
                date = compute_next_date(date)
        # build result: integrate
        for date, value in date_value_list[0:-1]:
            result[date.replace(tzinfo=None)] = value
        # build result: normalize
        query_normalize = None
        if date_value_list and 'divided_by' in input['y'] and input['y'][
                'divided_by']:
            if input['y']['divided_by'] == 'total_documents_count':
                query_normalize = query_base.add_column(
                    func.count(Node.id.distinct()))
            elif input['y']['divided_by'] == 'total_ngrams_count':
                query_normalize = query_base.add_column(
                    func.sum(NodeNgram.weight))
        if query_normalize is not None:
            for date, value in query_normalize[0:-1]:
                date = date.replace(tzinfo=None)
                if date in result:
                    result[date] /= value
        # return result with proper formatting
        if input['format'] == 'json':
            return JsonHttpResponse(
                {
                    'query': input,
                    'result': sorted(result.items()),
                }, 201)
        elif input['format'] == 'csv':
            return CsvHttpResponse(sorted(result.items()), ('date', 'value'),
                                   201)