def initial(self, request):
    """
    Before dispatching to put(), delete()...

    1) Checks current user authentication to prevent remote DB manipulation
    2) Prepares self.params plus self.base_list / self.change_list from params

    Raises:
        Http404             -- if the user is not authenticated
        ValidationException -- if no usable ngram ids are found in either the
                               url params or the request payload
    """
    if not request.user.is_authenticated():
        raise Http404()
        # can't use return in initial() (although 401 maybe better than 404)
        # can't use @requires_auth because of positional 'self' within class

    # get validated params
    self.params = get_parameters(request)

    (self.base_list, self.change_list) = ListChange._validate(self.params)

    if not len(self.change_list.items):
        # change_list can be in payload too
        payload_ngrams = request.data['ngrams']
        try:
            change_ngram_ids = [int(n) for n in payload_ngrams.split(',')]
        except ValueError:
            # fix: a non-integer (or empty) payload previously raised an
            # uncaught ValueError (500) instead of the ValidationException
            change_ngram_ids = []
        if not len(change_ngram_ids):
            raise ValidationException(
                'The "ngrams" parameter requires one or more ngram_ids separated by comma'
                )
        else:
            self.change_list = UnweightedList(change_ngram_ids)
def get(self, request):
    """
    Mini glance over a maplist: term infos + occurrence scores for each
    map ngram of a corpus.

    GET params (two alternatives):
        corpus=<id>                    -- the MAPLIST and OCCURRENCES nodes
                                          are then looked up as its children
                                          (an explicit scoring=<id> is optional)
        maplist=<id>&scoring=<id>      -- explicit pointers to both nodes

    Returns JSON: {ngraminfos, listmembers, links, nodeids}.
    Raises ValidationException when neither form of params is given.
    """
    parameters = get_parameters(request)
    maplist_id = None
    scores_id = None

    if "corpus" in parameters:
        corpus_id = parameters['corpus']
        corpus = cache.Node[corpus_id]
        maplist_id = corpus.children('MAPLIST').first().id
        # with a corpus_id, the explicit scoring pointer is optional
        if "scoring" in parameters:
            scores_id = parameters['scoring']
        else:
            scores_id = corpus.children('OCCURRENCES').first().id
    elif "maplist" in parameters and "scoring" in parameters:
        # fix: previously read parameters['mainlist'] (a key never checked
        # for) right after testing for 'maplist' => guaranteed KeyError
        maplist_id = int(parameters['maplist'])
        scores_id = int(parameters['scoring'])
    else:
        raise ValidationException("A 'corpus' id or 'maplist' id is required, and a 'scoring' for occurences counts")

    ngraminfo = {}                  # ngram details sorted per ngram id
    listmembers = {'maplist': []}   # ngram ids sorted per list name

    # infos for all ngrams from maplist
    map_ngrams = query_list(maplist_id, details=True,
                            scoring_metric_id=scores_id).all()
    # ex: [(8805, 'mean age', 4.0),
    #      (1632, 'activity', 4.0),
    #      (8423, 'present', 2.0),
    #      (2928, 'objective', 2.0)]

    # shortcut to useful function during loop
    add_to_members = listmembers['maplist'].append

    for ng in map_ngrams:
        ng_id = ng[0]
        ngraminfo[ng_id] = ng[1:]

        # maplist ngrams will already be <=> ngraminfos
        # but the client side expects a membership lookup
        # as when there are multiple lists or some groupings
        add_to_members(ng_id)

    return JsonHttpResponse({
        'ngraminfos': ngraminfo,
        'listmembers': listmembers,
        'links': {},     # no grouping links sent during glance (for speed)
        'nodeids': {
            'mainlist': None,
            'maplist': maplist_id,
            'stoplist': None,
            'groups': None,
            'scores': None,
        }
    })
def delete(self, request):
    """
    Within a groupnode, deletes some group elements from some groups

    Data format just like in POST, everything in the url
    """
    url_params = get_parameters(request)

    # the unique 'node' param identifies the grouping node
    grouping_node_id = url_params.pop('node')

    # every remaining param encodes a (mainform, subform) link to remove
    couples_to_remove = self.links_to_couples(url_params)

    # selectively remove those group couples
    # using IN is correct in this case: list of ids is short and external
    # see stackoverflow.com/questions/444475/
    matching_rows = (
        session.query(NodeNgramNgram)
               .filter(NodeNgramNgram.node_id == grouping_node_id)
               .filter(tuple_(NodeNgramNgram.ngram1_id,
                              NodeNgramNgram.ngram2_id).in_(couples_to_remove))
    )
    removed_count = matching_rows.delete(synchronize_session=False)
    session.commit()

    return JsonHttpResponse({'count_removed': removed_count}, 200)
def patch(self, request):
    """
    A copy of POST (merging list) but with the source == just an internal corpus_id

    params in request.GET:
        onto_corpus: the corpus whose lists are getting patched
        from_corpus: the corpus from which we take the source lists to merge in
                     (fix: docstring previously named this param "from")
        todo:        an array of the list types ("map", "main", "stop") to merge in

    Returns JSON with the merge log (200) or the error message (400);
    401 when the user is anonymous or does not own the target corpus.
    """
    if not request.user.is_authenticated():
        res = HttpResponse("Unauthorized")
        res.status_code = 401
        return res

    params = get_parameters(request)

    # the corpus with the target lists to be patched
    corpus_id = int(params.pop("onto_corpus"))
    corpus_node = cache.Node[corpus_id]

    # only the owner of the target corpus may patch its lists
    if request.user.id != corpus_node.user_id:
        res = HttpResponse("Unauthorized")
        res.status_code = 401
        return res

    list_types = {'map': 'MAPLIST', 'main': 'MAINLIST', 'stop': 'STOPLIST'}

    # internal DB retrieve source_lists
    source_corpus_id = int(params.pop("from_corpus"))
    source_node = cache.Node[source_corpus_id]

    todo_lists = params.pop("todo").split(',')   # ex: ['map', 'stop']
    source_lists = {}
    for key in todo_lists:
        source_lists[key] = UnweightedList(
            source_node.children(list_types[key]).first().id
        )

    # add the groupings too
    source_lists['groupings'] = Translations(
        source_node.children("GROUPLIST").first().id
    )

    # attempt to merge and send response
    try:
        # merge the source_lists onto those of the target corpus
        log_msg = merge_ngramlists(source_lists, onto_corpus=corpus_node)
        return JsonHttpResponse({
            'log': log_msg,
        }, 200)
    except Exception as e:
        return JsonHttpResponse({
            'err': str(e),
        }, 400)
def get(self, request, corpus_id):
    """
    Lists documents of corpus `corpus_id` containing at least one of the
    requested ngrams, with their summed TFIDF score.

    GET params:
        score:     metric name (validated as str)
        ngram_ids: comma-separated ngram ids

    Returns JSON {'count': <total matching docs>,
                  'records': [{'id', 'score', + available hyperdata}, ...]}
    records are limited to DEFAULT_N_DOCS_HAVING_NGRAM, ordered by score desc.
    """
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    parameters = get_parameters(request)
    parameters = validate(parameters, {'score': str, 'ngram_ids': list})

    try:
        ngram_ids = [int(n) for n in parameters['ngram_ids'].split(',')]
    except (ValueError, AttributeError):
        # fix: was a bare `except:` which also masked unrelated errors
        raise ValidationException(
            '"ngram_ids" needs integers separated by comma.')

    limit = DEFAULT_N_DOCS_HAVING_NGRAM
    nodes_list = []

    corpus = session.query(Node).filter(Node.id == corpus_id).first()

    tfidf_row = (session.query(Node.id)
                        .filter(Node.typename == "TFIDF-CORPUS",
                                Node.parent_id == corpus.id)
                        .first())
    if tfidf_row is None:
        # fix: previously crashed with TypeError on `tfidf_id[0]`
        raise ValidationException(
            "No TFIDF-CORPUS node found for corpus %s" % str(corpus_id))
    tfidf_id = tfidf_row[0]

    # request data: DOCUMENT nodes of this corpus matching any of the
    # ngrams, with a per-document sum of tfidf scores
    nodes_query = (session.query(Node, func.sum(NodeNodeNgram.score))
                          .join(NodeNodeNgram,
                                NodeNodeNgram.node2_id == Node.id)
                          .filter(NodeNodeNgram.node1_id == tfidf_id)
                          .filter(Node.typename == 'DOCUMENT',
                                  Node.parent_id == corpus.id)
                          .filter(or_(*[NodeNodeNgram.ngram_id == ngram_id
                                        for ngram_id in ngram_ids]))
                          .group_by(Node))

    # get the total count before applying limit
    nodes_count = nodes_query.count()

    # now the query with the limit
    nodes_results_query = (nodes_query
                           .order_by(func.sum(NodeNodeNgram.score).desc())
                           .limit(limit))

    for node, score in nodes_results_query:
        node_dict = {
            'id': node.id,
            'score': score,
        }
        # copy over only the hyperdata fields that exist on this doc
        for key in ('title', 'publication_date', 'source', 'authors',
                    'fields'):
            if key in node.hyperdata:
                node_dict[key] = node.hyperdata[key]
        nodes_list.append(node_dict)

    return JsonHttpResponse({'count': nodes_count, 'records': nodes_list})
def put(self, request, corpus_id, check_each_doc=True):
    """
    Adds one or more documents to the favorites of a corpus.

    PUT params:
        docs: comma-separated document node ids to mark as favorites

    When check_each_doc is True, verifies that every requested id is a
    DOCUMENT node belonging to this corpus before inserting.

    Returns JSON {'count_added': n} (or a warning when the corpus has no
    favorites node). Raises ValidationException on rejected doc ids.
    """
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    # user is ok
    fav_node = self._get_fav_node(corpus_id)
    response = {}

    if fav_node is None:   # fix: was `== None`
        response = {
            'warning': 'No favorites node is defined for this corpus (\'%s\')'
                       % self.corpus.name,
            'count_added': 0
        }
    else:
        req_params = validate(get_parameters(request), {
            'docs': list,
            'default': ""
        })
        nodeids_to_add = [int(did)
                          for did in req_params['docs'].split(',')]

        if check_each_doc:
            # check that these really are documents of the right corpus
            # (a bit slow => maybe disable by default?)
            known_docs_q = (session.query(Node.id)
                                   .filter(Node.parent_id == corpus_id)
                                   .filter(Node.typename == 'DOCUMENT'))
            # fix: use a real set instead of a dict with True values
            lookup = {known_doc.id for known_doc in known_docs_q.all()}
            rejected_list = [doc_node_id for doc_node_id in nodeids_to_add
                             if doc_node_id not in lookup]
            if len(rejected_list):
                raise ValidationException(
                    "Error on some requested docs: %s (Only nodes of type 'doc' AND belonging to corpus %i can be added to favorites.)"
                    % (str(rejected_list), int(corpus_id)))

        # add them
        bulk_insert(NodeNode, ('node1_id', 'node2_id', 'score'),
                    ((fav_node.id, doc_node_id, 1.0)
                     for doc_node_id in nodeids_to_add))

        # todo count really added (here: counts input param not result)
        response = {'count_added': len(nodeids_to_add)}

    return JsonHttpResponse(response)
def get(self, request, corpus_id):
    """
    2 possibilities with/without param

    1) GET http://localhost:8000/api/nodes/2/favorites
        (returns the full list of fav docs within corpus 2)

    2) GET http://localhost:8000/api/nodes/2/favorites?docs=53,54
        (will test if docs 53 and 54 are among the favorites of corpus 2)
        (returns the intersection of fav docs with [53,54])

    Returns JSON {'favdocs': [...]} plus 'missing': [...] in case (2),
    or a warning when the corpus has no favorites node.
    """
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    fav_node = self._get_fav_node(corpus_id)

    req_params = validate(get_parameters(request), {
        'docs': list,
        'default': ""
    })

    response = {}

    if fav_node is None:   # fix: was `== None`
        response = {
            'warning': 'No favorites node is defined for this corpus (\'%s\')'
                       % self.corpus.name,
            'favdocs': []
        }
    elif 'docs' not in req_params:
        # each docnode associated to the favnode of this corpusnode
        q = (session.query(NodeNode.node2_id)
                    .filter(NodeNode.node1_id == fav_node.id))
        all_doc_ids = [row.node2_id for row in q.all()]
        response = {'favdocs': all_doc_ids}
    else:
        nodeids_to_check = [int(did)
                            for did in req_params['docs'].split(',')]

        # each docnode from the input list, if it is associated to the favnode
        q = (session.query(NodeNode.node2_id)
                    .filter(NodeNode.node1_id == fav_node.id)
                    .filter(NodeNode.node2_id.in_(nodeids_to_check)))
        present_doc_ids = [row.node2_id for row in q.all()]
        # fix: membership test against a set, not the list (was O(n^2))
        present_set = set(present_doc_ids)
        absent_doc_ids = [did for did in nodeids_to_check
                          if did not in present_set]
        response = {'favdocs': present_doc_ids, 'missing': absent_doc_ids}

    return JsonHttpResponse(response)
def get(self, request):
    """
    Exports the term lists of a corpus as a downloadable CSV attachment.
    """
    url_params = get_parameters(request)
    corpus_id = int(url_params.pop("corpus"))
    corpus_node = cache.Node[corpus_id]

    # the HttpResponse doubles as the file-like target for the exporter
    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = (
        'attachment; filename="corpus-%i_gargantext_term_list.csv"'
        % corpus_id
    )

    # stream the list data straight into the response body
    export_ngramlists(corpus_node, fname=response, titles=True)
    return response
def post(self, request):
    """
    Merge the lists of a corpus with other lists
    from a CSV source           (aka import)
    or from another corpus

    params in request.GET:
        onto_corpus: the corpus whose lists are getting patched

    params in request.data:
        csvfile: the csv file

    /!\ We assume we checked the file size client-side before upload
    """
    if not request.user.is_authenticated():
        res = HttpResponse("Unauthorized")
        res.status_code = 401
        return res

    # target corpus whose lists will be patched
    url_params = get_parameters(request)
    target_corpus_id = int(url_params.pop("onto_corpus"))
    target_corpus = cache.Node[target_corpus_id]

    # only the owner of the target corpus may merge into it
    if request.user.id != target_corpus.user_id:
        res = HttpResponse("Unauthorized")
        res.status_code = 401
        return res

    # request also contains the uploaded file
    # (type django.core.files.uploadedfile.InMemoryUploadedFile)
    uploaded = request.data['csvfile']
    csv_contents = uploaded.read().decode("UTF-8").split("\n")
    uploaded.close()
    del uploaded

    # schedule the csv import + merge asynchronously
    log_msg = "Async generation"
    corpus_node_id = target_corpus.id
    scheduled(import_and_merge_ngramlists)(
        csv_contents,
        corpus_node_id,
        overwrite=bool(url_params.get('overwrite'))
    )

    return JsonHttpResponse({
        'log': log_msg,
    }, 200)
def get(self, request, node_id):
    """
    Aggregated document counts for a corpus, faceted on one hyperdata
    subfield (e.g. source, publication_year, authors...).

    Returns JSON {'doc_count': total, 'by': {<subfield>: <counts>}}.
    Raises ValidationException when node_id is not a CORPUS node.
    """
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    # check that the node is a corpus
    # ? faster from cache than: corpus = session.query(Node)...
    corpus = cache.Node[node_id]
    if corpus.typename != 'CORPUS':
        raise ValidationException(
            "Only nodes of type CORPUS can accept facet queries"
            + " (but this node has type %s)..." % corpus.typename)
    self.corpus = corpus

    # subfields the facet query may aggregate on
    allowed_subfields = [
        'source', 'publication_year', 'rubrique',
        'language_iso2', 'language_iso3', 'language_name',
        'authors'
    ]

    # validate() triggers an info message if subfield not in range
    checked_params = validate(get_parameters(request), {
        'type': dict,
        'items': {
            'hyperfield': {'type': str, 'range': allowed_subfields}
        }
    })
    subfield = checked_params['hyperfield']

    # do the aggregated sum
    (xcounts, total) = self._ndocs_by_facet(subfield)

    # response
    return JsonHttpResponse({
        'doc_count': total,
        'by': {subfield: xcounts}
    })
def delete(self, request):
    """Removes the list of nodes corresponding to the query.

    GET params:
        ids: comma-separated node ids to delete

    Returns JSON {'deleted': <row count>}.
    TODO : Should be a delete method!
    """
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    parameters = get_parameters(request)
    parameters = validate(parameters, {'ids': list})

    try:
        node_ids = [int(n) for n in parameters['ids'].split(',')]
    except (ValueError, AttributeError):
        # fix: was a bare `except:` which also masked unrelated errors
        raise ValidationException(
            '"ids" needs integers separated by comma.')

    result = session.execute(delete(Node).where(Node.id.in_(node_ids)))
    session.commit()

    return JsonHttpResponse({'deleted': result.rowcount})
def delete(self, request, corpus_id):
    """
    DELETE http://localhost:8000/api/nodes/2/favorites?docs=53,54
    (will delete docs 53 and 54 from the favorites of corpus 2)

    Returns JSON {'count_removed': n} (or a warning when the corpus has
    no favorites node). Removes only the favorite links, not the docs.
    """
    if not request.user.is_authenticated():
        # can't use @requires_auth because of positional 'self' within class
        return HttpResponse('Unauthorized', status=401)

    # user is ok
    fav_node = self._get_fav_node(corpus_id)
    response = {}

    if fav_node is None:   # fix: was `== None`
        response = {
            'warning': 'No favorites node is defined for this corpus (\'%s\')'
                       % self.corpus.name,
            'count_removed': 0
        }
    else:
        req_params = validate(get_parameters(request), {
            'docs': list,
            'default': ""
        })
        try:
            nodeids_to_delete = [int(did)
                                 for did in req_params['docs'].split(',')]
        except (ValueError, AttributeError):
            # fix: non-integer ids used to bubble up as a 500
            raise ValidationException(
                '"docs" needs integers separated by comma.')

        try:
            # it deletes from favourites but not from DB
            result = session.execute(
                delete(NodeNode)
                .where(NodeNode.node1_id == fav_node.id)
                .where(NodeNode.node2_id.in_(nodeids_to_delete)))
            session.commit()
            response = {'count_removed': result.rowcount}
        finally:
            session.close()

    return JsonHttpResponse(response)
def _query_nodes(request, node_id=None):
    """
    Builds the filtered, ordered, paginated node query for the current user.

    Args:
        request: the API request (must be authenticated)
        node_id: optional single node id to restrict the query to

    Returns:
        (parameters, query, count) -- count is the total BEFORE pagination;
        the receiver function does the filtering of fields/hyperdata_filter.

    Raises:
        TypeError           -- if the request is anonymous
        ValidationException -- if hyperdata_filter is used without
                               fields[]=hyperdata
    """
    if request.user.id is None:
        raise TypeError(
            "This API request must come from an authenticated user.")
    else:
        # we query among the nodes that belong to this user
        user = cache.User[request.user.id]

    # parameters validation
    # fixme: this validation does not allow custom keys in url (eg '?name=' for rename action)
    parameters = get_parameters(request)
    parameters = validate(parameters, {
        'type': dict,
        'items': {
            'formated': {'type': str, 'required': False, 'default': 'json'},
            'pagination_limit': {'type': int, 'default': 10},
            'pagination_offset': {'type': int, 'default': 0},
            'fields': {
                'type': list,
                'default': _node_default_fields,
                'items': {
                    'type': str,
                    'range': _node_available_fields,
                }
            },
            # choice of hyperdata fields
            'hyperdata_filter': {
                'type': list,
                'required': False,
                'items': {
                    'type': str,
                    'range': _hyperdata_available_fields,
                }
            },
            # optional filtering parameters
            'types': {
                'type': list,
                'required': False,
                'items': {
                    'type': str,
                    'range': _node_available_types,
                }
            },
            'parent_id': {'type': int, 'required': False},
        }
    })

    # additional validation for hyperdata_filter
    if (('hyperdata_filter' in parameters)
            and (not ('hyperdata' in parameters['fields']))):
        raise ValidationException(
            "Using the hyperdata_filter filter requires fields[]=hyperdata")

    # start the query
    query = user.nodes()

    # filter by id
    if node_id is not None:
        query = query.filter(Node.id == node_id)

    # filter by type
    if 'types' in parameters:
        query = query.filter(Node.typename.in_(parameters['types']))

    # filter by parent
    if 'parent_id' in parameters:
        query = query.filter(Node.parent_id == parameters['parent_id'])

    # total count before pagination
    count = query.count()

    # order
    query = query.order_by(Node.hyperdata['publication_date'], Node.id)

    # paginate the query
    offset = parameters['pagination_offset']
    limit = parameters['pagination_limit']
    if limit == -1:
        query = query[offset:]
    else:
        # fix: was query[offset:limit], which treated the limit as an
        # absolute end index -- any nonzero offset produced truncated or
        # empty pages (e.g. offset=10, limit=10 => empty slice)
        query = query[offset:offset + limit]

    # return the result!
    # (the receiver function does the filtering of fields and hyperdata_filter)
    return parameters, query, count
def get(self, request):
    """
    Returns the full "terms" data for a corpus: mainlist details plus
    map/stop membership, groupings and scores.

    GET params (two alternatives):
        corpus=<id>                      -- the family of lists (MAINLIST,
                                            GROUPLIST, STOPLIST, MAPLIST) is
                                            looked up among its children
                                            (scoring=<id> optional, defaults
                                            to the OCCURRENCES child)
        mainlist=<id>&scoring=<id>       -- explicit pointers; 'groups',
                                            'stoplist', 'maplist' optional
    Optional:
        head=<k>  -- only the top k mainlist ngrams (simplified form)

    Returns JSON: {ngraminfos, listmembers, links, nodeids}.
    """
    parameters = get_parameters(request)
    glance_limit = None
    mainlist_id = None
    scores_id = None
    groups_id = None
    other_list_ids = {'maplist': None, 'stoplist': None}

    # 1) retrieve a mainlist_id and other lists
    ##########################################

    # simple request: just refers to the parent corpus
    # ------------------------------------------------
    if "corpus" in parameters:
        corpus_id = parameters['corpus']
        corpus = cache.Node[corpus_id]
        # with a corpus_id, the explicit scoring pointer is optional
        if "scoring" in parameters:
            scores_id = parameters['scoring']
        else:
            scores_id = corpus.children('OCCURRENCES').first().id
        # retrieve the family of lists that have corpus as parent
        mainlist_id = corpus.children('MAINLIST').first().id
        groups_id = corpus.children('GROUPLIST').first().id
        other_list_ids['stoplist'] = corpus.children('STOPLIST').first().id
        other_list_ids['maplist'] = corpus.children('MAPLIST').first().id

    # custom request: refers to each list individually
    # -------------------------------------------------
    elif "mainlist" in parameters and "scoring" in parameters:
        mainlist_id = parameters['mainlist']
        scores_id = parameters['scoring']
        groups_id = None
        if 'groups' in parameters:
            # fix: previously read parameters['scoring'] here (copy-paste
            # bug), so an explicit 'groups' param was silently ignored
            groups_id = parameters['groups']
        for k in ['stoplist', 'maplist']:
            if k in parameters:
                other_list_ids[k] = parameters[k]

    # or request has an error
    # -----------------------
    else:
        raise ValidationException(
            "Either a 'corpus' parameter or 'mainlist' & 'scoring' params are required"
            )

    # 2) get the infos for each list
    ################################
    ngraminfo = {}     # ngram details sorted per ngram id
    linkinfo = {}      # ngram groups sorted per ngram id
    listmembers = {}   # ngram ids sorted per list name

    if "head" in parameters:
        # head <=> only mainlist AND only k top ngrams
        glance_limit = int(parameters['head'])
        mainlist_query = query_list(mainlist_id, details=True,
                                    pagination_limit=glance_limit,
                                    scoring_metric_id=scores_id)
    else:
        # infos for all ngrams from mainlist
        mainlist_query = query_list(mainlist_id, details=True,
                                    scoring_metric_id=scores_id)

    # infos for grouped ngrams, absent from mainlist
    hidden_ngrams_query = query_grouped_ngrams(groups_id, details=True)

    # infos for stoplist terms, absent from mainlist
    stop_ngrams_query = query_list(other_list_ids['stoplist'], details=True,
                                   scoring_metric_id=scores_id)

    # and for the other lists (stop and map)
    # no details needed here, just the member ids
    for li in other_list_ids:
        li_elts = query_list(other_list_ids[li], details=False).all()
        # simple array of ngram_ids
        listmembers[li] = [ng[0] for ng in li_elts]

    # and the groupings
    if groups_id:
        links = Translations(groups_id)
        linkinfo = links.groups

    # list of ngrams that need their details in the response
    ngrams_which_need_detailed_info = []
    if "head" in parameters:
        # head triggered simplified form: just the top of the mainlist
        # TODO add maplist membership
        ngrams_which_need_detailed_info = mainlist_query.all()
    else:
        ngrams_which_need_detailed_info = (mainlist_query.all()
                                           + hidden_ngrams_query.all()
                                           + stop_ngrams_query.all())

    # the output form of details is:
    #    ngraminfo[id] => [term, weight]
    for ng in ngrams_which_need_detailed_info:
        ng_id = ng[0]
        ngraminfo[ng_id] = ng[1:]
        # NB the client js will sort mainlist ngs from hidden ngs after ajax
        #    using linkinfo (otherwise needs redundant listmembers for main)

    return JsonHttpResponse({
        'ngraminfos': ngraminfo,
        'listmembers': listmembers,
        'links': linkinfo,
        'nodeids': {
            'mainlist': mainlist_id,
            'maplist': other_list_ids['maplist'],
            'stoplist': other_list_ids['stoplist'],
            'groups': groups_id,
            'scores': scores_id,
        }
    })
def put(self, request):
    """
    Add some group elements to a group node
    => adds new couples from GroupsBuffer._to_add of terms view

    TODO see use of util.lists.Translations

    Parameters are all in the url (for symmetry with DELETE method)
        api/ngramlists/groups?node=783&1228[]=891,1639
                => creates 1228 - 891
                       and 1228 - 1639

    general format is:   mainform_id[]=subform_id1,subform_id2 etc
                => creates mainform_id - subform_id1
                       and mainform_id - subform_id2

    NB: also checks if the couples exist before because the ngram table
        will send the entire group (old existing links + new links)
    """
    url_params = get_parameters(request)

    # the unique 'node' param identifies the grouping node
    grouping_node_id = url_params.pop('node')

    # each remaining param encodes a (mainform, subform) link to create
    requested_couples = self.links_to_couples(url_params)

    # local version of "insert if not exists" -------------------->8--------
    # (1) fetch the couples already stored for this node
    existing_rows = (session.query(NodeNgramNgram)
                            .filter(NodeNgramNgram.node_id == grouping_node_id)
                            .filter(tuple_(NodeNgramNgram.ngram1_id,
                                           NodeNgramNgram.ngram2_id)
                                    .in_(requested_couples)))
    already_stored = set()
    for row in existing_rows.all():
        already_stored.add((row.ngram1_id, row.ngram2_id))

    # (2) compute the difference locally: keep only genuinely new couples
    new_couples = [(mform, sform)
                   for (mform, sform) in requested_couples
                   if (mform, sform) not in already_stored]

    # (3) bulk-insert the new groupings
    bulk_insert(NodeNgramNgram,
                ('node_id', 'ngram1_id', 'ngram2_id', 'weight'),
                ((grouping_node_id, mainform, subform, 1.0)
                 for (mainform, subform) in new_couples))
    # ------------------------------------------------------------>8--------

    return JsonHttpResponse({
        'count_added': len(new_couples),
    }, 200)
def put(self, request):
    """
    Basic external access for *creating an ngram*
    ---------------------------------------------
    1 - checks user authentication before any changes

    2 - checks if the ngram is already in the Ngram table in DB
          if yes returns ngram_id and optionally mainform_id
          otherwise continues

    3 - adds the ngram to the Ngram table in DB

    4 - (if corpus param is present)
        adds the ngram doc counts to the NodeNgram table in DB
        (aka "index the ngram" through the docs of the corpus)

    5 - returns json with:
         'msg'   => a success msg
         'text'  => the initial text content
         'term'  => the normalized text content
         'id'    => the new ngram_id
         'count' => the number of docs with the ngram in the corpus
                    (if corpus param is present)
         'group' => the mainform_id if applicable

    possible inline parameters
    --------------------------
    @param text=<ngram_string>         [required]
    @param corpus=<CORPUS_ID>          [optional]
    @param testgroup (true if present) [optional, requires corpus]
    """
    # 1 - check user authentication
    if not request.user.is_authenticated():
        res = HttpResponse("Unauthorized")
        res.status_code = 401
        return res

    # the params
    params = get_parameters(request)
    print("PARAMS", [(i,v) for (i,v) in params.items()])

    if 'text' in params:
        original_text = str(params.pop('text'))
        # the normalized form is what gets stored and compared in DB
        ngram_str = normalize_forms(normalize_chars(original_text))
    else:
        raise ValidationException('The route PUT /api/ngrams/ is used to create a new ngram\ It requires a "text" parameter,\ for instance /api/ngrams?text=hydrometallurgy')

    # 'testgroup' only makes sense relative to a corpus' GROUPLIST
    if ('testgroup' in params) and (not ('corpus' in params)):
        raise ValidationException("'testgroup' param requires 'corpus' param")

    # if we have a 'corpus' param (to do the indexing)...
    do_indexation = False
    if 'corpus' in params:
        # we retrieve the corpus...
        corpus_id = int(params.pop('corpus'))
        corpus_node = cache.Node[corpus_id]
        # and the user must also have rights on the corpus
        if request.user.id == corpus_node.user_id:
            do_indexation = True
        else:
            res = HttpResponse("Unauthorized")
            res.status_code = 401
            return res

    # number of "words" in the ngram (space-separated tokens)
    ngram_size = len(findall(r' +', ngram_str)) + 1

    # do the additions
    try:
        log_msg = ""
        ngram_id = None
        mainform_id = None

        # 2 - check if the exact normalized form is already stored
        preexisting = session.query(Ngram).filter(Ngram.terms==ngram_str).first()

        if preexisting is not None:
            ngram_id = preexisting.id
            log_msg += "ngram already existed (id %i)\n" % ngram_id

            # in the context of a corpus we can also check if has mainform
            # (useful for)
            if 'testgroup' in params:
                groupings_id = (session.query(Node.id)
                                       .filter(Node.parent_id == corpus_id)
                                       .filter(Node.typename == 'GROUPLIST')
                                       .first()
                                )
                # NOTE(review): .first() on a query of Node.id returns a
                # result row (a 1-tuple), not a bare id — comparing
                # NodeNgramNgram.node_id to it below looks suspicious;
                # confirm SQLAlchemy coerces this as intended.
                had_mainform = (session.query(NodeNgramNgram.ngram1_id)
                                       .filter(NodeNgramNgram.node_id == groupings_id)
                                       .filter(NodeNgramNgram.ngram2_id == preexisting.id)
                                       .first()
                                )
                if had_mainform:
                    mainform_id = had_mainform[0]
                    log_msg += "ngram had mainform (id %i) in this corpus" % mainform_id
                else:
                    log_msg += "ngram was not in any group for this corpus"
        else:
            # 2 - insert into Ngrams
            new_ngram = Ngram(terms=ngram_str, n=ngram_size)
            session.add(new_ngram)
            session.commit()
            ngram_id = new_ngram.id
            log_msg += "ngram was added with new id %i\n" % ngram_id

        # 3 - index the term (count its occurrences in the corpus docs)
        if do_indexation:
            n_added = index_new_ngrams([ngram_id], corpus_node)
            log_msg += 'ngram indexed in corpus %i\n' % corpus_id

        # NB: n_added is only bound when do_indexation is True, and the
        # conditional expression below only evaluates it in that case
        return JsonHttpResponse({
            'msg': log_msg,
            'text': original_text,
            'term': ngram_str,
            'id' : ngram_id,
            'group' : mainform_id,
            'count': n_added if do_indexation else 'no corpus provided for indexation'
            }, 200)

    # just in case
    except Exception as e:
        return JsonHttpResponse({
            'msg': str(e),
            'text': original_text
            }, 400)