Ejemplo n.º 1
0
    def patch(self, request):
        """
        A copy of POST (merging list) but with the source == just an internal corpus_id

        params in request.GET:
            onto_corpus:  the corpus whose lists are getting patched
            from:         the corpus from which we take the source lists to merge in
            todo:         an array of the list types ("map", "main", "stop") to merge in
        """
        def unauthorized():
            # bare 401 response, used both for anonymous and non-owner access
            response = HttpResponse("Unauthorized")
            response.status_code = 401
            return response

        if not request.user.is_authenticated():
            return unauthorized()

        query_params = get_parameters(request)
        print(query_params)

        # the corpus with the target lists to be patched
        target_corpus = cache.Node[int(query_params.pop("onto_corpus"))]

        print(query_params)

        # only the owner of the target corpus may patch its lists
        if request.user.id != target_corpus.user_id:
            return unauthorized()

        type_names = {'map':'MAPLIST', 'main':'MAINLIST', 'stop':'STOPLIST'}

        # internal DB retrieve source_lists
        source_node = cache.Node[int(query_params.pop("from_corpus"))]

        requested = query_params.pop("todo").split(',')   # ex: ['map', 'stop']
        source_lists = {
            key: UnweightedList(source_node.children(type_names[key]).first().id)
            for key in requested
        }

        # add the groupings too
        source_lists['groupings'] = Translations(
            source_node.children("GROUPLIST").first().id
        )

        # attempt to merge and send response
        try:
            # merge the source_lists onto those of the target corpus
            log_msg = merge_ngramlists(source_lists, onto_corpus=target_corpus)
            return JsonHttpResponse({
                'log': log_msg,
                }, 200)

        except Exception as e:
            return JsonHttpResponse({
                'err': str(e),
                }, 400)
Ejemplo n.º 2
0
    def get(self, request):
        """
        Return JSON infos about a family of term lists (main/map/stop +
        groups + scores).

        params in request.GET (alternative 1 — via parent corpus):
            corpus:   corpus node id; its child lists are looked up
            scoring:  optional scoring node id
                      (default: the corpus' OCCURRENCES child node)
            head:     optional int; restrict output to the k top mainlist
                      ngrams (simplified response)

        params in request.GET (alternative 2 — explicit list ids):
            mainlist, scoring:           both required
            groups, stoplist, maplist:   optional

        Raises ValidationException if neither alternative is satisfied.
        """
        parameters = get_parameters(request)
        glance_limit = None
        mainlist_id = None
        scores_id = None
        groups_id = None
        other_list_ids = {'maplist': None, 'stoplist': None}

        # 1) retrieve a mainlist_id and other lists
        ##########################################

        # simple request: just refers to the parent corpus
        # ------------------------------------------------
        if "corpus" in parameters:
            corpus_id = parameters['corpus']
            corpus = cache.Node[corpus_id]
            # with a corpus_id, the explicit scoring pointer is optional
            if "scoring" in parameters:
                scores_id = parameters['scoring']
            else:
                scores_id = corpus.children('OCCURRENCES').first().id
            # retrieve the family of lists that have corpus as parent
            mainlist_id = corpus.children('MAINLIST').first().id
            groups_id = corpus.children('GROUPLIST').first().id
            other_list_ids['stoplist'] = corpus.children('STOPLIST').first().id
            other_list_ids['maplist'] = corpus.children('MAPLIST').first().id

        # custom request: refers to each list individually
        # -------------------------------------------------
        elif "mainlist" in parameters and "scoring" in parameters:
            mainlist_id = parameters['mainlist']
            scores_id = parameters['scoring']
            groups_id = None
            if 'groups' in parameters:
                # FIX: was parameters['scoring'] (copy-paste bug that ignored
                #      the provided 'groups' parameter)
                groups_id = parameters['groups']
            for k in ['stoplist', 'maplist']:
                if k in parameters:
                    other_list_ids[k] = parameters[k]

        # or request has an error
        # -----------------------
        else:
            raise ValidationException(
                "Either a 'corpus' parameter or 'mainlist' & 'scoring' params are required"
            )

        # 2) get the infos for each list
        ################################
        ngraminfo = {}  # ngram details sorted per ngram id
        linkinfo = {}  # ngram groups sorted per ngram id
        listmembers = {}  # ngram ids sorted per list name
        if "head" in parameters:
            # head <=> only mainlist AND only k top ngrams
            glance_limit = int(parameters['head'])
            mainlist_query = query_list(mainlist_id,
                                        details=True,
                                        pagination_limit=glance_limit,
                                        scoring_metric_id=scores_id)
        else:
            # infos for all ngrams from mainlist
            mainlist_query = query_list(mainlist_id,
                                        details=True,
                                        scoring_metric_id=scores_id)
            # infos for grouped ngrams, absent from mainlist
            hidden_ngrams_query = query_grouped_ngrams(groups_id, details=True)

            # infos for stoplist terms, absent from mainlist
            stop_ngrams_query = query_list(other_list_ids['stoplist'],
                                           details=True,
                                           scoring_metric_id=scores_id)

            # and for the other lists (stop and map)
            # no details needed here, just the member ids
            for li in other_list_ids:
                li_elts = query_list(other_list_ids[li], details=False).all()
                # simple array of ngram_ids
                listmembers[li] = [ng[0] for ng in li_elts]

            # and the groupings
            if groups_id:
                links = Translations(groups_id)
                linkinfo = links.groups

        # list of ngrams whose details (term, weight) must be returned
        ngrams_which_need_detailed_info = []
        if "head" in parameters:
            # head triggered simplified form: just the top of the mainlist
            # TODO add maplist membership
            ngrams_which_need_detailed_info = mainlist_query.all()
        else:
            ngrams_which_need_detailed_info = mainlist_query.all(
            ) + hidden_ngrams_query.all() + stop_ngrams_query.all()

        # the output form of details is:
        # ngraminfo[id] => [term, weight]
        for ng in ngrams_which_need_detailed_info:
            ng_id = ng[0]
            ngraminfo[ng_id] = ng[1:]

            # NB the client js will sort mainlist ngs from hidden ngs after ajax
            #    using linkinfo (otherwise needs redundant listmembers for main)

        return JsonHttpResponse({
            'ngraminfos': ngraminfo,
            'listmembers': listmembers,
            'links': linkinfo,
            'nodeids': {
                'mainlist': mainlist_id,
                'maplist': other_list_ids['maplist'],
                'stoplist': other_list_ids['stoplist'],
                'groups': groups_id,
                'scores': scores_id,
            }
        })
def compute_groups(corpus, stoplist_id = None, overwrite_id = None):
    """
    1) Use a stemmer/lemmatizer to group forms if they have same stem/lemma
    2) Create an empty GROUPLIST node (for a list of "synonym" ngrams)
    3) Save the list to DB (list node + each grouping as listnode - ngram1 - ngram2)

    @param corpus:        corpus node whose documents' ngrams get grouped
    @param stoplist_id:   optional STOPLIST node id; its ngrams are skipped
    @param overwrite_id:  optional pre-existing GROUPLIST node id to reuse
                          (otherwise a fresh GROUPLIST child node is created)
    @return: the id of the GROUPLIST node that was written
    """

    # ngram_id => True, used as a fast membership test during the main loop
    stop_ngrams_ids = {}
    # we will need the ngrams of the stoplist to filter
    if stoplist_id is not None:
        for row in session.query(NodeNgram.ngram_id).filter(NodeNgram.node_id == stoplist_id).all():
            stop_ngrams_ids[row[0]] = True


    # 1) compute stems/lemmas
    #    and group if same stem/lemma
    stemmers = prepare_stemmers(corpus)
    print("# STEMMERS LOADED", stemmers)
    supported_stemmers_lang = [lang for lang in corpus.hyperdata["languages"] if lang != "__unknown__"]

    print("#SUPPORTED STEMMERS LANGS", supported_stemmers_lang)
    # todo dict {lg => {ngrams_todo} }
    todo_ngrams_per_lg = defaultdict(set)

    # res dict { commonstem: {ngram_1:freq_1 ,ngram_2:freq_2 ,ngram_3:freq_3} }
    my_groups = defaultdict(Counter)

    # preloop per doc to sort ngrams by language
    for doc in corpus.children('DOCUMENT'):
        if doc.id not in corpus.hyperdata['skipped_docs']:
            if ('language_iso2' in doc.hyperdata) and doc.hyperdata['language_iso2'] \
                                                    in supported_stemmers_lang:
                lgid = doc.hyperdata['language_iso2']

            else:
                lgid = "__unknown__"
                doc.status("NGRAMS_GROUPS", error="Error: unsupported language for stemming")
                doc.save_hyperdata()
                #corpus.hyperdata["skipped_docs"].append(doc.id)
                #corpus.save_hyperdata()
            # doc.ngrams is an sql query (ugly but useful intermediate step)
            # FIXME: move the counting and stoplist filtering up here
            for ngram_pack in doc.ngrams.all():
                todo_ngrams_per_lg[lgid].add(ngram_pack)

    # --------------------
    # long loop per ngrams
    for (lgid,todo_ngs) in todo_ngrams_per_lg.items():
        # fun: word::str => stem::str
        stem_it = stemmers[lgid].stem

        for ng in todo_ngs:
            doc_wei = ng[0]
            ngram  = ng[1]       # Ngram obj

            # skip if in STOPLIST
            # FIX: was a bare `next` expression (a no-op in Python), so
            #      stoplisted ngrams were never actually filtered out
            if ngram.id in stop_ngrams_ids:
                continue

            lexforms = [lexunit for lexunit in resplit(r'\W+',ngram.terms)]

            # STEM IT, and this term's stems will become a new grouping key...
            stemseq = " ".join([stem_it(lexfo) for lexfo in lexforms])

            # ex:
            # groups['post'] = {'poste':3, 'poster':5, 'postés':2...}
            # groups['copper engrav'] = {'copper engraving':3, 'coppers engraver':1...}
            my_groups[stemseq][ngram.id] += doc_wei

    del todo_ngrams_per_lg

    # now serializing all groups to a list of couples
    ng_couples = []
    addcouple = ng_couples.append
    for grped_ngramids in my_groups.values():
        if len(grped_ngramids) > 1:
            # first find most frequent term in the counter
            winner_id = grped_ngramids.most_common(1)[0][0]

            for ngram_id in grped_ngramids:
                if ngram_id != winner_id:
                    addcouple((winner_id, ngram_id))

    del my_groups

    # 2) the list node
    if overwrite_id:
        # overwrite pre-existing id
        the_id = overwrite_id
    # or create the new id
    else:
        the_group =  corpus.add_child(
            typename  = "GROUPLIST",
            name = "Group (src:%s)" % corpus.name[0:10]
        )

        # and save the node
        session.add(the_group)
        session.commit()
        the_id = the_group.id

    # 3) Save each grouping couple to DB thanks to Translations.save() table
    # NB: Translations expects (subform, mainform) order, hence the swap
    ndngng_list = Translations(
                                [(sec,prim) for (prim,sec) in ng_couples],
                                just_items=True
                   )

    # ...referring to the list node we just got
    ndngng_list.save(the_id)

    return the_id
Ejemplo n.º 4
0
def merge_ngramlists(new_lists=None, onto_corpus=None, del_originals=None):
    """
    Integrates an external terms table to the current one:
       - merges groups (using group_union() function)
       - resolves conflicts if terms belong in different lists
          > map wins over both other types
          > main wins over stop
          > stop never wins   £TODO STOP wins over candidates from main

    @param new_lists:     a dict of *new* imported lists with format:
                                {'stop':     UnweightedList,
                                 'main':     UnweightedList,
                                 'map':      UnweightedList,
                                 'groupings': Translations }

                   if any of those lists is absent it is considered empty

    @param onto_corpus:   a corpus node to get the *old* lists

    @param del_originals: an array of original wordlists to ignore
                          and delete during the merge
                          possible values : ['stop','main','map']

            for example
            del_originals = ['stop','main'] => will erase the stoplist
                                               and the mainlist
                                               but not the maplist, which
                                               will be merged (the maplist
                                               elements are copied back
                                               into main at the end)

    NB: Uses group_tools.group_union() to merge the synonym links.
        Uses ngrams_addition.index_new_ngrams() to also add new ngrams to the docs
    """
    # FIX: avoid mutable default arguments ({} and []) which are shared
    #      across calls; normalize the None sentinels here instead
    if new_lists is None:
        new_lists = {}
    if del_originals is None:
        del_originals = []

    # log to send back to client-side (lines will be joined)
    my_log = []

    # the tgt node arg has to be a corpus here
    if not hasattr(onto_corpus,
                   "typename") or onto_corpus.typename != "CORPUS":
        raise TypeError("IMPORT: 'onto_corpus' argument must be a Corpus Node")

    # our list shortcuts will be 0,1,2 (aka lid)
    # by order of precedence
    linfos = [
        {
            'key': 'stop',
            'name': "STOPLIST"
        },  # lid = 0
        {
            'key': 'main',
            'name': "MAINLIST"
        },  # lid = 1
        {
            'key': 'map',
            'name': "MAPLIST"
        }  # lid = 2
    ]

    # ======== Index the new ngrams in the docs =========
    all_possibly_new_ngram_ids = []
    collect = all_possibly_new_ngram_ids.append
    for lid, info in enumerate(linfos):
        list_type = info['key']
        if list_type in new_lists:
            for ng_id in new_lists[list_type].items:
                collect(ng_id)

    from gargantext.util.toolchain.main import t
    print("MERGE DEBUG: starting index_new_ngrams", t())
    n_added = index_new_ngrams(all_possibly_new_ngram_ids, onto_corpus)
    print("MERGE DEBUG: finished index_new_ngrams", t())

    my_log.append("MERGE: added %i new ngram occurrences in docs" % n_added)

    # ======== Get the old lists =========
    old_lists = {}

    # DB nodes stored with same indices 0,1,2 (resp. stop, miam and map)
    # find target ids of the list node objects
    tgt_nodeids = [
        onto_corpus.children(
            "STOPLIST").first().id,  # £todo via parent project?
        onto_corpus.children("MAINLIST").first().id,
        onto_corpus.children("MAPLIST").first().id
    ]

    old_group_id = onto_corpus.children("GROUPLIST").first().id

    # retrieve old data into old_lists[list_type]...
    # ----------------------------------------------
    for lid, linfo in enumerate(linfos):
        list_type = linfo['key']
        if list_type not in del_originals:

            # NB can't use UnweightedList(tgt_nodeids[lid])
            # because we need to include out-of-list subforms
            list_ngrams_q = query_list(tgt_nodeids[lid],
                                       groupings_id=old_group_id)
            old_lists[list_type] = UnweightedList(list_ngrams_q.all())
        else:
            # ...or use empty objects if replacing old list
            # ----------------------------------------------
            old_lists[list_type] = UnweightedList()
            msg = "MERGE: ignoring old %s which will be overwritten" % linfo[
                'name']
            print(msg)
            my_log.append(msg)

    # ======== Merging all involved ngrams =========

    # all memberships with resolved conflicts of interfering memberships
    resolved_memberships = {}

    for list_set in [old_lists, new_lists]:
        for lid, info in enumerate(linfos):
            list_type = info['key']
            # if you don't want to merge one list just don't put it in new_lists
            if list_type in list_set:
                # we use the fact that lids are ordered ints...
                for ng_id in list_set[list_type].items:
                    if ng_id not in resolved_memberships:
                        resolved_memberships[ng_id] = lid
                    else:
                        # ...now resolving is simply taking the max
                        # stop < main < map
                        resolved_memberships[ng_id] = max(
                            lid, resolved_memberships[ng_id])
            # now each ngram is only in its most important list
            # -------------------------------------------------
            # NB temporarily map items are not in main anymore
            #    but we'll copy it at the end
            # NB temporarily all subforms were treated separately
            #    from mainforms but we'll force them into same list
            #    after we merge the groups

    del old_lists

    # ======== Merging old and new groups =========
    # get the arcs already in the target DB (directed couples)
    previous_links = session.query(
        NodeNgramNgram.ngram1_id, NodeNgramNgram.ngram2_id).filter(
            NodeNgramNgram.node_id == old_group_id).all()

    n_links_previous = len(previous_links)

    # same format for the new arcs (Translations ~~~> array of couples)
    translated_imported_links = []
    add_link = translated_imported_links.append
    n_links_added = 0
    # FIX: a missing 'groupings' entry is now treated as empty, as the
    #      docstring promises (it used to raise KeyError)
    if 'groupings' in new_lists:
        for (y, x) in new_lists['groupings'].items.items():
            add_link((x, y))
            n_links_added += 1
    del new_lists

    # group_union: joins 2 different synonym-links lists into 1 new list
    new_links = group_union(previous_links, translated_imported_links)
    del previous_links
    del translated_imported_links

    n_links_after = len(new_links)

    merged_group = Translations([(y, x) for (x, y) in new_links])
    del new_links

    # ======== Overwrite old data with new =========

    merged_group.save(old_group_id)

    msg = "MERGE: groupings %i updated (links before/added/after: %i/%i/%i)" % (
        old_group_id, n_links_previous, n_links_added, n_links_after)
    my_log.append(msg)
    print(msg)

    # ======== Target list(s) append data =========
    # if list 2 => write in both tgt_data_lists [1,2]
    # lists 0 or 1 => straightforward targets [0] or [1]

    merged_results = {
        'stop': UnweightedList(),
        'main': UnweightedList(),
        'map': UnweightedList()
    }

    for (ng_id, winner_lid) in resolved_memberships.items():

        ## 1) using the new groups
        # normal case if not a subform
        if ng_id not in merged_group.items:
            target_lid = winner_lid
        # inherit case if is a subform
        else:
            mainform_id = merged_group.items[ng_id]
            # inherited winner
            try:
                target_lid = resolved_memberships[mainform_id]
            except KeyError:
                target_lid = winner_lid
                print("MERGE: WARN ng_id %i has incorrect mainform %i ?" %
                      (ng_id, mainform_id))

        ## 2) map => map + main
        if target_lid == 2:
            todo_lids = [1, 2]
        else:
            todo_lids = [target_lid]

        ## 3) storage
        for lid in todo_lids:
            list_type = linfos[lid]['key']
            merged_results[list_type].items.add(ng_id)

    # ======== Overwrite old data with new =========
    for lid, info in enumerate(linfos):
        tgt_id = tgt_nodeids[lid]
        list_type = info['key']
        result = merged_results[list_type]
        result.save(tgt_id)

        msg = "MERGE: %s %i updated (new size: %i)" % (
            info['name'], tgt_id, len(merged_results[list_type].items))
        my_log.append(msg)
        print(msg)

    # return a log
    return ("\n".join(my_log))
Ejemplo n.º 5
0
def import_ngramlists(the_file,
                      delimiter=DEFAULT_CSV_DELIM,
                      group_delimiter=DEFAULT_CSV_DELIM_GROUP):
    '''
    This function reads a CSV of an ngrams table for a Corpus,
    then it converts old ngram_ids to those of the current DB
       (and adds to DB any unknown ngrams)
    then recreates an equivalent set of MAINLIST, MAPLIST, STOPLIST + GROUPS

    Input example:
        status  | label         |forms
        --------+---------------+---------------------
        map      water table     water tables
        map      water supply    water-supply|&|water supplies
        stop     wastewater

    The title line is mandatory.
    The label will correspond to our DB mainform type.

    Variants:
    ----------
    For user accessibility, we allow different formats using equivalence rules:

    1) It is implicit that the label string is also one of the forms
       therefore the input example table is equivalent to this "verbose" table:

        status  | label         |forms
        --------+---------------+---------------------
        map      water table     water table|&|water tables
        map      water supply    water supply|&|water-supply|&|water supplies
        stop     wastewater      wastewater


    2) The default status is map and the status column is optional
       thus, if we ignore "wastewater", the input table is also equivalent to:

         label         |forms
        ---------------+---------------------
        water table     water tables
        water supply    water-supply|&|water supplies


    3) From DB point of view, both "forms that are labels" and "other forms" are
       finally saved just as ngrams. So the input table is also equivalent to:

        status  | label         |forms
        --------+---------------+---------------------
        map      water table     water tables
        map      water tables
        map      water supply    water-supply|&|water supplies
        map      water supplies
        map      water-supply
        stop     wastewater


    Output:
    -------
        3 x UnweightedList + 1 x Translations

    @param the_file         a local filename or file contents or a filehandle-like
    @param delimiter        a character used as separator in the CSV
    @param group_delimiter  a character used as grouped subforms separator
                            (in the last column)

    The retrieval of ngram_ids works in 2 steps:
        => look up each term str in the DB with bulk_insert_ifnotexists
           (creates absent ngrams if necessary)
        => use the new ids to map the relations involving the old ones

    NB: the creation of MAINLIST also adds all elements from the MAPLIST

    NB: To merge the imported lists into a corpus node's lists,
        chain this function with merge_ngramlists()
    '''
    # ---------------
    #  ngram storage
    # ---------------

    # main storage for the ngrams by list
    imported_nodes_ngrams = {'stop': [], 'main': [], 'map': []}

    # and all the terms (for unique and for dbdata bulk_insert)
    imported_unique_ngramstrs = {}

    # and for the imported_grouping list of couples [(str1,str1),(str1,str2)..]
    imported_groupings = []

    # /!\ imported_grouping contains the subforms' terms themselves
    #     (that will have to be translated to ngram_ids for the target db)

    # =============== READ CSV ===============

    if isinstance(the_file, list):
        fname = 'imported_file'
        contents = the_file
    else:
        if isinstance(the_file, str):
            # FIX: open in binary mode so fh.read() yields bytes in both
            #      branches; the text-mode "r" open used to make the later
            #      .decode("UTF-8") fail with AttributeError on str
            fh = open(the_file, "rb")
            fname = the_file
        elif callable(getattr(the_file, "read", None)):
            fh = the_file
            fname = the_file
        else:
            raise TypeError("IMPORT: the_file argument has unknown type %s" %
                            type(the_file))

        # reading all directly b/c csv.reader takes only lines or a real fh in bytes
        # and we usually have a "false" fh (uploadedfile.InMemoryUploadedFile) in strings
        # (but we checked its size before!)
        contents = fh.read().decode("UTF-8").split("\n")

        # end of CSV read
        fh.close()

    # <class 'django.core.files.uploadedfile.InMemoryUploadedFile'>

    ngrams_csv_rows = reader(contents,
                             delimiter=delimiter,
                             quoting=QUOTE_MINIMAL)

    # for stats
    n_read_lines = 0
    n_total_ng = 0
    n_added_ng = 0
    n_group_relations = 0

    # columntype => int
    columns = {}

    # load CSV + initial checks
    for i, csv_row in enumerate(ngrams_csv_rows):
        # fyi
        n_read_lines += 1
        # print("---------------READ LINE %i" % i)

        # headers
        if i == 0:
            n_cols = len(csv_row)
            for j, colname in enumerate(csv_row):
                if colname in ['label', 'status', 'forms']:
                    columns[colname] = j
                # skip empty columns
                elif match(r'^\s*$', colname):
                    pass
                else:
                    raise ValueError(
                        'Wrong header "%s" on line %i (only possible headers are "label", "forms" and "status")'
                        % (colname, n_read_lines))
            if 'label' not in columns:
                raise ValueError(
                    'CSV must contain at least one column with the header "label"'
                )
            # FIX: the header row used to fall through into the data-row
            #      handling below (importing the literal word "label" as a
            #      term when no status column was present)
            continue

        if not len(csv_row):
            continue

        # mandatory column
        this_row_label = str(csv_row[columns['label']])

        # other columns or their default values
        if 'status' in columns:
            this_list_type = str(csv_row[columns['status']])
        else:
            this_list_type = 'map'

        if 'forms' in columns:
            this_row_forms = str(csv_row[columns['forms']])
        else:
            this_row_forms = ''

        # string normalizations
        this_row_label = normalize_forms(normalize_chars(this_row_label))

        # --- term checking
        if not len(this_row_label) > 0:
            print("IMPORT WARN: (skip line) empty term at CSV %s:l.%i" %
                  (fname, i))
            continue

        # --- check correct list type
        if not this_list_type in ['stop', 'main', 'map']:
            print("IMPORT WARN: (skip line) wrong list type at CSV %s:l.%i" %
                  (fname, i))
            continue

        # subforms can be duplicated (in forms and another label)
        # but we must take care of unwanted other duplicates too
        if this_row_label in imported_unique_ngramstrs:
            print(
                "TODO IMPORT DUPL: (skip line) term appears more than once at CSV %s:l.%i"
                % (fname, i))

        # ================= Store the data ====================
        # the ngram census
        imported_unique_ngramstrs[this_row_label] = True

        # and the "list to ngram" relation
        imported_nodes_ngrams[this_list_type].append(this_row_label)

        # ====== Store synonyms from the import (if any) ======
        if len(this_row_forms) != 0:
            other_terms = []
            for raw_term_str in this_row_forms.split(group_delimiter):

                # each subform is also like an ngram declaration
                term_str = normalize_forms(normalize_chars(raw_term_str))
                imported_unique_ngramstrs[term_str] = True
                imported_nodes_ngrams[this_list_type].append(term_str)

                # the optional repeated mainform doesn't interest us
                # because we already have it via the label
                if term_str != this_row_label:

                    # save links
                    imported_groupings.append((this_row_label, term_str))

    # ======== ngram save + id lookup =========
    n_total_ng = len(imported_unique_ngramstrs)

    # prepare data format
    imported_ngrams_dbdata = []
    for ngram_str in imported_unique_ngramstrs:
        # DB needs the number of separate words
        n_words = 1 + len(findall(r' ', ngram_str))
        imported_ngrams_dbdata.append((ngram_str, n_words))

    # returns a dict {term => id} and a count of inserted ones
    #                             -------------------------
    (new_ngrams_ids, n_added_ng) = bulk_insert_ifnotexists(
        #                             -------------------------
        model=Ngram,
        uniquekey='terms',
        fields=('terms', 'n'),
        data=imported_ngrams_dbdata,
        do_stats=True)
    del imported_ngrams_dbdata

    # new_ngrams_ids contains a direct mapping ng_str => new_id
    del imported_unique_ngramstrs

    # ======== Import into lists =========

    # 3 x abstract lists + 1 translations
    result = {
        'map': UnweightedList(),
        'main': UnweightedList(),
        'stop': UnweightedList(),
        'groupings': Translations()
    }

    for list_type in imported_nodes_ngrams:
        for ng_str in imported_nodes_ngrams[list_type]:
            new_id = new_ngrams_ids[ng_str]
            # add to the abstract list
            result[list_type].items.add(new_id)

        # for main also add map elements
        if list_type == 'main':
            for ng_str in imported_nodes_ngrams['map']:
                new_id = new_ngrams_ids[ng_str]
                result['main'].items.add(new_id)

    # ======== Synonyms =========
    for (x_str, y_str) in imported_groupings:
        new_mainform_id = new_ngrams_ids[x_str]
        new_subform_id = new_ngrams_ids[y_str]

        # /!\ Translations use (subform => mainform) order
        result['groupings'].items[new_subform_id] = new_mainform_id
        n_group_relations += 1

    # ------------------------------------------------------------------
    print("IMPORT: read %i lines from the CSV" % n_read_lines)
    print("IMPORT: read %i terms (%i added and %i already existing)" %
          (n_total_ng, n_added_ng, n_total_ng - n_added_ng))
    print("IMPORT: read %i grouping relations" % n_group_relations)

    return result
def filterMatrix(matrix, mapList_id, groupList_id):
    """
    Restrict a cooccurrence matrix to the maplist's terms, folding in
    their grouped subforms via the grouplist translations.
    """
    map_terms = UnweightedList(mapList_id)
    groupings = Translations(groupList_id)
    # intersect the matrix with the (maplist * groups) selection
    return matrix & (map_terms * groupings)