Ejemplo n.º 1
0
    def _create_subcorpus(self, request):
        """
        req. arguments:
        subcname -- name of new subcorpus
        create -- bool, sets whether to create new subcorpus
        cql -- custom within condition
        """
        subcname = request.form['subcname']
        within_json = request.form.get('within_json')
        raw_cql = request.form.get('cql')
        corp_encoding = self._corp().get_conf('ENCODING')

        if raw_cql:
            tt_query = ()
            within_cql = raw_cql
            full_cql = 'aword,[] %s' % raw_cql
            imp_cql = (full_cql,)
        elif within_json:  # user entered a subcorpus query manually
            tt_query = ()
            within_cql = self._deserialize_custom_within(json.loads(within_json))
            full_cql = 'aword,[] %s' % within_cql
            imp_cql = (full_cql,)
        else:
            tt_query = TextTypeCollector(self._corp(), request).get_query()
            full_cql = ' within '.join(['<%s %s />' % item for item in tt_query])
            full_cql = 'aword,[] within %s' % full_cql
            full_cql = import_string(full_cql, from_encoding=corp_encoding)
            imp_cql = (full_cql,)
        basecorpname = self.args.corpname.split(':')[0]
        if not subcname:
            raise UserActionException(_('No subcorpus name specified!'))
        path = self.prepare_subc_path(basecorpname, subcname)

        if type(path) == unicode:
            path = path.encode('utf-8')

        if len(tt_query) == 1:
            result = corplib.create_subcorpus(path, self._corp(), tt_query[0][0], tt_query[0][1])
        elif len(tt_query) > 1 or within_cql:
            conc = conclib.get_conc(self._corp(), self._session_get('user', 'user'), q=imp_cql)
            conc.sync()
            struct = self._corp().get_struct(tt_query[0][0]) if len(tt_query) == 1 else None
            result = corplib.subcorpus_from_conc(path, conc, struct)
        else:
            raise UserActionException(_('Nothing specified!'))

        if result:
            if plugins.has_plugin('subc_restore'):
                try:
                    plugins.get('subc_restore').store_query(user_id=self._session_get('user', 'id'),
                                                            corpname=self.args.corpname,
                                                            subcname=subcname,
                                                            cql=full_cql.split('[]')[-1])
                except Exception as e:
                    logging.getLogger(__name__).warning('Failed to store subcorpus query: %s' % e)
                    self.add_system_message('warning',
                                            _('Subcorpus created but there was a problem saving a backup copy.'))
            return {}
        else:
            raise ConcError(_('Empty subcorpus!'))
Ejemplo n.º 2
0
    def _create_subcorpus(self, request):
        """
        req. arguments:
        subcname -- name of new subcorpus
        create -- bool, sets whether to create new subcorpus
        within_condition -- custom within condition; if non-empty then clickable form is omitted
        within_struct -- a structure the within_condition will be applied to
        """
        subcname = request.form['subcname']
        within_condition = request.form['within_condition']
        within_struct = request.form['within_struct']
        corp_encoding = self._corp().get_conf('ENCODING')

        if within_condition and within_struct:  # user entered a subcorpus query manually
            tt_query = [(export_string(within_struct, to_encoding=corp_encoding),
                        export_string(within_condition, to_encoding=corp_encoding))]
        else:
            tt_query = self._texttype_query(request)
            within_struct = import_string(tt_query[0][0], from_encoding=corp_encoding)
            within_condition = import_string(tt_query[0][1], from_encoding=corp_encoding)

        basecorpname = self.args.corpname.split(':')[0]
        if not subcname:
            raise ConcError(_('No subcorpus name specified!'))

        path = os.path.join(self.subcpath[-1], basecorpname)
        if not os.path.isdir(path):
            os.makedirs(path)
        path = os.path.join(path, subcname) + '.subc'
        if not tt_query:
            raise ConcError(_('Nothing specified!'))

        # Even if _texttype_query() parsed multiple structures into tt_query,
        # Manatee can accept directly only one (but with arbitrarily complex attribute
        # condition).
        # For this reason, we choose only the first struct+condition pair.
        # It is up to the user interface to deal with it.
        structname, subquery = tt_query[0]
        if type(path) == unicode:
            path = path.encode("utf-8")
        if corplib.create_subcorpus(path, self._corp(), structname, subquery):
            if plugins.has_plugin('subc_restore'):
                try:
                    plugins.get('subc_restore').store_query(user_id=self._session_get('user', 'id'),
                                                            corpname=self.args.corpname,
                                                            subcname=subcname,
                                                            structname=within_struct,
                                                            condition=within_condition)
                except Exception as e:
                    logging.getLogger(__name__).warning('Failed to store subcorpus query: %s' % e)
                    self.add_system_message('warning',
                                            _('Subcorpus created but there was a problem saving a backup copy.'))
            return {}
        else:
            raise ConcError(_('Empty subcorpus!'))
Ejemplo n.º 3
0
    def _create_subcorpus(self, request):
        """
        req. arguments:
        subcname -- name of new subcorpus
        create -- bool, sets whether to create new subcorpus
        cql -- custom within condition
        """
        subcname = request.form['subcname']
        within_json = request.form.get('within_json')
        raw_cql = request.form.get('cql')
        aligned_corpora = request.form.getlist('aligned_corpora')
        publish = bool(int(request.form.get('publish')))
        corpus_info = self.get_corpus_info(self.args.corpname)
        description = request.form.get('description')

        if not subcname:
            raise UserActionException(translate('No subcorpus name specified!'))

        if publish and not description:
            raise UserActionException(translate('No description specified'))

        if raw_cql:
            aligned_corpora = []
            tt_query = ()
            within_cql = raw_cql
            full_cql = 'aword,[] %s' % raw_cql
            imp_cql = (full_cql,)
        elif within_json:  # user entered a subcorpus query manually
            aligned_corpora = []
            tt_query = ()
            within_cql = self._deserialize_custom_within(json.loads(within_json))
            full_cql = 'aword,[] %s' % within_cql
            imp_cql = (full_cql,)
        elif len(aligned_corpora) > 0 and plugins.runtime.LIVE_ATTRIBUTES.exists:
            if corpus_info.metadata.label_attr and corpus_info.metadata.id_attr:
                within_cql = None
                attrs = json.loads(request.form.get('attrs', '{}'))
                sel_match = plugins.runtime.LIVE_ATTRIBUTES.instance.get_attr_values(
                    self._plugin_api, corpus=self.corp,
                    attr_map=attrs,
                    aligned_corpora=aligned_corpora,
                    limit_lists=False)
                values = sel_match['attr_values'][corpus_info.metadata.label_attr]
                args = argmapping.Args()
                setattr(args, 'sca_{0}'.format(
                    corpus_info.metadata.id_attr), [v[1] for v in values])
                tt_query = TextTypeCollector(self.corp, args).get_query()
                tmp = ['<%s %s />' % item for item in tt_query]
                full_cql = ' within '.join(tmp)
                full_cql = 'aword,[] within %s' % full_cql
                full_cql = import_string(full_cql, from_encoding=self.corp_encoding)
                imp_cql = (full_cql,)
            else:
                raise FunctionNotSupported(
                    'Corpus must have a bibliography item defined to support this function')
        else:
            within_cql = None
            tt_query = TextTypeCollector(self.corp, request).get_query()
            tmp = ['<%s %s />' % item for item in tt_query]
            full_cql = ' within '.join(tmp)
            full_cql = 'aword,[] within %s' % full_cql
            full_cql = import_string(full_cql, from_encoding=self.corp_encoding)
            imp_cql = (full_cql,)

        basecorpname = self.args.corpname.split(':')[0]
        path = self.prepare_subc_path(basecorpname, subcname, publish=False)
        publish_path = self.prepare_subc_path(
            basecorpname, subcname, publish=True) if publish else None

        if type(path) == unicode:
            path = path.encode('utf-8')

        if len(tt_query) == 1 and len(aligned_corpora) == 0:
            result = corplib.create_subcorpus(path, self.corp, tt_query[0][0], tt_query[0][1])
            if result and publish_path:
                corplib.mk_publish_links(path, publish_path, self.session_get(
                    'user', 'fullname'), description)
        elif len(tt_query) > 1 or within_cql or len(aligned_corpora) > 0:
            backend = settings.get('calc_backend', 'type')
            if backend in ('celery', 'konserver'):
                import bgcalc
                app = bgcalc.calc_backend_client(settings)
                res = app.send_task('worker.create_subcorpus',
                                    (self.session_get('user', 'id'), self.args.corpname, path, publish_path,
                                     tt_query, imp_cql, self.session_get('user', 'fullname'), description),
                                    time_limit=TASK_TIME_LIMIT)
                self._store_async_task(AsyncTaskStatus(status=res.status, ident=res.id,
                                                       category=AsyncTaskStatus.CATEGORY_SUBCORPUS,
                                                       label=u'%s:%s' % (basecorpname, subcname),
                                                       args=dict(subcname=subcname, corpname=basecorpname)))
                result = {}
            elif backend == 'multiprocessing':
                from bgcalc import subc_calc
                import functools
                import multiprocessing
                worker = subc_calc.CreateSubcorpusTask(user_id=self.session_get('user', 'id'),
                                                       corpus_id=self.args.corpname)
                multiprocessing.Process(target=functools.partial(
                    worker.run, tt_query, imp_cql, path, publish_path, description)).start()
                result = {}
        else:
            raise UserActionException(translate('Nothing specified!'))
        if result is not False:
            with plugins.runtime.SUBC_RESTORE as sr:
                try:
                    sr.store_query(user_id=self.session_get('user', 'id'),
                                   corpname=self.args.corpname,
                                   subcname=subcname,
                                   cql=full_cql.strip().split('[]', 1)[-1])
                except Exception as e:
                    logging.getLogger(__name__).warning('Failed to store subcorpus query: %s' % e)
                    self.add_system_message('warning',
                                            translate('Subcorpus created but there was a problem saving a backup copy.'))
            unfinished_corpora = filter(lambda at: not at.is_finished(),
                                        self.get_async_tasks(category=AsyncTaskStatus.CATEGORY_SUBCORPUS))
            return dict(processed_subc=[uc.to_dict() for uc in unfinished_corpora])
        else:
            raise SubcorpusError(translate('Empty subcorpus!'))
Ejemplo n.º 4
0
    def _create_subcorpus(self, request: Request) -> Dict[str, Any]:
        """
        req. arguments:
        subcname -- name of new subcorpus
        create -- bool, sets whether to create new subcorpus
        cql -- custom within condition
        """
        within_cql = None
        form_type = request.json['form_type']

        if form_type == 'tt-sel':
            data = CreateSubcorpusArgs(**request.json)
            corpus_info = self.get_corpus_info(data.corpname)
            if (plugins.runtime.LIVE_ATTRIBUTES.exists and
                    plugins.runtime.LIVE_ATTRIBUTES.instance.is_enabled_for(
                        self._plugin_ctx, [data.corpname]
                    )  # TODO here we skip aligned corpora which is debatable
                    and len(data.aligned_corpora) > 0):
                if corpus_info.metadata.label_attr and corpus_info.metadata.id_attr:
                    within_cql = None
                    sel_match = plugins.runtime.LIVE_ATTRIBUTES.instance.get_attr_values(
                        self._plugin_ctx,
                        corpus=self.corp,
                        attr_map=data.text_types,
                        aligned_corpora=data.aligned_corpora,
                        limit_lists=False)
                    sel_attrs = {}
                    for k, vals in sel_match.attr_values.items():
                        if k == corpus_info.metadata.label_attr:
                            k = corpus_info.metadata.id_attr
                        if '.' in k:
                            sel_attrs[k] = [v[1] for v in vals]
                    tt_query = TextTypeCollector(self.corp,
                                                 sel_attrs).get_query()
                    tmp = ['<%s %s />' % item for item in tt_query]
                    full_cql = ' within '.join(tmp)
                    full_cql = 'aword,[] within %s' % full_cql
                    imp_cql = (full_cql, )
                else:
                    raise FunctionNotSupported(
                        'Corpus must have a bibliography item defined to support this function'
                    )
            else:
                tt_query = TextTypeCollector(self.corp,
                                             data.text_types).get_query()
                tmp = ['<%s %s />' % item for item in tt_query]
                full_cql = ' within '.join(tmp)
                full_cql = 'aword,[] within %s' % full_cql
                imp_cql = (full_cql, )
        elif form_type == 'within':
            data = CreateSubcorpusWithinArgs(**request.json)
            tt_query = ()
            within_cql = self._deserialize_custom_within(data.within)
            full_cql = 'aword,[] %s' % within_cql
            imp_cql = (full_cql, )
        elif form_type == 'cql':
            data = CreateSubcorpusRawCQLArgs(**request.json)
            tt_query = ()
            within_cql = data.cql
            full_cql = f'aword,[] {data.cql}'
            imp_cql = (full_cql, )
        else:
            raise UserActionException(
                f'Invalid form type provided - "{form_type}"')

        if not data.subcname:
            raise UserActionException(
                translate('No subcorpus name specified!'))

        if data.publish and not data.description:
            raise UserActionException(translate('No description specified'))

        path = self.prepare_subc_path(self.args.corpname,
                                      data.subcname,
                                      publish=False)
        publish_path = self.prepare_subc_path(
            self.args.corpname, data.subcname,
            publish=True) if data.publish else None

        if len(tt_query) == 1 and not data.has_aligned_corpora():
            result = corplib.create_subcorpus(path, self.corp, tt_query[0][0],
                                              tt_query[0][1])
            if result and publish_path:
                corplib.mk_publish_links(path, publish_path,
                                         self.session_get('user', 'fullname'),
                                         data.description)
        elif len(tt_query) > 1 or within_cql or data.has_aligned_corpora():
            worker = bgcalc.calc_backend_client(settings)
            res = worker.send_task(
                'create_subcorpus',
                object.__class__,
                (self.session_get('user', 'id'), self.args.corpname, path,
                 publish_path, tt_query, imp_cql,
                 self.session_get('user', 'fullname'), data.description),
                time_limit=TASK_TIME_LIMIT)
            self._store_async_task(
                AsyncTaskStatus(status=res.status,
                                ident=res.id,
                                category=AsyncTaskStatus.CATEGORY_SUBCORPUS,
                                label=f'{self.args.corpname}/{data.subcname}',
                                args=dict(subcname=data.subcname,
                                          corpname=self.args.corpname)))
            result = {}
        else:
            raise UserActionException(translate('Nothing specified!'))
        if result is not False:
            with plugins.runtime.SUBC_RESTORE as sr:
                try:
                    sr.store_query(user_id=self.session_get('user', 'id'),
                                   corpname=self.args.corpname,
                                   subcname=data.subcname,
                                   cql=full_cql.strip().split('[]', 1)[-1])
                except Exception as e:
                    logging.getLogger(__name__).warning(
                        'Failed to store subcorpus query: %s' % e)
                    self.add_system_message(
                        'warning',
                        translate(
                            'Subcorpus created but there was a problem saving a backup copy.'
                        ))
            unfinished_corpora = [
                at for at in self.get_async_tasks(
                    category=AsyncTaskStatus.CATEGORY_SUBCORPUS)
                if not at.is_finished()
            ]
            return dict(
                processed_subc=[uc.to_dict() for uc in unfinished_corpora])
        else:
            raise SubcorpusError(translate('Empty subcorpus!'))
Ejemplo n.º 5
0
    def _create_subcorpus(self, request):
        """
        req. arguments:
        subcname -- name of new subcorpus
        create -- bool, sets whether to create new subcorpus
        cql -- custom within condition
        """
        subcname = request.form['subcname']
        within_json = request.form.get('within_json')
        raw_cql = request.form.get('cql')
        aligned_corpora = request.form.getlist('aligned_corpora')
        publish = bool(int(request.form.get('publish')))
        corpus_info = self.get_corpus_info(self.args.corpname)
        description = request.form.get('description')

        if raw_cql:
            aligned_corpora = []
            tt_query = ()
            within_cql = raw_cql
            full_cql = 'aword,[] %s' % raw_cql
            imp_cql = (full_cql, )
        elif within_json:  # user entered a subcorpus query manually
            aligned_corpora = []
            tt_query = ()
            within_cql = self._deserialize_custom_within(
                json.loads(within_json))
            full_cql = 'aword,[] %s' % within_cql
            imp_cql = (full_cql, )
        elif len(aligned_corpora
                 ) > 0 and plugins.runtime.LIVE_ATTRIBUTES.exists:
            if corpus_info.metadata.label_attr and corpus_info.metadata.id_attr:
                within_cql = None
                attrs = json.loads(request.form.get('attrs', '{}'))
                sel_match = plugins.runtime.LIVE_ATTRIBUTES.instance.get_attr_values(
                    self._plugin_api,
                    corpus=self.corp,
                    attr_map=attrs,
                    aligned_corpora=aligned_corpora,
                    limit_lists=False)
                values = sel_match['attr_values'][
                    corpus_info.metadata.label_attr]
                args = argmapping.Args()
                setattr(args, 'sca_{0}'.format(corpus_info.metadata.id_attr),
                        [v[1] for v in values])
                tt_query = TextTypeCollector(self.corp, args).get_query()
                tmp = ['<%s %s />' % item for item in tt_query]
                full_cql = ' within '.join(tmp)
                full_cql = 'aword,[] within %s' % full_cql
                full_cql = import_string(full_cql,
                                         from_encoding=self.corp_encoding)
                imp_cql = (full_cql, )
            else:
                raise FunctionNotSupported(
                    'Corpus must have a bibliography item defined to support this function'
                )
        else:
            within_cql = None
            tt_query = TextTypeCollector(self.corp, request).get_query()
            tmp = ['<%s %s />' % item for item in tt_query]
            full_cql = ' within '.join(tmp)
            full_cql = 'aword,[] within %s' % full_cql
            full_cql = import_string(full_cql,
                                     from_encoding=self.corp_encoding)
            imp_cql = (full_cql, )

        basecorpname = self.args.corpname.split(':')[0]
        if not subcname:
            raise UserActionException(_('No subcorpus name specified!'))
        path = self.prepare_subc_path(basecorpname, subcname, publish=False)
        publish_path = self.prepare_subc_path(
            basecorpname, subcname, publish=True) if publish else None

        if type(path) == unicode:
            path = path.encode('utf-8')

        if len(tt_query) == 1 and len(aligned_corpora) == 0:
            result = corplib.create_subcorpus(path, self.corp, tt_query[0][0],
                                              tt_query[0][1])
            if result and publish_path:
                corplib.mk_publish_links(path, publish_path, description)
        elif len(tt_query) > 1 or within_cql or len(aligned_corpora) > 0:
            backend, conf = settings.get_full('global', 'calc_backend')
            if backend == 'celery':
                import task
                app = task.get_celery_app(conf['conf'])
                res = app.send_task(
                    'worker.create_subcorpus',
                    (self.session_get('user', 'id'), self.args.corpname, path,
                     publish_path, tt_query, imp_cql, description),
                    time_limit=TASK_TIME_LIMIT)
                self._store_async_task(
                    AsyncTaskStatus(
                        status=res.status,
                        ident=res.id,
                        category=AsyncTaskStatus.CATEGORY_SUBCORPUS,
                        label=u'%s:%s' % (basecorpname, subcname),
                        args=dict(subcname=subcname, corpname=basecorpname)))
                result = {}
            elif backend == 'multiprocessing':
                from bgcalc import subc_calc
                import functools
                import multiprocessing
                worker = subc_calc.CreateSubcorpusTask(
                    user_id=self.session_get('user', 'id'),
                    corpus_id=self.args.corpname)
                multiprocessing.Process(target=functools.partial(
                    worker.run, tt_query, imp_cql, path, publish_path,
                    description)).start()
                result = {}
        else:
            raise UserActionException(_('Nothing specified!'))
        if result is not False:
            with plugins.runtime.SUBC_RESTORE as sr:
                try:
                    sr.store_query(user_id=self.session_get('user', 'id'),
                                   corpname=self.args.corpname,
                                   subcname=subcname,
                                   cql=full_cql.strip().split('[]', 1)[-1])
                except Exception as e:
                    logging.getLogger(__name__).warning(
                        'Failed to store subcorpus query: %s' % e)
                    self.add_system_message(
                        'warning',
                        _('Subcorpus created but there was a problem saving a backup copy.'
                          ))
            unfinished_corpora = filter(
                lambda at: not at.is_finished(),
                self.get_async_tasks(
                    category=AsyncTaskStatus.CATEGORY_SUBCORPUS))
            return dict(
                unfinished_subc=[uc.to_dict() for uc in unfinished_corpora])
        else:
            raise SubcorpusError(_('Empty subcorpus!'))
Ejemplo n.º 6
0
 def _create_subcorpus(self, subc_path, corpus, corpus_info, atom_ids):
     structattr = corpus_info.metadata.id_attr.split('.')
     query = '(%s)' % '|'.join(map(lambda x: '%s="%s"' % (structattr[1], x), atom_ids))
     corplib.create_subcorpus(subc_path, corpus, structattr[0], query)