Example #1
0
def subcmixer_create_subcorpus(ctrl, request):
    """
    Create a subcorpus in a low-level way.

    The action writes a list of 64-bit signed little-endian integers
    (a begin/end position pair per selected structure) to a file
    (just like Manatee does). The current version does not optimize
    the write by merging adjacent position intervals (Manatee does this).

    form arguments (read from request.form):
    subcname -- name of the new subcorpus (required)
    corpname -- name of the corpus the subcorpus is created for
    ids -- comma-separated integer indices of the selected structures
    idAttr -- an identifier in the 'structure.attribute' form; only
              the part before the first dot is used
    publish -- optional integer flag; a missing or empty value is
               treated as "do not publish"
    description -- subcorpus description (read only when publishing)

    returns:
    dict(status=True) on success; an empty dict (along with an 'error'
    system message) if the subcorpus name is empty
    """
    if not request.form['subcname']:
        ctrl.add_system_message('error', 'Missing subcorpus name')
        return {}

    # a missing/empty 'publish' field would otherwise fail on int(None)
    publish = bool(int(request.form.get('publish') or 0))
    corpname = request.form['corpname']
    subcname = request.form['subcname']

    # the data file itself is always written to the non-published location
    subc_path = ctrl.prepare_subc_path(corpname, subcname, publish=False)
    struct_indices = sorted(int(x) for x in request.form['ids'].split(','))
    struct_name = request.form['idAttr'].split('.')[0]
    attr = ctrl.corp.get_struct(struct_name)

    # '<qq' produces exactly the same bytes as two consecutive '<q' packs
    pack_range = struct.Struct('<qq').pack
    with open(subc_path, 'wb') as fw:
        for idx in struct_indices:
            fw.write(pack_range(attr.beg(idx), attr.end(idx)))

    if publish:
        pub_path = ctrl.prepare_subc_path(corpname, subcname, publish=True)
        if pub_path:
            corplib.mk_publish_links(subc_path, pub_path,
                                     ctrl.session_get('user', 'fullname'),
                                     request.form['description'])

    return dict(status=True)
Example #2
0
 def run(self, tt_query, cql, path, publish_path):
     """
     Create a subcorpus file out of a concordance matching the query.

     arguments:
     tt_query -- not used by this method
     cql -- a query passed to conclib.get_conc (as its 'q' argument)
     path -- a path of the subcorpus file to be created
     publish_path -- if non-empty then publish links are created too

     returns:
     the value returned by corplib.subcorpus_from_conc
     (True in case of success)

     In case of an empty subcorpus (or if the subcorpus cannot
     be created from the concordance), EmptySubcorpusException is thrown
     """
     concordance = conclib.get_conc(self._corp, self._user_id, q=cql, asnc=0)
     if concordance.size() == 0:
         raise EmptySubcorpusException('Empty subcorpus')

     created = corplib.subcorpus_from_conc(path, concordance)
     if created is False:
         raise EmptySubcorpusException(
             'Failed to create the subcorpus from a concordance')

     file_exists = os.path.isfile(path)
     if not file_exists:
         # this should not happen but it looks like it did;
         # we log the problem and wait for a while before going on
         log = logging.getLogger(__name__)
         log.warning(
             'Sync. called conc. file not created (path: {})'.format(path))
         time.sleep(5)

     # we must set write perms for group as this is created by Celery and we won't be
     # able to create hardlinks otherwise
     os.chmod(path, 0o664)

     if publish_path:
         corplib.mk_publish_links(
             path, publish_path, self._author, self._description)
     return created
Example #3
0
 def publish_subcorpus(self, request):
     """
     Publish an already existing subcorpus.

     form arguments (read from request.form):
     subcname -- name of the subcorpus
     corpname -- name of the corpus the subcorpus belongs to
     description -- a description passed to corplib.mk_publish_links

     returns:
     a dict with the 'code' key containing the base name of the
     public subcorpus file without its extension

     In case the source subcorpus file does not exist,
     UserActionException is thrown.
     """
     form = request.form
     subcname = form['subcname']
     corpname = form['corpname']
     description = form['description']

     source_path = os.path.join(self.subcpath[0], corpname, subcname + '.subc')
     target_path = self.prepare_subc_path(corpname, subcname, True)

     if not os.path.isfile(source_path):
         raise UserActionException('Subcorpus {0} not found'.format(subcname))

     author = self.session_get('user', 'fullname')
     corplib.mk_publish_links(source_path, target_path, author, description)
     public_name = os.path.basename(target_path)
     return dict(code=os.path.splitext(public_name)[0])
Example #4
0
 def publish_subcorpus(self, request):
     """
     Make an existing subcorpus public.

     The method reads 'subcname', 'corpname' and 'description' from
     request.form, and if the user's subcorpus file exists, it creates
     publish links to it via corplib.mk_publish_links.

     returns:
     a dict where 'code' is the file name (without the extension)
     of the public subcorpus path

     If the subcorpus file is not found, UserActionException is thrown.
     """
     subcname = request.form['subcname']
     corpname = request.form['corpname']
     description = request.form['description']

     user_subc_file = os.path.join(
         self.subcpath[0], corpname, subcname + '.subc')
     published_file = self.prepare_subc_path(corpname, subcname, True)

     if not os.path.isfile(user_subc_file):
         raise UserActionException('Subcorpus {0} not found'.format(subcname))

     corplib.mk_publish_links(
         user_subc_file,
         published_file,
         self.session_get('user', 'fullname'),
         description)
     code, _ext = os.path.splitext(os.path.basename(published_file))
     return dict(code=code)
Example #5
0
 def run(self, tt_query, cql, path, publish_path):
     """
     Create a subcorpus file out of a concordance matching the query.

     arguments:
     tt_query -- not used by this method
     cql -- a query passed to conclib.get_conc (as its 'q' argument)
     path -- a path of the subcorpus file to be created
     publish_path -- if non-empty then publish links (including
                     the author and the description) are created too

     returns:
     True in case of success
     In case of an empty subcorpus, EmptySubcorpusException is thrown
     """
     # 'async' is a reserved keyword since Python 3.7 so writing 'async=0'
     # directly is a syntax error there; dict unpacking passes exactly
     # the same keyword argument and is valid in both Python 2 and 3
     conc = conclib.get_conc(self._corp, self._user_id, q=cql, **{'async': 0})
     ans = corplib.subcorpus_from_conc(path, conc)
     if ans is False:
         raise EmptySubcorpusException('Empty subcorpus')
     if publish_path:
         corplib.mk_publish_links(path, publish_path, self._author, self._description)
     return ans
Example #6
0
 def run(self, tt_query, cql, path, publish_path):
     """
     Create a subcorpus file out of a concordance matching the query.

     arguments:
     tt_query -- not used by this method
     cql -- a query passed to conclib.get_conc (as its 'q' argument)
     path -- a path of the subcorpus file to be created
     publish_path -- if non-empty then publish links are created too

     returns:
     True in case of success
     In case of an empty subcorpus, EmptySubcorpusException is thrown
     """
     # 'async' is a reserved keyword since Python 3.7 so writing 'async=0'
     # directly is a syntax error there; dict unpacking passes exactly
     # the same keyword argument and is valid in both Python 2 and 3
     conc = conclib.get_conc(self._corp, self._user_id, q=cql, **{'async': 0})
     ans = corplib.subcorpus_from_conc(path, conc)
     if ans is False:
         raise EmptySubcorpusException('Empty subcorpus')
     if publish_path:
         corplib.mk_publish_links(path, publish_path)
     return ans
Example #7
0
    def _create_subcorpus(self, request):
        """
        Create a new subcorpus according to the submitted form.

        req. arguments:
        subcname -- name of new subcorpus
        cql -- custom within condition (a raw CQL; if non-empty,
               it takes precedence over the other query sources)
        within_json -- JSON-encoded custom within condition
        aligned_corpora -- a list of aligned corpora
        attrs -- JSON-encoded attribute selection (used along with
                 aligned corpora and the live attributes plug-in)
        publish -- an integer flag; a missing or empty value is
                   treated as "do not publish"
        description -- subcorpus description (required when publishing)

        A query with a single text type condition and no aligned corpora
        is processed directly; other queries are passed to the configured
        calculation backend and an empty result is not detected here.

        returns:
        a dict with the 'processed_subc' key containing unfinished
        subcorpus-related async tasks

        raises:
        UserActionException -- missing name or description, nothing specified
        FunctionNotSupported -- aligned corpora used with a corpus without
                                both label_attr and id_attr metadata
        SubcorpusError -- the directly created subcorpus is reported as empty
        RuntimeError -- an unsupported calc. backend type is configured
        """
        subcname = request.form['subcname']
        within_json = request.form.get('within_json')
        raw_cql = request.form.get('cql')
        aligned_corpora = request.form.getlist('aligned_corpora')
        # a missing/empty 'publish' field would otherwise fail on int(None)
        publish = bool(int(request.form.get('publish') or 0))
        corpus_info = self.get_corpus_info(self.args.corpname)
        description = request.form.get('description')

        if not subcname:
            raise UserActionException(translate('No subcorpus name specified!'))

        if publish and not description:
            raise UserActionException(translate('No description specified'))

        if raw_cql:
            aligned_corpora = []
            tt_query = ()
            within_cql = raw_cql
            full_cql = 'aword,[] %s' % raw_cql
            imp_cql = (full_cql,)
        elif within_json:  # user entered a subcorpus query manually
            aligned_corpora = []
            tt_query = ()
            within_cql = self._deserialize_custom_within(json.loads(within_json))
            full_cql = 'aword,[] %s' % within_cql
            imp_cql = (full_cql,)
        elif len(aligned_corpora) > 0 and plugins.runtime.LIVE_ATTRIBUTES.exists:
            if not (corpus_info.metadata.label_attr and corpus_info.metadata.id_attr):
                raise FunctionNotSupported(
                    'Corpus must have a bibliography item defined to support this function')
            within_cql = None
            attrs = json.loads(request.form.get('attrs', '{}'))
            sel_match = plugins.runtime.LIVE_ATTRIBUTES.instance.get_attr_values(
                self._plugin_api, corpus=self.corp,
                attr_map=attrs,
                aligned_corpora=aligned_corpora,
                limit_lists=False)
            values = sel_match['attr_values'][corpus_info.metadata.label_attr]
            # the matching items are passed to the collector via a 'sca_'-prefixed
            # attribute named after the corpus' id_attr
            args = argmapping.Args()
            setattr(args, 'sca_{0}'.format(
                corpus_info.metadata.id_attr), [v[1] for v in values])
            tt_query = TextTypeCollector(self.corp, args).get_query()
            tmp = ['<%s %s />' % item for item in tt_query]
            full_cql = ' within '.join(tmp)
            full_cql = 'aword,[] within %s' % full_cql
            full_cql = import_string(full_cql, from_encoding=self.corp_encoding)
            imp_cql = (full_cql,)
        else:
            within_cql = None
            tt_query = TextTypeCollector(self.corp, request).get_query()
            tmp = ['<%s %s />' % item for item in tt_query]
            full_cql = ' within '.join(tmp)
            full_cql = 'aword,[] within %s' % full_cql
            full_cql = import_string(full_cql, from_encoding=self.corp_encoding)
            imp_cql = (full_cql,)

        basecorpname = self.args.corpname.split(':')[0]
        path = self.prepare_subc_path(basecorpname, subcname, publish=False)
        publish_path = self.prepare_subc_path(
            basecorpname, subcname, publish=True) if publish else None

        if isinstance(path, unicode):
            path = path.encode('utf-8')

        if len(tt_query) == 1 and len(aligned_corpora) == 0:
            result = corplib.create_subcorpus(path, self.corp, tt_query[0][0], tt_query[0][1])
            if result and publish_path:
                corplib.mk_publish_links(path, publish_path, self.session_get(
                    'user', 'fullname'), description)
        elif len(tt_query) > 1 or within_cql or len(aligned_corpora) > 0:
            backend = settings.get('calc_backend', 'type')
            if backend in ('celery', 'konserver'):
                import bgcalc
                app = bgcalc.calc_backend_client(settings)
                res = app.send_task('worker.create_subcorpus',
                                    (self.session_get('user', 'id'), self.args.corpname, path, publish_path,
                                     tt_query, imp_cql, self.session_get('user', 'fullname'), description),
                                    time_limit=TASK_TIME_LIMIT)
                self._store_async_task(AsyncTaskStatus(status=res.status, ident=res.id,
                                                       category=AsyncTaskStatus.CATEGORY_SUBCORPUS,
                                                       label=u'%s:%s' % (basecorpname, subcname),
                                                       args=dict(subcname=subcname, corpname=basecorpname)))
                result = {}
            elif backend == 'multiprocessing':
                from bgcalc import subc_calc
                import functools
                import multiprocessing
                worker = subc_calc.CreateSubcorpusTask(user_id=self.session_get('user', 'id'),
                                                       corpus_id=self.args.corpname)
                multiprocessing.Process(target=functools.partial(
                    worker.run, tt_query, imp_cql, path, publish_path, description)).start()
                result = {}
            else:
                # without this branch 'result' stays unbound and the method
                # ends with a confusing UnboundLocalError
                raise RuntimeError('Unsupported calc_backend type: %s' % (backend,))
        else:
            raise UserActionException(translate('Nothing specified!'))

        if result is False:
            raise SubcorpusError(translate('Empty subcorpus!'))

        with plugins.runtime.SUBC_RESTORE as sr:
            try:
                # only the part following the initial '[]' is stored
                sr.store_query(user_id=self.session_get('user', 'id'),
                               corpname=self.args.corpname,
                               subcname=subcname,
                               cql=full_cql.strip().split('[]', 1)[-1])
            except Exception as e:
                # a failed backup must not break the (already started) subcorpus creation
                logging.getLogger(__name__).warning('Failed to store subcorpus query: %s' % e)
                self.add_system_message('warning',
                                        translate('Subcorpus created but there was a problem saving a backup copy.'))
        unfinished_corpora = [at for at in self.get_async_tasks(category=AsyncTaskStatus.CATEGORY_SUBCORPUS)
                              if not at.is_finished()]
        return dict(processed_subc=[uc.to_dict() for uc in unfinished_corpora])
Example #8
0
    def _create_subcorpus(self, request: Request) -> Dict[str, Any]:
        """
        Create a new subcorpus based on a JSON-encoded request.

        req. arguments (read from request.json):
        form_type -- one of 'tt-sel' (text types selection), 'within'
                     (a structured within condition), 'cql' (a raw
                     within condition); any other value is rejected
        subcname -- name of new subcorpus
        publish, description -- a description is required when publishing

        A query with a single text type condition and no aligned corpora
        is processed directly; other queries are sent to the calc. backend
        as the 'create_subcorpus' task.

        returns:
        a dict with the 'processed_subc' key containing unfinished
        subcorpus-related async tasks

        raises:
        UserActionException -- invalid form type, missing name or
                               description, nothing specified
        FunctionNotSupported -- live attributes used with a corpus without
                                both label_attr and id_attr metadata
        SubcorpusError -- the directly created subcorpus is reported as empty
        """
        within_cql = None
        form_type = request.json['form_type']

        if form_type == 'tt-sel':
            data = CreateSubcorpusArgs(**request.json)
            corpus_info = self.get_corpus_info(data.corpname)
            live_attrs = plugins.runtime.LIVE_ATTRIBUTES
            # TODO here we skip aligned corpora which is debatable
            use_live_attrs = (
                live_attrs.exists
                and live_attrs.instance.is_enabled_for(
                    self._plugin_ctx, [data.corpname])
                and len(data.aligned_corpora) > 0)
            if use_live_attrs:
                metadata = corpus_info.metadata
                if not (metadata.label_attr and metadata.id_attr):
                    raise FunctionNotSupported(
                        'Corpus must have a bibliography item defined to support this function'
                    )
                sel_match = live_attrs.instance.get_attr_values(
                    self._plugin_ctx,
                    corpus=self.corp,
                    attr_map=data.text_types,
                    aligned_corpora=data.aligned_corpora,
                    limit_lists=False)
                collector_src = {}
                for attr_name, attr_vals in sel_match.attr_values.items():
                    # values of the label attribute are stored under the ID attribute
                    if attr_name == metadata.label_attr:
                        attr_name = metadata.id_attr
                    # keys without a dot are skipped
                    if '.' in attr_name:
                        collector_src[attr_name] = [v[1] for v in attr_vals]
            else:
                collector_src = data.text_types
            tt_query = TextTypeCollector(self.corp, collector_src).get_query()
            within_parts = ['<%s %s />' % item for item in tt_query]
            full_cql = 'aword,[] within %s' % ' within '.join(within_parts)
        elif form_type == 'within':
            data = CreateSubcorpusWithinArgs(**request.json)
            tt_query = ()
            within_cql = self._deserialize_custom_within(data.within)
            full_cql = 'aword,[] %s' % within_cql
        elif form_type == 'cql':
            data = CreateSubcorpusRawCQLArgs(**request.json)
            tt_query = ()
            within_cql = data.cql
            full_cql = f'aword,[] {data.cql}'
        else:
            raise UserActionException(
                f'Invalid form type provided - "{form_type}"')
        imp_cql = (full_cql, )

        if not data.subcname:
            raise UserActionException(
                translate('No subcorpus name specified!'))
        if data.publish and not data.description:
            raise UserActionException(translate('No description specified'))

        path = self.prepare_subc_path(
            self.args.corpname, data.subcname, publish=False)
        if data.publish:
            publish_path = self.prepare_subc_path(
                self.args.corpname, data.subcname, publish=True)
        else:
            publish_path = None

        if len(tt_query) == 1 and not data.has_aligned_corpora():
            attr_ident, attr_cond = tt_query[0][0], tt_query[0][1]
            result = corplib.create_subcorpus(
                path, self.corp, attr_ident, attr_cond)
            if result and publish_path:
                corplib.mk_publish_links(
                    path, publish_path,
                    self.session_get('user', 'fullname'), data.description)
        elif len(tt_query) > 1 or within_cql or data.has_aligned_corpora():
            worker = bgcalc.calc_backend_client(settings)
            task_args = (
                self.session_get('user', 'id'), self.args.corpname, path,
                publish_path, tt_query, imp_cql,
                self.session_get('user', 'fullname'), data.description)
            task = worker.send_task(
                'create_subcorpus', object.__class__, task_args,
                time_limit=TASK_TIME_LIMIT)
            self._store_async_task(
                AsyncTaskStatus(
                    status=task.status,
                    ident=task.id,
                    category=AsyncTaskStatus.CATEGORY_SUBCORPUS,
                    label=f'{self.args.corpname}/{data.subcname}',
                    args=dict(subcname=data.subcname,
                              corpname=self.args.corpname)))
            result = {}
        else:
            raise UserActionException(translate('Nothing specified!'))

        if result is False:
            raise SubcorpusError(translate('Empty subcorpus!'))

        with plugins.runtime.SUBC_RESTORE as subc_restore:
            try:
                # only the part following the initial '[]' is stored
                subc_restore.store_query(
                    user_id=self.session_get('user', 'id'),
                    corpname=self.args.corpname,
                    subcname=data.subcname,
                    cql=full_cql.strip().split('[]', 1)[-1])
            except Exception as e:
                logging.getLogger(__name__).warning(
                    'Failed to store subcorpus query: %s' % e)
                self.add_system_message(
                    'warning',
                    translate(
                        'Subcorpus created but there was a problem saving a backup copy.'
                    ))
        pending_tasks = [
            task_status for task_status in self.get_async_tasks(
                category=AsyncTaskStatus.CATEGORY_SUBCORPUS)
            if not task_status.is_finished()
        ]
        return dict(
            processed_subc=[ts.to_dict() for ts in pending_tasks])
Example #9
0
    def _create_subcorpus(self, request):
        """
        Create a new subcorpus according to the submitted form.

        req. arguments:
        subcname -- name of new subcorpus
        cql -- custom within condition (a raw CQL; if non-empty,
               it takes precedence over the other query sources)
        within_json -- JSON-encoded custom within condition
        aligned_corpora -- a list of aligned corpora
        attrs -- JSON-encoded attribute selection (used along with
                 aligned corpora and the live attributes plug-in)
        publish -- an integer flag; a missing or empty value is
                   treated as "do not publish"
        description -- subcorpus description

        A query with a single text type condition and no aligned corpora
        is processed directly; other queries are passed to the configured
        calculation backend and an empty result is not detected here.

        returns:
        a dict with the 'unfinished_subc' key containing unfinished
        subcorpus-related async tasks

        raises:
        UserActionException -- missing subcorpus name, nothing specified
        FunctionNotSupported -- aligned corpora used with a corpus without
                                both label_attr and id_attr metadata
        SubcorpusError -- the directly created subcorpus is reported as empty
        RuntimeError -- an unsupported calc. backend type is configured
        """
        subcname = request.form['subcname']
        within_json = request.form.get('within_json')
        raw_cql = request.form.get('cql')
        aligned_corpora = request.form.getlist('aligned_corpora')
        # a missing/empty 'publish' field would otherwise fail on int(None)
        publish = bool(int(request.form.get('publish') or 0))
        corpus_info = self.get_corpus_info(self.args.corpname)
        description = request.form.get('description')

        if raw_cql:
            aligned_corpora = []
            tt_query = ()
            within_cql = raw_cql
            full_cql = 'aword,[] %s' % raw_cql
            imp_cql = (full_cql, )
        elif within_json:  # user entered a subcorpus query manually
            aligned_corpora = []
            tt_query = ()
            within_cql = self._deserialize_custom_within(
                json.loads(within_json))
            full_cql = 'aword,[] %s' % within_cql
            imp_cql = (full_cql, )
        elif len(aligned_corpora) > 0 and plugins.runtime.LIVE_ATTRIBUTES.exists:
            if not (corpus_info.metadata.label_attr and corpus_info.metadata.id_attr):
                raise FunctionNotSupported(
                    'Corpus must have a bibliography item defined to support this function'
                )
            within_cql = None
            attrs = json.loads(request.form.get('attrs', '{}'))
            sel_match = plugins.runtime.LIVE_ATTRIBUTES.instance.get_attr_values(
                self._plugin_api,
                corpus=self.corp,
                attr_map=attrs,
                aligned_corpora=aligned_corpora,
                limit_lists=False)
            values = sel_match['attr_values'][
                corpus_info.metadata.label_attr]
            # the matching items are passed to the collector via a 'sca_'-prefixed
            # attribute named after the corpus' id_attr
            args = argmapping.Args()
            setattr(args, 'sca_{0}'.format(corpus_info.metadata.id_attr),
                    [v[1] for v in values])
            tt_query = TextTypeCollector(self.corp, args).get_query()
            tmp = ['<%s %s />' % item for item in tt_query]
            full_cql = ' within '.join(tmp)
            full_cql = 'aword,[] within %s' % full_cql
            full_cql = import_string(full_cql,
                                     from_encoding=self.corp_encoding)
            imp_cql = (full_cql, )
        else:
            within_cql = None
            tt_query = TextTypeCollector(self.corp, request).get_query()
            tmp = ['<%s %s />' % item for item in tt_query]
            full_cql = ' within '.join(tmp)
            full_cql = 'aword,[] within %s' % full_cql
            full_cql = import_string(full_cql,
                                     from_encoding=self.corp_encoding)
            imp_cql = (full_cql, )

        basecorpname = self.args.corpname.split(':')[0]
        if not subcname:
            raise UserActionException(_('No subcorpus name specified!'))
        path = self.prepare_subc_path(basecorpname, subcname, publish=False)
        publish_path = self.prepare_subc_path(
            basecorpname, subcname, publish=True) if publish else None

        if isinstance(path, unicode):
            path = path.encode('utf-8')

        if len(tt_query) == 1 and len(aligned_corpora) == 0:
            result = corplib.create_subcorpus(path, self.corp, tt_query[0][0],
                                              tt_query[0][1])
            if result and publish_path:
                corplib.mk_publish_links(path, publish_path, description)
        elif len(tt_query) > 1 or within_cql or len(aligned_corpora) > 0:
            backend, conf = settings.get_full('global', 'calc_backend')
            if backend == 'celery':
                import task
                app = task.get_celery_app(conf['conf'])
                res = app.send_task(
                    'worker.create_subcorpus',
                    (self.session_get('user', 'id'), self.args.corpname, path,
                     publish_path, tt_query, imp_cql, description),
                    time_limit=TASK_TIME_LIMIT)
                self._store_async_task(
                    AsyncTaskStatus(
                        status=res.status,
                        ident=res.id,
                        category=AsyncTaskStatus.CATEGORY_SUBCORPUS,
                        label=u'%s:%s' % (basecorpname, subcname),
                        args=dict(subcname=subcname, corpname=basecorpname)))
                result = {}
            elif backend == 'multiprocessing':
                from bgcalc import subc_calc
                import functools
                import multiprocessing
                worker = subc_calc.CreateSubcorpusTask(
                    user_id=self.session_get('user', 'id'),
                    corpus_id=self.args.corpname)
                multiprocessing.Process(target=functools.partial(
                    worker.run, tt_query, imp_cql, path, publish_path,
                    description)).start()
                result = {}
            else:
                # without this branch 'result' stays unbound and the method
                # ends with a confusing UnboundLocalError
                raise RuntimeError(
                    'Unsupported calc_backend type: %s' % (backend,))
        else:
            raise UserActionException(_('Nothing specified!'))

        if result is False:
            raise SubcorpusError(_('Empty subcorpus!'))

        with plugins.runtime.SUBC_RESTORE as sr:
            try:
                # only the part following the initial '[]' is stored
                sr.store_query(user_id=self.session_get('user', 'id'),
                               corpname=self.args.corpname,
                               subcname=subcname,
                               cql=full_cql.strip().split('[]', 1)[-1])
            except Exception as e:
                # a failed backup must not break the (already started) subcorpus creation
                logging.getLogger(__name__).warning(
                    'Failed to store subcorpus query: %s' % e)
                self.add_system_message(
                    'warning',
                    _('Subcorpus created but there was a problem saving a backup copy.'
                      ))
        unfinished_corpora = [
            at for at in self.get_async_tasks(
                category=AsyncTaskStatus.CATEGORY_SUBCORPUS)
            if not at.is_finished()
        ]
        return dict(
            unfinished_subc=[uc.to_dict() for uc in unfinished_corpora])