def calculate_colls_bg(coll_args):
    """
    Compute collocations in the background (run either from Celery or
    from another process via multiprocessing).

    Returns a dict with keys 'data', 'processing' and 'tasks'. If the
    required precalculated frequency files are missing, their build is
    triggered and an empty-data answer describing the scheduled tasks
    is returned instead.
    """
    corp_mgr = corplib.CorpusManager(subcpath=coll_args.subcpath)
    corpus = corp_mgr.get_Corpus(coll_args.corpname, subcname=coll_args.subcname)
    try:
        # probe for precalculated data; raises MissingSubCorpFreqFile when absent
        corplib.frq_db(corpus, coll_args.cattr)
        conc = get_conc(
            corp=corpus, user_id=coll_args.user_id, q=coll_args.q, fromp=0,
            pagesize=0, asnc=0, save=coll_args.save, samplesize=coll_args.samplesize)
        if not conc.finished():
            raise UnfinishedConcordanceError(
                _('Cannot calculate yet - source concordance not finished. Please try again later.'))
        collocs = conc.collocs(
            cattr=coll_args.cattr, csortfn=coll_args.csortfn, cbgrfns=coll_args.cbgrfns,
            cfromw=coll_args.cfromw, ctow=coll_args.ctow, cminfreq=coll_args.cminfreq,
            cminbgr=coll_args.cminbgr, max_lines=coll_args.num_fetch_items)
        # wrap positive/negative filter queries into the (arg, value) pair list
        # format the client expects
        for row in collocs['Items']:
            row['pfilter'] = [('q2', row['pfilter'])]
            row['nfilter'] = [('q2', row['nfilter'])]
        return dict(data=collocs, processing=0, tasks=[])
    except corplib.MissingSubCorpFreqFile as ex:
        # frequency files missing - start their calculation and report status
        answer = {'attrname': coll_args.cattr, 'tasks': []}
        calc_out = freq_calc.build_arf_db(ex.corpus, coll_args.cattr)
        has_tasks = type(calc_out) is list
        if has_tasks:
            answer['tasks'].extend(calc_out)
        answer['processing'] = 1 if has_tasks else 0
        answer['data'] = dict(Items=[], Head=[])
        return answer
def calculate_colls_bg(coll_args):
    """
    Background collocations calculation. This function is expected to be run either
    from Celery or from other process (via multiprocessing).

    Args:
        coll_args: object carrying corpus identification (corpname, subcname,
            subcpath) and collocation parameters (cattr, csortfn, cbgrfns,
            cfromw, ctow, cminfreq, cminbgr, num_fetch_items, ...)

    Returns:
        dict: {'data': <collocations>, 'processing': 0|1, 'tasks': [...]};
        when precalculated frequency files are missing, empty data plus
        descriptions of the triggered background tasks.

    Raises:
        UnfinishedConcordanceError: if the source concordance is still running.
    """
    cm = corplib.CorpusManager(subcpath=coll_args.subcpath)
    corp = cm.get_Corpus(coll_args.corpname, subcname=coll_args.subcname)
    try:
        # try to fetch precalculated data; if none then MissingSubCorpFreqFile
        corplib.frq_db(corp, coll_args.cattr)
        # fix: the keyword argument here was 'async=0', which is a SyntaxError
        # on Python 3.7+ ('async' became a reserved word); 'asnc' carries the
        # same meaning (matches the sibling implementation) -- confirm
        # conclib.get_conc uses this parameter name
        conc = conclib.get_conc(
            corp=corp, user_id=coll_args.user_id, minsize=coll_args.minsize,
            q=coll_args.q, fromp=0, pagesize=0, asnc=0, save=coll_args.save,
            samplesize=coll_args.samplesize)
        if not conc.finished():
            raise UnfinishedConcordanceError(
                _('Cannot calculate yet - source concordance not finished. Please try again later.'))
        collocs = conc.collocs(
            cattr=coll_args.cattr, csortfn=coll_args.csortfn, cbgrfns=coll_args.cbgrfns,
            cfromw=coll_args.cfromw, ctow=coll_args.ctow, cminfreq=coll_args.cminfreq,
            cminbgr=coll_args.cminbgr, max_lines=coll_args.num_fetch_items)
        for item in collocs['Items']:
            # wrap filter queries into the (arg, value) pair list format
            item['pfilter'] = [('q2', item['pfilter'])]
            item['nfilter'] = [('q2', item['nfilter'])]
            item['str'] = import_string(item['str'], from_encoding=coll_args.corpus_encoding)
        return dict(data=collocs, processing=0, tasks=[])
    except corplib.MissingSubCorpFreqFile as e:
        ans = {'attrname': coll_args.cattr, 'tasks': []}
        # e.args[0] presumably holds the corpus the exception refers to
        # (the sibling implementation accesses e.corpus) -- TODO confirm
        out = freq_calc.build_arf_db(e.args[0], coll_args.cattr)
        if type(out) is list:
            # a list of worker tasks means the calculation is now in progress
            processing = 1
            ans['tasks'].extend(out)
        else:
            processing = 0
        ans['processing'] = processing
        ans['data'] = dict(Items=[], Head=[])
        return ans
def submit(self, request):
    """
    Handle a word list form submission: send the calculation to a worker
    backend and report which frequency-file subtasks (if any) had to be
    scheduled first before actual data can be produced.
    """
    wl_args = WordlistFormArgs()
    wl_args.update_by_user_query(request.json)
    backend = calc_backend_client(settings)
    resp = dict(
        corpname=self.args.corpname, usesubcorp=self.args.usesubcorp,
        freq_files_avail=True, subtasks=[])
    task_handle = backend.send_task(
        'get_wordlist', object.__class__,
        args=(wl_args.to_dict(), self.corp.size, self.session_get('user', 'id')))
    outcome = task_handle.get()
    if isinstance(outcome, MissingSubCorpFreqFile):
        # required frequency files are not ready - trigger their calculation
        arf_out = freq_calc.build_arf_db(
            self.session_get('user', 'id'), self.corp, wl_args.wlattr)
        if type(arf_out) is not list:
            # TODO we should join the current calculation here instead of throwing an error
            raise WordlistError('The data calculation is already running')
        for subtask in arf_out:
            self._store_async_task(subtask)
            resp['subtasks'].append(subtask.to_dict())
        resp['freq_files_avail'] = False
    elif isinstance(outcome, Exception):
        raise outcome
    self._curr_wlform_args = wl_args

    def on_conc_store(query_ids, history_ts, result):
        # attach the persistent query id and optionally record search history
        result['wl_query_id'] = query_ids[0]
        if history_ts:
            self._store_last_search('wlist', query_ids[0])

    self.on_conc_store = on_conc_store
    return resp
def calculate_colls_bg(coll_args: CollCalcArgs):
    """
    Worker-side collocations calculation.

    If auxiliary data files are needed and not present yet (signalled by
    MissingSubCorpFreqFile), their calculation is triggered and an
    "in progress" answer with empty data is returned instead.
    """
    manager = corplib.CorpusManager(subcpath=coll_args.subcpath)
    corpus = manager.get_corpus(coll_args.corpname, subcname=coll_args.subcname)
    try:
        # probe precalculated data; raises MissingSubCorpFreqFile when absent
        corplib.frq_db(corpus, coll_args.cattr)
        conc = require_existing_conc(corp=corpus, q=coll_args.q)
        if not conc.finished():
            raise UnfinishedConcordanceError(
                _('Cannot calculate yet - source concordance not finished. Please try again later.'))
        data = conc.collocs(
            cattr=coll_args.cattr, csortfn=coll_args.csortfn, cbgrfns=coll_args.cbgrfns,
            cfromw=coll_args.cfromw, ctow=coll_args.ctow, cminfreq=coll_args.cminfreq,
            cminbgr=coll_args.cminbgr, max_lines=conc.size())
        # rewrite filter queries into the {arg: value} mapping the client expects
        for row in data['Items']:
            row['pfilter'] = {'q2': row['pfilter']}
            row['nfilter'] = {'q2': row['nfilter']}
        return dict(data=data, processing=0, tasks=[])
    except MissingSubCorpFreqFile:
        # trigger the auxiliary frequency files calculation
        answer = {'attrname': coll_args.cattr, 'tasks': []}
        build_out = freq_calc.build_arf_db(corpus, coll_args.cattr)
        in_progress = type(build_out) is list
        if in_progress:
            answer['tasks'].extend(build_out)
        answer['processing'] = 1 if in_progress else 0
        answer['data'] = dict(Items=[], Head=[])
        return answer
def result(self, wlpat='', paginate=True, wlhash='', blhash=''):
    """
    Build the data structure for the word list result page.

    arguments:
    wlpat -- word list pattern; if empty, self.args.wlpat is set to '.*'
    paginate -- if True, only one page (plus one probe item) of data is fetched
    wlhash -- identifier of a stored whitelist file ('' = none)
    blhash -- identifier of a stored blacklist file ('' = none)

    returns:
    a dict with word list items and page/template metadata; in case required
    frequency files are missing, an alternative 'processing' answer is
    returned (see the except branch).
    """
    self.disabled_menu_items = (MainMenu.VIEW('kwic-sentence', 'structs-attrs'),
                                MainMenu.FILTER, MainMenu.FREQUENCY,
                                MainMenu.COLLOCATIONS, MainMenu.CONCORDANCE)
    if not wlpat:
        self.args.wlpat = '.*'
    if '.' in self.args.wlattr:
        # structural attribute - wlnums must be translated; the original value
        # is restored after the wordlist call below
        orig_wlnums = self.args.wlnums
        self.args.wlnums = self._wlnums2structattr(self.args.wlnums)
    if paginate:
        # fetch one extra item to detect whether a next page exists
        wlmaxitems = self.args.wlpagesize * self.args.wlpage + 1
    else:
        wlmaxitems = sys.maxsize
    wlstart = (self.args.wlpage - 1) * self.args.wlpagesize
    result = {
        'reload_args': list({
            'corpname': self.args.corpname, 'usesubcorp': self.args.usesubcorp,
            'wlattr': self.args.wlattr, 'wlpat': self.args.wlpat,
            'wlminfreq': self.args.wlminfreq,
            'include_nonwords': self.args.include_nonwords,
            'wlsort': self.args.wlsort, 'wlnums': self.args.wlnums
        }.items()),
        'form_args': dict(wlattr=self.args.wlattr, wlpat=self.args.wlpat,
                          wlsort=self.args.wlsort, subcnorm=self.args.subcnorm,
                          wltype=self.args.wltype, wlnums=self.args.wlnums,
                          wlminfreq=self.args.wlminfreq, wlwords=self.args.wlwords,
                          blacklist=self.args.blacklist, wlFileName='',
                          blFileName='', includeNonwords=self.args.include_nonwords)
    }
    try:
        if hasattr(self, 'wlfile') and self.args.wlpat == '.*':
            self.args.wlsort = ''
        white_words = self.args.wlwords
        black_words = self.args.blacklist
        # stored white/black list files (identified by hash) override the form values
        if wlhash != '':
            white_words = self.load_bw_file(wlhash)
        if blhash != '':
            black_words = self.load_bw_file(blhash)
        whitelist = [w for w in re.split(r'\s+', white_words.strip()) if w]
        blacklist = [w for w in re.split(r'\s+', black_words.strip()) if w]
        # persist fresh white/black lists so reload args can reference them by hash
        if wlhash == '' and len(self.args.wlwords) > 0:
            wlhash = self.save_bw_file(self.args.wlwords)
        if blhash == '' and len(self.args.blacklist) > 0:
            blhash = self.save_bw_file(self.args.blacklist)
        result['reload_args'] = list({
            'corpname': self.args.corpname, 'usesubcorp': self.args.usesubcorp,
            'wlattr': self.args.wlattr, 'wlpat': self.args.wlpat,
            'wlminfreq': self.args.wlminfreq,
            'include_nonwords': self.args.include_nonwords,
            'wlsort': self.args.wlsort, 'wlnums': self.args.wlnums,
            'wlhash': wlhash, 'blhash': blhash
        }.items())
        result_list = corplib.wordlist(
            corp=self.corp, words=whitelist, wlattr=self.args.wlattr,
            wlpat=self.args.wlpat, wlminfreq=self.args.wlminfreq,
            wlmaxitems=wlmaxitems, wlsort=self.args.wlsort, blacklist=blacklist,
            wlnums=self.args.wlnums,
            include_nonwords=self.args.include_nonwords)[wlstart:]
        result['Items'] = result_list
        # fewer than pagesize+1 items means the extra probe item is missing,
        # i.e. this is the last page
        if len(result_list) < self.args.wlpagesize + 1:
            result['lastpage'] = 1
        else:
            result['lastpage'] = 0
        if paginate:
            # drop the extra probe item before presenting data
            result_list = result_list[:-1]
            result['Items'] = result_list
        if '.' in self.args.wlattr:
            self.args.wlnums = orig_wlnums
        try:
            result['wlattr_label'] = (self.corp.get_conf(self.args.wlattr + '.LABEL')
                                      or self.args.wlattr)
        except Exception as e:
            # best effort - fall back to the raw attribute name
            result['wlattr_label'] = self.args.wlattr
            logging.getLogger(__name__).warning(
                'wlattr_label set failed: %s' % e)
        result['freq_figure'] = translate(
            self.FREQ_FIGURES.get(self.args.wlnums, '?'))
        result['processing'] = None
        # quick-save menu entries for the supported export formats
        self._add_save_menu_item(
            'CSV', save_format='csv', hint=translate(
                'Saves at most {0} items. Use "Custom" for more options.'.
                format(self.WORDLIST_QUICK_SAVE_MAX_LINES)))
        self._add_save_menu_item(
            'XLSX', save_format='xlsx', hint=translate(
                'Saves at most {0} items. Use "Custom" for more options.'.
                format(self.WORDLIST_QUICK_SAVE_MAX_LINES)))
        self._add_save_menu_item(
            'XML', save_format='xml', hint=translate(
                'Saves at most {0} items. Use "Custom" for more options.'.
                format(self.WORDLIST_QUICK_SAVE_MAX_LINES)))
        self._add_save_menu_item(
            'TXT', save_format='text', hint=translate(
                'Saves at most {0} items. Use "Custom" for more options.'.
                format(self.WORDLIST_QUICK_SAVE_MAX_LINES)))
        self._add_save_menu_item(translate('Custom'))
        # custom save is solved in templates because of compatibility issues
        result['tasks'] = []
        result['SubcorpList'] = []
        result['quick_save_row_limit'] = self.WORDLIST_QUICK_SAVE_MAX_LINES
        self._export_subcorpora_list(self.args.corpname, self.args.usesubcorp, result)
        return result
    except corplib.MissingSubCorpFreqFile as e:
        # required frequency files are missing - trigger their calculation
        # NOTE(review): self.args.cattr looks like a collocations attribute;
        # self.args.wlattr seems intended here - confirm
        result.update({'attrname': self.args.cattr, 'tasks': []})
        out = freq_calc.build_arf_db(e.corpus, self.args.wlattr)
        if type(out) is list:
            # NOTE(review): sibling collocation functions set processing=1 when
            # a task list is returned; 0 is used here - confirm this is intended
            processing = 0
            result['tasks'].extend(out)
        elif out:
            processing = out
        else:
            processing = 0
        result['quick_save_row_limit'] = self.WORDLIST_QUICK_SAVE_MAX_LINES
        result['wlattr'] = self.args.wlattr
        result['wlattr_label'] = ''
        result['processing'] = processing
        result['SubcorpList'] = []
        result['freq_figure'] = ''
        result['lastpage'] = None
        return result
def result(self, wlpat='', paginate=True, wlhash='', blhash=''):
    """
    Build the data structure for the word list result page.

    arguments:
    wlpat -- word list pattern; if empty, self.args.wlpat is set to '.*'
    paginate -- if True, only one page (plus one probe item) of data is fetched
    wlhash -- identifier of a stored whitelist file ('' = none)
    blhash -- identifier of a stored blacklist file ('' = none)

    returns:
    a dict with word list items and page/template metadata; in case required
    frequency files are missing, an alternative 'processing' answer is
    returned (see the except branch).
    """
    self.disabled_menu_items = (MainMenu.VIEW('kwic-sentence', 'structs-attrs'),
                                MainMenu.FILTER, MainMenu.FREQUENCY,
                                MainMenu.COLLOCATIONS, MainMenu.CONCORDANCE)
    if not wlpat:
        self.args.wlpat = '.*'
    if '.' in self.args.wlattr:
        # structural attribute - wlnums must be translated; the original value
        # is restored after the wordlist call below
        # TODO: avoid this hidden rewriting of self.args (self.call_function
        # reads the values implicitly)
        orig_wlnums = self.args.wlnums
        self.args.wlnums = self._wlnums2structattr(self.args.wlnums)
    if paginate:
        # fetch one extra item to detect whether a next page exists
        wlmaxitems = self.args.wlpagesize * self.args.wlpage + 1
    else:
        # fix: sys.maxint was removed in Python 3; sys.maxsize is the equivalent
        wlmaxitems = sys.maxsize
    wlstart = (self.args.wlpage - 1) * self.args.wlpagesize
    result = {
        # fix: wrap .items() in list() - a Python 3 view object is not
        # JSON-serializable (the sibling implementation does the same)
        'reload_args': list({
            'corpname': self.args.corpname, 'usesubcorp': self.args.usesubcorp,
            'wlattr': self.args.wlattr, 'wlpat': self.args.wlpat,
            'wlminfreq': self.args.wlminfreq,
            'include_nonwords': self.args.include_nonwords,
            'wlsort': self.args.wlsort, 'wlnums': self.args.wlnums}.items()),
        'form_args': dict(
            wlattr=self.args.wlattr, wlpat=self.args.wlpat, wlsort=self.args.wlsort,
            subcnorm=self.args.subcnorm, wltype=self.args.wltype,
            wlnums=self.args.wlnums, wlminfreq=self.args.wlminfreq,
            wlwords=self.args.wlwords, blacklist=self.args.blacklist,
            wlFileName='', blFileName='',
            includeNonwords=self.args.include_nonwords)
    }
    try:
        if hasattr(self, 'wlfile') and self.args.wlpat == '.*':
            self.args.wlsort = ''
        white_words = self.args.wlwords
        black_words = self.args.blacklist
        # stored white/black list files (identified by hash) override form values
        if wlhash != '':
            white_words = self.load_bw_file(wlhash)
        if blhash != '':
            black_words = self.load_bw_file(blhash)
        # fix: raw strings for regex patterns (non-raw '\s' triggers a
        # DeprecationWarning and is a future SyntaxError)
        whitelist = [w for w in re.split(r'\s+', white_words.strip()) if w]
        blacklist = [w for w in re.split(r'\s+', black_words.strip()) if w]
        # persist fresh white/black lists so reload args can reference them by hash
        if wlhash == '' and len(self.args.wlwords) > 0:
            wlhash = self.save_bw_file(self.args.wlwords)
        if blhash == '' and len(self.args.blacklist) > 0:
            blhash = self.save_bw_file(self.args.blacklist)
        result['reload_args'] = list({
            'corpname': self.args.corpname, 'usesubcorp': self.args.usesubcorp,
            'wlattr': self.args.wlattr, 'wlpat': self.args.wlpat,
            'wlminfreq': self.args.wlminfreq,
            'include_nonwords': self.args.include_nonwords,
            'wlsort': self.args.wlsort, 'wlnums': self.args.wlnums,
            'wlhash': wlhash, 'blhash': blhash}.items())
        result_list = self.call_function(corplib.wordlist, (self.corp,),
                                         wlmaxitems=wlmaxitems, words=whitelist,
                                         blacklist=blacklist)[wlstart:]
        result['Items'] = result_list
        # fewer than pagesize+1 items means the extra probe item is missing,
        # i.e. this is the last page
        if len(result_list) < self.args.wlpagesize + 1:
            result['lastpage'] = 1
        else:
            result['lastpage'] = 0
        if paginate:
            # drop the extra probe item before presenting data
            result_list = result_list[:-1]
            result['Items'] = result_list
        if '.' in self.args.wlattr:
            self.args.wlnums = orig_wlnums
        try:
            result['wlattr_label'] = (self.corp.get_conf(self.args.wlattr + '.LABEL')
                                      or self.args.wlattr)
        except Exception as e:
            # best effort - fall back to the raw attribute name
            result['wlattr_label'] = self.args.wlattr
            logging.getLogger(__name__).warning('wlattr_label set failed: %s' % e)
        result['freq_figure'] = translate(self.FREQ_FIGURES.get(self.args.wlnums, '?'))
        result['processing'] = None
        # quick-save menu entries for the supported export formats (the four
        # repeated calls were folded into a loop; call order is preserved)
        for label, fmt in (('CSV', 'csv'), ('XLSX', 'xlsx'), ('XML', 'xml'), ('TXT', 'text')):
            self._add_save_menu_item(
                label, save_format=fmt,
                hint=translate('Saves at most {0} items. Use "Custom" for more options.'.format(
                    self.WORDLIST_QUICK_SAVE_MAX_LINES)))
        self._add_save_menu_item(translate('Custom'))
        # custom save is solved in templates because of compatibility issues
        result['tasks'] = []
        result['SubcorpList'] = []
        result['quick_save_row_limit'] = self.WORDLIST_QUICK_SAVE_MAX_LINES
        self._export_subcorpora_list(self.args.corpname, self.args.usesubcorp, result)
        return result
    except corplib.MissingSubCorpFreqFile as e:
        # required frequency files are missing - trigger their calculation
        # NOTE(review): self.args.cattr looks like a collocations attribute;
        # self.args.wlattr seems intended here - confirm
        result.update({'attrname': self.args.cattr, 'tasks': []})
        out = freq_calc.build_arf_db(e.corpus, self.args.wlattr)
        if type(out) is list:
            # NOTE(review): sibling collocation functions set processing=1 when
            # a task list is returned; 0 is kept here to preserve behavior - confirm
            processing = 0
            result['tasks'].extend(out)
        elif out:
            processing = out
        else:
            processing = 0
        result['quick_save_row_limit'] = self.WORDLIST_QUICK_SAVE_MAX_LINES
        result['wlattr'] = self.args.wlattr
        result['wlattr_label'] = ''
        result['processing'] = processing
        result['SubcorpList'] = []
        result['freq_figure'] = ''
        result['lastpage'] = None
        return result