def fcs_scan(corpname, scan_query, max_ter, start):
    """
    Aux function for federated content search: operation=scan.

    Parses a scan clause written either as ``attr = value`` or as
    ``attr exact value`` (the value may be wrapped in double quotes) and
    returns the matching word-list items of the corpus.

    arguments:
    corpname -- corpus identifier passed to manatee.Corpus
    scan_query -- raw scan clause; '+' characters are treated as spaces
    max_ter -- maximum number of returned items (applied as a slice)
    start -- offset of the first returned item (applied as a slice)

    returns:
    a list of (str, freq) tuples built from the corplib.wordlist() output
    (requested with wlsort='f')

    raises:
    Exception(7, '', ...) -- scan_query is empty
    Exception(10, scan_query, ...) -- the clause cannot be parsed
    Exception(16, attr, ...) -- attr is not listed in the corpus ATTRLIST
    """
    if not scan_query:
        raise Exception(7, "", "Mandatory parameter not supplied")

    query = scan_query.replace("+", " ")  # convert URL spaces
    exact_match = False
    if "exact" in query.lower() and "=" not in query:  # lemma ExacT "dog"
        pos = query.lower().index("exact")  # first occurrence of EXACT
        query = query[:pos] + "=" + query[pos + 5:]  # 1st exact > =
        exact_match = True

    # The clause is validated before the corpus is opened so that malformed
    # requests are rejected without touching any corpus data.
    # It must have exactly one '=' separating the attribute from the value.
    attr, sep, value = query.partition("=")
    attr = attr.strip()
    value = value.strip()
    if not sep or "=" in value or '"' in attr:
        raise Exception(10, scan_query, "Query syntax error")
    if '"' in value:
        # A quoted value needs both an opening and a closing quote; the length
        # test rejects a lone '"', which would otherwise satisfy both checks.
        if len(value) < 2 or not (value.startswith('"') and value.endswith('"')):
            raise Exception(10, scan_query, "Query syntax error")
        value = value[1:-1].strip()

    corp = manatee.Corpus(corpname)
    attrs = corp.get_conf("ATTRLIST").split(",")  # list of available attrs
    if attr not in attrs:
        raise Exception(16, attr, "Unsupported index")

    import corplib
    # NOTE(review): value is inserted into the pattern unescaped, so regex
    # metacharacters coming from the query reach the word-list matcher as-is;
    # confirm this is intended.
    if exact_match:
        wlpattern = "^" + value + "$"
    else:
        wlpattern = ".*" + value + ".*"
    wl = corplib.wordlist(corp, wlattr=attr, wlpat=wlpattern, wlsort="f")
    return [(d["str"], d["freq"]) for d in wl][start:][:max_ter]
def fcs_scan(corpname, scan_query, max_ter, start):
    """
    Aux function for federated content search: operation=scan.

    Parses a scan clause written either as ``attr = value`` or as
    ``attr exact value`` (the value may be wrapped in double quotes) and
    returns the matching word-list items of the corpus.

    arguments:
    corpname -- corpus identifier passed to manatee.Corpus
    scan_query -- raw scan clause; '+' characters are treated as spaces
    max_ter -- maximum number of returned items (applied as a slice)
    start -- offset of the first returned item (applied as a slice)

    returns:
    a list of (str, freq) tuples built from the corplib.wordlist() output
    (requested with wlsort='f')

    raises:
    Exception(7, 'scan_query', ...) -- scan_query is empty
    Exception(10, scan_query, ...) -- the clause cannot be parsed
    Exception(16, attr, ...) -- attr is not listed in the corpus ATTRLIST
    """
    if not scan_query:
        raise Exception(7, 'scan_query', 'Mandatory parameter not supplied')

    query = scan_query.replace('+', ' ')  # convert URL spaces
    exact_match = False
    if 'exact' in query.lower() and '=' not in query:  # lemma ExacT "dog"
        pos = query.lower().index('exact')  # first occurrence of EXACT
        query = query[:pos] + '=' + query[pos + 5:]  # 1st exact > =
        exact_match = True

    # The clause is validated before the corpus is opened so that malformed
    # requests are rejected without touching any corpus data.
    # It must have exactly one '=' separating the attribute from the value.
    attr, sep, value = query.partition('=')
    attr = attr.strip()
    value = value.strip()
    if not sep or '=' in value or '"' in attr:
        raise Exception(10, scan_query, 'Query syntax error')
    if '"' in value:
        # A quoted value needs both an opening and a closing quote; the length
        # test rejects a lone '"', which would otherwise satisfy both checks.
        if len(value) < 2 or not (value.startswith('"') and value.endswith('"')):
            raise Exception(10, scan_query, 'Query syntax error')
        value = value[1:-1].strip()

    corp = manatee.Corpus(corpname)
    attrs = corp.get_conf('ATTRLIST').split(',')  # list of available attrs
    if attr not in attrs:
        raise Exception(16, attr, 'Unsupported index')

    import corplib
    # NOTE(review): value is inserted into the pattern unescaped, so regex
    # metacharacters coming from the query reach the word-list matcher as-is;
    # confirm this is intended.
    if exact_match:
        wlpattern = '^' + value + '$'
    else:
        wlpattern = '.*' + value + '.*'
    wl = corplib.wordlist(corp, wlattr=attr, wlpat=wlpattern, wlsort='f')
    return [(d['str'], d['freq']) for d in wl][start:][:max_ter]
def fcs_scan(corpname, scan_query, max_ter, start):
    """
    Aux function for federated content search: operation=scan.

    Parses a scan clause written either as ``attr = value`` or as
    ``attr exact value`` (the value may be wrapped in double quotes) and
    returns the matching word-list items of the corpus.

    arguments:
    corpname -- corpus identifier passed to manatee.Corpus
    scan_query -- raw scan clause; '+' characters are treated as spaces
    max_ter -- maximum number of returned items (applied as a slice)
    start -- offset of the first returned item (applied as a slice)

    returns:
    a list of (str, freq) tuples built from the corplib.wordlist() output
    (requested with wlsort='f')

    raises:
    Exception(7, '', ...) -- scan_query is empty
    Exception(10, scan_query, ...) -- the clause cannot be parsed
    Exception(16, attr, ...) -- attr is not listed in the corpus ATTRLIST
    """
    if not scan_query:
        raise Exception(7, '', 'Mandatory parameter not supplied')

    query = scan_query.replace('+', ' ')  # convert URL spaces
    exact_match = False
    if 'exact' in query.lower() and '=' not in query:  # lemma ExacT "dog"
        pos = query.lower().index('exact')  # first occurrence of EXACT
        query = query[:pos] + '=' + query[pos + 5:]  # 1st exact > =
        exact_match = True

    # The clause is validated before the corpus is opened so that malformed
    # requests are rejected without touching any corpus data.
    # It must have exactly one '=' separating the attribute from the value.
    attr, sep, value = query.partition('=')
    attr = attr.strip()
    value = value.strip()
    if not sep or '=' in value or '"' in attr:
        raise Exception(10, scan_query, 'Query syntax error')
    if '"' in value:
        # A quoted value needs both an opening and a closing quote; the length
        # test rejects a lone '"', which would otherwise satisfy both checks.
        if len(value) < 2 or not (value.startswith('"') and value.endswith('"')):
            raise Exception(10, scan_query, 'Query syntax error')
        value = value[1:-1].strip()

    corp = manatee.Corpus(corpname)
    attrs = corp.get_conf('ATTRLIST').split(',')  # list of available attrs
    if attr not in attrs:
        raise Exception(16, attr, 'Unsupported index')

    import corplib
    # NOTE(review): value is inserted into the pattern unescaped, so regex
    # metacharacters coming from the query reach the word-list matcher as-is;
    # confirm this is intended.
    if exact_match:
        wlpattern = '^' + value + '$'
    else:
        wlpattern = '.*' + value + '.*'
    wl = corplib.wordlist(corp, wlattr=attr, wlpat=wlpattern, wlsort='f')
    return [(d['str'], d['freq']) for d in wl][start:][:max_ter]
def result(self, wlpat='', paginate=True, wlhash='', blhash=''):
    """
    Compute one page of a word list and return the data describing it.

    arguments:
    wlpat -- when empty, self.args.wlpat is reset to '.*'
    paginate -- when True, items up to the end of the current page
                (self.args.wlpage, self.args.wlpagesize) plus one
                look-ahead item are requested and the look-ahead item is
                dropped from the output; otherwise no item limit is used
    wlhash -- when non-empty, the white list is read via
              self.load_bw_file(wlhash); when empty and self.args.wlwords
              is non-empty, the words are passed to self.save_bw_file()
              and its return value is used in 'reload_args'
    blhash -- the same for the black list (self.args.blacklist)

    returns:
    a dict with the word-list data ('Items', 'lastpage', 'reload_args',
    'form_args', ...); if corplib.MissingSubCorpFreqFile is raised, a dict
    without 'Items' is returned, carrying the outcome of
    freq_calc.build_arf_db() in 'processing' / 'tasks'
    """
    self.disabled_menu_items = (
        MainMenu.VIEW('kwic-sentence', 'structs-attrs'),
        MainMenu.FILTER,
        MainMenu.FREQUENCY,
        MainMenu.COLLOCATIONS,
        MainMenu.CONCORDANCE,
    )

    def make_reload_args(**extra):
        # Reads self.args at call time; some of the values (wlsort) may be
        # altered between the two places where this is used.
        pairs = {
            'corpname': self.args.corpname,
            'usesubcorp': self.args.usesubcorp,
            'wlattr': self.args.wlattr,
            'wlpat': self.args.wlpat,
            'wlminfreq': self.args.wlminfreq,
            'include_nonwords': self.args.include_nonwords,
            'wlsort': self.args.wlsort,
            'wlnums': self.args.wlnums,
        }
        pairs.update(extra)
        return list(pairs.items())

    def split_words(text):
        # whitespace-separated tokens, empty strings removed
        return [w for w in re.split(r'\s+', text.strip()) if w]

    if not wlpat:
        self.args.wlpat = '.*'

    if '.' in self.args.wlattr:
        # the original value is put back once the word list has been computed
        orig_wlnums = self.args.wlnums
        self.args.wlnums = self._wlnums2structattr(self.args.wlnums)

    # one extra item is requested to detect whether a next page exists
    wlmaxitems = self.args.wlpagesize * self.args.wlpage + 1 if paginate else sys.maxsize
    wlstart = (self.args.wlpage - 1) * self.args.wlpagesize

    result = {
        'reload_args': make_reload_args(),
        'form_args': {
            'wlattr': self.args.wlattr,
            'wlpat': self.args.wlpat,
            'wlsort': self.args.wlsort,
            'subcnorm': self.args.subcnorm,
            'wltype': self.args.wltype,
            'wlnums': self.args.wlnums,
            'wlminfreq': self.args.wlminfreq,
            'wlwords': self.args.wlwords,
            'blacklist': self.args.blacklist,
            'wlFileName': '',
            'blFileName': '',
            'includeNonwords': self.args.include_nonwords,
        },
    }

    try:
        if hasattr(self, 'wlfile') and self.args.wlpat == '.*':
            self.args.wlsort = ''

        white_words = self.load_bw_file(wlhash) if wlhash != '' else self.args.wlwords
        black_words = self.load_bw_file(blhash) if blhash != '' else self.args.blacklist
        whitelist = split_words(white_words)
        blacklist = split_words(black_words)

        if wlhash == '' and len(self.args.wlwords) > 0:
            wlhash = self.save_bw_file(self.args.wlwords)
        if blhash == '' and len(self.args.blacklist) > 0:
            blhash = self.save_bw_file(self.args.blacklist)

        result['reload_args'] = make_reload_args(wlhash=wlhash, blhash=blhash)

        all_items = corplib.wordlist(
            corp=self.corp,
            words=whitelist,
            wlattr=self.args.wlattr,
            wlpat=self.args.wlpat,
            wlminfreq=self.args.wlminfreq,
            wlmaxitems=wlmaxitems,
            wlsort=self.args.wlsort,
            blacklist=blacklist,
            wlnums=self.args.wlnums,
            include_nonwords=self.args.include_nonwords)
        result_list = all_items[wlstart:]

        has_next_page = len(result_list) >= self.args.wlpagesize + 1
        result['lastpage'] = 0 if has_next_page else 1
        if has_next_page and paginate:
            # drop the look-ahead item
            result_list = result_list[:-1]
        result['Items'] = result_list

        if '.' in self.args.wlattr:
            self.args.wlnums = orig_wlnums

        try:
            label = self.corp.get_conf(self.args.wlattr + '.LABEL')
            result['wlattr_label'] = label or self.args.wlattr
        except Exception as err:
            result['wlattr_label'] = self.args.wlattr
            logging.getLogger(__name__).warning('wlattr_label set failed: %s' % err)

        result['freq_figure'] = translate(self.FREQ_FIGURES.get(self.args.wlnums, '?'))
        result['processing'] = None

        # NOTE(review): the hint is formatted before it is handed to translate();
        # if translate() looks the text up in a message catalogue, the formatted
        # string will probably not match any entry - confirm.
        quick_save_formats = (('CSV', 'csv'), ('XLSX', 'xlsx'), ('XML', 'xml'), ('TXT', 'text'))
        for label, save_format in quick_save_formats:
            self._add_save_menu_item(
                label,
                save_format=save_format,
                hint=translate(
                    'Saves at most {0} items. Use "Custom" for more options.'.format(
                        self.WORDLIST_QUICK_SAVE_MAX_LINES)))
        # custom save is solved in templates because of compatibility issues
        self._add_save_menu_item(translate('Custom'))

        result['tasks'] = []
        result['SubcorpList'] = []
        result['quick_save_row_limit'] = self.WORDLIST_QUICK_SAVE_MAX_LINES
        self._export_subcorpora_list(self.args.corpname, self.args.usesubcorp, result)
        return result

    except corplib.MissingSubCorpFreqFile as ex:
        # NOTE(review): self.args.wlnums is not restored to orig_wlnums on this
        # path - confirm whether that matters to the caller.
        result.update({'attrname': self.args.cattr, 'tasks': []})
        out = freq_calc.build_arf_db(ex.corpus, self.args.wlattr)
        processing = 0
        if type(out) is list:
            result['tasks'].extend(out)
        elif out:
            processing = out
        result['quick_save_row_limit'] = self.WORDLIST_QUICK_SAVE_MAX_LINES
        result['wlattr'] = self.args.wlattr
        result['wlattr_label'] = ''
        result['processing'] = processing
        result['SubcorpList'] = []
        result['freq_figure'] = ''
        result['lastpage'] = None
        return result