Esempio n. 1
0
def fcs_scan(corpname, scan_query, max_ter, start):
    """
    Aux function for federated content search: operation=scan.

    Parses a scan clause of the form ``attr = value`` or ``attr exact value``
    (the value may be wrapped in double quotes) and returns the matching
    word list items of the given corpus.

    Args:
        corpname: corpus identifier passed to ``manatee.Corpus``.
        scan_query: raw scan clause; ``+`` characters are treated as
            URL-encoded spaces.
        max_ter: maximum number of items kept from the result list.
        start: offset applied to the result list before ``max_ter`` is
            applied.

    Returns:
        A list of ``(str, freq)`` tuples built from the items returned by
        ``corplib.wordlist`` (called with ``wlsort="f"``).

    Raises:
        Exception: with args ``(7, "", ...)`` when ``scan_query`` is empty,
            ``(10, scan_query, ...)`` when the clause cannot be parsed and
            ``(16, attr, ...)`` when ``attr`` is not listed in the corpus
            ``ATTRLIST`` configuration value.
    """
    if not scan_query:
        raise Exception(7, "", "Mandatory parameter not supplied")
    query = scan_query.replace("+", " ")  # convert URL spaces
    exact_match = False
    if "=" not in query and "exact" in query.lower():  # lemma ExacT "dog"
        pos = query.lower().index("exact")  # first occurrence of EXACT
        query = query[:pos] + "=" + query[pos + 5 :]  # 1st exact > =
        exact_match = True

    # The clause is validated before the corpus is opened so that a malformed
    # query fails fast. It must be in the format: attr = value
    attr, separator, value = query.partition("=")
    attr = attr.strip()
    value = value.strip()
    syntax_ok = bool(separator) and "=" not in value and '"' not in attr
    if syntax_ok and '"' in value:
        # A quoted value needs both an opening and a closing quote; a lone
        # '"' must not be accepted as an empty quoted string.
        if len(value) >= 2 and value[0] == '"' and value[-1] == '"':
            value = value[1:-1].strip()
        else:
            syntax_ok = False
    if not syntax_ok:
        raise Exception(10, scan_query, "Query syntax error")

    corp = manatee.Corpus(corpname)
    attrs = corp.get_conf("ATTRLIST").split(",")  # list of available attrs
    if attr not in attrs:
        raise Exception(16, attr, "Unsupported index")
    import corplib

    # NOTE(review): value is concatenated into the pattern unescaped, so any
    # regex metacharacters in the query presumably reach the matcher as-is -
    # confirm this is intended.
    if exact_match:
        wlpattern = "^" + value + "$"
    else:
        wlpattern = ".*" + value + ".*"
    wl = corplib.wordlist(corp, wlattr=attr, wlpat=wlpattern, wlsort="f")
    return [(d["str"], d["freq"]) for d in wl][start:][:max_ter]
Esempio n. 2
0
def fcs_scan(corpname, scan_query, max_ter, start):
    """
    Aux function for federated content search: operation=scan.

    Parses a scan clause of the form ``attr = value`` or ``attr exact value``
    (the value may be wrapped in double quotes) and returns the matching
    word list items of the given corpus.

    Args:
        corpname: corpus identifier passed to ``manatee.Corpus``.
        scan_query: raw scan clause; ``+`` characters are treated as
            URL-encoded spaces.
        max_ter: maximum number of items kept from the result list.
        start: offset applied to the result list before ``max_ter`` is
            applied.

    Returns:
        A list of ``(str, freq)`` tuples built from the items returned by
        ``corplib.wordlist`` (called with ``wlsort='f'``).

    Raises:
        Exception: with args ``(7, 'scan_query', ...)`` when ``scan_query``
            is empty, ``(10, scan_query, ...)`` when the clause cannot be
            parsed and ``(16, attr, ...)`` when ``attr`` is not listed in
            the corpus ``ATTRLIST`` configuration value.
    """
    if not scan_query:
        raise Exception(7, 'scan_query', 'Mandatory parameter not supplied')
    query = scan_query.replace('+', ' ')  # convert URL spaces
    exact_match = False
    if '=' not in query and 'exact' in query.lower():  # lemma ExacT "dog"
        pos = query.lower().index('exact')  # first occurrence of EXACT
        query = query[:pos] + '=' + query[pos + 5:]  # 1st exact > =
        exact_match = True

    # The clause is validated before the corpus is opened so that a malformed
    # query fails fast. It must be in the format: attr = value
    attr, separator, value = query.partition('=')
    attr = attr.strip()
    value = value.strip()
    syntax_ok = bool(separator) and '=' not in value and '"' not in attr
    if syntax_ok and '"' in value:
        # A quoted value needs both an opening and a closing quote; a lone
        # '"' must not be accepted as an empty quoted string.
        if len(value) >= 2 and value[0] == '"' and value[-1] == '"':
            value = value[1:-1].strip()
        else:
            syntax_ok = False
    if not syntax_ok:
        raise Exception(10, scan_query, 'Query syntax error')

    corp = manatee.Corpus(corpname)
    attrs = corp.get_conf('ATTRLIST').split(',')  # list of available attrs
    if attr not in attrs:
        raise Exception(16, attr, 'Unsupported index')
    import corplib
    # NOTE(review): value is concatenated into the pattern unescaped, so any
    # regex metacharacters in the query presumably reach the matcher as-is -
    # confirm this is intended.
    if exact_match:
        wlpattern = '^' + value + '$'
    else:
        wlpattern = '.*' + value + '.*'
    wl = corplib.wordlist(corp, wlattr=attr, wlpat=wlpattern, wlsort='f')
    return [(d['str'], d['freq']) for d in wl][start:][:max_ter]
Esempio n. 3
0
def fcs_scan(corpname, scan_query, max_ter, start):
    """
    Aux function for federated content search: operation=scan.

    Parses a scan clause of the form ``attr = value`` or ``attr exact value``
    (the value may be wrapped in double quotes) and returns the matching
    word list items of the given corpus.

    Args:
        corpname: corpus identifier passed to ``manatee.Corpus``.
        scan_query: raw scan clause; ``+`` characters are treated as
            URL-encoded spaces.
        max_ter: maximum number of items kept from the result list.
        start: offset applied to the result list before ``max_ter`` is
            applied.

    Returns:
        A list of ``(str, freq)`` tuples built from the items returned by
        ``corplib.wordlist`` (called with ``wlsort='f'``).

    Raises:
        Exception: with args ``(7, '', ...)`` when ``scan_query`` is empty,
            ``(10, scan_query, ...)`` when the clause cannot be parsed and
            ``(16, attr, ...)`` when ``attr`` is not listed in the corpus
            ``ATTRLIST`` configuration value.
    """
    if not scan_query:
        raise Exception(7, '', 'Mandatory parameter not supplied')
    query = scan_query.replace('+', ' ')  # convert URL spaces
    exact_match = False
    if '=' not in query and 'exact' in query.lower():  # lemma ExacT "dog"
        pos = query.lower().index('exact')  # first occurrence of EXACT
        query = query[:pos] + '=' + query[pos+5:]  # 1st exact > =
        exact_match = True

    # The clause is validated before the corpus is opened so that a malformed
    # query fails fast. It must be in the format: attr = value
    attr, separator, value = query.partition('=')
    attr = attr.strip()
    value = value.strip()
    syntax_ok = bool(separator) and '=' not in value and '"' not in attr
    if syntax_ok and '"' in value:
        # A quoted value needs both an opening and a closing quote; a lone
        # '"' must not be accepted as an empty quoted string.
        if len(value) >= 2 and value[0] == '"' and value[-1] == '"':
            value = value[1:-1].strip()
        else:
            syntax_ok = False
    if not syntax_ok:
        raise Exception(10, scan_query, 'Query syntax error')

    corp = manatee.Corpus(corpname)
    attrs = corp.get_conf('ATTRLIST').split(',')  # list of available attrs
    if attr not in attrs:
        raise Exception(16, attr, 'Unsupported index')
    import corplib
    # NOTE(review): value is concatenated into the pattern unescaped, so any
    # regex metacharacters in the query presumably reach the matcher as-is -
    # confirm this is intended.
    if exact_match:
        wlpattern = '^' + value + '$'
    else:
        wlpattern = '.*' + value + '.*'
    wl = corplib.wordlist(corp, wlattr=attr, wlpat=wlpattern, wlsort='f')
    return [(d['str'], d['freq']) for d in wl][start:][:max_ter]
Esempio n. 4
0
    def result(self, wlpat='', paginate=True, wlhash='', blhash=''):
        """
        Build the response data for one page of a word list.

        The list is computed by ``corplib.wordlist`` from the values stored
        in ``self.args``. If ``corplib.MissingSubCorpFreqFile`` is raised
        while the result is being built, ``freq_calc.build_arf_db`` is
        called and a response describing the pending calculation is
        returned instead.

        Besides returning data the method sets ``self.disabled_menu_items``,
        may overwrite ``self.args.wlpat`` and ``self.args.wlsort``, and
        registers the save menu items.

        Args:
            wlpat: only tested for emptiness; when empty,
                ``self.args.wlpat`` is set to ``'.*'``. The pattern actually
                used is always ``self.args.wlpat``.
            paginate: when true, at most ``wlpagesize * wlpage + 1`` items
                are requested and the extra look-ahead item is dropped from
                the returned page; otherwise ``sys.maxsize`` is used as the
                limit.
            wlhash: when non-empty, the white list words are loaded via
                ``self.load_bw_file(wlhash)`` instead of being taken from
                ``self.args.wlwords``.
            blhash: same as ``wlhash`` but for the black list
                (``self.args.blacklist``).

        Returns:
            A dict with (among others) the keys ``reload_args``,
            ``form_args``, ``Items``, ``lastpage``, ``wlattr_label``,
            ``freq_figure``, ``processing``, ``tasks``, ``SubcorpList`` and
            ``quick_save_row_limit``. On the ``MissingSubCorpFreqFile`` path
            ``Items`` is not set and ``lastpage`` is ``None``.
        """
        self.disabled_menu_items = (MainMenu.VIEW('kwic-sentence',
                                                  'structs-attrs'),
                                    MainMenu.FILTER, MainMenu.FREQUENCY,
                                    MainMenu.COLLOCATIONS,
                                    MainMenu.CONCORDANCE)
        if not wlpat:
            self.args.wlpat = '.*'
        if '.' in self.args.wlattr:
            # temporarily replaced; restored further below once the word
            # list has been calculated
            orig_wlnums = self.args.wlnums
            self.args.wlnums = self._wlnums2structattr(self.args.wlnums)

        if paginate:
            # one extra item is requested to detect whether a next page exists
            wlmaxitems = self.args.wlpagesize * self.args.wlpage + 1
        else:
            wlmaxitems = sys.maxsize
        wlstart = (self.args.wlpage - 1) * self.args.wlpagesize

        def reload_args(**extra):
            # list of (key, value) pairs reflecting the current self.args,
            # optionally extended by additional items
            items = {
                'corpname': self.args.corpname,
                'usesubcorp': self.args.usesubcorp,
                'wlattr': self.args.wlattr,
                'wlpat': self.args.wlpat,
                'wlminfreq': self.args.wlminfreq,
                'include_nonwords': self.args.include_nonwords,
                'wlsort': self.args.wlsort,
                'wlnums': self.args.wlnums
            }
            items.update(extra)
            return list(items.items())

        result = {
            'reload_args': reload_args(),
            'form_args':
            dict(wlattr=self.args.wlattr,
                 wlpat=self.args.wlpat,
                 wlsort=self.args.wlsort,
                 subcnorm=self.args.subcnorm,
                 wltype=self.args.wltype,
                 wlnums=self.args.wlnums,
                 wlminfreq=self.args.wlminfreq,
                 wlwords=self.args.wlwords,
                 blacklist=self.args.blacklist,
                 wlFileName='',
                 blFileName='',
                 includeNonwords=self.args.include_nonwords)
        }
        try:
            if hasattr(self, 'wlfile') and self.args.wlpat == '.*':
                self.args.wlsort = ''

            white_words = self.args.wlwords
            black_words = self.args.blacklist

            if wlhash != '':
                white_words = self.load_bw_file(wlhash)

            if blhash != '':
                black_words = self.load_bw_file(blhash)

            whitelist = [w for w in re.split(r'\s+', white_words.strip()) if w]
            blacklist = [w for w in re.split(r'\s+', black_words.strip()) if w]

            # words entered directly (no hash provided) are stored so they
            # can be referenced by a hash in reload_args
            if wlhash == '' and len(self.args.wlwords) > 0:
                wlhash = self.save_bw_file(self.args.wlwords)

            if blhash == '' and len(self.args.blacklist) > 0:
                blhash = self.save_bw_file(self.args.blacklist)

            # rebuilt because wlsort may have changed and the hashes are
            # known only now
            result['reload_args'] = reload_args(wlhash=wlhash, blhash=blhash)

            result_list = corplib.wordlist(
                corp=self.corp,
                words=whitelist,
                wlattr=self.args.wlattr,
                wlpat=self.args.wlpat,
                wlminfreq=self.args.wlminfreq,
                wlmaxitems=wlmaxitems,
                wlsort=self.args.wlsort,
                blacklist=blacklist,
                wlnums=self.args.wlnums,
                include_nonwords=self.args.include_nonwords)[wlstart:]
            if len(result_list) < self.args.wlpagesize + 1:
                result['lastpage'] = 1
            else:
                result['lastpage'] = 0
                if paginate:
                    # drop the look-ahead item
                    result_list = result_list[:-1]
            result['Items'] = result_list

            if '.' in self.args.wlattr:
                self.args.wlnums = orig_wlnums

            try:
                result['wlattr_label'] = (self.corp.get_conf(self.args.wlattr +
                                                             '.LABEL')
                                          or self.args.wlattr)
            except Exception as e:
                # best effort - fall back to the raw attribute name
                result['wlattr_label'] = self.args.wlattr
                logging.getLogger(__name__).warning(
                    'wlattr_label set failed: %s', e)

            result['freq_figure'] = translate(
                self.FREQ_FIGURES.get(self.args.wlnums, '?'))
            result['processing'] = None

            # The template is translated first and formatted afterwards so
            # that the message lookup is performed on the constant template
            # and not on a string which already contains the number.
            quick_save_hint = translate(
                'Saves at most {0} items. Use "Custom" for more options.'
            ).format(self.WORDLIST_QUICK_SAVE_MAX_LINES)
            for label, save_format in (('CSV', 'csv'), ('XLSX', 'xlsx'),
                                       ('XML', 'xml'), ('TXT', 'text')):
                self._add_save_menu_item(label,
                                         save_format=save_format,
                                         hint=quick_save_hint)
            self._add_save_menu_item(translate('Custom'))
            # custom save is solved in templates because of compatibility issues
            result['tasks'] = []
            result['SubcorpList'] = []
            result['quick_save_row_limit'] = self.WORDLIST_QUICK_SAVE_MAX_LINES
            self._export_subcorpora_list(self.args.corpname,
                                         self.args.usesubcorp, result)
            return result

        except corplib.MissingSubCorpFreqFile as e:
            # NOTE(review): self.args.wlnums is not restored to orig_wlnums
            # on this path - confirm whether that is intended.
            result.update({'attrname': self.args.cattr, 'tasks': []})
            out = freq_calc.build_arf_db(e.corpus, self.args.wlattr)
            if isinstance(out, list):
                result['tasks'].extend(out)
                processing = 0
            else:
                processing = out or 0
            result['quick_save_row_limit'] = self.WORDLIST_QUICK_SAVE_MAX_LINES
            result['wlattr'] = self.args.wlattr
            result['wlattr_label'] = ''
            result['processing'] = processing
            result['SubcorpList'] = []
            result['freq_figure'] = ''
            result['lastpage'] = None
            return result