Code example #1
    def read_xls_response(self, xls_response):
        data = excel_to_dict(xls_response.read())
        results = []
        for row in data:
            #print 'row:', row
            if row:
                try:
                    item = {
                        'pubnumber': row['Publication number'],
                        'pubdate': row['Publication date']
                                   and date_iso(from_german(row['Publication date'])) or None,
                        'appdate': row['Application date']
                                   and date_iso(from_german(row['Application date'])) or None,
                        'title': row['Title'],
                        'applicant': row['Applicant/Owner'],
                        'inventor': row['Inventor'],
                    }
                except KeyError as ex:
                    logger.error(
                        'Could not decode row from DEPATISnet. row={row}, exception={exception}\n{trace}'
                        .format(row=row,
                                exception=ex,
                                trace=_exception_traceback()))
                    raise
                results.append(item)

        return results
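The 'value and f(value) or None' construct used for the date fields is the classic pre-Python-2.5 stand-in for a conditional expression. A minimal, self-contained sketch of the clearer ternary equivalent; the strptime-based conversion below is only a stand-in for the project's from_german/date_iso helpers, which are not part of this excerpt:

from datetime import datetime

def german_to_iso(value):
    # Hedged stand-in: convert 'DD.MM.YYYY' to ISO, or return None for empty cells,
    # mirroring the null guard expressed above with the and/or idiom.
    return datetime.strptime(value, '%d.%m.%Y').date().isoformat() if value else None

# german_to_iso('24.12.2015') -> '2015-12-24'
# german_to_iso('')           -> None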
Code example #2
def sip_published_data_crawl_handler(request):
    """Crawl published-data at SIP"""

    # XML query expression
    query = request.params.get('expression', '')
    log.info('query raw: ' + query)

    if should_be_quoted(query):
        query = '"%s"' % query

    # constituents: abstract, biblio and/or full-cycle
    constituents = request.matchdict.get('constituents', 'full-cycle')
    #print 'constituents:', constituents

    chunksize = int(request.params.get('chunksize', '2500'))

    try:
        result = sip_published_data_crawl(constituents, query, chunksize)
        return result

    except Exception as ex:
        if hasattr(ex, 'user_info'):
            message = ex.user_info
        else:
            message = unicode(ex)
        request.errors.add('sip-crawl', 'crawl', message)
        log.error(request.errors)
        log.error(u'query="{0}", exception:\n{1}'.format(query, _exception_traceback()))
Code example #3
def ops_published_data_crawl_handler(request):
    """Crawl published-data at OPS"""

    # constituents: abstract, biblio and/or full-cycle
    constituents = request.matchdict.get('constituents', 'full-cycle')
    print('constituents:', constituents)

    # CQL query string
    query = request.params.get('expression', '')
    log.info('query raw: ' + query)

    # Transcode CQL query expression
    search = cql_prepare_query(query)

    # Propagate keywords to highlighting component
    keywords_to_response(request, search=search)

    log.info('query cql: ' + search.expression)

    chunksize = int(request.params.get('chunksize', '100'))

    try:
        result = ops_published_data_crawl(constituents, search.expression,
                                          chunksize)
        return result

    except Exception as ex:
        log.error('OPS crawler error: query="{0}", reason={1}, Exception was:\n{2}'.format(
            query, ex, _exception_traceback()))
        request.errors.add('ops-published-data-crawl', 'query', str(ex))
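All of these crawl handlers read the same inputs: the constituents selector from the route match ('abstract', 'biblio' or 'full-cycle'), plus the search expression and an optional chunksize from the query string. A hedged illustration with a made-up route layout (the actual URL scheme is not part of this excerpt):

# GET /published-data/crawl/full-cycle?expression=ti%3Dsolar&chunksize=100
#   request.matchdict['constituents']  -> 'full-cycle'
#   request.params['expression']       -> 'ti=solar'
#   request.params['chunksize']        -> '100'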
Code example #4
def depatech_published_data_crawl_handler(request):
    """Crawl published-data at MTC depa.tech"""

    # Get hold of query expression and filter
    query = SmartBunch({
        'expression': request.params.get('expression', ''),
        'filter':     request.params.get('filter', ''),
        })
    log.info('query: {}'.format(query))

    if should_be_quoted(query.expression):
        query.expression = '"%s"' % query.expression

    # constituents: abstract, biblio and/or full-cycle
    constituents = request.matchdict.get('constituents', 'full-cycle')
    #print 'constituents:', constituents

    chunksize = int(request.params.get('chunksize', '5000'))

    try:
        result = depatech_crawl(constituents, query, chunksize)
        return result

    except Exception as ex:
        request.errors.add('depatech-crawl', 'crawl', str(ex))
        log.error(request.errors)
        log.error('query="{0}", exception:\n{1}'.format(query, _exception_traceback()))
Code example #5
File: ops.py Project: zheyuan2025/ip-navigator
def ops_published_data_crawl_handler(request):
    """Crawl published-data at OPS"""

    # constituents: abstract, biblio and/or full-cycle
    constituents = request.matchdict.get('constituents', 'full-cycle')
    print 'constituents:', constituents

    # CQL query string
    query = request.params.get('expression', '')
    log.info('query raw: ' + query)

    query_object, query = cql_prepare_query(query)
    propagate_keywords(request, query_object)

    log.info('query cql: ' + query)

    chunksize = int(request.params.get('chunksize', '100'))

    try:
        result = ops_published_data_crawl(constituents, query, chunksize)
        return result

    except Exception as ex:
        log.error(u'OPS crawler error: query="{0}", reason={1}, Exception was:\n{2}'.format(
            query, ex, _exception_traceback()))
        request.errors.add('ops-published-data-crawl', 'query', str(ex))
Code example #6
def depatisnet_published_data_crawl_handler(request):
    """Crawl published-data at DEPATISnet"""

    search, options = prepare_search(request)

    chunksize = 1000
    options.update({'limit': chunksize})

    try:
        result = dpma_published_data_search(search.expression, options)
        return result

    except SyntaxError as ex:
        request.errors.add('depatisnet-search', 'expression', str(ex.msg))
        log.warn(request.errors)

    except Exception as ex:
        http_response = None
        if hasattr(ex, 'http_response'):
            http_response = ex.http_response
        log.error(
            u'DEPATISnet crawler error: query="{0}", reason={1}\nresponse:\n{2}\nexception:\n{3}'.format(
                search.expression, ex, http_response, _exception_traceback()))

        message = u'An exception occurred while processing your query<br/>Reason: {}'.format(
            ex)
        request.errors.add('depatisnet-search', 'crawl', message)
Code example #7
    def parse_expression_cql(self, expression):

        # Fixup query: Wrap into quotes if CQL expression is a) unspecific, b) contains spaces and c) is still unquoted
        if should_be_quoted(expression) and u'within' not in expression:
            expression = u'"%s"' % expression

        # Parse and recompile CQL query string to apply number normalization
        query_object = None
        try:

            # v1: Cheshire3 CQL parser
            #query_object = cql_parse(query)
            #query = query_object.toCQL().strip()

            # v2 pyparsing CQL parser
            query_object = CQL(expression,
                               grammar=self.grammar,
                               keyword_fields=self.keyword_fields).polish()
            query_recompiled = query_object.dumps()

            if query_recompiled and query_recompiled != expression:
                logger.info(
                    u'Recompiled search expression to "{query}"'.format(
                        query=query_recompiled))
                expression = query_recompiled

        except Exception as ex:
            # TODO: Can we get more details from diagnostic information to just stop here w/o propagating obviously wrong query to OPS?
            logger.warn(
                u'CQL parse error: query="{0}", reason={1}, Exception was:\n{2}'
                .format(expression, ex, _exception_traceback()))

        self.cql_parser = query_object
        self.expression = expression

        if query_object:

            keywords = []
            try:
                keywords = query_object.keywords()
                self.keywords_origin = 'grammar'

            except AttributeError:
                keywords = compute_keywords(query_object)
                self.keywords_origin = 'compute'

            # List of keywords should contain only unique items
            self.keywords = unique_sequence(keywords)
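The keyword list is deduplicated through unique_sequence, which is not shown in this excerpt; an order-preserving implementation along the following lines would match its usage here (a sketch, not necessarily the project's actual helper):

def unique_sequence(seq):
    # Keep only the first occurrence of each keyword, preserving order.
    seen = set()
    result = []
    for item in seq:
        if item not in seen:
            seen.add(item)
            result.append(item)
    return result

# unique_sequence(['solar', 'wind', 'solar']) -> ['solar', 'wind']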
Code example #8
File: __init__.py Project: zheyuan2025/ip-navigator
def handle_generic_exception(request, ex, backend_name, query):

    if isinstance(ex, cornice.util._JSONError):
        raise

    http_response = None
    if hasattr(ex, 'http_response'):
        http_response = ex.http_response

    module_name = ex.__class__.__module__
    class_name = ex.__class__.__name__
    reason = u'{}.{}: {}'.format(module_name, class_name, ex.message)

    log.critical(u'{backend_name} error: query="{query}", reason={reason}\nresponse:\n{http_response}\nexception:\n{exception}'.format(
        exception=_exception_traceback(), **locals()))

    message = u'An exception occurred while processing your query.<br/>\nReason: {}<br/><br/>\n'.format(reason)
    if module_name == 'pymongo.errors':
        message += 'Error connecting to cache database. Please report this problem to us.'

    return message
Code example #9
File: dpma.py Project: zheyuan2025/ip-navigator
def depatisnet_published_data_crawl_handler(request):
    """Crawl published-data at DEPATISnet"""

    # CQL query string
    query = request.params.get('expression', '')
    log.info('query raw: ' + query)

    query_object, query = cql_prepare_query(query)
    propagate_keywords(request, query_object)

    chunksize = 1000

    # Compute query options, like
    # - limit
    # - sorting
    # - whether to remove family members
    options = {}
    options.update({'limit': chunksize})

    # propagate request parameters to search options parameters
    request_to_options(request, options)

    log.info('query cql: ' + query)
    try:
        result = dpma_published_data_search(query, options)
        return result

    except SyntaxError as ex:
        request.errors.add('depatisnet-search', 'expression', str(ex.msg))
        log.warn(request.errors)

    except Exception as ex:
        http_response = None
        if hasattr(ex, 'http_response'):
            http_response = ex.http_response
        log.error(u'DEPATISnet crawler error: query="{0}", reason={1}\nresponse:\n{2}\nexception:\n{3}'.format(
            query, ex, http_response, _exception_traceback()))

        message = u'An exception occurred while processing your query<br/>Reason: {}'.format(ex)
        request.errors.add('depatisnet-search', 'crawl', message)
Code example #10
    def config_parameters(self):

        request = get_current_request()

        # prefix environment and settings in configuration model
        environment = dict_prefix_key(self.environment(), 'request.')
        setting_params = dict_prefix_key(self.config_settings(), 'setting.')
        request_params = dict(request.params)
        user_params = {}
        if request.user:

            # Formulate JS-domain settings
            user_params = dict_prefix_key({
                'modules': request.user.modules,
                'tags': request.user.tags},
                'user.')

            # Get representation of user attributes
            user_dict = json.loads(request.user.to_json())

            # Strip sensitive information
            if '_id' in user_dict:
                del user_dict['_id']
            if 'password' in user_dict:
                del user_dict['password']
            if 'upstream_credentials' in user_dict:
                del user_dict['upstream_credentials']

            # Add whole user attributes to JS-domain
            user_params['user'] = user_dict

        request_opaque = dict(request.opaque)
        request_opaque_meta = dict_prefix_key(dict(request.opaque_meta), 'opaque.meta.')

        try:
            unixtime = request.opaque_meta.get('exp')
            if unixtime:
                request_opaque['link_expires'] = datetime_isoformat(unixtime_to_datetime(int(unixtime)))
        except Exception as ex:
            log.error(
                'Could not compute opaque parameter link expiry time, unixtime=%s. '
                'Exception was: %s\n%s', unixtime, ex, _exception_traceback())

        # A. parameter firewall, INPUT

        # determine if we're in view-only mode by matching against the hostname
        host = request.headers.get('Host', '')
        isviewer = 'patentview' in host or 'viewer' in host or 'patview' in host

        # 1. don't allow "query" from outside on view-only domains
        if 'query' in request_params and isviewer:
            log.warn('parameter "query=%s" not allowed on this vhost, purging it', request_params['query'])
            del request_params['query']


        # B. merge parameters
        # 1. use "environment" as foundation (prefixed "request.")
        # 2. merge "settings" (prefixed "setting.")
        # 3. merge "opaque meta" parameters (prefixed "opaque.meta.")
        # 4. merge "request parameters"
        # 5. merge "user parameters"
        # 6. merge "opaque parameters" taking the highest precedence
        params = {}
        params['system'] = self.datasource_settings()
        params.update(environment)
        params.update(setting_params)
        params.update(request_opaque_meta)
        params.update(request_params)
        params.update(user_params)
        params.update(request_opaque)


        # C. parameter firewall, OUTPUT

        # remove "opaque parameter"
        if 'op' in params:
            del params['op']


        # D. special customizations

        # 0. Vendor
        params['vendor'] = self.vendor.name

        # 1. On patentview domains, limit access to liveview mode only
        params['isviewer'] = isviewer
        if isviewer:
            params['mode'] = 'liveview'

        # 2. Compute whether data sources are enabled
        params['datasources_enabled'] = []
        for datasource in self.registry.datasource_settings.datasources:
            if self.is_datasource_enabled(datasource):
                params['datasources_enabled'].append(datasource)

        # E. backward-compat amendments
        for key, value in list(params.items()):
            if key.startswith('ship_'):
                newkey = key.replace('ship_', 'ship-')
                params[newkey] = value
                del params[key]

        return params
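Several of the inputs above are namespaced with dict_prefix_key before merging. The helper is not part of this excerpt, but its usage suggests it simply prepends a prefix to every key; a minimal sketch under that assumption:

def dict_prefix_key(data, prefix):
    # Return a copy of `data` with `prefix` prepended to every key.
    return {prefix + key: value for key, value in data.items()}

# dict_prefix_key({'modules': ['analytics'], 'tags': ['staff']}, 'user.')
# -> {'user.modules': ['analytics'], 'user.tags': ['staff']}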
Code example #11
File: expression.py Project: herrkrueger/patzilla
    def pair_to_solr(cls, key, value, modifiers=None):

        try:
            fieldname = cls.datasource_indexnames[key]
        except KeyError:
            return

        expression = None
        format = u'{0}:{1}'


        # ------------------------------------------
        #   value mogrifiers
        # ------------------------------------------
        if key == 'patentnumber':
            # TODO: parse more sophisticated to make things like "EP666666 or EP666667" or "?query=pn%3AEP666666&datasource=ifi" possible
            # TODO: use different normalization flavor for IFI, e.g. JP01153210A will not work as JPH01153210A, which is required by OPS
            value = normalize_patent(value, for_ops=False)

        elif key == 'pubdate':

            """
            - pd:[19800101 TO 19851231]
            - pd:[* TO 19601231]
            - pdyear:[1980 TO 1985]
            - pdyear:[* TO 1960]
            """

            try:

                parsed = False

                # e.g. 1991
                if len(value) == 4 and value.isdigit():
                    fieldname = 'pdyear'
                    parsed = True

                # e.g. 1990-2014, 1990 - 2014
                value = year_range_to_within(value)

                # e.g.
                # within 1978,1986
                # within 1900,2009-08-20
                # within 2009-08-20,2011-03-03
                if 'within' in value:
                    within_dates = parse_date_within(value)
                    elements_are_years = all([len(v) == 4 and v.isdigit() for v in within_dates.values()])
                    if elements_are_years:
                        fieldname = 'pdyear'

                    else:
                        if within_dates['startdate']:
                            within_dates['startdate'] = parse_date_universal(within_dates['startdate']).format('YYYYMMDD')

                        if within_dates['enddate']:
                            within_dates['enddate'] = parse_date_universal(within_dates['enddate']).format('YYYYMMDD')

                    if not within_dates['startdate']:
                        within_dates['startdate'] = '*'

                    if not within_dates['enddate']:
                        within_dates['enddate'] = '*'

                    expression = '{fieldname}:[{startdate} TO {enddate}]'.format(fieldname=fieldname, **within_dates)

                elif not parsed:
                    value_date = parse_date_universal(value)
                    if value_date:
                        value = value_date.format('YYYYMMDD')
                    else:
                        raise ValueError(value)

            except Exception as ex:
                message = 'IFI CLAIMS query: Invalid date or range expression "{0}". Reason: {1}.'.format(value, ex)
                logger.warn(message + '\nException was:\n{0}'.format(_exception_traceback()))
                return {'error': True, 'message': message}

        elif key == 'inventor' or key == 'applicant':
            if not has_booleans(value) and should_be_quoted(value):
                value = u'"{0}"'.format(value)

        elif key == 'class':

            # v1: Naive implementation can only handle single values
            #value = ifi_convert_class(value)

            # v2: Advanced implementation can handle expressions on field "class"
            # Translate class expression from "H04L12/433 or H04L12/24"
            # to "(ic:H04L0012433 OR cpc:H04L0012433) OR (ic:H04L001224 OR cpc:H04L001224)"
            try:

                # Put value into parenthesis, to properly capture expressions
                if value:
                    value = u'({value})'.format(value=value)

                # Parse value as simple query expression
                query_object = CQL(cql=value)

                # Rewrite all patent classifications in query expression ast from OPS format to IFI format
                rewrite_classes_ifi(query_object, format, fieldname)

                # Serialize into appropriate upstream datasource query expression syntax
                expression = query_object.dumps()

            except pyparsing.ParseException as ex:
                return {'error': True, 'message': '<pre>' + str(ex.explanation) + '</pre>'}


        # ------------------------------------------
        #   surround with parentheses
        # ------------------------------------------
        if key in ['fulltext', 'inventor', 'applicant', 'country', 'citation']:
            if has_booleans(value) and not should_be_quoted(value) and not '{!complexphrase' in value:
                value = u'({0})'.format(value)

        # ------------------------------------------
        #   expression formatter
        # ------------------------------------------
        # Serialize into appropriate upstream datasource query expression syntax
        if not expression:
            if key == 'fulltext' and '{!complexphrase' in value:
                expression = value
            else:
                expression = format_expression(format, fieldname, value)
            #print 'expression:', expression

        # ------------------------------------------
        #   final polishing
        # ------------------------------------------
        # Solr(?) syntax: boolean operators must be uppercase
        if has_booleans(expression):
            boolis = [' or ', ' and ', ' not ']
            for booli in boolis:
                expression = expression.replace(booli, booli.upper())

        return {'query': expression}
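To make the pubdate branch concrete, a few illustrative translations derived from the docstring above. These calls are hypothetical and the exact output depends on helpers not shown here (year_range_to_within, parse_date_universal, format_expression):

# pair_to_solr('pubdate', 'within 1978,1986')
#   -> {'query': 'pdyear:[1978 TO 1986]'}         # both range elements are plain years
# pair_to_solr('pubdate', 'within 2009-08-20,2011-03-03')
#   -> {'query': 'pd:[20090820 TO 20110303]'}     # full dates, normalized to YYYYMMDD
# pair_to_solr('pubdate', 'not-a-date')
#   -> {'error': True, 'message': 'IFI CLAIMS query: Invalid date or range expression ...'}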
Code example #12
File: api.py Project: herrkrueger/patzilla
def ificlaims_download_multi(numberlist, formats):

    logger.info('ificlaims_download_multi: numberlist={numberlist}, formats={formats}'.format(
        **locals()))

    report = OrderedDict()
    results = []

    for number in numberlist:

        report.setdefault(number, OrderedDict({'format': OrderedDict()}))

        for format in formats:

            format_parts = format.split(u':')

            # decode modifiers
            if len(format_parts) == 1:
                format_real = format
                modifiers = []
            else:
                format_real = format_parts[0]
                modifiers = format_parts[1:]

            # initialize availability status
            report[number]['format'][format_real] = False

            # compute options
            options = {}
            if 'pretty' in modifiers:
                options['pretty'] = True

            # collect nested documents, i.e. multiple drawings
            if format_real in ['tif', 'png']:
                count = 0
                try:
                    result_first = ificlaims_download_single(
                        number, format_real, options)
                except Exception as ex:
                    logger.error('IFI: {ex}\n{traceback}'.format(
                        ex=ex, traceback=_exception_traceback()))
                    continue

                if result_first:
                    report[number]['format'][format_real] = True
                    report[number]['ucid'] = result_first.ucid
                    report[number]['ucid-natural'] = result_first.ucid_natural
                    results.append(result_first.__dict__)
                    count += 1

                    # fetch more drawings until exhaust
                    for seq in range(2, 50):
                        options['seq'] = seq
                        try:
                            result_next = ificlaims_download_single(
                                number, format_real, options)
                        except Exception as ex:
                            logger.error('IFI: {ex}\n{traceback}'.format(
                                ex=ex, traceback=_exception_traceback()))
                            break

                        if not result_next:
                            break

                        results.append(result_next.__dict__)
                        count += 1

                report[number].setdefault('count', OrderedDict())
                report[number]['count'][format_real] = count

            else:
                try:
                    result_single = ificlaims_download_single(
                        number, format_real, options)

                except Exception as ex:
                    logger.error('IFI: {ex}\n{traceback}'.format(
                        ex=ex, traceback=_exception_traceback()))
                    continue

                if result_single:
                    report[number]['format'][format_real] = True
                    report[number]['ucid'] = result_single.ucid
                    report[number]['ucid-natural'] = result_single.ucid_natural
                    results.append(result_single.__dict__)

    response = {
        'report': report,
        'results': results,
    }
    return response
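The format strings may carry modifiers separated by colons, e.g. 'pdf:pretty'. A hedged, runnable sketch of that decoding step in isolation; it only mirrors the 'pretty' modifier handled above:

def decode_format(format):
    # Split 'pdf:pretty' into the real format and its modifiers.
    parts = format.split(u':')
    format_real, modifiers = parts[0], parts[1:]
    options = {'pretty': True} if 'pretty' in modifiers else {}
    return format_real, options

# decode_format(u'pdf:pretty') -> (u'pdf', {'pretty': True})
# decode_format(u'xml')        -> (u'xml', {})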
Code example #13
File: __init__.py Project: zheyuan2025/ip-navigator
def cql_prepare_query(query, grammar=None, keyword_fields=None):

    log.info(u'Parsing search expression "{query}" with grammar "{grammar}"'.format(
        query=query, grammar=grammar and grammar.__name__ or u'default'))

    keyword_fields = keyword_fields or ops_keyword_fields + DpmaDepatisnetAccess.keyword_fields

    # fixup query: wrap into quotes if cql string is a) unspecific, b) contains spaces and c) is still unquoted
    if should_be_quoted(query) and u'within' not in query:
        query = u'"%s"' % query

    # Parse and recompile CQL query string to apply number normalization
    query_object = None
    try:

        # v1: Cheshire3 CQL parser
        #query_object = cql_parse(query)
        #query = query_object.toCQL().strip()

        # v2 pyparsing CQL parser
        query_object = CQL(query, grammar=grammar, keyword_fields=keyword_fields).polish()
        query_recompiled = query_object.dumps()

        if query_recompiled:
            query = query_recompiled

    except Exception as ex:
        # TODO: can we get more details from diagnostic information to just stop here w/o propagating obviously wrong query to OPS?
        log.warn(u'CQL parse error: query="{0}", reason={1}, Exception was:\n{2}'.format(query, ex, _exception_traceback()))

    return query_object, query
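The handlers in examples #5 and #9 consume this helper as follows; the expression is a hypothetical example, and the recompiled form depends on the grammar and keyword fields in use:

query_object, query = cql_prepare_query(u'ti=solar and pa=siemens')
# `query` now carries the recompiled (number-normalized) CQL string,
# `query_object` the parsed representation later used for keyword propagation.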
Code example #14
    def pair_to_sip_xml(cls, key, value, modifiers):

        # reformat modifiers to lower booleans
        # {u'fulltext': {u'claim': True, u'abstract': True, u'description': True, u'title': True}
        # ->
        # {u'fulltext': {u'claim': 'true', u'abstract': 'true', u'description': 'true', u'title': 'true'}
        for modifier_field, modifier_values in modifiers.iteritems():
            if type(modifiers[modifier_field]) is types.DictionaryType:
                for modifier_name, modifier_value in modifiers[modifier_field].iteritems():
                    modifiers[modifier_field][modifier_name] = str(modifier_value).lower()
            elif type(modifiers[modifier_field]) is types.BooleanType:
                modifiers[modifier_field] = str(modifiers[modifier_field]).lower()

        xml_part = None
        keywords = None

        if key == 'pubdate':

            try:

                if len(value) == 4 and value.isdigit():
                    # e.g. 1978
                    value = u'within {year}-01-01,{year}-12-31'.format(
                        year=value)

                # e.g. 1990-2014, 1990 - 2014
                value = year_range_to_within(value)

                if 'within' in value:
                    try:
                        within_dates = parse_date_within(value)
                    except Exception:
                        raise ValueError('Could not parse "within" expression')

                    if len(within_dates['startdate']) == 4 and within_dates['startdate'].isdigit():
                        within_dates['startdate'] = within_dates['startdate'] + '-01-01'
                    if len(within_dates['enddate']) == 4 and within_dates['enddate'].isdigit():
                        within_dates['enddate'] = within_dates['enddate'] + '-12-31'

                    if all(within_dates.values()):
                        template = cls.sip_xml_expression_templates[key]['both']
                    elif within_dates['startdate']:
                        template = cls.sip_xml_expression_templates[key]['startdate']
                    # API not capable of handling "enddate"-only attribute
                    #elif within_dates['enddate']:
                    #    template = cls.sip_xml_expression_templates[key]['enddate']
                    else:
                        raise ValueError(
                            'SIP cannot handle date ranges with end date only')

                    xml_part = template.format(
                        startdate=iso_to_german(within_dates['startdate']),
                        enddate=iso_to_german(within_dates['enddate']))

                else:
                    template = cls.sip_xml_expression_templates[key]['both']
                    xml_part = template.format(startdate=iso_to_german(value),
                                               enddate=iso_to_german(value))

            except Exception as ex:
                message = 'SIP query: Invalid date or range expression "{0}". Reason: {1}'.format(
                    value, ex)
                logger.warn(
                    message +
                    ' Exception was: {0}'.format(_exception_traceback()))
                return {'error': True, 'message': message}

        elif key == 'country':

            if ' and ' in value.lower():
                message = 'SIP query: Concatenating offices with "AND" would yield zero results'
                logger.warn(message)
                return {'error': True, 'message': message}

            entries = re.split(' or ', value, flags=re.IGNORECASE)
            entries = [entry.strip() for entry in entries]
            ccids = []
            for country in entries:
                country = country.upper()
                sip_country = SipCountry.objects(cc=country).first()
                if sip_country:
                    sip_ccid = sip_country.ccid
                    ccids.append(sip_ccid)
                else:
                    message = 'SIP query: Country "{0}" could not be resolved'.format(
                        country)
                    logger.warn(message)
                    return {'error': True, 'message': message}

            if ccids:
                xml_part = '<country>\n' + '\n'.join([
                    '<ccid>{ccid}</ccid>'.format(ccid=ccid) for ccid in ccids
                ]) + '\n</country>'

        elif key == 'class':

            try:
                expression = SipCqlClass(value)
                xml_part = expression.dumpxml()

                # debugging
                #print '-' * 42
                #print pretty_print(xml_part)

            except ClassDecodingError as ex:
                return {'error': True, 'message': str(ex)}

            except pyparsing.ParseException as ex:
                return {
                    'error': True,
                    'message': '<pre>' + str(ex.explanation) + '</pre>'
                }

        elif key == 'fulltext':
            """
            parse cql subexpression (possible fields are ti, ab, de, cl, bi) and map to SIP syntax
            """

            try:
                expression = SipCqlFulltext(value,
                                            modifiers=modifiers.get(key, {}))
                xml_part = expression.dumpxml()
                keywords = expression.keywords()

                # debugging
                #print '-' * 42
                #print pretty_print(xml_part)

            except FulltextDecodingError as ex:
                return {'error': True, 'message': unicode(ex)}

            except pyparsing.ParseException as ex:
                return {
                    'error': True,
                    'message': u'<pre>' + ex.explanation + '</pre>'
                }

            except SyntaxError as ex:
                return {
                    'error': True,
                    'message': u'<pre>' + unicode(ex) + '</pre>'
                }

        elif key in cls.sip_xml_expression_templates:
            template = cls.sip_xml_expression_templates[key]

            if key == 'patentnumber':
                value = value.upper()

            xml_part = template.format(key=key,
                                       value=value.strip(),
                                       **modifiers.get(key, {}))

        else:
            logger.warn('SIP query: Could not handle pair {0}={1}'.format(
                key, value))

        response = {}
        if xml_part:
            response = {'query': xml_part}

        if keywords:
            response.update({'keywords': keywords})

        return response
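For the pubdate branch, a hedged stand-in for the iso_to_german conversion it relies on; the DD.MM.YYYY output format is an assumption suggested by the from_german counterpart in example #1, not something shown in this excerpt:

from datetime import datetime

def iso_to_german(value):
    # Assumed behaviour: render an ISO date in the German DD.MM.YYYY notation.
    return datetime.strptime(value, '%Y-%m-%d').strftime('%d.%m.%Y')

# A plain year '1978' is first expanded to 'within 1978-01-01,1978-12-31' above,
# which would then yield startdate='01.01.1978' and enddate='31.12.1978'.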
Code example #15
    def pair_to_elasticsearch(cls, key, value, modifiers=None):

        try:
            fieldname = cls.datasource_indexnames[key]
        except KeyError:
            return

        expression = None
        format = u'{0}:{1}'

        # ------------------------------------------
        #   value mogrifiers
        # ------------------------------------------
        if key == 'patentnumber':

            # Transform into distinct fields PC, DE, KI

            #if has_booleans(value):
            #    value = '({})'.format(value)

            expression_parts = []

            # Publication number
            patent = split_patent_number(value)

            patent_normalized = normalize_patent(patent, for_ops=False)
            if patent_normalized:
                patent = patent_normalized

            if patent:
                subexpression = u'PC:{country} AND DE:{number}'.format(
                    **patent)
                if patent['kind']:
                    subexpression += u' AND KI:{kind}'.format(**patent)
                expression_parts.append(u'({})'.format(subexpression))

            # Application number
            subexpression = u'AN:{}'.format(value)
            expression_parts.append(subexpression)

            # Priority number
            subexpression = u'NP:{}'.format(value)
            expression_parts.append(subexpression)

            expression = u' OR '.join(expression_parts)

        elif key == 'pubdate':
            """
            - DP:[19800101 TO 19851231]
            - DP:[* TO 19601231]
            """

            try:

                parsed = False

                # e.g. 1991
                if len(value) == 4 and value.isdigit():
                    value = u'within {}0101,{}1231'.format(value, value)

                # e.g. 1990-2014, 1990 - 2014
                value = year_range_to_within(value)

                # e.g.
                # within 1978,1986
                # within 1900,2009-08-20
                # within 2009-08-20,2011-03-03
                if 'within' in value:
                    within_dates = parse_date_within(value)

                    if within_dates['startdate']:
                        if len(within_dates['startdate']) == 4:
                            within_dates['startdate'] += '0101'
                        within_dates['startdate'] = parse_date_universal(
                            within_dates['startdate']).format('YYYYMMDD')
                    else:
                        within_dates['startdate'] = '*'

                    if within_dates['enddate']:
                        if len(within_dates['enddate']) == 4:
                            within_dates['enddate'] += '1231'
                        within_dates['enddate'] = parse_date_universal(
                            within_dates['enddate']).format('YYYYMMDD')
                    else:
                        within_dates['enddate'] = '*'

                    expression = '{fieldname}:[{startdate} TO {enddate}]'.format(
                        fieldname=fieldname, **within_dates)

                elif not parsed:
                    value_date = parse_date_universal(value)
                    if value_date:
                        value = value_date.format('YYYYMMDD')
                    else:
                        raise ValueError(value)

            except Exception as ex:
                message = 'depatech query: Invalid date or range expression "{0}". Reason: {1}.'.format(
                    value, ex)
                logger.warn(
                    message +
                    ' Exception was: {0}'.format(_exception_traceback()))
                return {'error': True, 'message': message}

        elif key == 'inventor' or key == 'applicant':
            if not has_booleans(value) and should_be_quoted(value):
                value = u'"{0}"'.format(value)

        elif key == 'class':

            # v1: Naive implementation can only handle single values
            #value = lucene_convert_class(value)

            # v2: Advanced implementation can handle expressions on field "class"
            # Translate class expression from "H04L12/433 or H04L12/24"
            # to "(ic:H04L0012433 OR cpc:H04L0012433) OR (ic:H04L001224 OR cpc:H04L001224)"
            try:

                # Put value into parenthesis, to properly capture expressions
                if value:
                    value = u'({value})'.format(value=value)

                # Parse value as simple query expression
                query_object = CQL(cql=value)

                # Rewrite all patent classifications in query expression ast from OPS format to Lucene format
                rewrite_classes_lucene(query_object, format, fieldname)

                # Serialize into appropriate upstream datasource query expression syntax
                expression = query_object.dumps()

            except pyparsing.ParseException as ex:
                return {
                    'error': True,
                    'message': '<pre>' + str(ex.explanation) + '</pre>'
                }

        elif key == 'country':
            value = value.upper()

        # ------------------------------------------
        #   surround with parentheses
        # ------------------------------------------
        if key in ['fulltext', 'inventor', 'applicant', 'country', 'citation']:
            if has_booleans(value) and not should_be_quoted(value):
                value = u'({0})'.format(value)

        # ------------------------------------------
        #   expression formatter
        # ------------------------------------------
        # Serialize into appropriate upstream datasource query expression syntax
        if not expression:
            expression = format_expression(format, fieldname, value)
            #print 'expression:', expression

        # ------------------------------------------
        #   final polishing
        # ------------------------------------------
        # Solr(?) syntax: boolean operators must be uppercase
        if has_booleans(expression):
            boolis = [' or ', ' and ', ' not ']
            for booli in boolis:
                expression = expression.replace(booli, booli.upper())

        return {'query': expression}
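To visualize the patentnumber branch: assuming split_patent_number decomposes a document number into country, number and kind parts (the helper is not shown here), a hypothetical input would be expanded roughly as follows:

# pair_to_elasticsearch('patentnumber', u'EP1000000A1') might yield approximately:
# {'query': u'(PC:EP AND DE:1000000 AND KI:A1) OR AN:EP1000000A1 OR NP:EP1000000A1'}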