Ejemplo n.º 1
0
    def pair_to_solr(cls, key, value, modifiers=None):
        """
        Convert a single ``key``/``value`` search criterion into a query
        expression string for the upstream search syntax.

        The upstream field name is looked up in ``cls.datasource_indexnames``.
        ``modifiers`` is accepted but not used by this method.

        Returns:
            ``None`` when ``key`` has no entry in ``cls.datasource_indexnames``;
            ``{'error': True, 'message': ...}`` when a date/range expression or
            a classification expression cannot be processed;
            ``{'query': expression}`` otherwise.
        """

        try:
            fieldname = cls.datasource_indexnames[key]
        except KeyError:
            return

        expression = None
        # Named "expression_template" instead of "format" to avoid shadowing the builtin.
        expression_template = u'{0}:{1}'


        # ------------------------------------------
        #   value mogrifiers
        # ------------------------------------------
        if key == 'patentnumber':
            # TODO: parse more sophisticated to make things like "EP666666 or EP666667" or "?query=pn%3AEP666666&datasource=ifi" possible
            # TODO: use different normalization flavor for IFI, e.g. JP01153210A will not work as JPH01153210A, which is required by OPS
            value = normalize_patent(value, for_ops=False)

        elif key == 'pubdate':

            # Examples of the expressions produced by this branch:
            # - pd:[19800101 TO 19851231]
            # - pd:[* TO 19601231]
            # - pdyear:[1980 TO 1985]
            # - pdyear:[* TO 1960]

            try:

                parsed = False

                # e.g. 1991
                if len(value) == 4 and value.isdigit():
                    fieldname = 'pdyear'
                    parsed = True

                # e.g. 1990-2014, 1990 - 2014
                value = year_range_to_within(value)

                # e.g.
                # within 1978,1986
                # within 1900,2009-08-20
                # within 2009-08-20,2011-03-03
                if 'within' in value:
                    within_dates = parse_date_within(value)

                    # The loop variable must not be called "value": on Python 2,
                    # list comprehension variables leak into the enclosing scope,
                    # which would overwrite the user's expression that is reported
                    # by the error handler below.
                    elements_are_years = all([
                        len(item) == 4 and item.isdigit()
                        for item in within_dates.values()
                    ])
                    if elements_are_years:
                        fieldname = 'pdyear'

                    else:
                        if within_dates['startdate']:
                            within_dates['startdate'] = parse_date_universal(within_dates['startdate']).format('YYYYMMDD')

                        if within_dates['enddate']:
                            within_dates['enddate'] = parse_date_universal(within_dates['enddate']).format('YYYYMMDD')

                    # Open-ended ranges use the wildcard on the missing side
                    if not within_dates['startdate']:
                        within_dates['startdate'] = '*'

                    if not within_dates['enddate']:
                        within_dates['enddate'] = '*'

                    expression = '{fieldname}:[{startdate} TO {enddate}]'.format(fieldname=fieldname, **within_dates)

                elif not parsed:
                    # Neither a plain year nor a range: treat as a single date
                    value_date = parse_date_universal(value)
                    if value_date:
                        value = value_date.format('YYYYMMDD')
                    else:
                        raise ValueError(value)

            except Exception as ex:
                message = 'IFI CLAIMS query: Invalid date or range expression "{0}". Reason: {1}.'.format(value, ex)
                logger.warn(message + '\nException was:\n{0}'.format(_exception_traceback()))
                return {'error': True, 'message': message}

        elif key == 'inventor' or key == 'applicant':
            if not has_booleans(value) and should_be_quoted(value):
                value = u'"{0}"'.format(value)

        elif key == 'class':

            # v1: Naive implementation can only handle single values
            #value = ifi_convert_class(value)

            # v2: Advanced implementation can handle expressions on field "class"
            # Translate class expression from "H04L12/433 or H04L12/24"
            # to "(ic:H04L0012433 OR cpc:H04L0012433) OR (ic:H04L001224 OR cpc:H04L001224)"
            try:

                # Put value into parenthesis, to properly capture expressions
                if value:
                    value = u'({value})'.format(value=value)

                # Parse value as simple query expression
                query_object = CQL(cql=value)

                # Rewrite all patent classifications in query expression ast from OPS format to IFI format
                rewrite_classes_ifi(query_object, expression_template, fieldname)

                # Serialize into appropriate upstream datasource query expression syntax
                expression = query_object.dumps()

            except pyparsing.ParseException as ex:
                return {'error': True, 'message': '<pre>' + str(ex.explanation) + '</pre>'}


        # ------------------------------------------
        #   surround with parentheses
        # ------------------------------------------
        if key in ['fulltext', 'inventor', 'applicant', 'country', 'citation']:
            if has_booleans(value) and not should_be_quoted(value) and '{!complexphrase' not in value:
                value = u'({0})'.format(value)

        # ------------------------------------------
        #   expression formatter
        # ------------------------------------------
        # Serialize into appropriate upstream datasource query expression syntax
        if not expression:
            if key == 'fulltext' and '{!complexphrase' in value:
                # Complex phrase queries are passed through verbatim
                expression = value
            else:
                expression = format_expression(expression_template, fieldname, value)

        # ------------------------------------------
        #   final polishing
        # ------------------------------------------
        # Solr(?) syntax: boolean operators must be uppercase
        if has_booleans(expression):
            boolis = [' or ', ' and ', ' not ']
            for booli in boolis:
                expression = expression.replace(booli, booli.upper())

        return {'query': expression}
Ejemplo n.º 2
0
    def pair_to_sip_xml(cls, key, value, modifiers=None):
        """
        Convert a single ``key``/``value`` search criterion into an XML
        fragment for a SIP query.

        ``modifiers`` maps field names to either a boolean or a dictionary of
        booleans. It is normalized IN PLACE: every boolean is replaced by its
        lowercase string form (``True`` -> ``'true'``), so the caller's
        dictionary is modified. ``None`` is treated as "no modifiers".

        Returns:
            ``{'error': True, 'message': ...}`` when the value cannot be
            processed; otherwise a dictionary carrying ``'query'`` (the XML
            fragment, when one was produced) and ``'keywords'`` (for fulltext
            expressions, when any were found). The dictionary is empty when
            the pair could not be handled at all.
        """

        if modifiers is None:
            modifiers = {}

        # reformat modifiers to lower booleans
        # {u'fulltext': {u'claim': True, u'abstract': True, u'description': True, u'title': True}
        # ->
        # {u'fulltext': {u'claim': 'true', u'abstract': 'true', u'description': 'true', u'title': 'true'}
        # Iterate over snapshots, because values are reassigned while looping.
        for modifier_field, modifier_value in list(modifiers.items()):
            if isinstance(modifier_value, dict):
                for modifier_name, flag in list(modifier_value.items()):
                    modifier_value[modifier_name] = str(flag).lower()
            elif isinstance(modifier_value, bool):
                modifiers[modifier_field] = str(modifier_value).lower()

        xml_part = None
        keywords = None

        if key == 'pubdate':

            try:

                if len(value) == 4 and value.isdigit():
                    # e.g. 1978
                    value = u'within {year}-01-01,{year}-12-31'.format(
                        year=value)

                # e.g. 1990-2014, 1990 - 2014
                value = year_range_to_within(value)

                if 'within' in value:
                    try:
                        within_dates = parse_date_within(value)
                    except Exception:
                        # Deliberately narrower than a bare "except:", which
                        # would also swallow KeyboardInterrupt and SystemExit.
                        raise ValueError('Could not parse "within" expression')

                    # Expand bare years to the first/last day of that year
                    if len(within_dates['startdate']
                           ) == 4 and within_dates['startdate'].isdigit():
                        within_dates[
                            'startdate'] = within_dates['startdate'] + '-01-01'
                    if len(within_dates['enddate']
                           ) == 4 and within_dates['enddate'].isdigit():
                        within_dates[
                            'enddate'] = within_dates['enddate'] + '-12-31'

                    if all(within_dates.values()):
                        template = cls.sip_xml_expression_templates[key][
                            'both']
                    elif within_dates['startdate']:
                        template = cls.sip_xml_expression_templates[key][
                            'startdate']
                    # API not capable of handling "enddate"-only attribute
                    #elif within_dates['enddate']:
                    #    template = cls.sip_xml_expression_templates[key]['enddate']
                    else:
                        raise ValueError(
                            'SIP cannot handle date ranges with end date only')

                    xml_part = template.format(
                        startdate=iso_to_german(within_dates['startdate']),
                        enddate=iso_to_german(within_dates['enddate']))

                else:
                    # Single date: use it as both start and end of the range
                    template = cls.sip_xml_expression_templates[key]['both']
                    xml_part = template.format(startdate=iso_to_german(value),
                                               enddate=iso_to_german(value))

            except Exception as ex:
                message = 'SIP query: Invalid date or range expression "{0}". Reason: {1}'.format(
                    value, ex)
                logger.warn(
                    message +
                    ' Exception was: {0}'.format(_exception_traceback()))
                return {'error': True, 'message': message}

        elif key == 'country':

            if ' and ' in value.lower():
                message = 'SIP query: Concatenating offices with "AND" would yield zero results'
                logger.warn(message)
                return {'error': True, 'message': message}

            entries = re.split(' or ', value, flags=re.IGNORECASE)
            entries = [entry.strip() for entry in entries]
            ccids = []
            for country in entries:
                country = country.upper()
                # Resolve the country code to the identifier stored on SipCountry
                sip_country = SipCountry.objects(cc=country).first()
                if sip_country:
                    sip_ccid = sip_country.ccid
                    ccids.append(sip_ccid)
                else:
                    message = 'SIP query: Country "{0}" could not be resolved'.format(
                        country)
                    logger.warn(message)
                    return {'error': True, 'message': message}

            if ccids:
                xml_part = '<country>\n' + '\n'.join([
                    '<ccid>{ccid}</ccid>'.format(ccid=ccid) for ccid in ccids
                ]) + '\n</country>'

        elif key == 'class':

            try:
                expression = SipCqlClass(value)
                xml_part = expression.dumpxml()

                # debugging
                #print '-' * 42
                #print pretty_print(xml_part)

            except ClassDecodingError as ex:
                return {'error': True, 'message': str(ex)}

            except pyparsing.ParseException as ex:
                return {
                    'error': True,
                    'message': '<pre>' + str(ex.explanation) + '</pre>'
                }

        elif key == 'fulltext':
            # parse cql subexpression (possible fields are ti, ab, de, cl, bi) and map to SIP syntax

            try:
                expression = SipCqlFulltext(value,
                                            modifiers=modifiers.get(key, {}))
                xml_part = expression.dumpxml()
                keywords = expression.keywords()

                # debugging
                #print '-' * 42
                #print pretty_print(xml_part)

            except FulltextDecodingError as ex:
                return {'error': True, 'message': unicode(ex)}

            except pyparsing.ParseException as ex:
                return {
                    'error': True,
                    'message': u'<pre>' + ex.explanation + '</pre>'
                }

            except SyntaxError as ex:
                return {
                    'error': True,
                    'message': u'<pre>' + unicode(ex) + '</pre>'
                }

        elif key in cls.sip_xml_expression_templates:
            # Generic case: fill the template registered for this key
            template = cls.sip_xml_expression_templates[key]

            if key == 'patentnumber':
                value = value.upper()

            xml_part = template.format(key=key,
                                       value=value.strip(),
                                       **modifiers.get(key, {}))

        else:
            logger.warn('SIP query: Could not handle pair {0}={1}'.format(
                key, value))

        response = {}
        if xml_part:
            response = {'query': xml_part}

        if keywords:
            response.update({'keywords': keywords})

        return response
Ejemplo n.º 3
0
def pair_to_cql(datasource, key, value):
    """
    Convert a single ``key``/``value`` search criterion into a CQL
    expression for the given ``datasource``.

    The field name is looked up in ``datasource_indexnames[key][datasource]``.
    Special processing exists for the datasources ``'depatisnet'`` and
    ``'ops'``; any other datasource gets the plain ``fieldname=(value)`` form.

    Returns:
        ``None`` when no field name is registered for the pair (missing entry
        or an entry of ``None``);
        ``{'error': True, 'message': ...}`` when the value is rejected;
        ``{'query': cql_part}`` otherwise.
    """

    try:
        fieldname = datasource_indexnames[key][datasource]
    except KeyError:
        return

    # Sanity checks
    if fieldname is None:
        return

    cql_part = None
    # Named "template" instead of "format" to avoid shadowing the builtin.
    template = u'{0}=({1})'

    # Special processing rules for depatisnet
    if datasource == 'depatisnet':

        if key in ['pubdate', 'appdate', 'priodate']:

            # Date fields for DEPATISnet yield a dictionary here
            fieldinfo = fieldname

            # Assume parsing a regular date
            fieldname = fieldinfo['date']

            # Check if value is a year (4 digits)
            if len(value) == 4 and value.isdigit():
                fieldname = fieldinfo['year']

            # e.g. 1990-2014, 1990 - 2014
            value = year_range_to_within(value)

            if 'within' in value:
                within_dates = parse_date_within(value)

                # Choose the field separately for each bound. Previously a
                # year-only start date switched "fieldname" to the year field
                # for the end date as well, so a mixed range like
                # "within 1900,2009-08-20" compared the year field against a
                # full date.
                cql_parts = []
                for bound, operator in (('startdate', '>='), ('enddate', '<=')):
                    bound_value = within_dates[bound]
                    if not bound_value:
                        continue
                    # Check if value is a year (4 digits)
                    if len(bound_value) == 4 and bound_value.isdigit():
                        bound_field = fieldinfo['year']
                    else:
                        bound_field = fieldinfo['date']
                    part = '{fieldname} {operator} {date}'.format(
                        fieldname=bound_field,
                        operator=operator,
                        date=iso_to_german(bound_value))
                    cql_parts.append(part)

                cql_part = ' and '.join(cql_parts)

            else:
                try:
                    value = iso_to_german(value)
                except ValueError as ex:
                    # NOTE(review): "ex.message" only exists on Python 2 exceptions
                    return {'error': True, 'message': ex.message}

        elif key == 'patentnumber' and 1 <= len(value) <= 2:
            # Values of one or two characters are searched in field "pcod"
            fieldname = 'pcod'

        elif key == 'inventor' or key == 'applicant':
            value = value.strip(' "')
            if not has_booleans(value) and should_be_quoted(value):
                value = value.replace(' ', '(L)')

        # 2016-04-19: Improve DEPATISnet convenience by adapting wildcard semantics to world standards
        if '*' in value or '?' in value:
            # TRUNCATION/ WILDCARDS
            # ?   no characters to any number of characters
            # !   precisely one character
            # #   zero or one character
            #
            # See also:
            # https://depatisnet.dpma.de/prod/en/hilfe/recherchemodi/experten-recherche/index.html
            #
            # So, the translation table would be:
            # *  ->  ?
            # ?  ->  !
            #
            # Order matters: "?" must be rewritten before "*" is turned into "?".
            value = value.replace('?', '!')
            value = value.replace('*', '?')

    elif datasource == 'ops':

        if key == 'inventor' or key == 'applicant':
            if not has_booleans(value) and should_be_quoted(value):
                value = u'"{0}"'.format(value)

        if key == 'pubdate':

            # e.g. 1990-2014, 1990 - 2014
            value = year_range_to_within(value)

            if 'within' in value:
                within_dates = parse_date_within(value)
                if not within_dates['startdate'] or not within_dates['enddate']:
                    return {
                        'error':
                        True,
                        'message':
                        'OPS only accepts full date ranges in "within" expressions'
                    }

                value = 'within "{startdate},{enddate}"'.format(
                    startdate=within_dates['startdate'],
                    enddate=within_dates['enddate'])

                # "within" expressions are joined with a blank instead of "=(...)"
                template = '{0} {1}'

    if not cql_part:
        cql_part = template.format(fieldname, value)

    return {'query': cql_part}
Ejemplo n.º 4
0
    def pair_to_elasticsearch(cls, key, value, modifiers=None):
        """
        Convert a single ``key``/``value`` search criterion into a query
        expression string for the upstream search syntax.

        The upstream field name is looked up in ``cls.datasource_indexnames``.
        ``modifiers`` is accepted but not used by this method.

        Returns:
            ``None`` when ``key`` has no entry in ``cls.datasource_indexnames``;
            ``{'error': True, 'message': ...}`` when a date/range expression or
            a classification expression cannot be processed;
            ``{'query': expression}`` otherwise.
        """

        try:
            fieldname = cls.datasource_indexnames[key]
        except KeyError:
            return

        expression = None
        # Named "expression_template" instead of "format" to avoid shadowing the builtin.
        expression_template = u'{0}:{1}'

        # ------------------------------------------
        #   value mogrifiers
        # ------------------------------------------
        if key == 'patentnumber':

            # Transform into distinct fields PC, DE, KI

            #if has_booleans(value):
            #    value = '({})'.format(value)

            expression_parts = []

            # Publication number
            patent = split_patent_number(value)

            # Prefer the normalized form when normalization yields a result
            patent_normalized = normalize_patent(patent, for_ops=False)
            if patent_normalized:
                patent = patent_normalized

            if patent:
                subexpression = u'PC:{country} AND DE:{number}'.format(
                    **patent)
                if patent['kind']:
                    subexpression += u' AND KI:{kind}'.format(**patent)
                expression_parts.append(u'({})'.format(subexpression))

            # Application number
            expression_parts.append(u'AN:{}'.format(value))

            # Priority number
            expression_parts.append(u'NP:{}'.format(value))

            # All alternatives are combined once, after the list is complete
            expression = u' OR '.join(expression_parts)

        elif key == 'pubdate':
            # Examples of the expressions produced by this branch:
            # - DP:[19800101 TO 19851231]
            # - DP:[* TO 19601231]

            try:

                # e.g. 1991
                if len(value) == 4 and value.isdigit():
                    value = u'within {}0101,{}1231'.format(value, value)

                # e.g. 1990-2014, 1990 - 2014
                value = year_range_to_within(value)

                # e.g.
                # within 1978,1986
                # within 1900,2009-08-20
                # within 2009-08-20,2011-03-03
                if 'within' in value:
                    within_dates = parse_date_within(value)

                    if within_dates['startdate']:
                        # Four characters are taken as a year: start at January 1st
                        if len(within_dates['startdate']) == 4:
                            within_dates['startdate'] += '0101'
                        within_dates['startdate'] = parse_date_universal(
                            within_dates['startdate']).format('YYYYMMDD')
                    else:
                        within_dates['startdate'] = '*'

                    if within_dates['enddate']:
                        # Four characters are taken as a year: end at December 31st
                        if len(within_dates['enddate']) == 4:
                            within_dates['enddate'] += '1231'
                        within_dates['enddate'] = parse_date_universal(
                            within_dates['enddate']).format('YYYYMMDD')
                    else:
                        within_dates['enddate'] = '*'

                    expression = '{fieldname}:[{startdate} TO {enddate}]'.format(
                        fieldname=fieldname, **within_dates)

                else:
                    # Not a range: treat as a single date
                    value_date = parse_date_universal(value)
                    if value_date:
                        value = value_date.format('YYYYMMDD')
                    else:
                        raise ValueError(value)

            except Exception as ex:
                message = 'depatech query: Invalid date or range expression "{0}". Reason: {1}.'.format(
                    value, ex)
                logger.warn(
                    message +
                    ' Exception was: {0}'.format(_exception_traceback()))
                return {'error': True, 'message': message}

        elif key == 'inventor' or key == 'applicant':
            if not has_booleans(value) and should_be_quoted(value):
                value = u'"{0}"'.format(value)

        elif key == 'class':

            # v1: Naive implementation can only handle single values
            #value = lucene_convert_class(value)

            # v2: Advanced implementation can handle expressions on field "class"
            # Translate class expression from "H04L12/433 or H04L12/24"
            # to "(ic:H04L0012433 OR cpc:H04L0012433) OR (ic:H04L001224 OR cpc:H04L001224)"
            try:

                # Put value into parenthesis, to properly capture expressions
                if value:
                    value = u'({value})'.format(value=value)

                # Parse value as simple query expression
                query_object = CQL(cql=value)

                # Rewrite all patent classifications in query expression ast from OPS format to Lucene format
                rewrite_classes_lucene(query_object, expression_template, fieldname)

                # Serialize into appropriate upstream datasource query expression syntax
                expression = query_object.dumps()

            except pyparsing.ParseException as ex:
                return {
                    'error': True,
                    'message': '<pre>' + str(ex.explanation) + '</pre>'
                }

        elif key == 'country':
            value = value.upper()

        # ------------------------------------------
        #   surround with parentheses
        # ------------------------------------------
        if key in ['fulltext', 'inventor', 'applicant', 'country', 'citation']:
            if has_booleans(value) and not should_be_quoted(value):
                value = u'({0})'.format(value)

        # ------------------------------------------
        #   expression formatter
        # ------------------------------------------
        # Serialize into appropriate upstream datasource query expression syntax
        if not expression:
            expression = format_expression(expression_template, fieldname, value)

        # ------------------------------------------
        #   final polishing
        # ------------------------------------------
        # Solr(?) syntax: boolean operators must be uppercase
        if has_booleans(expression):
            boolis = [' or ', ' and ', ' not ']
            for booli in boolis:
                expression = expression.replace(booli, booli.upper())

        return {'query': expression}