def pair_to_solr(cls, key, value, modifiers=None):
    """
    Translate a single ``key=value`` search criterion into an
    IFI CLAIMS (Solr) query expression.

    :param key: Name of the criterion, e.g. ``patentnumber``, ``pubdate``,
                ``inventor``, ``applicant``, ``class`` or ``fulltext``.
                It is looked up in ``cls.datasource_indexnames`` to obtain
                the upstream field name.
    :param value: The user-supplied value or sub-expression.
    :param modifiers: Accepted for signature parity with the sibling
                      translators; not used by this function.
    :return: ``None`` when ``key`` has no entry in
             ``cls.datasource_indexnames``,
             ``{'error': True, 'message': ...}`` when the value could not
             be translated, otherwise ``{'query': <expression>}``.
    """

    try:
        fieldname = cls.datasource_indexnames[key]
    except KeyError:
        return

    expression = None
    expression_template = u'{0}:{1}'

    # ------------------------------------------
    #   value mogrifiers
    # ------------------------------------------
    if key == 'patentnumber':
        # TODO: parse more sophisticated to make things like "EP666666 or EP666667"
        #       or "?query=pn%3AEP666666&datasource=ifi" possible
        # TODO: use different normalization flavor for IFI, e.g. JP01153210A will not work
        #       as JPH01153210A, which is required by OPS
        value = normalize_patent(value, for_ops=False)

    elif key == 'pubdate':

        # Shapes of the generated expressions:
        # - pd:[19800101 TO 19851231]
        # - pd:[* TO 19601231]
        # - pdyear:[1980 TO 1985]
        # - pdyear:[* TO 1960]

        try:

            parsed = False

            # e.g. 1991
            if len(value) == 4 and value.isdigit():
                fieldname = 'pdyear'
                parsed = True

            # e.g. 1990-2014, 1990 - 2014
            value = year_range_to_within(value)

            # e.g.
            # within 1978,1986
            # within 1900,2009-08-20
            # within 2009-08-20,2011-03-03
            if 'within' in value:
                within_dates = parse_date_within(value)

                # Use a dedicated loop variable here. Python 2 leaks list
                # comprehension variables into the enclosing scope, so
                # reusing the name "value" would clobber the original input
                # and garble the error message produced below.
                elements_are_years = all([
                    len(boundary) == 4 and boundary.isdigit()
                    for boundary in within_dates.values()])

                if elements_are_years:
                    fieldname = 'pdyear'
                else:
                    if within_dates['startdate']:
                        within_dates['startdate'] = parse_date_universal(
                            within_dates['startdate']).format('YYYYMMDD')
                    if within_dates['enddate']:
                        within_dates['enddate'] = parse_date_universal(
                            within_dates['enddate']).format('YYYYMMDD')

                # An omitted boundary becomes an open range
                if not within_dates['startdate']:
                    within_dates['startdate'] = '*'
                if not within_dates['enddate']:
                    within_dates['enddate'] = '*'

                expression = '{fieldname}:[{startdate} TO {enddate}]'.format(
                    fieldname=fieldname, **within_dates)

            elif not parsed:
                value_date = parse_date_universal(value)
                if value_date:
                    value = value_date.format('YYYYMMDD')
                else:
                    raise ValueError(value)

        except Exception as ex:
            message = 'IFI CLAIMS query: Invalid date or range expression "{0}". Reason: {1}.'.format(value, ex)
            logger.warning(message + '\nException was:\n{0}'.format(_exception_traceback()))
            return {'error': True, 'message': message}

    elif key == 'inventor' or key == 'applicant':
        if not has_booleans(value) and should_be_quoted(value):
            value = u'"{0}"'.format(value)

    elif key == 'class':

        # v1: Naive implementation can only handle single values
        #value = ifi_convert_class(value)

        # v2: Advanced implementation can handle expressions on field "class"
        # Translate class expression from "H04L12/433 or H04L12/24"
        # to "(ic:H04L0012433 OR cpc:H04L0012433) OR (ic:H04L001224 OR cpc:H04L001224)"
        try:

            # Put value into parenthesis, to properly capture expressions
            if value:
                value = u'({value})'.format(value=value)

            # Parse value as simple query expression
            query_object = CQL(cql=value)

            # Rewrite all patent classifications in query expression ast from OPS format to IFI format
            rewrite_classes_ifi(query_object, expression_template, fieldname)

            # Serialize into appropriate upstream datasource query expression syntax
            expression = query_object.dumps()

        except pyparsing.ParseException as ex:
            return {'error': True, 'message': '<pre>' + str(ex.explanation) + '</pre>'}

    # ------------------------------------------
    #   surround with parentheses
    # ------------------------------------------
    if key in ['fulltext', 'inventor', 'applicant', 'country', 'citation']:
        if has_booleans(value) and not should_be_quoted(value) and '{!complexphrase' not in value:
            value = u'({0})'.format(value)

    # ------------------------------------------
    #   expression formatter
    # ------------------------------------------
    # Serialize into appropriate upstream datasource query expression syntax
    if not expression:
        if key == 'fulltext' and '{!complexphrase' in value:
            # Complex phrase queries are passed through verbatim
            expression = value
        else:
            expression = format_expression(expression_template, fieldname, value)
        #print 'expression:', expression

    # ------------------------------------------
    #   final polishing
    # ------------------------------------------
    # Solr(?) syntax: boolean operators must be uppercase
    if has_booleans(expression):
        for operator in (' or ', ' and ', ' not '):
            expression = expression.replace(operator, operator.upper())

    return {'query': expression}
def pair_to_sip_xml(cls, key, value, modifiers=None):
    """
    Translate a single ``key=value`` search criterion into a SIP XML
    query fragment.

    :param key: Name of the criterion, e.g. ``pubdate``, ``country``,
                ``class``, ``fulltext`` or any other key present in
                ``cls.sip_xml_expression_templates``.
    :param value: The user-supplied value or sub-expression.
    :param modifiers: Optional dictionary of per-field modifiers. Boolean
                      values (top-level or one level deep) are rewritten
                      **in place** to the lowercase strings ``'true'`` /
                      ``'false'``. ``None`` is treated as "no modifiers".
    :return: ``{'error': True, 'message': ...}`` when the value could not
             be translated; otherwise a dictionary carrying ``query``
             (when an XML fragment was produced) and ``keywords`` (when the
             fulltext expression yielded any). The dictionary is empty when
             the pair could not be handled.
    """

    if modifiers is None:
        modifiers = {}

    # reformat modifiers to lower booleans
    # {u'fulltext': {u'claim': True, u'abstract': True, u'description': True, u'title': True}
    # ->
    # {u'fulltext': {u'claim': 'true', u'abstract': 'true', u'description': 'true', u'title': 'true'}
    for modifier_field, modifier_values in list(modifiers.items()):
        if isinstance(modifier_values, dict):
            for modifier_name, modifier_value in list(modifier_values.items()):
                modifier_values[modifier_name] = str(modifier_value).lower()
        elif isinstance(modifier_values, bool):
            modifiers[modifier_field] = str(modifier_values).lower()

    xml_part = None
    keywords = None

    if key == 'pubdate':

        try:

            if len(value) == 4 and value.isdigit():
                # e.g. 1978
                value = u'within {year}-01-01,{year}-12-31'.format(year=value)

            # e.g. 1990-2014, 1990 - 2014
            value = year_range_to_within(value)

            if 'within' in value:
                try:
                    within_dates = parse_date_within(value)
                except Exception:
                    # Deliberately not a bare "except", which would also
                    # swallow KeyboardInterrupt and SystemExit.
                    raise ValueError('Could not parse "within" expression')

                # Expand bare years to the first / last day of that year
                if len(within_dates['startdate']) == 4 and within_dates['startdate'].isdigit():
                    within_dates['startdate'] = within_dates['startdate'] + '-01-01'
                if len(within_dates['enddate']) == 4 and within_dates['enddate'].isdigit():
                    within_dates['enddate'] = within_dates['enddate'] + '-12-31'

                if all(within_dates.values()):
                    template = cls.sip_xml_expression_templates[key]['both']
                elif within_dates['startdate']:
                    template = cls.sip_xml_expression_templates[key]['startdate']
                # API not capable of handling "enddate"-only attribute
                #elif within_dates['enddate']:
                #    template = cls.sip_xml_expression_templates[key]['enddate']
                else:
                    raise ValueError('SIP cannot handle date ranges with end date only')

                xml_part = template.format(
                    startdate=iso_to_german(within_dates['startdate']),
                    enddate=iso_to_german(within_dates['enddate']))

            else:
                # A single date is expressed as a range from and to the same day
                template = cls.sip_xml_expression_templates[key]['both']
                xml_part = template.format(
                    startdate=iso_to_german(value),
                    enddate=iso_to_german(value))

        except Exception as ex:
            message = 'SIP query: Invalid date or range expression "{0}". Reason: {1}'.format(value, ex)
            logger.warning(message + ' Exception was: {0}'.format(_exception_traceback()))
            return {'error': True, 'message': message}

    elif key == 'country':

        if ' and ' in value.lower():
            message = 'SIP query: Concatenating offices with "AND" would yield zero results'
            logger.warning(message)
            return {'error': True, 'message': message}

        entries = re.split(' or ', value, flags=re.IGNORECASE)
        entries = [entry.strip() for entry in entries]

        # Resolve each country code to its SIP country id
        ccids = []
        for country in entries:
            country = country.upper()
            sip_country = SipCountry.objects(cc=country).first()
            if sip_country:
                ccids.append(sip_country.ccid)
            else:
                message = 'SIP query: Country "{0}" could not be resolved'.format(country)
                logger.warning(message)
                return {'error': True, 'message': message}

        if ccids:
            xml_part = '<country>\n' + '\n'.join(
                ['<ccid>{ccid}</ccid>'.format(ccid=ccid) for ccid in ccids]) + '\n</country>'

    elif key == 'class':

        try:
            expression = SipCqlClass(value)
            xml_part = expression.dumpxml()

            # debugging
            #print '-' * 42
            #print pretty_print(xml_part)

        except ClassDecodingError as ex:
            return {'error': True, 'message': str(ex)}

        except pyparsing.ParseException as ex:
            return {'error': True, 'message': '<pre>' + str(ex.explanation) + '</pre>'}

    elif key == 'fulltext':

        # parse cql subexpression (possible fields are ti, ab, de, cl, bi)
        # and map to SIP syntax
        try:
            expression = SipCqlFulltext(value, modifiers=modifiers.get(key, {}))
            xml_part = expression.dumpxml()
            keywords = expression.keywords()

            # debugging
            #print '-' * 42
            #print pretty_print(xml_part)

        except FulltextDecodingError as ex:
            return {'error': True, 'message': unicode(ex)}

        except pyparsing.ParseException as ex:
            return {'error': True, 'message': u'<pre>' + ex.explanation + '</pre>'}

        except SyntaxError as ex:
            return {'error': True, 'message': u'<pre>' + unicode(ex) + '</pre>'}

    elif key in cls.sip_xml_expression_templates:
        template = cls.sip_xml_expression_templates[key]

        if key == 'patentnumber':
            value = value.upper()

        xml_part = template.format(key=key, value=value.strip(), **modifiers.get(key, {}))

    else:
        logger.warning('SIP query: Could not handle pair {0}={1}'.format(key, value))

    response = {}
    if xml_part:
        response = {'query': xml_part}

    if keywords:
        response.update({'keywords': keywords})

    return response
def pair_to_cql(datasource, key, value):
    """
    Translate a single ``key=value`` search criterion into a CQL
    expression for the given datasource.

    :param datasource: Name of the upstream datasource. Special processing
                       exists for ``depatisnet`` and ``ops``.
    :param key: Name of the criterion; resolved to the upstream field name
                through ``datasource_indexnames[key][datasource]``.
    :param value: The user-supplied value or sub-expression.
    :return: ``None`` when no field name is registered for this
             key/datasource combination,
             ``{'error': True, 'message': ...}`` when the value could not
             be translated, otherwise ``{'query': <cql expression>}``.
    """

    try:
        fieldname = datasource_indexnames[key][datasource]
    except KeyError:
        return

    # Sanity checks
    if fieldname is None:
        return

    cql_part = None
    expression_template = u'{0}=({1})'

    # Special processing rules for depatisnet
    if datasource == 'depatisnet':

        if key in ['pubdate', 'appdate', 'priodate']:

            # Date fields for DEPATISnet yield a dictionary here
            fieldinfo = fieldname

            # Assume parsing a regular date
            fieldname = fieldinfo['date']

            # Check if value is a year (4 digits)
            if len(value) == 4 and value.isdigit():
                fieldname = fieldinfo['year']

            # e.g. 1990-2014, 1990 - 2014
            value = year_range_to_within(value)

            if 'within' in value:
                within_dates = parse_date_within(value)

                cql_parts = []
                for boundary, operator in (('startdate', '>='), ('enddate', '<=')):
                    date = within_dates[boundary]
                    if not date:
                        continue

                    # Choose the field for each boundary on its own. A bare
                    # year goes to the year field, anything else to the
                    # date field, so that a year on the start boundary does
                    # not force the year field onto a full end date.
                    if len(date) == 4 and date.isdigit():
                        boundary_field = fieldinfo['year']
                    else:
                        boundary_field = fieldinfo['date']

                    cql_parts.append('{fieldname} {operator} {date}'.format(
                        fieldname=boundary_field,
                        operator=operator,
                        date=iso_to_german(date)))

                cql_part = ' and '.join(cql_parts)

            else:
                try:
                    value = iso_to_german(value)
                except ValueError as ex:
                    # "ex.message" only exists on Python 2 and is deprecated there
                    return {'error': True, 'message': str(ex)}

        elif key == 'patentnumber' and 1 <= len(value) <= 2:
            # NOTE(review): a one or two character value is presumably a
            # country/office code rather than a document number -- confirm.
            fieldname = 'pcod'

        elif key == 'inventor' or key == 'applicant':
            value = value.strip(' "')
            if not has_booleans(value) and should_be_quoted(value):
                value = value.replace(' ', '(L)')

        # 2016-04-19: Improve DEPATISnet convenience by adapting wildcard semantics to world standards
        if '*' in value or '?' in value:
            # TRUNCATION / WILDCARDS in DEPATISnet
            #
            #   ?   no characters to any number of characters
            #   !   precisely one character
            #   #   zero or one character
            #
            # See also: https://depatisnet.dpma.de/prod/en/hilfe/recherchemodi/experten-recherche/index.html
            #
            # So, the translation table would be:
            #
            #   * -> ?
            #   ? -> !
            #
            # The order matters: "?" must be rewritten before "*" is turned into "?".
            value = value.replace('?', '!')
            value = value.replace('*', '?')

    elif datasource == 'ops':

        if key == 'inventor' or key == 'applicant':
            if not has_booleans(value) and should_be_quoted(value):
                value = u'"{0}"'.format(value)

        if key == 'pubdate':

            # e.g. 1990-2014, 1990 - 2014
            value = year_range_to_within(value)

            if 'within' in value:
                within_dates = parse_date_within(value)

                if not within_dates['startdate'] or not within_dates['enddate']:
                    return {
                        'error': True,
                        'message': 'OPS only accepts full date ranges in "within" expressions',
                    }

                value = 'within "{startdate},{enddate}"'.format(
                    startdate=within_dates['startdate'],
                    enddate=within_dates['enddate'])

                # "within" is an operator on its own, so no "=(...)" wrapping
                expression_template = '{0} {1}'

    if not cql_part:
        cql_part = expression_template.format(fieldname, value)

    return {'query': cql_part}
def pair_to_elasticsearch(cls, key, value, modifiers=None):
    """
    Translate a single ``key=value`` search criterion into a depatech
    (Elasticsearch / Lucene syntax) query expression.

    :param key: Name of the criterion, e.g. ``patentnumber``, ``pubdate``,
                ``inventor``, ``applicant``, ``class`` or ``country``.
                It is looked up in ``cls.datasource_indexnames`` to obtain
                the upstream field name.
    :param value: The user-supplied value or sub-expression.
    :param modifiers: Accepted for signature parity with the sibling
                      translators; not used by this function.
    :return: ``None`` when ``key`` has no entry in
             ``cls.datasource_indexnames``,
             ``{'error': True, 'message': ...}`` when the value could not
             be translated, otherwise ``{'query': <expression>}``.
    """

    try:
        fieldname = cls.datasource_indexnames[key]
    except KeyError:
        return

    expression = None
    expression_template = u'{0}:{1}'

    # ------------------------------------------
    #   value mogrifiers
    # ------------------------------------------
    if key == 'patentnumber':

        # Transform into distinct fields PC, DE, KI

        #if has_booleans(value):
        #    value = '({})'.format(value)

        expression_parts = []

        # Publication number
        patent = split_patent_number(value)

        patent_normalized = normalize_patent(patent, for_ops=False)
        if patent_normalized:
            patent = patent_normalized

        if patent:
            subexpression = u'PC:{country} AND DE:{number}'.format(**patent)
            if patent['kind']:
                subexpression += u' AND KI:{kind}'.format(**patent)
            expression_parts.append(u'({})'.format(subexpression))

        # Application number
        expression_parts.append(u'AN:{}'.format(value))

        # Priority number
        expression_parts.append(u'NP:{}'.format(value))

        # Any of the alternatives may match
        expression = u' OR '.join(expression_parts)

    elif key == 'pubdate':

        # Shapes of the generated expressions:
        # - DP:[19800101 TO 19851231]
        # - DP:[* TO 19601231]

        try:

            # e.g. 1991
            if len(value) == 4 and value.isdigit():
                value = u'within {}0101,{}1231'.format(value, value)

            # e.g. 1990-2014, 1990 - 2014
            value = year_range_to_within(value)

            # e.g.
            # within 1978,1986
            # within 1900,2009-08-20
            # within 2009-08-20,2011-03-03
            if 'within' in value:
                within_dates = parse_date_within(value)

                if within_dates['startdate']:
                    # Expand a four-character boundary to the first day of that year
                    if len(within_dates['startdate']) == 4:
                        within_dates['startdate'] += '0101'
                    within_dates['startdate'] = parse_date_universal(
                        within_dates['startdate']).format('YYYYMMDD')
                else:
                    within_dates['startdate'] = '*'

                if within_dates['enddate']:
                    # Expand a four-character boundary to the last day of that year
                    if len(within_dates['enddate']) == 4:
                        within_dates['enddate'] += '1231'
                    within_dates['enddate'] = parse_date_universal(
                        within_dates['enddate']).format('YYYYMMDD')
                else:
                    within_dates['enddate'] = '*'

                expression = '{fieldname}:[{startdate} TO {enddate}]'.format(
                    fieldname=fieldname, **within_dates)

            else:
                value_date = parse_date_universal(value)
                if value_date:
                    value = value_date.format('YYYYMMDD')
                else:
                    raise ValueError(value)

        except Exception as ex:
            message = 'depatech query: Invalid date or range expression "{0}". Reason: {1}.'.format(value, ex)
            logger.warning(message + ' Exception was: {0}'.format(_exception_traceback()))
            return {'error': True, 'message': message}

    elif key == 'inventor' or key == 'applicant':
        if not has_booleans(value) and should_be_quoted(value):
            value = u'"{0}"'.format(value)

    elif key == 'class':

        # v1: Naive implementation can only handle single values
        #value = lucene_convert_class(value)

        # v2: Advanced implementation can handle expressions on field "class"
        # Translate class expression from "H04L12/433 or H04L12/24"
        # to "(ic:H04L0012433 OR cpc:H04L0012433) OR (ic:H04L001224 OR cpc:H04L001224)"
        try:

            # Put value into parenthesis, to properly capture expressions
            if value:
                value = u'({value})'.format(value=value)

            # Parse value as simple query expression
            query_object = CQL(cql=value)

            # Rewrite all patent classifications in query expression ast from OPS format to Lucene format
            rewrite_classes_lucene(query_object, expression_template, fieldname)

            # Serialize into appropriate upstream datasource query expression syntax
            expression = query_object.dumps()

        except pyparsing.ParseException as ex:
            return {'error': True, 'message': '<pre>' + str(ex.explanation) + '</pre>'}

    elif key == 'country':
        value = value.upper()

    # ------------------------------------------
    #   surround with parentheses
    # ------------------------------------------
    if key in ['fulltext', 'inventor', 'applicant', 'country', 'citation']:
        if has_booleans(value) and not should_be_quoted(value):
            value = u'({0})'.format(value)

    # ------------------------------------------
    #   expression formatter
    # ------------------------------------------
    # Serialize into appropriate upstream datasource query expression syntax
    if not expression:
        expression = format_expression(expression_template, fieldname, value)
        #print 'expression:', expression

    # ------------------------------------------
    #   final polishing
    # ------------------------------------------
    # Solr(?) syntax: boolean operators must be uppercase
    if has_booleans(expression):
        for operator in (' or ', ' and ', ' not '):
            expression = expression.replace(operator, operator.upper())

    return {'query': expression}