def cql_prepare_query(query, grammar=None, keyword_fields=None):
    """
    Quote, parse and recompile a CQL search expression.

    The expression is wrapped into double quotes when ``should_be_quoted``
    says so and it does not contain the term "within". It is then parsed
    with ``CQL``, polished and serialized again; a non-empty serialization
    replaces the incoming query string.

    Parse errors are logged but not propagated. In that case the (possibly
    quoted) query string is returned without being recompiled.

    :param query: CQL search expression
    :param grammar: optional grammar handed to ``CQL``; its ``__name__``
        is used for the log message
    :param keyword_fields: optional keyword fields handed to ``CQL``;
        defaults to ``ops_keyword_fields + DpmaDepatisnetAccess.keyword_fields``
    :return: tuple ``(query_object, query)``; ``query_object`` stays ``None``
        when ``CQL(...).polish()`` raised
    """

    grammar_name = grammar.__name__ if grammar else u'default'
    log.info(u'Parsing search expression "{query}" with grammar "{grammar}"'.format(
        query=query, grammar=grammar_name))

    keyword_fields = keyword_fields or ops_keyword_fields + DpmaDepatisnetAccess.keyword_fields

    # Fixup query: wrap into quotes if CQL string is
    # a) unspecific, b) contains spaces and c) is still unquoted
    if should_be_quoted(query) and u'within' not in query:
        query = u'"%s"' % query

    # Parse and recompile CQL query string to apply number normalization.
    # Note: An earlier revision used the Cheshire3 CQL parser here, the
    # current one uses the pyparsing-based ``CQL`` class.
    query_object = None
    try:
        query_object = CQL(query, grammar=grammar, keyword_fields=keyword_fields).polish()
        query_recompiled = query_object.dumps()

        # Only take over the recompiled variant if serialization yielded something
        if query_recompiled:
            query = query_recompiled

    except Exception as ex:
        # TODO: can we get more details from diagnostic information to just
        #       stop here w/o propagating obviously wrong query to OPS?
        log.warning(u'CQL parse error: query="{0}", reason={1}, Exception was:\n{2}'.format(
            query, ex, _exception_traceback()))

    return query_object, query
def parse_expression_cql(self, expression):
    """
    Parse a CQL search expression and store the outcome on the instance.

    The expression is wrapped into double quotes when ``should_be_quoted``
    says so and it does not contain the term "within". It is then parsed
    with ``CQL`` using ``self.grammar`` and ``self.keyword_fields``,
    polished and serialized again. A non-empty serialization replaces the
    expression; if it differs from the input, this is logged.

    Parse errors are logged but not propagated.

    Attributes written:

    - ``self.cql_parser``: the polished query object, or ``None`` when
      ``CQL(...).polish()`` raised
    - ``self.expression``: the (possibly quoted and recompiled) expression
    - ``self.keywords`` and ``self.keywords_origin``: only written when
      the query object is truthy

    :param expression: CQL search expression
    :return: None
    """

    # Fixup query: Wrap into quotes if CQL expression is
    # a) unspecific, b) contains spaces and c) is still unquoted
    if should_be_quoted(expression) and u'within' not in expression:
        expression = u'"%s"' % expression

    # Parse and recompile CQL query string to apply number normalization.
    # Note: An earlier revision used the Cheshire3 CQL parser here, the
    # current one uses the pyparsing-based ``CQL`` class.
    query_object = None
    try:
        query_object = CQL(
            expression, grammar=self.grammar, keyword_fields=self.keyword_fields).polish()
        query_recompiled = query_object.dumps()

        if query_recompiled:
            # Compare *before* overwriting ``expression``, otherwise
            # both sides are always equal and the message never fires.
            if query_recompiled != expression:
                logger.info(
                    u'Recompiled search expression to "{query}"'.format(
                        query=query_recompiled))
            expression = query_recompiled

    except Exception as ex:
        # TODO: Can we get more details from diagnostic information to just
        #       stop here w/o propagating obviously wrong query to OPS?
        logger.warning(
            u'CQL parse error: query="{0}", reason={1}, Exception was:\n{2}'
            .format(expression, ex, _exception_traceback()))

    self.cql_parser = query_object
    self.expression = expression

    if query_object:
        try:
            # Prefer keywords collected by the grammar ...
            keywords = query_object.keywords()
            self.keywords_origin = 'grammar'
        except AttributeError:
            # ... and compute them when the query object cannot supply them
            keywords = compute_keywords(query_object)
            self.keywords_origin = 'compute'

        # List of keywords should contain only unique items
        self.keywords = unique_sequence(keywords)
def pair_to_solr(cls, key, value, modifiers=None):
    """
    Translate a single key/value search criterion into an expression
    fragment for the "IFI CLAIMS" datasource (Solr-style syntax).

    :param cls: object providing the ``datasource_indexnames`` mapping
        from generic keys to upstream field names
    :param key: generic field key, e.g. ``patentnumber``, ``pubdate``,
        ``inventor``, ``applicant``, ``class``, ``fulltext``
    :param value: the search value or expression for this field
    :param modifiers: accepted, but not used by this implementation
    :return:
        - ``None`` if ``key`` is not in ``cls.datasource_indexnames``
        - ``{'error': True, 'message': ...}`` if a date/range or a
          classification expression could not be processed
        - ``{'query': expression}`` otherwise
    """

    try:
        fieldname = cls.datasource_indexnames[key]
    except KeyError:
        return None

    expression = None

    # Template for "field:value" fragments. Deliberately not called
    # ``format`` in order not to shadow the builtin.
    template = u'{0}:{1}'

    # ------------------------------------------
    #   value mogrifiers
    # ------------------------------------------
    if key == 'patentnumber':
        # TODO: parse more sophisticated to make things like "EP666666 or EP666667"
        #       or "?query=pn%3AEP666666&datasource=ifi" possible
        # TODO: use different normalization flavor for IFI, e.g. JP01153210A will not
        #       work as JPH01153210A, which is required by OPS
        value = normalize_patent(value, for_ops=False)

    elif key == 'pubdate':

        # Target syntax:
        # - pd:[19800101 TO 19851231]
        # - pd:[* TO 19601231]
        # - pdyear:[1980 TO 1985]
        # - pdyear:[* TO 1960]

        try:

            parsed = False

            # e.g. 1991
            if len(value) == 4 and value.isdigit():
                fieldname = 'pdyear'
                parsed = True

            # e.g. 1990-2014, 1990 - 2014
            value = year_range_to_within(value)

            # e.g.
            # within 1978,1986
            # within 1900,2009-08-20
            # within 2009-08-20,2011-03-03
            if 'within' in value:
                within_dates = parse_date_within(value)

                # The loop variable must not be called ``value``: On Python 2,
                # list comprehension variables leak into the enclosing scope
                # and would clobber the parameter used for error reporting.
                # A list (not a generator) keeps every element evaluated.
                elements_are_years = all(
                    [len(item) == 4 and item.isdigit() for item in within_dates.values()])

                if elements_are_years:
                    fieldname = 'pdyear'

                else:
                    if within_dates['startdate']:
                        within_dates['startdate'] = \
                            parse_date_universal(within_dates['startdate']).format('YYYYMMDD')

                    if within_dates['enddate']:
                        within_dates['enddate'] = \
                            parse_date_universal(within_dates['enddate']).format('YYYYMMDD')

                # Open range boundaries are expressed by an asterisk
                if not within_dates['startdate']:
                    within_dates['startdate'] = '*'

                if not within_dates['enddate']:
                    within_dates['enddate'] = '*'

                expression = '{fieldname}:[{startdate} TO {enddate}]'.format(
                    fieldname=fieldname, **within_dates)

            elif not parsed:
                value_date = parse_date_universal(value)
                if value_date:
                    value = value_date.format('YYYYMMDD')
                else:
                    raise ValueError(value)

        except Exception as ex:
            message = 'IFI CLAIMS query: Invalid date or range expression "{0}". Reason: {1}.'.format(value, ex)
            logger.warning(message + '\nException was:\n{0}'.format(_exception_traceback()))
            return {'error': True, 'message': message}

    elif key == 'inventor' or key == 'applicant':
        if not has_booleans(value) and should_be_quoted(value):
            value = u'"{0}"'.format(value)

    elif key == 'class':

        # Handle whole expressions on field "class", not just single values.
        # Translate class expression from "H04L12/433 or H04L12/24"
        # to "(ic:H04L0012433 OR cpc:H04L0012433) OR (ic:H04L001224 OR cpc:H04L001224)"
        try:

            # Put value into parenthesis, to properly capture expressions
            if value:
                value = u'({value})'.format(value=value)

            # Parse value as simple query expression
            query_object = CQL(cql=value)

            # Rewrite all patent classifications in query expression ast from OPS format to IFI format
            rewrite_classes_ifi(query_object, template, fieldname)

            # Serialize into appropriate upstream datasource query expression syntax
            expression = query_object.dumps()

        except pyparsing.ParseException as ex:
            # NOTE(review): The explanation is embedded into HTML without
            # escaping -- confirm it cannot carry unsanitized user input.
            return {'error': True, 'message': '<pre>' + str(ex.explanation) + '</pre>'}

    # ------------------------------------------
    #   surround with parentheses
    # ------------------------------------------
    if key in ['fulltext', 'inventor', 'applicant', 'country', 'citation']:
        if has_booleans(value) and not should_be_quoted(value) and '{!complexphrase' not in value:
            value = u'({0})'.format(value)

    # ------------------------------------------
    #   expression formatter
    # ------------------------------------------
    # Serialize into appropriate upstream datasource query expression syntax
    if not expression:
        if key == 'fulltext' and '{!complexphrase' in value:
            # Complex phrase queries are passed through verbatim
            expression = value
        else:
            expression = format_expression(template, fieldname, value)

    # ------------------------------------------
    #   final polishing
    # ------------------------------------------
    # Solr(?) syntax: boolean operators must be uppercase
    if has_booleans(expression):
        for operator in (' or ', ' and ', ' not '):
            expression = expression.replace(operator, operator.upper())

    return {'query': expression}
def pair_to_elasticsearch(cls, key, value, modifiers=None):
    """
    Translate a single key/value search criterion into an expression
    fragment for the "depatech" datasource (Lucene-style syntax).

    :param cls: object providing the ``datasource_indexnames`` mapping
        from generic keys to upstream field names
    :param key: generic field key, e.g. ``patentnumber``, ``pubdate``,
        ``inventor``, ``applicant``, ``class``, ``country``
    :param value: the search value or expression for this field
    :param modifiers: accepted, but not used by this implementation
    :return:
        - ``None`` if ``key`` is not in ``cls.datasource_indexnames``
        - ``{'error': True, 'message': ...}`` if a date/range or a
          classification expression could not be processed
        - ``{'query': expression}`` otherwise
    """

    try:
        fieldname = cls.datasource_indexnames[key]
    except KeyError:
        return None

    expression = None

    # Template for "field:value" fragments. Deliberately not called
    # ``format`` in order not to shadow the builtin.
    template = u'{0}:{1}'

    # ------------------------------------------
    #   value mogrifiers
    # ------------------------------------------
    if key == 'patentnumber':

        # Transform into distinct fields PC, DE, KI
        expression_parts = []

        # Publication number
        patent = split_patent_number(value)

        # Prefer the normalized form when normalization yields something
        patent_normalized = normalize_patent(patent, for_ops=False)
        if patent_normalized:
            patent = patent_normalized

        if patent:
            subexpression = u'PC:{country} AND DE:{number}'.format(**patent)
            if patent['kind']:
                subexpression += u' AND KI:{kind}'.format(**patent)
            expression_parts.append(u'({})'.format(subexpression))

        # Application number
        expression_parts.append(u'AN:{}'.format(value))

        # Priority number
        expression_parts.append(u'NP:{}'.format(value))

        # Any of the alternatives may match
        expression = u' OR '.join(expression_parts)

    elif key == 'pubdate':

        # Target syntax:
        # - DP:[19800101 TO 19851231]
        # - DP:[* TO 19601231]

        try:

            # e.g. 1991
            if len(value) == 4 and value.isdigit():
                value = u'within {}0101,{}1231'.format(value, value)

            # e.g. 1990-2014, 1990 - 2014
            value = year_range_to_within(value)

            # e.g.
            # within 1978,1986
            # within 1900,2009-08-20
            # within 2009-08-20,2011-03-03
            if 'within' in value:
                within_dates = parse_date_within(value)

                if within_dates['startdate']:
                    # A bare year starts at its first day
                    if len(within_dates['startdate']) == 4:
                        within_dates['startdate'] += '0101'
                    within_dates['startdate'] = parse_date_universal(
                        within_dates['startdate']).format('YYYYMMDD')
                else:
                    within_dates['startdate'] = '*'

                if within_dates['enddate']:
                    # A bare year ends at its last day
                    if len(within_dates['enddate']) == 4:
                        within_dates['enddate'] += '1231'
                    within_dates['enddate'] = parse_date_universal(
                        within_dates['enddate']).format('YYYYMMDD')
                else:
                    within_dates['enddate'] = '*'

                expression = '{fieldname}:[{startdate} TO {enddate}]'.format(
                    fieldname=fieldname, **within_dates)

            else:
                value_date = parse_date_universal(value)
                if value_date:
                    value = value_date.format('YYYYMMDD')
                else:
                    raise ValueError(value)

        except Exception as ex:
            message = 'depatech query: Invalid date or range expression "{0}". Reason: {1}.'.format(
                value, ex)
            logger.warning(
                message + ' Exception was: {0}'.format(_exception_traceback()))
            return {'error': True, 'message': message}

    elif key == 'inventor' or key == 'applicant':
        if not has_booleans(value) and should_be_quoted(value):
            value = u'"{0}"'.format(value)

    elif key == 'class':

        # Handle whole expressions on field "class", not just single values.
        # Translate class expression from "H04L12/433 or H04L12/24"
        # to "(ic:H04L0012433 OR cpc:H04L0012433) OR (ic:H04L001224 OR cpc:H04L001224)"
        try:

            # Put value into parenthesis, to properly capture expressions
            if value:
                value = u'({value})'.format(value=value)

            # Parse value as simple query expression
            query_object = CQL(cql=value)

            # Rewrite all patent classifications in query expression ast from OPS format to Lucene format
            rewrite_classes_lucene(query_object, template, fieldname)

            # Serialize into appropriate upstream datasource query expression syntax
            expression = query_object.dumps()

        except pyparsing.ParseException as ex:
            # NOTE(review): The explanation is embedded into HTML without
            # escaping -- confirm it cannot carry unsanitized user input.
            return {
                'error': True,
                'message': '<pre>' + str(ex.explanation) + '</pre>'
            }

    elif key == 'country':
        value = value.upper()

    # ------------------------------------------
    #   surround with parentheses
    # ------------------------------------------
    if key in ['fulltext', 'inventor', 'applicant', 'country', 'citation']:
        if has_booleans(value) and not should_be_quoted(value):
            value = u'({0})'.format(value)

    # ------------------------------------------
    #   expression formatter
    # ------------------------------------------
    # Serialize into appropriate upstream datasource query expression syntax
    if not expression:
        expression = format_expression(template, fieldname, value)

    # ------------------------------------------
    #   final polishing
    # ------------------------------------------
    # Solr(?) syntax: boolean operators must be uppercase
    if has_booleans(expression):
        for operator in (' or ', ' and ', ' not '):
            expression = expression.replace(operator, operator.upper())

    return {'query': expression}