Beispiel #1
0
 def etree_to_xml(self, tree, pretty=False):
     """Serialize an element tree to XML and run it through a formatter.

     The tree is serialized with ``etree.tostring`` and stripped of
     surrounding whitespace. The result is then passed to ``pretty_print``
     when ``pretty`` is truthy, and to ``compact_print`` otherwise.

     :param tree: element (tree) accepted by ``etree.tostring``
     :param pretty: select the pretty formatter instead of the compact one
     :return: whatever the selected formatter returns
     """
     serialized = etree.tostring(tree).strip()
     formatter = pretty_print if pretty else compact_print
     return formatter(serialized)
Beispiel #2
0
    def transform(self):
        """Optionally pretty-print the response payload in place.

        Nothing happens when the payload is empty or when the ``pretty``
        option is not set. Otherwise an ``xml`` payload is passed through
        ``pretty_print`` and a ``json`` payload is re-serialized with an
        indentation of four spaces. Any other format is left untouched.

        :return: ``self``, so calls can be chained
        """
        # Guard clauses: bail out early when there is nothing to do.
        if not self.response.payload:
            return self
        if not self.options.get('pretty', False):
            return self

        if self.format == 'xml':
            self.response.payload = pretty_print(self.response.payload)
        elif self.format == 'json':
            decoded = json.loads(self.response.payload)
            self.response.payload = json.dumps(decoded, indent=4)

        return self
Beispiel #3
0
    def parse(self):
        """Parse ``self.expression`` into an expanded element tree.

        Steps, in order:

        1. Strip accents from the expression and convert it to an element
           tree via ``self.to_etree``.
        2. Retag ``parenthesis`` elements which structurally look like a
           term (``index`` + ``binop`` + some value) to ``term``.
        3. Replace each ``term`` element by the XML produced by
           ``self.expand_fulltext``.
        4. Convert the remaining ``parenthesis`` containers and the
           unqualified values through ``self.convert_elements``, then strip
           the ``parenthesis`` tags.
        5. Expand any leftover ``value`` elements.

        Side effects: ``self.expression`` is overwritten with its
        accent-stripped form, and decoded values are registered through
        ``self.keyword_add``.

        :return: the root element of the rewritten tree
        """

        # apply asciifolding
        self.expression = strip_accents(self.expression)

        # parse search expression
        logger.debug('self.expression: %s', self.expression)
        root = self.to_etree(self.expression)
        logger.debug('term:\n%s', pretty_print(etree.tostring(root)))

        # HACK against corner case ab=(42)
        # The bare string below is not a docstring; it only illustrates the
        # tree shape which is handled by the rewrite that follows.
        """
        <parenthesis>
            <index>ab</index>
            <binop>=</binop>
            <value>42</value>
        </parenthesis>
        """

        # rewrite element from "parenthesis" to "term" if structure looks like it
        # Helper: true if `element` has a direct child matching `name`.
        def eexists(element, name):
            return element.find(name) is not None
        # Root qualifies when it has both "index" and "binop" children
        # plus at least one of "value" / "quotes".
        child_constraints =\
            all(map(lambda x: eexists(root, x), ['index', 'binop'])) and \
            any(map(lambda x: eexists(root, x), ['value', 'quotes']))
        if root.tag == 'parenthesis' and child_constraints:
            root.tag = 'term'

        # also rewrite all other parenthesis looking like terms
        # Unlike the root check above, boolean children ("or", "and", "not")
        # are also accepted as the value part here.
        for parens in root.iter('parenthesis'):
            child_constraints =\
                all(map(lambda x: eexists(parens, x), ['index', 'binop'])) and\
                any(map(lambda x: eexists(parens, x), ['value', 'quotes', 'or', 'and', 'not']))
            if child_constraints:
                parens.tag = 'term'

        logger.debug('before term:\n%s', pretty_print(etree.tostring(root)))

        # decapsulate and translate "term" nodes
        # Iterate over a snapshot (list) because the tree is modified
        # inside the loop.
        for term in list(root.iter('term')):

            # 1. decode and convert term structure
            index = term.find('index').text
            binop = term.find('binop').text

            # 1.a default value decoding
            #value = term.find('value').text
            # 1.b makes things like ab="42" possible
            # FIXME: catch IndexError, throw FulltextDecodingError
            value_quotes = term.xpath('value|quotes')
            boolean_content = term.xpath('and|or|not')

            if value_quotes:
                # Plain or quoted value: decode it and register it as keyword.
                value = self.decode_quoted_value(value_quotes[0])
                self.keyword_add(value)

            elif boolean_content:
                # Boolean sub-expression: convert it and collapse
                # "and not" into "not" in the resulting text.
                value = self.convert_boolean_nodes(term)
                value = value.replace(u'and not', u'not')

            # NOTE(review): if neither branch above matches, `value` is
            # unbound on the first iteration (or stale from a previous
            # one) when the triple is built below -- confirm whether such
            # "term" nodes can occur.

            # 2. expand triple
            triple = index, binop, value
            expanded_xml = self.expand_fulltext(triple)

            # 3. replace term by computed representation
            xml_node = etree.XML(expanded_xml)

            # replace root node
            if term == root:
                root = xml_node

            # replace nested term
            else:
                parent_node = term.getparent()
                parent_node.replace(term, xml_node)

        logger.debug('after term:\n' + pretty_print(etree.tostring(root)))

        # decapsulate and translate "parenthesis" nodes
        # convert_elements returns the (possibly new) root, hence the
        # reassignment on every call.
        for container in list(root.iter('parenthesis')):
            root = self.convert_elements(root, container, ['and', 'or', 'not'])
            root = self.convert_elements(root, container, ['near', 'span'])

        logger.debug('after parenthesis:\n' +
                     pretty_print(etree.tostring(root)))

        # unqualified terms, i.e. when there's no qualifying fieldname (bi=, ti=, ab=, ...),

        # extrapolate field "bi=" (search in all fulltext fields)
        root = self.convert_elements(root, root, ['value', 'quotes'])
        root = self.convert_elements(root, root, ['near', 'span'])

        # decapsulate "parenthesis" nodes (strip "parenthesis" tags)
        root = self.strip_parenthesis(root)

        #print "current:\n", pretty_print(etree.tostring(root))

        # HACK to make unqualified expressions with boolean operators possible
        # apply only if tree does not contain already expanded <text ...> elements, otherwise things go haywire
        if root.tag in ['and', 'or', 'not'
                        ] and root.xpath('value') and not root.xpath('//text'):
            root = self.convert_elements(root, root, ['and', 'or', 'not'])

        # HACK to expand leftover <value></value> elements
        # Snapshot the iterator, as each step may rewrite the tree.
        for value_element in list(root.iter('value')):
            index, binop = self._get_index_binop(value_element)
            triple = index, binop, value_element.text
            root = self.fulltext_to_xml_element(root, value_element, triple)

        return root
Beispiel #4
0
def _keywords_from_expression_part(expression_part, key, value):
    """Return the keywords contributed by one converted criterion.

    :param expression_part: mapping returned by a datasource converter;
        may be empty or ``None`` when the conversion failed
    :param key: criteria field name
    :param value: sanitized criteria value
    :return: the ``keywords`` entry of ``expression_part`` if present,
        otherwise the result of ``keywords_from_boolean_expression``;
        an empty list when ``expression_part`` is falsy
    """
    if not expression_part:
        return []
    if 'keywords' in expression_part:
        return expression_part['keywords']
    return keywords_from_boolean_expression(key, value)


def make_expression_filter(data):
    """Build a datasource-specific search expression and filter.

    Reads the following keys from ``data``:

    - ``datasource`` (required): one of ``ops``, ``depatisnet``, ``sip``,
      ``ificlaims``, ``depatech`` or ``google``
    - ``criteria`` (required): mapping of field name to search value
    - ``modifiers`` (optional): mapping of modifier flags
    - ``query`` (optional): passed on to ``GooglePatentsExpression``

    Side effects on the current request:

    - Conversion problems are recorded via ``request.errors.add`` instead
      of being raised.
    - The collected keywords are JSON-encoded into the response header
      ``X-PatZilla-Query-Keywords``.

    :param data: mapping as described above
    :return: dict with the keys ``expression`` and ``filter``
    """

    request = get_current_request()

    datasource = data['datasource']
    criteria = data['criteria']
    modifiers = data.get('modifiers', {})
    query = data.get('query')

    # Expression classes are imported lazily, only for the datasource in use.
    # TODO: Refactor to "patzilla.access.{sip,ificlaims,depatech,google}" namespaces
    if datasource == 'ificlaims':
        from patzilla.access.ificlaims.expression import IFIClaimsExpression
    elif datasource == 'depatech':
        from patzilla.access.depatech.expression import DepaTechExpression
    elif datasource == 'sip':
        from patzilla.access.sip.expression import SipExpression
    elif datasource == 'google':
        from patzilla.access.google.search import GooglePatentsExpression

    if datasource == 'sip':
        modifiers = SipExpression.compute_modifiers(modifiers)

    expression = ''
    expression_parts = []
    filter_parts = []
    keywords = []

    criteria_error_tpl = 'Criteria "{0}: {1}" has invalid format, datasource={2}.'
    filter_error_tpl = 'Filter "{0}: {1}" has invalid format, datasource={2}.'

    # TODO: Refactor to "patzilla.access.google" namespace
    if datasource == 'google':
        gpe = GooglePatentsExpression(criteria, query)
        expression = gpe.serialize()
        keywords = gpe.get_keywords()

    else:

        # Bring criteria in order: Process "fulltext" first
        keys = list(criteria.keys())
        if 'fulltext' in keys:
            keys.remove('fulltext')
            keys.insert(0, 'fulltext')

        for key in keys:

            # Acquire humanized expression
            value = criteria.get(key)

            # Treat a missing (None) value like an empty one instead of
            # failing on ``None.strip()``.
            if value is None:
                continue

            # Sanitize value
            value = value.strip()

            if not value:
                continue

            # Allow notations like "DE or EP or US" and "DE,EP"
            if key == 'country':
                entries = re.split(r'(?: or |,)', value, flags=re.IGNORECASE)
                value = ' or '.join(entry.strip() for entry in entries)

            expression_part = None
            filter_part = None

            if datasource in ['ops', 'depatisnet']:
                expression_part = pair_to_cql(datasource, key, value)

            # TODO: Refactor to "patzilla.access.sip" namespace
            elif datasource == 'sip':
                expression_part = SipExpression.pair_to_sip_xml(key, value, modifiers)
                keywords += _keywords_from_expression_part(expression_part, key, value)

            # TODO: Refactor to "patzilla.access.ificlaims" namespace
            elif datasource == 'ificlaims':

                if key == 'pubdate':
                    # The publication date goes into the filter, not the
                    # expression; the placeholder keeps the "invalid
                    # format" check below from firing.
                    expression_part = {'empty': True}
                    filter_part = IFIClaimsExpression.pair_to_solr(key, value, modifiers)

                else:
                    expression_part = IFIClaimsExpression.pair_to_solr(key, value, modifiers)
                    keywords += _keywords_from_expression_part(expression_part, key, value)

            # TODO: Refactor to "patzilla.access.depatech" namespace
            elif datasource == 'depatech':
                expression_part = DepaTechExpression.pair_to_elasticsearch(key, value, modifiers)
                keywords += _keywords_from_expression_part(expression_part, key, value)

            # Accumulate expression part
            if not expression_part:
                message = criteria_error_tpl.format(key, value, datasource)
                log.warning(message)
                request.errors.add('query-expression-utility-service', 'comfort-form', message)

            elif 'error' in expression_part:
                message = criteria_error_tpl.format(key, value, datasource)
                message += '<br/>' + expression_part['message']
                log.warning(message)
                request.errors.add('query-expression-utility-service', 'comfort-form', message)

            else:
                # Use a dedicated name, so the ``query`` input is not shadowed.
                part_query = expression_part.get('query')
                if part_query:
                    expression_parts.append(part_query)

            # Accumulate filter part
            if filter_part:

                if 'error' in filter_part:
                    message = filter_error_tpl.format(key, value, datasource)
                    message += '<br/>' + filter_part['message']
                    log.warning(message)
                    request.errors.add('query-expression-utility-service', 'comfort-form', message)

                else:
                    filter_query = filter_part.get('query')
                    if filter_query:
                        filter_parts.append(filter_query)

    log.info("Propagating keywords from comfort form: {keywords}".format(keywords=keywords))
    request.response.headers['X-PatZilla-Query-Keywords'] = json.dumps(keywords)

    # assemble complete expression from parts, connect them with AND operators
    if datasource in ['ops', 'depatisnet']:
        expression = ' and '.join(expression_parts)

    elif datasource in ['ificlaims', 'depatech']:
        expression = ' AND '.join(expression_parts)

    elif datasource == 'sip':
        if expression_parts:
            if len(expression_parts) == 1:
                expression = expression_parts[0]
            else:
                expression = '\n'.join(expression_parts)
                expression = '<and>\n' + expression + '\n</and>'

            # apply full family mode to whole xml search expression
            if asbool(modifiers.get('family-full')):
                expression = SipExpression.enrich_fullfamily(expression)

            expression = pretty_print(expression, xml_declaration=False)

    payload = {
        'expression': expression,
        'filter': ' AND '.join(filter_parts),
    }

    return payload