Esempio n. 1
0
def get_images_view_url(document):
    """
    Build the URL of the USPTO image viewer page for a document.

    The document is normalized with ``for_ops=False``. Numbers of up to nine
    characters are looked up on the PatFT image server (publications),
    longer numbers on the AppFT image server (applications).

    The upstream URL is requested once as a pre-flight check. The lookup is
    best-effort: any error while talking to the server yields ``None``.

    :param document: Document reference accepted by ``normalize_patent``.
    :return: ``{'location': url, 'origin': 'USPTO'}`` unless the response
             body contains "is not a valid ID", otherwise ``None``.
    """

    document = normalize_patent(document, for_ops=False)

    if len(document.number) <= 9:
        # PatFT image server (publication)
        # http://pdfpiw.uspto.gov/.piw?docid=9317610
        url_tpl = 'http://pdfpiw.uspto.gov/.piw?docid={docid}'
    else:
        # AppFT image server (application)
        # http://pdfaiw.uspto.gov/.aiw?docid=20160105912
        url_tpl = 'http://pdfaiw.uspto.gov/.aiw?docid={docid}'

    url = url_tpl.format(docid=document.number)

    # Pre-flight check upstream url for existence of document
    try:
        response = requests.get(url)
        # Use the decoded body: testing a text needle against the raw
        # ``response.content`` bytes raises TypeError on Python 3.
        body = response.text
    except Exception:
        # Deliberately best-effort, but do not swallow KeyboardInterrupt
        # or SystemExit like a bare ``except`` would.
        return None

    if 'is not a valid ID' not in body:
        return {'location': url, 'origin': 'USPTO'}

    return None
Esempio n. 2
0
    def pair_to_term(cls, key, value):
        """
        Translate one key/value pair of search criteria into a query term.

        The field definition is looked up in ``cls.fieldmap``. When the
        definition carries a field name, the term is rendered as
        ``<fieldname>:<value>``; country values are lowercased and patent
        numbers are normalized where possible. Without a field name, the
        value itself is used as term.

        :return: ``{'parameter': ..., 'term': ...}`` or ``None`` if ``key``
                 (or one of its definition entries) is unknown.
        """

        try:
            definition = cls.fieldmap[key]
            fieldname, parameter = definition['name'], definition['parameter']
        except KeyError:
            return

        # No field name: the bare value is the term
        if not fieldname:
            return {'parameter': parameter, 'term': value}

        if key == 'country':
            value = value.lower()
        elif key == 'patentnumber':
            # Keep the original value if normalization yields nothing
            value = normalize_patent(value) or value

        return {
            'parameter': parameter,
            'term': '{0}:{1}'.format(fieldname, value),
        }
Esempio n. 3
0
 def action(token, index, binop, term):
     """
     Normalize the document number inside a parsed query token.

     Double quotes are stripped from ``term``. If ``index`` (compared in
     lowercase) is one of ``indexes_publication_number`` and normalization
     succeeds, the result is written to ``token[2]`` in place.
     """
     unquoted = term.replace('"', '')

     # Only values of publication number indexes get normalized
     if index.lower() not in indexes_publication_number:
         return

     normalized = normalize_patent(unquoted, fix_kindcode=True)
     if normalized:
         token[2] = normalized
Esempio n. 4
0
 def examples_ok(self):
     """
     Yield ``(number, expected, computed)`` triples for every entry of
     ``test_numbers_normalized_ok``, where ``computed`` is the result of
     normalizing ``number`` with ``fix_kindcode=True`` and ``for_ops=True``.
     """
     for raw, expected in test_numbers_normalized_ok.items():
         computed = normalize_patent(raw, fix_kindcode=True, for_ops=True)
         yield raw, expected, computed
Esempio n. 5
0
 def document_to_number(self, document):
     """
     Derive the document number from the ``ucid`` field of a result document.

     The ``ucid`` is split on ``-`` into exactly three parts, which are
     joined without separator. The normalized form is returned when
     normalization succeeds, otherwise the joined string as-is.
     """
     country, serial, kind = document[u'ucid'].split('-')
     assembled = country + serial + kind
     return normalize_patent(assembled) or assembled
Esempio n. 6
0
def normalize_numbers(entries):
    """
    Normalize a list of document numbers and sort them into buckets.

    Blanks are removed from each entry before normalizing it with
    ``fix_kindcode=True``.

    :return: dict with the lists ``valid`` (normalized numbers), ``invalid``
             (blank-stripped entries which could not be normalized) and
             ``all`` (both, in input order).
    """
    valid, invalid, everything = [], [], []

    for raw in entries:
        candidate = raw.replace(u' ', u'')
        normalized = normalize_patent(candidate, fix_kindcode=True)

        # Route the entry into the matching bucket, tracking it in "all" as well
        if normalized:
            valid.append(normalized)
            everything.append(normalized)
        else:
            invalid.append(candidate)
            everything.append(candidate)

    return {'valid': valid, 'invalid': invalid, 'all': everything}
Esempio n. 7
0
def espacenet_fetch(document_number, section, element_id):
    """
    Fetch a section of a document from Espacenet and extract one element.

    The first ``<p>`` inside the ``<div>`` with id ``element_id`` is
    returned in prettified form, together with its ``lang`` attribute.

    :param document_number: Document number, normalized for Espacenet.
    :param section: Name of the "publicationDetails" section to request.
    :param element_id: Id of the ``<div>`` element carrying the content.
    :return: ``{'xml': ..., 'lang': ..., 'source': 'espacenet'}``
    :raises KeyError: if the section or element is not available upstream.
    :raises ValueError: if the document number cannot be normalized or
        the request fails for another reason.
    """

    patent = normalize_patent(document_number,
                              as_dict=True,
                              provider='espacenet')

    # Fail early with a meaningful message instead of a TypeError below
    if not patent:
        raise ValueError(
            'Unable to normalize document number "{}" for Espacenet'.format(
                document_number))

    # Blueprint: https://worldwide.espacenet.com/publicationDetails/biblio?CC=EP&NR=0666666&KC=A3
    url_tpl = 'https://worldwide.espacenet.com/data/publicationDetails/{section}?CC={country}&NR={number}'
    if 'kind' in patent and patent['kind']:
        url_tpl += '&KC={kind}'

    url = url_tpl.format(section=section, **patent)

    logger.info('Accessing Espacenet: {}'.format(url))
    response = requests.get(url, headers={'User-Agent': regular_user_agent})

    message_404 = 'No section "{section}" at Espacenet for "{document_number}"'.format(
        section=section, document_number=document_number)
    message_fail = 'Fetching section "{section}" from Espacenet for "{document_number}" failed'.format(
        section=section, document_number=document_number)

    if response.status_code == 200:
        # TODO: when no result, "Claims not available" appears in response body
        soup = BeautifulSoup(response.content)
        container = soup.find('div', {'id': element_id})
        paragraph = container.find('p') if container else None

        # Neither the container nor its paragraph is guaranteed to exist
        if paragraph is None:
            raise KeyError(message_404)

        lang = paragraph['lang']
        del paragraph['class']

        return {
            'xml': paragraph.prettify(),
            'lang': lang,
            'source': 'espacenet',
        }

    # Use the decoded body: testing a text needle against the raw
    # ``response.content`` bytes raises TypeError on Python 3.
    if response.status_code == 404 or 'Entity not found' in response.text:
        raise KeyError(message_404)

    raise ValueError(message_fail)
Esempio n. 8
0
def pdf_universal(patent):
    """
    Acquire the PDF for a document, trying the archive first and OPS second.

    :param patent: Document number as given by the caller.
    :return: ``{'pdf': ..., 'datasource': ..., 'meta': ...}``. ``pdf`` and
             ``datasource`` stay ``None`` when the archive attempt fails
             and the document number cannot be decoded.
    """

    pdf = None
    datasource = None
    meta = {}

    document = decode_patent_number(patent)
    number_normalized = normalize_patent(patent)

    # First, try archive
    try:
        # Skip requests for documents w/o kindcode
        if not document.kind:
            raise ValueError(u'No kindcode for patent: {}'.format(patent))

        pdf = archive_fetch_pdf(number_normalized)
        datasource = 'archive'

    except Exception as ex:

        # "Not found" is an expected outcome; anything else gets its
        # traceback logged - once.
        if not isinstance(ex, HTTPNotFound):
            log.error(exception_traceback())

        # Second, fall back to OPS, which requires a decoded document number
        if document:
            pdf = pdf_from_ops(patent, document, meta)
            datasource = 'ops'

        else:
            log.error('Locating a document at the domestic office requires ' \
                      'a decoded document number for "{}"'.format(patent))

    return {'pdf': pdf, 'datasource': datasource, 'meta': meta}
Esempio n. 9
0
def fetch_pdf(document_number):
    """
    Retrieve PDF document from the European publication server.
    https://data.epo.org/publication-server/

    Blueprint address:
    https://data.epo.org/publication-server/pdf-document?cc=EP&pn=nnnnnn&ki=nn

    :param document_number: Document number, normalized for Espacenet.
    :return: The PDF payload (``response.content``).
    :raises ValueError: if the document number cannot be normalized.
    :raises HTTPNotFound: if the server does not answer with status 200
        or if the response is not a PDF document.
    """

    logger.info('PDF {}: European publication server attempt'.format(document_number))

    patent = normalize_patent(document_number, as_dict=True, provider='espacenet')

    # Fail early with a meaningful message instead of a TypeError below
    if not patent:
        raise ValueError(
            'Unable to normalize document number "{}" for European '
            'publication server'.format(document_number))

    url_tpl = 'https://data.epo.org/publication-server/pdf-document?cc=EP&pn={number}&ki={kind}'

    url = url_tpl.format(**patent)

    logger.info('Accessing EPO publication server: {}'.format(url))
    response = requests.get(url, headers={'User-Agent': regular_user_agent})

    if response.status_code != 200:
        msg = 'No document found at European publication ' \
              'server for "{document_number}"'.format(document_number=document_number)
        logger.warning(msg)
        raise HTTPNotFound(msg)

    # Compare the media type only: the header may carry parameters
    # (e.g. "; charset=...") or be missing altogether.
    content_type = response.headers.get('Content-Type', '')
    if content_type.split(';')[0].strip().lower() == 'application/pdf':
        return response.content

    # Sometimes, an appropriate HTML document is returned,
    # pointing to the corresponding WIPO document.
    #
    # Example: EP2706864A2
    # https://data.epo.org/publication-server/pdf-document?cc=EP&pn=2706864&ki=A2
    # http://www.wipo.int/patentscope/search/en/WO2012153305
    #
    # TODO: Unlock this again by leveraging the WIPO URL.
    msg = 'No PDF document returned from European ' \
          'publication server for "{document_number}".'.format(document_number=document_number)
    logger.warning(msg)
    raise HTTPNotFound(msg)
Esempio n. 10
0
def get_xml(number):
    """
    Fetch XML from EPD archive service.

    The normalized number is expanded into a list of alternatives, which
    are tried in order. The first one that does not raise ``KeyError``
    provides the result.

    :raises KeyError: if none of the alternatives yields a document.
    """
    # 2015-01-13: apply patentnumber fixes for getting more out of DEPATISconnect
    candidates = depatisconnect_alternatives(normalize_patent(number))

    for candidate in candidates:
        try:
            document_xml = get_xml_real(candidate)
        except KeyError:
            # Not available under this spelling, try the next one
            continue
        else:
            return document_xml

    raise KeyError('No XML document for "{0}" at DPMA'.format(number))
Esempio n. 11
0
def pdf_url(document_number):
    """
    Compute the URL of the full PDF document on the USPTO image servers.

    The URL path is assembled from slices of the normalized number:
    11-character numbers go to the application server ("pdfaiw"),
    8-character numbers to the grant server ("pdfpiw").

    Returns ``None`` if the number cannot be normalized and raises
    ``ValueError`` if the normalized number has any other length.

    # Application
    >>> pdf_url('US2016101909A1')
    'http://pdfaiw.uspto.gov/fdd/09/2016/19/010/0.pdf'

    # Grant I
    >>> pdf_url('US10194689B2')
    'http://pdfpiw.uspto.gov/fdd/89/946/101/0.pdf'

    # Grant II
    >>> pdf_url('US2548918')
    'http://pdfpiw.uspto.gov/fdd/18/489/025/0.pdf'
    """

    document = normalize_patent(document_number,
                                for_ops=False,
                                as_dict=True,
                                provider='uspto')
    if not document:
        return

    digits = document.number
    size = len(digits)

    # Application, e.g. US20160101909A1
    # http://pdfaiw.uspto.gov/fdd/09/2016/19/010/0.pdf
    if size == 11:
        segments = (digits[9:11], digits[0:4], digits[7:9], digits[4:7])
        return 'http://pdfaiw.uspto.gov/fdd/{}/{}/{}/{}/0.pdf'.format(*segments)

    # Grant, e.g. US10194689B2
    # http://pdfpiw.uspto.gov/fdd/89/946/101/0.pdf
    if size == 8:
        segments = (digits[6:8], digits[3:6], digits[0:3])
        return 'http://pdfpiw.uspto.gov/fdd/{}/{}/{}/0.pdf'.format(*segments)

    raise ValueError(
        'US document number "{}" has unexpected length'.format(
            document_number))
Esempio n. 12
0
def get_drawing_png(document, page, kind):
    """
    Fetch a drawing as PNG image.

    The image is requested from OPS in TIFF format. If OPS raises
    ``HTTPNotFound``, documents starting with "US" are looked up at USPTO
    and documents starting with "CA" at CIPO; for all other documents the
    exception is passed through.

    :param document: Document number as string.
    :param page: Page argument forwarded to ``get_ops_image``.
    :param kind: Kind argument forwarded to ``get_ops_image``.
    :return: Result of converting the image payload with ``to_png``.
    :raises HTTPNotFound: if no image is available from any source.
    """

    # 1. Try to fetch drawing from OPS, fall back to other patent offices
    try:
        payload = get_ops_image(document, page, kind, 'tiff')

    except HTTPNotFound:

        # fallback to USPTO (U.S.)
        if document.upper().startswith('US'):
            document_id = normalize_patent(split_patent_number(document),
                                           for_ops=False)
            try:
                payload = get_uspto_image_cached(document_id)
            except PayloadEmpty:
                raise HTTPNotFound(
                    'No drawing for "{0}" at OPS or USPTO'.format(document))

        # fallback to CIPO (Canada)
        elif document.upper().startswith('CA'):
            document_id = split_patent_number(document)
            try:
                payload = get_cipo_image_cached(document_id)
            except PayloadEmpty:
                raise HTTPNotFound(
                    'No drawing for "{0}" at OPS or CIPO'.format(document))

        # otherwise, pass through exception
        else:
            raise

    # 2. Croak if no image available
    if not payload:
        msg = 'No image available for document={document}, kind={kind}, page={page}'.format(
            document=document, kind=kind, page=page)
        log.warning(msg)
        raise HTTPNotFound(msg)

    # 3. Convert image from TIFF to PNG format
    return to_png(BytesIO(payload))
Esempio n. 13
0
    def read_documents(self):
        """
        Annotate each entry of ``self.documents`` in place.

        Every document receives a ``publication_number`` (normalized where
        possible, otherwise the number as extracted, or ``None`` if the
        extraction failed) and the ``upstream_provider`` name.
        """
        for document in self.documents:
            try:
                extracted = self.document_to_number(document)
            except (KeyError, TypeError):
                extracted = None

            # Whether kindcodes should be fixed on number normalization
            fix_kindcode = 'normalize_fix_kindcode' in self.options and self.options.normalize_fix_kindcode

            # TODO: Check how we can decouple from "for_ops=True" here
            normalized = normalize_patent(
                extracted, fix_kindcode=fix_kindcode, for_ops=True)

            # Be graceful if normalization didn't work
            document['publication_number'] = normalized or extracted
            document['upstream_provider'] = self.meta.upstream.name
Esempio n. 14
0
    def toCQL(self):
        """
        Serialize this search clause to CQL, with some added smartness.

        - For the indexes "pn" and "num", the term is run through document
          number normalization; the original term is kept if that fails.
        - The term is wrapped in double quotes, except for a fixed set of
          indexes where upstream rejects quote marks (error code 1107).

        :return: Prefixes, the "index relation term" triple and the
                 optional "sortBy" keys, joined by single blanks.
        """

        # Indexes whose values must not be quoted
        # (Error code: 1107 - Quote marks not applicable for this index)
        unquoted_indexes = (
            'pa', 'in', 'pc', 'ac', 'prc', 'py', 'ay', 'pry', 'pub', 'ad',
            'prd',
        )

        text = []
        for prefix, uri in self.prefixes.items():
            if prefix != '':
                text.append('>%s="%s"' % (prefix, uri))
            else:
                text.append('>"%s"' % uri)

        # 1. for certain attributes, apply document number normalization to value
        term_vanilla = term = self.term.toCQL()
        index_name = str(self.index).lower()
        if index_name in ('pn', 'num'):
            term = normalize_patent(str(term))

        # 2. fallback to original value, if number normalization couldn't handle this value
        if not term:
            term = term_vanilla

        # 3. exclude some values from being quoted
        if index_name not in unquoted_indexes:
            term = '"%s"' % term

        text.append('%s %s %s' % (self.index, self.relation.toCQL(), term))

        # Add sortKeys
        if self.sortKeys:
            text.append("sortBy")
            text.extend(sort_key.toCQL() for sort_key in self.sortKeys)

        return ' '.join(text)
Esempio n. 15
0
def pdf_universal_real(patent, response):
    """
    Acquire the PDF for a document by trying a cascade of upstream sources.

    The sources are tried in order until ``response.pdf`` is populated:

    1. European publication server (EP documents only)
    2. USPTO (US documents only)
    3. DPMA (requires a document number with kindcode)
    4. EPO OPS

    Failures of individual sources are logged and do not abort the cascade.
    The outcome is stored on ``response`` (``pdf`` and ``datasource``).

    :param patent: Document number as given by the caller.
    :param response: Object with ``pdf``, ``datasource`` and ``meta``
        attributes; ``pdf`` is expected to be ``None`` on entry for any
        source to be tried.
    :return: Always ``True``; callers have to inspect ``response.pdf``.
    :raises ValueError: if the document number cannot be decoded.
    """

    document = decode_patent_number(patent)
    number_normalized = normalize_patent(patent)

    # Sanity checks.
    if document is None:
        log.error('Locating a document at the domestic office requires ' \
                  'a decoded document number for "{}"'.format(patent))
        raise ValueError('Unable to decode document number {}'.format(patent))

    # 1. If it's an EP document, try European publication server first.
    if response.pdf is None and document.country == 'EP':

        try:
            response.pdf = publicationserver_fetch_pdf(patent)
            response.datasource = 'epo-publication-server'

        except Exception as ex:
            log.warning('PDF {}: Not available from EPO. {}'.format(
                patent, ex))
            # HTTP errors are expected outcomes; log a traceback for anything else.
            if not isinstance(ex, HTTPError):
                log.error(exception_traceback())

    # 2. Next, try USPTO servers if it's an US document.
    if response.pdf is None and document.country == 'US':

        try:
            response.pdf = uspto_fetch_pdf(patent)
            response.datasource = 'uspto'

        except Exception as ex:
            log.warning('PDF {}: Not available from USPTO. {}'.format(
                patent, ex))
            if not isinstance(ex, HTTPError):
                log.error(exception_traceback())

    # 3. Next, try DPMA servers.
    if response.pdf is None:
        try:
            # Skip requests for documents w/o kindcode
            if not document.kind:
                raise ValueError('No kindcode for patent: {}'.format(patent))

            response.pdf = depatisconnect_fetch_pdf(number_normalized)
            response.datasource = 'dpma'

        except Exception as ex:
            log.warning('PDF {}: Not available from DPMA. {}'.format(
                patent, ex))

            # Evaluate exception.
            if isinstance(ex, NotConfiguredError):
                log.warning(ex)

            elif not isinstance(ex, HTTPNotFound):
                log.error(exception_traceback())

    # 4. Next, try EPO OPS service.
    # Note this will assemble PDF out of single pages requested
    # from EPO OPS, which is a rather expensive operation.
    if response.pdf is None:

        # 2016-04-21: Amend document number for CA documents, e.g. CA2702893C -> CA2702893A1
        # TODO: Reenable feature, but only when prefixing document with a custom page
        #       informing the user about recent changes not yet arrived at EPO.
        # if document.country == 'CA':
        #    patent = document.country + document.number

        try:
            response.pdf = ops_build_pdf(patent)
            response.datasource = 'epo-ops'

        except Exception as ex:
            log.warning('PDF {}: Not available from OPS. {}'.format(
                patent, ex))
            if not isinstance(ex, HTTPError):
                log.error(exception_traceback())

    # 5. Last but not least, try to redirect to USPTO server.
    # TODO: Move elsewhere as deactivated on 2019-02-19.
    # NOTE: The leading "False" makes this branch unreachable.
    if False and response.pdf is None and document.country == 'US':

        log.info('PDF {}: USPTO attempt'.format(patent))
        uspto_found = False
        reason = None
        try:
            images_location = uspto_pdfview_url(document)
            if images_location:
                response.meta.update(images_location)
                response.datasource = 'uspto'
                uspto_found = True

        except Exception as ex:
            reason = ex
            if not isinstance(ex, HTTPError):
                log.error(exception_traceback())

        if not uspto_found:
            log.warning('PDF {}: Not available on USPTO. {}'.format(
                patent, reason))

    return True
Esempio n. 16
0
def jump_office(request):
    """
    Resolve the URL of a document at an external office or service.

    The route parameters ``office``, ``service``, ``document_type`` and
    ``document_number`` are read from ``request.matchdict``, the optional
    ``redirect`` flag from ``request.params``.

    :return: ``HTTPFound`` pointing to the URL if ``redirect`` is set, the
             plain URL otherwise, or ``HTTPNotFound`` if no URL could be
             determined for the given combination.
    """
    office = request.matchdict.get('office')
    service = request.matchdict.get('service')
    document_type = request.matchdict.get('document_type')
    document_number = request.matchdict.get('document_number')
    redirect = request.params.get('redirect')

    # USPTO URL templates, keyed by (service, document_type)
    uspto_url_templates = {
        # http://patft.uspto.gov/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.htm&r=1&f=G&l=50&s1=9317610
        ('biblio', 'publication'):
            'http://patft.uspto.gov/netacgi/nph-Parser'
            '?Sect1=PTO1&Sect2=HITOFF&d=PALL&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.htm&r=1&f=G&l=50&s1={number}.PN.',
        # http://appft.uspto.gov/netacgi/nph-Parser?Sect1=PTO1&Sect2=HITOFF&d=PG01&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.html&r=1&f=G&l=50&s1=20160105912
        ('biblio', 'application'):
            'http://appft.uspto.gov/netacgi/nph-Parser'
            '?Sect1=PTO1&Sect2=HITOFF&d=PG01&p=1&u=%2Fnetahtml%2FPTO%2Fsrchnum.html&r=1&f=G&l=50&s1={number}',
        # http://pdfpiw.uspto.gov/.piw?docid=9317610
        ('images', 'publication'):
            'http://pdfpiw.uspto.gov/.piw?docid={number}',
        # http://pdfaiw.uspto.gov/.aiw?docid=20160105912
        ('images', 'application'):
            'http://pdfaiw.uspto.gov/.aiw?docid={number}',
    }

    if document_number:

        url = None
        if office == 'dpma' and service == 'register':
            dra = DpmaRegisterAccess()
            try:
                url = dra.get_document_url(document_number)
            except Exception:
                return HTTPNotFound('Document number {} not found.'.format(document_number))

            # TODO: application number vs. file number, e.g.
            # - EP666666   vs. E95480005.8
            # - DE19630877 vs. 196308771

        elif office == 'uspto' and (service, document_type) in uspto_url_templates:
            document = normalize_patent(document_number, as_dict=True, for_ops=False)
            # Without a normalized number, fall through to "not found"
            if document:
                url = uspto_url_templates[(service, document_type)].format(**document)

        elif office == 'uspto' and service == 'global-dossier':
            # https://globaldossier.uspto.gov/#/result/publication/DE/112015004959/1
            normalized = normalize_patent(document_number, as_dict=True, for_ops=False)
            if normalized:
                url = 'https://globaldossier.uspto.gov/#/result/{document_type}/{country}/{number}/1'.format(
                    document_type=document_type, **normalized)

        elif office == 'google' and service == 'patents':
            # https://www.google.com/patents/EP0666666B1
            # https://patents.google.com/patent/EP0666666B1
            normalized = normalize_patent(document_number, for_ops=False)
            # Avoid emitting a ".../patent/None" URL
            if normalized:
                url = 'https://patents.google.com/patent/{}'.format(normalized)

        # Add Google Prior Art search again. See "priorArtKeywords" and "priorArtDate" in HTML response.

        if url:
            if redirect:
                return HTTPFound(location=url)
            return url

    return HTTPNotFound(u'Could not locate document "{document_number}" at {office}/{service}.'.format(
        document_number=document_number, office=office, service=service))
Esempio n. 17
0
 def document_to_number(self, document):
     """
     Derive the normalized document number from the ``_id`` field of a
     result document.

     The ``_id`` is split on ``.`` into exactly three parts, which are
     joined without separator and passed to ``normalize_patent``; its
     result is returned as-is.
     """
     country, serial, kind = document[u'_id'].split('.')
     return normalize_patent(country + serial + kind)
Esempio n. 18
0
    def pair_to_solr(cls, key, value, modifiers=None):
        """
        Translate one key/value pair of search criteria into an
        IFI CLAIMS (Solr) query expression.

        :param key: Criteria name, looked up in ``cls.datasource_indexnames``.
        :param value: Criteria value.
        :param modifiers: Not used by this method.
        :return: ``None`` for an unknown ``key``, ``{'query': expression}``
                 on success, or ``{'error': True, 'message': ...}`` if a
                 date or classification expression cannot be parsed.
        """

        try:
            fieldname = cls.datasource_indexnames[key]
        except KeyError:
            return

        expression = None
        expression_format = u'{0}:{1}'

        # ------------------------------------------
        #   value mogrifiers
        # ------------------------------------------
        if key == 'patentnumber':
            # TODO: parse more sophisticated to make things like "EP666666 or EP666667" or "?query=pn%3AEP666666&datasource=ifi" possible
            # TODO: use different normalization flavor for IFI, e.g. JP01153210A will not work as JPH01153210A, which is required by OPS

            # Keep the original value if normalization yields nothing
            value_normalized = normalize_patent(value, for_ops=False)
            if value_normalized:
                value = value_normalized

        elif key == 'pubdate':

            # Target expressions:
            # - pd:[19800101 TO 19851231]
            # - pd:[* TO 19601231]
            # - pdyear:[1980 TO 1985]
            # - pdyear:[* TO 1960]

            try:

                parsed = False

                # e.g. 1991
                if len(value) == 4 and value.isdigit():
                    fieldname = 'pdyear'
                    parsed = True

                # e.g. 1990-2014, 1990 - 2014
                value = year_range_to_within(value)

                # e.g.
                # within 1978,1986
                # within 1900,2009-08-20
                # within 2009-08-20,2011-03-03
                if 'within' in value:
                    within_dates = parse_date_within(value)

                    # Use a dedicated loop variable: on Python 2, a list
                    # comprehension variable named "value" would clobber
                    # the outer "value" used by the error message below.
                    elements_are_years = all([len(item) == 4 and item.isdigit() for item in within_dates.values()])
                    if elements_are_years:
                        fieldname = 'pdyear'

                    else:
                        if within_dates['startdate']:
                            within_dates['startdate'] = parse_date_universal(within_dates['startdate']).format('YYYYMMDD')

                        if within_dates['enddate']:
                            within_dates['enddate'] = parse_date_universal(within_dates['enddate']).format('YYYYMMDD')

                    # Open-ended boundaries
                    if not within_dates['startdate']:
                        within_dates['startdate'] = '*'

                    if not within_dates['enddate']:
                        within_dates['enddate'] = '*'

                    expression = '{fieldname}:[{startdate} TO {enddate}]'.format(fieldname=fieldname, **within_dates)

                elif not parsed:
                    value_date = parse_date_universal(value)
                    if value_date:
                        value = value_date.format('YYYYMMDD')
                    else:
                        raise ValueError(value)

            except Exception as ex:
                message = 'IFI CLAIMS query: Invalid date or range expression "{0}". Reason: {1}.'.format(value, ex)
                logger.warning(message + '\nException was:\n{0}'.format(_exception_traceback()))
                return {'error': True, 'message': message}

        elif key == 'inventor' or key == 'applicant':
            if not has_booleans(value) and should_be_quoted(value):
                value = u'"{0}"'.format(value)

        elif key == 'class':

            # v1: Naive implementation can only handle single values
            #value = ifi_convert_class(value)

            # v2: Advanced implementation can handle expressions on field "class"
            # Translate class expression from "H04L12/433 or H04L12/24"
            # to "(ic:H04L0012433 OR cpc:H04L0012433) OR (ic:H04L001224 OR cpc:H04L001224)"
            try:

                # Put value into parenthesis, to properly capture expressions
                if value:
                    value = u'({value})'.format(value=value)

                # Parse value as simple query expression
                query_object = CQL(cql=value)

                # Rewrite all patent classifications in query expression ast from OPS format to IFI format
                rewrite_classes_ifi(query_object, expression_format, fieldname)

                # Serialize into appropriate upstream datasource query expression syntax
                expression = query_object.dumps()

            except pyparsing.ParseException as ex:
                return {'error': True, 'message': '<pre>' + str(ex.explanation) + '</pre>'}

        # ------------------------------------------
        #   surround with parentheses
        # ------------------------------------------
        if key in ['fulltext', 'inventor', 'applicant', 'country', 'citation']:
            if has_booleans(value) and not should_be_quoted(value) and '{!complexphrase' not in value:
                value = u'({0})'.format(value)

        # ------------------------------------------
        #   expression formatter
        # ------------------------------------------
        # Serialize into appropriate upstream datasource query expression syntax
        if not expression:
            if key == 'fulltext' and '{!complexphrase' in value:
                expression = value
            else:
                expression = format_expression(expression_format, fieldname, value)

        # ------------------------------------------
        #   final polishing
        # ------------------------------------------
        # Solr(?) syntax: boolean operators must be uppercase
        if has_booleans(expression):
            for booli in (' or ', ' and ', ' not '):
                expression = expression.replace(booli, booli.upper())

        return {'query': expression}
Esempio n. 19
0
    def pair_to_elasticsearch(cls, key, value, modifiers=None):
        """
        Translate one key/value pair of search criteria into a
        depatech (Elasticsearch/Lucene) query expression.

        :param key: Criteria name, looked up in ``cls.datasource_indexnames``.
        :param value: Criteria value.
        :param modifiers: Not used by this method.
        :return: ``None`` for an unknown ``key``, ``{'query': expression}``
                 on success, or ``{'error': True, 'message': ...}`` if a
                 date or classification expression cannot be parsed.
        """

        try:
            fieldname = cls.datasource_indexnames[key]
        except KeyError:
            return

        expression = None
        expression_format = u'{0}:{1}'

        # ------------------------------------------
        #   value mogrifiers
        # ------------------------------------------
        if key == 'patentnumber':

            # Transform into distinct fields PC, DE, KI

            #if has_booleans(value):
            #    value = '({})'.format(value)

            expression_parts = []

            # Publication number
            patent = split_patent_number(value)

            patent_normalized = normalize_patent(patent, for_ops=False)
            if patent_normalized:
                patent = patent_normalized

            if patent:
                subexpression = u'PC:{country} AND DE:{number}'.format(
                    **patent)
                if patent['kind']:
                    subexpression += u' AND KI:{kind}'.format(**patent)
                expression_parts.append(u'({})'.format(subexpression))

            # Application number
            expression_parts.append(u'AN:{}'.format(value))

            # Priority number
            expression_parts.append(u'NP:{}'.format(value))

            # Any of the alternatives may match
            expression = u' OR '.join(expression_parts)

        elif key == 'pubdate':

            # Target expressions:
            # - DP:[19800101 TO 19851231]
            # - DP:[* TO 19601231]

            try:

                # e.g. 1991
                if len(value) == 4 and value.isdigit():
                    value = u'within {}0101,{}1231'.format(value, value)

                # e.g. 1990-2014, 1990 - 2014
                value = year_range_to_within(value)

                # e.g.
                # within 1978,1986
                # within 1900,2009-08-20
                # within 2009-08-20,2011-03-03
                if 'within' in value:
                    within_dates = parse_date_within(value)

                    if within_dates['startdate']:
                        # Expand bare year to first day of year
                        if len(within_dates['startdate']) == 4:
                            within_dates['startdate'] += '0101'
                        within_dates['startdate'] = parse_date_universal(
                            within_dates['startdate']).format('YYYYMMDD')
                    else:
                        within_dates['startdate'] = '*'

                    if within_dates['enddate']:
                        # Expand bare year to last day of year
                        if len(within_dates['enddate']) == 4:
                            within_dates['enddate'] += '1231'
                        within_dates['enddate'] = parse_date_universal(
                            within_dates['enddate']).format('YYYYMMDD')
                    else:
                        within_dates['enddate'] = '*'

                    expression = '{fieldname}:[{startdate} TO {enddate}]'.format(
                        fieldname=fieldname, **within_dates)

                else:
                    value_date = parse_date_universal(value)
                    if value_date:
                        value = value_date.format('YYYYMMDD')
                    else:
                        raise ValueError(value)

            except Exception as ex:
                message = 'depatech query: Invalid date or range expression "{0}". Reason: {1}.'.format(
                    value, ex)
                logger.warning(
                    message +
                    ' Exception was: {0}'.format(_exception_traceback()))
                return {'error': True, 'message': message}

        elif key == 'inventor' or key == 'applicant':
            if not has_booleans(value) and should_be_quoted(value):
                value = u'"{0}"'.format(value)

        elif key == 'class':

            # v1: Naive implementation can only handle single values
            #value = lucene_convert_class(value)

            # v2: Advanced implementation can handle expressions on field "class"
            # Translate class expression from "H04L12/433 or H04L12/24"
            # to "(ic:H04L0012433 OR cpc:H04L0012433) OR (ic:H04L001224 OR cpc:H04L001224)"
            try:

                # Put value into parenthesis, to properly capture expressions
                if value:
                    value = u'({value})'.format(value=value)

                # Parse value as simple query expression
                query_object = CQL(cql=value)

                # Rewrite all patent classifications in query expression ast from OPS format to Lucene format
                rewrite_classes_lucene(query_object, expression_format, fieldname)

                # Serialize into appropriate upstream datasource query expression syntax
                expression = query_object.dumps()

            except pyparsing.ParseException as ex:
                return {
                    'error': True,
                    'message': '<pre>' + str(ex.explanation) + '</pre>'
                }

        elif key == 'country':
            value = value.upper()

        # ------------------------------------------
        #   surround with parentheses
        # ------------------------------------------
        if key in ['fulltext', 'inventor', 'applicant', 'country', 'citation']:
            if has_booleans(value) and not should_be_quoted(value):
                value = u'({0})'.format(value)

        # ------------------------------------------
        #   expression formatter
        # ------------------------------------------
        # Serialize into appropriate upstream datasource query expression syntax
        if not expression:
            expression = format_expression(expression_format, fieldname, value)

        # ------------------------------------------
        #   final polishing
        # ------------------------------------------
        # Solr(?) syntax: boolean operators must be uppercase
        if has_booleans(expression):
            for booli in (' or ', ' and ', ' not '):
                expression = expression.replace(booli, booli.upper())

        return {'query': expression}
Esempio n. 20
0
def invalidate_xml(number):
    """
    Invalidate the ``get_xml`` entry for the given document number.

    The number is normalized before being handed to ``region_invalidate``
    (presumably so it matches the key used when the entry was stored -
    TODO confirm against the caching setup of ``get_xml``).
    """
    region_invalidate(get_xml, None, 'get_xml', normalize_patent(number))