Ejemplo n.º 1
0
 def match_filter(item, filter):
     """
     Decide whether ``item`` matches ``filter``.

     - If ``filter`` is callable, ``item`` is split with
       ``split_patent_number`` and the result of calling ``filter`` on
       the split document is returned as-is.
     - Otherwise ``filter`` is treated as a prefix and the result of
       ``item.startswith(filter)`` is returned.
     """
     # Plain prefix comparison is the simple case, handle it first.
     if not callable(filter):
         return item.startswith(filter)

     # Predicate filters receive the split document, not the raw string.
     return filter(split_patent_number(item))
Ejemplo n.º 2
0
def depatisconnect_alternatives(number):
    """
    Compute alternative spellings for a document number
    (reverse of "fix_patent" for DE documents).

    The original ``number`` is always the first list element. Only
    documents whose country is ``DE`` get additional candidates:

    - number below 1000000 with kind ``C``: variants with kind ``B``
      and kind ``A`` are appended (in that order)
    - number from 1400000 upwards with kind ``A1``: a variant with kind
      ``A`` is appended
    - numbers in between get no additional candidates

    If the number part of a DE document is not purely numeric, a
    single-element list with the re-joined document is returned instead.
    """

    # The original number always comes first
    candidates = [number]

    document = split_patent_number(number)
    if document['country'] != 'DE':
        return candidates

    if not document['number'].isdigit():
        return [join_patent(document)]

    serial = int(document['number'])

    if serial < 1000000:
        # e.g. DE000000121107A, DE000000801283B
        if document['kind'] == 'C':
            for kindcode in ('B', 'A'):
                document['kind'] = kindcode
                candidates.append(join_patent(document))

    elif serial < 1400000:
        # e.g. DE000001020931A: no alternatives for this range
        pass

    else:
        # e.g. DE000002363448A
        if document['kind'] == 'A1':
            document['kind'] = 'A'
            candidates.append(join_patent(document))

    return candidates
Ejemplo n.º 3
0
def normalize_patent(number, as_dict=False, as_string=False, fix_kindcode=False, for_ops=True, provider=None):
    """
    Normalize a patent number given either as string or as dictionary.

    - ``number``: document number string, or an already split document
      dictionary
    - ``as_dict``: always return the normalized dictionary
    - ``as_string``: always return the joined string (only considered
      when ``as_dict`` is not set)
    - ``fix_kindcode``: additionally run ``fix_patent_kindcode_ops`` on
      the normalized document
    - ``for_ops`` / ``provider``: when no provider is given and
      ``for_ops`` is ``True``, the provider defaults to ``'ops'``

    Without ``as_dict``/``as_string`` the result has the same form as the
    input: dictionary in, dictionary out; string in, string out.
    """

    if for_ops is True and provider is None:
        provider = 'ops'

    # Remember the input form, it drives the default result form
    given_as_dict = isinstance(number, dict)

    # Work on the split representation
    document = number if given_as_dict else split_patent_number(number)

    # Normalize, then optionally apply kindcode fixes
    normalized = patch_patent(document, provider=provider)
    if fix_kindcode:
        fix_patent_kindcode_ops(normalized)

    # Default: mirror the input form
    mirrored = normalized if given_as_dict else join_patent(normalized)

    # Explicit requests take precedence, "as_dict" before "as_string"
    if as_dict:
        return normalized
    if as_string:
        return join_patent(normalized)
    return mirrored
Ejemplo n.º 4
0
def get_drawing_png(document, page, kind):
    """
    Fetch a drawing for ``document`` and return it converted by ``to_png``.

    The image is requested from OPS in TIFF format first. When OPS signals
    ``HTTPNotFound``, documents starting with ``US`` are looked up through
    ``get_uspto_image_cached`` and documents starting with ``CA`` through
    ``get_cipo_image_cached``; for all other documents the OPS exception
    is passed through.

    Raises ``HTTPNotFound`` when no image payload could be obtained.
    """

    # Try OPS first, fall back to other patent offices
    try:
        image = get_ops_image(document, page, kind, 'tiff')

    except HTTPNotFound:

        prefix_source = document.upper()

        if prefix_source.startswith('US'):
            # Fallback to USPTO (U.S.)
            uspto_id = normalize_patent(split_patent_number(document), for_ops=False)
            try:
                image = get_uspto_image_cached(uspto_id)
            except PayloadEmpty:
                raise HTTPNotFound('No drawing for "{0}" at OPS or USPTO'.format(document))

        elif prefix_source.startswith('CA'):
            # Fallback to CIPO (Canada)
            cipo_id = split_patent_number(document)
            try:
                image = get_cipo_image_cached(cipo_id)
            except PayloadEmpty:
                raise HTTPNotFound('No drawing for "{0}" at OPS or CIPO'.format(document))

        else:
            # No fallback available: pass the OPS exception through
            raise

    # Croak if the chosen source delivered an empty payload
    if not image:
        msg = 'No image available for document={document}, kind={kind}, page={page}'.format(
            document=document, kind=kind, page=page)
        log.warn(msg)
        raise HTTPNotFound(msg)

    # Convert the image payload to PNG format
    return to_png(BytesIO(image))
Ejemplo n.º 5
0
def test_denormalization():
    """
    Print a two-column table of sample WO numbers and the outcome of
    running them through ``denormalize_patent``.

    Lines starting with ``---`` in the sample payload are echoed verbatim
    as separators; empty lines are skipped.
    """

    payload = """
WO2002051230
WO2002051231
WO2006113621A3
WO1998016331A3
WO2000001014A1
WO2001002000A3
WO1999012345
WO1999123456
WO2001012345
WO2001098623A1
WO2001098623A1
WO2001098623A1
WO2001098623A1
WO2003107732
WO2003107732
WO2004000001
WO1999013800
WO1999023997
WO1990004917
WO2000027301
WO2000000748
WO2003043359
WO2003107520
WO2007054055
---
WO1990004917
"""

    ruler = "-" * 30
    print(ruler)
    print("original\tdenormalized")
    print(ruler)

    for line in payload.split("\n"):

        # Skip the empty leading/trailing fragments of the payload
        if not line:
            continue

        # Separator lines are passed through unchanged
        if line.startswith('---'):
            print(line)
            continue

        document = split_patent_number(line)
        denormalized = join_patent(denormalize_patent(document))
        print("%s\t%s" % (line, denormalized))
Ejemplo n.º 6
0
def ops_register(reference_type, document_number, constituents=None, xml=False):
    """
    Request register information from OPS.

    reference_type = publication|application|priority
    constituents   = defaults to "biblio,legal" when not given
    xml            = forwarded to ``ops_client``

    The document is addressed in Epodoc form, i.e. country and number
    concatenated, plus the kind code.

    Examples:
    - http://ops.epo.org/3.1/rest-services/register/publication/epodoc/EP2485810/biblio
    - http://ops.epo.org/3.1/rest-services/register/publication/epodoc/EP2485810/biblio,legal.json
    """

    requested = 'biblio,legal' if constituents is None else constituents

    # Build the Epodoc identifier from the split document number.
    parts = split_patent_number(document_number)
    epodoc_number = parts.country + parts.number
    ops_id = epo_ops.models.Epodoc(epodoc_number, parts.kind)

    # Ask OPS and hand the response over to the generic response handler.
    with ops_client(xml=xml) as ops:
        response = ops.register(reference_type, ops_id, constituents=to_list(requested))
        return handle_response(response, 'ops-register')
Ejemplo n.º 7
0
def ops_family_inpadoc(reference_type, document_number, constituents, xml=False):
    """
    Request family information from OPS.

    reference_type = publication|application|priority
    constituents   = biblio|legal
    xml            = forwarded to ``ops_client``

    The document is addressed in Epodoc form, i.e. country and number
    concatenated, plus the kind code.

    Examples:
    - http://ops.epo.org/3.1/rest-services/family/publication/docdb/EP.1491501.A1/biblio,legal
    - http://ops.epo.org/3.1/rest-services/family/publication/docdb/EP0666666/biblio
    - http://ops.epo.org/3.1/rest-services/family/publication/docdb/EP0666666.A2/biblio
    - http://ops.epo.org/3.1/rest-services/family/publication/docdb/EP0666666.B1/biblio

    """

    # Build the Epodoc identifier from the split document number.
    parts = split_patent_number(document_number)
    epodoc_number = parts.country + parts.number
    ops_id = epo_ops.models.Epodoc(epodoc_number, parts.kind)

    # Ask OPS and hand the response over to the generic response handler.
    with ops_client(xml=xml) as ops:
        response = ops.family(reference_type, ops_id, constituents=to_list(constituents))
        return handle_response(response, 'ops-family')
Ejemplo n.º 8
0
def ops_family_publication_docdb_xml(reference_type, document_number, constituents):
    """
    Request family information from OPS in XML format.

    reference_type = publication|application|priority
    constituents   = biblio|legal

    The document is addressed in Docdb form (number, country, kind).

    The client obtained from ``get_ops_client`` is temporarily switched to
    ``application/xml`` and is always switched back to ``application/json``
    afterwards, even when the request itself raises.

    Examples:
    - http://ops.epo.org/3.1/rest-services/family/publication/docdb/EP.1491501.A1/biblio,legal
    """

    # Compute document identifier.
    document_id = split_patent_number(document_number)
    ops_id = epo_ops.models.Docdb(document_id.number, document_id.country, document_id.kind)

    # Acquire family information from OPS.
    ops = get_ops_client()

    # FIXME: Better use "accept_type" on a per-request basis supported by ``python-epo-ops-client``.
    ops.accept_type = 'application/xml'
    try:
        response = ops.family(reference_type, ops_id, constituents=to_list(constituents))
    finally:
        # Restore the JSON accept type unconditionally; otherwise a failed
        # request would leave the client object in XML mode.
        ops.accept_type = 'application/json'

    return handle_response(response, 'ops-family')
Ejemplo n.º 9
0
def normalize_patent_wo_pct(patent):
    """
    Normalize a WO document to "WIPO Application Number" format,
    e.g. PCT/US2005/009417.

    Accepts inputs like WOPCT/US02/03226, PCT/US1999/9417 or
    WOEP/2004/008531. Numbers which actually carry a WO publication
    number (e.g. "WOPCT/WO9831467" or "WOPCT-WO97/29690") are delegated
    to ``normalize_patent_wo``.

    Returns a modified shallow copy of ``patent``, or ``None`` when the
    number does not consist of three segments.

    see "International Application No.":
    http://www.wipo.int/pctdb/en/wo.jsp?IA=PCT/US2005/009417
    http://www.wipo.int/pctdb/en/wo.jsp?IA=US2005009417

    see also:
    http://www.wipo.int/edocs/pctdocs/en/2005/pct_2005_42-section3.pdf
    """

    assert patent['country'] == 'WO'

    result = copy(patent)

    # Segments are separated by "/", "|" or "-"
    segments = re.split(r'[\/|-]', result['number'])

    # Two segments like "WOPCT/WO9831467": second one is a WO publication number
    if len(segments) == 2 and segments[1].startswith('WO'):
        return normalize_patent_wo(split_patent_number(segments[1]))

    # Anything else must have exactly three segments
    if len(segments) != 3:
        return None

    prefix, country_year, serial = segments

    # Three segments like "WOPCT-WO97/29690": also a WO publication number
    if country_year.startswith('WO'):
        return normalize_patent_wo(split_patent_number(country_year + serial))

    # Formatting like "WOEP/2004/008531": move the country in front of the year
    if len(prefix) == 4 and prefix.startswith('WO'):
        country_year = prefix[2:4] + country_year

    # Something like "EP02": expand the two-digit year to a full year
    if len(country_year) == 4:
        country_year = country_year[0:2] + fullyear_from_year(country_year[2:])

    # Sequential number is padded to six digits with leading zeros
    serial = pad_left(serial, '0', 6)

    # The country moves into the number, so blank the dedicated field
    result['country'] = ''
    result['number'] = '%s/%s/%s' % (prefix, country_year, serial)

    return result
Ejemplo n.º 10
0
    images_index_url = None
    for anchor in anchors:
        if "Drawings" in str(anchor):
            images_index_url = cipo_baseurl + anchor['href']
            break

    if not images_index_url:
        return


    # 2. fetch and parse images index page
    images_index_html = fetch_images_index(images_index_url)
    soup = BeautifulSoup(images_index_html)
    # <img src="/opic-cipo/cpd/page/141597_20130713_drawings_page1_scale25_rotate0.gif?page=3&amp;section=drawings&amp;scale=25&amp;rotation=0&amp;type=" alt="Canadian Patent Document 141597. Drawings page. Image 1 of 3" />
    first_drawing_url = cipo_baseurl + soup.find('img', src=re.compile(ur'/opic-cipo/cpd/page'))['src']

    return first_drawing_url

if __name__ == '__main__':

    # Manual smoke run: look up the first drawing for each sample number
    # and print the outcome, or "not found" when nothing came back.
    samples = [
        'CA141597A'
    ]
    for sample in samples:
        outcome = fetch_first_drawing(split_patent_number(sample))
        print(outcome if outcome else "not found")
Ejemplo n.º 11
0
 def generate(self, data):
     """
     Yield ``(number, expected, computed)`` triples for each entry of
     ``data``, where ``computed`` is the outcome of running
     ``split_patent_number`` on the key.
     """
     for raw_number, expected in data.iteritems():
         computed = split_patent_number(raw_number)
         yield raw_number, expected, computed
Ejemplo n.º 12
0
 def compute(self):
     """Split ``self.original`` and store the outcome as ``self.document``."""
     parsed = split_patent_number(self.original)
     self.document = parsed
Ejemplo n.º 13
0
    def pair_to_elasticsearch(cls, key, value, modifiers=None):
        """
        Translate a single search criterion (``key``/``value`` pair) into a
        query expression for the upstream datasource.

        The upstream field name is looked up in ``cls.datasource_indexnames``.
        Depending on ``key``, the value is rewritten first (patent numbers,
        publication dates, names, classifications, countries) and finally
        serialized.

        :param key:       name of the search criterion
        :param value:     criterion value as entered by the user
        :param modifiers: accepted, but not used by this method
        :return: ``None`` if ``key`` has no entry in
                 ``cls.datasource_indexnames``;
                 ``{'error': True, 'message': ...}`` if a date or class
                 expression could not be processed;
                 ``{'query': expression}`` otherwise
        """

        # Criteria without a field mapping do not contribute to the query
        try:
            fieldname = cls.datasource_indexnames[key]
        except KeyError:
            return

        # "expression" stays None unless a branch below builds the complete
        # expression itself; "format" is the template handed to
        # "format_expression" and "rewrite_classes_lucene"
        expression = None
        format = u'{0}:{1}'

        # ------------------------------------------
        #   value mogrifiers
        # ------------------------------------------
        if key == 'patentnumber':

            # Transform into distinct fields PC, DE, KI

            #if has_booleans(value):
            #    value = '({})'.format(value)

            expression_parts = []

            # Publication number
            patent = split_patent_number(value)

            # Prefer the normalized form, keep the plain split result
            # if normalization yields nothing
            patent_normalized = normalize_patent(patent, for_ops=False)
            if patent_normalized:
                patent = patent_normalized

            if patent:
                subexpression = u'PC:{country} AND DE:{number}'.format(
                    **patent)
                if patent['kind']:
                    subexpression += u' AND KI:{kind}'.format(**patent)
                expression_parts.append(u'({})'.format(subexpression))

            # Application number
            subexpression = u'AN:{}'.format(value)
            expression_parts.append(subexpression)
            expression = u' OR '.join(expression_parts)

            # Priority number
            # The join is repeated here, so the final expression
            # contains all parts collected so far
            subexpression = u'NP:{}'.format(value)
            expression_parts.append(subexpression)
            expression = u' OR '.join(expression_parts)

        elif key == 'pubdate':
            """
            - DP:[19800101 TO 19851231]
            - DP:[* TO 19601231]
            """

            try:

                # NOTE: "parsed" is never set to True within this block,
                # so the "elif not parsed" branch below acts as plain "else"
                parsed = False

                # e.g. 1991
                if len(value) == 4 and value.isdigit():
                    value = u'within {}0101,{}1231'.format(value, value)

                # e.g. 1990-2014, 1990 - 2014
                value = year_range_to_within(value)

                # e.g.
                # within 1978,1986
                # within 1900,2009-08-20
                # within 2009-08-20,2011-03-03
                if 'within' in value:
                    within_dates = parse_date_within(value)

                    # Four-character start dates are treated as a year and
                    # expanded to January 1st; a missing start becomes "*"
                    if within_dates['startdate']:
                        if len(within_dates['startdate']) == 4:
                            within_dates['startdate'] += '0101'
                        within_dates['startdate'] = parse_date_universal(
                            within_dates['startdate']).format('YYYYMMDD')
                    else:
                        within_dates['startdate'] = '*'

                    # Four-character end dates are treated as a year and
                    # expanded to December 31st; a missing end becomes "*"
                    if within_dates['enddate']:
                        if len(within_dates['enddate']) == 4:
                            within_dates['enddate'] += '1231'
                        within_dates['enddate'] = parse_date_universal(
                            within_dates['enddate']).format('YYYYMMDD')
                    else:
                        within_dates['enddate'] = '*'

                    expression = '{fieldname}:[{startdate} TO {enddate}]'.format(
                        fieldname=fieldname, **within_dates)

                # Single date: only "value" is rewritten here, the expression
                # itself is built by the generic formatter further below
                elif not parsed:
                    value_date = parse_date_universal(value)
                    if value_date:
                        value = value_date.format('YYYYMMDD')
                    else:
                        raise ValueError(value)

            # Any failure while processing the date is reported to the caller
            # as error result; "value" in the message may already have been
            # rewritten by the steps above
            except Exception as ex:
                message = 'depatech query: Invalid date or range expression "{0}". Reason: {1}.'.format(
                    value, ex)
                logger.warn(
                    message +
                    ' Exception was: {0}'.format(_exception_traceback()))
                return {'error': True, 'message': message}

        elif key == 'inventor' or key == 'applicant':
            # Quote the name, unless it contains boolean operators
            if not has_booleans(value) and should_be_quoted(value):
                value = u'"{0}"'.format(value)

        elif key == 'class':

            # v1: Naive implementation can only handle single values
            #value = lucene_convert_class(value)

            # v2: Advanced implementation can handle expressions on field "class"
            # Translate class expression from "H04L12/433 or H04L12/24"
            # to "(ic:H04L0012433 OR cpc:H04L0012433) OR (ic:H04L001224 OR cpc:H04L001224)"
            try:

                # Put value into parenthesis, to properly capture expressions
                if value:
                    value = u'({value})'.format(value=value)

                # Parse value as simple query expression
                query_object = CQL(cql=value)

                # Rewrite all patent classifications in query expression ast from OPS format to Lucene format
                rewrite_classes_lucene(query_object, format, fieldname)

                # Serialize into appropriate upstream datasource query expression syntax
                expression = query_object.dumps()

            # Parse errors are reported to the caller as error result
            except pyparsing.ParseException as ex:
                return {
                    'error': True,
                    'message': '<pre>' + str(ex.explanation) + '</pre>'
                }

        elif key == 'country':
            value = value.upper()

        # ------------------------------------------
        #   surround with parentheses
        # ------------------------------------------
        # Applies to values containing boolean operators which are
        # not meant to be quoted
        if key in ['fulltext', 'inventor', 'applicant', 'country', 'citation']:
            if has_booleans(value) and not should_be_quoted(value):
                value = u'({0})'.format(value)

        # ------------------------------------------
        #   expression formatter
        # ------------------------------------------
        # Serialize into appropriate upstream datasource query expression syntax
        # Only reached with an empty expression if no branch above
        # has built one on its own
        if not expression:
            expression = format_expression(format, fieldname, value)
            #print 'expression:', expression

        # ------------------------------------------
        #   final polishing
        # ------------------------------------------
        # Solr(?) syntax: boolean operators must be uppercase
        # Only space-delimited lowercase " or ", " and ", " not " are replaced
        if has_booleans(expression):
            boolis = [' or ', ' and ', ' not ']
            for booli in boolis:
                expression = expression.replace(booli, booli.upper())

        return {'query': expression}