Exemple #1
0
def denormalize_patent_wo(patent):
    """
    Shortens a WO publication number towards the "2+5" layout
    (2-digit year + 5-digit sequence number).

    Operates on a shallow copy of ``patent``; the input dict is not modified.
    The layout of ``patent['number']`` is chosen by its length:

    - 10 characters: 4-digit year + 6-digit sequence number
    - 9 characters:  treated as 4-digit year + 5-digit sequence number
    - 8 characters:  2-digit year + 6-digit sequence number

    For every case except years 19xx the actual number is built by
    ``denormalize_patent_wo_algo(fullyear, seqnumber)``.

    Numbers of any other length, and 10-character numbers whose year starts
    with neither "19" nor "20", are returned unchanged (as a copy).
    """
    assert patent['country'] == 'WO'

    result = patent.copy()
    number = result['number']
    digits = len(number)

    if digits == 10:
        # 4+6 layout
        year4, serial = number[:4], number[4:]
        prefix = year4[:2]

        if prefix == '19':
            # drop the century, strip the serial's leading zeros and
            # re-pad it to five digits
            serial = str(int(serial))
            result['number'] = year4[2:4] + pad_left(serial, '0', 5)
        elif prefix == '20':
            result['number'] = denormalize_patent_wo_algo(int(year4), int(serial))

    elif digits == 8:
        # 2+6 layout: expand the year first, then delegate
        year4 = fullyear_from_year(number[:2])
        result['number'] = denormalize_patent_wo_algo(int(year4), int(number[2:]))

    elif digits == 9:
        # wrong format: assume 4+5
        result['number'] = denormalize_patent_wo_algo(int(number[:4]), int(number[4:]))

    return result
def normalize_patent_wo_pct(patent):
    """
    Normalizes to "WIPO Application Number" format, e.g. PCT/US2005/009417
    Takes inputs like WOPCT/US02/03226, PCT/US1999/9417 or WOEP/2004/008531

    see "International Application No.":
    http://www.wipo.int/pctdb/en/wo.jsp?IA=PCT/US2005/009417
    http://www.wipo.int/pctdb/en/wo.jsp?IA=US2005009417

    see also:
    http://www.wipo.int/edocs/pctdocs/en/2005/pct_2005_42-section3.pdf

    The input dict is never modified. Return value:

    - a copy of ``patent`` with ``country`` set to '' and ``number`` rebuilt
      as "<first segment>/<country + 4-digit year>/<6-digit seqnumber>"
      when the number splits into exactly three segments;
    - the result of ``normalize_patent_wo`` when the number embeds a WO
      publication number (e.g. "WOPCT/WO9831467", "WOPCT-WO97/29690");
    - ``None`` when the number does not have three segments.
    """

    assert patent['country'] == 'WO'

    patched = copy(patent)

    # Segments may be separated by "/", "|" or "-" (inside a character class
    # the pipe is a literal, and the original pattern accepted it too).
    # Raw string: the former non-raw '\/' was an invalid escape sequence.
    parts = re.split(r'[/|-]', patched['number'])

    # handle special formatting like "WOPCT/WO9831467": convert to WO publication number
    if len(parts) == 2:
        patent_number = parts[1]
        if patent_number.startswith('WO'):
            wo_patent = split_patent_number(patent_number)
            return normalize_patent_wo(wo_patent)

    # only allow numbers containing three segments
    if len(parts) != 3:
        return None

    # assign segment names
    pct, country_year, seqnumber = parts

    # handle special formatting like "WOPCT-WO97/29690": convert to WO publication number
    if country_year.startswith('WO'):
        wo_patent = split_patent_number(country_year + seqnumber)
        return normalize_patent_wo(wo_patent)

    # handle special formatting like "WOEP/2004/008531":
    # the country code sits in the first segment, move it in front of the year
    if pct.startswith('WO') and len(pct) == 4:
        country_year = pct[2:4] + country_year

    # assume s.th. like "EP02": expand (2-digit) year to full year
    if len(country_year) == 4:
        fullyear = fullyear_from_year(country_year[2:])
        country_year = country_year[0:2] + fullyear

    # pad sequential number to six digits with leading zeros
    seqnumber = pad_left(seqnumber, '0', 6)

    # the country is now part of the number itself
    patched['country'] = ''
    patched['number'] = ('%s/%s/%s' % (pct, country_year, seqnumber))

    return patched
def normalize_patent_wo(patent):
    """
    Normalizes to "WIPO Publication Number" format, e.g. WO2005092324

    see "Pub. No.":
    http://www.wipo.int/pctdb/en/wo.jsp?IA=WO/2005/092324
    http://www.wipo.int/pctdb/en/wo.jsp?IA=WO0067640

    Returns a copy of ``patent``; the input dict is not modified.
    ``number`` is rewritten depending on its length:

    - 7 or 8 characters (2+5 or 2+6): the 2-digit year is expanded via
      ``fullyear_from_year`` and the sequence number padded to six digits;
    - 9 characters (4+5, wrong format): the sequence number is padded to
      six digits;
    - any other length: left as is.

    Afterwards the number is passed through ``trim_leading_zeros``.
    Numbers starting with a non-digit prefix are returned untouched.
    """

    assert patent['country'] == 'WO'

    patched = copy(patent)
    number = patched['number']

    # filter: leave special documents untouched (with alphanumeric prefix)
    # Raw string: in a non-raw literal "\D" is an invalid escape sequence.
    if re.match(r'^\D+', number):
        return patched

    length = len(number)

    # convert from 2+5 or 2+6 to 4+6
    if length in (7, 8):

        year = number[0:2]
        seqnumber = number[2:]

        # assume for century: 78-99 => 19, otherwise => 20
        # build fullyear from (2-digit) year
        fullyear = fullyear_from_year(year)

        # TODO: 8-character numbers may actually be "1 zero + 2 year + 5 seqnumber"
        # (wrong format due to "pad everything to 8 characters" logic of the
        # ordering system) and are then ambiguous, e.g. WO00101000A1 could be
        # WO2000101000A1 or WO2001001000A1. This case is not handled; the
        # number is always decoded as 2+6.

        # pad sequential number to 6 characters
        patched['number'] = fullyear + pad_left(seqnumber, '0', 6)

    # convert from 4+5 to 4+6 (wrong format)
    elif length == 9:
        fullyear = number[0:4]
        seqnumber = number[4:]

        # pad sequential number to 6 characters
        patched['number'] = fullyear + pad_left(seqnumber, '0', 6)

    patched['number'] = trim_leading_zeros(patched['number'])
    return patched