def normalize_patent_au(patent):
    """
    Bring an Australian ("AU") document number into its canonical form,
    e.g. AU2003212220A1, AU200042655B2, AU00784257B2

    Patent Application Number:
      old: 4+5 digits (Patadmin, before 5 July 2002)
      new: 4+6 digits (PAMS, after 5 July 2002)
      http://apa.hpa.com.au:8080/ipapa/intro
      http://pericles.ipaustralia.gov.au/aub/aub_pages_1.intro

    Patent Number:
      6 digits
      http://pericles.ipaustralia.gov.au/aub/aub_pages_1.intro

    Expects a dictionary carrying ``country == 'AU'`` and a ``number``
    string. The input is left alone; a copy with the rewritten ``number``
    is returned. An ``AssertionError`` is raised for any other country.
    """

    assert patent['country'] == 'AU'

    result = copy(patent)
    number = result['number']

    # Old-style application number (4+5): keep the four year characters and
    # widen the sequential part to six characters (old to new format).
    if len(number) == 9:
        year_part, serial_part = number[:4], number[4:]
        result['number'] = year_part + pad_left(serial_part, '0', 6)
        return result

    # Anything else: drop leading zeros, then make sure at least
    # six characters remain by padding with zeros again.
    number = trim_leading_zeros(number)
    if len(number) < 6:
        number = pad_left(number, '0', 6)
    result['number'] = number

    return result
def normalize_patent_wo(patent):
    """
    Normalizes to "WIPO Publication Number" format, e.g. WO2005092324

    see "Pub. No.":
    http://www.wipo.int/pctdb/en/wo.jsp?IA=WO/2005/092324
    http://www.wipo.int/pctdb/en/wo.jsp?IA=WO0067640

    Expects a dictionary with ``country == 'WO'`` and a ``number`` string.
    The input is not modified; a copy with the rewritten ``number`` is
    returned. Rules, in order:

    - Numbers starting with a non-digit character ("special documents")
      are returned untouched.
    - 7 or 8 characters (2+5 or 2+6): the two-digit year is expanded through
      ``fullyear_from_year`` and the sequential part is padded to 6 characters.
    - 9 characters (4+5): the sequential part is padded to 6 characters.
    - Finally, the number is passed through ``trim_leading_zeros``.

    Raises ``AssertionError`` when the country is not 'WO'.
    """

    assert patent['country'] == 'WO'

    patched = copy(patent)
    number = patched['number']

    # filter: leave special documents untouched (with alphanumeric prefix)
    # The pattern is a raw string, so "\D" is not an invalid string escape.
    if re.match(r'^\D+', number):
        return patched

    length = len(number)

    # convert from 2+5 or 2+6 to 4+6
    if length in (7, 8):

        year = number[0:2]
        seqnumber = number[2:]

        # assume for century: 78-99 => 19, otherwise => 20
        # build fullyear from (2-digit) year
        fullyear = fullyear_from_year(year)

        # NOTE: 8-character numbers with a leading zero are ambiguous and are
        # deliberately decoded as 2+6 here. A number like WO00101000A1 could be
        # WO2000101000A1 or WO2001001000A1 (1 zero + 2 year + 5 seqnumber, an
        # artifact of "pad everything to 8 characters" logic in an upstream
        # ordering system). An alternative decoding which stripped the first
        # zero for the years 2000-2003 has been disabled.
        # TODO: what about WO09802618A2, WO00202618A2, WO00402618A2,
        #       WO09201000A1, WO09901000A3, WO00101000A1?

        # pad sequential number to 6 characters
        number = fullyear + pad_left(seqnumber, '0', 6)

    # convert from 4+5 to 4+6 (wrong format)
    elif length == 9:
        fullyear = number[0:4]
        seqnumber = number[4:]

        # pad sequential number to 6 characters
        number = fullyear + pad_left(seqnumber, '0', 6)

    patched['number'] = trim_leading_zeros(number)
    return patched
def normalize_patent_us(patent, provider=None):
    """
    Normalize a US document number, depending on the target data provider.

    USPTO number formats

    PATFT - Issued Patents:
    http://patft.uspto.gov/netahtml/PTO/srchnum.htm

      Utility                           --   5,146,634 6923014 0000001
      Design                            --   D339,456 D321987 D000152
      Plant                             --   PP08,901 PP07514 PP00003
      Reissue                           --   RE35,312 RE12345 RE00007
      Defensive Publication             --   T109,201 T855019 T100001
      Statutory Invention Registration  --   H001,523 H001234 H000001
      Re-examination                    --   RX12
      Additional Improvement            --   AI00,002 AI000318 AI00007

    AppFT - Patent Applications:
    http://appft.uspto.gov/netahtml/PTO/srchnum.html

      Utility: 20010000044

    Arguments:
      patent:   Dictionary with ``country == 'US'`` and a ``number`` string.
                The optional keys ``number-type`` (character prefix),
                ``number-real`` (number without prefix) and ``kind`` are
                honored when present.
      provider: ``'ops'`` and ``'espacenet'`` select the compact format
                (application publication numbers as 4+6=10, leading zeros
                stripped). Any other value selects the padded format
                (application publication numbers as 4+7=11).

    Returns a copy of ``patent`` with the rewritten ``number``; the input
    dictionary is not modified.

    Raises ``AssertionError`` when the country is not 'US'.
    """

    subtype_prefixes = ('D', 'PP', 'RD', 'RE', 'T', 'H', 'AI')

    assert patent['country'] == 'US'

    patched = copy(patent)
    number = patched['number']
    length = len(number)

    # Prefix handling requires both the prefix and the bare number.
    has_subtype = 'number-type' in patched and 'number-real' in patched

    if provider in ('ops', 'espacenet'):

        # OPS accepts US patent application publication numbers in 4+6=10 format
        # Examples: US2015322651A1, US2017250417A1, US2017285092A1

        # 2017-10-25
        # DEPATISnet started delivering application publication numbers in 5+7 format
        # with a leading zero after the country, e.g. US000006166174A, US020170285092A1
        # around October 2017. Account for that.
        if length == 12:
            number = number.lstrip('0')
            length = len(number)

        # US application publication numbers: Convert from 4+5=9 to 4+6=10
        if length == 9:
            number = number[0:4] + '0' + number[4:]

        # US application publication numbers: Convert from 4+7=11 to 4+6=10
        # 2015-12-20: Normalize responses from SIP like "US20150322651A1" to "US2015322651A1"
        elif length == 11:
            if number[4] == '0':
                number = number[0:4] + number[5:]

        # US patents: Handle document numbers with character prefixes
        # Trim leading zeros for OPS
        elif has_subtype:
            subtype = patched['number-type']
            seqnumber = patched['number-real']
            if subtype in subtype_prefixes:
                number = subtype + trim_leading_zeros(seqnumber)

        # US patents: Strip leading zeros
        else:
            number = number.lstrip('0')

    else:

        # US patents: Handle document numbers with character prefixes
        # Zero-fill the number following the prefix to 7 digits.
        # NOTE(review): an earlier comment spoke of a "total length of 7
        # characters", but the prefix is not counted by zfill(7) - confirm
        # which one is intended.
        if has_subtype:
            subtype = patched['number-type']
            seqnumber = patched['number-real']
            if subtype in subtype_prefixes:
                number = subtype + seqnumber.zfill(7)

        # Convert from 4+5=9 or 4+6=10 to 4+7=11
        # US20170000054A1
        elif length in (9, 10):
            number = number[0:4] + '0' * (11 - length) + number[4:]

    # 2018-04-23: Espacenet changed behavior, handle edge case for
    # USD813591S to yield https://worldwide.espacenet.com/publicationDetails/claims?CC=US&NR=D813591S&KC=S
    # Use .get() in order not to fail on documents lacking a kind code.
    if provider == 'espacenet':
        if patched.get('number-type') == 'D' and patched.get('kind') == 'S':
            number += patched['kind']

    patched['number'] = number
    return patched
def patch_patent(patent, provider=None):
    """
    Apply country-specific normalization rules to a decoded document number.

    Arguments:
      patent:   Dictionary with at least the keys ``country`` and ``number``.
      provider: Only forwarded to ``normalize_patent_us``.

    Returns ``None`` when ``patent`` is empty or ``None``. Otherwise, a copy
    of ``patent`` (or the result of a country-specific normalizer) with the
    rewritten ``number`` is returned. The input dictionary is not modified
    by this function itself.
    """

    if not patent:
        return None

    # Length of the number as it came in, before any rewriting.
    number_length = len(patent['number'])

    patched = copy(patent)
    country = patched['country']

    # strip leading zeros of *publication* to 6 digits, if seqnumber is longer than 6 digits
    # examples: publication: AT401234; application: AT 967/1994 => AT96794
    # Currently, leading zeros are just trimmed. A former variant which only
    # touched numbers longer than 6 characters without a slash and padded
    # them to 6 characters again is disabled.
    if country == 'AT':
        patched['number'] = trim_leading_zeros(patched['number'])

    # pad to 6 characters with leading zeros
    elif country == 'AR':
        patched['number'] = patched['number'].lstrip('0').rjust(6, '0')

    elif country == 'AU':
        patched = normalize_patent_au(patched)

    elif country == 'BR':
        patched['number'] = patched['number'].lstrip('0')

    # strip leading zeros
    # The exception for kindcode == T1 ("Publication of the European patent
    # application"), which padded to 7 digits like EP, is disabled.
    elif country == 'DE':
        patched['number'] = trim_leading_zeros(patched['number'])

    # The Eurasian Patent Organization (EAPO)
    # Pad to 6 characters with leading zeros
    # Numbers with 9 or more characters fall through to the default rule.
    elif country == 'EA' and number_length < 9:
        patched['number'] = trim_leading_zeros(patched['number'])
        patched['number'] = pad_left(patched['number'], '0', 6)

    # pad to 7 characters with leading zeros
    elif country == 'EP':
        patched['number'] = trim_leading_zeros(patched['number'])
        patched['number'] = pad_left(patched['number'], '0', 7)

    elif country == 'GE':
        number = patched['number'].lstrip('0')

        # e.g.
        # GE00U200501210Y = GEU20051210Y
        # GE00P200503700B = GEP20053700B
        # Drop the zero following the four year digits. The length guard
        # keeps short numbers from raising an IndexError.
        if len(number) > 5 and number[5] == '0':
            number = number[:5] + number[6:]

        patched['number'] = number

    elif country == 'IT':
        patched['number'] = patched['number'].lstrip('0')
        patched = normalize_patent_it(patched)

    # 2017-09-06: KR numbers
    # e.g. KR1020150124192A => KR20150124192A
    elif country == 'KR':
        patched['number'] = trim_leading_zeros(patched['number'])
        if len(patched['number']) > 11 and patched['number'][:2] == '10':
            patched['number'] = patched['number'][2:]

    # 2009-11-09: JP numbers
    elif country == 'JP':
        patched = normalize_patent_jp(patched)

    # 2015-09-01: SE numbers
    elif country == 'SE':
        patched = normalize_patent_se(patched)
        patched['number'] = trim_leading_zeros(patched['number'])

    # 2007-07-26: US applications are 4+7
    elif country == 'US':
        patched = normalize_patent_us(patched, provider=provider)

    # normalize wo numbers to 4+6 format
    elif country == 'WO':
        # WOPCT/US86/01765 or WOEP/2004/008531
        if patched['number'].startswith('PCT'):
            patched = normalize_patent_wo_pct(patched)
        else:
            patched = normalize_patent_wo(patched)

    # strip leading zeros
    else:
        patched['number'] = trim_leading_zeros(patched['number'])

    return patched