def _family_members_from(response):
    """Return the family members carried by an OPS family *response* as a list."""
    data = response.json()
    members = data[u'ops:world-patent-data'][u'ops:patent-family'][u'ops:family-member']
    # A lone member arrives as a bare dict; wrap it so callers can iterate.
    if isinstance(members, dict):
        members = [members]
    return members


def _member_label(donnee):
    """Return the publication doc-number of a family member, or None if absent."""
    paths = (
        (u'exchange-document', u'bibliographic-data', u'publication-reference',
         u'document-id', 1, u'doc-number', u'$'),
        (u'publication-reference', u'document-id', 1, u'doc-number', u'$'),
    )
    for path in paths:
        node = donnee
        try:
            for step in path:
                node = node[step]
        except (KeyError, IndexError, TypeError):
            continue
        return node
    return None


def _merge_classification_fields(record, fields):
    """Merge every value of *fields* into list-valued entries of *record*.

    Values already present are left alone.  The previous inline code reset the
    accumulated list whenever a value was already known, losing earlier codes.
    """
    for key in fields.keys():
        current = record.get(key)
        if isinstance(current, list):
            merged = current
        elif current is None or current == '':
            merged = []
        else:
            merged = [current]
        value = fields[key]
        for item in (value if isinstance(value, list) else [value]):
            if item not in merged:
                merged.append(item)
        record[key] = merged


def _dash_joined(value):
    """Return *value* as a '-'-separated string ('' when missing)."""
    if value is None:
        return ''
    if isinstance(value, (list, tuple)):
        return '-'.join(value)
    return value


def _citation_count(donnee):
    """Return the number of cited references of a family member (0 if none)."""
    try:
        citations = donnee[u'exchange-document'][u'bibliographic-data'][
            u'references-cited'][u'citation']
        # NOTE(review): a single citation is assumed to be serialised as a
        # dict, like single family members are -- confirm against OPS output.
        if isinstance(citations, dict):
            return 1
        return len(citations)
    except (KeyError, IndexError, TypeError):
        return 0


def GetFamilly(client, brev, rep):
    """Collect one bibliographic record per member of the family of *brev*.

    The family is requested through ``client.family`` using the Epodoc form of
    ``brev['label']`` first, then a Docdb form built from the label and
    ``brev['portee']``.  For every member with a label, the title, parties,
    country, kind, classification fields, date and citation count are
    gathered; abstracts are written with ``EcritContenu`` under
    ``rep + '//FamiliesAbstracts//'``.  Members holding a ``None`` value are
    skipped.

    Args:
        client: object exposing ``family(...)`` whose result has ``.json()``.
        brev: dict with at least ``'label'`` (and ``'portee'`` for the
            Docdb fallback).
        rep: base directory for the abstract files.

    Returns:
        A list of record dicts, each carrying a ``'prior'`` label, or ``None``
        when both family requests fail.
    """
    from OPS2NetUtils2 import ExtractClassificationSimple2, SeparateCountryField, ExtractAbstract, UniClean
    import epo_ops
    import datetime

    lstres = []

    # ``except Exception`` (not a bare except) so Ctrl-C still interrupts.
    try:
        dico = _family_members_from(
            client.family('publication', epo_ops.models.Epodoc(brev['label']), 'biblio'))
    except Exception:
        try:
            dico = _family_members_from(
                client.family('publication',
                              epo_ops.models.Docdb(brev['label'][2:], brev['label'][0:2], brev['portee'])))
        except Exception:
            # Single-argument print keeps the Python 2 output unchanged.
            print("nothing found for  %s" % (brev,))
            print("ignoring")
            return None

    for donnee in dico or []:
        label = _member_label(donnee)
        if label is None:
            print("no label ?")
            continue

        # The Extrait* helpers are handed the member wrapped in this envelope.
        Req = {u'ops:world-patent-data': {'ops:biblio-search': {'ops:search-result': {
            u'exchange-documents': donnee}}}}

        PatentData = dict()
        PatentData[u'label'] = label
        PatentData[u'titre'] = UniClean(ExtraitTitleEn(Req))
        PatentData[u'inventeur'] = UniClean(ExtraitParties(Req, 'inventor', 'epodoc'))
        PatentData[u'applicant'] = UniClean(ExtraitParties(Req, 'applicant', 'epodoc'))
        PatentData[u'pays'] = ExtraitCountry(Req)
        PatentData[u'portee'] = ExtraitKind(Req)

        try:
            classification = ExtraitIPCR2(Req)
        except Exception:
            classification = ''
        # Kept in the record while merging and removed afterwards, as before.
        PatentData[u'classification'] = classification
        if isinstance(classification, list):
            # Iterate a snapshot: merging may touch the 'classification' entry.
            for classif in list(classification):
                _merge_classification_fields(PatentData, ExtractClassificationSimple2(classif))
        elif classification != '':
            _merge_classification_fields(PatentData, ExtractClassificationSimple2(classification))
        del PatentData[u'classification']

        PatentData = SeparateCountryField(PatentData)

        date = ExtractionDate(Req)  # priority claim first date time
        if date is not None and date != "":
            PatentData[u'date'] = date[0:4] + '-' + date[4:6] + '-' + date[6:]
            PatentData[u'dateDate'] = datetime.date(int(date[0:4]), int(date[4:6]), int(date[6:]))
        else:
            # Same zero-padded YYYY-MM-DD layout as the branch above.
            today = datetime.date.today()
            PatentData[u'dateDate'] = today
            PatentData[u'date'] = today.isoformat()

        # Always present now; it used to be left unset in some cases.
        PatentData[u'citations'] = _citation_count(donnee)

        # NOTE(review): both flags are set to 1 on 'YES', to 0 when the lookup
        # fails, and left unset otherwise.  The 'prior' selection below tests
        # key presence, so this is preserved as is -- confirm the intent.
        try:
            if donnee[u'priority-claim'][u'priority-active-indicator']['$'] == u'YES':
                PatentData['priority-active-indicator'] = 1
        except (KeyError, IndexError, TypeError):
            PatentData['priority-active-indicator'] = 0
        try:
            if donnee[u'application-reference'][u'@is-representative'] == u'YES':
                PatentData['representative'] = 1
        except (KeyError, IndexError, TypeError):
            PatentData[u'representative'] = 0

        PatentData[u'family lenght'] = len(dico)

        # The former "un-nesting" statement was a comparison (``==``) with no
        # effect.  It is dropped rather than made an assignment, so list-valued
        # fields keep the shape callers have always received.

        if None in PatentData.values():
            # Missing values: this member is skipped.  The old code "cleaned"
            # the record here and then discarded it.
            continue

        IRAM = ('**** *Label_' + PatentData[u'label']
                + ' *Country_' + PatentData[u'pays']
                + ' *CIB3_' + _dash_joined(PatentData.get(u'IPCR3'))
                + ' *CIB1_' + _dash_joined(PatentData.get(u'IPCR1'))
                + ' *CIB4_' + _dash_joined(PatentData.get(u'IPCR4'))
                + ' *Date_' + str(PatentData[u'dateDate'].year)
                + ' *Applicant_' + '-'.join(coupeEnMots(str(PatentData[u'applicant']))))

        documents = donnee.get(u'exchange-document', [])
        if not isinstance(documents, list):
            documents = [documents]
        TXT = dict()
        for document in documents:
            if 'abstract' in document:
                txtTemp = ExtractAbstract(document['abstract'])
                for cleLang in txtTemp:
                    if cleLang in TXT:
                        TXT[cleLang] += txtTemp[cleLang]
                    else:
                        TXT[cleLang] = txtTemp[cleLang]
        for lang in TXT.keys():
            EcritContenu(IRAM + ' *Contenu_Abstract \n' + TXT[lang],
                         rep + '//FamiliesAbstracts//' + lang + '-' + PatentData['label'] + '.txt')
        lstres.append(PatentData)

    # 'prior' is the label of the earliest-dated member carrying a
    # 'representative' entry, defaulting to the requested patent.
    prior = brev['label']
    datemin = datetime.date(3000, 1, 1)
    for brevet in lstres:
        if 'representative' in brevet and brevet['dateDate'] < datemin:
            datemin = brevet['dateDate']
            prior = brevet['label']
    for brevet in lstres:
        brevet['prior'] = prior
    return lstres
# NOTE(review): headerless fragment of a larger script; its enclosing
# statements (including the `if` matching the trailing `else:`) are not visible here.
# Visible behaviour: abstracts found in patentCont are grouped per language with
# ExtractAbstract, each language is written through EcritContenu, the language is
# added to Langues and the `abstract` counter is incremented; the `else` path
# rebuilds Langues from the first two characters of the existing file names.
TXT=dict() if isinstance(patentCont[u'ops:world-patent-data'][u'exchange-documents'][u'exchange-document'], list): for tempo in patentCont[u'ops:world-patent-data'][u'exchange-documents'][u'exchange-document']: if tempo.has_key('abstract'): txtTemp = ExtractAbstract(tempo['abstract']) for cleLang in txtTemp: if TXT.has_key(cleLang): TXT[cleLang] += txtTemp[cleLang] else: TXT[cleLang] = txtTemp[cleLang] else: if patentCont[u'ops:world-patent-data'][u'exchange-documents'][u'exchange-document'].has_key('abstract'): TXT = ExtractAbstract(patentCont[u'ops:world-patent-data'][u'exchange-documents'][u'exchange-document'][u'abstract']) for lang in TXT.keys(): EcritContenu(IRAM + ' *Contenu_Abstract \n' + TXT[lang], RepDir+ '//'+ content+'s//'+lang+'-'+ndb+'.txt') Langues.add(lang) abstract +=1 # except: # print "pas glop" # print patentCont else: print "no gather parameter set. Finishing." lstfic = os.listdir(ResultPathContent+'//abstracts/') lang = [fics[0:2] for fics in lstfic] Langues = set(lang) print 'Over the ', len(lstBrevet), ' patents... ' print abstract, " not so empty abstract gathered. See ", ndf.replace('.dump', '')+'/Abstracts/ directory for files'
def _merge_patent_classification(pat, fields):
    """Merge every value of *fields* into list-valued entries of *pat*.

    Values already present are kept.  The previous inline code reset the
    accumulated list whenever a value was already known, losing earlier codes.
    """
    for key in fields.keys():
        current = pat.get(key)
        if isinstance(current, list):
            merged = current
        elif current is None or current == '':
            merged = []
        else:
            merged = [current]
        value = fields[key]
        for item in (value if isinstance(value, list) else [value]):
            if item not in merged:
                merged.append(item)
        pat[key] = merged
        if merged.count(',') > 0:
            print(merged)  # hum, strange state


def _patent_codes_joined(value):
    """Return a list of codes '-'-joined; any other value is returned as is."""
    if isinstance(value, list):
        return '-'.join(value)
    return value


def ExtractPatent(pat, ResultContents, BiblioPatents):
    """Normalise *pat*, write its abstracts and register it in *BiblioPatents*.

    Missing values get placeholders: ``'empty'`` for text fields and today's
    date for ``'date'`` / ``'dateDate'``.  Classification fields are merged
    into the record, one abstract file per language is written with
    ``EcritContenu``, and the record is either merged (``Update``) into the
    entry with the same label or appended to *BiblioPatents*.

    NOTE(review): ``ndb``, ``patentBib`` and ``ResultAbstractPath`` are not
    defined in this function and are presumably module-level globals --
    confirm.  ``ResultContents`` is accepted but not used.

    Args:
        pat: patent record (dict) with at least ``'label'`` and
            ``'classification'``.
        ResultContents: unused here; kept for interface compatibility.
        BiblioPatents: list of already gathered records; modified in place.

    Returns:
        ``(pat, labels, BiblioPatents)``, where ``labels`` lists the labels
        registered so far.  Because every ``None`` value is now replaced by a
        placeholder, the former ``(None, labels, BiblioPatents)`` result for
        incomplete records no longer occurs.
    """
    import datetime

    DejaLa = [known['label'] for known in BiblioPatents]

    today = datetime.date.today()
    for cle in ['inventeur', 'applicant', 'titre']:
        if pat.get(cle) is None:
            pat[cle] = 'empty'
    if pat.get('date') is None:
        pat['date'] = today.isoformat()
    if pat.get('dateDate') is None:
        # Must be a date object (it was an int year): ``.year`` is read below.
        pat['dateDate'] = today
    for cle in [key for key in pat.keys() if pat[key] is None]:
        # This used to write to ``bre``, the variable leaked by the list
        # comprehension above, i.e. into an unrelated BiblioPatents entry.
        pat[cle] = u'empty'

    classification = pat['classification']
    if not isinstance(classification, list):
        classification = [classification]
    # Iterate a snapshot: merging may touch pat['classification'] itself.
    for classif in list(classification):
        _merge_patent_classification(pat, ExtractClassificationSimple2(classif))

    pat = SeparateCountryField(pat)
    for clekey in list(pat.keys()):
        if isinstance(pat[clekey], list):
            pat[clekey] = UnNest(pat[clekey])

    CIB1 = _patent_codes_joined(pat['IPCR1'])
    CIB3 = _patent_codes_joined(pat['IPCR3'])
    CIB4 = _patent_codes_joined(pat['IPCR4'])

    IRAM = ('**** *Label_' + ndb
            + ' *Country_' + pat['pays']
            + ' *CIB3_' + CIB3
            + ' *CIB1_' + CIB1
            + ' *CIB4_' + CIB4
            + ' *Date_' + str(pat['dateDate'].year)
            + ' *Applicant_' + UniClean('-'.join(coupeEnMots(pat['applicant'])))[0:12])
    # Fill empty variables but keep the blank separating them; the former
    # replacement text '_empty' glued the next variable onto the placeholder.
    IRAM = IRAM.replace('_ ', '_empty ') + '\n'

    documents = patentBib[u'ops:world-patent-data'][u'exchange-documents'][u'exchange-document']
    if not isinstance(documents, list):
        documents = [documents]
    TXT = dict()
    for document in documents:
        if 'abstract' in document:
            txtTemp = ExtractAbstract(document['abstract'])
            for cleLang in txtTemp:
                if cleLang in TXT:
                    TXT[cleLang] += txtTemp[cleLang]
                else:
                    TXT[cleLang] = txtTemp[cleLang]
    for lang in TXT.keys():
        EcritContenu(IRAM + TXT[lang], ResultAbstractPath + '//' + lang + '-' + ndb + '.txt')

    if pat['label'] in DejaLa:
        # Already gathered: merge into the existing entry (should be unique).
        tempor = [patent for patent in BiblioPatents if patent['label'] == pat["label"]][0]
        BiblioPatents.remove(tempor)
        tempor = Update(tempor, pat)
        for key in list(tempor.keys()):
            if isinstance(tempor[key], list):
                tempor[key] = UnNest(tempor[key])
        tempor = CleanPatent(tempor)
        BiblioPatents.append(CleanPatent(tempor))
    else:
        for key in list(pat.keys()):
            if isinstance(pat[key], list):
                pat[key] = UnNest(pat[key])
        pat = CleanPatent(pat)
        BiblioPatents.append(CleanPatent(pat))
        DejaLa.append(pat['label'])
    # The label list built here is returned; the previous code returned
    # ``YetGathered``, a name never defined in this function.
    return pat, DejaLa, BiblioPatents
def _family_members_from(response):
    """Return the family members carried by an OPS family *response* as a list."""
    data = response.json()
    members = data[u'ops:world-patent-data'][u'ops:patent-family'][u'ops:family-member']
    # A lone member arrives as a bare dict; wrap it so callers can iterate.
    if isinstance(members, dict):
        members = [members]
    return members


def _member_label(donnee):
    """Return the publication doc-number of a family member, or None if absent."""
    paths = (
        (u'exchange-document', u'bibliographic-data', u'publication-reference',
         u'document-id', 1, u'doc-number', u'$'),
        (u'publication-reference', u'document-id', 1, u'doc-number', u'$'),
    )
    for path in paths:
        node = donnee
        try:
            for step in path:
                node = node[step]
        except (KeyError, IndexError, TypeError):
            continue
        return node
    return None


def _merge_classification_fields(record, fields):
    """Merge every value of *fields* into list-valued entries of *record*.

    Values already present are left alone.  The previous inline code reset the
    accumulated list whenever a value was already known, losing earlier codes.
    """
    for key in fields.keys():
        current = record.get(key)
        if isinstance(current, list):
            merged = current
        elif current is None or current == '':
            merged = []
        else:
            merged = [current]
        value = fields[key]
        for item in (value if isinstance(value, list) else [value]):
            if item not in merged:
                merged.append(item)
        record[key] = merged


def _dash_joined(value):
    """Return *value* as a '-'-separated string ('' when missing)."""
    if value is None:
        return ''
    if isinstance(value, (list, tuple)):
        return '-'.join(value)
    return value


def _citation_count(donnee):
    """Return the number of cited references of a family member (0 if none)."""
    try:
        citations = donnee[u'exchange-document'][u'bibliographic-data'][
            u'references-cited'][u'citation']
        # NOTE(review): a single citation is assumed to be serialised as a
        # dict, like single family members are -- confirm against OPS output.
        if isinstance(citations, dict):
            return 1
        return len(citations)
    except (KeyError, IndexError, TypeError):
        return 0


def GetFamilly(client, brev, rep):
    """Collect one bibliographic record per member of the family of *brev*.

    The family is requested through ``client.family`` using the Epodoc form of
    ``brev['label']`` first, then a Docdb form built from the label and
    ``brev['portee']``.  For every member with a label, the title, parties,
    country, kind, classification fields, date and citation count are
    gathered; abstracts are written with ``EcritContenu`` under
    ``rep + '//FamiliesAbstracts//'``.  Members holding a ``None`` value are
    skipped.

    Args:
        client: object exposing ``family(...)`` whose result has ``.json()``.
        brev: dict with at least ``'label'`` (and ``'portee'`` for the
            Docdb fallback).
        rep: base directory for the abstract files.

    Returns:
        A list of record dicts, each carrying a ``'prior'`` label, or ``None``
        when both family requests fail.
    """
    from OPS2NetUtils2 import ExtractClassificationSimple2, SeparateCountryField, ExtractAbstract, UniClean
    import epo_ops
    import datetime

    lstres = []

    # ``except Exception`` (not a bare except) so Ctrl-C still interrupts.
    try:
        dico = _family_members_from(
            client.family('publication', epo_ops.models.Epodoc(brev['label']), 'biblio'))
    except Exception:
        try:
            dico = _family_members_from(
                client.family('publication',
                              epo_ops.models.Docdb(brev['label'][2:], brev['label'][0:2], brev['portee'])))
        except Exception:
            # Single-argument print keeps the Python 2 output unchanged.
            print("nothing found for  %s" % (brev,))
            print("ignoring")
            return None

    for donnee in dico or []:
        label = _member_label(donnee)
        if label is None:
            print("no label ?")
            continue

        # The Extrait* helpers are handed the member wrapped in this envelope.
        Req = {u'ops:world-patent-data': {'ops:biblio-search': {'ops:search-result': {
            u'exchange-documents': donnee}}}}

        PatentData = dict()
        PatentData[u'label'] = label
        PatentData[u'titre'] = UniClean(ExtraitTitleEn(Req))
        PatentData[u'inventeur'] = UniClean(ExtraitParties(Req, 'inventor', 'epodoc'))
        PatentData[u'applicant'] = UniClean(ExtraitParties(Req, 'applicant', 'epodoc'))
        PatentData[u'pays'] = ExtraitCountry(Req)
        PatentData[u'portee'] = ExtraitKind(Req)

        try:
            classification = ExtraitIPCR2(Req)
        except Exception:
            classification = ''
        # Kept in the record while merging and removed afterwards, as before.
        PatentData[u'classification'] = classification
        if isinstance(classification, list):
            # Iterate a snapshot: merging may touch the 'classification' entry.
            for classif in list(classification):
                _merge_classification_fields(PatentData, ExtractClassificationSimple2(classif))
        elif classification != '':
            _merge_classification_fields(PatentData, ExtractClassificationSimple2(classification))
        del PatentData[u'classification']

        PatentData = SeparateCountryField(PatentData)

        date = ExtractionDate(Req)  # priority claim first date time
        if date is not None and date != "":
            PatentData[u'date'] = date[0:4] + '-' + date[4:6] + '-' + date[6:]
            PatentData[u'dateDate'] = datetime.date(int(date[0:4]), int(date[4:6]), int(date[6:]))
        else:
            # Same zero-padded YYYY-MM-DD layout as the branch above.
            today = datetime.date.today()
            PatentData[u'dateDate'] = today
            PatentData[u'date'] = today.isoformat()

        # Always present now; it used to be left unset in some cases.
        PatentData[u'citations'] = _citation_count(donnee)

        # NOTE(review): both flags are set to 1 on 'YES', to 0 when the lookup
        # fails, and left unset otherwise.  The 'prior' selection below tests
        # key presence, so this is preserved as is -- confirm the intent.
        try:
            if donnee[u'priority-claim'][u'priority-active-indicator']['$'] == u'YES':
                PatentData['priority-active-indicator'] = 1
        except (KeyError, IndexError, TypeError):
            PatentData['priority-active-indicator'] = 0
        try:
            if donnee[u'application-reference'][u'@is-representative'] == u'YES':
                PatentData['representative'] = 1
        except (KeyError, IndexError, TypeError):
            PatentData[u'representative'] = 0

        PatentData[u'family lenght'] = len(dico)

        # The former "un-nesting" statement was a comparison (``==``) with no
        # effect.  It is dropped rather than made an assignment, so list-valued
        # fields keep the shape callers have always received.

        if None in PatentData.values():
            # Missing values: this member is skipped.  The old code "cleaned"
            # the record here and then discarded it.
            continue

        IRAM = ('**** *Label_' + PatentData[u'label']
                + ' *Country_' + PatentData[u'pays']
                + ' *CIB3_' + _dash_joined(PatentData.get(u'IPCR3'))
                + ' *CIB1_' + _dash_joined(PatentData.get(u'IPCR1'))
                + ' *CIB4_' + _dash_joined(PatentData.get(u'IPCR4'))
                + ' *Date_' + str(PatentData[u'dateDate'].year)
                + ' *Applicant_' + '-'.join(coupeEnMots(str(PatentData[u'applicant']))))

        documents = donnee.get(u'exchange-document', [])
        if not isinstance(documents, list):
            documents = [documents]
        TXT = dict()
        for document in documents:
            if 'abstract' in document:
                txtTemp = ExtractAbstract(document['abstract'])
                for cleLang in txtTemp:
                    if cleLang in TXT:
                        TXT[cleLang] += txtTemp[cleLang]
                    else:
                        TXT[cleLang] = txtTemp[cleLang]
        for lang in TXT.keys():
            EcritContenu(IRAM + ' *Contenu_Abstract \n' + TXT[lang],
                         rep + '//FamiliesAbstracts//' + lang + '-' + PatentData['label'] + '.txt')
        lstres.append(PatentData)

    # 'prior' is the label of the earliest-dated member carrying a
    # 'representative' entry, defaulting to the requested patent.
    prior = brev['label']
    datemin = datetime.date(3000, 1, 1)
    for brevet in lstres:
        if 'representative' in brevet and brevet['dateDate'] < datemin:
            datemin = brevet['dateDate']
            prior = brevet['label']
    for brevet in lstres:
        brevet['prior'] = prior
    return lstres