def test_docdb_required():
    """Docdb must reject construction when the number, country, or kind is missing/empty."""
    incomplete_args = [
        ('123', None, None),
        ('123', '345', ''),
        ('', None, None),
    ]
    for number, country, kind in incomplete_args:
        with raises(MissingRequiredValue):
            Docdb(number, country, kind)
def test_docdb_required():
    """Constructing a Docdb with a missing or empty identifier must raise MissingRequiredValue."""
    with raises(MissingRequiredValue):
        Docdb("123", None, None)
    with raises(MissingRequiredValue):
        Docdb("123", "345", "")
    with raises(MissingRequiredValue):
        Docdb("", None, None)
# NOTE(review): truncated fragment of a Python 2 script (`print data` statement);
# left byte-identical. It iterates `lstBrevets`, builds CC+number ids, creates
# per-endpoint output dirs, and fetches claims/description/fulltext via
# `registered_client.published_data(..., input=Docdb(...))`. The final
# subscript chain is cut off mid-expression at the chunk boundary.
os.chdir(ndf.replace('.dump', '')) desc, clm, ft = 0, 0, 0 for brevet in lstBrevets: #tempo =('publication', Docdb(,, )) ndb = brevet[u'document-id'][u'country']['$'] + brevet[u'document-id'][ u'doc-number']['$'] for content in [u'claims', u'description', u'fulltext']: if content not in filter(os.path.isdir, os.listdir(os.getcwd())): os.makedirs(content) # optional, list of constituents try: data = registered_client.published_data( reference_type='publication', input=Docdb(brevet[u'document-id'][u'doc-number']['$'], brevet[u'document-id'][u'country']['$'], brevet[u'document-id'][u'kind']['$']), endpoint=content, constituents=[]) if data.status_code == 403: #making necessary redirections print data if data.ok: patentCont = data.json() #withch language ? #the following could be factorized !!!!!!!! if content == 'description': description = [] description = patentCont[u'ops:world-patent-data'][ u'ftxt:fulltext-documents'][u'ftxt:fulltext-document'][
# NOTE(review): truncated fragment, left byte-identical. Fallback path: tries an
# Epodoc lookup first; if that fails or lacks the wanted content it retries with
# Docdb per publication kind (bare `except` clauses silently set CheckDocDB=True —
# presumably deliberate best-effort, but worth confirming). Ends mid-`else:`.
if ndb+'.txt' not in fichier: #hack here as chinese patents seems not be in claims or description endpoint #, u'fulltext' temp =('publication', Epodoc(pays+ndb[2:])) #, brevet[u'document-id'][u'kind']['$'])) try: data = registered_client.published_data(*temp, endpoint = endP) #registered_client.published_data() if data.ok and content.replace(typeSrc, "").lower() in str(data.json()): CheckDocDB = False else: CheckDocDB = True except: CheckDocDB = True if CheckDocDB: if isinstance(brevet[u'kind'], list): tempoData = [] for cc in brevet[u'kind']: temp =('publication', Docdb(ndb[2:],pays, cc)) # hope all comes from same country try: tempoData.append(registered_client.published_data(*temp, endpoint = endP)) except: data = None pass for dat in tempoData: if dat is not None and dat.ok: #doing the same for all content. This may result in redundancy contenu = content.replace(typeSrc, "").lower() patentCont = dat.json() Langs = MakeIram2(brevet, ndb +'.txt', patentCont, RepDir+ '//'+ typeSrc + contenu+'//', contenu) if endP == 'biblio': for contenu in ['claims', 'description']: Langs = MakeIram2(brevet, ndb +'.txt', patentCont, RepDir+ '//'+ typeSrc + contenu+'//', contenu) else:
def issue_request(client):
    """Issue a published-data call whose reference is crafted to hit the quota/forbidden path."""
    reference = Docdb("Quota", "Forbidden", "exceeded")
    return client.published_data("publication", reference)
# NOTE(review): truncated fragment, left byte-identical. For each patent it
# checks whether claims/description files already exist (counted via the
# filename list with a 3-char prefix stripped), then builds both Epodoc and
# Docdb request tuples and fetches each missing endpoint. Ends right after
# `patentCont=data.json()`, mid-try-block.
desc, clm, ft = 0,0,0 if GatherContent: for brevet in lstBrevet: #tempo =('publication', Docdb(,, )) #if brevet['label'] == 'FR2997041': ndb =brevet[u'label']#[u'document-id'][u'country']['$']+brevet[u'document-id'][u'doc-number']['$']brevet['publication-ref'][u'document-id'][0][u'kind']['$']) #check for already gathered patents lstfic =[] #alreadycollected for content in [u'claims', u'description']: lstfic += os.listdir(ResultPathContent+'//'+content+'//') fichier = [fics[3:] for fics in lstfic] if fichier.count(ndb+'.txt') < 2: #one or both files claim or desc are missing tmp = Epodoc(ndb) tempo2 = ('publication', tmp) tmp = Docdb(ndb[2:], ndb[0:2],brevet['status']) tempo = ('publication', tmp) ndb =brevet[u'label']#[u'document-id'][u'country']['$']+brevet[u'document-id'][u'doc-number']['$']brevet['publication-ref'][u'document-id'][0][u'kind']['$']) if True: #avoid check of chinese patents since they aren't descibed in english for content in [u'claims', u'description']: #, u'fulltext' if content not in os.listdir(RepDir): os.makedirs(RepDir +'//' +content) # optional, list of constituents try : data = registered_client.published_data(*tempo, endpoint = content) #registered_client.published_data() if data.ok: patentCont=data.json()
def test_docdb_as_api_input():
    """A fully-populated Docdb serializes as (country).(number).(kind).(date), URL-escaped."""
    document = Docdb("US08/921,321", "CC", "B2", "20140122")
    assert document.as_api_input() == "(CC).(US08/921%2C321).(B2).(20140122)"
# NOTE(review): truncated fragment (starts mid-call and ends mid-call), left
# byte-identical. Same Docdb-per-kind retry loop as the other chunk, but this
# version uses `except Exception as err:` instead of a bare except and does
# NOT lowercase `contenu` — a behavioral divergence worth reconciling.
*temp, endpoint=endP) #ops_client.published_data() if data.ok and content.replace( typeSrc, "").lower() in str(data.json()): CheckDocDB = False else: CheckDocDB = True except Exception as err: CheckDocDB = True if CheckDocDB: if isinstance(brevet[u'kind'], list): tempoData = [] for cc in brevet[u'kind']: temp = ( 'publication', Docdb(ndb[2:], pays, cc) ) # hope all comes from same country try: tempoData.append( ops_client.published_data( *temp, endpoint=endP)) except: data = None pass for dat in tempoData: if dat is not None and dat.ok: #doing the same for all content. This may result in redundancy contenu = content.replace(typeSrc, "") patentCont = dat.json() Langs = MakeIram2( brevet, ndb + '.txt', patentCont,
# NOTE(review): entire function left byte-identical — the line-collapse makes
# it ambiguous which tokens were commented out (e.g. the `# temp = (...)`
# Epodoc assignment that `published_data(*temp, ...)` later depends on, and
# the `data` referenced in the final `else:` branch may be unbound). Purpose:
# read OPS credentials from ../cles-epo.txt, build a JSON client, and fetch
# the abstract (biblio endpoint) for one patent, retrying per publication
# kind with Docdb; results are folded into `Abstracts` via MakeIram4.
def OPSChercheAbstractBrevet(pat, DirStockage): import epo_ops from epo_ops.models import Docdb from epo_ops.models import Epodoc fic = open('../cles-epo.txt', 'r') key, secret = fic.read().split(',') key, secret = key.strip(), secret.strip() fic.close() ops_client = epo_ops.Client(key, secret) ops_client.accept_type = 'application/json' ndb = pat[ 'label'] #[u'document-id'][u'country']['$']+brevet[u'document-id'][u'doc-number']['$']brevet['publication-ref'][u'document-id'][0][u'kind']['$']) Abstracts = dict() if isinstance(ndb, list): ndb = ndb[0] #print("Retrieving ", ndb) pays = pat['country'] for key in ['label', 'country', 'kind']: if isinstance(pat[key], list): pat[key] = list( set(pat[key]) ) # hum some problem (again) in cleaning data within the family gatherer... 22/12/15 if isinstance(pays, list): pays = pays[0] content = 'Abstract' endP = 'biblio' # temp = ('publication', Epodoc(pays + ndb[2:]) ) #, brevet[u'document-id'][u'kind']['$'])) try: data = ops_client.published_data( *temp, endpoint=endP) #ops_client.published_data() if data.ok and 'abstract' in str(data.json()): CheckDocDB = False else: CheckDocDB = True except Exception as err: CheckDocDB = True if CheckDocDB: if isinstance(pat['kind'], list): tempoData = [] for cc in pat['kind']: temp = ('publication', Docdb(ndb[2:], pays, cc) ) # hope all comes from same country try: tempoData.append( ops_client.published_data(*temp, endpoint=endP)) except: data = None pass for dat in tempoData: if dat is not None and dat.ok: contenu = content patentCont = dat.json() Abstracts = MakeIram4(pat, patentCont, contenu) # Make2Iram2 devrait formater le brevet dans un fichier txt au format Iramuteq dans le bon repertoire # Lang est un truc :-) (je crois que cela renvoit la langue de l'abstract récupéré)) else: temp = ('publication', Docdb(pat['label'][2:], pat['country'], pat['kind'])) if data is not None and data.ok: contenu = content patentCont = data.json() Abstracts = MakeIram4(pat, patentCont, contenu) # if 
# NOTE(review): the next line looks like a mangled comment (the JSON path
# ops:world-patent-data > exchange-documents > ... > abstract) fused with the
# function's `return Abstracts`; left untouched pending the original layout.
ops:world-patent-data exchange-documents exchange-documents abstract return Abstracts
import re

import requests

from epo_ops.models import Docdb
from epo_ops.models import Epodoc
from epo_ops.models import Original

# Canonical publication references shared by the helper functions below.
data = ('publication', Docdb('1000000', 'EP', 'A1'))
rdata = ('publication', Epodoc('EP1000000'))


def find_range(document, pattern):
    """Search `document` for 'range' followed by `pattern`; return the match or None."""
    return re.search("range.*{0}".format(pattern), document)


def assert_request_success(response):
    """The response must be HTTP 200 and carry the OPS v3.1 API header."""
    assert response.status_code == requests.codes.ok
    assert response.headers['X-API'] == 'ops-v3.1'


def assert_family_success(client):
    """Fetch the patent family for the canonical Docdb reference and verify it."""
    response = client.family(*data)
    assert_request_success(response)
    assert 'patent-family' in response.text
    return response


def issue_published_data_request(client):
    """Issue a published-data request for the canonical Docdb reference."""
    return client.published_data(*data)
# NOTE(review): truncated fragment (starts with a mid-function `return`), left
# byte-identical. Builds a RegisteredClient, skips patents already in
# BiblioPatents, then requests 'biblio' via both Epodoc and Docdb and keeps
# whichever JSON payload is longer. Ends mid-try at the chunk boundary.
return None, DejaLa, BiblioPatents if GatherBibli and GatherBiblio: registered_client = epo_ops.RegisteredClient(key, secret) # data = registered_client.family('publication', , 'biblio') registered_client.accept_type = 'application/json' for brevet in lstBrevets: YetGathered = [u['label'] for u in BiblioPatents] # may be current patent has already be gathered in a previous attempt # should add a condition here to check in os.listdir() tempo =('publication', Docdb(brevet[u'document-id'][u'doc-number']['$'],brevet[u'document-id'][u'country']['$'], brevet[u'document-id'][u'kind']['$'])) tempo2 =('publication', Epodoc(brevet[u'document-id'][u'country']['$']+brevet[u'document-id'][u'doc-number']['$']))#, brevet[u'document-id'][u'kind']['$'])) ndb =brevet[u'document-id'][u'country']['$']+brevet[u'document-id'][u'doc-number']['$'] #nameOfPatent if ndb not in YetGathered: try: #trying Epodoc first, unused due to response format (multi document instead of one only) data = registered_client.published_data(*tempo2, endpoint = 'biblio') patentBib = data.json() data2 = registered_client.published_data(*tempo, endpoint = 'biblio') if data.ok and data2.ok: patentBibtemp = data.json() patentBibtemp2= data2.json() if len(str(patentBibtemp)) > len(str(patentBibtemp2)): patentBib = patentBibtemp else: patentBib = patentBibtemp2
def issue_request(client):
    """Fire a published-data request built to provoke the quota-exceeded response."""
    docdb = Docdb('Quota', 'Forbidden', 'exceeded')
    return client.published_data('publication', docdb)
def test_docdb_as_api_input():
    """as_api_input formats (country).(number).(kind).(date) with the comma percent-encoded."""
    docdb = Docdb('US08/921,321', 'CC', 'B2', '20140122')
    assert docdb.as_api_input() == '(CC).(US08/921%2C321).(B2).(20140122)'
# NOTE(review): truncated fragment, left byte-identical. Note `CleanPatent` is
# called twice in a row (presumably redundant — confirm it is idempotent).
# Flow: try Epodoc for the abstract, fall back to Docdb when 'abstract' is
# absent or the call raises; a further Epodoc fallback using the second
# document-id begins at the cut-off. Bare `except` clauses hide real errors.
brevet = CleanPatent(brevet) brevet = CleanPatent(brevet) ndb =brevet[u'label']#[u'document-id'][u'country']['$']+brevet[u'document-id'][u'doc-number']['$']brevet['publication-ref'][u'document-id'][0][u'kind']['$']) #check for already gathered patents lstfic = os.listdir(ResultPathContent+'//Abstracts/') fichier = [fics[3:] for fics in lstfic] if ndb+'.txt' not in fichier: for content in [u'abstract']:#claims', u'description']: #, u'fulltext' try : tmp = Epodoc(ndb) tempo = ('publication', tmp) data = registered_client.published_data(*tempo, endpoint = content) #registered_client.published_data() if 'abstract' not in str(data.json()): tmp = Docdb(ndb[2:], ndb[0:2],brevet['status']) tempo = ('publication', tmp) data = registered_client.published_data(*tempo, endpoint = content) #brevet['publication-ref'][u'document-id'][0][u'doc-number']['$'],#brevet[u'document-id'][u'doc-number']['$'], #brevet['publication-ref'][u'document-id'][0][u'country']['$'],#brevet[u'document-id'][u'country']['$'], #brevet['publication-ref'][u'document-id'][0][u'kind']['$']), endpoint = content, constituents = []) except: try: tmp = Docdb(ndb[2:], ndb[0:2],brevet['status']) tempo = ('publication', tmp) data = registered_client.published_data(*tempo, endpoint = content) #registered_client.published_data() except:#from there totally fun... may be we do not get there... try: # print 'yes we get' tmp = Epodoc(brevet['publication-ref'][u'document-id'][1][u'doc-number']['$'])
def assert_bulk_service_retrival_success(client):
    """A bulk published-data lookup mixing Docdb and Epodoc references must return HTTP 200."""
    references = [Docdb("1000000", "EP", "A1"), Epodoc("US2018265402")]
    response = client.published_data("publication", input=references)
    assert response.status_code == requests.codes.ok
# NOTE(review): left byte-identical — the final def (assert_family_biblio_success)
# may continue past this chunk, so a reformat could drop its tail. This is the
# OPS v3.2 sibling of the v3.1 helper module: shared Docdb/Epodoc references,
# an image-path tuple, a range-header matcher, and family-request assertions.
import re import requests from epo_ops.models import Docdb, Epodoc, Original data = ("publication", Docdb("1000000", "EP", "A1")) rdata = ("publication", Epodoc("EP1000000")) idata = ("published-data/images/EP/1000000/A1/fullimage", 1) # idata path is the result @path from images published-data json request def find_range(document, pattern): return re.search("range.*{0}".format(pattern), document) def assert_request_success(response): assert response.status_code == requests.codes.ok assert response.headers["X-API"] == "ops-v3.2" def assert_family_success(client): response = client.family(*data) assert_request_success(response) assert "patent-family" in response.text return response def assert_family_biblio_success(client): response = client.family(*data, constituents=["biblio"]) assert_request_success(response)