ResultPathImages = configFile.ResultPathImages
P2NFamilly = configFile.GatherFamilly

if IsEnableScript:
    ops_client = epo_ops.Client(key, secret)
    ops_client.accept_type = 'application/json'

    prefixes = [""]
    if P2NFamilly:
        prefixes.append("Families")

    for prefix in prefixes:
        ndf = prefix + configFile.ndf
        try:
            biblio_file = LoadBiblioFile(ResultBiblioPath, ndf)
        except IOError as ex:
            print('WARNING: Could not load information for "{}". Not found / error: {}'.format(ndf, ex))
            continue  # biblio_file is undefined here; skip this prefix

        patents = biblio_file['brevets']
        metadata = {}
        Num = len(patents)
        cpt = 0
        for patent in patents:
            cpt += 1
            # only 90% of the progress bar is used here; the last 10% is
            # reserved for the image-fusion step
            AnnonceProgres(Appli='p2n_image', valMax=100, valActu=cpt * 90 / Num)
            patent_label = get_patent_label(patent)
            pathes = []
            path_json = '{}//{}.json'.format(ResultPathImages, patent_label)
            path_image = '{}//{}-{}.tiff'.format(ResultPathImages, patent_label, '{}')
            print("Processing patent {}".format(patent_label))
Inventeurs = set()
Applicants = set()
AnnonceLog(Appli='p2n_network', texte='Net processing is starting')

if configFile.GatherFamilly:
    PU = [ndf, 'Families' + ndf]
else:
    PU = [ndf]

for fic in PU:
    print("\n> Hi! This is the Net processor, used on:", fic)
    if 'Description' + fic in os.listdir(ResultBiblioPath):
        dico = LoadBiblioFile(ResultBiblioPath, fic)
    else:  # Retrocompatibility
        print("please use Comptatibilizer")
        sys.exit()

    LstBrevet = dico['brevets']
    for bre in LstBrevet:
        if isinstance(bre['label'], list):
            # if len(bre['label']) > 1:
            if len(bre['label']) != len(set(bre['label'])):
                AnnonceLog(
                    Appli='p2n_network',
                    texte='Good, two labels for the same patent; fixing to the first one '
                          + str(bre["label"]))
                # print("two labels for the same patent; fixing to the first one", bre["label"])
        return dico
    else:
        return dico


if GatherFamilly:
    print("\n> Hi! This is the family gatherer. Processing", ndf)
    try:
        fic = open(ResultPath + '//' + ndf, 'rb')
        print("loading data file", ndf, "from", ResultPath, "directory.")
        # NEW 12/12/15: the new gatherer appends data to the pickle file
        # in order to consume less memory
        if 'Description' + ndf in os.listdir(ResultPath):
            data = LoadBiblioFile(ResultPath, ndf)
        else:  # Retrocompatibility :-)
            print("gather your data again")
            sys.exit()
        # collections.Mapping was removed in Python 3.10; use collections.abc
        if isinstance(data, collections.abc.Mapping):
            ListeBrevet = data['brevets']
            print("Found", len(ListeBrevet), "patents gathered.")
        else:
            print('data corrupted. Do something (destroying the data directory is a nice idea)')
            sys.exit()
        print(len(ListeBrevet), "patents loaded from file.")
        print("Augmenting list with families.")
        ficOk = True
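# The "NEW 12/12/15" notes throughout these scripts refer to the gatherer
# appending one pickled object at a time to the data file, so the whole corpus
# never sits in memory at once. Below is a minimal sketch of a reader for that
# layout, assuming LoadBiblioFile follows the same read-until-EOF pattern as
# the commented-out legacy loaders elsewhere in this code base (hypothetical
# helper, not the project's actual implementation):
import pickle

def load_appended_pickles(path):
    """Yield every object appended to `path` by successive pickle.dump() calls."""
    with open(path, 'rb') as fic:
        while True:
            try:
                yield pickle.load(fic)  # one object per dump()
            except EOFError:
                break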
ResultBiblioPath = configFile.ResultBiblioPath
temporPath = configFile.temporPath
ResultGephiPath = configFile.ResultGephiPath
ResultPathContent = configFile.ResultContentsPath
ResultAbstractPath = configFile.ResultAbstractPath
Auteur = configFile.ResultPath + '//AcadCorpora'
RepDir = configFile.ResultPath + '//AcadCorpora'
project = RepDir

# NEW 12/12/15: the new gatherer appends data to the pickle file
# in order to consume less memory
if 'Description' + ndf in os.listdir(BiblioPath):
    print("loading patent biblio data with", " and ".join(NeededInfo), "fields.")
    DataBrevet = LoadBiblioFile(BiblioPath, ndf)
    print("Hi, this is the AcadStats processor. Bibliographic data of", ndf,
          "patent universe found.")
else:
    print("re-run P2N to gather the patent data")
    sys.exit()

print("Nice,", len(DataBrevet["brevets"]), "patents found. Computing the identified authors...")

# def Nettoie(Liste):
#     indesirables = ['', u'', None, False, [], ' ', "?", "Empty", "empty"]
#     Liste = [' '.join([truc.lower().title() for truc in nom.split(' ')]) for nom in Liste]
#     return list(filter(lambda x: x not in indesirables, Liste))

# Statistical analysis of the results
requete = configFile.requete
ndf = configFile.ndf
# should set a working dir one of these days... done: it is temporPath
ResultPath = configFile.ResultBiblioPath
temporPath = configFile.temporPath
ResultContentsPath = configFile.ResultContentsPath
ResultBiblioPath = configFile.ResultBiblioPath
ResultPathContent = '..//DATA//' + ndf + '//PatentContents'

# Set whether or not we use only the primary classification
Primar = True
# Cache for performance purposes
CIB = dict()

# NEW 12/12/15: the new gatherer appends data to the pickle file
# in order to consume less memory.
# Both membership tests must be spelled out: `'Description'+ndf or ... in ...`
# is always true, because a non-empty string is truthy.
if ('Description' + ndf in os.listdir(ResultBiblioPath)
        or 'Description' + ndf.lower() in os.listdir(ResultBiblioPath)):
    ficBrevet = LoadBiblioFile(ResultBiblioPath, ndf)
else:  # Retrocompatibility
    print('gather your data again. sorry')
    sys.exit()

if 'brevets' in ficBrevet:
    lstBrevet = ficBrevet['brevets']
    # if data.has_key('requete'):
    #     DataBrevet['requete'] = data["requete"]
    print("Found datafile with", len(lstBrevet), "patents!")
else:
    print('gather your data again')
    sys.exit()

cles = ['IPCR11', 'CitO', 'dateDate', 'inventor-nice', 'equivalents', 'CitedBy',
        'representative', 'Inventor-Country', 'date', 'inventor', 'kind',
        'priority-active-indicator', 'applicant-nice', 'IPCR1', 'country', 'IPCR3',
        'applicant', 'IPCR4', 'IPCR7', 'title', 'application-ref']
BiblioRes["brevets"] = [] BiblioRes["number"] = 0 BiblioRes["requete"] = '' try: os.makedirs(ResultFolder + '//PatentBiblios') except: if res.title() in lstReq: lstReq[0].remove(res.title()) pass #biblioFiles for ndf in lstReq: lstBrevets2, nbTrouves = [], 0 if ndf in os.listdir('..//DATA//') and ndf in os.listdir( '..//DATA//' + ndf + '//PatentBiblios//'): Brevet1 = LoadBiblioFile('..//DATA//' + ndf + '//PatentBiblios//', ndf) print("Doing ", ndf, "Found ", len(Brevet1["brevets"]), "patents in list") BiblioRes["brevets"] = BrevetFusion(Brevet1["brevets"], BiblioRes["brevets"]) BiblioRes["number"] = len(BiblioRes["brevets"]) if len(BiblioRes["requete"]) > 0: BiblioRes[ "requete"] = Brevet1["requete"] + " UNION " + BiblioRes["requete"] else: BiblioRes["requete"] = Brevet1["requete"] for brevet in BiblioRes["brevets"]: with open(ResultFolder + '//PatentBiblios//' + res, 'ab') as ficRes: pickle.dump(brevet, ficRes)
Rep = configFile.ResultContentsPath
Bib = configFile.ResultBiblioPath

try:
    if os.getenv('DEBUG'):
        # local instance, only reachable in debug mode
        es = Elasticsearch(hosts=[{'host': "127.0.0.1", 'port': 9200}])
    else:
        # inside the image, Elasticsearch is reached through the
        # Docker-internal DNS name
        es = Elasticsearch(hosts=[{'host': "elasticsearch", 'port': 9200}])
except Exception:
    es = Elasticsearch(hosts=[{'host': "elasticsearch", 'port': 9200}])

# NEW 12/12/15: the new gatherer appends data to the pickle file
# in order to consume less memory
if 'Description' + ndf in os.listdir(Bib):
    DataBrevet = LoadBiblioFile(Bib, ndf)
    LstBrevet = DataBrevet['brevets']
elif 'Description' + ndf.title() in os.listdir(Bib):
    DataBrevet = LoadBiblioFile(Bib, ndf.title())
    LstBrevet = DataBrevet['brevets']
else:  # Retrocompatibility
    print("please use Comptatibilizer")


def GenereListeFichiers(rep):
    """Take a directory (absolute path) and build the complete list
    of TXT files in its tree."""
    listeFicFR = []
    listeFicEN = []
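# The fallback above silently swaps in the Docker-internal hostname, so a dead
# node is only discovered at the first indexing call. Below is a small
# connectivity check using the standard elasticsearch-py ping() call; the retry
# counts are illustrative, not taken from this project:
import time

def wait_for_elasticsearch(es, retries=5, delay=2):
    """Return True once the cluster answers ping(), or False after `retries` tries."""
    for _ in range(retries):
        if es.ping():  # ping() returns False instead of raising on failure
            return True
        time.sleep(delay)
    return False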
    #u'CitedBy',  # the list of docs (patents) citing this patent
    #'CitP',      # the patents cited by this patent
    #'CitO'       # the other docs cited by this patent
]  # "citations"

# filterFile = [fi for fi in os.listdir(ListBiblioPath) if fi.count('Expanded')]
srcFile = [
    fi for fi in os.listdir(ListBiblioPath)
    if '.pkl' not in fi and 'tempoInconnus' not in fi and "Description" not in fi
]
for ndf in srcFile:
    # NEW 12/12/15: the new gatherer appends data to the pickle file
    # in order to consume less memory
    if 'Description' + ndf in os.listdir(ListBiblioPath):
        DataBrevet = LoadBiblioFile(ListBiblioPath, ndf)
        print("\n> Hi! This is FormateExportPivotTable")
    else:
        # Retrocompatibility... the previous test is ugly: there is an issue
        # with lowercase filenames (sometimes)
        print("please use Comptatibilizer")
        DataBrevet = LoadBiblioFile(ListBiblioPath, ndf)  # so we try to load it anyway...
    # collections.Mapping was removed in Python 3.10; use collections.abc
    if isinstance(DataBrevet, collections.abc.Mapping):
        # data = DataBrevet
        LstBrevet = DataBrevet['brevets']
        if 'number' in DataBrevet:
            print("Found", DataBrevet["number"], "patents! Formatting into HTML pivot tables")
        else:
            print("Found", len(DataBrevet["brevets"]), "patents! Trying to format into HTML pivot tables")
def read(path, slot):
    filename = 'Description' + slot
    if filename in os.listdir(path):
        data = LoadBiblioFile(path, slot)
        return data
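# Example use of read(), mirroring how the other scripts pair the storage
# directory with the data-directory name (the "Lentille" paths are
# illustrative):
#
#   biblio = read('../DATA/Lentille/PatentBiblios', 'Lentille')
#   if biblio is not None:
#       print(len(biblio['brevets']), "patents loaded")
#
# Note that read() returns None implicitly when no 'Description<slot>' marker
# file exists, so callers should test the result before indexing into it.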
    else:
        pass

AnnonceLog(Appli='p2n_gather_biblio',
           texte="Found almost " + str(len(lstBrevets)) + " patents. Saving list")
AnnonceLog(Appli='p2n_gather_biblio',
           texte="Within " + str(len(set(listeLabel))) + " unique patents")
print("Found almost", len(lstBrevets), "patents. Saving list")
print("Within", len(set(listeLabel)), "unique patents")

BibliDataBrevets = dict()
BibliDataBrevets['brevets'] = []
# loading already gathered bibliographic data
if ndf in os.listdir(ResultBiblioPath):
    BibliDataBrevets = LoadBiblioFile(ResultBiblioPath, ndf)
    # with codecs.open(ResultBiblioPath + '//' + ndf, 'rb', "utf-8") as fic:
    #     while 1:
    #         try:
    #             DataBrevets['brevets'].append(byteify(pickle.load(fic)))
    #         except EOFError:
    #             break
    if len(BibliDataBrevets['brevets']) == len(listeLabel):
        print(len(BibliDataBrevets['brevets']),
              "bibliographic patent records already gathered")
        GatherBibli = False
        AnnonceProgres(Appli='p2n_gather_biblio', valMax=100, valActu=100)
        sys.exit('Nothing else to do :-). Good bye')
    else:
def run():
    # Bootstrap logging
    boot_logging()

    # Load configuration
    config = LoadConfig()

    # Run this only if enabled
    if not config.GatherImages:
        return

    # Get some information from configuration
    expression = config.requete
    storage_basedir = config.ResultBiblioPath
    storage_dirname = config.ndf
    output_path = config.ResultPathImages

    # Compute prefixes
    prefixes = [""]
    if config.GatherFamilly:
        prefixes.append("Families")

    # Build maps for all prefixes
    for prefix in prefixes:

        # Status message
        label = label_from_prefix(prefix)
        logger.info("Generating gallery of drawings for {}.".format(label))

        # Compute storage slot using prefix and DataDirectory
        # e.g. "Lentille" vs. "FamiliesLentille"
        storage_name = prefix + storage_dirname

        # Load bibliographic data
        biblio_file = LoadBiblioFile(storage_basedir, storage_name)

        # Generate thumbnails
        gallery = []
        patents = biblio_file['brevets']
        cpt = 0
        for patent in patents:
            cpt += 1  # was "cpt + 1", which never advanced the counter
            AnnonceProgres(Appli='p2n_image', valMax=100,
                           valActu=90 + cpt * 10 / len(patents))
            patent_label = get_patent_label(patent)
            i = 1
            logger.info('Processing patent {}'.format(patent_label))
            path_img_base = '{}//{}-{}.tiff'.format(output_path, patent_label, '{}')
            path = path_img_base.format(i)
            while os.path.exists(path):
                thumb, orig, tiff = generate_thumbnails(path)
                gallery.append({
                    "_id": '{}-{}'.format(patent_label, i),
                    'thumb': thumb,
                    'orig': orig,
                    'label': patent['title'],
                    'ipcr7': patent['IPCR7'],
                    'code': patent_label,
                    'tiff': tiff,
                })
                i += 1
                path = path_img_base.format(i)

        # Render gallery
        AnnonceProgres(Appli='p2n_image', valMax=100, valActu=100)
        RenderTemplate(
            'ModeleImages.html',
            output_path + '/index' + prefix + '.html',
            request=expression.replace('"', ''),
            gallery=gallery,
            json=json.dumps(gallery),
        )
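# generate_thumbnails() is imported from elsewhere in the project. For
# reference, a plausible minimal version with Pillow is sketched below; the
# output naming and thumbnail size are assumptions -- only the
# (thumb, orig, tiff) return shape is taken from the call site above.
import os
from PIL import Image

def generate_thumbnails_sketch(tiff_path, size=(320, 320)):
    """Convert one TIFF drawing into a full-size PNG plus a bounded thumbnail."""
    base, _ = os.path.splitext(tiff_path)
    orig_path = base + '.png'
    thumb_path = base + '-thumb.png'
    with Image.open(tiff_path) as img:
        img.save(orig_path)    # full-size PNG, linked from the gallery
        img.thumbnail(size)    # in-place resize, aspect ratio preserved
        img.save(thumb_path)
    return thumb_path, orig_path, tiff_path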
if IsEnableScript:
    Rep = configFile.ResultContentsPath
    Bib = configFile.ResultBiblioPath
    prefixes = [""]
    if GatherFamilly:
        prefixes.append("Families")
    for prefix in prefixes:
        ndf = prefix + configFile.ndf
        # NEW 12/12/15: the new gatherer appends data to the pickle file
        # in order to consume less memory
        if 'Description' + ndf in os.listdir(Bib):
            DataBrevet = LoadBiblioFile(Bib, ndf)
            LstBrevet = DataBrevet['brevets']
        else:  # Retrocompatibility
            print("please use Comptatibilizer")
        try:
            os.makedirs(Rep + "//Carrot2")
        except OSError:  # directory exists
            pass
        temporar = GenereListeFichiers(Rep)
        cpt = 0
        for det in ['Abstract', 'Claims', 'Description']:
            ind = 0
            cpt += 1
            for lang in ['FR', 'EN', 'UNK']:
    'kind', 'applicant', 'country', 'inventor', 'representative', 'IPCR4', 'IPCR7',
    "Inventor-Country", "Applicant-Country", "equivalents", "CPC",
    'references', 'Citations', 'CitedBy'
]
prefixes = [""]
if GatherFamilly:
    prefixes.append("Families")
for prefix in prefixes:
    ndf = prefix + configFile.ndf
    # NEW 12/12/15: the new gatherer appends data to the pickle file
    # in order to consume less memory
    if 'Description' + ndf in os.listdir(ListBiblioPath):
        LstBrevet = LoadBiblioFile(ListBiblioPath, ndf)
        with open(ListBiblioPath + '//Description' + ndf, 'rb') as ficRes:
            DataBrevet = pickle.load(ficRes)
    else:  # Retrocompatibility
        with open(ListBiblioPath + '//' + ndf, 'rb') as data:
            LstBrevet = pickle.load(data)
    # the next lines may need a clarifying update
    data = LstBrevet
    LstBrevet = data['brevets']
    if 'requete' in data:
        requete = data["requete"]
    if 'number' in data:
        print("Found", data["number"], "patents! Formatting to HTML tables")
NeededInfo.extend(mixNet)  # list of fields needed to build the net
# maybe we should use:
# from collections import OrderedDict
# class OrderedNodeGraph(nx.Graph):
#     node_dict_factory = OrderedDict
# G = OrderedNodeGraph()
G1 = nx.MultiDiGraph()  # multi-edge directed network for Gephi
attr_dict = dict()  # attributes for the net
# flat net for gexf.js; maybe the previous graph could be used instead of this one...

# NEW 12/12/15: the new gatherer appends data to the pickle file
# in order to consume less memory
if 'Description' + ndf in os.listdir(BiblioPath):
    print(network, ": loading data with", " and ".join(mixNet), "fields.")
    DataBrevet = LoadBiblioFile(BiblioPath, ndf)
    print("Hi, this is the Pre-Network processor. Bibliographic data of", ndf,
          "patent universe found.")
else:  # Retrocompatibility
    print("please use Comptatibilizer")
    sys.exit()  # DataBrevet is undefined past this point

print("Nice,", len(DataBrevet["brevets"]), "patents found. Pre-formatting",
      sys.argv[1], "net.")
for brev in DataBrevet["brevets"]:
    # tempo = pickle.load(fic)
    # we only memorize the needed info
    pat = OrderedDict()
    if "date" not in list(brev.keys()):
        brev['date'] = '1-1-1'
    if isinstance(brev['label'], list):
        brev['label'] = brev['label'][0]
    for key in NeededInfo:
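# For reference, this is the kind of edge the MultiDiGraph set up above ends up
# holding before being written out for Gephi -- a minimal sketch using standard
# networkx calls; the applicant -> inventor wiring is illustrative, not this
# script's actual field mapping:
def add_patent_edge_sketch(graph, applicant, inventor, patent_label):
    """Add one directed applicant -> inventor edge carrying the patent label."""
    graph.add_node(applicant, category='applicant')
    graph.add_node(inventor, category='inventor')
    graph.add_edge(applicant, inventor, label=patent_label)

# Usage: add_patent_edge_sketch(G1, 'ACME SA', 'DOE JOHN', 'EP1234567'),
# then nx.write_gexf(G1, 'network.gexf'), a format Gephi reads directly.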
# listeLabel = []
# Entering PatentBiblio feeding
print("Checking and/or gathering bibliographic data")
if GatherBibli and GatherBiblio:
    for brevet in lstBrevets:
        # name of the patent for file-system saves (abstract, claims...)
        ndb = brevet['document-id']['country']['$'] + brevet['document-id']['doc-number']['$']
        listeLabel.append(ndb)
    print("Found almost", len(lstBrevets), "patents. Saving list")
    print("Within", len(set(listeLabel)), "unique patents")
    DataBrevets = dict()
    DataBrevets['brevets'] = []
    if ndf in os.listdir(ResultBiblioPath):
        DataBrevets = LoadBiblioFile(ResultBiblioPath, ndf)
        # with codecs.open(ResultBiblioPath + '//' + ndf, 'rb', "utf-8") as fic:
        #     while 1:
        #         try:
        #             DataBrevets['brevets'].append(byteify(pickle.load(fic)))
        #         except EOFError:
        #             break
        if len(DataBrevets['brevets']) == len(listeLabel):
            print(len(DataBrevets['brevets']),
                  "bibliographic patent records already gathered")
            GatherBibli = False
            sys.exit('Nothing else to do :-). Good bye')
        else:
            print(len(listeLabel) - len(DataBrevets['brevets']),
nbAppliAvant = dict()
nbInvAvant = dict()
# processing the files + families
if GatherFamilly:
    PU = [ndf, 'Families' + ndf]
else:
    PU = [ndf]
for fic in PU:
    cptInv, cptAppl = 0, 0
    print("\n> Hi! This is the Pre Process for normalizing applicant names, used on:", fic)
    if 'Description' + fic in os.listdir(ListBiblioPath):
        dico = LoadBiblioFile(ListBiblioPath, fic)
    else:  # Retrocompatibility
        print("please use Comptatibilizer")
        sys.exit()
    LstBrevet = dico['brevets']
    print("Good,", len(LstBrevet), "patents found, filtered for equivalent unicity")
    # patent filtering process
    Filtres = []
    dejaVus = []
    for bre in LstBrevet:
        if bre['label'] not in dejaVus:
            dejaVus.append(bre['label'])
            Filtres.append(bre)
            if isinstance(bre['equivalents'], list):
                for eq in bre['equivalents']:
temporPath = configFile.temporPath
ResultPathContent = configFile.ResultContentsPath
ResultAbstractPath = configFile.ResultAbstractPath
ListBiblioPath = configFile.ResultBiblioPath
# special path used with AcadPubMed.py
Auteur = configFile.ResultPath + '//AcadCorpora'
RepDir = configFile.ResultPath + '//AcadCorpora'
project = RepDir
if 'AcadCorpora' not in os.listdir(configFile.ResultPath):
    print("re-run the gathering script (AcadPubMed.py 29/06/2019)")
    sys.exit()

# NEW 12/12/15: the new gatherer appends data to the pickle file
# in order to consume less memory
if 'Description' + ndf in os.listdir(BiblioPath):
    print("loading patent biblio data with all fields.")
    DataBrevet = LoadBiblioFile(BiblioPath, ndf)
    print("Hi, this is the AcadStats AcadCorpora splitter processor. Bibliographic data of",
          ndf, "patent universe found.")
else:
    print("re-run P2N to gather the patent data")
    sys.exit()
print("Nice,", len(DataBrevet["brevets"]),
      "patents found. Splitting according to the data in EntitésPubliquesNorm.xlsx")

# consistency test
# with open(Auteur + '//DejaTraites.csv', 'r') as fic:
#     DejaVus = fic.readlines()
"Inventor-Country", "Applicant-Country", "equivalents", "CPC", 'prior-Date', #'prior-dateDate', # dates of priority claims 'references', # the number of refences into the document len(CitP) + len(CitO) 'Citations', # the number of citations granted by the document 'CitedBy', # the list of docs (patents) cititng this patent 'CitP', # the patents cited by this patent 'CitO' # the other docs cited by this patent ] #"citations" if 'Description' + ndf in os.listdir( ListBiblioPath ): # NEW 12/12/15 new gatherer append data to pickle file in order to consume less memory dico = LoadBiblioFile(ListBiblioPath, ndf) else: #Retrocompatibility with open(ListBiblioPath + '//' + ndf, 'r') as data: dico = pickle.load(data) LstBrevet = dico['brevets'] if 'requete' in dico: requete = dico["requete"] if 'number' in dico: print("Found ", dico["number"], " patents! Formating to HMTL tables") else: print("Found ", len(LstBrevet), " patents! Formating to HMTL tables") LstExp = [] LstExp2 = [] #just for testing las fnction in gathered should deseapear soon
ResultListPath = configFile.ResultListPath
ResultBiblioPath = configFile.ResultBiblioPath
if IsEnableScript:
    LoadDescs()
    prefixes = [""]
    if P2NFamilly:
        prefixes.append("Families")
    for prefix in prefixes:
        ndf = prefix + configFile.ndf
        try:
            DataBrevets1 = LoadBiblioFile(ResultBiblioPath, ndf)
            BrevetsTotal = str(len(DataBrevets1['brevets']))
        except Exception:
            print("Error: there is no data to generate the FreePlane file")
        # End of patent-file loading
        # ### ugly code to patch the classification extraction inconsistency
        for bre in DataBrevets1['brevets']:
            if isinstance(bre['classification'], list):
                if '' in bre['classification']:
                    bre['classification'].remove('')
                bre['IPCR11'] = bre['classification']
                lstIPC = [ipc[0] for ipc in bre['classification']]
                for ipc in lstIPC:
def run():
    aujourd = datetime.date.today()
    configFile = LoadConfig()
    requete = configFile.requete
    ndf = configFile.ndf
    Gather = configFile.GatherContent
    GatherBiblio = configFile.GatherBiblio
    GatherPatent = configFile.GatherPatent
    GatherFamilly = configFile.GatherFamilly
    IsEnableScript = configFile.FormateExportDataTable
    # should set a working dir one of these days... done: it is temporPath
    ListBiblioPath = configFile.ResultBiblioPath
    temporPath = configFile.temporPath
    ResultPathContent = configFile.ResultPath
    ResultListPath = configFile.ResultListPath
    ResultBiblioPath = configFile.ResultBiblioPath

    # Read the reference file
    lstApplic = []
    Inventeurs = []
    Applicants = []
    nbAppliAvant = dict()
    nbInvAvant = dict()

    # processing the files + families
    if GatherFamilly:
        PU = [ndf, 'Families' + ndf]
    else:
        PU = [ndf]

    for fic in PU:
        if 'Old' + fic not in os.listdir(ResultBiblioPath):
            cptInv, cptAppl = 0, 0
            print("\n> Hi! This is the Pre Process for filtering equivalent patents"
                  " from the dataset gathered by P2N-OPSGather, used on:", fic)
            if 'Description' + fic in os.listdir(ListBiblioPath):
                dico = LoadBiblioFile(ListBiblioPath, fic)
            else:  # Retrocompatibility
                print("please use Comptatibilizer")
                sys.exit()
            LstBrevet = dico['brevets']

            # patent filtering process
            Filtres = []
            dejaVus = []
            LabBrevets = [brev['label'] for brev in LstBrevet]
            for bre in LstBrevet:  # walk the patent list
                if bre['label'] not in dejaVus:  # not seen yet
                    dejaVus.append(bre['label'])  # add to the seen list
                    dates = []  # reset per patent; read again at the end of this block
                    # walk the equivalents; two cases: a list or a non-empty string
                    if isinstance(bre['equivalents'], list) and len(bre['equivalents']) > 0:
                        # collect the dates of each equivalent
                        for brev in bre['equivalents']:
                            if brev in LabBrevets:  # if it belongs to the initial patent set (otherwise ignored)
                                for brevet in LstBrevet:  # go fetch it
                                    if brevet['label'] == brev:  # yes, that's the one!
                                        # dates are sometimes a list OR a string :-(
                                        if isinstance(brevet["date"], list):
                                            date = min(brevet["date"])
                                        else:
                                            date = brevet["date"]
                                        # append to the ad-hoc structure:
                                        # date, patent, size (number of characters)
                                        if len(date) < 4:
                                            print("Ouch")
                                        dates.append((date, bre, len(str(bre.values()))))
                                # dates.extend((brevet["date"][0], brevet, len(str(brevet.values()))) for brevet in LstBrevet if brevet['label'] == brev)
                                dejaVus.append(brev)
                        if len(dates) == 1:  # no ambiguity
                            Filtres.append(dates[0][1])
                        elif len(dates) > 1:
                            # take the oldest
                            MiniDate = min([dat for dat, brev, val in dates])
                            MaxVal = max(val for dat, brev, val in dates)
                            if len(MiniDate) < 5:
                                print(bre['prior-Date'], ' -- > ', MiniDate)
                            # give priority to the earliest date while maximizing length
                            # (supposed to be the record carrying the most information)
                            candidat = [bre for dat, bre, val in dates
                                        if dat == MiniDate and val == MaxVal]
                            if len(candidat) == 0:
                                # if that fails, give priority to maximum information content
                                candidat = [brev for dat, brev, val in dates if val == MaxVal]
                                if len(candidat) > 1:
                                    priorDateMin = min([min(brevet["prior-Date"])
                                                        for brevet in candidat])
                                    NewCandidat = [brev for brev in candidat
                                                   if priorDateMin in brev["prior-Date"]]
                                    if len(NewCandidat) > 1:
                                        NewCandidat = NewCandidat[0]
                                    Filtres.append(NewCandidat)
                                else:
                                    print("pffff")
                            else:  # none of the equivalents is in the list
                                Filtres.append(candidat[0])
                        else:
                            Filtres.append(bre)
                    elif isinstance(bre['equivalents'], str) and len(bre['equivalents']) > 0:
                        if bre['equivalents'] in LabBrevets:
                            brevet = [brev for brev in LstBrevet
                                      if brev['label'] == bre['equivalents']][0]
                            # dates are sometimes a list OR a string :-(
                            if isinstance(brevet["date"], list):
                                date = min(brevet["date"])
                            else:
                                date = brevet["date"]
                            if len(date) < 4:
                                print("Ouch")
                            dates = [(date, brevet, len(brevet.values()))]
                            if isinstance(bre["date"], list):
                                date = min(bre["date"])
                            else:
                                date = bre["date"]
                            # join the current patent
                            dates.append((date, bre, len(bre.values())))
                            MiniDate = min([dat for dat, bre, val in dates])
                            MaxVal = max([val for dat, bre, val in dates])
                            candidat = [bre for dat, bre, val in dates
                                        if dat == MiniDate and val == MaxVal]
                            if len(candidat) > 1:
                                pass
                            elif len(candidat) == 0:
                                # give priority to maximum information content
                                candidat = [brev for dat, brev, val in dates if val == MaxVal]
                                if len(candidat) > 1:
                                    priorDateMin = min([min(brevet["prior-Date"])
                                                        for brevet in candidat])
                                    NewCandidat = [brev for brev in candidat
                                                   if priorDateMin in brev["prior-Date"]]
                                    if len(NewCandidat) > 1:
                                        NewCandidat = NewCandidat[0]
                                    Filtres.append(NewCandidat)
                                else:
                                    print("pffff")
                            else:
                                Filtres.append(candidat[0])
                        else:  # equivalent not in the corpus
                            Filtres.append(bre)
                    else:
                        Filtres.append(bre)
                    for dat, brevet, val in dates:
                        dejaVus.append(brevet['label'])

            # joining lost patents
            LabFiltered = []
            for bre in Filtres:
                if isinstance(bre["label"], str):
                    LabFiltered.append(bre['label'])
                else:
                    LabFiltered.append(bre['label'][0])
            EquivFiltered = []
            for bre in Filtres:
                for pat in bre['equivalents']:
                    EquivFiltered.append(pat)
            complement = [bre for bre in LstBrevet
                          if bre['label'] not in LabFiltered
                          and sum([eq in EquivFiltered for eq in bre["equivalents"]]) == 0]

            NewFilt = []
            DejaVus = []
            for bre in Filtres:
                if isinstance(bre['label'], list):
                    bre['label'] = bre['label'][0]
                if bre['label'] not in DejaVus:
                    equi = []
                    cpFilt = copy.copy(Filtres)
                    cpFilt.remove(bre)
                    for bre1 in cpFilt:
                        if isinstance(bre1['equivalents'], list):
                            for eq in bre1['equivalents']:
                                if len(eq) > 0 and eq != 'empty':
                                    equi.append(eq)
                        elif len(bre1['equivalents']) > 1 and bre1['equivalents'] != 'empty':
                            equi.append(bre1['equivalents'])
                        else:
                            pass
                    if len(bre['equivalents']) > 0 and bre['equivalents'] != 'empty':
                        res = sum([pat in equi for pat in bre['equivalents']]
                                  + [bre['label'] in equi])
                        if res > 0:
                            tempo = [(bre2['date'], bre2, len(bre2.values()))
                                     for bre2 in cpFilt if bre['equivalents']]
                            # we could jump straight here and just test the size of tempo :-/
                            tempo2 = []
                            for dat, brevet, val in tempo:
                                if isinstance(dat, str):
                                    tempo2.append((dat, brevet, val))
                                elif isinstance(dat, list):
                                    for truc in dat:
                                        if len(truc) > 0:
                                            tempo2.append((truc, brevet, val))
                                else:
                                    pass
                            tempo = tempo2
                            dates = []
                            valeurs = []
                            for dat, brevet, val in tempo:
                                dates.append(dat)
                                valeurs.append(val)
                            miniDate = min(dates)
                            maxVal = max(valeurs)
                            tempo2 = [bre for dat, brevet, val in tempo
                                      if dat == miniDate and val == maxVal]
                            if len(tempo2) > 0:
                                NewFilt.append(tempo2[0])
                                if isinstance(tempo2[0]['equivalents'], list):
                                    for eq in tempo2[0]['equivalents']:
                                        DejaVus.append(eq)
                                elif tempo2[0]['equivalents'] != 'empty':
                                    DejaVus.append(tempo2[0]['equivalents'])
                            else:
                                tempo2 = [bre for dat, brevet, val in tempo if val == maxVal]
                                if len(tempo2) > 0:
                                    NewFilt.append(tempo2[0])
                                    if isinstance(tempo2[0]['equivalents'], list):
                                        for eq in tempo2[0]['equivalents']:
                                            DejaVus.append(eq)
                                    elif tempo2[0]['equivalents'] != 'empty':
                                        DejaVus.append(tempo2[0]['equivalents'])
                                else:
                                    pass
                        else:
                            NewFilt.append(bre)
                    else:
                        NewFilt.append(bre)
                    if isinstance(bre['label'], str):
                        DejaVus.append(bre['label'])
                    else:
                        for lab in bre['label']:
                            DejaVus.append(lab)
                    if isinstance(bre['equivalents'], list):
                        for eq in bre['equivalents']:
                            DejaVus.append(eq)
                    elif bre['equivalents'] != 'empty':
                        DejaVus.append(bre['equivalents'])
                    else:
                        pass

            EquivFiltered = []
            cpFilt = copy.copy(Filtres)
            for bre in Filtres:
                if not isinstance(bre['label'], str):
                    if len(bre['label']) > 0:
                        bre['label'] = bre['label'][0]
                    else:
                        print("no label !!!!")
                else:
                    pass
            toRemove = []
            for bre in Filtres:
                if not isinstance(bre['equivalents'], list):
                    if len(bre['equivalents']) and bre['equivalents'] != 'empty':
                        bre['equivalents'] = [bre['equivalents']]
                    else:
                        bre['equivalents'] = []
                for pat in bre['equivalents']:
                    if pat != bre['label']:
                        if pat not in EquivFiltered:
                            EquivFiltered.append(pat)
                        else:
                            cpFilt = [brev for brev in cpFilt
                                      if brev['label'] != bre['label']]
                            toRemove.append((bre['label'], bre))
            exclude = [truc for truc, muche in toRemove]
            Resultat = []
            for bre in Filtres:
                if bre['label'] not in exclude:
                    Resultat.append(bre)
            EquivFiltered2 = []
            for bre in Resultat:
                for pat in bre['equivalents']:
                    EquivFiltered2.append(pat)
            print("net set of equivalents covered:", len(EquivFiltered2))
            print(len(LstBrevet), ' --> ', len(Filtres), ' --> ', len(Resultat))
            print("Good,", len(Resultat + complement),
                  "patents filtered for equivalent unicity, extracted from", fic)

            # Saving file
            with open(ResultBiblioPath + '//tempo' + fic, 'ab') as ficRes:
                for brev in Resultat:
                    pickle.dump(brev, ficRes)
            os.rename(ResultBiblioPath + '//' + fic,
                      ResultBiblioPath + '//Old' + fic)
            os.rename(ResultBiblioPath + '//tempo' + fic,
                      ResultBiblioPath + '//' + fic)
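# The selection logic above boils down to one rule, applied among a patent and
# its equivalents: keep the record with the oldest date, breaking ties by the
# largest serialized size (used as a proxy for information content). A
# condensed sketch of that rule -- not the author's exact code, which also
# handles list-valued dates and priority dates:
def pick_representative_sketch(candidates):
    """candidates: list of (date, patent_dict, size) tuples; return one patent."""
    mini_date = min(date for date, patent, size in candidates)
    max_size = max(size for date, patent, size in candidates)
    best = [patent for date, patent, size in candidates
            if date == mini_date and size == max_size]
    if not best:
        # oldest and largest disagree: fall back to information content alone
        best = [patent for date, patent, size in candidates if size == max_size]
    return best[0]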
GatherBiblio = configFile.GatherBiblio
GatherPatent = configFile.GatherPatent
GatherFamilly = configFile.GatherFamilly
# should set a working dir one of these days... done: it is temporPath
ResultBiblioPath = configFile.ResultBiblioPath
ResultPatentPath = configFile.ResultListPath
ResultContentsPath = configFile.ResultContentsPath
GlobalPath = configFile.GlobalPath

# take the request from the BiblioPatent file
# NEW 12/12/15: the new gatherer appends data to the pickle file
# in order to consume less memory
if 'Description' + ndf in os.listdir(ResultBiblioPath):
    data = LoadBiblioFile(ResultBiblioPath, ndf)
    requete = data['requete']
else:  # Retrocompatibility
    print("please use Comptatibilizer")
# if 'Fusion' in data.keys()
data = dict()

if GatherFamilly:
    # update needed for the families
    # NEW 12/12/15: the new gatherer appends data to the pickle file
    # in order to consume less memory
    if 'DescriptionFamilies' + ndf in os.listdir(ResultBiblioPath):
        data2 = LoadBiblioFile(ResultBiblioPath, 'Families' + ndf)
        nbFam = len(data2['brevets'])
    else:  # Retrocompatibility
        print("please use Comptatibilizer")
        # with open(ResultBiblioPath + '//Families' + ndf, 'r') as ficBib:
        #     data2 = cPickle.load(ficBib)