def process(self):
    """Push the Nebis tar.gz delivery files to Kafka.

    Reads every gzipped tar archive in the working directory, cleans each
    contained document via CleanupNebis and sends the result to Kafka,
    flushing the producer after every 1000 messages. On success the
    stored configuration is updated; any exception is logged only.
    """
    try:
        self.source_logger_summary.info(
            '\n\n\n\nStart filepush_nebis {SOURCE} {STARTTIME}'.format(
                SOURCE=self._shortcut_source_name,
                STARTTIME=current_timestamp()))
        self.pre_process()
        # raw string: ".*\.gz" was an invalid escape sequence in a
        # plain string literal
        nebis_incoming_files = list_files_absolute_sorted(
            self.configuration.working_dir, r".*\.gz")
        cleanUpNebis = CleanupNebis(self.configuration)
        number_messages = 0
        for incoming_file in nebis_incoming_files:
            # "with" closes the archive deterministically; the original
            # never closed the handle (resource leak)
            with tarfile.open(incoming_file, 'r:gz') as tar:
                for single_file in tar.getmembers():
                    buffered_reader = tar.extractfile(single_file)
                    if buffered_reader is None:
                        # directory or special member - nothing to read
                        continue
                    # archive members yield bytes; decode once so the
                    # cleanup step works on str like the other sources
                    content = buffered_reader.read().decode('utf-8')
                    cleanContent = cleanUpNebis.cleanup(content)
                    if len(cleanContent) > 0:
                        self.send(
                            key=cleanContent['key'].encode('utf8'),
                            message=cleanContent['cleanDoc'].encode('utf8'))
                        number_messages += 1
                        if number_messages % 1000 == 0:
                            self.flush()
        self.flush()
        self.post_process()
        self.source_logger_summary.info(
            'Anzahl der nach Kafka gesendeten messages: {ANZAHL}'.format(
                ANZAHL=number_messages))
        self.source_logger_summary.info(
            'STOP filepush_nebis_kafka datasource {SOURCE} {STOPTIME}'.
            format(SOURCE=self._shortcut_source_name,
                   STOPTIME=current_timestamp()))
    except Exception as baseException:
        self.source_logger.error(
            'Exception während filepush_nebis_kafka: {MESSAGE}'.format(
                MESSAGE=str(baseException)))
    else:
        self.source_logger_summary.info(
            'Keine Exception im Basisworkflow filepush_nebis_kafka der source {SOURCE}'
            .format(SOURCE=self._shortcut_source_name))
        self.update_configuration()
def process(self):
    """Push webdav (RERO) deletes and updates to Kafka.

    Runs only when the content provider reports available content. On
    success the processed files are moved back to the source directory
    and the stored configuration is updated; exceptions are logged only.
    """
    if self.contentprovider.is_content_available():
        try:
            self.source_logger_summary.info(
                '\n\n\n\nStart webdav_kafka {SOURCE} {STARTTIME}'.format(
                    SOURCE=self._shortcut_source_name,
                    STARTTIME=current_timestamp()))
            number_messages = 0

            def push_records(records):
                # shared send loop for deletes and updates; flushes the
                # producer after every 1000 messages overall
                nonlocal number_messages
                for record in records:
                    identifier_key = self.p_identifier_key.search(
                        record).group(1)
                    self.send(key=identifier_key.encode('utf8'),
                              message=record.encode('utf8'))
                    number_messages += 1
                    if number_messages % 1000 == 0:
                        self.flush()

            push_records(self.contentprovider.provide_deletes())
            # the counter is intentionally NOT reset between the two
            # phases anymore: the reset made the summary below report
            # only the update count instead of deletes + updates
            push_records(self.contentprovider.provide_updates())
            self.flush()
            self.source_logger_summary.info(
                'Anzahl der nach Kafka gesendeten messages: {ANZAHL}'.format(
                    ANZAHL=number_messages))
            self.source_logger_summary.info(
                'number of deleted records: {deleted} / number of updated records {updated}'.format(
                    deleted=self.contentprovider._number_delete_records,
                    updated=self.contentprovider._number_update_records))
            self.source_logger_summary.info(
                'STOP Harvesting datasource {SOURCE} {STOPTIME}'.format(
                    SOURCE=self._shortcut_source_name,
                    STOPTIME=current_timestamp()))
        except Exception as baseException:
            # log text repaired (was garbled: "Rzerorozesses")
            self.source_logger.error(
                'Exception während des Rero-Prozesses: {MESSAGE}'.format(
                    MESSAGE=str(baseException)))
        else:
            self.source_logger_summary.info(
                'Keine Exception im webdav-process source {SOURCE}'.format(
                    SOURCE=self._shortcut_source_name))
            move_files(self.configuration.rero_working_dir,
                       self.configuration.rero_src_dir)
            self.update_configuration()
def process(self):
    """Fetch all PHBern course categories and dump each to its JSON file.

    Every category endpoint is requested via make_repository_request
    first (all fetches before any write, as in the original flow), then
    each payload is written to data/phbern/<category>.json wrapped
    between the source's header and footer (writeFooterAndClose also
    closes the file handle).
    """
    self.source_logger_summary.info(
        '\n\n\n\nStart Edu zem {SOURCE} {STARTTIME}'.format(
            SOURCE=self._shortcut_source_name,
            STARTTIME=current_timestamp()))

    def dump_category(path, payload):
        # one helper instead of six identical
        # open/writeHeader/json.dump/writeFooterAndClose blocks
        file = open(path, "w")
        self.writeHeader(file)
        json.dump(payload, file, indent=20)
        self.writeFooterAndClose(file)

    # (endpoint URL, target file) per category
    categories = (
        (self.url_all_kurse, "data/phbern/kurse.json"),
        (self.url_all_informationsveranstaltungen,
         "data/phbern/informationsveranstaltung.json"),
        (self.url_all_intensivweiterbildungen,
         "data/phbern/intensivweiterbildung.json"),
        (self.url_all_lehrgaenge, "data/phbern/lehrgaenge.json"),
        (self.url_all_referate, "data/phbern/referate.json"),
        (self.url_all_tagungen, "data/phbern/tagungen.json"),
    )
    # fetch everything first, then write - preserves the original
    # request/write ordering
    fetched = [(path, self.make_repository_request(url))
               for url, path in categories]
    for path, payload in fetched:
        dump_category(path, payload)

    self.source_logger_summary.info(
        '\n\n\n\nFinished Edu zem {SOURCE} {STARTTIME}'.format(
            SOURCE=self._shortcut_source_name,
            STARTTIME=current_timestamp()))
def getProjectId(self, url: str):
    """Return the project id taken from the tail of a project URL.

    Example: '/v1/projects/700037' -> '700037'. If the URL contains no
    slash the whole string is returned unchanged.
    """
    projectid = url.rsplit("/", 1)[-1]
    self.source_logger_summary.info(
        '\nFetching project id:{PROJECTID} {CURRENTTIME}'.format(
            PROJECTID=projectid, CURRENTTIME=current_timestamp()))
    return projectid
def process(self):
    """Harvest all OAI records of this datasource and push them to Kafka.

    On success the stored configuration is updated; any exception is
    logged and swallowed.
    """
    try:
        self.source_logger_summary.info(
            '\n\n\n\nStart Harvesting datasource {SOURCE} {STARTTIME}'.
            format(SOURCE=self._shortcut_source_name,
                   STARTTIME=current_timestamp()))
        harvester = OaiSickleWrapper(self.configuration,
                                     self.source_logger_summary,
                                     self.source_logger)
        sent_count = 0
        for record in harvester.fetch_iter():
            # OAI identifier as the Kafka key, raw record as the payload
            self.send(key=record.header.identifier.encode('utf8'),
                      message=record.raw.encode('utf8'))
            sent_count += 1
        self.flush()
        self.source_logger_summary.info(
            'Anzahl der nach Kafka gesendeten messages: {ANZAHL}'.format(
                ANZAHL=sent_count))
        self.source_logger_summary.info(
            'STOP Harvesting datasource {SOURCE} {STOPTIME}'.format(
                SOURCE=self._shortcut_source_name,
                STOPTIME=current_timestamp()))
    except Exception as baseException:
        self.source_logger.error(
            'Exception während des Harvestingprozesses: {MESSAGE}'.format(
                MESSAGE=str(baseException)))
    else:
        self.source_logger_summary.info(
            'Keine Exception im Basisworkflow Harvesting der source {SOURCE}'
            .format(SOURCE=self._shortcut_source_name))
        self.update_configuration()
def update_stop_time(self):
    """Store the current timestamp as the default processing stop time."""
    default_section = self.specializedConfiguration['Processing']["Default"]
    default_section['stoppageTime'] = current_timestamp()
def process(self):
    """Fetch all projects, enrich each relevant one and push it to Kafka.

    Lists the projects via GET /v1/projects, skips everything up to the
    last processed project id (resume support via self.active /
    self.last_project_id), then for each relevant course: resolves
    methods/price note from its forms, attaches company details, builds
    a privacy-reduced contact view, strips bulky sub-resources and sends
    the JSON document keyed by project id.
    """
    self.source_logger_summary.info(
        '\n\n\n\nStart Edu zem {SOURCE} {STARTTIME}'.format(
            SOURCE=self._shortcut_source_name,
            STARTTIME=current_timestamp()))
    # NOTE(review): filtering server-side via POST with a status body is
    # currently not possible - the registered application type does not
    # allow POST/PUT, so everything is fetched and filtered client-side.
    response = requests.get(self.base_url + "/v1/projects",
                            headers=self.headers)
    if response.ok:
        projects = json.loads(response.text)
        for project in projects:
            projectId = self.getProjectId(project["self"])
            # resume: skip up to and including the last processed id;
            # the id AFTER the stored one is the first to process
            if not self.active:
                if projectId == str(self.last_project_id):
                    self.active = True
                else:
                    self.source_logger_summary.info(
                        '\nFetched projectid {ID} passed'.format(
                            ID=projectId, STARTTIME=current_timestamp()))
                continue
            self.check_valid_access_token()
            fp = self.make_repository_request(project["self"])
            # relevance check (e.g. only courses with a useful status)
            if not self.is_course_relevant_for_metisgym(fp):
                continue
            if "forms" in fp:
                methods = []
                price_note = None
                for form in fp["forms"]:
                    method_pricenote_tuple = \
                        self.process_methods_pricenote_in_form(form)
                    methods.append(method_pricenote_tuple[0])
                    # NOTE(review): the last form wins, as in the
                    # original implementation
                    price_note = method_pricenote_tuple[1]
                fp["course_methods"] = list(itertools.chain(*methods))
                if price_note is not None:
                    fp["price_note"] = price_note
            if "companies" in fp:
                for company in fp["companies"]:
                    company["details"] = self.processCompany(company)
            if "contacts" in fp:
                for contact in fp["contacts"]:
                    jsonContactDetails = self.make_repository_request(
                        contact["contact"])
                    # privacy-reduced view of the contact: missing
                    # scalar fields become "na", missing lists become []
                    privacyContact = {
                        "prefix": jsonContactDetails.get("prefix", "na"),
                        "first_name":
                            jsonContactDetails.get("first_name", "na"),
                        "last_name":
                            jsonContactDetails.get("last_name", "na"),
                        "birthday":
                            jsonContactDetails.get("birthday", "na"),
                        "keywords":
                            jsonContactDetails.get("keywords", []),
                        "emails": jsonContactDetails.get("emails", []),
                        "phone_numbers":
                            jsonContactDetails.get("phone_numbers", []),
                    }
                    if "companies" in jsonContactDetails:
                        privacyContact["companies"] = jsonContactDetails[
                            "companies"]
                        for company in privacyContact["companies"]:
                            company["details"] = self.processCompany(
                                company)
                    contact["details"] = privacyContact
            # strip bulky / irrelevant sub-resources before sending
            for obsolete in ("tasks", "notes", "forms", "appointments"):
                if obsolete in fp:
                    del fp[obsolete]
            self.send(key=projectId.encode('utf8'),
                      message=json.dumps(fp).encode('utf8'))
    self.source_logger_summary.info(
        '\n\n\n\nFinished Edu zem {SOURCE} {STARTTIME}'.format(
            SOURCE=self._shortcut_source_name,
            STARTTIME=current_timestamp()))
def process(self):
    """Harvest Evento Bern events and push each enriched event to Kafka.

    Flow per Evento school: fetch all events, the school-specific
    additional texts (label -> memo) and the event locations in bulk,
    index the events by id, attach texts and locations, then request the
    lessons per remaining event (one request each - performance
    relevant) and send every enriched event keyed by its id.
    """
    self.source_logger_summary.info(
        '\n\n\n\nStart Edu zem {SOURCE} {STARTTIME}'.format(
            SOURCE=self._shortcut_source_name,
            STARTTIME=current_timestamp()))
    all_events = self.make_repository_request(self.url_all_events)
    all_events_texts = self.make_repository_request(
        self.url_all_event_texts)
    all_events_locations = self.make_repository_request(
        self.url_all_event_locations)

    # Index events by their string Id and prepare the enrichment slots.
    # Event Id == 2 is excluded (hard-coded filter from the original;
    # presumably a test/placeholder event - TODO confirm).
    # Insertion order of the dict matches the incoming event order.
    all_events_dict = {}
    for single_event in all_events:
        if single_event['Id'] == 2:
            continue
        event_key = str(single_event['Id'])
        single_event['event_texts'] = []
        single_event['event_locations'] = []
        single_event['lessons_of_event'] = []
        all_events_dict[event_key] = single_event

    # NOTE(review): as in the original, there is no check that the
    # EventId references in texts/locations exist in the filtered set -
    # a dangling reference raises KeyError here.
    for single_event_text in all_events_texts:
        all_events_dict[str(
            single_event_text['EventId'])]['event_texts'].append(
                single_event_text)
    for single_event_location in all_events_locations:
        all_events_dict[str(
            single_event_location['EventId'])]['event_locations'].append(
                single_event_location)

    # lessons have to be requested per event
    for event_key, single_event in all_events_dict.items():
        lessons_url = self.lessons_of_event.replace("{AnlassId}", event_key)
        lessons_of_single_event = self.make_repository_request(lessons_url)
        for single_lesson in lessons_of_single_event:
            single_event['lessons_of_event'].append(single_lesson)

    for key, evento_course in all_events_dict.items():
        self.send(key=key.encode('utf8'),
                  message=json.dumps(evento_course).encode('utf8'))

    self.source_logger_summary.info(
        '\n\n\n\nFinished Edu zem {SOURCE} {STARTTIME}'.format(
            SOURCE=self._shortcut_source_name,
            STARTTIME=current_timestamp()))