def process(self):

        try:
            self.source_logger_summary.info(
                '\n\n\n\nStart filepush_nebis {SOURCE} {STARTTIME}'.format(
                    SOURCE=self._shortcut_source_name,
                    STARTTIME=current_timestamp()))

            self.pre_process()
            nebis_incoming_files = list_files_absolute_sorted(
                self.configuration.working_dir, r".*\.gz")

            cleanUpNebis = CleanupNebis(self.configuration)
            number_messages = 0
            for incoming_file in nebis_incoming_files:
                tar = tarfile.open(incoming_file, 'r:gz')
                for single_file in tar.getmembers():
                    buffered_reader = tar.extractfile(single_file)
                    # extractfile() returns None for non-file members
                    if buffered_reader is None:
                        continue
                    # todo: buffered_reader yields bytes, not str. Check
                    # whether decoding to UTF-8 is sufficient here, or whether
                    # the content must stay bytes as with most other sources.
                    content = buffered_reader.read().decode('utf-8')
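                    # Note (assumption): if an archive is not strictly UTF-8,
                    # decoding with errors='replace' would avoid a hard
                    # UnicodeDecodeError at the cost of substituted characters.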

                    cleanContent = cleanUpNebis.cleanup(content)
                    if cleanContent:
                        self.send(
                            key=cleanContent['key'].encode('utf8'),
                            message=cleanContent['cleanDoc'].encode('utf8'))
                        number_messages += 1
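                        # flush periodically, presumably to bound the producer's buffer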
                        if number_messages % 1000 == 0:
                            self.flush()
                tar.close()

            self.flush()
            self.post_process()

            self.source_logger_summary.info(
                'Number of messages sent to Kafka: {ANZAHL}'.format(
                    ANZAHL=number_messages))

            self.source_logger_summary.info(
                'STOP filepush_nebis_kafka datasource {SOURCE} {STOPTIME}'.
                format(SOURCE=self._shortcut_source_name,
                       STOPTIME=current_timestamp()))

        except Exception as baseException:
            self.source_logger.error(
                'Exception during filepush_nebis_kafka: {MESSAGE}'.format(
                    MESSAGE=str(baseException)))
        else:
            self.source_logger_summary.info(
                'Keine Exception im Basisworkflow filepush_nebis_kafka der source {SOURCE}'
                .format(SOURCE=self._shortcut_source_name))
            self.update_configuration()

    def process(self):
        if self.contentprovider.is_content_available():
            try:

                self.source_logger_summary.info('\n\n\n\nStart webdav_kafka {SOURCE} {STARTTIME}'.format(
                    SOURCE=self._shortcut_source_name,
                    STARTTIME=current_timestamp()
                ))

                number_messages = 0
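                # send one message per deleted record, keyed by its identifier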
                for delete in self.contentprovider.provide_deletes():
                    identifier_key = self.p_identifier_key.search(delete).group(1)
                    self.send(key=identifier_key.encode('utf8'),
                              message=delete.encode('utf8'))
                    number_messages += 1
                    if number_messages % 1000 == 0:
                        self.flush()

                for update in self.contentprovider.provide_updates():
                    identifier_key = self.p_identifier_key.search(update).group(1)
                    self.send(key=identifier_key.encode('utf8'),
                              message=update.encode('utf8'))
                    number_messages += 1
                    if number_messages % 1000 == 0:
                        self.flush()

                self.flush()

                self.source_logger_summary.info('Number of messages sent to Kafka: {ANZAHL}'.format(
                    ANZAHL=number_messages
                ))

                self.source_logger_summary.info('number of deleted records: {deleted} / number of updated records: {updated}'.format(
                    deleted=self.contentprovider._number_delete_records,
                    updated=self.contentprovider._number_update_records
                ))


                self.source_logger_summary.info('STOP Harvesting datasource {SOURCE} {STOPTIME}'.format(
                    SOURCE=self._shortcut_source_name,
                    STOPTIME=current_timestamp()
                ))

            except Exception as baseException:
                self.source_logger.error('Exception during the RERO process: {MESSAGE}'.format(
                    MESSAGE=str(baseException)))
            else:
                self.source_logger_summary.info('Keine Exception im webdav-process source {SOURCE}'.format(
                    SOURCE=self._shortcut_source_name))


            move_files(self.configuration.rero_working_dir, self.configuration.rero_src_dir)

        self.update_configuration()

    def process(self):

        self.source_logger_summary.info(
            '\n\n\n\nStart Edu zem {SOURCE} {STARTTIME}'.format(
                SOURCE=self._shortcut_source_name,
                STARTTIME=current_timestamp()))

        #self.refresh_access_token()
        #self.check_valid_access_token()

        all_kurse = self.make_repository_request(self.url_all_kurse)
        all_informationsveranstaltungen = self.make_repository_request(
            self.url_all_informationsveranstaltungen)
        all_intensivweiterbildungen = self.make_repository_request(
            self.url_all_intensivweiterbildungen)
        all_lehrgaenge = self.make_repository_request(self.url_all_lehrgaenge)
        all_referate = self.make_repository_request(self.url_all_referate)
        all_tagungen = self.make_repository_request(self.url_all_tagungen)

        # serialize each result set to its own JSON file, wrapped by the
        # source's header/footer helpers
        exports = [
            ("data/phbern/kurse.json", all_kurse),
            ("data/phbern/informationsveranstaltung.json",
             all_informationsveranstaltungen),
            ("data/phbern/intensivweiterbildung.json",
             all_intensivweiterbildungen),
            ("data/phbern/lehrgaenge.json", all_lehrgaenge),
            ("data/phbern/referate.json", all_referate),
            ("data/phbern/tagungen.json", all_tagungen),
        ]
        for path, payload in exports:
            file = open(path, "w")
            self.writeHeader(file)
            json.dump(payload, file, indent=20)
            self.writeFooterAndClose(file)

        self.source_logger_summary.info(
            '\n\n\n\nFinished Edu zem {SOURCE} {STOPTIME}'.format(
                SOURCE=self._shortcut_source_name,
                STOPTIME=current_timestamp()))

    def getProjectId(self, url: str):

        # example URL: '/v1/projects/700037'; we are looking for the trailing
        # project number
        projectid = url[url.rfind("/") + 1:]
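        # equivalently: projectid = url.rsplit("/", 1)[-1]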
        self.source_logger_summary.info(
            '\nFetching project id: {PROJECTID} {CURRENTTIME}'.format(
                PROJECTID=projectid, CURRENTTIME=current_timestamp()))

        return projectid

# Example #5

    def process(self):

        try:

            self.source_logger_summary.info(
                '\n\n\n\nStart Harvesting datasource {SOURCE} {STARTTIME}'.
                format(SOURCE=self._shortcut_source_name,
                       STARTTIME=current_timestamp()))

            oai_sickle = OaiSickleWrapper(self.configuration,
                                          self.source_logger_summary,
                                          self.source_logger)
            messages = 0
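            # stream records from the OAI endpoint directly into Kafka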
            for record in oai_sickle.fetch_iter():
                messages += 1
                self.send(key=record.header.identifier.encode('utf8'),
                          message=record.raw.encode('utf8'))
            self.flush()

            self.source_logger_summary.info(
                'Number of messages sent to Kafka: {ANZAHL}'.format(
                    ANZAHL=messages))

            self.source_logger_summary.info(
                'STOP Harvesting datasource {SOURCE} {STOPTIME}'.format(
                    SOURCE=self._shortcut_source_name,
                    STOPTIME=current_timestamp()))

        except Exception as baseException:
            self.source_logger.error(
                'Exception during the harvesting process: {MESSAGE}'.format(
                    MESSAGE=str(baseException)))
        else:
            self.source_logger_summary.info(
                'Keine Exception im Basisworkflow Harvesting der source {SOURCE}'
                .format(SOURCE=self._shortcut_source_name))

        self.update_configuration()

    def update_stop_time(self):
        self.specializedConfiguration['Processing']["Default"][
            'stoppageTime'] = current_timestamp()

    def process(self):

        self.source_logger_summary.info(
            '\n\n\n\nStart Edu zem {SOURCE} {STARTTIME}'.format(
                SOURCE=self._shortcut_source_name,
                STARTTIME=current_timestamp()))

        response = requests.get(self.base_url + "/v1/projects",
                                headers=self.headers)
        # todo: POST is currently not possible.
        # Change the type of the registered application so that POST and PUT
        # methods are allowed.
        #status = {"state": {"equal": "open"}}
        #response = requests.post(self.base_url + "/v1/projects", data=status, headers=self.headers)

        if response.ok:
            text = response.text
            projects = json.loads(text)

            for project in projects:

                projectId = self.getProjectId(project["self"])

                # resume logic: skip projects until last_project_id from the
                # previous run is reached, then process from the next one
                if not self.active:
                    if projectId == str(self.last_project_id):
                        # the next id should be used
                        self.active = True
                        continue
                    else:
                        self.source_logger_summary.info(
                            '\nFetched project id {ID} skipped'.format(
                                ID=projectId))
                        continue

                self.check_valid_access_token()

                #fullproject = requests.get(self.base_url + project["self"], headers=self.headers)
                # example project with a price note:
                #fullproject = requests.get(self.base_url + "/v1/projects/997065", headers=self.headers)

                fp = self.make_repository_request(project["self"])

                # Silvia:
                # I would only take courses with status in_progress. At the
                # moment deferred and abandoned are still present, which are
                # certainly not current courses. Then there is also status
                # new, which makes little sense since no reasonable course
                # description exists yet. So better drop it.

                #if not "status" in fp or fp["status"] is None or fp["status"] == "done" or fp["status"] == "cancelled":
                #if not "status" in fp or fp["status"] is None or fp["status"] != 'in_progress':
                #  continue
                if not self.is_course_relevant_for_metisgym(fp):
                    continue

                #t = open("onlytest.json","w")
                #json.dump(fp,t,indent=20)
                #t.flush()
                #t.close()

                if "forms" in fp:
                    methods = []
                    price_note = None
                    for form in fp["forms"]:
                        form_methods, price_note = self.process_methods_pricenote_in_form(
                            form)
                        methods.append(form_methods)
                    # flatten the per-form method lists into a single list
                    fp["course_methods"] = list(itertools.chain(*methods))
                    if price_note is not None:
                        fp["price_note"] = price_note

                if "companies" in fp:
                    for company in fp["companies"]:
                        company["details"] = self.processCompany(company)

                if "contacts" in fp:
                    for contact in fp["contacts"]:
                        jsonContactDetails = self.make_repository_request(
                            contact["contact"])
                        #jsonContactDetails = json.loads(requests.get(self.base_url + contact["contact"], headers=self.headers).text)
                        # copy only the needed contact fields, with defaults
                        privacyContact = {
                            "prefix": jsonContactDetails.get("prefix", "na"),
                            "first_name": jsonContactDetails.get("first_name", "na"),
                            "last_name": jsonContactDetails.get("last_name", "na"),
                            "birthday": jsonContactDetails.get("birthday", "na"),
                            "keywords": jsonContactDetails.get("keywords", []),
                            "emails": jsonContactDetails.get("emails", []),
                            "phone_numbers": jsonContactDetails.get("phone_numbers", []),
                        }
                        if "companies" in jsonContactDetails:
                            privacyContact["companies"] = jsonContactDetails["companies"]
                            for company in privacyContact["companies"]:
                                company["details"] = self.processCompany(company)
                        contact["details"] = privacyContact

                # strip sub-resources that are not needed downstream
                for redundant_key in ("tasks", "notes", "forms", "appointments"):
                    fp.pop(redundant_key, None)

                # one Kafka message per relevant project, keyed by the project id
                self.send(key=projectId.encode('utf8'),
                          message=json.dumps(fp).encode('utf8'))

        self.source_logger_summary.info(
            '\n\n\n\nFinished Edu zem {SOURCE} {STOPTIME}'.format(
                SOURCE=self._shortcut_source_name,
                STOPTIME=current_timestamp()))

# Example #8

    def process(self):

        self.source_logger_summary.info(
            '\n\n\n\nStart Edu zem {SOURCE} {STARTTIME}'.format(
                SOURCE=self._shortcut_source_name,
                STARTTIME=current_timestamp()))

        # read data from a file if needed (the test data was deleted)
        #fp = open("data/evento-bern/evento_content_eventobern_all.json","r")
        #all_events_dict = json.load(fp)
        #for key, evento_course in all_events_dict.items():
        #    self.send(key=key.encode('utf8'),
        #              message=json.dumps(evento_course).encode('utf8'))

        # Description of the procedure per Evento school:

        # Obtain a public token
        # Fetch all events
        # Capture all additional texts for the events (individual per school). Label -> Memo
        # Fetch all EventLocations
        # Filter by EventLevelId (1005 -> LuL Fortbildung, the id will still change for production)
        # For each filtered event, also fetch its lessons (performance)
        # Question:
        # Do you also want to handle registrations right away? If yes, I would
        # have to hand over two more requests and a bit more business-logic
        # knowledge.
        # No!

        #self.refresh_access_token()
        #self.check_valid_access_token()

        all_events = self.make_repository_request(self.url_all_events)
        all_events_texts = self.make_repository_request(
            self.url_all_event_texts)
        all_events_locations = self.make_repository_request(
            self.url_all_event_locations)
        #all_lessons_of_events = self.make_repository_request(self.lessons_of_event)

        # all_events_serialized = open("all_events_serialized_eventobern.json", "w")
        # all_events_texts_serialized = open("all_events_texts_serialized_eventobern.json", "w")
        # all_events_locations_serialized = open("all_events_locations_serialized_eventobern.json", "w")
        # #all_lessons_of_events_serialized = open("all_lessons_of_events_serialized_eventobern.json", "w")
        #
        # json.dump(all_events, all_events_serialized, indent=20)
        # all_events_serialized.flush()
        # all_events_serialized.close()
        #
        # json.dump(all_events_texts, all_events_texts_serialized, indent=20)
        # all_events_texts_serialized.flush()
        # all_events_texts_serialized.close()
        #
        # json.dump(all_events_locations, all_events_locations_serialized, indent=20)
        # all_events_locations_serialized.flush()
        # all_events_locations_serialized.close()

        # aggregate view of the raw responses (currently not sent anywhere)
        all_data = {}
        all_data["all_events"] = all_events
        all_data["all_events_texts"] = all_events_texts
        all_data["all_events_locations"] = all_events_locations
        #all_data["all_lessons_of_events"] = all_lessons_of_events
        all_data["all_lessons_of_events"] = []

        # todo: so far there is no check whether the references in the
        # dependent objects actually exist!!
        filtered_events = [event for event in all_events if event['Id'] != 2]
        all_events_dict = {}
        for event in filtered_events:
            key = str(event['Id'])
            all_events_dict[key] = event
            event['event_texts'] = []
            event['event_locations'] = []
            event['lessons_of_event'] = []
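        # Hedged sketch for the todo above (assumption, not enabled here):
        # dangling references could be filtered out instead of raising a
        # KeyError in the loops below, e.g.
        #   all_events_texts = [t for t in all_events_texts
        #                       if str(t['EventId']) in all_events_dict]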

        for single_event_text in all_events_texts:
            all_events_dict[str(
                single_event_text['EventId'])]['event_texts'].append(
                    single_event_text)

        for single_event_location in all_events_locations:
            all_events_dict[str(
                single_event_location['EventId'])]['event_locations'].append(
                    single_event_location)

        # fetch the lessons for each filtered event
        for event in filtered_events:
            event_key = str(event['Id'])
            lessons_url = self.lessons_of_event.replace("{AnlassId}", event_key)
            lessons_of_single_event = self.make_repository_request(lessons_url)
            for single_lesson in lessons_of_single_event:
                all_events_dict[event_key]['lessons_of_event'].append(
                    single_lesson)

        # evento_out_serialized = open("evento_content_eventobern_all.json","w")
        # json.dump(all_events_dict, evento_out_serialized, indent=20)
        # evento_out_serialized.flush()
        # evento_out_serialized.close()

        # send one Kafka message per event, keyed by the event id
        for key, evento_course in all_events_dict.items():
            self.send(key=key.encode('utf8'),
                      message=json.dumps(evento_course).encode('utf8'))

        self.source_logger_summary.info(
            '\n\n\n\nFinished Edu zem {SOURCE} {STOPTIME}'.format(
                SOURCE=self._shortcut_source_name,
                STOPTIME=current_timestamp()))