Example #1
    def downloadWebpage(self):
        print 'Trying to get website information...please wait...'
        # Cache file for this series' episode list.
        cache = Fernsehserien_de_Scraper.CACHE_FOLDER + '/' + self.name + '_' + 'eplist.dat'
        # Reuse the cached copy while it is younger than 12 hours (43200 s).
        if os.path.isfile(cache) and (time.time() -
                                      os.path.getmtime(cache)) < 43200:
            print 'Use local file...'
            # With Python 2's urllib.urlopen a plain path opens the local
            # file; BeautifulSoup below accepts the file-like object.
            webpage = urlopen(cache)
        else:
            # Prefer an explicit slug from the serieslinks mapping,
            # otherwise derive one from the series name.
            if self.name in serieslinks:
                title = serieslinks[self.name]
            else:
                title = self.name.replace(' ', '-')

            webpage = urlopen('http://www.fernsehserien.de/' + title +
                              '/episodenguide').read()

            # Make sure the series folder and the cache folder exist.
            if not os.path.isdir(self.name.replace('-', ' ')):
                os.mkdir(self.name.replace('-', ' '))

            if not os.path.isdir(Fernsehserien_de_Scraper.CACHE_FOLDER):
                os.mkdir(Fernsehserien_de_Scraper.CACHE_FOLDER)

            f = open(cache, 'w')
            f.write(webpage)
            f.close()

        print 'Website successfully scraped'
        self.soupobj = BeautifulSoup(webpage, "html.parser")
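
The method above fetches the episode guide once, then serves it from a file cache for 12 hours (43200 seconds). A standalone sketch of the same idiom, kept in Python 2 like the examples; fetch_episode_guide, CACHE_FOLDER and CACHE_MAX_AGE are names invented here for illustration:

import os
import time
from urllib import urlopen

CACHE_FOLDER = 'cache'   # hypothetical; the examples use a class constant
CACHE_MAX_AGE = 43200    # 12 hours in seconds

def fetch_episode_guide(name):
    """Return episode-guide HTML for a series, using a 12 h file cache."""
    if not os.path.isdir(CACHE_FOLDER):
        os.mkdir(CACHE_FOLDER)
    cache = os.path.join(CACHE_FOLDER, name + '_eplist.html')
    # Serve the cached copy while it is younger than CACHE_MAX_AGE.
    if os.path.isfile(cache) and time.time() - os.path.getmtime(cache) < CACHE_MAX_AGE:
        return open(cache).read()
    html = urlopen('https://www.fernsehserien.de/'
                   + name.replace(' ', '-') + '/episodenguide').read()
    f = open(cache, 'w')
    f.write(html)
    f.close()
    return html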
Example #3
    def getTimeTable(self, sender):
        print 'Trying to get timetable information...please wait...'

        # Resolve the broadcaster to its URL slug.
        if sender in senderlinks:
            senderlink = senderlinks[sender]
        else:
            print 'Link for broadcaster ' + sender + ' not found'
            return 0

        cache = Fernsehserien_de_Scraper.CACHE_FOLDER + '/' + self.name + '_ttlist.dat'
        # Reuse the cached copy while it is younger than 12 hours (43200 s).
        if os.path.isfile(cache) and (time.time() -
                                      os.path.getmtime(cache)) < 43200:
            print 'Use local file...'
            webpage = urlopen(cache)
        else:
            if self.name.replace(' ', '-') in serieslinks:
                title = serieslinks[self.name.replace(' ', '-')]
            else:
                title = self.name.replace(' ', '-')

            webpage = urlopen('http://www.fernsehserien.de/' + title +
                              '/sendetermine/' + senderlink + '/-1').read()

            if not os.path.isdir(self.name):
                os.mkdir(self.name)

            if not os.path.isdir(Fernsehserien_de_Scraper.CACHE_FOLDER):
                os.mkdir(Fernsehserien_de_Scraper.CACHE_FOLDER)

            f = open(cache, 'w')
            f.write(webpage)
            f.close()

        print 'Website successfully scraped'
        soup = BeautifulSoup(webpage, "html.parser")
        tddata = soup.select("tr")

        epdate, eptime, season, episode, title = [], [], [], [], []

        # Every second <tr> holds one airing; pull date, time, season,
        # episode number and title out of its raw HTML.
        for index, item in enumerate(tddata):
            if index % 2 != 0:
                m = re.search(
                    r"(\d{2}\.\d{2}\.\d{4}).*?(\d{2}:\d{2}).*?>(\d{1,3})<.*?>(\d{1,2}).*?>(\d{1,2}).*?>([^<]+)",
                    str(item))
                if m is not None:
                    epdate.append(m.group(1))
                    eptime.append(m.group(2))
                    # group(3), the running episode number, is not used.
                    season.append(m.group(4))
                    episode.append(m.group(5))
                    title.append(m.group(6))

        return (epdate, season, episode, title, eptime)
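
getTimeTable returns five parallel lists; note the return order (date, season, episode, title, time) differs from the order in which they are filled. A minimal usage sketch, assuming the scraper is constructed with the series name (the constructor is not shown in these examples); 'Wilsberg' and 'ZDF' are placeholder keys:

scraper = Fernsehserien_de_Scraper('Wilsberg')
epdate, season, episode, title, eptime = scraper.getTimeTable('ZDF')
for d, s, e, t, h in zip(epdate, season, episode, title, eptime):
    print '%s %s: S%sE%s - %s' % (d, h, s, e, t)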
    def downloadWebpage(self):
        logging.info('Trying to get website information...please wait...')
        # Strip a trailing ' USA 20xx' country/year tag from the series
        # name; saves entries in tv_shows_db.  #rb 2018-07-10
        self.index0 = self.name.find(' USA 20')
        if self.index0 > -1:
            logging.info('found USA 20...please wait...')
            self.name = self.name[0:self.index0] + self.name[self.index0 + 9:]
            logging.info(self.name)
        else:
            logging.info(self.name)
            # Same for a ' F 20xx' tag.  #rb 2022-01-14
            self.index0 = self.name.find(' F 20')
            if self.index0 > -1:
                logging.info('found F 20...please wait...')
                self.name = self.name[0:self.index0] + self.name[self.index0 + 7:]
                logging.info(self.name)
        cache = Fernsehserien_de_Scraper.CACHE_FOLDER + '/' + self.name + '_' + 'eplist.html'
        # Reuse the cached copy while it is younger than 12 hours (43200 s).
        if os.path.isfile(cache) and (time.time() -
                                      os.path.getmtime(cache)) < 43200:
            logging.info('Use local file...')
            webpage = urlopen(cache)
        else:
            logging.info('self.name:' + self.name)

            if self.name in serieslinks:
                title = serieslinks[self.name]
            else:
                title = self.name.replace(' ', '-')
                title = title.replace('---', '-')  # collapse to a single '-'
            webpage = urlopen('https://www.fernsehserien.de/' + title +
                              '/episodenguide').read()
            if not os.path.isdir(Fernsehserien_de_Scraper.CACHE_FOLDER):
                os.mkdir(Fernsehserien_de_Scraper.CACHE_FOLDER)

            logging.info('Website scraping => done')

            f = open(cache, 'w')
            f.write(webpage)
            f.close()

        self.soupobj = BeautifulSoup(webpage, "html.parser")
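
This variant trims a trailing ' USA 20xx' or ' F 20xx' country/year tag with two find/slice passes; the next example does the same cleanup with a single regular expression. A standalone sketch of that regex approach; strip_country_year and the sample title are invented for illustration:

import re

def strip_country_year(name):
    # Keep everything before a trailing ' <country> 20...' tag,
    # e.g. ' USA 2018' or ' F 2022' (1-3 letter country code).
    m = re.search(r"(.*)( \w{1,3} 20)(.*)", name)
    return m.group(1) if m is not None else name

print strip_country_year('Navy CIS USA 2003')   # -> 'Navy CIS'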
    def getTimeTable(self, sender):
        logging.info('Trying to get timetable information...please wait...')

        # Resolve the broadcaster to its URL slug.
        if sender in senderlinks:
            senderlink = senderlinks[sender]
        else:
            logging.warning('Link for broadcaster ' + sender + ' not found')
            return 0

        # Strip a trailing ' <country> 20xx' tag (e.g. ' USA 2018') from
        # the series name; saves entries in tv_shows_db.  #rb 2022-01-14
        m = re.search(r"(.*)( \w{1,3} 20)(.*)", self.name)
        if m is not None:
            self.name = m.group(1)
        if self.name in serieslinks:
            title = serieslinks[self.name]
        else:
            title = self.name.replace(' ', '-')
            title = title.replace('---', '-')  # collapse to a single '-'
        # One cache file per series, page counter and broadcaster.
        cache = Fernsehserien_de_Scraper.CACHE_FOLDER + '/' + title + str(
            conf.SZaehler) + '_' + senderlink + '_ttlist.html'  #rb
        if os.path.isfile(cache):
            # Cache file mtime as 'yy.mm.ddHH-MM' for the comparison below.
            self._test = datetime.datetime.fromtimestamp(
                os.path.getmtime(cache)).strftime("%Y.%m.%d%H-%M")[2:15]  #rb
        if os.path.isfile(cache) and (self._test > self.SZeit):
            # Condition: the cache file is newer than the air time of the
            # episode encoded in the file name.
            logging.info("Using recent cache file..." + str(conf.SZaehler))
            webpage = urlopen(cache).read()  #rb
            # Flag the last page: the text 'erfasst' appears, or there is
            # no 'title="früher"' (earlier) pagination link.
            conf.LetzteSeite = ("erfasst" in webpage) or (
                'title="früher"' not in webpage)  #rb
        else:
            # Fetch page -SZaehler of the broadcast schedule.
            logging.info('Loading: https://www.fernsehserien.de/' + title +
                         '/sendetermine/' + senderlink + '/-' +
                         str(conf.SZaehler))
            webpage = urlopen('https://www.fernsehserien.de/' + title +
                              '/sendetermine/' + senderlink + '/-' +
                              str(conf.SZaehler)).read()  #rb
            if not os.path.isdir(Fernsehserien_de_Scraper.CACHE_FOLDER):
                os.mkdir(Fernsehserien_de_Scraper.CACHE_FOLDER)
            f = open(cache, 'w')
            f.write(webpage)
            f.close()
            logging.info('Website scraping => done')
            # Same last-page check as above.
            conf.LetzteSeite = ("erfasst" in webpage) or (
                'title="früher"' not in webpage)  #rb
        soup = BeautifulSoup(webpage, "html.parser")

        epdate, eptime, season, episode, title = [], [], [], [], []
        if '<tr' in webpage:
            rows = soup.findAll('tr')

            logging.info('with season number')
            for row in rows:
                # Fields in order: date, start time, season x episode, rest;
                # the non-greedy ? after folgen/ is crucial here.
                m = re.search(
                    r"(\d{2}\.\d{2}\.\d{4}).*?(\d{2}:\d{2}).*?folgen\/.*?(\d{1,2})x(\d{1,3})(.*)",
                    str(row))
                if m is not None:
                    epdate.append(m.group(1))
                    eptime.append(m.group(2))
                    season.append(m.group(3))
                    episode.append(m.group(4))
                    # The episode title follows the 'zur Episode' link.
                    m1 = re.search(
                        r".*title=\"zur Episode\".([\wäßüöÄÜÖ() \-\:\.\,]*)",
                        m.group(5))  #rb 21.02.2022 comma added
                    if m1 is not None:
                        title.append(m1.group(1))
                    else:
                        # Fallback: title after the '-sendetermine' marker.
                        m1 = re.search(
                            r".*-sendetermine.\>([\wäßüöÄÜÖ() \-\:\.\,]*)",
                            m.group(5))
                        if m1 is not None:
                            title.append(m1.group(1))
            if len(title) == 0:
                # Needed when the page carries no season number,
                # e.g. Wunderschoen, Wilsberg.  rb 03.11.2020
                logging.info('no season number')
                for row in rows:
                    # The episode number can have up to three digits.
                    m = re.search(
                        r"(\d{2}\.\d{2}\.\d{4}).*?(\d{2}:\d{2}).*?(\d{2}:\d{2}).*?folgen/(\d{1,3}).*?episodentitel.*?zur Episode\"\>([\wäßüöÄÜÖ() \-\:\.\,]*)",
                        str(row))  #rb 21.02.22 comma added
                    if m is not None:
                        epdate.append(m.group(1))
                        eptime.append(m.group(2))
                        season.append('1')  # default when no season is given
                        episode.append(m.group(4))
                        title.append(m.group(5))
        else:
            # No table rows: scan the links directly instead.
            rows = soup.findAll('a', href=True)
            for row in rows:
                # Fields: season x episode from the link, then date, time, title.
                m = re.search(
                    r"folgen\/(\d{1,2})x(\d{1,3}).*(\d{2}\.\d{2}\.\d{4}).*?(\d{2}:\d{2}).*?\-episodentitel.*?\"\>([\wäßüöÄÜÖ() \-\:\.\,]*)",
                    str(row))  #rb 21.02.2022 comma added
                if m is not None:
                    epdate.append(m.group(3))
                    eptime.append(m.group(4))
                    season.append(m.group(1))
                    episode.append(m.group(2))
                    title.append(m.group(5))
            if len(title) == 0:
                # Needed when the page carries no season number,
                # e.g. Wunderschoen, Wilsberg.  rb 03.11.2020
                logging.info('no season number')
                for row in rows:
                    m = re.search(
                        r"folgen\/(\d{1,3}).*?(\d{2}\.\d{2}\.\d{4}).*?(\d{2}:\d{2}).*?\-episodentitel.*?\"\>([\wäßüöÄÜÖ() \-\:\.\,]*)",
                        str(row))  #rb 21.02.2022 comma added
                    if m is not None:
                        epdate.append(m.group(2))
                        eptime.append(m.group(3))
                        season.append('1')  # default when no season is given
                        episode.append(m.group(1))
                        title.append(m.group(4))
        return (epdate, season, episode, title, eptime)
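
The row pattern in the first branch is easiest to sanity-check against a synthetic row. A sketch in Python 2 like the examples; the <tr> fragment is invented to match the markup shape the pattern implies, not real fernsehserien.de output:

import re

row = ('<tr><td>21.02.2022</td><td>20:15</td>'
       '<a href="/wilsberg/folgen/3x07-unser-held-episodentitel">'
       '<span title="zur Episode">Unser Held</span></a></tr>')

m = re.search(
    r"(\d{2}\.\d{2}\.\d{4}).*?(\d{2}:\d{2}).*?folgen\/.*?(\d{1,2})x(\d{1,3})(.*)",
    row)
if m is not None:
    print m.group(1), m.group(2)          # 21.02.2022 20:15
    print m.group(3) + 'x' + m.group(4)   # 3x07
    # Title extraction as in the example (umlaut classes omitted here).
    m1 = re.search(r".*title=\"zur Episode\".([\w \-\:\.\,]*)", m.group(5))
    if m1 is not None:
        print m1.group(1)                 # Unser Held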