Code example #1
def scrapxml(self):
    # Assumes: os, requests, urllib.request and BeautifulSoup (bs4) are
    # imported, along with the project-local com_logger, com_sqlite and
    # com_email modules.
    logger = com_logger.Logger('RTL')

    table = []
    for conf in self.config['URLRTL']:
        url = self.config['URLRTL'][str(conf)]
        logger.info('Check URL: ' + url)
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "lxml")

            for item_list in soup.find_all("item"):
                a_name = self.fileutils.replace(item_list.find("title").text)
                a_link = item_list.find("guid").text

                logger.info('Find: ' + a_name)
                if com_sqlite.select(a_name) != a_name:
                    base_dir = os.path.dirname(os.path.abspath(__file__))
                    dl_path = os.path.join(base_dir, self.config['DIRDOWNLOAD']['DIR'])
                    urllib.request.urlretrieve(a_link, os.path.join(dl_path, a_name + ".mp3"))
                    logger.info('Downloaded: ' + a_name)
                    com_sqlite.insert(a_name)
                    table.append(a_name)
                    mail = com_email.Mail()
                    mail.send_mail_gmail("RTL: " + a_name, table)
                break  # only the first (most recent) feed item is processed
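
All four snippets depend on small project-local helper modules (com_logger, com_sqlite, com_email) whose implementations are not shown. The minimal sketch below is inferred only from the calls made above; every name, table and file in it is an assumption, not the original code:

# com_logger: presumably a thin wrapper around the standard logging module
import logging

class Logger:
    def __init__(self, name):
        logging.basicConfig(level=logging.DEBUG)
        self._log = logging.getLogger(name)

    def info(self, msg):
        self._log.info(msg)

    def debug(self, msg):
        self._log.debug(msg)

    def error(self, msg):
        self._log.error(msg)

# com_sqlite: select() must return the stored name when present (the code
# compares the result against the name itself, or against 0 for absence)
import sqlite3

DB_FILE = "downloads.db"  # hypothetical database file

def select(name):
    with sqlite3.connect(DB_FILE) as con:
        con.execute("CREATE TABLE IF NOT EXISTS items (name TEXT PRIMARY KEY)")
        row = con.execute("SELECT name FROM items WHERE name = ?",
                          (name,)).fetchone()
    return row[0] if row else 0

def insert(name):
    with sqlite3.connect(DB_FILE) as con:
        con.execute("INSERT OR IGNORE INTO items (name) VALUES (?)", (name,))

# com_email.Mail().send_mail_gmail(subject, body) is assumed to send a
# notification through Gmail's SMTP service; credentials and SMTP details
# are omitted here.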
Code example #2
def scrap(self):
    # Assumes the same imports and helper modules as the first snippet.
    logger = com_logger.Logger('BBC')

    # The BBC page is matched on long, brittle class strings; keeping
    # them in local variables makes the queries readable.
    download_div_class = ("popup__content popup__content--download "
                          "br-box-subtle br-subtle-link-onbg "
                          "br-subtle-link-onborder")
    download_link_class = ("link-complex popup__list__item island--squashed "
                           "br-subtle-bg-ontext br-subtle-bg-onbg--hover "
                           "br-subtle-link-ontext--hover")

    table = []
    try:
        for conf in self.config['URLBBC']:
            url = self.config['URLBBC'][str(conf)]
            logger.info('Check URL: ' + url)
            response = requests.get(url)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, "html.parser")

                for ul_list in soup.find_all("ul", class_="list-unstyled"):
                    # Skip lists that hold no programme entries
                    if not ul_list.find_all("div", class_="programme__body"):
                        continue
                    for li_list in ul_list.find_all("li", class_=""):
                        for div_list in li_list.find_all("div", class_="programme__body"):
                            span_name = div_list.find("span", class_="programme__title ")
                            a_name = self.fileutils.replace(span_name.find("span").text)

                            div_link = div_list.find("div", class_=download_div_class)
                            if div_link is not None:
                                a_link = div_link.find("a", class_=download_link_class)["href"]

                                logger.info('Find: ' + a_name)
                                if com_sqlite.select(a_name) != a_name:
                                    logger.info('Download: ' + a_name)
                                    base_dir = os.path.dirname(os.path.abspath(__file__))
                                    dl_path = os.path.join(base_dir, self.config['DIRDOWNLOAD']['DIR'])
                                    urllib.request.urlretrieve(a_link, os.path.join(dl_path, a_name + ".mp3"))
                                    logger.info('Downloaded!')
                                    com_sqlite.insert(a_name)
                                    table.append(a_name)
                                    mail = com_email.Mail()
                                    mail.send_mail_gmail("BBC: " + a_name, table)
                        break  # only the first <li> of each list is processed
    except Exception as exp:
        logger.error(repr(exp))
Code example #3
def scrap(self):
    # Assumes the same imports and helper modules as the first snippet.
    logger = com_logger.Logger('RTL')

    for conf in self.config['URLRTL']:
        url = self.config['URLRTL'][str(conf)]
        logger.info('Check URL: ' + url)
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")

            table = []
            for div_list in soup.find_all("div", class_="timeline-post shift"):
                try:
                    div_item = div_list.find("div", class_="post-fig brd brd-grey")
                    a_link = div_item.find("a", class_="post-link")["href"]

                    # Follow the post link to the page that holds the MP3
                    post = requests.get(a_link)
                    if post.status_code == 200:
                        soupnext = BeautifulSoup(post.content, "html.parser")
                        div_mp3 = soupnext.find("figcaption", class_="figcaption article-mdl cf")
                        a_name = self.fileutils.replace(div_mp3.find("span", class_="legend").text)
                        a_link = div_mp3.find("a", class_="dl icon icon-download")["href"]

                        logger.info('Find: ' + a_name)
                        if com_sqlite.select(a_name) != a_name:
                            logger.info('Download: ' + a_name)
                            base_dir = os.path.dirname(os.path.abspath(__file__))
                            dl_path = os.path.join(base_dir, self.config['DIRDOWNLOAD']['DIR'])
                            urllib.request.urlretrieve(a_link, os.path.join(dl_path, a_name + ".mp3"))
                            logger.info('Downloaded!')
                            com_sqlite.insert(a_name)
                            table.append(a_name)
                            mail = com_email.Mail()
                            mail.send_mail_gmail("RTL: " + a_name, table)
                        break  # only the most recent post is processed
                except Exception as exp:
                    logger.error(repr(exp))
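
The RTL and BBC snippets pass every scraped title through self.fileutils.replace() before using it as a file name. That helper is not shown anywhere in these examples; a plausible stand-in, assuming its job is to strip characters that are invalid in file names, is sketched below (hypothetical reconstruction):

import re

class FileUtils:
    def replace(self, name):
        # Hypothetical: drop characters that are illegal or awkward in
        # file names and trim surrounding whitespace
        return re.sub(r'[\\/:*?"<>|]', '', name).strip()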
Code example #4
def scrap():
    # Assumes: requests, urllib.parse, urllib.request and BeautifulSoup (bs4)
    # are imported, along with the project-local com_config, com_logger,
    # com_sqlite and com_email modules, plus the mailcontent() helper and
    # the Scraper.mailfull() method used below.
    logger = com_logger.Logger('Scraper')
    url_start = "https://www.leboncoin.fr/annonces/offres/haute_normandie/?o="
    # e.g. https://www.leboncoin.fr/annonces/offres/?th=1
    # e.g. https://www.leboncoin.fr/annonces/offres/haute_normandie/?th=1&q=vtt
    url_search = "&q="
    url_end = "&it=1"  # "&it=1" restricts the search to ad titles; use "" to search descriptions too
    search_list = []

    # Get the things to search for from the config file
    conf = com_config.Config()
    config = conf.getconfig()
    for i in range(1, 50):
        strthings = 'things' + str(i)
        try:
            if len(config['SEARCH'][strthings]) != 0:
                search_list.append(config['SEARCH'][strthings])
        except Exception as exp:
            logger.info('Things stop at: ' + str(exp))
            break

    max_browse = int(config['SEARCH']['max_browse'])
    logger.info('Start extraction')

    for search_item in search_list:
        # Each entry is "query,min price,max price"
        tab = search_item.split(",")
        prix_min = int(tab[1])
        prix_max = int(tab[2])
        index = 1
        contenuhtml = []

        logger.info('Search: ' + urllib.parse.unquote(tab[0]))

        search_url = url_start + str(index) + url_search + urllib.parse.quote(tab[0]) + url_end
        urlok = 0
        try:
            urlok = requests.get(search_url).status_code
        except Exception as exp:
            logger.error('URL: ' + search_url)
            logger.error(str(exp))
        while urlok == 200 and index <= max_browse:
            search_url = url_start + str(index) + url_search + urllib.parse.quote(tab[0]) + url_end
            try:
                url = urllib.request.urlopen(search_url).read()
            except Exception as exp:
                logger.error('URL: ' + search_url)
                logger.error(str(exp))
                break

            soup = BeautifulSoup(url, "html.parser")

            # Stop paging as soon as a page has no result section
            if len(soup.find_all("section", class_="tabsContent block-white dontSwitch")) == 0:
                break

            for section in soup.find_all("section", class_="tabsContent block-white dontSwitch"):
                for li in section.find_all("li"):
                    item_a = li.find("a", class_="list_item clearfix trackable")
                    link = item_a["href"]
                    # The ad id is buried in the "data-info" attribute
                    idx = item_a["data-info"].strip().split(",", 7)[2].split(":", 2)[1].replace('"', "").strip()
                    imglink = ''
                    try:
                        imglink = li.find("span", class_="lazyload")["data-imgsrc"]
                    except Exception as exp:
                        logger.debug('No image: ' + str(exp))

                    for item in li.find_all("section", class_="item_infos"):
                        titre = urllib.parse.unquote(
                            item.find("h2", class_="item_title").text.replace("\n", "").replace("\t", ""))
                        category = urllib.parse.unquote(
                            item.find("p", itemprop="category").text.replace("\n", "").replace("\t", ""))
                        localisation = urllib.parse.unquote(
                            item.find("p", itemprop="availableAtOrFrom").text.replace("\n", "").replace("\t", ""))

                        # The ASCII encode drops the currency sign and
                        # non-breaking spaces; missing or non-numeric
                        # prices default to 0
                        try:
                            prix = int(item.find("h3", class_="item_price").text.encode("ASCII", 'ignore').strip())
                        except (AttributeError, ValueError):
                            prix = 0

                        if prix_min <= prix <= prix_max:
                            if com_sqlite.select(idx) == 0:  # item not yet in the database
                                logger.info('Find: ' + titre.strip())
                                com_sqlite.insert(idx)
                                contenuhtml = mailcontent(contenuhtml, imglink, link, prix,
                                                          titre, category, localisation)
            index += 1
            logger.debug('Page: ' + str(index))

        if len(contenuhtml) > 0:
            contenu = Scraper.mailfull(
                urllib.parse.unquote(tab[0] + " Prix: " + str(prix_min) + "-" + str(prix_max) + ' €'),
                contenuhtml)
            com_email.send_mail_gmail(urllib.parse.unquote(tab[0]), contenu)
            logger.info('Mail sent')
    logger.info('End extraction')
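
All four snippets read their settings from a config object (self.config, or com_config.Config().getconfig() in the last one). The access patterns suggest a configparser-style INI file; the sketch below shows a hypothetical config.ini whose section and key names are taken from the code while the values are invented, followed by the loader such a layout assumes:

# config.ini (hypothetical values; section and key names from the code)
[URLRTL]
url1 = https://www.rtl.fr/podcast/example.xml

[URLBBC]
url1 = https://www.bbc.co.uk/programmes/example/episodes/player

[DIRDOWNLOAD]
DIR = podcasts

[SEARCH]
; each thingsN entry is "query,min price,max price"
things1 = vtt,50,300
things2 = table basse,0,80
max_browse = 5

# A com_config.Config().getconfig() built on the standard library could be:
import configparser

class Config:
    def getconfig(self):
        config = configparser.ConfigParser()
        config.read('config.ini', encoding='utf-8')
        return config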