def scrapxml(self):
    """Scrape the configured RTL RSS feeds and download new episodes.

    For each URL in the ``URLRTL`` config section, parse the feed's
    ``<item>`` entries; for every episode title not already recorded in
    the database, download its MP3 into ``DIRDOWNLOAD.DIR``, record it,
    and send a notification mail. Only the first new episode per feed
    is processed (the item loop breaks after one download).
    """
    logger = com_logger.Logger('RTL')
    table = []
    try:
        for conf in self.config['URLRTL']:
            url = self.config['URLRTL'][str(conf)]
            logger.info('Check URL:' + url)
            # Fetch once and reuse the body; the original issued two
            # requests per URL (one for the status, one for the XML).
            # NOTE(review): this now fetches with requests' User-Agent
            # instead of urllib's — confirm the feed serves both alike.
            response = requests.get(url)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, "lxml")
                for item_list in soup.find_all("item"):
                    # Sanitize the title so it is usable as a filename.
                    a_name = self.fileutils.replace(item_list.find("title").text)
                    a_link = item_list.find("guid").text
                    logger.info('Find: ' + a_name)
                    # select() returns the stored name when present; a
                    # mismatch means this episode is new.
                    if com_sqlite.select(a_name) != a_name:
                        base_dir = os.path.dirname(os.path.abspath(__file__))
                        db_path = os.path.join(base_dir, self.config['DIRDOWNLOAD']['DIR'])
                        urllib.request.urlretrieve(
                            a_link, os.path.join(db_path, a_name + ".mp3"))
                        logger.info('Downloaded: ' + a_name)
                        com_sqlite.insert(a_name)
                        table.append(a_name)
                        mail = com_email.Mail()
                        mail.send_mail_gmail("RTL: " + a_name, table)
                        break
    except Exception as exp:
        # Consistent with the sibling scrapers: log and continue
        # instead of letting a single bad feed kill the run.
        logger.error(repr(exp))
def scrap(self):
    """Scrape the configured BBC programme pages and download new episodes.

    For each URL in the ``URLBBC`` config section, walk the programme
    list markup; for every episode title not already recorded in the
    database, download its MP3 into ``DIRDOWNLOAD.DIR``, record it, and
    send a notification mail. Only the first new episode per page is
    processed (the loop breaks after one download).
    """
    logger = com_logger.Logger('BBC')
    table = []
    try:
        for conf in self.config['URLBBC']:
            url = self.config['URLBBC'][str(conf)]
            logger.info('Check URL: ' + url)
            # Fetch once and reuse the body; the original issued two
            # requests per URL (one for the status, one for the HTML).
            response = requests.get(url)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, "html.parser")
                for ul_list in soup.find_all("ul", class_="list-unstyled"):
                    # Skip lists that hold no programme entries.
                    div_test = ul_list.find_all("div", class_="programme__body")
                    if div_test:
                        for li_list in ul_list.find_all("li", class_=""):
                            for div_list in li_list.find_all(
                                    "div", class_="programme__body"):
                                span_name = div_list.find(
                                    "span", class_="programme__title ")
                                # Sanitize the title for filesystem use.
                                a_name = self.fileutils.replace(
                                    span_name.find("span").text)
                                div_link = div_list.find(
                                    "div",
                                    class_=
                                    "popup__content popup__content--download br-box-subtle br-subtle-link-onbg br-subtle-link-onborder"
                                )
                                # Some programmes have no download popup.
                                if div_link is not None:
                                    a_link = div_link.find(
                                        "a",
                                        class_=
                                        "link-complex popup__list__item island--squashed br-subtle-bg-ontext br-subtle-bg-onbg--hover br-subtle-link-ontext--hover"
                                    )["href"]
                                    logger.info('Find: ' + a_name)
                                    # select() echoes the name when it is
                                    # already stored; mismatch means new.
                                    if com_sqlite.select(a_name) != a_name:
                                        logger.info('Download: ' + a_name)
                                        base_dir = os.path.dirname(
                                            os.path.abspath(__file__))
                                        db_path = os.path.join(
                                            base_dir,
                                            self.config['DIRDOWNLOAD']['DIR'])
                                        urllib.request.urlretrieve(
                                            a_link,
                                            os.path.join(db_path, a_name + ".mp3"))
                                        logger.info('Downloaded !')
                                        com_sqlite.insert(a_name)
                                        table.append(a_name)
                                        mail = com_email.Mail()
                                        mail.send_mail_gmail(
                                            "BBC: " + a_name, table)
                                        break
    except Exception as exp:
        logger.error(repr(exp))
def scrap(self):
    """Scrape the configured RTL article pages and download new episodes.

    For each URL in the ``URLRTL`` config section, walk the timeline
    posts, follow each post's link to its article page, and extract the
    MP3 download link. Episodes not already recorded in the database are
    downloaded into ``DIRDOWNLOAD.DIR``, recorded, and announced by
    mail. Only the first new episode per page is processed (the post
    loop breaks after one download).
    """
    logger = com_logger.Logger('RTL')
    for conf in self.config['URLRTL']:
        url = self.config['URLRTL'][str(conf)]
        logger.info('Check URL: ' + url)
        # Fetch once and reuse the body; the original issued two
        # requests per URL (one for the status, one for the HTML).
        response = requests.get(url)
        if response.status_code == 200:
            soup = BeautifulSoup(response.content, "html.parser")
            # NOTE: table is reset per URL, so each mail lists only the
            # episodes found on that page (original behavior, kept).
            table = []
            for div_list in soup.find_all("div", class_="timeline-post shift"):
                try:
                    div_item = div_list.find("div", class_="post-fig brd brd-grey")
                    a_link = div_item.find("a", class_="post-link")["href"]
                    # Follow the post link to the article page (again a
                    # single fetch instead of status-check + re-fetch).
                    article = requests.get(a_link)
                    if article.status_code == 200:
                        soupnext = BeautifulSoup(article.content, "html.parser")
                        div_mp3 = soupnext.find(
                            "figcaption", class_="figcaption article-mdl cf")
                        # Sanitize the caption text for filesystem use.
                        a_name = self.fileutils.replace(
                            div_mp3.find("span", class_="legend").text)
                        a_link = div_mp3.find(
                            "a", class_="dl icon icon-download")["href"]
                        logger.info('Find: ' + a_name)
                        # select() echoes the name when already stored;
                        # a mismatch means this episode is new.
                        if com_sqlite.select(a_name) != a_name:
                            logger.info('Download: ' + a_name)
                            base_name = os.path.dirname(os.path.abspath(__file__))
                            db_path = os.path.join(
                                base_name, self.config['DIRDOWNLOAD']['DIR'])
                            urllib.request.urlretrieve(
                                a_link, os.path.join(db_path, a_name + ".mp3"))
                            logger.info('Downloaded !')
                            com_sqlite.insert(a_name)
                            table.append(a_name)
                            mail = com_email.Mail()
                            mail.send_mail_gmail("RTL: " + a_name, table)
                            break
                except Exception as exp:
                    # A malformed post must not abort the whole page.
                    logger.error(repr(exp))
def scrap():
    """Scrape leboncoin.fr search results and mail newly-found listings.

    Reads search terms ``things1``..``things49`` from the ``SEARCH``
    config section (each formatted ``query,min_price,max_price``),
    pages through the result listings up to ``max_browse`` pages per
    search, and for every listing within the price range that is not
    yet in the database: records it and appends it to an HTML mail
    body. One summary mail is sent per search term that produced hits.
    """
    logger = com_logger.Logger('Scraper')
    # Base search URL; "?o=" is the result-page number appended below.
    url_start = "https://www.leboncoin.fr/annonces/offres/haute_normandie/?o="
    # https://www.leboncoin.fr/annonces/offres/?th=1
    # https: // www.leboncoin.fr/annonces/offres/haute_normandie/?th = 1 & q = vtt
    url_search = "&q="
    url_end = "&it=1"  # empty means search descriptions too; "&it=1" restricts the search to titles only
    search_list = []
    # Collect search terms from the config file: keys things1..things49,
    # stopping at the first missing key (KeyError breaks the loop).
    conf = com_config.Config()
    config = conf.getconfig()
    for i in range(1, 50):
        strthings = 'things' + str(i)
        try:
            if len(config['SEARCH'][strthings]) != 0:
                search_list.append(config['SEARCH'][strthings])
        except Exception as exp:
            logger.info('Things stop at:' + str(exp))
            break
    max_browse = int(config['SEARCH']['max_browse'])
    logger.info('Start extraction')
    for search_item in search_list:
        # Each entry is "query,min_price,max_price".
        tab = search_item.split(",")
        prix_min = int(tab[1])
        prix_max = int(tab[2])
        index = 1  # current result-page number
        contenuhtml = []  # accumulated HTML fragments for the mail body
        logger.info('Search: ' + urllib.parse.unquote(tab[0]))
        # Probe the first page to make sure the search URL responds
        # before entering the paging loop.
        urlok = 0
        try:
            urlok = requests.get(
                url_start + str(index) + url_search +
                urllib.parse.quote(tab[0]) + url_end).status_code
        except Exception as exp:
            logger.error('URL: ' + url_start + str(index) + url_search +
                         urllib.parse.quote(tab[0]) + url_end)
            logger.error(str(exp))
        # Page through the results; urlok is only checked once (the
        # probe above), the loop otherwise ends on max_browse, a fetch
        # error, or an empty results section.
        while urlok == 200 and index <= max_browse:
            try:
                url = urllib.request.urlopen(
                    url_start + str(index) + url_search +
                    urllib.parse.quote(tab[0]) + url_end).read()
            except Exception as exp:
                logger.error('URL: ' + url_start + str(index) + url_search +
                             urllib.parse.quote(tab[0]) + url_end)
                logger.error(str(exp))
                break
            soup = BeautifulSoup(url, "html.parser")
            soup.prettify()  # return value discarded; has no effect on soup
            # No results section on this page means we ran past the
            # last page of results.
            if len(soup.find_all("section",
                                 class_="tabsContent block-white dontSwitch")) == 0:
                break
            for section in soup.find_all(
                    "section", class_="tabsContent block-white dontSwitch"):
                for li in section.find_all("li"):
                    link = li.find("a", class_="list_item clearfix trackable")["href"]
                    # Pull the listing identifier out of the "data-info"
                    # JSON-ish attribute; the third comma field's value
                    # appears to be the ad id — TODO confirm against the
                    # live markup.
                    idx = \
                        li.find("a", class_="list_item clearfix trackable")["data-info"].strip().split(",", 7)[2].split(
                            ":", 2)[1].replace('"',
                                               "").strip()
                    # container = source.find('div', attrs={'id':'dlbox'})
                    # Thumbnail is optional; listings without an image
                    # keep an empty imglink.
                    imglink = ''
                    try:
                        imglink = li.find("span", class_="lazyload")["data-imgsrc"]
                    except Exception as exp:
                        logger.debug('No image: ' + str(exp))
                        pass
                    for item in li.find_all("section", class_="item_infos"):
                        titre = urllib.parse.unquote(
                            item.find("h2", class_="item_title").text.replace("\n", "").replace("\t", ""))
                        category = urllib.parse.unquote(
                            item.find("p", itemprop="category").text.replace("\n", "").replace("\t", ""))
                        localisation = urllib.parse.unquote(
                            item.find("p", itemprop="availableAtOrFrom").text.replace("\n", "").replace("\t", ""))
                        # Price text may be missing (AttributeError) or
                        # non-numeric after stripping non-ASCII such as
                        # the euro sign (ValueError); both default to 0.
                        try:
                            prix = int(item.find("h3", class_="item_price").text.encode("ASCII", 'ignore').strip())
                        except AttributeError:
                            prix = 0
                        except ValueError:
                            prix = 0
                        if (prix >= prix_min) and (prix <= prix_max):
                            if com_sqlite.select(idx) == 0:
                                # Item not yet present in database
                                logger.info('Find: ' + titre.strip())
                                com_sqlite.insert(idx)
                                contenuhtml = mailcontent(contenuhtml, imglink,
                                                          link, prix, titre,
                                                          category, localisation)
            index += 1
            logger.debug('Page : ' + str(index))
        # One mail per search term, only if new listings were found.
        if len(contenuhtml) > 0:
            contenu = Scraper.mailfull(
                urllib.parse.unquote(tab[0] + " Prix: " + str(prix_min) + "-" +
                                     str(prix_max) + ' €'), contenuhtml)
            com_email.send_mail_gmail(urllib.parse.unquote(tab[0]), contenu)
            logger.info('Mail sent')
    logger.info('End extraction')