def _news_scrapper():
    for n in config()['news_sites']:
        news_site = config()['news_sites'][n]
        host = news_site['url']
        homepage = news.HomePage(news_site, host)
        total = len(homepage.article_links)
        index = 1
        for link in homepage.article_links:
            article = _fetch_article(news_site, host, link)
            if article and article.title is not None:
                if db.news.find_one({"title": article.title}) is None:
                    db.news.insert_one({
                        "title": article.title,
                        "content": article.body,
                        "category": article.category,
                        "image": build_link(host, article.image),
                        "date": datetime.datetime.utcnow()
                    })
                    progress(index, total,
                             'Num of articles: {}'.format(db.news.count_documents({})))
                else:
                    progress(index, total, 'Article already exists!')
            index += 1
    client.close()
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logging.info('Beginning scraper for {}'.format(host))
    homepage = news.HomePage(news_site_uid, host)

    for link in homepage.article_links:
        print(link)
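# Every snippet reads its target URL from config()['news_sites'][<uid>]['url'],
# and a couple of them pass 'config.yaml' explicitly. A minimal sketch of what that
# loader might look like (the caching and default file name are assumptions, not
# the original implementation):
import yaml

__config = None


def config(filename='config.yaml'):
    global __config
    if not __config:
        with open(filename, mode='r', encoding='utf-8') as f:
            __config = yaml.safe_load(f)
    return __config

# The YAML it loads would then contain, at minimum, one url per site, e.g.:
# news_sites:
#   xataka:
#     url: https://www.xataka.com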
def main(news_site_id):
    url = config()['news_sites'][news_site_id]['url']
    logging.info(f'Starting to scrape {url}')
    homepage = news.HomePage(news_site_id, url)

    logging.info('Getting header links')
    header_links = [_build_link(url, link) for link in homepage.header_links]

    logging.info('Getting news page links')
    sections = [
        news.SectionPage(news_site_id, section) for section in header_links
    ]
    links = [section.article_links for section in sections]
    flat_links = [
        _build_link(url, item) for sublist in links for item in sublist
    ]

    logging.info('Fetching articles')
    articles = [
        _fetch_article(news_site_id, url, article) for article in flat_links
    ]
    articles = [article for article in articles if article]

    logging.info('Saving articles')
    _save_articles(news_site_id, articles)
    logging.info('Articles saved')
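# Several snippets rely on a link-building helper (build_link / _build_link) that
# is not shown in this section. A minimal sketch of such a helper, assuming that
# relative links have to be completed with the host URL (the regexes are
# assumptions, not the original code):
import re

is_well_formed_link = re.compile(r'^https?://.+/.+$')
is_root_path = re.compile(r'^/.+$')


def _build_link(host, link):
    # absolute links are returned unchanged
    if is_well_formed_link.match(link):
        return link
    # root-relative links are appended to the host
    if is_root_path.match(link):
        return '{}{}'.format(host, link)
    # anything else is treated as a path relative to the host
    return '{host}/{uri}'.format(host=host, uri=link)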
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logging.info('Beginning scraper for {}'.format(host))
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    ud_programs = []

    if news_site_uid == 'udistrital':
        for info in homepage.udistrital_info:
            ud_info_programs = news.UdProgramsPage(news_site_uid, info)
            if ud_info_programs:
                logger.info('Info fetched!')
                ud_programs.append(ud_info_programs)
                # break
        _save_ud_programs(news_site_uid, ud_programs)
    else:
        for link in homepage.article_links:
            article = _fetch_article(news_site_uid, host, link)
            if article:
                logger.info('Article fetched!')
                articles.append(article)
                break
        _save_articles(news_site_uid, articles)
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logging.info('Beginning scraper for {}'.format(host))
    homepage = news.HomePage(news_site_uid, host)

    # categories / articles
    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)
        if article:
            logger.info("I'm alive!!!")
            articles.append(article)

    # write the output file
    _save_articles(news_site_uid, articles)
def _fetch_links(news_site_uid, host):
    logger.info('Start fetching links at {}'.format(news_site_uid))
    links = []
    try:
        homepage = news.HomePage(news_site_uid, host)
        links = homepage.article_links
    except Exception as e:
        logger.error('ERROR fetching links: {}'.format(e), exc_info=False)

    return links
def _new_scraper(new_site_uid):
    host = config()['news_sites'][new_site_uid]['url']
    logging.info('Scanning {}'.format(host))
    homepage = news.HomePage(new_site_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(new_site_uid, host, link)
        if article:
            logger.info('Article found')
            articles.append(article)

    _save_articles(new_site_uid, articles)
async def _fetch_links(news_site_uid, session):
    error = 0
    links = []
    try:
        homepage = news.HomePage(news_site_uid)
        await homepage.visit(session)
        links = homepage.article_links
    except Exception as e:
        logger.error('ERROR fetching links: {}'.format(e), exc_info=False)
        error = 'ERROR fetching links: {}'.format(e)

    return (error, links, news_site_uid)
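# The async variant above expects an HTTP session and a news.HomePage whose visit()
# method is awaitable. A minimal driver sketch, assuming aiohttp for the session,
# the config() mapping of site uids, and the module-level logger from the snippet
# above (all illustrative, not the original code):
import asyncio

import aiohttp


async def _run_scrapers():
    async with aiohttp.ClientSession() as session:
        tasks = [_fetch_links(news_site_uid, session)
                 for news_site_uid in config()['news_sites']]
        results = await asyncio.gather(*tasks)

    for error, links, news_site_uid in results:
        if error:
            logger.error('{}: {}'.format(news_site_uid, error))
        else:
            logger.info('{}: {} links fetched'.format(news_site_uid, len(links)))


if __name__ == '__main__':
    asyncio.run(_run_scrapers())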
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logger.info('Beginning scraper for {}'.format(host))
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)
        if article:
            logger.info('Article fetched!!')
            articles.append(article)

    _save_articles(news_site_uid, articles)
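# _fetch_article is called throughout but never defined in this section. A minimal
# sketch, assuming a news.ArticlePage class that downloads and parses one article
# and that network errors should not abort the whole run (the exception types and
# the body check are assumptions, not the original code):
import logging

from requests.exceptions import HTTPError
from urllib3.exceptions import MaxRetryError

logger = logging.getLogger(__name__)


def _fetch_article(news_site_uid, host, link):
    logger.info('Start fetching article at {}'.format(link))
    article = None
    try:
        article = news.ArticlePage(news_site_uid, _build_link(host, link))
    except (HTTPError, MaxRetryError):
        logger.warning('Error while fetching the article', exc_info=False)

    if article and not article.body:
        logger.warning('Invalid article: there is no body')
        return None

    return article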
def _news_scraper(news_site_id):
    host = config()["news_sites"][news_site_id]["url"]
    logger.info(f"Beginning scraper for {host=}")
    homepage = news.HomePage(news_site_id, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_id, host, link)
        if article:
            logger.info("Article fetched!!")
            articles.append(article)

    _save_articles(news_site_id, articles)
def _news_scrapper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logging.info('Starting scraper for {}'.format(host))
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)
        if article:
            logger.info('Article fetched correctly.')
            articles.append(article)

    _save_articles(news_site_uid, articles)
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logging.info(f'Beginning scrape for {host}')
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)
        if article:
            logger.info('Article fetched!!')
            articles.append(article)
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logging.info(f'Starting scraper for {host}...')

    articles = []
    homepage = news.HomePage(news_site_uid, host)
    for link in homepage.article_links:
        fetch_link = build_link(host, link)
        article = _fetch_article(news_site_uid, host, fetch_link)
        if article:
            articles.append(article)
            logger.info(f'Getting {fetch_link} [{article.title}]')

    logger.info(f'{len(articles)} articles were fetched!')
    logger.info('Writing data...')
    _save_articles(news_site_uid, articles)
def _news_scraper(news_site_uid):
    # news_site_uid because the config keys are used as IDs {xataka, enter.co}
    host = config()['news_sites'][news_site_uid]['url']  # {https://www.xataka.com/, https://www.enter.co/}
    logging.info('Beginning scraper for {}'.format(host))
    homepage = news.HomePage(news_site_uid, host)  # holds a set with the homepage links

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)
        if article:
            logger.info('Article fetched!!')
            articles.append(article)
            # break  # would stop the loop as soon as the first article is fetched

    _save_articles(news_site_uid, articles)
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logging.info(f'Beginning scraper for {host}')
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)
        if article:
            logger.info('Article fetched!!')
            articles.append(article)  # stores instances of the article subclass

    _save_articles(news_site_uid, articles)
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logging.info(f'Starting scrape for {host}')
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)
        if article:
            logger.info('Article fetched!!')
            articles.append(article)

    _save_articles(news_site_uid, articles)
def _financial_scrapper(financial_site_uid):
    host = config()['financial_sites'][financial_site_uid]['url']
    logging.info(f'Beginning scraper for {host}')
    logging.info('Finding links in homepage...')
    homepage = news.HomePage(financial_site_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(financial_site_uid, host, link)
        if article:
            logger.info('Article fetched!!!')
            articles.append(article)

    _save_articles(financial_site_uid, articles)
def _news_scraper(news_site):
    '''Scrape a news website.

    @param news_site: site to scrape, one of the keys defined in config.yaml'''
    host = common.config()['news_sites'][news_site]['url']
    logging.info('Beginning scraper for {}'.format(host))
    homepage = news.HomePage(news_site, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site, host, link)
        if article:
            logger.info('Article fetched!!')
            articles.append(article)

    _save_article(news_site, articles)
def _news_scrapper(news_site_uid):
    # config loads the yaml structure; here we read the url of the news site
    host = config()['news_sites'][news_site_uid]['url']
    logging.info('Beginning scraper for {}'.format(host))
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)
        if article:
            logger.info('Article fetched!')
            articles.append(article)

    _save_articles(news_site_uid, articles)
def _news_scraper(news_site_uid):
    # take the url from config.yaml
    host = config()['news_sites'][news_site_uid]['url']
    logging.info('Beginning scraper for {}'.format(host))

    # create a news homepage object
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        # call the function that fetches the article
        article = _fetch_article(news_site_uid, host, link)
        if article:
            logger.info('Article fetched!')
            articles.append(article)

    # save the info in a csv file
    _save_articles(news_site_uid, articles)
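# The comment above says _save_articles writes the data to a CSV file, but the
# helper itself is not included in this section. A minimal sketch, assuming article
# objects expose their fields as public attributes (the file naming and header
# discovery are assumptions, not the original code):
import csv
import datetime


def _save_articles(news_site_uid, articles):
    if not articles:
        return

    now = datetime.datetime.now().strftime('%Y_%m_%d')
    out_file_name = '{}_{}_articles.csv'.format(news_site_uid, now)

    # use the public attributes of the first article as the CSV header
    csv_headers = [prop for prop in dir(articles[0]) if not prop.startswith('_')]

    with open(out_file_name, mode='w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(csv_headers)
        for article in articles:
            writer.writerow([str(getattr(article, prop)) for prop in csv_headers])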
def _news_scraper(news_site_uid):
    host = config()["news_sites"][news_site_uid]["url"]
    logging.info("Beginning scraper for {}".format(host))
    home_page = news.HomePage(news_site_uid, host)

    articles = []
    for link in home_page.article_links:
        article = _fetch_article(news_site_uid, link)
        if article:
            logger.info("Article fetched!!")
            articles.append(article)
            print(article.title)

    print(len(articles))
    _save_articles(news_site_uid, articles)
def _news_scraper(news_site_uid):
    """Run the scraper for the selected site."""
    host = config()['news_sites'][news_site_uid]['url']
    logging.info('Beginning scraper for {}'.format(host))
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)
        if article:
            logger.info('Article fetched!')
            articles.append(article)

    _save_articles(news_site_uid, articles)
def _news_scrapper(news_sites_uid):
    host = config()['news_sites'][news_sites_uid]['url']
    logging.info('Beginning scraper for {}'.format(host))
    homepage = news.HomePage(news_sites_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_sites_uid, host, link)
        if article:
            logger.info('Article fetched!!')
            articles.append(article)
            print(article.title)

    print(len(articles))
    _save_articles(news_sites_uid, articles)
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logger.info(f"Beginning scraper for {host}")
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)
        if article:
            logger.info(Fore.GREEN + '📥📥📥 Article Fetched! 📥📥📥')  # colorama
            print(Style.RESET_ALL)
            articles.append(article)

    _save_articles(news_site_uid, articles)
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logging.info(f'Starting the scraper for: {host}')
    logging.info('Finding links on the homepage...')
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)
        if article:
            logger.info('Article fetched successfully!!')
            articles.append(article)

    _save_articles(news_site_uid, articles)
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logging.info('Beginning scraper for {}'.format(host))
    logging.info('Finding links in homepage...')
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)
        if article:
            logger.info('Article fetched!')
            articles.append(article)
            print(article.title)

    print(len(articles))
def _news_scraper(news_site_uid):
    print(news_site_uid)
    host_url = config()['news_sites'][news_site_uid]['url']
    host_url2 = config()['news_sites'][news_site_uid]['url2']
    logger.info(f'\tBeginning scraper for {host_url}')

    # 1. Go to the main page and get all the tech article links
    homepage = news.HomePage(news_site_uid, host_url)

    articles = []  # list to hold the tech articles
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host_url2, link)
        if article:
            logger.info('Article fetched!')
            articles.append(article)

    _save_article(news_site_uid, articles)
def _news_scrapper(news_site_uid):
    # url
    host = config()['news_sites'][news_site_uid]['url']
    logging.info('Starting scraper for {}'.format(host))
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    print("homepage {}".format(homepage.article_links))
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)
        if article:
            logger.info('Article fetched')
            articles.append(article)
            print(article.title)

    print(len(articles))
    _save_articles(news_site_uid, articles)
def _news_scraper(news_site_uid):
    # get the url from the configuration
    host = config('config.yaml')['news_sites'][news_site_uid]['url']
    logging.info('Beginning scraper for {}'.format(host))

    # send the site name and url to the HomePage class
    homepage = news.HomePage(news_site_uid, host)

    articles = []
    for link in homepage.article_links:
        # resolve the link of each article
        article = _fetch_article(news_site_uid, host, link)
        if article:
            logger.info('Article fetched!!')
            # collect the fetched articles
            articles.append(article)
            # print(article.title)
            # if len(articles) == 3:
            #     break

    # save the articles
    _save_articles(news_site_uid, articles)
def _news_scraper(news_site_uid):
    host = config()['news_sites'][news_site_uid]['url']
    logging.info(f'Starting scrape for {host}')
    homepage = news.HomePage(news_site_uid, host)

    for link in homepage.article_links:
        print(link)

    articles = []
    for link in homepage.article_links:
        article = _fetch_article(news_site_uid, host, link)
        if article:
            logger.info('Article fetched')
            articles.append(article)
            print(article.title)

    print(len(articles))