def run(cron_time, cron_unit, notify=True, force_tasks=False, force_agents=False, recent_ads=3):
    """Run every task whose frequency matches the given cron schedule.

    Args:
        cron_time: numeric part of the schedule (e.g. 15 for "every 15 minutes").
        cron_unit: unit part of the schedule; only the first character is compared,
            so "minute"/"minutes"/"m" are treated alike.
        notify: forwarded to task.run — whether to fire notification agents.
        force_tasks: run tasks even if they are disabled.
        force_agents: notify through agents even if they are disabled.
        recent_ads: cap on how many recent ads to notify about per task.
    """
    tasks = State.get_tasks()
    log.add_handler("CRON_HANDLER")
    log.info_print(f"Running cronjob for schedule: {cron_time} {cron_unit}")

    # Scrape each url given in tasks file
    for current in tasks.values():
        # Skip tasks whose frequency/unit do not correspond with this cron invocation.
        matches_time = int(current.frequency) == int(cron_time)
        matches_unit = current.frequency_unit[:1] == cron_unit[:1]
        if not (matches_time and matches_unit):
            continue

        task.run(current,
                 notify=notify,
                 force_tasks=force_tasks,
                 force_agents=force_agents,
                 recent_ads=recent_ads)
def create_source():
    """Flask view: create a new scraping source, or test a candidate URL.

    Two submit paths on the same form:
      * "test" button — builds a throw-away prime.Source for the entered URL,
        runs a trial scrape, and flashes how many ads were found (or an error).
      * normal submit — persists a Source row, refreshes in-memory state, and
        redirects to the sources listing.

    Returns:
        A redirect response after a successful save, otherwise the rendered
        create-source template.
    """
    State.load()
    form = SourceForm()
    if form.validate_on_submit():
        if form.test.data:
            web_url = form.website.data
            # Maps the form's module select values to scraper module names.
            module_map = {1: 'kijiji', 2: 'zillow'}
            prime_source = prime.Source(
                module=module_map.get(form.module.data),
                module_properties={'url': web_url, 'botname': "prime"})
            try:
                total_ads = prime.test_webui_source(prime_source).total_new_ads
                log.info_print(f"total_ads: {total_ads}")
            # Narrowed from a bare `except:` so Ctrl-C / SystemExit still
            # propagate; any scrape failure is reported as an invalid source.
            except Exception:
                message = "Not a valid source"
            else:
                message = f"Found {total_ads} new ads" \
                    if total_ads != 1 else "Found 1 new ad"
            finally:
                # An empty URL is never valid, regardless of the scrape outcome.
                if web_url == "":
                    message = "Not a valid source"
                flash(message, "notification")
        else:
            source = Source(module=form.module.data,
                            name=form.name.data,
                            website=form.website.data,
                            location=form.location.data,
                            range=form.range.data,
                            # subreddit=form.subreddit.data
                            )
            db.session.add(source)
            db.session.commit()
            State.refresh_sources()
            flash('Your source has been saved!', 'top_flash_success')
            return redirect(url_for('main.sources'))

    return render_template('create-source.html', title='Create Source',
                           form=form, legend='Create Source')
def run(task, sources=None, notif_agents=None, notify=True, force_tasks=False, force_agents=False, recent_ads=0, save_ads=True, ignore_old_ads=False):
    """Execute one task: scrape each of its sources and collect the results.

    Args:
        task: the task object (provides name, enabled, include/exclude words,
            source_ids, notif_agent_ids, colour_flag).
        sources: mapping of source id -> source; loaded from State when None.
        notif_agents: notification agents mapping; loaded from State when None.
        notify: whether scraped ads should trigger notification agents.
        force_tasks: run the task even if it is disabled.
        force_agents: notify via agents even if they are disabled.
        recent_ads: limit on how many recent ads to notify about (0 = all).
        save_ads: persist the processed-ads state after scraping.
        ignore_old_ads: treat every found ad as new.

    Returns:
        A RunResult wrapping per-source scrape results, or None when the task
        is disabled and not forced.
    """
    from lib.core.state import State

    if sources is None:
        sources = State.get_sources()
    if notif_agents is None:
        notif_agents = State.get_notif_agents()

    log.info_print(f"Task: {task.name}")

    if not task.enabled:
        if not force_tasks:
            log.info_print("Task disabled. Skipping...")
            print()
            return
        log.info_print("Task disabled but forcing task to run...")

    task_notif_agents = notif_agent.get_notif_agents_by_ids(task.notif_agent_ids)

    # Fail fast if notifications are requested but agents are not usable.
    if notify and not force_agents:
        notif_agent.notif_agents_enabled_check(task_notif_agents)

    source_results = {}
    for source_id in task.source_ids:
        source_results[source_id] = source.scrape(
            sources[source_id],
            task_notif_agents,
            include=task.include,
            exclude=task.exclude,
            colour_flag=task.colour_flag,
            notify=notify,
            force_tasks=force_tasks,
            force_agents=force_agents,
            recent_ads=recent_ads,
            save_ads=save_ads,
            ignore_old_ads=ignore_old_ads)

    if save_ads:
        ad.save()

    return RunResult(source_results=source_results)
def scrape_for_ads(self, old_ad_ids, exclude=None, **kwargs):
    """Scrape every result page starting at kwargs["url"], following
    "Next page" links until the last page is reached.

    Args:
        old_ad_ids: ids of ads already processed in previous runs.
        exclude: words that disqualify an ad (previously this parameter was
            silently discarded — see note below).
        **kwargs: module properties; must contain "url", the first page to fetch.

    Returns:
        Tuple of (self.new_ads, title) where title is scraped from the first page.
    """
    self.new_ads = {}
    self.old_ad_ids = old_ad_ids
    # BUG FIX: the original assigned `self.exclude = []`, throwing away the
    # caller's exclude list (and used a mutable default `exclude=[]`).
    self.exclude = exclude if exclude is not None else []
    url = kwargs["url"]
    title = None
    log.info_print(f"url 1: {url}")

    # Hoisted out of the loop: identical headers for every page request.
    req_headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.8',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    }

    while url:
        # Get the html data from the URL
        page = requests.get(url, headers=req_headers)
        soup = BeautifulSoup(page.content, "html.parser")

        # If the title doesnt exist pull it from the html data
        if title is None:
            title = self.get_title(soup)

        # Find ads on the page
        self.find_ads(soup)

        # Anchor for the next page of ads; None when there is no such link.
        url = soup.find('a', {'title': 'Next page'})

        # The link carries a 'disabled' attribute on the final page.
        disabled = True
        try:
            url['disabled']
        # KeyError: attribute absent; TypeError: url is None (no link found).
        except (KeyError, TypeError):
            disabled = False

        if url:
            if disabled:
                log.info_print(f"if disabled: {disabled}")
                break
            url = 'https://www.zillow.com' + url['href']
            log.info_print(f"new url: {url}")

    return self.new_ads, title
def scrape(source, notif_agents_list, include=[], exclude=[], colour_flag="", notify=True, force_tasks=False, force_agents=False, recent_ads=0, save_ads=True, ignore_old_ads=False):
    """Scrape one source for new ads and optionally notify agents about them.

    NOTE: the mutable defaults `include=[]` / `exclude=[]` are kept for
    interface compatibility; they are never mutated in this function.

    Args:
        source: the source to scrape (provides name, module, module_properties).
        notif_agents_list: agents to notify about new ads.
        include: words that must appear in an ad (display only here; filtering
            happens in the scraper module).
        exclude: words that disqualify an ad; forwarded to the scraper module.
        colour_flag: forwarded to each agent's send_ads.
        notify: whether to send notifications at all.
        force_tasks: unused here; accepted for a uniform call signature.
        force_agents: notify via agents even when they are disabled.
        recent_ads: when > 0, only notify about this many of the newest ads.
        save_ads: persist the all-time processed-ad ids for this module.
        ignore_old_ads: treat every found ad as new.

    Returns:
        A ScrapeSummary with the new ads, the last three of them, and the count.
    """
    from lib.core.state import State
    import lib.core.notif_agent as notif_agent

    ads = State.get_ads()
    source_modules = State.get_source_modules()
    notif_agent_modules = State.get_notif_agent_modules()

    log.info_print(f"Source: {source.name}")
    log.info_print(f"Module: {source.module}")
    log.info_print(f"Module Properties: {source.module_properties}")

    if include:
        print(f"Including: {include}")
    if exclude:
        print(f"Excluding: {exclude}")

    module = source_modules[source.module]

    # Collect previously-seen ad ids unless the caller wants a fresh slate.
    old_ads = []
    if not ignore_old_ads:
        if source.module in ads:
            old_ads = ads[source.module]
            log.debug(f"Total old ads: {len(old_ads)}")
        else:
            log.debug(f"No old ads found for module: {source.module}")
    else:
        log.info_print("Ignoring old ads...")

    new_ads, ad_title = module.scrape_for_ads(old_ads, exclude=exclude,
                                              **source.module_properties)

    info_string = f"Found {len(new_ads)} new ads" \
        if len(new_ads) != 1 else "Found 1 new ad"
    log.info_print(info_string)

    num_ads = len(new_ads)
    if notify and num_ads:
        ads_to_send = new_ads
        if recent_ads > 0:
            # Only notify about the most recent `recent_ads` new ads.
            ads_to_send = ct.get_most_recent_items(recent_ads, new_ads)
            log.debug(f"Recent ads set to: {recent_ads} got: {len(ads_to_send)}")

        log.info_print(f"Total ads to notify about: {len(ads_to_send)}")

        if not notif_agents_list:
            log.warning_print("No notification agents set... nothing to notify")
        else:
            if len(notif_agents_list) > 1:
                log.info_print(
                    f"Notifying agents: {notif_agent.get_names(notif_agents_list)}")

            for agent in notif_agents_list:
                if agent.enabled or force_agents:
                    if not agent.enabled and force_agents:
                        log.info_print(
                            "Notification agent was disabled but forcing...")
                    notif_agent_modules[agent.module].send_ads(
                        ads_to_send, ad_title, colour_flag,
                        **agent.module_properties)
                else:
                    log.info_print(
                        f"Skipping... Notification agent disabled: {agent.name}")

    elif not notify and num_ads:
        log.info_print("Skipping notification")

    if save_ads:
        # The module accumulates every id it has ever processed; persist that.
        ads[source.module] = module.old_ad_ids
        log.debug(f"Total all-time processed ads: {len(module.old_ad_ids)}")
    else:
        # Was an f-string with no placeholders in the original.
        log.info_print("Saving ads disabled. Skipping...")

    print()
    return ScrapeSummary(new_ads=new_ads,
                         latest_ads=list(new_ads)[-3:],
                         total_new_ads=len(new_ads))