Beispiel #1
0
def run(cron_time,
        cron_unit,
        notify=True,
        force_tasks=False,
        force_agents=False,
        recent_ads=3):
    """Run every stored task whose schedule matches the given cron slot.

    A task matches when its ``frequency`` equals ``cron_time`` (both compared
    as integers) and the first character of its ``frequency_unit`` equals the
    first character of ``cron_unit``.  ``notify``, ``force_tasks``,
    ``force_agents`` and ``recent_ads`` are handed to ``task.run`` unchanged.
    """
    scheduled_tasks = State.get_tasks()

    log.add_handler("CRON_HANDLER")

    log.info_print(f"Running cronjob for schedule: {cron_time} {cron_unit}")

    # Options shared by every task started from this cron invocation.
    run_options = {
        "notify": notify,
        "force_tasks": force_tasks,
        "force_agents": force_agents,
        "recent_ads": recent_ads,
    }

    for task_id in scheduled_tasks:
        candidate = scheduled_tasks[task_id]
        interval = candidate.frequency
        unit = candidate.frequency_unit

        # Only tasks on exactly this schedule are run; units are compared by
        # their first character only.
        if int(interval) == int(cron_time) and unit[:1] == cron_unit[:1]:
            task.run(candidate, **run_options)
def create_source():
    """Handle the "create source" form: test a source or save a new one.

    When the submitted form validates and its ``test`` field is set, a
    temporary ``prime.Source`` is built from the form, scraped through
    ``prime.test_webui_source``, and the outcome is flashed under the
    ``notification`` category; the form is then rendered again.  When the
    form validates and ``test`` is not set, a ``Source`` row is stored, the
    cached sources are refreshed and the client is redirected to
    ``main.sources``.

    Returns:
        The redirect response after a successful save, otherwise the
        rendered ``create-source.html`` template.
    """
    State.load()
    form = SourceForm()
    if form.validate_on_submit():
        if form.test.data:
            web_url = form.website.data
            module_names = {1: 'kijiji', 2: 'zillow'}

            # Every failure path below reports this same message.
            message = "Not a valid source"

            # An empty URL can never be a valid source, so do not scrape it.
            if web_url:
                prime_source = prime.Source(
                    module=module_names.get(form.module.data),
                    module_properties={'url': web_url, 'botname': "prime"})

                try:
                    total_ads = prime.test_webui_source(
                        prime_source).total_new_ads
                except Exception as exc:
                    # Catch ``Exception`` rather than everything so that
                    # KeyboardInterrupt/SystemExit still propagate, and keep
                    # the reason visible in the log.
                    log.info_print(f"Source test failed: {exc}")
                else:
                    log.info_print(f"total_ads: {total_ads}")
                    message = f"Found {total_ads} new ads" \
                        if total_ads != 1 else "Found 1 new ad"

            flash(message, "notification")

        else:
            source = Source(module=form.module.data,
                            name=form.name.data,
                            website=form.website.data,
                            location=form.location.data,
                            range=form.range.data,
                            # subreddit=form.subreddit.data
                            )
            db.session.add(source)
            db.session.commit()

            State.refresh_sources()

            flash('Your source has been saved!', 'top_flash_success')
            return redirect(url_for('main.sources'))
    return render_template('create-source.html', title='Create Source',
                           form=form, legend='Create Source')
Beispiel #3
0
def run(task,
        sources=None,
        notif_agents=None,
        notify=True,
        force_tasks=False,
        force_agents=False,
        recent_ads=0,
        save_ads=True,
        ignore_old_ads=False):
    """Scrape every source attached to ``task`` and collect the results.

    ``sources`` and ``notif_agents`` are loaded from ``State`` when they are
    ``None``.  A disabled task is skipped (returning ``None``) unless
    ``force_tasks`` is set.  Each id in ``task.source_ids`` is scraped with
    ``source.scrape`` and, when ``save_ads`` is truthy, ``ad.save()`` is
    called once afterwards.

    Returns:
        A ``RunResult`` whose ``source_results`` maps each source id to the
        value returned by ``source.scrape``, or ``None`` when the task was
        skipped.
    """
    from lib.core.state import State

    if sources is None:
        sources = State.get_sources()

    # Loaded for parity with ``sources``; not referenced again below.
    if notif_agents is None:
        notif_agents = State.get_notif_agents()

    # Read eagerly; the scrape calls below read ``task.exclude`` themselves.
    _exclude_words = task.exclude

    log.info_print(f"Task: {task.name}")

    # The explicit ``== False`` / ``== True`` comparisons are kept so that
    # non-bool values (e.g. None) are treated exactly like the defaults.
    task_disabled = task.enabled == False
    if task_disabled and force_tasks == False:
        log.info_print("Task disabled. Skipping...")
        print()
        return
    if task_disabled:
        log.info_print("Task disabled but forcing task to run...")

    agents_for_task = notif_agent.get_notif_agents_by_ids(
        task.notif_agent_ids)

    if notify == True:
        if force_agents == False:
            notif_agent.notif_agents_enabled_check(agents_for_task)

    source_results = {
        source_id: source.scrape(
            sources[source_id],
            agents_for_task,
            include=task.include,
            exclude=task.exclude,
            colour_flag=task.colour_flag,
            notify=notify,
            force_tasks=force_tasks,
            force_agents=force_agents,
            recent_ads=recent_ads,
            save_ads=save_ads,
            ignore_old_ads=ignore_old_ads)
        for source_id in task.source_ids
    }

    if save_ads:
        ad.save()

    return RunResult(source_results=source_results)
Beispiel #4
0
    def scrape_for_ads(self, old_ad_ids, exclude=[], **kwargs):
        """Scrape a listing URL and each following results page for ads.

        Pages are fetched one after another, following the ``Next page``
        link until it is missing, disabled, or has no target.

        Args:
            old_ad_ids: stored on ``self.old_ad_ids`` before scraping.
            exclude: accepted but not applied; ``self.exclude`` is always
                reset to an empty list (see review note in the body).
            **kwargs: must contain ``url``, the first page to fetch.

        Returns:
            The tuple ``(self.new_ads, title)``; ``title`` is whatever
            ``self.get_title`` returned, or ``None`` if no page was fetched.

        Raises:
            KeyError: ``url`` is missing from ``kwargs``.
            requests.exceptions.RequestException: a page could not be
                fetched (including the request timing out).
        """
        from urllib.parse import urljoin

        self.new_ads = {}
        self.old_ad_ids = old_ad_ids
        # NOTE(review): the ``exclude`` argument is discarded here; confirm
        # whether ``self.exclude = exclude`` was intended.
        self.exclude = []

        url = kwargs["url"]
        title = None
        log.info_print(f"url 1: {url}")

        # The headers are the same for every page, so build them once.
        # NOTE(review): 'br' is advertised in accept-encoding; confirm a
        # brotli decoder is installed, otherwise the body may not be decoded.
        req_headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'en-US,en;q=0.8',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
        }

        while url:
            # Get the html data from the URL.  Without a timeout a stalled
            # connection would block the whole run indefinitely.
            page = requests.get(url, headers=req_headers, timeout=30)
            soup = BeautifulSoup(page.content, "html.parser")

            # If the title doesn't exist yet, pull it from the html data
            if title is None:
                title = self.get_title(soup)

            # Find ads on the page
            self.find_ads(soup)

            # Locate the link to the next page of ads; none means last page
            next_link = soup.find('a', {'title': 'Next page'})
            if next_link is None:
                break

            # A "Next page" link carrying a ``disabled`` attribute also
            # marks the last page.
            disabled = next_link.has_attr('disabled')
            if disabled:
                log.info_print(f"if disabled: {disabled}")
                break

            # A link without a target cannot be followed; stop instead of
            # raising KeyError.
            href = next_link.get('href')
            if not href:
                break

            # urljoin handles relative and absolute hrefs alike.
            url = urljoin('https://www.zillow.com', href)
            log.info_print(f"new url: {url}")
        return self.new_ads, title
def scrape(source,
           notif_agents_list,
           include=[],
           exclude=[],
           colour_flag="",
           notify=True,
           force_tasks=False,
           force_agents=False,
           recent_ads=0,
           save_ads=True,
           ignore_old_ads=False):
    """Scrape one source, notify the given agents and record processed ads.

    The scraper module registered under ``source.module`` is asked for new
    ads.  When ``notify`` is truthy and ads were found, they (or only the
    ``recent_ads`` most recent ones when ``recent_ads > 0``) are sent through
    every agent in ``notif_agents_list`` that is enabled, or through all of
    them when ``force_agents`` is ``True``.  When ``save_ads`` is truthy the
    module's ``old_ad_ids`` are stored under ``source.module`` in the ads
    obtained from ``State``.

    ``include`` is only echoed to stdout here and ``force_tasks`` is not
    used by this function.

    Returns:
        A ``ScrapeSummary`` with all new ads, the last three of them and
        their total count.
    """
    from lib.core.state import State
    import lib.core.notif_agent as notif_agent

    stored_ads = State.get_ads()
    scraper_modules = State.get_source_modules()
    agent_modules = State.get_notif_agent_modules()

    log.info_print(f"Source: {source.name}")
    log.info_print(f"Module: {source.module}")
    log.info_print(f"Module Properties: {source.module_properties}")

    if len(include) > 0:
        print(f"Including: {include}")

    if len(exclude) > 0:
        print(f"Excluding: {exclude}")

    scraper = scraper_modules[source.module]

    # Ads already processed for this module; stays empty when they are
    # ignored or none are stored.  ``== False`` is kept so non-bool values
    # take the same branch as they always did.
    known_ads = []
    if ignore_old_ads == False:
        if source.module not in stored_ads:
            log.debug(f"No old ads found for module: {source.module}")
        else:
            known_ads = stored_ads[source.module]
            log.debug(f"Total old ads: {len(known_ads)}")
    else:
        log.info_print("Ignoring old ads...")

    new_ads, ad_title = scraper.scrape_for_ads(known_ads,
                                               exclude=exclude,
                                               **source.module_properties)

    num_ads = len(new_ads)
    if num_ads != 1:
        found_message = f"Found {num_ads} new ads"
    else:
        found_message = "Found 1 new ad"
    log.info_print(found_message)

    if num_ads:
        if notify:
            ads_to_send = new_ads

            if recent_ads > 0:
                # Only notify about the most recent ``recent_ads`` new ads
                ads_to_send = ct.get_most_recent_items(recent_ads, new_ads)
                log.debug(
                    f"Recent ads set to: {recent_ads} got: {len(ads_to_send)}")
                log.info_print(
                    f"Total ads to notify about: {len(ads_to_send)}")

            agent_count = len(notif_agents_list)
            if agent_count == 0:
                log.warning_print(
                    "No notification agents set... nothing to notify")
            else:
                if agent_count > 1:
                    log.info_print(
                        f"Notifying agents: {notif_agent.get_names(notif_agents_list)}"
                    )

                forcing = force_agents == True
                for agent in notif_agents_list:
                    if not (agent.enabled or forcing):
                        log.info_print(
                            f"Skipping... Notification agent disabled: {agent.name}"
                        )
                        continue

                    if agent.enabled == False and forcing:
                        log.info_print(
                            "Notification agent was disabled but forcing...")

                    sender = agent_modules[agent.module]
                    sender.send_ads(ads_to_send, ad_title, colour_flag,
                                    **agent.module_properties)
        else:
            log.info_print("Skipping notification")

    if not save_ads:
        log.info_print("Saving ads disabled. Skipping...")
    else:
        stored_ads[source.module] = scraper.old_ad_ids
        log.debug(
            f"Total all-time processed ads: {len(scraper.old_ad_ids)}")

    print()

    return ScrapeSummary(new_ads=new_ads,
                         latest_ads=list(new_ads)[-3:],
                         total_new_ads=len(new_ads))