Example 1
def run():

    date = local_yesterday()
    # For local testing in a different time zone:
    # date = local_yesterday() - dt.timedelta(days=1)

    top_articles = _get_top_articles(5, date)

    if not top_articles:
        logger.info("No articles found, sending no message to Teams")
        return

    logger.debug("Found top articles")
    articles_above_threshold = _get_articles_above_threshold(
        ARTICLE_THRESHOLD,
        date,
    )
    logger.debug(
        "Found {} articles above threshold", articles_above_threshold)

    adaptive_card = _generate_adaptive_card(top_articles,
                                            articles_above_threshold)
    logger.debug(adaptive_card.to_json())
    payload = generate_teams_payload(adaptive_card)

    # Send payload to MS Teams
    result = send_to_teams(payload, WEBHOOK_URL)
    logger.debug(result)

    if WEBHOOK_URL_SECONDARY:
        result = send_to_teams(payload, WEBHOOK_URL_SECONDARY)
        logger.debug(result)
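
A minimal sketch of what generate_teams_payload plausibly builds: Teams incoming webhooks accept an Adaptive Card wrapped in a "message" envelope with an attachments list. Only AdaptiveCard.to_json() is taken from the example above; the function body itself is an assumption, not the project's actual helper.

import json


def generate_teams_payload(adaptive_card) -> dict:
    # Wrap the card JSON in the webhook message envelope Teams expects.
    return {
        "type": "message",
        "attachments": [
            {
                "contentType": "application/vnd.microsoft.card.adaptive",
                "content": json.loads(adaptive_card.to_json()),
            }
        ],
    }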
Example 2
def scrape_full_gsc(property: Property):
    """Run full scrape of property from GSC API (most recent 30 days).

    Args:
        property (Property): Property to scrape data for.
    """
    logger.info("Running full scrape of property {}", property)

    property_filter = Q(id=property.id)

    start_date = local_yesterday() - dt.timedelta(days=30)

    sleep(1)
    scrape_gsc(start_date=start_date, property_filter=property_filter)

    logger.success("Finished full scrape of property {}", property)
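
This and the other examples lean on local_today() and local_yesterday(). A hypothetical reading, assuming they are thin wrappers around the Europe/Berlin zone (the BERLIN constant in Example 4 suggests this), could look like the sketch below; it is not the project's actual code.

import datetime as dt
from zoneinfo import ZoneInfo  # Python 3.9+

BERLIN = ZoneInfo("Europe/Berlin")


def local_today() -> dt.date:
    # Current date in the Berlin time zone (assumed).
    return dt.datetime.now(tz=BERLIN).date()


def local_yesterday() -> dt.date:
    # The day before local_today().
    return local_today() - dt.timedelta(days=1)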
Example 3
def run(*, last_update_gsc: Optional[str] = None):
    # Generate list of Page objects that are potential to-do items
    articles_to_do = _get_seo_articles_to_update(10000, local_yesterday())
    # For testing, in case not enough articles have been scraped:
    # articles_to_do = _get_seo_articles_to_update(
    #     10000, local_yesterday() - dt.timedelta(days=1)
    # )

    logger.debug("Found {} articles to send to MS Teams", len(articles_to_do))

    adaptive_card = _generate_adaptive_card(
        articles_to_do,
        last_update_gsc=last_update_gsc,
    )
    payload = generate_teams_payload(adaptive_card)

    # Send payload to MS Teams
    result = send_to_teams(payload, WEBHOOK_URL)
    logger.debug(result)

    if WEBHOOK_URL_SECONDARY:
        result = send_to_teams(payload, WEBHOOK_URL_SECONDARY)
        logger.debug(result)
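
The send_to_teams helper is not shown in these examples. Assuming it simply POSTs the JSON payload to the incoming-webhook URL and returns the response body for logging, a sketch might be:

import requests


def send_to_teams(payload: dict, webhook_url: str) -> str:
    # POST the payload to the webhook; Teams answers with a short text body,
    # which the callers above pass to logger.debug(). Illustrative only.
    response = requests.post(webhook_url, json=payload, timeout=10)
    response.raise_for_status()
    return response.text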
Example 4
def _generate_adaptive_card(
    pages: List[Page],
    last_update_gsc: Optional[str] = None,
) -> AdaptiveCard:

    # GSC timestamps are all PDT-based, so anchor the start of yesterday in
    # PDT before converting it to Berlin time
    PDT = dt.timezone(-dt.timedelta(hours=7))
    yesterday = local_yesterday()
    start_time = dt.datetime(yesterday.year,
                             yesterday.month,
                             yesterday.day,
                             tzinfo=PDT)
    start_time_local = start_time.astimezone(BERLIN)

    # Generate intro
    greeting = random.choice(GREETINGS)
    intro = TextBlock(
        text=(
            f"{greeting} Diese Beiträge von uns sind seit gestern, "
            f"{start_time_local.hour}:00 Uhr, mit Google gut gefunden worden "
            "und haben heute noch kein Update bekommen. "
            "**Lohnt sich eine Aktualisierung oder ein Weiterdreh?**"
        ),
        wrap=True,
    )

    # Add note about GSC data
    note_gsc = Container(
        [
            TextBlock(
                text=(
                    f"Letzter Datenabgleich mit der GSC: {last_update_gsc} Uhr"
                ),
                spacing="None",
                wrap=True,
            ),
            TextBlock(
                text=(
                    "Es dauert bis zu 48 Stunden, bis die Daten in der GSC "
                    "final sind!"
                ),
                spacing="Small",
                wrap=True,
            ),
        ],
        spacing="extralarge",
    )

    # Generate outro
    outro = TextBlock(
        text=f"[Infos zur Datenerhebung]({MORE_URL})",
        horizontalAlignment="right",
        spacing="large",
    )

    # Generate sections for each page
    stories = []

    for i, page in enumerate(pages):
        story = _generate_story(page)

        # Add separators between stories
        if i > 0:
            story.separator = True

        stories.append(story)

    # Put everything together
    adaptive_card_body = [intro, *stories, note_gsc, outro]
    card = AdaptiveCard(body=adaptive_card_body)

    return card
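
One caveat in this example: the fixed -7 h offset is Pacific Daylight Time only; for part of the year Pacific time is UTC-8 (PST). If behaviour around DST switches matters, a zone-aware variant could look like the sketch below (illustrative only; the original module may pin the offset on purpose).

import datetime as dt
from zoneinfo import ZoneInfo

PACIFIC = ZoneInfo("America/Los_Angeles")  # handles PST/PDT automatically
BERLIN = ZoneInfo("Europe/Berlin")

yesterday = dt.date.today() - dt.timedelta(days=1)  # stand-in for local_yesterday()
start_time = dt.datetime(yesterday.year, yesterday.month, yesterday.day, tzinfo=PACIFIC)
start_time_local = start_time.astimezone(BERLIN)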
Example 5
def scrape_webtrekk(
    *,
    start_date: Optional[dt.date] = None,
    end_date: Optional[dt.date] = None,
):
    """Load webtrekk report and transfer it into database
    (:class:`~okr.models.pages.PageWebtrekkMeta` and
    :class:`~okr.models.pages.PageDataWebtrekk`).
    """
    today = local_today()
    yesterday = local_yesterday()

    start_date = date_param(
        start_date,
        default=yesterday - dt.timedelta(days=1),
        earliest=today - dt.timedelta(days=30),
        latest=yesterday,
    )

    end_date = date_param(
        end_date,
        default=yesterday,
        earliest=start_date,
        latest=today,
    )

    page_cache = {}

    try:
        property = Property.objects.get(url="https://www1.wdr.de/nachrichten/")
    except Property.DoesNotExist:
        property = None

    for date in reversed(date_range(start_date, end_date)):
        logger.info("Start Webtrekk SEO scrape for {}.", date)

        try:
            data = webtrekk.cleaned_webtrekk_page_data(date)
        except Exception as e:
            capture_exception(e)
            continue

        for key, item in data.items():
            url, headline, query = key

            page = _page_from_url(url, page_cache, property=property)

            if page is None:
                continue

            webtrekk_meta, created = PageWebtrekkMeta.objects.get_or_create(
                page=page,
                headline=headline,
                query=query or "",
            )

            PageDataWebtrekk.objects.update_or_create(
                date=date,
                webtrekk_meta=webtrekk_meta,
                defaults=dict(
                    visits=item.get("visits", 0),
                    entries=item.get("entries", 0),
                    visits_campaign=item.get("visits_campaign", 0),
                    bounces=item.get("bounces", 0),
                    length_of_stay=dt.timedelta(
                        seconds=item.get("length_of_stay", 0)),
                    impressions=item.get("impressions", 0),
                    exits=item.get("exits", 0),
                    visits_search=item.get("visits_search", 0),
                    entries_search=item.get("entries_search", 0),
                    visits_campaign_search=item.get("visits_campaign_search",
                                                    0),
                    bounces_search=item.get("bounces_search", 0),
                    length_of_stay_search=dt.timedelta(
                        seconds=item.get("length_of_stay_search", 0)),
                    impressions_search=item.get("impressions_search", 0),
                    exits_search=item.get("exits_search", 0),
                ),
            )

    logger.success("Finished Webtrekk SEO scrape")
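
date_param is called in Examples 5 and 6 but not defined there. From the call sites it appears to substitute a default when no value is given and clamp the result into the [earliest, latest] window; a hypothetical implementation consistent with that reading (not the project's actual code):

import datetime as dt
from typing import Optional


def date_param(
    value: Optional[dt.date],
    *,
    default: dt.date,
    earliest: dt.date,
    latest: dt.date,
) -> dt.date:
    # Fall back to the default, then clamp into the allowed window (assumed).
    result = value if value is not None else default
    return min(max(result, earliest), latest)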
Example 6
def scrape_gsc(
    *,
    start_date: Optional[dt.date] = None,
    property_filter: Optional[Q] = None,
):
    """Scrape from Google Search Console API.

    Args:
        start_date (Optional[dt.date], optional): Earliest date to request data for.
          Defaults to None. Will be set to two days before yesterday if None.
        property_filter (Optional[Q], optional): Filter to select a subset of
          properties. Defaults to None.
    """
    today = local_today()
    yesterday = local_yesterday()

    start_date = date_param(
        start_date,
        default=yesterday - dt.timedelta(days=2),
        earliest=today - dt.timedelta(days=30),
        latest=yesterday,
    )

    properties = Property.objects.all()

    if property_filter:
        properties = properties.filter(property_filter)

    for property in properties:
        logger.info(
            "Start scrape Google Search Console data for property {}.",
            property,
        )

        page_cache = {}

        try:
            _property_data_gsc(property, start_date, yesterday)
        except Exception as e:
            capture_exception(e)

        for date in reversed(date_range(start_date, yesterday)):
            logger.info("Scraping data for {}.", date)

            # Get page data first to ensure scrape is done before SEO bot runs
            try:
                _page_data_gsc(property, date, page_cache)
            except Exception as e:
                capture_exception(e)

            try:
                _page_data_query_gsc(property, date, page_cache)
            except Exception as e:
                capture_exception(e)

            try:
                _property_data_query_gsc(property, date)
            except Exception as e:
                capture_exception(e)

        logger.success(
            "Finished Google Search Console scrape for property {}.",
            property,
        )
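
Similarly, date_range is only seen wrapped in reversed(), so it presumably returns an inclusive sequence of dates. A sketch consistent with that usage, offered as an assumption rather than the real helper:

import datetime as dt
from typing import List


def date_range(start_date: dt.date, end_date: dt.date) -> List[dt.date]:
    # Inclusive list of dates, returned as a list so callers can reverse it.
    days = (end_date - start_date).days
    return [start_date + dt.timedelta(days=i) for i in range(days + 1)]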