Code Example #1
File: parser.py Project: antibantique/naive_feedya
def __init__(self, entry: FeedParserDict):
    self.entry = entry
    self.title = entry.title
    self.url = entry.link
    self.summary = entry.get('summary', '')
    self.published_parsed = entry.get('published_parsed')
    # feedparser normalizes the update timestamp to the 'updated_parsed' key
    self.updated_parsed = entry.get('updated_parsed')
    self.published_date = self._define_published_date(
        self.published_parsed, self.updated_parsed)
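The `_define_published_date` helper is not shown in this snippet; a minimal sketch of the likely behavior, assuming it prefers the published timestamp over the updated one and converts feedparser's time.struct_time to a datetime (names and logic are assumptions, not the project's actual code):

from datetime import datetime
from time import mktime

def _define_published_date(self, published_parsed, updated_parsed):
    # Hypothetical: prefer the explicit 'published' timestamp, fall back to 'updated'.
    parsed = published_parsed or updated_parsed
    if parsed is None:
        return None
    return datetime.fromtimestamp(mktime(parsed))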
Code Example #2
def check_parsed(parsed: feedparser.FeedParserDict, req_keys: list) -> bool:
    """
    Checks either the feed or an entry of a parsed RSS feed for required keys

    :param parsed: valid, UTF-8 feedparser-parsed RSS
    :param req_keys: keys that `parsed` must contain
    :return: whether the parsed object has the needed elements
    """
    return all(parsed.get(x) is not None for x in req_keys)
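A usage sketch (the URL is a placeholder); `check_parsed` works on the top-level feed object and on individual entries alike, since both are FeedParserDicts:

import feedparser

parsed = feedparser.parse('https://example.com/rss.xml')
if check_parsed(parsed.feed, ['title', 'link']):
    entries = [e for e in parsed.entries
               if check_parsed(e, ['title', 'link', 'summary'])]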
Code Example #3
File: models.py Project: Aidence/crispy-succotash
def _update_feed_data(self,
                      feed_data_obj: feedparser.FeedParserDict) -> None:
    """
    Updates feed data given a "feedparser.FeedParserDict.feed" object

    :param feed_data_obj: the parsed feed's top-level metadata object
    :return: None
    """
    self.title = feed_data_obj['title']
    site_url = feed_data_obj.get('link')
    if site_url:
        self.site_url = site_url
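A usage sketch, assuming `feed` is an instance of the enclosing model class (name assumed, URL is a placeholder):

import feedparser

parsed = feedparser.parse('https://example.com/rss.xml')
feed._update_feed_data(parsed.feed)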
Code Example #4
    def load_rss_info(self, parsed: feedparser.FeedParserDict) -> None:
        """Load some RSS subscription elements into this feed state."""
        self.entries = []
        for entry in parsed.get("entries", []):
            new_entry = {}
            new_entry["title"] = entry["title"]

            new_entry["urls"] = []
            new_entry["metadata"] = {}
            # Each enclosure carries a downloadable URL (e.g. a podcast episode).
            for enclosure in entry.get("enclosures", []):
                new_entry["urls"].append(enclosure["href"])

            self.entries.append(new_entry)
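A usage sketch, assuming `feed_state` is an instance of the enclosing feed-state class (URL is a placeholder):

import feedparser

parsed = feedparser.parse('https://example.com/podcast.xml')
feed_state.load_rss_info(parsed)
for entry in feed_state.entries:
    print(entry['title'], entry['urls'])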
Code Example #5
import textwrap

def parse_content(item: FeedDict) -> str:
    # Pull plain text out of the HTML summary and trim it to 300 characters.
    content = get_text_from_html(item.get('summary') or '')
    content = textwrap.shorten(content, width=300, placeholder="...")
    # Strip Markdown control characters so the text can be sent verbatim.
    content = (
        content
        .replace('_', '')
        .replace('*', '')
        .replace('`', '')
        .replace('[', '')
        .replace(']', '')
        .replace('(', '')
        .replace(')', '')
    )
    return content
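`get_text_from_html` is project code that is not shown here; a minimal stdlib-only sketch of the same idea (the real implementation may differ, e.g. it could use BeautifulSoup):

from html.parser import HTMLParser

class _TextExtractor(HTMLParser):
    def __init__(self):
        super().__init__()
        self.chunks = []

    def handle_data(self, data):
        # Collect only text nodes; tags are dropped.
        self.chunks.append(data)

def get_text_from_html(html: str) -> str:
    extractor = _TextExtractor()
    extractor.feed(html)
    return ''.join(extractor.chunks)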
Code Example #6
def check_source(parsed: feedparser.FeedParserDict) -> bool:
    """
    Checks the parsed feed's encoding and bozo flag

    :param parsed: potentially invalid RSS feed
    :return: whether the RSS feed is actually valid
    """
    # feedparser sets bozo when the feed is not well-formed XML.
    if parsed.bozo == 1:
        return False
    if not parsed.get('encoding'):
        return False
    if parsed.encoding.upper() != 'UTF-8':
        return False

    return True
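A usage sketch (the URL is a placeholder):

import feedparser

parsed = feedparser.parse('https://example.com/rss.xml')
if not check_source(parsed):
    raise ValueError('feed is malformed or not UTF-8 encoded')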
Code Example #7
File: import_jobs.py Project: MrLokans/pythondigest
def make_validate_dict(item: feedparser.FeedParserDict) -> dict:
    """
    Builds a dictionary ready for saving from an RSS item.
    The method tries to extract as much information from the item as possible.
    :param item: parsed RSS entry
    :return: the entry's fields, or an empty dict if a required field is missing
    """
    published_parsed = item.get('published_parsed')
    if published_parsed:
        published_at = datetime.fromtimestamp(mktime(published_parsed))
    else:
        published_at = datetime.now()

    try:
        result = {
            'title': item.title,
            'description': item.summary,
            'link': item.link,
            'published_at': published_at,
        }
    except Exception:
        # A missing attribute invalidates the whole item.
        result = {}
    return result
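A usage sketch (the URL is a placeholder); entries that raise inside the try block come back as empty dicts and are filtered out:

import feedparser

parsed = feedparser.parse('https://example.com/rss.xml')
items = [d for d in map(make_validate_dict, parsed.entries) if d]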
Code Example #8
def parse_vacancies(data: feedparser.FeedParserDict) -> Iterator[Vacancy]:
    for entry in data.get('entries', []):
        try:
            year, month, day, hour, minutes, seconds, *_ = entry.published_parsed
            date = datetime(year, month, day, hour, minutes, seconds)
            text = prepare_text(entry.description)
            url = entry.link
        except Exception as exception:
            app.logger.exception(
                msg='Exception during parsing job post',
                exc_info=exception,
            )
            continue
        title = remove_markdown_symbols(entry.title)
        text = f'*{title}*\n\n' + text
        # 'Посилання' is Ukrainian for 'Link'.
        link = f'*Посилання*\n[{title}]({entry.link})'

        result = text + link
        if len(result) > MESSAGE_LIMIT:
            strip_to = MESSAGE_LIMIT - len(link) - 10
            result = text[:strip_to] + '...\n\n' + link

        yield Vacancy(url=url, title=entry.title, text=result, date=date)
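`prepare_text` and `remove_markdown_symbols` are project helpers not shown here; `remove_markdown_symbols` plausibly mirrors the replace-chain from Code Example #5 (an assumption, not the project's actual code):

def remove_markdown_symbols(text: str) -> str:
    # Strip the characters Telegram-style Markdown treats as control symbols.
    for symbol in '_*`[]()':
        text = text.replace(symbol, '')
    return text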
Code Example #9
File: arxiv.py Project: MohamedAliRashad/arxiv.py
def __init__(self, feed_link: feedparser.FeedParserDict):
    self.href = feed_link.href
    self.title = feed_link.get('title')
    self.rel = feed_link.get('rel')
    self.content_type = feed_link.get('content_type')
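A usage sketch; feedparser exposes a feed's links as a list of FeedParserDicts, so each one can be wrapped directly (the enclosing class name `Link` is assumed):

import feedparser

parsed = feedparser.parse('http://export.arxiv.org/api/query?search_query=all:electron')
links = [Link(link) for link in parsed.feed.links]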
Code Example #10
    def _handle_http_codes(
            self, parsed: feedparser.FeedParserDict) -> "UpdateResult":
        """
        Given a feedparser parse result, determine whether the parse succeeded, and what to do about it.
        """
        # feedparser gives no status if you feedparse a local file.
        if "status" not in parsed:
            LOG.debug("Saw status 200 - OK, all is well.")
            return UpdateResult.SUCCESS

        status = parsed.get("status", 200)
        result = UpdateResult.SUCCESS
        if status == requests.codes["NOT_FOUND"]:
            LOG.error(
                f"Saw status {status}, unable to retrieve feed text for "
                f"{self.metadata['name']}."
                f"\nStored URL {self.url} for {self.metadata['name']} will be preserved"
                f"and checked again on next attempt.")

            result = UpdateResult.FAILURE

        elif status in [
                requests.codes["UNAUTHORIZED"], requests.codes["GONE"]
        ]:
            LOG.error(
                f"Saw status {status}, unable to retrieve feed text for "
                f"{self.metadata['name']}."
                f"\nClearing stored URL {self.url} for {self.metadata['name']}."
                f"\nPlease provide new URL and authorization for subscription "
                f"{self.metadata['name']}.")

            self.url = ""
            result = UpdateResult.FAILURE

        # handle redirecting errors
        elif status in [
                requests.codes["MOVED_PERMANENTLY"],
                requests.codes["PERMANENT_REDIRECT"]
        ]:
            LOG.warning(
                f"Saw status {status} indicating permanent URL change."
                f"\nChanging stored URL {self.url} for {self.metadata['name']} to "
                f"{parsed.get('href')} and attempting get with new URL.")

            self.url = parsed.get("href")
            result = UpdateResult.ATTEMPT_AGAIN

        elif status in [
                requests.codes["FOUND"], requests.codes["SEE_OTHER"],
                requests.codes["TEMPORARY_REDIRECT"]
        ]:
            LOG.warning(
                f"Saw status {status} indicating temporary URL change."
                f"\nAttempting with new URL {parsed.get('href')}."
                f"\nStored URL {self.url} for {self.metadata['name']} will be unchanged."
            )

            self.temp_url = self.url
            self.url = parsed.get("href")
            result = UpdateResult.ATTEMPT_AGAIN

        elif status != 200:
            LOG.warning(
                f"Saw '{status}'. Retrying retrieve for {self.metadata['name']} "
                f"at {self.url}.")
            result = UpdateResult.ATTEMPT_AGAIN

        else:
            LOG.debug("Saw status 200. Success!")

        return result
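`UpdateResult` is the project's own enum; a minimal stand-in that makes the snippet readable (member names taken from the code above, values assumed):

from enum import Enum

class UpdateResult(Enum):
    SUCCESS = 0
    FAILURE = 1
    ATTEMPT_AGAIN = 2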
Code Example #11
def parse_date(item: FeedDict) -> datetime:
    # Prefer the published timestamp; fall back to the updated one.
    date = item.get('published_parsed') or item.get('updated_parsed')
    return time_struct_to_datetime(date)
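`time_struct_to_datetime` is a project helper not shown above; a plausible implementation (an assumption, not the project's actual code):

from datetime import datetime
from time import mktime

def time_struct_to_datetime(parsed):
    # Convert feedparser's time.struct_time into a naive local datetime.
    return datetime.fromtimestamp(mktime(parsed))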