Code Example #1
def get_data_for_rss_item(rss_item: Dict) -> Dict:
    http_code, content, raw_content = _get_http_data_of_url(rss_item['link'])
    rss_item.update({
        'raw_content': raw_content,
        'http_code': http_code,
        'content': content,
    })
    return rss_item
Code Example #2
File: import_news.py  Project: shamigor/pythondigest
def get_data_for_rss_item(rss_item: Dict) -> Dict:
    http_code, content, raw_content = _get_http_data_of_url(rss_item['link'])
    rss_item.update(
        {
            'raw_content': raw_content,
            'http_code': http_code,
            'content': content,
        }
    )
    return rss_item
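
All copies of get_data_for_rss_item in this listing delegate the actual fetching to _get_http_data_of_url, which is not shown. The following is a minimal sketch of such a helper, assuming it fetches the page with requests and returns the status code as a string plus a tag-stripped and a raw body; the timeout and the error fallback are assumptions, not the project's actual implementation.

import re
from typing import Tuple

import requests


def _get_http_data_of_url(url: str) -> Tuple[str, str, str]:
    # Hypothetical helper: return (http_code, content, raw_content) for a URL.
    try:
        response = requests.get(url, timeout=10)
        raw_content = response.text
        # Strip tags the same way import_rss cleans entry summaries below.
        content = re.sub('<.*?>', '', raw_content)
        http_code = str(response.status_code)
    except requests.RequestException:
        http_code, content, raw_content = '404', '', ''
    return http_code, content, raw_content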
Code Example #3
def update_news():
    items_on_once = 10
    filepath = './pk_list.pickle'
    # if a particular news item is broken, processing will not get stuck on it
    pk_list = load_pickle_file(filepath)
    if pk_list is None:
        return
    shuffle(pk_list)

    list_tags = list(Tag.objects.values_list('name', flat=True))

    while pk_list:
        print('Parse: (left - %s)' % len(pk_list))
        success_pks = []
        for item in Item.objects.filter(pk__in=pk_list[:items_on_once]):
            try:
                http_code, content, _ = _get_http_data_of_url(item.link)
                assert http_code != '404', 'Not found page'
                item_data = {
                    'title': item.title,
                    'content': content,
                    'description': item.description,
                }
                tags_for_item = _get_tags_for_item(item_data, list_tags)

                if tags_for_item:
                    # todo
                    # should we figure out which tags are missing and add only those,
                    # or write them all and let the DB sort it out?
                    # the difference is in the number of queries
                    tags_for_insert = diff(tags_for_item,
                                           item.tags.values_list('name',
                                                                 flat=True))
                    tags_objects = Tag.objects.filter(name__in=tags_for_insert)
                    item.tags.add(*tags_objects)
                    item.save()

            except Exception:
                pass
            # print(item)
            success_pks.append(item.pk)

        Item.objects.filter(pk__in=success_pks).update(to_update=False)
        pk_list = diff(pk_list, success_pks)
        save_pickle_file(filepath, pk_list)
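
update_news also relies on _get_tags_for_item, which is not part of the listing. A plausible sketch, under the assumption that it simply matches known tag names against the item's title, description and content (the real project may use a more elaborate matcher):

from typing import Dict, List


def _get_tags_for_item(item_data: Dict, list_tags: List[str]) -> List[str]:
    # Hypothetical helper: return the known tag names mentioned in the item's text.
    text = ' '.join(
        str(item_data.get(key, '')) for key in ('title', 'description', 'content')
    ).lower()
    return [tag for tag in list_tags if tag.lower() in text]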
Code Example #4
def update_news():
    items_on_once = 10
    filepath = './pk_list.pickle'
    # if a particular news item is broken, processing will not get stuck on it
    pk_list = load_pickle_file(filepath)
    if pk_list is None:
        return
    shuffle(pk_list)

    list_tags = list(Tag.objects.values_list('name', flat=True))

    while pk_list:
        print("Parse: (left - %s)" % len(pk_list))
        success_pks = []
        for item in Item.objects.filter(pk__in=pk_list[:items_on_once]):
            try:
                http_code, content, _ = _get_http_data_of_url(item.link)
                assert http_code != '404', "Not found page"
                item_data = {
                    'title': item.title,
                    'content': content,
                    'description': item.description,
                }
                tags_for_item = _get_tags_for_item(item_data, list_tags)

                if tags_for_item:
                    # todo
                    # should we figure out which tags are missing and add only those,
                    # or write them all and let the DB sort it out?
                    # the difference is in the number of queries
                    tags_for_insert = diff(tags_for_item,
                                           item.tags.values_list('name',
                                                                 flat=True))
                    tags_objects = Tag.objects.filter(name__in=tags_for_insert)
                    item.tags.add(*tags_objects)
                    item.save()

            except Exception:
                pass
            print(item)
            success_pks.append(item.pk)

        Item.objects.filter(pk__in=success_pks).update(to_update=False)
        pk_list = diff(pk_list, success_pks)
        save_pickle_file(filepath, pk_list)
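
Both versions of update_news assume three small helpers that are not shown: load_pickle_file, save_pickle_file and diff. A sketch of what they could look like, assuming the pickle file stores a plain list of primary keys and diff returns the items of the first sequence that are missing from the second:

import os
import pickle
from typing import Optional


def load_pickle_file(filepath: str) -> Optional[list]:
    # Return None when the file is missing so update_news can bail out early.
    if not os.path.exists(filepath):
        return None
    with open(filepath, 'rb') as fio:
        return pickle.load(fio)


def save_pickle_file(filepath: str, data: list) -> None:
    with open(filepath, 'wb') as fio:
        pickle.dump(data, fio)


def diff(first, second):
    # Items of `first` that are not in `second`, preserving the original order.
    second = set(second)
    return [item for item in first if item not in second]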
Code Example #5
def import_rss(**kwargs):
    for src in AutoImportResource.objects.filter(type_res='rss',
                                                 in_edit=False):

        rssnews = feedparser.parse(src.link)
        today = datetime.date.today()
        week_before = today - datetime.timedelta(weeks=1)
        for n in rssnews.entries:
            ct = len(Item.objects.filter(link=n.link)[0:1])
            if ct:
                continue

            time_struct = getattr(n, 'published_parsed', None)
            if time_struct:
                _timestamp = mktime(time_struct)
                dt = datetime.datetime.fromtimestamp(_timestamp)
                if dt.date() < week_before:
                    continue

            title = n.title
            # title = u'[!] %s' % n.title if fresh_google_check(
            #    n.title) else n.title

            http_code, content, raw_content = _get_http_data_of_url(n.link)

            item_data = {
                'title': title,
                'link': n.link,
                'raw_content': raw_content,
                'http_code': http_code,
                'content': content,
                'description': re.sub('<.*?>', '', n.summary),
                'resource': src.resource,
                'language': src.language,
            }
            item_data.update(
                apply_parsing_rules(item_data, **kwargs)
                if kwargs.get('query_rules') else {})
            item_data = apply_video_rules(item_data.copy())
            save_item(item_data)
Code Example #6
File: import_news.py  Project: dantyan/pythondigest
def import_rss(**kwargs):
    for src in AutoImportResource.objects.filter(type_res='rss',
                                                 in_edit=False):

        rssnews = feedparser.parse(src.link)
        today = datetime.date.today()
        week_before = today - datetime.timedelta(weeks=1)
        for n in rssnews.entries:
            ct = len(Item.objects.filter(link=n.link)[0:1])
            if ct:
                continue

            time_struct = getattr(n, 'published_parsed', None)
            if time_struct:
                _timestamp = mktime(time_struct)
                dt = datetime.datetime.fromtimestamp(_timestamp)
                if dt.date() < week_before:
                    continue

            title = n.title
            # title = u'[!] %s' % n.title if fresh_google_check(
            #    n.title) else n.title

            http_code, content, raw_content = _get_http_data_of_url(n.link)

            item_data = {
                'title': title,
                'link': n.link,
                'raw_content': raw_content,
                'http_code': http_code,
                'content': content,
                'description': re.sub('<.*?>', '', n.summary),
                'resource': src.resource,
                'language': src.language,
            }
            item_data.update(
                apply_parsing_rules(item_data, **kwargs)
                if kwargs.get('query_rules') else {})
            item_data = apply_video_rules(item_data.copy())
            save_item(item_data)
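
The one-week cut-off inside import_rss can be read as a standalone check. Below is a sketch of that filter, using the same feedparser field (published_parsed) and the same window as the two examples above; the function name is_fresh and the feed URL are illustrative only.

import datetime
from time import mktime

import feedparser


def is_fresh(entry, weeks: int = 1) -> bool:
    # Entries without a parsed publication date are kept, matching import_rss.
    time_struct = getattr(entry, 'published_parsed', None)
    if not time_struct:
        return True
    published = datetime.datetime.fromtimestamp(mktime(time_struct)).date()
    cutoff = datetime.date.today() - datetime.timedelta(weeks=weeks)
    return published >= cutoff


rss = feedparser.parse('https://example.com/feed.xml')
fresh_entries = [entry for entry in rss.entries if is_fresh(entry)]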
Code Example #7
def get_data_for_rss_item(rss_item: Dict) -> Dict:
    http_code, content, raw_content = _get_http_data_of_url(rss_item["link"])
    rss_item.update({"raw_content": raw_content, "http_code": http_code, "content": content})
    return rss_item
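
A short usage sketch for any of the get_data_for_rss_item variants above; the link is a placeholder and the printed fields just show what the call adds to the dictionary.

rss_item = {'link': 'https://example.com/article.html', 'title': 'Example article'}
enriched = get_data_for_rss_item(rss_item)
print(enriched['http_code'], len(enriched['content']))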