Code Example #1
def import_tweets(**kwargs):
    for i in get_tweets():
        # this avoids re-parsing a link that is already stored
        ct = len(Item.objects.filter(link=i[1])[0:1])
        if ct:
            continue

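        # prefix the title with "[!]" when fresh_google_check() flags the link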
        title = u'[!] %s' % i[0] if fresh_google_check(i[1]) else i[0]
        item_data = {
            'title': title,
            'link': i[1],
            'http_code': i[3],
            'resource': i[2]
        }
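        # apply parsing rules only when query_rules were passed in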
        data = apply_parsing_rules(item_data, **kwargs) if kwargs.get(
            'query_rules') else {}
        item_data.update(data)
        save_item(item_data)
Code Example #2
def get_rss(**kwargs):
    for src in AutoImportResource.objects.filter(type_res='rss',
                                                 in_edit=False):
        print('\n\n' + '=' * 25)
        print('  ' + src.name)
        print('=' * 25 + '\n')

        num = 0
        rssnews = feedparser.parse(src.link)
        for n in rssnews.entries:

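            # prefix the title with "[!]" when fresh_google_check() flags it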
            title = u'[!] %s' % n.title if fresh_google_check(
                n.title,
                debug=True) else n.title

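            # fetch the linked page to capture its HTTP status and content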
            http_code, content, _ = _get_http_data_of_url(n.link)

            item_data = {
                'title': title,
                'link': n.link,
                'http_code': http_code,
                'content': content,
                'description': n.summary,
                'resource': src.resource,
            }
            data = apply_parsing_rules(item_data, **kwargs) if kwargs.get(
                'query_rules') else {}
            item_data.update(data)

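            # build a one-line summary of whatever fields the parsing rules set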
            print_str = ''
            print_str += 'status: %s' % item_data['status'] if (
                'status' in item_data) else ''
            print_str += 'tags: %s' % item_data['tags'] if ('tags' in
                                                            item_data) else ''
            print_str += 'section: %s' % item_data['section'] if (
                'section' in item_data) else ''
            print(print_str)
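            # count and report entries that are not yet in the database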
            try:
                lastnews = Item.objects.get(link=item_data.get('link'))
            except Item.DoesNotExist:
                num += 1
                print('%d: Title: %s (%s)' %
                      (num, item_data.get('title'), item_data.get('link')))
Code Example #3
def import_rss(**kwargs):
    for src in AutoImportResource.objects.filter(type_res='rss', in_edit=False):

        rssnews = feedparser.parse(src.link)
        today = datetime.date.today()
        week_before = today - datetime.timedelta(weeks=1)
        for n in rssnews.entries:
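            # skip links that are already in the database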
            ct = len(Item.objects.filter(link=n.link)[0:1])
            if ct:
                continue

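            # skip entries published more than a week ago (when a publish date is available)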
            time_struct = getattr(n, 'published_parsed', None)
            if time_struct:
                _timestamp = mktime(time_struct)
                dt = datetime.datetime.fromtimestamp(_timestamp)
                if dt.date() < week_before:
                    continue

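            # mark the title with "[!]" if fresh_google_check() flags it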
            title = u'[!] %s' % n.title if fresh_google_check(
                n.title) else n.title

            http_code, content = _get_http_data_of_url(n.link)

            item_data = {
                'title': title,
                'link': n.link,
                'http_code': http_code,
                'content': content,
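                # strip HTML tags from the feed summary before storing it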
                'description': re.sub("<.*?>", "", n.summary),
                'resource': src.resource,
                'language': src.language,
            }
            data = apply_parsing_rules(item_data, **kwargs) if kwargs.get(
                'query_rules') else {}
            item_data.update(data)
            save_item(item_data)