Example #1
    def handle(self, *args, **options):
        # FIXME: why use options.get again and again?
        twitter_user = None
        user_set = None
        start_dt = None
        end_dt = None
        if options.get('twitter_user', False):
            try:
                twitter_user = TwitterUser.objects.get(
                    name=options.get('twitter_user'))
            except TwitterUser.DoesNotExist:
                raise CommandError('TwitterUser %s does not exist' %
                                   options.get('twitter_user'))
        elif options.get('set_name', False):
            user_set = None
            try:
                user_set = TwitterUserSet.objects.get(
                    name=options.get('set_name'))
            except TwitterUserSet.DoesNotExist:
                raise CommandError('TwitterUserSet %s does not exist' %
                                   options.get('set_name'))
        else:
            raise CommandError('please specify a twitter user or set name')

        if options.get('start_date', False):
            start_dt = make_date_aware(options.get('start_date'))
            if not start_dt:
                raise CommandError('dates must be in the format YYYY-MM-DD')
        else:
            start_dt = None
        if options.get('end_date', False):
            end_dt = make_date_aware(options.get('end_date'))
            if not end_dt:
                raise CommandError('dates must be in the format YYYY-MM-DD')
        else:
            end_dt = None
        if start_dt and end_dt:
            if end_dt < start_dt:
                raise CommandError('start date must be earlier than end date')

        if twitter_user:
            qs = twitter_user.items.all()
        elif user_set:
            qs = TwitterUserItem.objects.filter(
                twitter_user__sets__in=[user_set])

        if start_dt:
            qs = qs.filter(date_published__gte=start_dt)
        if end_dt:
            qs = qs.filter(date_published__lte=end_dt)

        # tweak for python 2.7 to avoid having to set PYTHONIOENCODING=utf8
        # in environment, see Graham Fawcett's comment/suggestion at:
        #   nedbatchelder.com/blog/200401/printing_unicode_from_python.html
        writer_class = codecs.getwriter('utf-8')
        sys.stdout = writer_class(sys.stdout, 'replace')
        for tui in qs:
            print '\t'.join(tui.csv)
Example #2
    def handle(self, *args, **options):
        twitter_user = user_set = start_dt = end_dt = xls = filename = None
        if options['filename']:
            filename = options.get('filename')
        xls = options['xls']
        if xls and filename is None:
            raise CommandError("When --xls is specified, \
--filename=FILENAME is required")
        if not xls and filename is not None:
            raise CommandError("Writing CSV files currently not yet \
supported; recommend piping output to a file")
        if options['twitter_user']:
            try:
                twitter_user = TwitterUser.objects.get(
                    name__iexact=options.get('twitter_user'))
                qs = twitter_user.items.all()
            except TwitterUser.DoesNotExist:
                raise CommandError('TwitterUser %s does not exist' %
                                   options.get('twitter_user'))
        elif options['set_name']:
            try:
                user_set = TwitterUserSet.objects.get(
                    name=options.get('set_name'))
                qs = TwitterUserItem.objects.filter(
                    twitter_user__sets__in=[user_set])
            except TwitterUserSet.DoesNotExist:
                raise CommandError('TwitterUserSet %s does not exist' %
                                   options['set_name'])
        else:
            raise CommandError('please provide either twitteruser or setname')
        if options['start_date']:
            start_dt = make_date_aware(options.get('start_date'))
            if not start_dt:
                raise CommandError('dates must be in the format YYYY-MM-DD')
            qs = qs.filter(date_published__gte=start_dt)
        if options['end_date']:
            end_dt = make_date_aware(options.get('end_date'))
            if not end_dt:
                raise CommandError('dates must be in the format YYYY-MM-DD')
            qs = qs.filter(date_published__lte=end_dt)
        if start_dt and end_dt:
            if end_dt < start_dt:
                raise CommandError('start date must be earlier than end date')
        # tweak for python 2.7 to avoid having to set PYTHONIOENCODING=utf8
        # in environment, see Graham Fawcett's comment/suggestion at:
        #   nedbatchelder.com/blog/200401/printing_unicode_from_python.html
        writer_class = codecs.getwriter('utf-8')
        sys.stdout = writer_class(sys.stdout, 'replace')
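        # write an .xls workbook if requested, otherwise dump TSV to stdout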
        if xls:
            tworkbook = xls_tweets_workbook(qs, TwitterUserItem.csv_headers)
            tworkbook.save(filename)
        else:
            for tui in qs:
                print '\t'.join(tui.csv)
Example #3
    def handle(self, *args, **options):
        # FIXME: why use options.get again and again?
        twitter_user = None
        user_set = None
        start_dt = None
        end_dt = None
        if options.get('twitter_user', False):
            try:
                twitter_user = TwitterUser.objects.get(
                    name=options.get('twitter_user'))
            except TwitterUser.DoesNotExist:
                raise CommandError('TwitterUser %s does not exist' %
                                   options.get('twitter_user'))
        elif options.get('set_name', False):
            user_set = None
            try:
                user_set = TwitterUserSet.objects.get(
                    name=options.get('set_name'))
            except TwitterUserSet.DoesNotExist:
                raise CommandError('TwitterUserSet %s does not exist' %
                                   options.get('set_name'))
        else:
            raise CommandError('please specify a twitter user or set name')

        if options.get('start_date', False):
            start_dt = make_date_aware(options.get('start_date'))
            if not start_dt:
                raise CommandError('dates must be in the format YYYY-MM-DD')
        else:
            start_dt = None
        if options.get('end_date', False):
            end_dt = make_date_aware(options.get('end_date'))
            if not end_dt:
                raise CommandError('dates must be in the format YYYY-MM-DD')
        else:
            end_dt = None
        if start_dt and end_dt:
            if end_dt < start_dt:
                raise CommandError('start date must be earlier than end date')

        if twitter_user:
            qs = twitter_user.items.all()
        elif user_set:
            qs = TwitterUserItem.objects.filter(
                twitter_user__sets__in=[user_set])

        if start_dt:
            qs = qs.filter(date_published__gte=start_dt)
        if end_dt:
            qs = qs.filter(date_published__lte=end_dt)

        for tui in qs:
            print '\t'.join(tui.csv)
Example #4
    def handle(self, *args, **options):
        twitter_user = user_set = start_dt = end_dt = fmt = filename = None
        fmt = options['format'].lower()
        if fmt not in ['csv', 'json', 'xls']:
            raise CommandError("format must be either csv, json or xls")
        if options['filename']:
            filename = options.get('filename')
        if fmt == 'xls' and filename is None:
            raise CommandError("When --format is xls, \
--filename=FILENAME is required")
        if options['twitter_user']:
            try:
                twitter_user = TwitterUser.objects.get(
                    name__iexact=options.get('twitter_user'))
                qs = twitter_user.items.all()
            except TwitterUser.DoesNotExist:
                raise CommandError('TwitterUser %s does not exist' %
                                   options.get('twitter_user'))
        elif options['set_name']:
            try:
                user_set = TwitterUserSet.objects.get(
                    name=options.get('set_name'))
                qs = TwitterUserItem.objects.filter(
                    twitter_user__sets__in=[user_set])
            except TwitterUserSet.DoesNotExist:
                raise CommandError('TwitterUserSet %s does not exist' %
                                   options['set_name'])
        else:
            raise CommandError('please provide either twitteruser or setname')
        if options['start_date']:
            start_dt = make_date_aware(options.get('start_date'))
            if not start_dt:
                raise CommandError('dates must be in the format YYYY-MM-DD')
            qs = qs.filter(date_published__gte=start_dt)
        if options['end_date']:
            end_dt = make_date_aware(options.get('end_date'))
            if not end_dt:
                raise CommandError('dates must be in the format YYYY-MM-DD')
            qs = qs.filter(date_published__lte=end_dt)
        if start_dt and end_dt:
            if end_dt < start_dt:
                raise CommandError('start date must be earlier than end date')

        # tweak for python 2.7 to avoid having to set PYTHONIOENCODING=utf8
        # in environment, see Graham Fawcett's comment/suggestion at:
        #   nedbatchelder.com/blog/200401/printing_unicode_from_python.html
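        # send print output to the named file when --filename is given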
        if filename:
            sys.stdout = codecs.open(filename, 'w', 'utf-8')
        else:
            writer_class = codecs.getwriter('utf-8')
            sys.stdout = writer_class(sys.stdout, 'replace')

        if fmt == 'xls':
            tworkbook = xls_tweets_workbook(qs, TwitterUserItem.csv_headers)
            tworkbook.save(filename)
        elif fmt == 'json':
            for tui in qs:
                print tui.item_json
        else:
            print '\t'.join(TwitterUserItem.csv_headers)
            for tui in qs:
                print '\t'.join(tui.csv)
Example #5
    def handle(self, *args, **options):
        twitter_user = None
        start_dt = None
        end_dt = None
        if options['twitter_user']:
            try:
                twitter_user = TwitterUser.objects.get(
                    name=options['twitter_user'])
            except TwitterUser.DoesNotExist:
                raise CommandError('TwitterUser %s does not exist' %
                                   options['twitter_user'])

        if options['start_date']:
            start_dt = make_date_aware(options['start_date'])
            if not start_dt:
                raise CommandError('dates must be in the format YYYY-MM-DD')
        else:
            start_dt = None
        if options['end_date']:
            end_dt = make_date_aware(options['end_date'])
            if not end_dt:
                raise CommandError('dates must be in the format YYYY-MM-DD')
        else:
            end_dt = None
        if start_dt and end_dt:
            if end_dt < start_dt:
                raise CommandError('start date must be earlier than end date')

        if twitter_user:
            qs = twitter_user.items.all()
        else:
            qs = TwitterUserItem.objects.all()

        if not options['refetch']:
            qs = qs.filter(urls__isnull=True)

        if start_dt:
            qs = qs.filter(date_published__gte=start_dt)
        if end_dt:
            qs = qs.filter(date_published__lte=end_dt)

        qs = queryset_iterator(qs)

        count = 0
        for tui in qs:
            urls = []
            urls.extend(tui.tweet['entities']['urls'])
            if 'media' in tui.tweet['entities'].keys():
                urls.extend(tui.tweet['entities']['media'])
            if not urls:
                # use of entities.urls was spotty at first
                for u in tui.links:
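                    # skip truncated ('...') links in retweeted items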
                    if ('...' in unicodedata.normalize('NFKD', u).encode(
                            'ascii', 'ignore')
                            and tui.tweet['retweet_count'] > 0):
                        continue
                    urls.append({'url': u, 'expanded_url': u})
            for url in urls:
                try:
                    r = requests.head(url['expanded_url'],
                                      allow_redirects=True,
                                      timeout=10)
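                    # fall back to GET if the server rejects HEAD (405)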
                    if r.status_code == 405:
                        r = requests.get(url['expanded_url'],
                                         allow_redirects=True,
                                         stream=True, timeout=10)
                        r.close()
                    req_history_headers = []
                    for req in r.history:
                        req_headers = self.decode_headers(req.headers, req.encoding)

                        req_history_headers.append((
                            req.status_code,
                            req.url,
                            req_headers))

                    final_req_headers = self.decode_headers(r.headers, r.encoding)

                    tuiu = TwitterUserItemUrl(
                        item=tui,
                        start_url=url['url'],
                        expanded_url=url['expanded_url'],
                        history=json.dumps(req_history_headers),
                        final_url=r.url,
                        final_status=r.status_code,
                        final_headers=json.dumps(final_req_headers),
                        duration_seconds=r.elapsed.total_seconds())
                    tuiu.save()
                except (requests.RequestException) as e:
                    # TODO: consider trapping/recording
                    # requests.exceptions.ConnectionError,
                    # requests.exceptions.TooManyRedirects etc.
                    # and flagging records as having errored out
                    print("Request Exceptions Error fetching %s: %s" % (url['expanded_url'].encode('utf-8'), e))
                except (requests.packages.urllib3.exceptions.HTTPError) as e:
                    print("HTTPError fetching %s: %s" % (url['expanded_url'].encode('utf-8'), e))
                except (socket_error) as e:
                    print("Socket error fetching %s: %s" % (url['expanded_url'].encode('utf-8'), e))

                    tuiu = TwitterUserItemUrl(
                        item=tui,
                        start_url=url['url'],
                        expanded_url=url['expanded_url'],
                        final_url=url['url'],
                        final_status=410)
                    tuiu.save()

            if urls:
                count += 1
            if options['limit']:
                if count >= options['limit']:
                    sys.exit()
Example #6
    def handle(self, *args, **options):
        twitter_user = None
        start_dt = None
        end_dt = None
        if options['twitter_user']:
            try:
                twitter_user = TwitterUser.objects.get(
                    name=options['twitter_user'])
            except TwitterUser.DoesNotExist:
                raise CommandError('TwitterUser %s does not exist' %
                                   options['twitter_user'])

        if options['start_date']:
            start_dt = make_date_aware(options['start_date'])
            if not start_dt:
                raise CommandError('dates must be in the format YYYY-MM-DD')
        else:
            start_dt = None
        if options['end_date']:
            end_dt = make_date_aware(options['end_date'])
            if not end_dt:
                raise CommandError('dates must be in the format YYYY-MM-DD')
        else:
            end_dt = None
        if start_dt and end_dt:
            if end_dt < start_dt:
                raise CommandError('start date must be earlier than end date')

        if twitter_user:
            qs = twitter_user.items.all()
        else:
            qs = TwitterUserItem.objects.all()

        if not options['refetch']:
            qs = qs.filter(urls__isnull=True)

        if start_dt:
            qs = qs.filter(date_published__gte=start_dt)
        if end_dt:
            qs = qs.filter(date_published__lte=end_dt)

        qs = queryset_iterator(qs)

        count = 0
        for tui in qs:
            urls = []
            urls.extend(tui.tweet['entities']['urls'])
            if 'media' in tui.tweet['entities'].keys():
                urls.extend(tui.tweet['entities']['media'])
            if not urls:
                # use of entities.urls was spotty at first
                for u in tui.links:
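                    # skip truncated ('...') links in retweeted items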
                    if ('...' in unicodedata.normalize('NFKD', u).encode(
                            'ascii', 'ignore')
                            and tui.tweet['retweet_count'] > 0):
                        continue
                    urls.append({'url': u, 'expanded_url': u})
            for url in urls:
                try:
                    r = requests.head(url['expanded_url'],
                                      allow_redirects=True,
                                      timeout=10)
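                    # fall back to GET if the server rejects HEAD (405)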
                    if r.status_code == 405:
                        r = requests.get(url['expanded_url'],
                                         allow_redirects=True,
                                         stream=True,
                                         timeout=10)
                        r.close()
                    req_history_headers = []
                    for req in r.history:
                        req_headers = self.decode_headers(
                            req.headers, req.encoding)

                        req_history_headers.append(
                            (req.status_code, req.url, req_headers))

                    final_req_headers = self.decode_headers(
                        r.headers, r.encoding)

                    tuiu = TwitterUserItemUrl(
                        item=tui,
                        start_url=url['url'],
                        expanded_url=url['expanded_url'],
                        history=json.dumps(req_history_headers),
                        final_url=r.url,
                        final_status=r.status_code,
                        final_headers=json.dumps(final_req_headers),
                        duration_seconds=r.elapsed.total_seconds())
                    tuiu.save()
                except (requests.RequestException) as e:
                    # TODO: consider trapping/recording
                    # requests.exceptions.ConnectionError,
                    # requests.exceptions.TooManyRedirects etc.
                    # and flagging records as having errored out
                    print("Request Exceptions Error fetching %s: %s" %
                          (url['expanded_url'].encode('utf-8'), e))
                except (requests.packages.urllib3.exceptions.HTTPError) as e:
                    print("HTTPError fetching %s: %s" %
                          (url['expanded_url'].encode('utf-8'), e))
                except (socket_error) as e:
                    print("Socket error fetching %s: %s" %
                          (url['expanded_url'].encode('utf-8'), e))

                    tuiu = TwitterUserItemUrl(item=tui,
                                              start_url=url['url'],
                                              expanded_url=url['expanded_url'],
                                              final_url=url['url'],
                                              final_status=410)
                    tuiu.save()

            if urls:
                count += 1
            if options['limit']:
                if count >= options['limit']:
                    sys.exit()
Example #7
    def handle(self, *args, **options):
        twitter_user = None
        start_dt = None
        end_dt = None
        if options['twitter_user']:
            try:
                twitter_user = TwitterUser.objects.get(
                    name=options['twitter_user'])
            except TwitterUser.DoesNotExist:
                raise CommandError('TwitterUser %s does not exist' %
                                   options['twitter_user'])

        if options['start_date']:
            start_dt = make_date_aware(options['start_date'])
            if not start_dt:
                raise CommandError('dates must be in the format YYYY-MM-DD')
        else:
            start_dt = None
        if options['end_date']:
            end_dt = make_date_aware(options['end_date'])
            if not end_dt:
                raise CommandError('dates must be in the format YYYY-MM-DD')
        else:
            end_dt = None
        if start_dt and end_dt:
            if end_dt < start_dt:
                raise CommandError('start date must be earlier than end date')

        if twitter_user:
            qs = twitter_user.items.all()
        else:
            qs = TwitterUserItem.objects.all()

        if start_dt:
            qs = qs.filter(date_published__gte=start_dt)
        if end_dt:
            qs = qs.filter(date_published__lte=end_dt)

        # be sure we move through the list in a consistent order
        qs = qs.order_by('date_published')

        session = requests.Session()
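        # the session is reused for every request below so connections are pooled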
        count = 0
        for tui in qs:
            urls = []
            urls.extend(tui.tweet['entities']['urls'])
            if not urls:
                # use of entities.urls was spotty at first
                for u in tui.links:
                    urls.append({'url': u, 'expanded_url': u})
            for url in urls:
                # use filter because 0-to-many might already exist
                qs_tuiu = TwitterUserItemUrl.objects.filter(
                    item=tui,
                    start_url=url['url'],
                    expanded_url=url['expanded_url'])
                # if any already exist, and we're not refetching, move on
                if qs_tuiu.count() > 0 and not options['refetch']:
                    continue
                # otherwise, create a new one from scratch
                try:
                    r = session.get(url['url'], allow_redirects=True,
                                    stream=False)
                    r.close()
                except Exception:
                    # TODO: consider trapping/recording
                    # requests.exceptions.ConnectionError,
                    # requests.exceptions.TooManyRedirects etc.
                    # and flagging records as having errored out
                    tuiu = TwitterUserItemUrl(
                        item=tui,
                        start_url=url['url'],
                        expanded_url=url['url'],
                        final_url=url['url'],
                        final_status=410)
                    tuiu.save()
                    continue
                tuiu = TwitterUserItemUrl(
                    item=tui,
                    start_url=url['url'],
                    expanded_url=url['expanded_url'],
                    history=json.dumps([(
                        req.status_code, req.url, dict(req.headers))
                        for req in r.history]),
                    final_url=r.url,
                    final_status=r.status_code,
                    final_headers=json.dumps(dict(r.headers)),
                    duration_seconds=r.elapsed.total_seconds())
                tuiu.save()
            count += 1
            if options['limit']:
                if count >= options['limit']:
                    sys.exit()