Example #1
    def _stream_tweets_by_user_id(self, id_, **kwargs):
        # TODO rework this to use min/max tweets instead of assuming < 200
        # means done
        kwargs = dict(
            user_id=id_,
            count=200
        )

        # TODO consider breaking up/refactoring
        while True:
            try:
                l.INFO("Fetching 200 tweets %s" % (kwargs))
                tweets = self.api.GetUserTimeline(**kwargs)

            except Exception as e:
                l.WARN("%s kwargs %s" % (e, kwargs))
                return None

            l.INFO("Streaming tweets")
            for tweet in tweets:
                self.on_tweet(tweet)

            if len(tweets) < 200:
                # TODO Fix - Using <200 as proxy for end of user timeline
                l.INFO("Stream ended < 200 tweets")
                break

            tweet_ids = [tweet.id for tweet in tweets]
            if len(tweet_ids) > 0:
                # Next request start at oldest tweet in current request
                l.INFO("Setting max ID: {}".format(min(tweet_ids)))
                kwargs['max_id'] = min(tweet_ids)
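
The class that owns _stream_tweets_by_user_id is not shown in this example. Below is a minimal sketch of how it might be wired up, assuming python-twitter's twitter.Api client (whose GetUserTimeline accepts user_id, count and max_id) and an on_tweet callback; the TweetStreamer name and constructor here are assumptions for illustration, not the original code.

import twitter

# Hypothetical wrapper; only the attributes the method above relies on.
class TweetStreamer:
    def __init__(self, api, on_tweet):
        self.api = api            # twitter.Api client used for GetUserTimeline
        self.on_tweet = on_tweet  # callback invoked once per fetched tweet

    # _stream_tweets_by_user_id would be defined here, exactly as above.

api = twitter.Api(consumer_key="...", consumer_secret="...",
                  access_token_key="...", access_token_secret="...")
streamer = TweetStreamer(api, on_tweet=lambda tweet: print(tweet.id, tweet.text))
streamer._stream_tweets_by_user_id(12345)  # placeholder user id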
Example #2
    def on_page_source(self):
        # Each "rg_meta" div carries a JSON blob describing one image result.
        soup = BeautifulSoup(self.page_source, "html.parser")
        elements = soup.find_all("div", {"class": "rg_meta"})

        for element in elements:
            meta_data_str = element.text
            try:
                meta_data = json.loads(meta_data_str)
                self.on_entry(meta_data)
            except Exception as e:
                l.WARN(e)
Example #3
def cli(ctx, config):
    csv_config = {}

    try:
        # open(None) raises TypeError when no config path is supplied, in
        # which case the defaults are used instead.
        with open(config, 'r') as config_file:
            csv_config = json.load(config_file)
            l.INFO("Using custom CSV configuration: %s" % (csv_config))
    except TypeError:
        l.WARN("Using default CSV configuration: %s" % (CSV_DEFAULT_CONFIG))

    input_ = click.get_text_stream('stdin')
    convert(input_, configuration=csv_config)
Example #4
def cli(ctx, config):
    s3_config = {}

    try:
        # open(None) raises TypeError when no config path is supplied.
        with open(config, 'r') as config_file:
            s3_config = json.load(config_file)
            l.INFO("Using custom S3 configuration: %s" % (s3_config))
    except TypeError:
        l.WARN("Unable to parse s3 config")

    input_ = click.get_text_stream('stdin')
    convert(input_, configuration=s3_config)
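
Example #5 below reads aws_access_key_id, aws_secret_access_key, bucket and an optional output_file from this configuration, so the JSON file handed to this CLI would look roughly as follows; the values and the file name are placeholders, not taken from the original project.

import json

# Illustrative config only; keys mirror what convert() in Example #5 reads.
s3_config = {
    "aws_access_key_id": "AKIA...",     # placeholder credential
    "aws_secret_access_key": "...",     # placeholder credential
    "bucket": "my-bucket",              # destination S3 bucket
    "output_file": "records.json"       # optional; defaults to <stream>.json
}

with open("s3_config.json", "w") as config_file:
    json.dump(s3_config, config_file, indent=2)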
Example #5
def convert(lines, configuration):
    access_key_id = str(configuration['aws_access_key_id'])
    secret_access_key = str(configuration['aws_secret_access_key'])
    bucket = str(configuration['bucket'])
    cfg_filename = str(configuration.get('output_file', ''))

    s3_client = S3Client(access_key_id, secret_access_key)

    targets = {}

    for line in lines:
        try:
            data = json.loads(line)
        except Exception as e:
            raise Exception(errors.PARSING_ERROR % (line, e))

        if 'type' not in data:
            raise Exception(errors.MISSING_KEY_ERROR % ('type', line))

        data_type = data['type']

        if data_type == 'RECORD':
            if 'stream' not in data:
                raise Exception(errors.MISSING_KEY_ERROR % ('stream', line))

            filename = cfg_filename
            if filename == "":
                filename = data['stream'] + '.json'

            target_path = ('s3://{bucket}/{filename}'.format(
                bucket=bucket, filename=filename))

            record = data['record']

            # Lazily open one S3 target (and writable file handle) per
            # destination path; subsequent records reuse the open handle.
            if target_path not in targets:
                target = S3Target(target_path, client=s3_client)
                targets[target_path] = {
                    'target': target,
                    'file': target.open('w')
                }

            targets[target_path]['file'].write(json.dumps(record) + '\n')

        else:
            l.WARN(errors.UNEXPECTED_MESSAGE_TYPE % (data['type'], data))

    for target_path in targets:
        targets[target_path]['file'].close()
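
The lines this function consumes must carry a 'type' key, and RECORD messages additionally need 'stream' and 'record'. A minimal sketch of such an input line, with a made-up stream name and record.

import json

# Hypothetical input; only the keys checked by convert() are required.
line = json.dumps({
    "type": "RECORD",
    "stream": "users",                  # used as <stream>.json when no output_file is set
    "record": {"id": 1, "name": "Ada"}  # written to the S3 target as one JSON line
})

# convert() expects an iterable of such lines plus the configuration, e.g.:
# convert([line], configuration=s3_config)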
Example #6
    def update_page_source(self):
        url = self.build_search_url()

        driver = Chrome()
        driver.get(url)

        num_scrolls = 0
        try:
            # Scroll repeatedly so lazily loaded results are rendered,
            # capturing the page source and pausing briefly after each scroll.
            while num_scrolls < self.scroll_max:
                driver.execute_script(random_js_scroll())
                self.page_source = driver.page_source
                random_sleep()
                num_scrolls += 1

        except Exception as e:
            l.WARN(e)

        driver.close()
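
random_js_scroll and random_sleep come from the surrounding module and are not shown here. The sketch below is only a guess at their shape, not the original implementations: a snippet for execute_script that scrolls by a random amount, and a randomized pause.

import random
import time

def random_js_scroll():
    # JavaScript snippet that scrolls the window by a random amount.
    return "window.scrollBy(0, %d);" % random.randint(200, 1000)

def random_sleep(low=1.0, high=3.0):
    # Pause for a random interval between scrolls.
    time.sleep(random.uniform(low, high))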
Example #7
def convert(lines, configuration):
    cfg_filename = str(configuration.get('output_file', ""))
    delimiter = str(configuration.get('delimiter', ','))
    quotechar = str(configuration.get('quotechar', '"'))

    for line in lines:
        try:
            data = json.loads(line)
        except Exception as e:
            raise Exception(errors.PARSING_ERROR % (line, e))

        if 'type' not in data:
            raise Exception(errors.MISSING_KEY_ERROR % ('type', line))

        data_type = data['type']

        if data_type == 'RECORD':
            if 'stream' not in data:
                raise Exception(errors.MISSING_KEY_ERROR % ('stream', line))

            filename = cfg_filename
            if filename == "":
                filename = data['stream'] + '.csv'
            flattened_record = flatten(data['record'])
            header = flattened_record.keys()

            # Append to the CSV, writing the header row only when the file is
            # still empty.
            with open(filename, 'a') as output_file:
                writer = csv.DictWriter(output_file,
                                        header,
                                        extrasaction='ignore',
                                        delimiter=delimiter,
                                        quotechar=quotechar)

                if is_file_empty(filename):
                    writer.writeheader()
                writer.writerow(flattened_record)

        else:
            l.WARN(errors.UNEXPECTED_MESSAGE_TYPE % (data['type'], data))
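
The keys read at the top of this function (output_file, delimiter, quotechar) all have defaults, so a configuration only needs the ones being overridden. A sketch with placeholder values, shown as the Python dict that json.load in Example #3 would produce from such a file.

# Illustrative config only; every key is optional.
csv_config = {
    "delimiter": ";",           # defaults to ","
    "quotechar": "'",           # defaults to '"'
    "output_file": "out.csv"    # optional; defaults to <stream>.csv
}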
Example #8
    def _stream_tweets(self, user_id=None, screen_name=None, limit=3200):
        # TODO rework this to use min/max tweets instead of assuming < 200
        # means done
        kwargs = dict(
            # Pass through whichever identifier was supplied; both default to
            # None, in which case the authenticated user's timeline is fetched.
            user_id=user_id,
            screen_name=screen_name,
            count=200
        )
        tweets_gathered = 0

        while True:
            try:
                l.INFO("Fetching 200 tweets %s" % (kwargs))
                tweets = self.api.GetUserTimeline(**kwargs)
                tweets_gathered += len(tweets)

            except Exception as e:
                l.WARN("%s kwargs %s" % (e, kwargs))
                return None

            l.INFO("Streaming tweets")
            for tweet in tweets:
                self.on_tweet(tweet)

            if tweets_gathered >= limit:
                l.INFO("Per user limit hit {} tweets gathered".format(limit))
                break

            if len(tweets) < 200:
                # TODO Fix - Using <200 as proxy for end of user timeline
                l.INFO("Stream ended < 200 tweets")
                break

            tweet_ids = [tweet.id for tweet in tweets]
            if len(tweet_ids) > 0:
                # Next request start at oldest tweet in current request
                l.INFO("Setting max ID: {}".format(min(tweet_ids)))
                kwargs['max_id'] = min(tweet_ids)
Example #9
def cli(ctx, scroll_max, from_file, from_pipe):
    keywords = []

    if from_file is not None:
        reader = csv.reader(from_file)
        for row in reader:
            keywords.append(row[0])

    if from_pipe:
        try:
            stdin_text = (
                click.get_text_stream('stdin').read().strip()).split('\n')
            for line in stdin_text:
                keywords.append(line)
        except Exception as e:
            raise RuntimeError("Error while reading pipe: %s" % (e))

    if len(keywords) == 0:
        l.WARN("Nothing to search, got: %s" % (keywords))

    crawler = GoogleImageCrawler(task_cls=GoogleImageMetaDataLogger,
                                 queue_data=keywords,
                                 scroll_max=scroll_max)
    crawler.start()
Example #10
def insert_lines(collection, lines):

    for line in lines:
        try:
            data = json.loads(line)
        except Exception as e:
            raise Exception(PARSING_ERROR % (line, e))

        if 'type' not in data:
            raise Exception(MISSING_KEY_ERROR % ('type', line))

        data_type = data['type']

        if data_type == 'RECORD':
            if 'stream' not in data:
                raise Exception(MISSING_KEY_ERROR % ('stream', line))

            record = data['record']
            collection.insert_one(record)

        else:
            l.WARN("""
                   Unexpected message type %s in message %s
                   """ % (data['type'], data))