Ejemplo n.º 1
0
    def on_finished(self, data, args):
        """Store crawled user tweets in DynamoDB and copy their media to S3.

        Every element of ``data`` is converted with ``format_data`` and then
        enriched with ``screen_name`` (from ``args``), ``created_at`` (the
        integer part of ``d.created_at.timestamp()``) and ``id``. When the
        formatted item lists ``media`` under ``entities``, each media file is
        downloaded to ``/tmp``, uploaded to S3 and the S3 paths are recorded
        under ``s3_media_urls``. All items are finally written to the table
        named by ``AWSConfig.DYNAMODB_TWITTER_USER_TWEET_TABLE_NAME``.

        Media handling is best-effort: if any file of an item fails, the error
        is printed, ``s3_media_urls`` is not set for that item, and the item is
        still stored.

        Args:
            data: Crawled objects exposing ``_json``, ``created_at`` and ``id``.
            args: Crawl arguments; ``args["screen_name"]`` is read.
        """
        Logger.d(TAG, 'on_finished')
        print(args)
        print(f'len(data) : {len(data)}')

        items = [format_data(item._json) for item in data]
        for item, d in zip(items, data):
            item.update({'screen_name': args["screen_name"]})
            item['created_at'] = int(d.created_at.timestamp())
            item['id'] = d.id

            if 'media' not in item['entities']:
                continue

            try:
                s3_media_urls = []
                for media in item['entities']['media']:
                    media_url = media['media_url']
                    filename = media_url.split("/")[-1]
                    local_tmp_filepath = f'/tmp/{filename}'
                    s3_filepath = os.path.join(
                        DataLocationConfig.TWITTER_MEDIAFILE_DIR,
                        args["screen_name"], filename)
                    try:
                        urllib.request.urlretrieve(media_url,
                                                   local_tmp_filepath)
                        S3.save_file(local_tmp_filepath, s3_filepath, 'i-app')
                    finally:
                        # Always clean up the temp file, even when the
                        # download or upload fails; the existence check keeps
                        # a failed download from masking the original error.
                        if os.path.exists(local_tmp_filepath):
                            os.remove(local_tmp_filepath)
                    Logger.d(TAG, f'Uploaded media file to {s3_filepath}')
                    s3_media_urls.append(s3_filepath)
                item['s3_media_urls'] = s3_media_urls
            except Exception as e:
                # Deliberate best-effort: a media failure must not prevent the
                # tweet itself from being stored.
                print(e)

        DynamoDB.put_items(
            AWSConfig.DYNAMODB_TWITTER_USER_TWEET_TABLE_NAME,
            items,
        )
    def on_finished(self, data, args):
        """Cast each record's ``company_code`` to int and store the records.

        Logs the crawl window (``args["start_dt"]`` / ``args["end_dt"]``) and
        the number of records, converts ``company_code`` in place, writes
        ``data`` to the company-announcement DynamoDB table and prints a
        separator line.

        Args:
            data: Mutable mappings, each holding a ``company_code`` entry.
            args: Crawl arguments containing ``start_dt`` and ``end_dt``.
        """
        log_tag = 'company_announcement_crawl : on_finished'
        Logger.i(log_tag, f'{args["start_dt"]} : {args["end_dt"]}')
        Logger.i(log_tag, len(data))

        for record in data:
            record['company_code'] = int(record['company_code'])

        table_name = AWSConfig.DYNAMODB_COMPANY_ANNOUNCEMENT_TABLE_NAME
        DynamoDB.put_items(table_name, data)
        print('=' * 100)
Ejemplo n.º 3
0
    def on_finished(self, data, args):
        """Format keyword-search results and write them to DynamoDB.

        Each element of ``data`` is converted with ``format_data``, tagged
        with ``args["keyword"]`` and given an integer ``created_at`` taken
        from ``created_at.timestamp()`` of the source object.

        Args:
            data: Crawled objects exposing ``_json`` and ``created_at``.
            args: Crawl arguments; ``args["keyword"]`` is read.
        """
        Logger.d(TAG, f'on_finished : {args["keyword"]} : {len(data)}')
        print('=' * 100)

        items = [format_data(source._json) for source in data]
        for formatted, source in zip(items, data):
            formatted['keyword'] = args["keyword"]
            formatted['created_at'] = int(source.created_at.timestamp())

        table_name = AWSConfig.DYNAMODB_TWITTER_TABLE_NAME
        DynamoDB.put_items(table_name, items)
Ejemplo n.º 4
0
    def on_finished(self, data, args):
        """Make trend records storable and write them to DynamoDB.

        For every record, in place:
          * ``datetime_keyword`` becomes ``<YYYYmmdd_HHMMSS>_<keyword>``,
          * ``datetime`` is replaced by the integer part of its ``timestamp()``,
          * ``date`` is replaced by its ``YYYY-MM-DD`` string.

        Args:
            data: Mutable mappings with ``datetime``, ``date`` and ``keyword``.
            args: Crawl arguments (not used here).
        """
        Logger.d(TAG, 'on_finished')
        Logger.d(TAG, '=' * 100)

        for record in data:
            moment = record['datetime']
            # Build the composite key before ``datetime`` is overwritten.
            record['datetime_keyword'] = '_'.join(
                (moment.strftime('%Y%m%d_%H%M%S'), record['keyword']))
            record['datetime'] = int(moment.timestamp())
            day = record['date']
            record['date'] = day.strftime("%Y-%m-%d")

        table_name = AWSConfig.DYNAMODB_TWITTER_TREND_TABLE_NAME
        DynamoDB.put_items(table_name, data)
Ejemplo n.º 5
0
    def on_finished(self, data, args):
        """Store announcements, fetch their PDFs and remember the batch.

        Converts ``company_code`` to int in place, writes ``data`` to the
        company-announcement DynamoDB table, calls ``download_pdf_to_S3`` for
        every record with its ``document_url`` and the date part of
        ``pubdate`` (dashes removed), then assigns ``data`` to the
        module-level ``g_data``.

        Args:
            data: Mutable mappings with ``company_code``, ``document_url``
                and ``pubdate`` entries.
            args: Crawl arguments containing ``start_dt`` and ``end_dt``.
        """
        global g_data

        log_tag = 'company_announcement_crawl : on_finished'
        Logger.i(log_tag, f'{args["start_dt"]} : {args["end_dt"]}')
        Logger.i(log_tag, len(data))

        for record in data:
            record['company_code'] = int(record['company_code'])

        table_name = AWSConfig.DYNAMODB_COMPANY_ANNOUNCEMENT_TABLE_NAME
        DynamoDB.put_items(table_name, data)

        for record in data:
            document_url = record['document_url']
            # ``pubdate`` is split on the first space; only the leading part is used.
            published_day = record['pubdate'].split(' ')[0]
            download_pdf_to_S3(document_url, published_day.replace('-', ''))

        g_data = data
        print('=' * 100)
Ejemplo n.º 6
0
    def on_finished(self, data, args):
        """Tag news entries with topic and publish date, then store them.

        For every entry, in place: ``published_date`` is derived from the
        first six fields of ``published_parsed`` and formatted as
        ``YYYY-MM-DD``, ``published_parsed`` is removed, and ``topic`` is set
        from ``args``.

        Args:
            data: Mutable mappings, each holding ``published_parsed``.
            args: Crawl arguments; ``args["topic"]`` is read.
        """
        topic = args["topic"]
        Logger.d(TAG, f'on_finished : {topic}')
        Logger.d(TAG, '=' * 100)

        for entry in data:
            parsed = entry['published_parsed']
            published = datetime(*parsed[:6])
            entry['published_date'] = published.strftime("%Y-%m-%d")
            del entry['published_parsed']
            entry['topic'] = topic

        print(f'len(data) : {len(data)}')
        table_name = AWSConfig.DYNAMODB_GOOGLE_RSS_NEWS_TABLE_NAME
        DynamoDB.put_items(table_name, data)
Ejemplo n.º 7
0
    def on_finished(self, data, args):
        """Turn trend keywords into records and write them to DynamoDB.

        One record is built per value of ``data[0].tolist()``; each carries
        the keyword plus ``args['datetime']`` formatted as a date
        (``YYYY-MM-DD``) and as a full ``YYYY-MM-DD HH:MM:SS`` string.

        Args:
            data: Object whose ``data[0]`` offers ``tolist()``.
            args: Crawl arguments; ``args['datetime']`` is read.
        """

        def build_record(keyword):
            # Format the crawl time per record, as one date and one datetime string.
            return {
                'keyword': keyword,
                'date': args['datetime'].strftime('%Y-%m-%d'),
                'datetime': args['datetime'].strftime('%Y-%m-%d %H:%M:%S'),
            }

        json_data_list = [build_record(kw) for kw in data[0].tolist()]

        Logger.d(TAG, f'on_finished : len(data) : {len(json_data_list)}')
        table_name = AWSConfig.DYNAMODB_GOOGLE_TREND_NAME
        DynamoDB.put_items(table_name, json_data_list)
def main():
    """Query the tweet table for one fixed keyword and print creation times.

    Runs a partition-key query on ``finapp_twitter_tweet`` and prints each
    result's ``created_at`` converted with ``dt.fromtimestamp``.
    """
    query = {
        'table_name': 'finapp_twitter_tweet',
        'partition_key_name': 'keyword',
        'partition_key': 'ロジザード',
    }
    rows = DynamoDB.partitionkey_query(**query)
    created_times = [dt.fromtimestamp(row['created_at']) for row in rows]
    print(created_times)
Ejemplo n.º 9
0
    def on_finished(self, data, args):
        """Store only announcements whose ``id`` is not already in ``g_data``.

        Converts ``company_code`` to int in place, drops every record whose
        ``id`` appears in the module-level ``g_data``, writes the remaining
        records to the company-announcement DynamoDB table and calls
        ``download_pdf_to_S3`` for each of them. ``g_data`` is reset to an
        empty list afterwards.

        Args:
            data: Mutable mappings with ``id``, ``company_code``,
                ``document_url`` and ``pubdate`` entries.
            args: Crawl arguments containing ``start_dt`` and ``end_dt``.
        """
        global g_data

        Logger.i('company_announcement_crawl : on_finished', f'{args["start_dt"]} : {args["end_dt"]}')
        Logger.i('company_announcement_crawl : on_finished', len(data))

        for d in data:
            d['company_code'] = int(d['company_code'])

        # A set makes each membership test O(1) instead of scanning a list
        # for every record.
        known_ids = {d['id'] for d in g_data}
        filtered_data = [d for d in data if d['id'] not in known_ids]

        Logger.i('company_announcement_crawl : on_finished, filtered_data len => ', len(filtered_data))
        DynamoDB.put_items(
            AWSConfig.DYNAMODB_COMPANY_ANNOUNCEMENT_TABLE_NAME,
            filtered_data,
        )
        for d in filtered_data:
            # Only the part of ``pubdate`` before the first space is used.
            download_pdf_to_S3(d['document_url'], d['pubdate'].split(' ')[0].replace('-', ''))

        g_data = []
        print('='*100)