def check_nisra(secret, s3, notweet):
    indexkey = secret['nisra-deaths-index']
    # Get the previous data file list from S3
    status = S3_scraper_index(s3, secret['bucketname'], indexkey)
    previous = status.get_dict()
    previous = sorted(previous, key=lambda k: k['filedate'], reverse=True)
    # Check the NISRA site for file changes
    current, changes = check_for_nisra_files(s3, secret['bucketname'], previous)
    # Write any changes back to S3
    if len(changes) > 0:
        status.put_dict(current)
        message = 'Wrote %d items to %s, of which %d were changes' % (len(current), indexkey, len(changes))
        # If the most recent file has changed then tweet
        totweet = [c['index'] for c in changes if (c['change'] == 'added') or (c['index'] == 0)]
        if not notweet and (0 in totweet):
            print('Launching NISRA tweeter')
            launch_lambda_async(os.getenv('NISRA_TWEETER_LAMBDA'), [current[a] for a in totweet])
            message += ', and launched NISRA tweet lambda'
    else:
        message = 'Did nothing'
    return message
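# The S3_scraper_index and launch_lambda_async helpers used above are defined
# elsewhere in the repo. A minimal sketch of what they would need to do,
# assuming the index is stored as a JSON list in S3 (the implementation below
# is an assumption, not the repo's actual code):
import json
import boto3

class S3_scraper_index:
    def __init__(self, client, bucketname, keyname):
        self.client = client
        self.bucketname = bucketname
        self.keyname = keyname

    def get_dict(self):
        # Load the JSON index file from S3, returning an empty list if absent
        try:
            obj = self.client.get_object(Bucket=self.bucketname, Key=self.keyname)['Body']
        except self.client.exceptions.NoSuchKey:
            return []
        return json.loads(obj.read())

    def put_dict(self, data):
        # Write the index back to S3 as JSON
        self.client.put_object(Bucket=self.bucketname, Key=self.keyname, Body=json.dumps(data))

def launch_lambda_async(functionname, payload):
    # Fire-and-forget invocation so the scraper does not block on the tweeter
    lambda_client = boto3.client('lambda')
    lambda_client.invoke(FunctionName=functionname, InvocationType='Event', Payload=json.dumps(payload))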
def lambda_handler(event, context):
    # Get the secret
    sm = boto3.client('secretsmanager')
    secretobj = sm.get_secret_value(SecretId='ni-covid-tweets')
    secret = json.loads(secretobj['SecretString'])

    messages = []
    # Download the most recently updated PDF file
    s3 = boto3.client('s3')
    for change in event:
        tmp = tempfile.NamedTemporaryFile(suffix='.pdf')
        with open(tmp.name, 'wb') as fp:
            s3.download_fileobj(secret['bucketname'], change['keyname'], fp)
        text = textract.process(tmp.name, method='pdfminer').decode('utf-8')
        first = True
        regex = re.compile(r'^Current estimate of Rt \((.*)\):\s+(.*)$')
        tweet = 'R estimates by Northern Ireland DoH on '
        for line in text.split('\n'):
            m = regex.match(line)
            if first is True:
                tweet += '%s\n\n' % line
            elif m:
                tweet += '\u2022 %s: %s\n' % (m.group(1), m.group(2))
            first = False
        tweet += '\n%s' % change['url']

        if change.get('notweet') is not True:
            api = TwitterAPI(secret['twitter_apikey'], secret['twitter_apisecretkey'], secret['twitter_accesstoken'], secret['twitter_accesstokensecret'])
            resp = api.tweet(tweet)
            # Download and update the index
            status = S3_scraper_index(s3, secret['bucketname'], secret['doh-r-index'])
            index = status.get_dict()
            for i in range(len(index)):
                if index[i]['filedate'] == change['filedate']:
                    index[i]['tweet'] = resp.id
                    break
            status.put_dict(index)
            messages.append('Tweeted ID %s and updated %s' % (resp.id, secret['doh-r-index']))
        else:
            print(tweet)
            messages.append('Did not tweet')

    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": messages,
        }),
    }
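# The TwitterAPI wrapper is also defined elsewhere in the repo. A minimal
# sketch of the interface the handlers rely on (tweet/dm/upload returning
# objects with .id / .media_id), built here on tweepy purely as an
# illustrative assumption:
import tweepy

class TwitterAPI:
    def __init__(self, apikey, apisecretkey, accesstoken, accesstokensecret):
        auth = tweepy.OAuth1UserHandler(apikey, apisecretkey, accesstoken, accesstokensecret)
        self.api = tweepy.API(auth)

    def tweet(self, text, resp_id=None, media_ids=None):
        # Post a tweet, optionally with media and/or as a reply
        kwargs = {}
        if media_ids:
            kwargs['media_ids'] = media_ids
        if resp_id:
            kwargs['in_reply_to_status_id'] = resp_id
        return self.api.update_status(status=text, **kwargs)

    def upload(self, stream, filename):
        # Upload a single image from an in-memory buffer
        return self.api.media_upload(filename=filename, file=stream)

    def upload_multiple(self, plots):
        # Upload each plot and collect the media IDs for attachment
        return [self.upload(p['store'], p['name']).media_id for p in plots]

    def dm(self, recipient_id, text, media_id=None):
        # Send a direct message, used for test tweets
        kwargs = {}
        if media_id:
            kwargs['attachment_type'] = 'media'
            kwargs['attachment_media_id'] = media_id
        return self.api.send_direct_message(recipient_id=recipient_id, text=text, **kwargs)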
def check_doh(secret, s3, notweet, mode):
    if mode == 'dd':
        indexkey = secret['doh-dd-index']
        lambdaname = os.getenv('TWEETER_LAMBDA')
    else:
        indexkey = secret['doh-r-index']
        lambdaname = os.getenv('R_TWEETER_LAMBDA')
    # Get the previous data file list from S3
    status = S3_scraper_index(s3, secret['bucketname'], indexkey)
    previous = status.get_dict()
    previous = sorted(previous, key=lambda k: k['filedate'], reverse=True)
    # Check the DoH site for file changes
    if mode == 'dd':
        current, changes = check_for_dd_files(s3, secret['bucketname'], previous, int(secret['doh-dd-files-to-check']), store=(not notweet))
    else:
        current, changes = check_for_r_files(s3, secret['bucketname'], previous)
    # Write any changes back to S3
    if len(changes) > 0:
        status.put_dict(current)
        message = 'Wrote %d items to %s, of which %d were changes' % (len(current), indexkey, len(changes))
        # If the most recent file has changed then tweet
        totweet = [c['index'] for c in changes if (c['change'] == 'added') or (c['index'] == 0)]
        if not notweet and (0 in totweet):
            print('Launching %s tweeter' % mode)
            launch_lambda_async(lambdaname, [current[a] for a in totweet])
            message += ', and launched %s tweet lambda' % mode
    else:
        message = 'Did nothing'
    return message
def get_all_doh(secret, s3):
    indexkey = secret['doh-dd-index']
    # Get the previous data file list from S3
    status = S3_scraper_index(s3, secret['bucketname'], indexkey)
    previous = status.get_dict()
    previous = sorted(previous, key=lambda k: k['filedate'], reverse=True)
    # Check the DoH site for file changes
    current, changes = check_for_dd_files(s3, secret['bucketname'], previous, 0)
    # Write any changes back to S3
    if len(changes) > 0:
        status.put_dict(current)
        message = 'Wrote %d items to %s, of which %d were changes' % (len(current), indexkey, len(changes))
    else:
        message = 'Did nothing'
    return message
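# A sketch of how a scraper lambda_handler might dispatch to the check
# functions above; the event shape ({'mode': 'nisra'|'dd'|'r'}) and handler
# name are assumptions, not the repo's actual entry point:
def scraper_handler(event, context):
    sm = boto3.client('secretsmanager')
    secretobj = sm.get_secret_value(SecretId='ni-covid-tweets')
    secret = json.loads(secretobj['SecretString'])
    s3 = boto3.client('s3')
    notweet = event.get('notweet', False)
    if event.get('mode') == 'nisra':
        message = check_nisra(secret, s3, notweet)
    else:
        message = check_doh(secret, s3, notweet, event.get('mode', 'dd'))
    return {"statusCode": 200, "body": json.dumps({"message": message})}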
def lambda_handler(event, context):
    # Get the secret
    sm = boto3.client('secretsmanager')
    secretobj = sm.get_secret_value(SecretId='ni-covid-tweets')
    secret = json.loads(secretobj['SecretString'])

    tweets = []
    # Download the most recently updated Excel file
    s3 = boto3.client('s3')
    for change in event:
        obj = s3.get_object(Bucket=secret['bucketname'], Key=change['keyname'])['Body']
        stream = io.BytesIO(obj.read())
        # Load test data and add extra fields
        df = pandas.read_excel(stream, engine='openpyxl', sheet_name='Table 7', header=3)
        df.dropna(axis='columns', how='all', inplace=True)
        df.rename(columns=colclean, inplace=True)
        df.dropna(axis='index', subset=['Total'], inplace=True)
        # Get the latest dates with values for tests and rolling
        df['date'] = pandas.to_datetime(df['Week Ending'], format='%d/%m/%Y')
        df.sort_values('date', inplace=True)
        latest = df.iloc[-1]
        # Check against previous day's reports
        status = S3_scraper_index(s3, secret['bucketname'], secret['nisra-deaths-index'])
        index = status.get_dict()
        plots = []
        if latest['Total'] == 0:
            tweet = '''No deaths registered in Northern Ireland, week ended {date}
'''.format(
                date=latest['date'].strftime('%A %-d %B %Y'),
            )
        else:
            if latest['Total'] == 1:
                tweet = '''One death registered in Northern Ireland, week ended {date}, in:
'''.format(
                    date=latest['date'].strftime('%A %-d %B %Y')
                )
            else:
                tweet = '''{deaths:,} deaths registered in Northern Ireland, week ended {date}, in:
'''.format(
                    date=latest['date'].strftime('%A %-d %B %Y'),
                    deaths=int(latest['Total'])
                )
            for name in ['Hospital', 'Care Home', 'Hospice', 'Home', 'Other']:
                if latest[name] > 0:
                    tweet += '\u2022 %s: %s\n' % (name, int(latest[name]))
            tweet += '\n'
        if len(df) > 1:
            prev = df.iloc[-2]
            diff = latest['Total'] - prev['Total']
            tweet += '''{symb} {diff} {comp} than previous week
'''.format(
                symb=good_symb if diff < 0 else bad_symb,
                diff=abs(int(diff)),
                comp='fewer' if diff < 0 else 'more'
            )
        try:
            driver = get_chrome_driver()
            if driver is None:
                logging.error('Failed to start chrome')
            else:
                toplot = df[(df['Week Ending'] > df['Week Ending'].max() - pandas.to_timedelta(84, unit='d'))]
                toplot = toplot.drop(columns=['Week of Death', 'date', 'Total']).melt(id_vars='Week Ending', var_name='Location', value_name='Deaths')
                print(toplot)
                p = altair.vconcat(
                    altair.Chart(
                        toplot
                    ).mark_area().encode(
                        x=altair.X('Week Ending:T', axis=altair.Axis(title='Week of death')),
                        y=altair.Y('sum(Deaths):Q', axis=altair.Axis(title='Deaths', orient="right", tickMinStep=1)),
                        color=altair.Color('Location', sort=altair.SortField('order', order='descending')),
                    ).properties(
                        height=450,
                        width=800,
                        title='NI COVID-19 Deaths reported by NISRA from %s to %s' % (toplot['Week Ending'].min().strftime('%-d %B %Y'), toplot['Week Ending'].max().strftime('%-d %B %Y'))
                    ),
                ).properties(
                    title=altair.TitleParams(
                        ['Data from NISRA',
                         'https://twitter.com/ni_covid19_data on %s' % datetime.datetime.now().date().strftime('%A %-d %B %Y')],
                        baseline='bottom',
                        orient='bottom',
                        anchor='end',
                        fontWeight='normal',
                        fontSize=10,
                        dy=10
                    ),
                )
                plotname = 'nisra-deaths-time-%s.png' % datetime.datetime.now().date().strftime('%Y-%d-%m')
                plotstore = io.BytesIO()
                p.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
                plotstore.seek(0)
                plots.append({'name': plotname, 'store': plotstore})
        except:
            logging.exception('Error creating plot')
        tweets.append({
            'text': tweet,
            'url': change['url'],
            'notweet': change.get('notweet'),
            'testtweet': change.get('testtweet'),  # stored so the loop below does not rely on the leftover loop variable
            'filedate': change['filedate'],
            'plots': plots
        })

    donottweet = []
    if len(tweets) > 1:
        for i in range(1, len(tweets)):
            for j in range(0, i):
                if tweets[i]['text'] == tweets[j]['text']:
                    donottweet.append(i)

    messages = []
    for idx in range(len(tweets)):
        tweet = tweets[idx]['text'] + tweets[idx]['url']
        if idx not in donottweet:
            if tweets[idx].get('notweet') is not True:
                api = TwitterAPI(secret['twitter_apikey'], secret['twitter_apisecretkey'], secret['twitter_accesstoken'], secret['twitter_accesstokensecret'])
                upload_ids = api.upload_multiple(tweets[idx]['plots'])
                if tweets[idx].get('testtweet') is True:
                    if len(upload_ids) > 0:
                        resp = api.dm(secret['twitter_dmaccount'], tweet, upload_ids[0])
                    else:
                        resp = api.dm(secret['twitter_dmaccount'], tweet)
                    messages.append('Tweeted DM ID %s' % (resp.id))
                else:
                    if len(upload_ids) > 0:
                        resp = api.tweet(tweet, media_ids=upload_ids)
                    else:
                        resp = api.tweet(tweet)
                    messages.append('Tweeted ID %s, ' % resp.id)
                    # Update the file index
                    for i in range(len(index)):
                        if index[i]['filedate'] == tweets[idx]['filedate']:
                            index[i]['tweet'] = resp.id
                            break
                    status.put_dict(index)
                    messages[-1] += ('updated %s' % secret['nisra-deaths-index'])
            else:
                messages.append('Did not tweet')
                print(tweet)
        else:
            messages.append('Duplicate found %s, did not tweet, ' % tweets[idx]['filedate'])

    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": messages,
        }),
    }
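# get_chrome_driver is a shared helper not shown in this section. A minimal
# sketch of what it would need to do for altair's selenium PNG export,
# assuming headless Chrome and chromedriver are installed in the Lambda image
# (the exact options are assumptions):
import logging
from selenium import webdriver

def get_chrome_driver():
    # Return a headless Chrome driver, or None if startup fails
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    try:
        return webdriver.Chrome(options=options)
    except Exception:
        logging.exception('Failed to start chrome driver')
        return None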
def lambda_handler(event, context):
    messages = []
    try:
        # Get the secret
        sm = boto3.client('secretsmanager')
        secretobj = sm.get_secret_value(SecretId='ni-covid-tweets')
        secret = json.loads(secretobj['SecretString'])
        # Get the index
        s3 = boto3.client('s3')
        status = S3_scraper_index(s3, secret['bucketname'], secret['cog-variants-index'])
        index = status.get_dict()
        # Create a copy of the file in s3
        if 'keyname' not in event:
            keyname = "COG-variants/%s/%s-%s.csv" % (event['filedate'], event['modified'].replace(':', '_'), event['length'])
            print('getting URL')
            with requests.get(event['url'], stream=True) as stream:
                stream.raise_for_status()
                stream.raw.decode_content = True
                s3.upload_fileobj(stream.raw, secret['bucketname'], keyname, Config=boto3.s3.transfer.TransferConfig(use_threads=False))
            print('done')
        else:
            keyname = event['keyname']
        # Download the most recently updated CSV file
        obj = s3.get_object(Bucket=secret['bucketname'], Key=keyname)['Body']
        stream = io.BytesIO(obj.read())
        # Dataframe for converting between pango lineage and WHO labels
        # Get the mapping from the raw Github URL
        resp = requests.get('https://github.com/pbarber/covid19-pango-lineage-to-who-label/raw/main/mapping.json')
        # Make sure that the request was successful
        resp.raise_for_status()
        # Convert the request data to a Python dictionary
        mapping = resp.json()
        # Expand the Pango column
        mapping = pandas.DataFrame(mapping).explode('Pango lineages').reset_index(drop=True)
        # Filter out old designations
        mapping_current = mapping[mapping['Designation'] != 'Former Variant of Interest']
        # Load variant data, aggregate and push back to S3
        df = pandas.read_csv(stream)
        df = df[df['adm1'] == 'UK-NIR']
        df['Sample Date'] = pandas.to_datetime(df['sample_date'])
        df['Week of sample'] = df['Sample Date'] - pandas.to_timedelta(df['Sample Date'].dt.dayofweek, unit='d')
        # Join the lineage data
        matches = mapping['Pango lineages'].apply(match, col=df['lineage'])
        match_idx = matches.idxmax()
        # Filter out indexes where there is no match
        match_idx[match_idx == matches.idxmin()] = pandas.NA
        df['idx'] = match_idx
        # Join to the mapping based on indexes
        df = df.merge(mapping, how='left', left_on='idx', right_index=True).drop(columns=['idx', 'Pango lineages'])
        df['WHO label'] = df['WHO label'].fillna('Other')
        lin_by_week = df.groupby(['Week of sample', 'WHO label']).size().rename('count')
        lin_pc_by_week = lin_by_week / lin_by_week.groupby(level=0).sum()
        lin_by_week = pandas.DataFrame(lin_by_week).reset_index()
        lin_pc_by_week = pandas.DataFrame(lin_pc_by_week).reset_index()
        stream = io.BytesIO()
        lin_by_week.to_csv(stream, index=False)
        stream.seek(0)
        lineage_key = '%s_lineage.csv' % keyname.rsplit('.', maxsplit=1)[0]
        s3.upload_fileobj(stream, secret['bucketname'], lineage_key)
        messages.append('Wrote lineage summary to s3')
        # Update the S3 index and find the previous date
        previous = '1970-01-01'
        prev_lineagekey = None
        thisindex = None
        for i in range(len(index)):
            if index[i]['modified'] == event['modified']:
                index[i]['lineage'] = lineage_key
                index[i]['keyname'] = keyname
                thisindex = i
            elif index[i]['filedate'] != event['filedate']:
                if (index[i]['filedate'] > previous) and (index[i]['filedate'] < event['filedate']):
                    previous = index[i]['filedate']
                    prev_lineagekey = index[i].get('lineage')
        status.put_dict(index)
        # If there is a previous file, then load it and work out the differences
        if prev_lineagekey is not None:
            obj = s3.get_object(Bucket=secret['bucketname'], Key=prev_lineagekey)['Body']
            stream = io.BytesIO(obj.read())
            prev_lineage = pandas.read_csv(stream)
            if 'WHO label' not in prev_lineage.columns:
                prev_lineage['WHO label'] = 'Other'
            prev_lineage = prev_lineage.groupby('WHO label')['count'].sum()
            lineage = lin_by_week.groupby('WHO label')['count'].sum().reset_index()
            lineage = lineage.merge(prev_lineage, how='left', on='WHO label')
            lineage = lineage.groupby('WHO label').sum()[['count_x', 'count_y']]
            lineage['count_y'] = lineage['count_y'].fillna(0)
            lineage['diff'] = (lineage['count_x'] - lineage['count_y']).fillna(0).astype(int)
            top5 = lineage.nlargest(5, 'diff')
            tweet = """{total:,d} new variant analyses reported for NI on {currdate} since {prevdate} ({altogether:,d} total):
""".format(
                total=lineage['diff'].sum(),
                prevdate=datetime.datetime.strptime(previous, '%Y-%m-%d').date().strftime('%A %-d %B %Y'),
                currdate=datetime.datetime.strptime(event['filedate'], '%Y-%m-%d').date().strftime('%A %-d %B %Y'),
                altogether=lineage['count_x'].sum()
            )
            for variant, data in top5.to_dict('index').items():
                if data['diff'] > 0:
                    tweet += f"\u2022 {variant}: {data['diff']:,d} (of {data['count_x']:,d})\n"
            others = int(lineage['diff'].sum() - top5['diff'].sum())
            if others != 0:
                tweet += f"\u2022 Others: {others:,d}\n"
            tweet += '\nSource: https://beta.microreact.org/'
            driver = get_chrome_driver()
            if driver is None:
                raise Exception('Failed to start chrome')
            p = altair.vconcat(
                altair.Chart(
                    lin_by_week[lin_by_week['Week of sample'] > lin_by_week['Week of sample'].max() - pandas.to_timedelta(84, unit='d')]
                ).mark_line().encode(
                    x=altair.X('Week of sample:T', axis=altair.Axis(title='', labels=False, ticks=False)),
                    y=altair.Y('count:Q', axis=altair.Axis(title='Samples')),
                    color='WHO label'
                ).properties(
                    height=225,
                    width=800,
                    title='NI COVID-19 variants identified by COG-UK over the most recent 12 weeks'
                ),
                altair.Chart(
                    lin_pc_by_week[lin_pc_by_week['Week of sample'] > lin_pc_by_week['Week of sample'].max() - pandas.to_timedelta(84, unit='d')]
                ).mark_area().encode(
                    x='Week of sample:T',
                    y=altair.Y('sum(count):Q', axis=altair.Axis(format='%', title='% of samples', orient="right")),
                    color='WHO label'
                ).properties(
                    height=225,
                    width=800,
                )
            ).properties(
                title=altair.TitleParams(
                    ['Variant identification can take up to 3 weeks, so recent totals are likely to be revised upwards',
                     'https://twitter.com/ni_covid19_data on %s' % datetime.datetime.now().date().strftime('%A %-d %B %Y')],
                    baseline='bottom',
                    orient='bottom',
                    anchor='end',
                    fontWeight='normal',
                    fontSize=10,
                    dy=10
                ),
            )
            plotname = 'ni-variants-%s.png' % datetime.datetime.now().date().strftime('%Y-%d-%m')
            plotstore = io.BytesIO()
            p.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
            plotstore.seek(0)
            if event.get('notweet') is not True:
                api = TwitterAPI(secret['twitter_apikey'], secret['twitter_apisecretkey'], secret['twitter_accesstoken'], secret['twitter_accesstokensecret'])
                resp = api.upload(plotstore, plotname)
                if event.get('testtweet') is True:
                    resp = api.dm(secret['twitter_dmaccount'], tweet, resp.media_id)
                    messages.append('Tweeted DM ID %s, ' % resp.id)
                else:
                    resp = api.tweet(tweet, media_ids=[resp.media_id])
                    messages.append('Tweeted ID %s, ' % resp.id)
                    # Update the file index
                    index[thisindex]['tweet'] = resp.id
                    status.put_dict(index)
            else:
                messages.append('Did not tweet')
                print(tweet)
        else:
            messages.append('Did not find previous lineage data')
    except:
        logging.exception('Caught exception in COG variants tweeter')
    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": messages,
        }),
    }
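# The match function applied to the Pango mapping above is defined elsewhere.
# A sketch of the behaviour the surrounding code implies: applied per mapping
# row, it scores every sample lineage, so matches.idxmax() picks the matching
# mapping row per sample and idxmin flags no-match. The regex approach below
# is an assumption:
import re
import pandas

def match(pango, col):
    # True where the sample lineage equals the mapping lineage or is a
    # sub-lineage of it (e.g. B.1.617.2 also matches B.1.617.2.1)
    if pango.endswith('*'):
        pattern = re.escape(pango[:-1]) + r'.*'
    else:
        pattern = re.escape(pango) + r'(\..+)?'
    return col.str.fullmatch(pattern).fillna(False)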
def lambda_handler(event, context):
    # Get the secret
    sm = boto3.client('secretsmanager')
    secretobj = sm.get_secret_value(SecretId='ni-covid-tweets')
    secret = json.loads(secretobj['SecretString'])
    s3 = boto3.client('s3')

    messages = []
    # Download the most recently updated PDF file
    for change in event:
        tmp = tempfile.NamedTemporaryFile(suffix='.pdf')
        with open(tmp.name, 'wb') as fp:
            s3.download_fileobj(secret['bucketname'], change['keyname'], fp)
        # Get the date range covered by the report
        text = textract.process(tmp.name, method='pdfminer').decode('utf-8')
        regex = re.compile(r'(\d{1,2})(?:st|nd|rd|th)\s+([A-Z][a-z]+)\s+(\d{4})\s+\–+\s+(\d{1,2})(?:st|nd|rd|th)\s+([A-Z][a-z]+)\s+(\d{4})')
        start_date = None
        end_date = None
        for line in text.split('\n'):
            m = regex.search(line)
            if m:
                start_date = pandas.to_datetime('%s %s %s' % (m.group(1), m.group(2), m.group(3)), format='%d %B %Y').date()
                end_date = pandas.to_datetime('%s %s %s' % (m.group(4), m.group(5), m.group(6)), format='%d %B %Y').date()
                break
        if start_date is None:
            logging.error('Unable to find start date in report')
            return {
                "statusCode": 404,
                "body": 'Unable to find start date in report %s' % change['url'],
            }
        # Get the tables from the report - note that it was not possible to get data from 4th April or earlier due to
        # tables that will not parse properly in the PDF
        tables = tabula.read_pdf(tmp.name, pages="all", multiple_tables=True)
        tablecount = 0
        dataset = pandas.DataFrame()
        for df in tables:
            if 'Total' not in df.columns:
                # Promote the first row to column headers where tabula failed to
                firstrow = df.iloc[0]
                newcols = []
                for i in range(len(firstrow)):
                    if isinstance(firstrow[i], float) and math.isnan(firstrow[i]):
                        newcols.append(df.columns[i])
                    else:
                        newcols.append(firstrow[i])
                df.columns = newcols
                df = df[1:]
            df['Setting'] = df['Setting'].str.strip()
            df.dropna(axis='index', subset=['Total', 'Open', 'Closed'], inplace=True)
            df['Total'] = df['Total'].astype(int)
            df['Open'] = df['Open'].astype(int)
            df['Closed'] = df['Closed'].astype(int)
            df = df[df['Setting'] != 'Total']
            if tablecount == 0:
                df['Type'] = 'Probable Outbreak'
            elif tablecount == 1:
                df['Type'] = 'Cluster'
            else:
                logging.warning('Unexpected table: %s' % df)
            tablecount += 1
            dataset = pandas.concat([dataset, df])
        dataset['Start Date'] = pandas.to_datetime(start_date)
        dataset['End Date'] = pandas.to_datetime(end_date)
        week = int((end_date - pandas.to_datetime('1 January 2020', format='%d %B %Y').date()).days / 7)
        dataset['Week'] = week
        # Create a simple summary and the tweet text
        summary = dataset.groupby('Type').sum()
        tweet = 'NI Contact Tracing reports from %s to %s:\n' % (start_date.strftime('%-d %B %Y'), end_date.strftime('%-d %B %Y'))
        for Type, data in summary.to_dict('index').items():
            tweet += '\u2022 %d %ss (%d open, %d closed)\n' % (data['Total'], Type.lower(), data['Open'], data['Closed'])
        tweet += '\n%s' % change['url']
        # Pull current data from s3
        try:
            obj = s3.get_object(Bucket=secret['bucketname'], Key=secret['pha-clusters-datastore'])['Body']
        except s3.exceptions.NoSuchKey:
            print("The object %s does not exist in bucket %s." % (secret['pha-clusters-datastore'], secret['bucketname']))
            datastore = pandas.DataFrame(columns=['Week'])
        else:
            stream = io.BytesIO(obj.read())
            datastore = pandas.read_csv(stream)
        # Clean out any data with matching dates
        datastore = datastore[datastore['Week'] != week]
        # Append the new data
        datastore = pandas.concat([datastore, dataset])
        datastore['Start Date'] = pandas.to_datetime(datastore['Start Date'])
        datastore['End Date'] = pandas.to_datetime(datastore['End Date'])
        # Replace any known duplicates
        datastore['Setting'] = datastore['Setting'].replace({
            'Cinema/ Theatre / Entertainment': 'Cinema / Theatre / Entertainment Venue',
            'Cinema/ Theatre / Entertainment Venue': 'Cinema / Theatre / Entertainment Venue',
            'Funeral / Wakes': 'Funeral / Wake',
            'Restaurant / Cafe': 'Restaurant / Café'
        })
        # Push the data to s3
        stream = io.BytesIO()
        datastore.to_csv(stream, index=False)
        stream.seek(0)
        s3.upload_fileobj(stream, secret['bucketname'], secret['pha-clusters-datastore'])
        # Set up chromedriver so we can save altair plots
        driver = get_chrome_driver()
        plots = []
        if driver is None:
            logging.error('Failed to start chrome')
        else:
            p = altair.vconcat(
                altair.Chart(
                    dataset
                ).mark_bar().encode(
                    x=altair.X('Total:Q', axis=altair.Axis(title='Total reported')),
                    y=altair.Y('Setting:O'),
                    color='Type',
                    order=altair.Order(
                        'Type',
                        sort='ascending'
                    ),
                ).properties(
                    height=450,
                    width=800,
                    title='NI COVID-19 Contact Tracing reports from %s to %s' % (start_date.strftime('%-d %B %Y'), end_date.strftime('%-d %B %Y'))
                ),
            ).properties(
                title=altair.TitleParams(
                    ['Data from Public Health Agency, does not include education or home settings',
                     'Covers the preceding four weeks',
                     'https://twitter.com/ni_covid19_data on %s' % datetime.datetime.now().date().strftime('%A %-d %B %Y')],
                    baseline='bottom',
                    orient='bottom',
                    anchor='end',
                    fontWeight='normal',
                    fontSize=10,
                    dy=10
                ),
            )
            plotname = 'pha-outbreaks-week-%s.png' % datetime.datetime.now().date().strftime('%Y-%d-%m')
            plotstore = io.BytesIO()
            p.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
            plotstore.seek(0)
            plots.append({'name': plotname, 'store': plotstore})
            p = altair.vconcat(
                altair.Chart(
                    datastore.groupby(['End Date', 'Type'])['Total'].sum().reset_index()
                ).mark_area().encode(
                    x=altair.X('End Date:T', axis=altair.Axis(title='Date reported (for preceding four weeks)')),
                    y=altair.Y('Total:Q', axis=altair.Axis(title='Total reported', orient="right")),
                    color='Type',
                    order=altair.Order(
                        'Type',
                        sort='ascending'
                    ),
                ).properties(
                    height=450,
                    width=800,
                    title='NI COVID-19 Contact Tracing reports from %s to %s' % (datastore['Start Date'].min().strftime('%-d %B %Y'), datastore['End Date'].max().strftime('%-d %B %Y'))
                ),
            ).properties(
                title=altair.TitleParams(
                    ['Data from Public Health Agency, does not include education or home settings',
                     'Reported weekly for the preceding four weeks',
                     'https://twitter.com/ni_covid19_data on %s' % datetime.datetime.now().date().strftime('%A %-d %B %Y')],
                    baseline='bottom',
                    orient='bottom',
                    anchor='end',
                    fontWeight='normal',
                    fontSize=10,
                    dy=10
                ),
            )
            plotname = 'pha-outbreaks-time-%s.png' % datetime.datetime.now().date().strftime('%Y-%d-%m')
            plotstore = io.BytesIO()
            p.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
            plotstore.seek(0)
            plots.append({'name': plotname, 'store': plotstore})
            p = altair.vconcat(
                altair.Chart(
                    datastore.groupby(['End Date', 'Setting', 'Type'])['Total'].sum().reset_index()
                ).mark_area().encode(
                    x=altair.X('End Date:T', axis=altair.Axis(title='')),
                    y=altair.Y('Total:Q', axis=altair.Axis(title='', orient="right")),
                    color='Type',
                    facet=altair.Facet('Setting:O', columns=5, title=None, spacing=0),
                    order=altair.Order(
                        'Type',
                        sort='ascending'
                    ),
                ).properties(
                    height=90,
                    width=160,
                    title=altair.TitleParams(
                        'NI COVID-19 Contact Tracing reports by setting from %s to %s' % (datastore['Start Date'].min().strftime('%-d %B %Y'), datastore['End Date'].max().strftime('%-d %B %Y')),
                        anchor='middle',
                    ),
                ),
            ).properties(
                title=altair.TitleParams(
                    ['Data from Public Health Agency, does not include education or home settings',
                     'Reported weekly for the preceding four weeks',
                     'https://twitter.com/ni_covid19_data on %s' % datetime.datetime.now().date().strftime('%A %-d %B %Y')],
                    baseline='bottom',
                    orient='bottom',
                    anchor='end',
                    fontWeight='normal',
                    fontSize=10,
                    dy=10
                ),
            )
            plotname = 'pha-outbreaks-small-%s.png' % datetime.datetime.now().date().strftime('%Y-%d-%m')
            plotstore = io.BytesIO()
            p.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
            plotstore.seek(0)
            plots.append({'name': plotname, 'store': plotstore})
        # Convert to dates to ensure correct output to CSV
        datastore['Start Date'] = datastore['Start Date'].dt.date
        datastore['End Date'] = datastore['End Date'].dt.date
        # Tweet out the text and images
        if change.get('notweet') is not True:
            api = TwitterAPI(secret['twitter_apikey'], secret['twitter_apisecretkey'], secret['twitter_accesstoken'], secret['twitter_accesstokensecret'])
            upload_ids = api.upload_multiple(plots)
            if change.get('testtweet') is True:
                if len(upload_ids) > 0:
                    resp = api.dm(secret['twitter_dmaccount'], tweet, upload_ids[0])
                    if len(upload_ids) > 1:
                        resp = api.dm(secret['twitter_dmaccount'], 'Test 1', upload_ids[1])
                    if len(upload_ids) > 2:
                        resp = api.dm(secret['twitter_dmaccount'], 'Test 2', upload_ids[2])
                else:
                    resp = api.dm(secret['twitter_dmaccount'], tweet)
                messages.append('Tweeted DM ID %s' % (resp.id))
            else:
                if len(upload_ids) > 0:
                    resp = api.tweet(tweet, media_ids=upload_ids)
                else:
                    resp = api.tweet(tweet)
                # Download and update the index
                status = S3_scraper_index(s3, secret['bucketname'], secret['pha-clusters-index'])
                index = status.get_dict()
                for i in range(len(index)):
                    if index[i]['filedate'] == change['filedate']:
                        index[i]['tweet'] = resp.id
                        break
                status.put_dict(index)
                messages.append('Tweeted ID %s and updated %s' % (resp.id, secret['pha-clusters-index']))
        else:
            print(tweet)
            messages.append('Did not tweet')

    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": messages,
        }),
    }
def lambda_handler(event, context):
    # Get the secret
    sm = boto3.client('secretsmanager')
    secretobj = sm.get_secret_value(SecretId='ni-covid-tweets')
    secret = json.loads(secretobj['SecretString'])
    s3 = boto3.client('s3')
    if event.get('mode') == 'aggregate':
        # Get the index of all reports
        status = S3_scraper_index(s3, secret['bucketname'], secret['doh-dd-index'])
        index = status.get_dict()
        allreports = pandas.DataFrame(columns=['Date of Specimen', 'Reported_Date', 'Total Lab Tests', 'Individ with Lab Test', 'Individ with Positive Lab Test'])
        # Download every Excel file in the index
        for item in index:
            try:
                obj = s3.get_object(Bucket=secret['bucketname'], Key=item['keyname'])['Body']
                stream = io.BytesIO(obj.read())
                # Load test data
                daily = pandas.read_excel(stream, engine='openpyxl', sheet_name='Tests')
                # Take only the required columns
                daily = daily.groupby(['Date of Specimen']).sum()[['Total Lab Tests', 'Individ with Lab Test', 'Individ with Positive Lab Test']].reset_index()
                # Add reported date
                daily['Reported_Date'] = pandas.to_datetime(item['filedate'], format='%Y-%m-%d')
            except:
                logging.exception('Error loading %s' % item)
                raise
            # Combine with the other data reports
            allreports = pandas.concat([allreports, daily])
        # Write the output to CSV
        keyname = 'DoH-DD/all_tests.csv'
        csvbuffer = io.StringIO()
        allreports.to_csv(csvbuffer, index=False)
        s3.put_object(Bucket=secret['bucketname'], Key=keyname, Body=csvbuffer.getvalue())
        message = 'Wrote %d rows to %s' % (len(allreports), keyname)
    else:
        obj = s3.get_object(Bucket=secret['bucketname'], Key='DoH-DD/all_tests.csv')['Body']
        stream = io.BytesIO(obj.read())
        # Load test data
        df = pandas.read_csv(stream)
        print(df.columns)
        message = 'Done'
    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": message,
        }),
    }
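# A quick local invocation sketch for the aggregator above; calling the
# handler directly with this event shape is an assumption for testing outside
# Lambda:
if __name__ == '__main__':
    result = lambda_handler({'mode': 'aggregate'}, None)
    print(result['body'])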
def lambda_handler(event, context):
    messages = ['Failure']
    # Get the secret
    sm = boto3.client('secretsmanager')
    secretobj = sm.get_secret_value(SecretId='ni-covid-tweets')
    secret = json.loads(secretobj['SecretString'])
    try:
        # Get the index
        s3 = boto3.client('s3')
        status = S3_scraper_index(s3, secret['bucketname'], secret['doh-dd-index'])
        index = status.get_dict()
        tweets = []
        # Download the most recently updated Excel file
        for change in event:
            obj = s3.get_object(Bucket=secret['bucketname'], Key=change['keyname'])['Body']
            stream = io.BytesIO(obj.read())
            # Load the tests sheet and add it to the store
            daily = pandas.read_excel(stream, engine='openpyxl', sheet_name='Tests')
            daily = daily.groupby(['Sample_Date']).sum()[['Total Tests', 'Total Cases']].reset_index()
            daily['Reported_Date'] = pandas.to_datetime(change['filedate'], format='%Y-%m-%d')
            datastore = update_datastore(
                s3,
                secret['bucketname'],
                secret['doh-dd-store-tests'],
                daily['Reported_Date'].max(),
                daily,
                (change.get('notweet', False) is False) and (change.get('tweet', True) is True),
                'Reported_Date'
            )
            # Load test data and add extra fields
            df = pandas.read_excel(stream, engine='openpyxl', sheet_name='Summary Tests')
            df['pos_rate'] = df['Total Cases'] / df['Total Tests']
            df['rolling_pos_rate'] = df['Rolling 7 Day Cases'] / df['Rolling 7 Day Tests (PCR & LFT)']
            df['printdate'] = df['Sample_Date'].dt.strftime('%-d %B %Y')
            df['rolling_7d_change'] = (df['Rolling 7 Day Cases'] - df['Rolling 7 Day Cases'].shift(7)) * 7
            df['New cases 7-day rolling mean'] = df['Total Cases'].rolling(7, center=True).mean()
            df.set_index('Sample_Date', inplace=True)
            newind = pandas.date_range(start=df.index.min(), end=df.index.max())
            df = df.reindex(newind)
            df.index.name = 'Sample_Date'
            df.reset_index(inplace=True)
            df['Rolling cases per 100k'] = 100000 * (df['New cases 7-day rolling mean'] / 1893667)
            df = create_model(df, 'Rolling cases per 100k', 'Sample_Date')
            # Get the latest dates with values for tests and rolling
            latest = df.iloc[df['Sample_Date'].idxmax()]
            latest_7d = df.iloc[df[df['Rolling 7 Day Cases'].notna()]['Sample_Date'].idxmax()]
            latest_model = df.iloc[df[df['Rolling cases per 100k model_daily_change'].notna()]['Sample_Date'].idxmax()]
            last_but1_model = df.iloc[df[(df['Rolling cases per 100k model_daily_change'].notna()) & (df['Sample_Date'] != latest_model['Sample_Date'])]['Sample_Date'].idxmax()]
            # Summary stats to allow 'X registered in last 24 hours' info
            deaths = load_ni_time_series(stream, 'Deaths', 'Date of Death', 'Number of Deaths')
            admissions = load_ni_time_series(stream, 'Admissions', 'Admission Date', 'Number of Admissions', True)
            discharges = load_ni_time_series(stream, 'Discharges', 'Discharge Date', 'Number of Discharges')
            inpatients = load_ni_time_series(stream, 'Inpatients', 'Inpatients at Midnight', 'Number of Confirmed COVID Inpatients', False, 'Sex', 'All')
            inpatients.rename(columns={'Inpatients at Midnight': 'Date'}, inplace=True)
            icu = load_ni_time_series(stream, 'ICU', 'Date', 'Confirmed COVID Occupied')
            totals = {
                'ind_tested': int(df['Total Tests'].sum()),
                'ind_positive': int(df['Total Cases'].sum()),
                'deaths': int(deaths['Number of Deaths'].sum()),
                'admissions': int(admissions['Number of Admissions'].sum()),
                'discharges': int(discharges['Number of Discharges'].sum())
            }
            print(totals)
            latest_adm_model = admissions.iloc[admissions[admissions['Number of Admissions 7-day rolling mean model_daily_change'].notna()]['Admission Date'].idxmax()]
            adm_dis = admissions.merge(discharges, how='inner', left_on='Admission Date', right_on='Discharge Date', validate='1:1')
            adm_dis.drop(columns=['Discharge Date'], inplace=True)
            adm_dis.rename(columns={'Admission Date': 'Date'}, inplace=True)
            adm_dis['Inpatients'] = adm_dis['Number of Admissions 7-day rolling mean'].cumsum() - adm_dis['Number of Discharges 7-day rolling mean'].cumsum()
            adm_dis_7d = adm_dis.rename(columns={
                'Number of Admissions 7-day rolling mean': 'Admissions',
                'Number of Discharges 7-day rolling mean': 'Discharges'
            })[['Date', 'Admissions', 'Discharges']]
            adm_dis_7d = adm_dis_7d.melt(id_vars='Date')
            # Age band data
            age_bands = pandas.read_excel(stream, engine='openpyxl', sheet_name='Individuals 7 Days - 5yr Age')
            age_bands = age_bands.groupby('Age_Band_5yr').sum()[['Total_Cases', 'Total_Tests']].reset_index()
            age_bands['Positivity_Rate'] = age_bands['Total_Cases'] / age_bands['Total_Tests']
            age_bands['Band Start'] = age_bands['Age_Band_5yr'].str.extract(r'Aged (\d+)').astype(float)
            age_bands['Band End'] = age_bands['Age_Band_5yr'].str.extract(r'Aged \d+ - (\d+)').astype(float)
            age_bands['Date'] = df['Sample_Date'].max()
            age_bands['Positive_Tests'] = age_bands['Total_Cases']
            # Get the age bands datastore contents from S3
            datastore = update_datastore(
                s3,
                secret['bucketname'],
                secret['doh-dd-store-agebands'],
                df['Sample_Date'].max(),
                age_bands,
                (change.get('notweet', False) is False) and (change.get('tweet', True) is True),
                'Date'
            )
            # Plot the case reports and 7-day average
            driver = get_chrome_driver()
            plots = []
            if driver is not None:
                today_str = datetime.datetime.now().date().strftime('%Y-%m-%d')
                p = plot_key_ni_stats_date_range(df, admissions, deaths, latest['Sample_Date'] - pandas.to_timedelta(42, unit='d'), latest['Sample_Date'], ['linear', 'log'])
                plots = output_plot(p, plots, driver, 'ni-cases-%s.png' % today_str)
                if len(plots) > 0:
                    p = plot_hospital_stats(adm_dis_7d, inpatients, icu, latest['Sample_Date'] - pandas.to_timedelta(42, unit='d'))
                    plots = output_plot(p, plots, driver, 'ni-hospitals-%s.png' % today_str)
                if len(plots) > 1:
                    toplot = datastore[datastore['Date'] >= (datastore['Date'].max() + pandas.DateOffset(days=-42))]
                    toplot['Date'] = pandas.to_datetime(toplot['Date'])
                    newind = pandas.date_range(start=toplot['Date'].max() + pandas.DateOffset(days=-42), end=toplot['Date'].max())
                    alldates = pandas.Series(newind)
                    alldates.name = 'Date'
                    toplot = toplot.merge(alldates, how='outer', left_on='Date', right_on='Date')
                    toplot['X'] = toplot['Date'].dt.strftime('%e %b')
                    toplot['Most Recent Positive Tests'] = toplot['Positive_Tests'].where(toplot['Date'] == toplot['Date'].max()).apply(lambda x: f"{x:n}" if not pandas.isna(x) else "")
                    toplot['Age_Band_5yr'].fillna('Not Known', inplace=True)
                    bands = toplot.groupby(['Age_Band_5yr', 'Band Start', 'Band End'], dropna=False).size().reset_index()[['Age_Band_5yr', 'Band Start', 'Band End']]
                    bands = bands[bands['Age_Band_5yr'] != 'Not Known']
                    bands.fillna(90, inplace=True)
                    bands['Band End'] = bands['Band End'].astype(int)
                    bands['Band Start'] = bands['Band Start'].astype(int)
                    bands['Year'] = bands.apply(lambda x: range(x['Band Start'], x['Band End'] + 1), axis='columns')
                    bands = bands.explode('Year').reset_index()
                    pops = get_ni_pop_pyramid()
                    pops = pops[pops['Year'] == 2020].groupby(['Age Band']).sum()['Population']
                    bands = bands.merge(pops, how='inner', validate='1:1', right_index=True, left_on='Year')
                    bands = bands.groupby('Age_Band_5yr').sum()['Population']
                    toplot = toplot.merge(bands, how='left', on='Age_Band_5yr')
                    toplot['Positive per 100k'] = (100000 * toplot['Positive_Tests']) / toplot['Population']
                    toplot['Most Recent Positive per 100k'] = toplot['Positive per 100k'].where(toplot['Date'] == toplot['Date'].max()).apply(lambda x: f"{int(x):n}" if not pandas.isna(x) else "")
                    heatmap2 = plot_heatmap(toplot, 'X', 'Date', 'Date', 'Age_Band_5yr', 'Band Start', 'Age Band', 'Positive per 100k', 'Positive Tests per 100k')
                    p = altair.vconcat(
                        altair.layer(
                            heatmap2.properties(
                                height=450,
                                width=800,
                                title='NI COVID-19 7-day Positive Tests by Age Band per 100k people (%s to %s)' % (toplot['Date'].min().strftime('%-d %B %Y'), toplot['Date'].max().strftime('%-d %B %Y')),
                            ),
                            heatmap2.mark_text(
                                align='right',
                                baseline='middle',
                                dx=43
                            ).encode(
                                text=altair.Text('Most Recent Positive per 100k'),
                                color=altair.value('black')
                            )
                        )
                    ).properties(
                        title=altair.TitleParams(
                            ['Data from DoH daily downloads',
                             'Numbers to right of chart show most recent value',
                             'https://twitter.com/ni_covid19_data on %s' % datetime.datetime.now().strftime('%A %-d %B %Y')],
                            baseline='bottom',
                            orient='bottom',
                            anchor='end',
                            fontWeight='normal',
                            fontSize=10,
                            dy=10
                        ),
                    )
                    plots = output_plot(p, plots, driver, 'ni-cases-age-bands-%s.png' % today_str)
                if len(plots) > 2:
                    p = plot_test_stats(df, latest['Sample_Date'] - pandas.to_timedelta(42, unit='d'))
                    plots = output_plot(p, plots, driver, 'ni-tests-%s.png' % today_str)
            # Find the date since which the rate was as high/low
            symb_7d, est = find_previous(df, latest_7d, 'Rolling 7 Day Cases')
            # Build the tweet text
            tweet = '''{ind_tested:,} people tested, {ind_positive:,} ({pos_rate:.2%}) positive on {date}

{symb_7d} {pos_7d:,} positive in last 7 days, {est}

{tag_model} cases {dir_model} by {model_daily:.1%} per day, {model_weekly:.1%} per week, {doub} time {doub_time:.1f} days

'''.format(
                date=latest['Sample_Date'].strftime('%A %-d %B %Y'),
                ind_positive=int(latest['Total Cases']),
                ind_tested=int(latest['Total Tests']),
                pos_rate=latest['pos_rate'],
                symb_7d=symb_7d,
                est=est,
                model_daily=abs(last_but1_model['Rolling cases per 100k model_daily_change']),
                model_weekly=abs(last_but1_model['Rolling cases per 100k model_weekly_change']),
                pos_7d=int(round(latest_7d['Rolling 7 Day Cases'] * 7, 0)),
                dir_model='falling' if last_but1_model['Rolling cases per 100k model_daily_change'] < 0 else 'rising',
                tag_model=good_symb if last_but1_model['Rolling cases per 100k model_daily_change'] < 0 else bad_symb,
                doub='halving' if (last_but1_model['Rolling cases per 100k model0'] < 0) else 'doubling',
                doub_time=abs(numpy.log(2) / last_but1_model['Rolling cases per 100k model0'])
            )
            # If we have the data for it, build the second tweet
            last_week = datetime.datetime.strptime(change['filedate'], '%Y-%m-%d').date() - datetime.timedelta(days=7)
            day_before = datetime.datetime.strptime(change['filedate'], '%Y-%m-%d').date() - datetime.timedelta(days=1)
            yesterday = None
            lastweek = None
            for report in index:
                if (report['filedate'] == last_week.strftime('%Y-%m-%d')) and ('totals' in report):
                    lastweek = report
                elif (report['filedate'] == day_before.strftime('%Y-%m-%d')) and ('totals' in report):
                    yesterday = report
                if (yesterday is not None) and (lastweek is not None):
                    break
            tweet2 = '''{inpatients} inpatient{ips} reported'''.format(
                inpatients=totals['admissions'] - totals['discharges'],
                ips='s' if ((totals['admissions'] - totals['discharges']) != 1) else ''
            )
            if lastweek is not None:
                ip_change = (totals['admissions'] - totals['discharges']) - (lastweek['totals']['admissions'] - lastweek['totals']['discharges'])
                tweet2 += ''': {ip_bullet} {ip_change} {ip_text} than 7 days ago ({admissions} admitted, {discharges} discharged)'''.format(
                    ip_change=abs(ip_change),
                    ip_bullet=good_symb if ip_change < 0 else bad_symb,
                    ip_text='fewer' if ip_change < 0 else 'more',
                    admissions=totals['admissions'] - lastweek['totals']['admissions'],
                    discharges=totals['discharges'] - lastweek['totals']['discharges']
                )
                if yesterday is not None:
                    tweet2 += '''

{deaths} death{ds} reported, {deaths_7d} in last 7 days'''.format(
                        deaths=totals['deaths'] - yesterday['totals']['deaths'],
                        ds='s' if ((totals['deaths'] - yesterday['totals']['deaths']) != 1) else '',
                        deaths_7d=totals['deaths'] - lastweek['totals']['deaths']
                    )
                tweet2 += '''

{tag_model} admissions {dir_model} by {model_daily:.1%} per day, {model_weekly:.1%} per week, {doub} time {doub_time:.1f} days'''.format(
                    model_daily=abs(latest_adm_model['Number of Admissions 7-day rolling mean model_daily_change']),
                    model_weekly=abs(latest_adm_model['Number of Admissions 7-day rolling mean model_weekly_change']),
                    dir_model='falling' if latest_adm_model['Number of Admissions 7-day rolling mean model_daily_change'] < 0 else 'rising',
                    tag_model=good_symb if latest_adm_model['Number of Admissions 7-day rolling mean model_daily_change'] < 0 else bad_symb,
                    doub='halving' if (latest_adm_model['Number of Admissions 7-day rolling mean model0'] < 0) else 'doubling',
                    doub_time=abs(numpy.log(2) / latest_adm_model['Number of Admissions 7-day rolling mean model0'])
                )
            tweets.append({
                'text': tweet,
                'text2': tweet2,
                'url': change['url'],
                'notweet': change.get('notweet', False),
                'tweet': change.get('tweet', True),
                'totals': totals,
                'filedate': change['filedate'],
                'plots': plots
            })

        donottweet = []
        if len(tweets) > 1:
            for i in range(1, len(tweets)):
                for j in range(0, i):
                    if tweets[i]['text'] == tweets[j]['text']:
                        donottweet.append(i)

        messages = []
        for idx in reversed(range(len(tweets))):
            t = tweets[idx]
            if t['notweet'] is False:
                if idx not in donottweet:
                    api = TwitterAPI(secret['twitter_apikey'], secret['twitter_apisecretkey'], secret['twitter_accesstoken'], secret['twitter_accesstokensecret'])
                    upload_ids = api.upload_multiple(t['plots'])
                    if t['tweet'] is True:
                        if len(t['plots']) > 0:
                            resp = api.tweet(t['text'] + t['url'], media_ids=upload_ids)
                        else:
                            resp = api.tweet(t['text'] + t['url'])
                        messages.append('Tweeted ID %s, ' % resp.id)
                        if t['text2'] is not None:
                            resp = api.tweet(t['text2'], resp.id)
                            messages[-1] += ('ID %s, ' % resp.id)
                        # Update the file index
                        for i in range(len(index)):
                            if index[i]['filedate'] == t['filedate']:
                                index[i]['tweet'] = resp.id
                                index[i]['totals'] = t['totals']
                                break
                        status.put_dict(index)
                        messages[-1] += ('updated %s' % secret['doh-dd-index'])
                    else:
                        if len(upload_ids) > 0:
                            resp = api.dm(secret['twitter_dmaccount'], t['text'] + t['url'], upload_ids[0])
                        else:
                            resp = api.dm(secret['twitter_dmaccount'], t['text'] + t['url'])
                        messages.append('Tweeted DM %s, ' % resp.id)
                        if len(upload_ids) > 1:
                            resp = api.dm(secret['twitter_dmaccount'], t['text2'], upload_ids[-2])
                        else:
                            resp = api.dm(secret['twitter_dmaccount'], t['text2'])
                else:
                    messages.append('Duplicate found %s, did not tweet, ' % t['filedate'])
            else:
                if idx not in donottweet:
                    messages.append('Did not tweet')
                    print(t['text'] + t['url'])
                    if t['text2'] is not None:
                        print(t['text2'])
                else:
                    messages.append('Duplicate found %s, did not tweet, ' % t['filedate'])
    except:
        logging.exception('Caught error in cases tweeter')
        api = TwitterAPI(secret['twitter_apikey'], secret['twitter_apisecretkey'], secret['twitter_accesstoken'], secret['twitter_accesstokensecret'])
        api.dm(secret['twitter_dmaccount'], 'Error in cases tweeter')
    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": messages,
        }),
    }
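# create_model and find_previous are shared helpers not shown here. A sketch
# of the exponential-fit behaviour create_model's outputs imply (model0 as a
# daily log-growth rate, so log(2)/model0 gives the doubling time); the fit
# window and column naming below are assumptions:
import numpy

def create_model(df, colname, datecol, days=9):
    # Fit log(y) = model0 * t + c over the most recent days with positive data
    fit = df[df[colname] > 0].tail(days)
    x = (fit[datecol] - fit[datecol].min()).dt.days
    model0, _ = numpy.polyfit(x, numpy.log(fit[colname]), 1)
    df[colname + ' model0'] = numpy.nan
    df.loc[fit.index, colname + ' model0'] = model0
    # Express the rate as daily/weekly fractional changes for the tweet text
    df[colname + ' model_daily_change'] = numpy.exp(df[colname + ' model0']) - 1
    df[colname + ' model_weekly_change'] = numpy.exp(df[colname + ' model0'] * 7) - 1
    return df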
def lambda_handler(event, context):
    # Get the secret
    sm = boto3.client('secretsmanager')
    secretobj = sm.get_secret_value(SecretId='ni-covid-tweets')
    secret = json.loads(secretobj['SecretString'])
    s3 = boto3.client('s3')
    # Pull current data from s3
    try:
        obj = s3.get_object(Bucket=secret['bucketname'], Key=secret['pha-education-datastore'])['Body']
    except s3.exceptions.NoSuchKey:
        print("The object %s does not exist in bucket %s." % (secret['pha-education-datastore'], secret['bucketname']))
        datastore = pandas.DataFrame(columns=['filedate'])
    else:
        stream = io.BytesIO(obj.read())
        datastore = pandas.read_csv(stream)

    messages = []
    if 'url' in event[0]:
        # Download the most recently updated PDF file
        for change in event:
            tmp = tempfile.NamedTemporaryFile(suffix='.pdf')
            with open(tmp.name, 'wb') as fp:
                s3.download_fileobj(secret['bucketname'], change['keyname'], fp)
            text = textract.process(tmp.name, method='pdfminer').decode('utf-8')
            regex = re.compile(r'Up to [Ww]eek \d{1,2}\s+\((\d{1,2})\s+([A-Z][a-z]+)\s+(\d{4})\)')
            end_date = None
            for line in text.split('\n'):
                m = regex.search(line)
                if m:
                    end_date = datetime.datetime.strptime('%s %s %s' % (m.group(1), m.group(2), m.group(3)), '%d %B %Y')
                    break
            if end_date is None:
                logging.error('Unable to find end date in report %s' % change['keyname'])
                continue
            regex = re.compile(r'Table (\d+)\. Number of Incidents by School and Incident Type')
            tables = tabula.read_pdf(tmp.name, pages="all", multiple_tables=True, java_options=["-Xmx1024m"])
            dataset = None
            for df in tables:
                match = False
                for col in df.columns:
                    m = regex.search(col)
                    if m:
                        match = True
                        break
                if match is True:
                    if len(df.columns) != 1:
                        logging.error('Too many columns in %s, %s' % (df, change['keyname']))
                        break
                    df.columns = ['raw']
                    df = df[df['raw'].str.endswith('%')]
                    df['Proportion'] = df['raw'].str.rsplit(' ', n=1, expand=True)[1]
                    df['raw'] = df['raw'].str.rsplit(' ', n=1, expand=True)[0]
                    df['Total'] = df['raw'].str.rsplit(' ', n=1, expand=True)[1]
                    df['raw'] = df['raw'].str.rsplit(' ', n=1, expand=True)[0]
                    df['School Type'] = df['raw'].str.replace('Single Case ', '')
                    df = df[['School Type', 'Total']].reset_index(drop=True)
                    if len(df) != 12:
                        logging.error('Unexpected number of rows in %s, %s' % (df, change['keyname']))
                        break
                    df['Incident Type'] = 'Cluster (>5 cases)'
                    df.iloc[:8, df.columns.get_loc('Incident Type')] = 'Cluster (2-5 cases)'
                    df.iloc[:4, df.columns.get_loc('Incident Type')] = 'Single Case'
                    dataset = df
                    break
            if dataset is None:
                logging.error('Unable to find table in %s' % change['keyname'])
                continue
            dataset['filedate'] = change['filedate']
            dataset['End Date'] = end_date.strftime('%Y-%m-%d')
            dataset['url'] = change['url']
            # Clean out any data with matching dates
            datastore = datastore[datastore['filedate'] != change['filedate']]
            # Append the new data
            datastore = pandas.concat([datastore, dataset])
            # Push the data to s3
            stream = io.BytesIO()
            datastore.to_csv(stream, index=False)
            stream.seek(0)
            s3.upload_fileobj(stream, secret['bucketname'], secret['pha-education-datastore'])
    else:
        driver = get_chrome_driver()
        plots = []
        if driver is None:
            logging.error('Failed to start chrome')
        else:
            datastore['End Date'] = pandas.to_datetime(datastore['End Date'])
            weekly = datastore.groupby(['End Date', 'School Type', 'Incident Type']).sum()['Total'].reset_index()
            weekly.sort_values('End Date', inplace=True)
            weekly['New'] = weekly['Total'] - weekly.groupby(['School Type', 'Incident Type'])['Total'].shift(1)
            weekly['New no neg'] = weekly['New'].clip(lower=0)  # Remove negatives for the detailed plot
            weekly['order'] = weekly['Incident Type'].replace(
                {val: i for i, val in enumerate(['Cluster (>5 cases)', 'Cluster (2-5 cases)', 'Single Case', 'White'])}
            )
            latest = weekly[weekly['End Date'] == weekly['End Date'].max()]
            p = altair.vconcat(
                altair.Chart(
                    latest
                ).mark_bar().encode(
                    x=altair.X('New:Q', axis=altair.Axis(title='Total reported', tickMinStep=1)),
                    y=altair.Y('School Type:O', sort=['Preschool', 'Primary', 'Post Primary', 'Special']),
                    color=altair.Color('Incident Type', sort=altair.SortField('order', order='descending')),
                    order=altair.Order(
                        'order',
                        sort='ascending'
                    ),
                ).properties(
                    height=225,
                    width=400,
                    title='NI COVID-19 School Surveillance reports for week ending %s' % datastore['End Date'].max().strftime('%-d %B %Y')
                ),
            ).properties(
                title=altair.TitleParams(
                    ['Data from Public Health Agency',
                     'Some data has been manually extracted',
                     'https://twitter.com/ni_covid19_data on %s' % datetime.datetime.now().date().strftime('%A %-d %B %Y')],
                    baseline='bottom',
                    orient='bottom',
                    anchor='end',
                    fontWeight='normal',
                    fontSize=10,
                    dy=10
                ),
            )
            plotname = 'pha-outbreaks-week-%s.png' % datetime.datetime.now().date().strftime('%Y-%d-%m')
            plotstore = io.BytesIO()
            p.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
            plotstore.seek(0)
            plots.append({'name': plotname, 'store': plotstore})
            toplot = weekly[(~weekly['New'].isna()) & (weekly['End Date'] > weekly['End Date'].max() - pandas.to_timedelta(84, unit='d'))]
            p = altair.vconcat(
                altair.Chart(
                    toplot
                ).mark_area().encode(
                    x=altair.X('End Date:T', axis=altair.Axis(title='Date reported')),
                    y=altair.Y('sum(New):Q', axis=altair.Axis(title='Newly reported', orient="right", tickMinStep=1)),
                    color=altair.Color('Incident Type', sort=altair.SortField('order', order='descending')),
                    order=altair.Order(
                        'order',
                        sort='ascending'
                    ),
                ).properties(
                    height=450,
                    width=800,
                    title='NI COVID-19 School Surveillance reports from %s to %s' % (toplot['End Date'].min().strftime('%-d %B %Y'), toplot['End Date'].max().strftime('%-d %B %Y'))
                ),
            ).properties(
                title=altair.TitleParams(
                    ['Data from Public Health Agency',
                     'Some data has been manually extracted',
                     'https://twitter.com/ni_covid19_data on %s' % datetime.datetime.now().date().strftime('%A %-d %B %Y')],
                    baseline='bottom',
                    orient='bottom',
                    anchor='end',
                    fontWeight='normal',
                    fontSize=10,
                    dy=10
                ),
            )
            plotname = 'pha-education-time-%s.png' % datetime.datetime.now().date().strftime('%Y-%d-%m')
            plotstore = io.BytesIO()
            p.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
            plotstore.seek(0)
            plots.append({'name': plotname, 'store': plotstore})
            p = altair.vconcat(
                altair.Chart(
                    toplot
                ).mark_area().encode(
                    x=altair.X('End Date:T', axis=altair.Axis(title='Date reported')),
                    y=altair.Y('sum(New no neg):Q', axis=altair.Axis(title='Newly reported', orient="right", tickMinStep=1)),
                    color=altair.Color('Incident Type', sort=altair.SortField('order', order='descending')),
                    facet=altair.Facet('School Type:O', columns=2, title=None, spacing=0, sort=['Preschool', 'Primary', 'Post Primary', 'Special']),
                    order=altair.Order(
                        'order',
                        sort='ascending'
                    ),
                ).properties(
                    height=225,
                    width=450,
                    title=altair.TitleParams(
                        'NI COVID-19 School Surveillance reports from %s to %s' % (toplot['End Date'].min().strftime('%-d %B %Y'), toplot['End Date'].max().strftime('%-d %B %Y')),
                        anchor='middle',
                    ),
                ),
            ).properties(
                title=altair.TitleParams(
                    ['Data from Public Health Agency',
                     'Some data has been manually extracted, negative values have been removed',
                     'https://twitter.com/ni_covid19_data on %s' % datetime.datetime.now().date().strftime('%A %-d %B %Y')],
                    baseline='bottom',
                    orient='bottom',
                    anchor='end',
                    fontWeight='normal',
                    fontSize=10,
                    dy=10
                ),
            )
            plotname = 'pha-education--school-%s.png' % datetime.datetime.now().date().strftime('%Y-%d-%m')
            plotstore = io.BytesIO()
            p.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
            plotstore.seek(0)
            plots.append({'name': plotname, 'store': plotstore})
        change = event[0]
        tweet = '''School Surveillance for COVID-19 in NI, week ending {end_date}

\u2022 Preschool: {preschool:,} incidents
\u2022 Primary: {primary:,}
\u2022 Post Primary: {post_primary:,}
\u2022 Special: {special:,}
\u2022 Total: {total:,}

Source: https://www.publichealth.hscni.net/publications/coronavirus-bulletin'''.format(
            end_date=latest['End Date'].max().strftime('%-d %B %Y'),
            preschool=int(latest[latest['School Type'] == 'Preschool']['New'].sum()),
            primary=int(latest[latest['School Type'] == 'Primary']['New'].sum()),
            post_primary=int(latest[latest['School Type'] == 'Post Primary']['New'].sum()),
            special=int(latest[latest['School Type'] == 'Special']['New'].sum()),
            total=int(latest['New'].sum())
        )
        if change.get('notweet') is not True:
            api = TwitterAPI(secret['twitter_apikey'], secret['twitter_apisecretkey'], secret['twitter_accesstoken'], secret['twitter_accesstokensecret'])
            upload_ids = api.upload_multiple(plots)
            if change.get('testtweet') is True:
                if len(upload_ids) > 0:
                    resp = api.dm(secret['twitter_dmaccount'], tweet, upload_ids[0])
                    if len(upload_ids) > 1:
                        resp = api.dm(secret['twitter_dmaccount'], 'Test 1', upload_ids[1])
                    if len(upload_ids) > 2:
                        resp = api.dm(secret['twitter_dmaccount'], 'Test 2', upload_ids[2])
                else:
                    resp = api.dm(secret['twitter_dmaccount'], tweet)
                messages.append('Tweeted DM ID %s' % (resp.id))
            else:
                if len(upload_ids) > 0:
                    resp = api.tweet(tweet, media_ids=upload_ids)
                else:
                    resp = api.tweet(tweet)
                # Download and update the index
                status = S3_scraper_index(s3, secret['bucketname'], secret['pha-bulletin-index'])
                index = status.get_dict()
                for i in range(len(index)):
                    if index[i]['filedate'] == datastore['filedate'].max():
                        index[i]['tweet'] = resp.id
                        break
                status.put_dict(index)
                messages.append('Tweeted ID %s and updated %s' % (resp.id, secret['pha-bulletin-index']))
        else:
            print(tweet)
            messages.append('Did not tweet')

    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": messages,
        }),
    }
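# A sketch of test invocations for the education handler above: an event with
# 'url' triggers the PDF-scraping branch, one without triggers the
# plotting/tweeting branch (the field values here are assumptions):
if __name__ == '__main__':
    # Scrape a bulletin PDF already mirrored to S3, without tweeting
    lambda_handler([{
        'url': 'https://www.publichealth.hscni.net/publications/coronavirus-bulletin',  # hypothetical bulletin URL
        'keyname': 'pha-bulletin/example.pdf',  # hypothetical S3 key
        'filedate': '2021-10-01',
        'notweet': True,
    }], None)
    # Re-plot the stored data and DM the result instead of tweeting publicly
    lambda_handler([{'filedate': '2021-10-01', 'testtweet': True}], None)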