Example #1
    def test_send_email(self):
        email_addr = '*****@*****.**'
        payload_plain = 'plain test'
        payload_html = '<em>html test</em>'
        with patch('smtplib.SMTP') as mock_smtp:
            msg = utils.send_email(email_addr,
                                   email_template=myADSTemplate,
                                   payload_plain=payload_plain,
                                   payload_html=payload_html)

            self.assertTrue(
                payload_plain in msg.get_payload()[0].get_payload())
            self.assertTrue(payload_html in msg.get_payload()[1].get_payload())
            self.assertTrue(myADSTemplate.subject == msg.get('subject'))
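The test patches smtplib.SMTP so that utils.send_email assembles and "sends" the MIME message without contacting a real mail server; the assertions then inspect the plain-text and HTML parts of the returned message and its subject. Below is a minimal sketch of the same call outside the test harness; the import paths are assumptions, since they are not shown in these examples.

from unittest.mock import patch           # the original Python 2 tests used the external 'mock' package

from myadsp import utils                  # assumed package layout
from myadsp.emails import myADSTemplate   # assumed location of the email template

with patch('smtplib.SMTP'):               # keep the example from talking to a real SMTP server
    msg = utils.send_email('reader@example.org',
                           email_template=myADSTemplate,
                           payload_plain='plain test',
                           payload_html='<em>html test</em>')

print(msg.get('subject'))                 # the subject comes from the template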
Example #2
def process_myads(since=None, user_ids=None, user_emails=None, test_send_to=None, admin_email=None, force=False,
                  frequency='daily', test_bibcode=None, **kwargs):
    """
    Processes myADS mailings

    :param since: check for new myADS users since this date
    :param user_ids: users to process claims for, else all users - list (given as adsws IDs)
    :param user_emails: users to process claims for, else all users - list (given as email addresses)
    :param test_send_to: for testing; process a given user ID but send the output to this email address
    :param admin_email: if provided, an email is sent to this address at the beginning and end of processing (not
    sent when processing individual users)
    :param force: if True, force processing of emails even if one was already sent to a given user that day
    :param frequency: basestring; 'daily' or 'weekly'
    :param test_bibcode: bibcode to query to test if Solr searcher has been updated
    :return: no return
    """
    if user_ids:
        for u in user_ids:
            tasks.task_process_myads({'userid': u, 'frequency': frequency, 'force': True,
                                      'test_send_to': test_send_to, 'test_bibcode': test_bibcode})

        logger.info('Done (just the supplied user IDs)')
        return

    if user_emails:
        for u in user_emails:
            r = app.client.get(config.get('API_ADSWS_USER_EMAIL') % u,
                               headers={'Accept': 'application/json',
                                        'Authorization': 'Bearer {0}'.format(config.get('API_TOKEN'))}
                               )
            if r.status_code == 200:
                user_id = r.json()['id']
            else:
                logger.warning('Error getting user ID with email {0} from the API. Processing aborted for this user'.format(u))
                continue

            tasks.task_process_myads({'userid': user_id, 'frequency': frequency, 'force': True,
                                      'test_send_to': test_send_to, 'test_bibcode': test_bibcode})

        logger.info('Done (just the supplied user emails)')
        return

    logging.captureWarnings(True)

    if admin_email:
        msg = utils.send_email(email_addr=admin_email,
                               payload_plain='Processing started for {}'.format(get_date()),
                               payload_html='Processing started for {}'.format(get_date()),
                               subject='myADS {0} processing has started'.format(frequency))

    # if since keyword not provided, since is set to timestamp of last processing
    if not since or isinstance(since, basestring) and since.strip() == "":
        with app.session_scope() as session:
            if frequency == 'daily':
                kv = session.query(KeyValue).filter_by(key='last.process.daily').first()
            else:
                kv = session.query(KeyValue).filter_by(key='last.process.weekly').first()
            if kv is not None:
                since = kv.value
            else:
                since = '1971-01-01T12:00:00Z'

    users_since_date = get_date(since)
    logger.info('Processing {0} myADS queries since: {1}'.format(frequency, users_since_date.isoformat()))

    last_process_date = get_date()
    all_users = app.get_users(users_since_date.isoformat())

    for user in all_users:
        try:
            tasks.task_process_myads.delay({'userid': user, 'frequency': frequency, 'force': force,
                                            'test_bibcode': test_bibcode})
        except:  # potential backpressure (we are too fast)
            time.sleep(2)
            print 'Conn problem, retrying...', user
            tasks.task_process_myads.delay({'userid': user, 'frequency': frequency, 'force': force,
                                            'test_bibcode': test_bibcode})

    # update last processed timestamp
    with app.session_scope() as session:
        if frequency == 'daily':
            kv = session.query(KeyValue).filter_by(key='last.process.daily').first()
        else:
            kv = session.query(KeyValue).filter_by(key='last.process.weekly').first()
        if kv is None:
            if frequency == 'daily':
                kv = KeyValue(key='last.process.daily', value=last_process_date.isoformat())
            else:
                kv = KeyValue(key='last.process.weekly', value=last_process_date.isoformat())
            session.add(kv)
        else:
            kv.value = last_process_date.isoformat()
        session.commit()

    print 'Done submitting {0} myADS processing tasks for {1} users.'.format(frequency, len(all_users))
    logger.info('Done submitting {0} myADS processing tasks for {1} users.'.format(frequency, len(all_users)))
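process_myads can be driven for an explicit set of users (by adsws ID or by email address) or, with neither list given, for all users whose myADS setups changed since the stored timestamp. A minimal invocation sketch follows; the import path is an assumption, since it is not shown in these examples.

# Hypothetical driver script; the import path is an assumption.
from myadsp.run import process_myads

# reprocess two specific users and redirect their output to a test inbox
process_myads(user_ids=[123, 456], test_send_to='tester@example.org', frequency='daily')

# regular weekly run for all users, with start/end notifications sent to an admin address
process_myads(frequency='weekly', admin_email='ops@example.org')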
Example #3
if args.daily_update:
    # opening lines reconstructed from the parallel weekly block below
    arxiv_complete = False
    try:
        arxiv_complete = _arxiv_ingest_complete(sleep_delay=300, sleep_timeout=36000)
    except Exception as e:
        logger.warning('arXiv ingest: code failed with an exception: {0}'.format(e))
    if arxiv_complete:
        logger.info('arxiv ingest: complete')
        if args.wait_send:
            logger.info('arxiv ingest: waiting {0} seconds for all SOLR searchers to sync data'.format(args.wait_send))
            time.sleep(args.wait_send)
        logger.info('arxiv ingest: starting processing')
        process_myads(args.since_date, args.user_ids, args.user_emails, args.test_send_to, args.admin_email, args.force,
                      frequency='daily', test_bibcode=arxiv_complete)
    else:
        logger.warning('arXiv ingest: failed.')
        if args.admin_email:
            msg = utils.send_email(email_addr=args.admin_email,
                                   payload_plain='Error in the arXiv ingest',
                                   payload_html='Error in the arXiv ingest',
                                   subject='arXiv ingest failed')
if args.weekly_update:
    astro_complete = False
    try:
        astro_complete = _astro_ingest_complete(sleep_delay=300, sleep_timeout=36000)
    except Exception as e:
        logger.warning('astro ingest: code failed with an exception: {0}'.format(e))
    if astro_complete:
        logger.info('astro ingest: complete')
        if args.wait_send:
            logger.info('astro ingest: waiting {0} seconds for all SOLR searchers to sync data'.format(args.wait_send))
            time.sleep(args.wait_send)
        logger.info('astro ingest: starting processing now')
        process_myads(args.since_date, args.user_ids, args.user_emails, args.test_send_to, args.admin_email, args.force,
                      frequency='weekly', test_bibcode=astro_complete)
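The block above reads its switches from an argparse namespace. Here is a minimal sketch of a parser that would supply the attributes used there (daily_update, weekly_update, wait_send, since_date, user_ids, user_emails, test_send_to, admin_email, force); the option spellings and defaults are assumptions, not the pipeline's actual command line.

import argparse

# Hypothetical CLI definition; flag spellings and defaults are assumptions.
parser = argparse.ArgumentParser(description='Run myADS processing')
parser.add_argument('--daily-update', dest='daily_update', action='store_true')
parser.add_argument('--weekly-update', dest='weekly_update', action='store_true')
parser.add_argument('--wait-send', dest='wait_send', type=int, default=0,
                    help='seconds to wait for SOLR searchers to sync before sending')
parser.add_argument('--since', dest='since_date', default=None)
parser.add_argument('--user-ids', dest='user_ids', type=int, nargs='+', default=None)
parser.add_argument('--user-emails', dest='user_emails', nargs='+', default=None)
parser.add_argument('--test-send-to', dest='test_send_to', default=None)
parser.add_argument('--admin-email', dest='admin_email', default=None)
parser.add_argument('--force', dest='force', action='store_true')
args = parser.parse_args()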
Example #4
def task_process_myads(message):
    """
    Process the myADS notifications for a given user

    :param message: contains the message inside the packet
        {
         'userid': adsws user ID,
         'frequency': 'daily' or 'weekly',
         'force': Boolean (if present, we'll reprocess myADS notifications for the user,
            even if they were already processed today)
         'test_send_to': email address to send output to, if not that of the user (for testing)
         'retries': number of retries attempted
        }
    :return: no return
    """

    if 'userid' not in message:
        logger.error('No user ID received for {0}'.format(message))
        return
    if 'frequency' not in message:
        logger.error('No frequency received for {0}'.format(message))
        return

    userid = message['userid']
    with app.session_scope() as session:
        try:
            q = session.query(AuthorInfo).filter_by(id=userid).one()
            last_sent = q.last_sent
        except ormexc.NoResultFound:
            author = AuthorInfo(id=userid,
                                created=adsputils.get_date(),
                                last_sent=None)
            session.add(author)
            session.flush()
            last_sent = author.last_sent
            session.commit()
        if last_sent and last_sent.date() == adsputils.get_date().date():
            # already sent email today
            if not message['force']:
                logger.warning(
                    'Email for user {0} already sent today'.format(userid))
                return
            else:
                logger.info(
                    'Email for user {0} already sent today, but force mode is on'
                    .format(userid))

    # first fetch the myADS setup from /vault/get-myads
    r = app.client.get(app.conf.get('API_VAULT_MYADS_SETUP') % userid,
                       headers={
                           'Accept':
                           'application/json',
                           'Authorization':
                           'Bearer {0}'.format(app.conf.get('API_TOKEN'))
                       })

    if r.status_code != 200:
        if message.get('retries', None):
            retries = message['retries']
        else:
            retries = 0
        if retries < app.conf.get('TOTAL_RETRIES', 3):
            message['retries'] = retries + 1
            task_process_myads.apply_async(args=(message, ),
                                           countdown=app.conf.get(
                                               'MYADS_RESEND_WINDOW', 3600))
            logger.warning(
                'Failed getting myADS setup for {0}; will try again later. Retry {1}'
                .format(userid, retries))
            return
        else:
            logger.warning(
                'Maximum number of retries attempted for {0}. myADS processing failed.'
                .format(userid))
            return

    if message.get('test_bibcode', None):
        # check that the solr searcher we're getting is still ok by querying for the test bibcode
        q = app.client.get(
            '{0}?q=identifier:{1}&fl=bibcode,identifier,entry_date'.format(
                app.conf.get('API_SOLR_QUERY_ENDPOINT'),
                message.get('test_bibcode')),
            headers={'Authorization': 'Bearer ' + app.conf.get('API_TOKEN')})

        fail = True
        if q.status_code != 200:
            logger.warning(
                'Error retrieving the test bibcode {0} from solr while processing for user {1}. Retrying'
                .format(message.get('test_bibcode'), userid))
        elif q.json()['response']['numFound'] == 0:
            logger.warning(
                'Test bibcode {0} not found in solr while processing for user {1}. Retrying'
                .format(message.get('test_bibcode'), userid))
        else:
            fail = False

        if fail:
            if message.get('solr_retries', None):
                retries = message['solr_retries']
            else:
                retries = 0
            if retries < app.conf.get('TOTAL_RETRIES', 3):
                message['solr_retries'] = retries + 1
                task_process_myads.apply_async(args=(message, ),
                                               countdown=app.conf.get(
                                                   'MYADS_SOLR_RESEND_WINDOW',
                                                   3600))
                logger.warning(
                    'Solr error occurred while processing myADS email for user {0}; rerunning. Retry {1}'
                    .format(userid, retries))
                return
            else:
                logger.warning(
                    'Maximum number of retries attempted for {0}. myADS processing failed: '
                    'solr searchers were not updated.'.format(userid))
                return

    # then execute each qid /vault/execute-query/qid
    setup = r.json()
    payload = []
    for s in setup:
        if s['frequency'] == message['frequency']:
            # only return 5 results, unless it's the daily arXiv posting, then return max
            # TODO should all stateful queries return all results or will this be overwhelming for some? well-cited
            # users can get 40+ new cites in one weekly astro update
            if s['frequency'] == 'daily' and s['stateful'] is False:
                s['rows'] = 2000
            else:
                s['rows'] = 5
            s['fields'] = 'bibcode,title,author_norm,identifier,year,bibstem'
            if s['type'] == 'query':
                qtype = 'general'
                try:
                    raw_results = utils.get_query_results(s)
                except RuntimeError:
                    if message.get('query_retries', None):
                        retries = message['query_retries']
                    else:
                        retries = 0
                    if retries < app.conf.get('TOTAL_RETRIES', 3):
                        message['query_retries'] = retries + 1
                        logger.warning(
                            'Error getting query results for user {0}. Retrying. Retry: {1}'
                            .format(userid, retries))
                        task_process_myads.apply_async(
                            args=(message, ),
                            countdown=app.conf.get('MYADS_RESEND_WINDOW',
                                                   3600))
                        return
                    else:
                        logger.warning(
                            'Maximum number of query retries attempted for user {0}; myADS processing '
                            'failed due to retrieving query results failures.'.
                            format(userid))
                        return
            elif s['type'] == 'template':
                qtype = s['template']
                try:
                    raw_results = utils.get_template_query_results(s)
                except RuntimeError:
                    if message.get('query_retries', None):
                        retries = message['query_retries']
                    else:
                        retries = 0
                    if retries < app.conf.get('TOTAL_RETRIES', 3):
                        message['query_retries'] = retries + 1
                        logger.warning(
                            'Error getting template query results for user {0}. Retrying. '
                            'Retry: {1}'.format(userid, retries))
                        task_process_myads.apply_async(
                            args=(message, ),
                            countdown=app.conf.get('MYADS_RESEND_WINDOW',
                                                   3600))
                        return
                    else:
                        logger.warning(
                            'Maximum number of query retries attempted for user {0}; myADS processing '
                            'failed due to retrieving query results failures.'.
                            format(userid))
                        return
            else:
                logger.warning(
                    'Wrong query type passed for query {0}, user {1}'.format(
                        s, userid))
                # skip this setup entry; no results were retrieved for it
                continue

            for r in raw_results:
                # for stateful queries, remove previously seen results, store new results
                if s['stateful']:
                    docs = r['results']
                    bibcodes = [doc['bibcode'] for doc in docs]
                    if s.get('qid', None):
                        good_bibc = app.get_recent_results(
                            user_id=userid,
                            qid=s['qid'],
                            input_results=bibcodes,
                            ndays=app.conf.get('STATEFUL_RESULTS_DAYS', 7))
                    else:
                        good_bibc = app.get_recent_results(
                            user_id=userid,
                            setup_id=s['id'],
                            input_results=bibcodes,
                            ndays=app.conf.get('STATEFUL_RESULTS_DAYS', 7))
                    results = [
                        doc for doc in docs if doc['bibcode'] in good_bibc
                    ]
                else:
                    results = r['results']

                payload.append({
                    'name': r['name'],
                    'query_url': r['query_url'],
                    'results': results,
                    'query': r['query'],
                    'qtype': qtype,
                    'id': s['id']
                })
        else:
            # wrong frequency for this round of processing
            pass

    if len(payload) == 0:
        logger.info(
            'No payload for user {0} for the {1} email. No email was sent.'.
            format(userid, message['frequency']))
        return

    # if test email address provided, send there; otherwise fetch user email address
    if message.get('test_send_to', None):
        email = message.get('test_send_to')
    else:
        email = utils.get_user_email(userid=userid)

    if message['frequency'] == 'daily':
        subject = 'Daily arXiv myADS Notification'
    else:
        subject = 'Weekly myADS Notification'

    payload_plain = utils.payload_to_plain(payload)
    if len(payload) < app.conf.get('NUM_QUERIES_TWO_COL', 3):
        payload_html = utils.payload_to_html(payload,
                                             col=1,
                                             frequency=message['frequency'],
                                             email_address=email)
    else:
        payload_html = utils.payload_to_html(payload,
                                             col=2,
                                             frequency=message['frequency'],
                                             email_address=email)
    msg = utils.send_email(email_addr=email,
                           email_template=myADSTemplate,
                           payload_plain=payload_plain,
                           payload_html=payload_html,
                           subject=subject)

    if msg:
        # update author table w/ last sent datetime
        with app.session_scope() as session:
            q = session.query(AuthorInfo).filter_by(id=userid).one()
            q.last_sent = adsputils.get_date()

            session.commit()

    else:
        if message.get('send_retries', None):
            retries = message['send_retries']
        else:
            retries = 0
        if retries < app.conf.get('TOTAL_RETRIES', 3):
            message['send_retries'] = retries + 1
            task_process_myads.apply_async(args=(message, ),
                                           countdown=app.conf.get(
                                               'MYADS_RESEND_WINDOW', 3600))
            logger.warning(
                'Error sending myADS email for user {0}, email {1}; rerunning. Retry {2}'
                .format(userid, email, retries))
            return
        else:
            logger.warning(
                'Maximum number of retries attempted for {0}. myADS processing failed at sending the email.'
                .format(userid))
            return
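task_process_myads takes a plain dict: userid and frequency are required, force controls reprocessing, and the retry counters (retries, solr_retries, query_retries, send_retries) are added by the task itself when it re-queues. A minimal dispatch sketch follows, with the tasks module path as an assumption.

# Hypothetical dispatch; the tasks module path is an assumption.
from myadsp import tasks

message = {'userid': 123,            # adsws user ID
           'frequency': 'weekly',    # 'daily' or 'weekly'
           'force': False,           # reprocess even if an email was already sent today
           'test_send_to': None}     # or an address to redirect the email to for testing

# asynchronously via the Celery broker...
tasks.task_process_myads.delay(message)
# ...or synchronously in the current process
tasks.task_process_myads(message)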
Example #5
def _arxiv_ingest_complete(date=None, sleep_delay=60, sleep_timeout=7200, admin_email=None):
    """
    Check if new arXiv records are in Solr - run before running myADS processing
    :param date: date to check arXiv records for; default is set by days-delta from today in config (times in local time)
    :param sleep_delay: number of seconds to sleep between retries
    :param sleep_timeout: number of seconds to retry in total before timing out completely
    :param admin_email: if provided, a failure notification is emailed to this address
    :return: test bibcode or None
    """

    if not date:
        date = (datetime.datetime.today() - datetime.timedelta(days=config.get('ARXIV_TIMEDELTA_DAYS'))).strftime('%Y-%m-%d')
    else:
        date = get_date(date).strftime('%Y-%m-%d')

    arxiv_file = os.path.join(config.get('ARXIV_INCOMING_ABS_DIR'), date, 'new_records.tsv')

    arxiv_records = []
    try:
        with open(arxiv_file, 'rt') as flist:
            for l in flist.readlines():
                # sample line: 2012.14424	oai/arXiv.org/2012/14424
                arxiv_records.append(l.split()[0])
    except IOError:
        logger.warning('arXiv ingest file not found. Exiting.')
        if admin_email:
            msg_text = 'The arXiv ingest file for today was not found.'
            msg = utils.send_email(email_addr=admin_email,
                                   payload_plain=msg_text,
                                   payload_html=msg_text,
                                   subject='arXiv ingest failed')
        return None

    arxiv_records.sort()

    # get most recent arXiv id to test ingest later
    last_id = arxiv_records.pop()

    total_delay = 0
    while total_delay < sleep_timeout:
        total_delay += sleep_delay
        r = app.client.get('{0}?q=identifier:{1}&fl=bibcode,identifier,entry_date'.format(config.get('API_SOLR_QUERY_ENDPOINT'), last_id),
                           headers={'Authorization': 'Bearer ' + config.get('API_TOKEN')})
        if r.status_code != 200:
            time.sleep(sleep_delay)
            logger.error('Error retrieving record for {0} from Solr ({1} {2}), retrying'.
                         format(last_id, r.status_code, r.text))
            continue

        numfound = r.json()['response']['numFound']
        if numfound == 0:
            # nothing found, try again after a sleep
            time.sleep(sleep_delay)
            logger.info('arXiv ingest not complete (test arXiv id: {0}). Sleeping {1}s, for a total delay of {2}s.'
                        .format(last_id, sleep_delay, total_delay))
            continue
        if numfound > 1:
            # returning this as true for now, since technically something was found
            logger.error('Too many records returned for id {0}'.format(last_id))

        logger.info('Numfound: {0} for test id {1}. Response: {2}. URL: {3}'.format(numfound, last_id,
                                                                                    json.dumps(r.json()), r.url))

        # check number of bibcodes from ingest
        if get_date().weekday() == 0:
            start_date = (get_date() - datetime.timedelta(days=3)).date()
        else:
            start_date = (get_date() - datetime.timedelta(days=1)).date()
        beg_pubyear = (get_date() - datetime.timedelta(days=180)).year
        q = app.client.get('{0}?q={1}'.format(config.get('API_SOLR_QUERY_ENDPOINT'),
                                              quote_plus('bibstem:arxiv entdate:["{0}Z00:00" TO NOW] '
                                                         'pubdate:[{1}-00 TO *]'.format(start_date, beg_pubyear))),
                           headers={'Authorization': 'Bearer ' + config.get('API_TOKEN')})
        logger.info('Total number of arXiv bibcodes ingested: {}'.format(q.json()['response']['numFound']))

        return last_id

    logger.warning('arXiv ingest did not complete within the {0}s timeout limit. Exiting.'.format(sleep_timeout))
    if admin_email:
        msg_text = 'The arXiv ingest was not replicated to production Solr before timeout reached.'
        msg = utils.send_email(email_addr=admin_email,
                               payload_plain=msg_text,
                               payload_html=msg_text,
                               subject='arXiv ingest failed')

    return None
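_arxiv_ingest_complete polls Solr until the newest identifier from the day's arXiv ingest file is retrievable, and returns that identifier so the caller can pass it as test_bibcode to process_myads, as the driver in Example #3 does. A sketch of that wiring, assuming the same surrounding script:

# Hypothetical wiring; mirrors the driver shown in Example #3.
test_id = _arxiv_ingest_complete(sleep_delay=300, sleep_timeout=36000,
                                 admin_email='ops@example.org')
if test_id:
    # test_id is the most recent arXiv identifier confirmed to be in Solr
    process_myads(frequency='daily', test_bibcode=test_id,
                  admin_email='ops@example.org')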
Example #6
def _astro_ingest_complete(date=None, sleep_delay=60, sleep_timeout=7200, admin_email=None):
    """
    Check if new astronomy records are in Solr; run before weekly processing
    :param date: date to check the astronomy bibcode list's last-updated timestamp against
    :param sleep_delay: number of seconds to sleep between retries
    :param sleep_timeout: number of seconds to retry in total before timing out completely
    :param admin_email: if provided, a failure notification is emailed to this address
    :return: test bibcode or None
    """

    if not date:
        date = (datetime.datetime.today() - datetime.timedelta(days=config.get('ASTRO_TIMEDELTA_DAYS')))
    else:
        date = get_date(date)

    astro_file = config.get('ASTRO_INCOMING_DIR') + 'matches.input'

    # make sure file is present and check modified datestamp on file - should be recent (otherwise contains old data)
    try:
        mod_date = datetime.datetime.fromtimestamp(os.path.getmtime(astro_file))
    except OSError:
        mod_date = None

    # if the file is old or missing, sleep until the file is present and updated
    if not mod_date or mod_date < date:
        total_delay = 0
        while total_delay < sleep_timeout:
            total_delay += sleep_delay
            time.sleep(sleep_delay)
            try:
                mod_date = datetime.datetime.fromtimestamp(os.path.getmtime(astro_file))
            except OSError:
                mod_date = None
            if mod_date and mod_date > date:
                break
        else:
            # timeout reached before astronomy update completed
            logger.warning('Astronomy update did not complete within the {0}s timeout limit. Exiting.'.format(sleep_timeout))

            if admin_email:
                msg_text = 'Astronomy ingest file has not been updated recently, and timeout limit for waiting for ' \
                           'update has been reached.'
                msg = utils.send_email(email_addr=admin_email,
                                       payload_plain=msg_text,
                                       payload_html=msg_text,
                                       subject='Astronomy ingest failed')

            return None

    # make sure the ingest file exists and has enough bibcodes
    total_delay = 0
    while total_delay < sleep_timeout:
        astro_records = []
        try:
            with open(astro_file, 'rt') as flist:
                for l in flist.readlines():
                    # sample line: 2019A&A...632A..94J     K58-37447
                    astro_records.append(l.split()[0])
        except IOError:
            time.sleep(sleep_delay)
            total_delay += sleep_delay
            logger.warning('Error opening astronomy ingest file. Sleeping {0}s, for a total delay of {1}s'.
                           format(sleep_delay, total_delay))
            continue

        if len(astro_records) < 10:
            time.sleep(sleep_delay)
            total_delay += sleep_delay
            logger.warning('Astronomy ingest file too small - ingest not complete. Sleeping {0}s, for a total delay of {1}s'.
                           format(sleep_delay, total_delay))
            continue
        else:
            break
    else:
        if admin_email:
            msg_text = 'The astronomy ingest file has been updated recently, but there was an error opening it ' \
                       'and/or it has very few records. The timeout limit for waiting for the update has been reached.'
            msg = utils.send_email(email_addr=admin_email,
                                   payload_plain=msg_text,
                                   payload_html=msg_text,
                                   subject='Astronomy ingest failed')
        return None

    # get several randomly selected bibcodes, in case one had ingest issues
    sample = random.sample(astro_records, config.get('ASTRO_SAMPLE_SIZE'))

    # check that the astronomy records have made it into solr
    total_delay = 0
    while total_delay < sleep_timeout:
        num_sampled = 0
        for s in sample:
            num_sampled += 1
            r = app.client.get('{0}?q=identifier:{1}&fl=bibcode,identifier,entry_date'.format(config.get('API_SOLR_QUERY_ENDPOINT'), s),
                               headers={'Authorization': 'Bearer ' + config.get('API_TOKEN')})
            # if there's a solr error, sleep then move to the next bibcode
            if r.status_code != 200:
                time.sleep(sleep_delay)
                total_delay += sleep_delay
                logger.error('Error retrieving bibcode {0} from Solr ({1} {2}), sleeping {3}s, for a total delay of {4}s'.
                             format(s, r.status_code, r.text, sleep_delay, total_delay))
                continue

            numfound = r.json()['response']['numFound']
            if numfound == 0:
                # nothing found - if all bibcodes in the sample were tried, sleep then start the while loop again
                if num_sampled == config.get('ASTRO_SAMPLE_SIZE'):
                    time.sleep(sleep_delay)
                    total_delay += sleep_delay
                    logger.warning('Astronomy ingest not complete for all in sample (sample: {0}). Sleeping {1}s, for a total delay of {2}s.'
                                   .format(sample, sleep_delay, total_delay))
                # if we haven't tried the others in the sample yet, try the rest
                else:
                    logger.info(
                        'Astronomy ingest not complete (test astro bibcode: {0}). Trying the next in the sample.'
                        .format(s))
                continue
            elif numfound > 1:
                # returning this as true for now, since technically something was found
                logger.error('Too many records returned for bibcode {0}'.format(s))

            logger.info('Numfound: {0} for test bibcode {1}. Response: {2}. URL: {3}'.format(numfound, s,
                                                                                             json.dumps(r.json()),
                                                                                             r.url))
            return s

    logger.warning('Astronomy ingest did not complete within the {0}s timeout limit. Exiting.'.format(sleep_timeout))

    if admin_email:
        msg_text = 'The astronomy ingest was not replicated to production Solr before timeout reached.'
        msg = utils.send_email(email_addr=admin_email,
                               payload_plain=msg_text,
                               payload_html=msg_text,
                               subject='Astronomy ingest failed')

    return None
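_astro_ingest_complete follows the same pattern as the arXiv check, but first waits for the matches.input file to be refreshed and then verifies a random sample of its bibcodes against Solr, returning the first bibcode it finds. The configuration keys read by these two ingest checks are summarized below; the values are illustrative placeholders, not production settings (note that ASTRO_INCOMING_DIR needs a trailing slash because it is concatenated directly with the filename).

# Illustrative config values; only the key names are taken from the code above.
ARXIV_INCOMING_ABS_DIR = '/data/arxiv/incoming'     # new_records.tsv sits in a per-date subdirectory
ARXIV_TIMEDELTA_DAYS = 1                            # how many days back to look for the arXiv ingest file
ASTRO_INCOMING_DIR = '/data/astro/incoming/'        # trailing slash required (plain string concatenation)
ASTRO_TIMEDELTA_DAYS = 7                            # how recently the astronomy file must have been updated
ASTRO_SAMPLE_SIZE = 5                               # bibcodes sampled to verify the Solr ingest
API_SOLR_QUERY_ENDPOINT = 'https://api.adsabs.harvard.edu/v1/search/query'  # assumed endpoint
API_TOKEN = 'xxxx'                                  # placeholder token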