def rescrape(processes=2, months=6, limit=50, overwrite=False):
    """Re-scrape records that are stale (or never scraped) and unverified.

    Builds a query and hands it to ``retag.batch_rescrape``. A record is
    selected when its ``date_last_scraped`` is earlier than ``months`` months
    before the current UTC time, or is ``None``, and it has no ``verified.0``
    entry.

    :param int processes: Number of processes, forwarded to ``batch_rescrape``
    :param int months: Minimum age, in months, of the last scrape
    :param int limit: Forwarded to ``batch_rescrape`` as ``limit``
    :param bool overwrite: Forwarded to ``batch_rescrape`` as ``overwrite``
    """
    # Imports are kept function-local, as in the original.
    from dateutil.relativedelta import relativedelta
    from modularodm import Q
    from scripts import retag

    cutoff_date = datetime.datetime.utcnow() - relativedelta(months=months)

    # Compare against the cutoff, not "now": comparing against the current
    # time made `months` a no-op and matched every previously scraped record.
    query = (
        (
            Q('date_last_scraped', 'lt', cutoff_date) |
            Q('date_last_scraped', 'eq', None)
        ) &
        # NOTE(review): presumably "verified list has no first element",
        # i.e. nothing has been verified yet -- confirm against the schema.
        Q('verified.0', 'exists', False)
    )

    retag.batch_rescrape(
        processes=processes,
        query=query,
        limit=limit,
        overwrite=overwrite,
    )
def rescrape(processes=1, months=6, limit=50, missing='any', overwrite=False):
    """Select stale articles and pass them to ``retag.batch_rescrape``.

    An article is selected when it was last scraped more than ``months``
    months before the current UTC time, or has no scrape date at all, and it
    also satisfies the ``missing`` constraint on its ``verified`` field.

    :param int processes: Number of processes to launch
    :param int months: Minimum time since last scraped
    :param int limit: Max number of articles to scrape
    :param str missing: Missing document type (html, pdf, pmc, any)
    :param bool overwrite: Overwrite existing articles
    """
    from dateutil.relativedelta import relativedelta
    from modularodm import Q
    from scripts import retag

    stale_before = datetime.datetime.utcnow() - relativedelta(months=months)

    # Scrape-date condition: older than the cutoff, or never scraped.
    scraped_too_long_ago = Q('date_last_scraped', 'lt', stale_before)
    never_scraped = Q('date_last_scraped', 'eq', None)

    # Verification condition: 'any' selects records with no `verified.0`
    # entry; any other value selects records whose `verified` field is not
    # equal to that value.
    if missing == 'any':
        verified_filter = Q('verified.0', 'exists', False)
    else:
        verified_filter = Q('verified', 'ne', missing)

    retag.batch_rescrape(
        processes=processes,
        query=(scraped_too_long_ago | never_scraped) & verified_filter,
        limit=limit,
        overwrite=overwrite,
    )