Example #1
def remove_duplicates(by='pmid'):
    """Remove duplicate articles by field.

    :param str by: Article field to identify duplicates

    """
    counts = collections.defaultdict(int)
    values = [
        value[by]
        for value in Article._storage[0].store.find({}, {by: 1})
    ]
    for value in values:
        counts[value] += 1

    for value, count in counts.items():
        if count == 1:
            continue
        articles = list(Article.find(Q(by, 'eq', value)))
        # Keep the first matching article and delete the rest.
        for duplicate in articles[1:]:
            logger.debug('Deleting duplicate record: {}'.format(value))
            Article.remove_one(duplicate)
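
The counting pass above streams every stored value to the client. A minimal alternative sketch, assuming the backing store is also reachable as a pymongo collection (Example #2 uses mongo['article']) and a pymongo version whose aggregate returns an iterable cursor, lets MongoDB report the duplicated values directly:

# Hypothetical server-side duplicate detection; `collection` stands in for
# the pymongo collection backing Article, e.g. mongo['article'].
pipeline = [
    {'$group': {'_id': '$' + by, 'count': {'$sum': 1}}},
    {'$match': {'count': {'$gt': 1}}},
]
for group in collection.aggregate(pipeline):
    logger.debug('{0} occurs {1} times'.format(group['_id'], group['count']))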
Example #2
def add_missing(query, max_count, randomize=False):
    """Search PubMed for articles and scrape documents.

    :param str query: PubMed query
    :param int max_count: Maximum number of articles to process
    :param bool randomize: Randomize list of articles to fetch
    :return: Added article objects
    """
    pmids = pubtools.search_pmids(query)
    stored_pmids = [
        article['pmid'] for article in mongo['article'].find({}, {'pmid': 1})
    ]

    missing_pmids = list(set(pmids) - set(stored_pmids))
    logger.warning('Found {0} articles to add.'.format(len(missing_pmids)))

    # Shuffle before slicing so `randomize` selects a random subset of the
    # missing articles rather than a fixed prefix in arbitrary set order.
    if randomize:
        random.shuffle(missing_pmids)
    pmids_to_add = missing_pmids[:max_count]

    records = pubtools.download_pmids(pmids_to_add)

    scraper = SCRAPE_CLASS(**SCRAPE_KWARGS)

    added = []

    for pmid, record in zip(pmids_to_add, records):
        logger.debug('Adding article {}'.format(pmid))
        article = Article.from_record(record)
        article.scrape(scraper)
        article.tag()
        added.append(article)

    return added
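
Note that the final zip assumes pubtools.download_pmids returns records in the same order as the PMIDs it was given; if it can drop or reorder results, the pairing would silently misalign. A hypothetical invocation, assuming a configured mongo connection and scraper (the query string is illustrative only):

added = add_missing('hippocampus[tiab]', max_count=50, randomize=True)
logger.info('Added {0} new articles.'.format(len(added)))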
Example #3
def rescrape(article_id, **kwargs):
    """Re-scrape a single article by id, logging failures instead of raising."""
    logger.info('Re-scraping article {0}'.format(article_id))
    try:
        article = Article.load(article_id)
        article.scrape(**kwargs)
    except Exception as error:
        logger.error('Error scraping article {0}'.format(article_id))
        logger.exception(error)
Example #4
def retag(article_id, **kwargs):
    """Re-tag a single article by id, logging failures instead of raising."""
    logger.info('Re-tagging article {0}'.format(article_id))
    try:
        article = Article.load(article_id)
        article.tag(**kwargs)
    except Exception as error:
        logger.error('Error tagging article {0}'.format(article_id))
        logger.exception(error)
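
Both helpers deliberately catch Exception and log rather than re-raise: they are mapped over a worker pool in Example #7, where a single uncaught exception would abort the entire batch.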
Example #5
def rescrape(command):
    """Re-scrape the article described by `command`, logging failures."""
    logger.info('Re-scraping article {0}'.format(command.article_id))
    try:
        article = Article.load(command.article_id)
        article.scrape(overwrite=command.overwrite)
    except Exception as error:
        logger.error('Error scraping article {0}'.format(command.article_id))
        logger.exception(error)
Example #6
def retag(command):
    """Re-tag the article described by `command`, logging failures."""
    logger.info('Re-tagging article {0}'.format(command.article_id))
    try:
        article = Article.load(command.article_id)
        article.tag(overwrite=command.overwrite)
    except Exception as error:
        logger.error('Error tagging article {0}'.format(command.article_id))
        logger.exception(error)
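
Examples #5 and #6 take a single command object instead of (article_id, **kwargs) so they can be passed straight to pool.map, which hands each worker exactly one argument (see Example #8). The source never shows how the command classes are declared; a minimal sketch using namedtuples, which pickle cleanly across processes, could be:

import collections

# Hypothetical definitions: the field names follow the attribute access in
# Examples #5, #6, and #8 (command.article_id, command.overwrite).
RescrapeCommand = collections.namedtuple('RescrapeCommand', ['article_id', 'overwrite'])
RetagCommand = collections.namedtuple('RetagCommand', ['article_id', 'overwrite'])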
Example #7
def batch_rescrape(processes, query=None, limit=None, **kwargs):
    """Re-scrape matching articles in parallel using a process pool."""
    pool = multiprocessing.Pool(processes=processes)
    articles = Article.find(query)
    if limit:
        articles = articles.limit(limit)
    results = pool.map(
        functools.partial(rescrape, **kwargs),
        (article._id for article in articles),
    )
    pool.close()
    pool.join()
    return results
Example #8
def batch_rescrape(processes, query=None, limit=None, overwrite=False):
    """Re-scrape matching articles in parallel via picklable command objects."""
    pool = multiprocessing.Pool(processes=processes)
    articles = Article.find(query)
    if limit:
        articles = articles.limit(limit)
    results = pool.map(
        rescrape,
        (RescrapeCommand(article._id, overwrite) for article in articles),
    )
    pool.close()
    pool.join()
    return results
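
Examples #7 and #8 solve the same problem two ways: pool.map needs a picklable callable of one argument, which functools.partial(rescrape, **kwargs) satisfies (a lambda would not pickle), while the command-object variant moves the extra options into the mapped values instead. A hypothetical call using only parameters from the signature above:

# Re-scrape up to 100 matching articles across four worker processes.
batch_rescrape(4, limit=100, overwrite=True)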
Example #9
def count_verified(threshold=VERIFY_THRESHOLD):
    """Count the number of downloaded and verified documents across all
    articles.

    :param float threshold: Document verification threshold
    :return: Tuple of total and verified dictionaries, each mapping document
        types to counts
    """
    count = defaultdict(int)
    verified = defaultdict(int)

    for article in Article.find():
        for type_, field in DOCUMENT_TYPES_TO_FIELDS.items():
            value = getattr(article, field)
            if value:
                count[type_] += 1
                if value.verification_score > threshold:
                    verified[type_] += 1

    return count, verified
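
A small usage sketch for the counter above, reporting per-type verification rates; it relies only on names defined in the example itself:

count, verified = count_verified()
for type_ in sorted(count):
    print('{0}: {1}/{2} documents verified'.format(
        type_, verified[type_], count[type_]))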
Example #10
def update_dates(overwrite=False):
    """Update article dates, recomputing existing dates only if `overwrite`."""
    query = None if overwrite else Q('date', 'eq', None)
    articles = Article.find(query)
    for article in articles:
        article.update_date()
        article.save()
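
Since the query falls back to Q('date', 'eq', None), the default call touches only articles whose date is still unset. A hypothetical pair of invocations:

update_dates()                # backfill missing dates only
update_dates(overwrite=True)  # recompute dates for every article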