def test_init_from_fetched_article(self):
    # delete corresponding db fixture object so we can reload it
    HarvestRecord.objects.get(pmcid=self.article.docid).delete()
    # mock identifiable authors to avoid actual look-up
    with patch.object(self.article, 'identifiable_authors',
                      new=Mock(return_value=[])):
        record = HarvestRecord.init_from_fetched_article(self.article)
        self.assertEqual(self.article.article_title, record.title)
        self.assertEqual(self.article.docid, record.pmcid)
        self.assertEqual(self.article.fulltext_available, record.fulltext)
        self.assertEqual(0, record.authors.count())
        self.assertEqual(self.article.serialize(pretty=True),
                         record.content.read(),
                         'article xml should be saved in content file field')

    # remove the new record so we can test creating it again
    record.content.delete()
    record.delete()

    # simulate identifiable authors
    testauthor = User(username='******')
    testauthor.save()
    with patch.object(self.article, 'identifiable_authors',
                      new=Mock(return_value=[testauthor])):
        record = HarvestRecord.init_from_fetched_article(self.article)
        self.assertEqual(1, record.authors.count())
        self.assertTrue(testauthor in record.authors.all())
        record.content.delete()
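# The test above pins down the behaviour of
# HarvestRecord.init_from_fetched_article without showing it. The sketch
# below is only a reading of those assertions: the field names come from
# the test, but the model definition, the file name, and the save order
# are assumptions, not the project's actual implementation.

from django.contrib.auth.models import User
from django.core.files.base import ContentFile
from django.db import models


class HarvestRecord(models.Model):
    # fields inferred from the test assertions; the real model may differ
    title = models.TextField()
    pmcid = models.IntegerField(unique=True)
    fulltext = models.BooleanField(default=False)
    content = models.FileField(upload_to='harvest')
    authors = models.ManyToManyField(User)

    @classmethod
    def init_from_fetched_article(cls, article):
        record = cls(title=article.article_title, pmcid=article.docid,
                     fulltext=article.fulltext_available)
        # store the serialized article XML in the content file field,
        # matching the test's record.content.read() assertion
        record.content.save('%s.xml' % article.docid,
                            ContentFile(article.serialize(pretty=True)),
                            save=False)
        record.save()
        # associate whatever identifiable authors were found (possibly none)
        record.authors.add(*article.identifiable_authors())
        return record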
def handle(self, *args, **options):
    self.verbosity = int(options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
    self.v_normal = 1
    # maximum number of articles to harvest in this run, if specified
    self.max_articles = int(options['max_articles']) if options['max_articles'] \
        else None
    self.min_date = options['min_date']
    self.max_date = options['max_date']
    self.auto_date = options['auto_date']

    stats = defaultdict(int)
    done = False
    chunks = self.article_chunks(**options)

    if options['progress']:
        pbar = ProgressBar(widgets=[Percentage(), ' ', ETA(), ' ', Bar()],
                           maxval=chunks.count).start()

    for p in chunks.page_range:
        if self.verbosity > self.v_normal:
            self.stdout.write('Starting article chunk.\n')

        for article in chunks.page(p).object_list:
            stats['articles'] += 1

            if self.verbosity > self.v_normal:
                # python2.6 fails with ascii encoding errors (on unicode
                # titles) unless we explicitly encode output to
                # sys.stdout.write
                msg = u'Processing [%s] "%s"\n' % \
                    (article.docid, article.article_title)
                self.stdout.write(msg.encode(self.stdout.encoding))

            if HarvestRecord.objects.filter(pmcid=article.docid).exists():
                if self.verbosity >= self.v_normal:
                    self.stdout.write('[%s] has already been harvested; skipping\n' \
                        % (article.docid,))
                continue

            if article.identifiable_authors(derive=True):
                try:
                    # don't save when simulated
                    if options['simulate']:
                        self.stdout.write('Not saving [%s] (simulated run)\n'
                                          % article.docid)
                    # really save when not simulated
                    else:
                        HarvestRecord.init_from_fetched_article(article)
                        stats['harvested'] += 1
                        if self.max_articles and \
                                stats['harvested'] >= self.max_articles:
                            done = True
                            break
                except Exception as err:
                    self.stdout.write('Error creating record from article: %s\n'
                                      % err)
                    stats['errors'] += 1
            else:
                if self.verbosity >= self.v_normal:
                    self.stdout.write('[%s] has no identifiable authors; skipping\n' \
                        % (article.docid,))
                stats['noauthor'] += 1

            if options['progress']:
                pbar.update(stats['articles'])

        if done:
            if self.verbosity > self.v_normal:
                self.stdout.write('Harvested %s articles ... stopping\n'
                                  % stats['harvested'])
            break

    if options['progress']:
        pbar.finish()

    # summarize what was done
    if self.date_opts:
        self.stdout.write('Date Range: %(mindate)s - %(maxdate)s' % self.date_opts)
    self.stdout.write('\nArticles processed: %(articles)d\n' % stats)
    self.stdout.write('Articles harvested: %(harvested)d\n' % stats)
    self.stdout.write('Errors harvesting articles: %(errors)d\n' % stats)
    self.stdout.write('Articles skipped (no identifiable authors): %(noauthor)d\n'
                      % stats)
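# handle() touches article_chunks() only through a Paginator-style
# interface (.count, .page_range, and .page(p).object_list) and expects it
# to leave self.date_opts set for the summary output. A minimal sketch of
# that contract, assuming Django's Paginator and a hypothetical
# fetch_articles() helper standing in for the real PMC query:

from django.core.paginator import Paginator


def article_chunks(self, **options):
    # record the effective date range for the summary in handle()
    self.date_opts = {'mindate': self.min_date, 'maxdate': self.max_date}
    # fetch_articles is a hypothetical stand-in for the actual PMC
    # search; handle() only needs an iterable of article objects
    articles = fetch_articles(mindate=self.min_date, maxdate=self.max_date)
    # Paginator provides exactly the members handle() relies on:
    # .count for the progress bar, .page_range and .page(p).object_list
    # for chunked iteration
    return Paginator(articles, 20)

# Any object exposing those three members would work just as well; the
# chunking simply bounds memory use when a date range matches many articles.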