Example #1
    def test_init_from_fetched_article(self):
        # delete corresponding db fixture object so we can reload it
        HarvestRecord.objects.get(pmcid=self.article.docid).delete()
        
        # mock identifiable authors to avoid actual look-up
        with patch.object(self.article, 'identifiable_authors', new=Mock(return_value=[])):
            record = HarvestRecord.init_from_fetched_article(self.article)
            self.assertEqual(self.article.article_title, record.title)
            self.assertEqual(self.article.docid, record.pmcid)
            self.assertEqual(self.article.fulltext_available, record.fulltext)
            self.assertEqual(0, record.authors.count())

            self.assertEqual(
                self.article.serialize(pretty=True), record.content.read(),
                'article xml should be saved in content file field')
            
            # remove the new record so we can test creating it again
            record.content.delete()
            record.delete()

        # simulate identifiable authors 
        testauthor = User(username='******')
        testauthor.save()
        with patch.object(self.article, 'identifiable_authors',
                          new=Mock(return_value=[testauthor])):
            record = HarvestRecord.init_from_fetched_article(self.article)
            self.assertEqual(1, record.authors.count())
            self.assertIn(testauthor, record.authors.all())
            record.content.delete()
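
The classmethod under test is not included in this example. Below is a minimal sketch of what it plausibly does, reconstructed from the assertions above; only the attribute names (title, pmcid, fulltext, authors, content) come from the test, while the field types, the upload path, and the ContentFile usage are assumptions.

    from django.contrib.auth.models import User
    from django.core.files.base import ContentFile
    from django.db import models

    class HarvestRecord(models.Model):
        # field names inferred from the test; types and options are guesses
        title = models.TextField()
        pmcid = models.IntegerField(unique=True)
        fulltext = models.BooleanField(default=False)
        authors = models.ManyToManyField(User)
        content = models.FileField(upload_to='harvest')

        @classmethod
        def init_from_fetched_article(cls, article):
            # copy the basic metadata from the fetched article
            record = cls(title=article.article_title,
                         pmcid=article.docid,
                         fulltext=article.fulltext_available)
            record.save()
            # store the serialized article xml in the content file field
            record.content.save('%s.xml' % article.docid,
                                ContentFile(article.serialize(pretty=True)))
            # associate any identifiable authors with the new record
            record.authors.add(*article.identifiable_authors())
            return record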
Example #2
    def test_init_from_fetched_article(self):
        # delete corresponding db fixture object so we can reload it
        HarvestRecord.objects.get(pmcid=self.article.docid).delete()

        # mock identifiable authors to avoid actual look-up
        with patch.object(self.article,
                          'identifiable_authors',
                          new=Mock(return_value=[])):
            record = HarvestRecord.init_from_fetched_article(self.article)
            self.assertEqual(self.article.article_title, record.title)
            self.assertEqual(self.article.docid, record.pmcid)
            self.assertEqual(self.article.fulltext_available, record.fulltext)
            self.assertEqual(0, record.authors.count())

            self.assertEqual(
                self.article.serialize(pretty=True), record.content.read(),
                'article xml should be saved in content file field')

            # remove the new record so we can test creating it again
            record.content.delete()
            record.delete()

        # simulate identifiable authors
        testauthor = User(username='******')
        testauthor.save()
        with patch.object(self.article,
                          'identifiable_authors',
                          new=Mock(return_value=[testauthor])):
            record = HarvestRecord.init_from_fetched_article(self.article)
            self.assertEqual(1, record.authors.count())
            self.assertIn(testauthor, record.authors.all())
            record.content.delete()
Example #3
    def handle(self, *args, **options):
        today = datetime.date.today()
        formatted_today = today.strftime('%Y/%m/%d')
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        # number of articles we want to harvest in this run
        self.max_articles = int(options['max_articles']) if options['max_articles'] else None
        # date range is hardcoded here instead of being read from options
        self.min_date = '2015/09/01'
        self.max_date = formatted_today
        self.auto_date = False
        self.v_normal = 1

        stats = defaultdict(int)
        done = False
        chunks = self.article_chunks(**options)

        if options['progress']:
            pbar = ProgressBar(widgets=[Percentage(), ' ', ETA(),  ' ', Bar()], maxval=chunks.count).start()
        for p in chunks.page_range:
            if self.verbosity > self.v_normal:
                self.stdout.write('Starting article chunk.\n')

            for article in chunks.page(p).object_list:
                stats['articles'] += 1
                if self.verbosity > self.v_normal:
                    # python2.6 fails with ascii encoding errors (on unicode
                    # titles) unless we explicitly encode output to
                    # sys.stdout.write
                    msg = u'Processing [%s] "%s"\n' % \
                          (article.docid, article.article_title)
                    self.stdout.write(msg.encode(self.stdout.encoding))
                
                if HarvestRecord.objects.filter(pmcid=article.docid).exists():
                    if self.verbosity >= self.v_normal:
                        self.stdout.write('[%s] has already been harvested; skipping\n' \
                                          % (article.docid,))
                    continue
                    
                if article.identifiable_authors(derive=True):
                    try:
                        # don't save when simulated
                        if options['simulate']:
                            self.stdout.write('Not saving [%s] (simulated run)\n' % article.docid)
                        # really save when not simulated
                        else:
                            HarvestRecord.init_from_fetched_article(article)
                        stats['harvested'] += 1
                        if self.max_articles and stats['harvested'] >= self.max_articles:
                            done = True
                            break
                    except Exception as err:
                        self.stdout.write('Error creating record from article: %s\n' % err)
                        stats['errors'] += 1
                        
                else:
                    if self.verbosity >= self.v_normal:
                        self.stdout.write('[%s] has no identifiable authors; skipping\n' \
                                          % (article.docid,))
                    stats['noauthor'] += 1

                if options['progress']:
                    pbar.update(stats['articles'])
            if done:
                if self.verbosity > self.v_normal:
                    self.stdout.write('Harvested %s articles ... stopping \n' % stats['harvested'])
                break
        if options['progress']:
            pbar.finish()

        # summarize what was done
        if self.date_opts:
            self.stdout.write('Date Range: %(mindate)s - %(maxdate)s' % self.date_opts)
        self.stdout.write('\nArticles processed: %(articles)d\n' % stats)
        self.stdout.write('Articles harvested: %(harvested)d\n' % stats)
        self.stdout.write('Errors harvesting articles: %(errors)d\n' % stats)
        self.stdout.write('Articles skipped (no identifiable authors): %(noauthor)d\n' % stats)
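
handle() depends on a helper, article_chunks, that is not shown here. Judging from the calls made on its return value (count, page_range, page(p).object_list) it returns a django Paginator, and it evidently also sets self.date_opts, which the summary at the end reads. A rough sketch under those assumptions; fetch_articles and the chunk size are hypothetical stand-ins for the project's real fetch layer.

    from django.core.paginator import Paginator

    def article_chunks(self, **options):
        # record the date range actually used, for the summary report
        self.date_opts = {'mindate': self.min_date, 'maxdate': self.max_date}
        # fetch_articles is a hypothetical stand-in for the real query layer
        # (e.g. an eutils/EFetch search over the configured date range)
        articles = fetch_articles(mindate=self.min_date,
                                  maxdate=self.max_date)
        # page through the results in fixed-size chunks to bound memory use;
        # Paginator provides the count / page_range / page() API used above
        return Paginator(articles, 20)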
Example #4
    def handle(self, *args, **options):
        self.verbosity = int(options['verbosity'])    # 1 = normal, 0 = minimal, 2 = all
        # number of articles we want to harvest in this run
        self.max_articles = int(options['max_articles']) if options['max_articles'] else None

        self.min_date = options['min_date']
        self.max_date = options['max_date']
        self.auto_date = options['auto_date']
        self.v_normal = 1

        stats = defaultdict(int)
        done = False
        chunks = self.article_chunks(**options)

        if options['progress']:
            pbar = ProgressBar(widgets=[Percentage(), ' ', ETA(),  ' ', Bar()], maxval=chunks.count).start()
        for p in chunks.page_range:
            if self.verbosity > self.v_normal:
                self.stdout.write('Starting article chunk.\n')

            for article in chunks.page(p).object_list:
                stats['articles'] += 1

                if self.verbosity > self.v_normal:
                    # python2.6 fails with ascii encoding errors (on unicode
                    # titles) unless we explicitly encode output to
                    # sys.stdout.write
                    msg = u'Processing [%s] "%s"\n' % \
                          (article.docid, article.article_title)
                    self.stdout.write(msg.encode(self.stdout.encoding))
                
                if HarvestRecord.objects.filter(pmcid=article.docid).exists():
                    if self.verbosity >= self.v_normal:
                        self.stdout.write('[%s] has already been harvested; skipping\n' \
                                          % (article.docid,))
                    continue
                    
                if article.identifiable_authors(derive=True):
                    try:
                        # don't save when simulated
                        if options['simulate']:
                            self.stdout.write('Not saving [%s] (simulated run)\n' % article.docid)
                        # really save when not simulated
                        else:
                            HarvestRecord.init_from_fetched_article(article)
                        stats['harvested'] += 1
                        if self.max_articles and stats['harvested'] >= self.max_articles:
                            done = True
                            break
                    except Exception as err:
                        self.stdout.write('Error creating record from article: %s\n' % err)
                        stats['errors'] += 1
                        
                else:
                    if self.verbosity >= self.v_normal:
                        self.stdout.write('[%s] has no identifiable authors; skipping\n' \
                                          % (article.docid,))
                    stats['noauthor'] += 1

                if options['progress']:
                    pbar.update(stats['articles'])
            if done:
                if self.verbosity > self.v_normal:
                    self.stdout.write('Harvested %s articles ... stopping \n' % stats['harvested'])
                break
        if options['progress']:
            pbar.finish()

        # summarize what was done
        if self.date_opts:
            self.stdout.write('Date Range: %(mindate)s - %(maxdate)s' % self.date_opts)
        self.stdout.write('\nArticles processed: %(articles)d\n' % stats)
        self.stdout.write('Articles harvested: %(harvested)d\n' % stats)
        self.stdout.write('Errors harvesting articles: %(errors)d\n' % stats)
        self.stdout.write('Articles skipped (no identifiable authors): %(noauthor)d\n' % stats)
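
Neither example shows where the option keys read by handle() (max_articles, min_date, max_date, auto_date, simulate, progress) are declared. Here is a sketch of matching argparse-style definitions; since this is python 2-era Django, the real command may declare them through an optparse option_list instead, so treat names and defaults as assumptions.

    def add_arguments(self, parser):
        # option names mirror the keys handle() reads; defaults are guesses
        parser.add_argument('--max-articles', dest='max_articles',
                            help='harvest at most this many articles')
        parser.add_argument('--min-date', dest='min_date',
                            help='harvest articles dated on or after YYYY/MM/DD')
        parser.add_argument('--max-date', dest='max_date',
                            help='harvest articles dated on or before YYYY/MM/DD')
        parser.add_argument('--auto-date', dest='auto_date', action='store_true',
                            help='determine the date range automatically')
        parser.add_argument('--simulate', action='store_true',
                            help='process articles without saving any records')
        parser.add_argument('--progress', action='store_true',
                            help='show a progress bar while harvesting')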