Esempio n. 1
0
 def get_file(self, article):
     """Return whatever ``files.get_file`` yields for this object and *article*.

     This is a thin delegation: ``self`` is handed to ``files.get_file`` as
     its first argument, followed by ``article``.
     NOTE(review): presumably ``self`` is a file record and the result is
     that file's content -- confirm against ``files.get_file``.

     :param article: The article passed through to ``files.get_file``.
     :return: The value returned by ``files.get_file``.
     """
     content = files.get_file(self, article)
     return content
Esempio n. 2
0
    def handle(self, *args, **options):
        """Re-download images for XML galleys that are missing supplements.

        For each published article of the selected journal, every XML galley
        for which ``has_missing_supplements`` returns a truthy value is
        processed: the article page at ``<url>/articles/<identifier>`` is
        fetched, the ``img`` tags inside its ``major-article-block`` div are
        paired by position with the galley's ``graphic`` elements, and each
        remote image is streamed into the local file referenced by the
        graphic's ``xlink:href``.

        :param args: Unused positional arguments.
        :param options: Command options. ``journal_code`` selects the journal
            and ``url`` is the base URL of the remote site.
        :return: None
        """
        journal_code = options.get('journal_code', None)
        base_url = options.get('url')

        try:
            journal = journal_models.Journal.objects.get(code=journal_code)
        except journal_models.Journal.DoesNotExist:
            print('No journal with that code found.')
            return

        articles = models.Article.objects.filter(stage=models.STAGE_PUBLISHED, journal=journal)

        for article in articles:
            for galley in article.galley_set.all():
                is_xml = (
                    galley.label == 'XML'
                    or galley.file.mime_type in ('application/xml', 'text/xml')
                )
                if not is_xml or not has_missing_supplements(galley):
                    continue

                url = '{url}/articles/{doi}'.format(url=base_url, doi=galley.article.identifier.identifier)
                r = requests.get(url)
                soup = BeautifulSoup(r.text, 'lxml')
                main_article = soup.find('div', {'class': 'major-article-block'})
                if main_article is None:
                    # find() returns None when the div is absent (e.g. an
                    # error page); skip instead of failing on .findAll().
                    print('No article block found at {url}.'.format(url=url))
                    continue
                images = main_article.findAll('img')

                galley_content = files.get_file(galley.file, article)
                galley_soup = BeautifulSoup(galley_content, 'lxml')
                galley_hrefs = [image.get('xlink:href') for image in galley_soup.findAll('graphic')]

                print(url)

                # Remote images and galley graphics are matched by position.
                # zip() stops at the shorter list, so surplus images on the
                # page no longer cause an IndexError.
                for img, href in zip(images, galley_hrefs):
                    img_url = '{url}/{img_src}'.format(url=url, img_src=img['src'])

                    # The second number in the href is used as the File id.
                    # NOTE(review): presumably the first number is the article
                    # id -- confirm against the galley href format.
                    href_ids = re.findall(r'(\d+)', href or '')
                    if len(href_ids) < 2:
                        print('Could not find a file id in href: {href}'.format(href=href))
                        continue
                    file_id = href_ids[1]

                    image_file = core_models.File.objects.get(id=file_id)
                    image_file.privacy = 'public'
                    image_file.save()
                    # NOTE(review): the existing file is unlinked before the
                    # download is known to have succeeded.
                    image_file.unlink_file(article.journal)

                    try:
                        image_r = requests.get(img_url, stream=True)
                        if image_r.status_code == 200:
                            with open(image_file.self_article_path(), 'wb+') as f:
                                # Stream the image response itself, not the
                                # article page response ``r``.
                                for chunk in image_r.iter_content(1024):
                                    f.write(chunk)
                    except FileNotFoundError:
                        print('File not found in files directory.')