def get_ojs_file(base_url, url, article, auth_file, label):
    filename, mime = shared.fetch_file(base_url, url, None, None, article,
                                       None, handle_images=False,
                                       auth_file=auth_file)
    extension = os.path.splitext(filename)[1]
    file = shared.add_file(mime, extension, label, article.owner, filename,
                           article, galley=False)

    return file
def create_article_with_review_content(article_dict, journal, auth_file,
                                        base_url):
    date_started = timezone.make_aware(
        dateparser.parse(article_dict.get('date_submitted')))

    # Create a base article
    article = models.Article(
        journal=journal,
        title=article_dict.get('title'),
        abstract=article_dict.get('abstract'),
        language=article_dict.get('language'),
        stage=models.STAGE_UNDER_REVIEW,
        is_import=True,
        date_submitted=date_started,
    )
    article.save()

    # Check for editors and assign them as section editors.
    editors = article_dict.get('editors', [])

    for editor in editors:
        try:
            account = core_models.Account.objects.get(email=editor)
            account.add_account_role('section-editor', journal)
            review_models.EditorAssignment.objects.create(
                article=article, editor=account, editor_type='section-editor')
            logger.info('Editor added to article')
        except Exception as e:
            logger.error('Editor account was not found.')
            logger.exception(e)

    # Add a new review round
    round = review_models.ReviewRound.objects.create(article=article,
                                                     round_number=1)

    # Add keywords
    keywords = article_dict.get('keywords')
    if keywords:
        for keyword in keywords.split(';'):
            word, created = models.Keyword.objects.get_or_create(word=keyword)
            article.keywords.add(word)

    # Add authors
    for author in article_dict.get('authors'):
        try:
            author_record = core_models.Account.objects.get(
                email=author.get('email'))
        except core_models.Account.DoesNotExist:
            author_record = core_models.Account.objects.create(
                email=author.get('email'),
                first_name=author.get('first_name'),
                last_name=author.get('last_name'),
                institution=author.get('affiliation'),
                biography=author.get('bio'),
            )

        # If we have a country, fetch its record
        if author.get('country'):
            try:
                country = core_models.Country.objects.get(
                    code=author.get('country'))
                author_record.country = country
                author_record.save()
            except core_models.Country.DoesNotExist:
                pass

        # Add authors to m2m and create an order record
        article.authors.add(author_record)
        models.ArticleAuthorOrder.objects.create(
            article=article,
            author=author_record,
            order=article.next_author_sort())

    # Set the primary author
    article.owner = core_models.Account.objects.get(
        email=article_dict.get('correspondence_author'))
    article.correspondence_author = article.owner

    # Get or create the article's section
    try:
        section = models.Section.objects.language().fallbacks('en').get(
            journal=journal, name=article_dict.get('section'))
    except models.Section.DoesNotExist:
        section = None

    article.section = section
    article.save()

    # Attempt to get the default review form
    form = setting_handler.get_setting('general',
                                       'default_review_form',
                                       journal,
                                       create=True).processed_value

    if not form:
        try:
            form = review_models.ReviewForm.objects.filter(journal=journal)[0]
        except Exception:
            form = None
            logger.error(
                'You must have at least one review form for the journal before'
                ' importing.')
            exit()

    for review in article_dict.get('reviews'):
        try:
            reviewer = core_models.Account.objects.get(
                email=review.get('email'))
        except core_models.Account.DoesNotExist:
            reviewer = core_models.Account.objects.create(
                email=review.get('email'),
                first_name=review.get('first_name'),
                last_name=review.get('last_name'),
            )

        # Parse the dates
        date_requested = timezone.make_aware(
            dateparser.parse(review.get('date_requested')))
        date_due = timezone.make_aware(
            dateparser.parse(review.get('date_due')))
        date_complete = timezone.make_aware(
            dateparser.parse(review.get('date_complete'))) if review.get(
                'date_complete') else None
        date_confirmed = timezone.make_aware(
            dateparser.parse(review.get('date_confirmed'))) if review.get(
                'date_confirmed') else None

        # If the review was declined, set up a declined date stamp
        if review.get('declined') == '1':
            date_declined = date_confirmed
            date_accepted = None
            date_complete = date_confirmed
        else:
            date_accepted = date_confirmed
            date_declined = None

        new_review = review_models.ReviewAssignment.objects.create(
            article=article,
            reviewer=reviewer,
            review_round=round,
            review_type='traditional',
            visibility='double-blind',
            date_due=date_due,
            date_requested=date_requested,
            date_complete=date_complete,
            date_accepted=date_accepted,
            access_code=uuid.uuid4(),
            form=form)

        if review.get('declined') or review.get('recommendation'):
            new_review.is_complete = True

        if review.get('recommendation'):
            new_review.decision = map_review_recommendation(
                review.get('recommendation'))

        if review.get('review_file_url'):
            filename, mime = shared.fetch_file(base_url,
                                               review.get('review_file_url'),
                                               None, None, article, None,
                                               handle_images=False,
                                               auth_file=auth_file)
            extension = os.path.splitext(filename)[1]
            review_file = shared.add_file(mime, extension, 'Reviewer file',
                                          reviewer, filename, article,
                                          galley=False)
            new_review.review_file = review_file

        if review.get('comments'):
            filepath = core_files.create_temp_file(review.get('comments'),
                                                   'comment.txt')
            with open(filepath, 'r') as file:
                comment_file = core_files.save_file_to_article(
                    file,
                    article,
                    article.owner,
                    label='Review Comments',
                    save=False)

            new_review.review_file = comment_file

        new_review.save()

    # Get MS File
    ms_file = get_ojs_file(base_url, article_dict.get('manuscript_file_url'),
                           article, auth_file, 'MS File')
    article.manuscript_files.add(ms_file)

    # Get RV File
    rv_file = get_ojs_file(base_url, article_dict.get('review_file_url'),
                           article, auth_file, 'RV File')
    round.review_files.add(rv_file)

    # Get Supp Files
    if article_dict.get('supp_files'):
        for file in article_dict.get('supp_files'):
            file = get_ojs_file(base_url, file.get('url'), article, auth_file,
                                file.get('title'))
            article.data_figure_files.add(file)

    article.save()
    round.save()

    return article
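
# Illustrative only: a minimal sketch of the payload shape that
# create_article_with_review_content() expects, inferred from the keys the
# function reads above. The values are placeholders, not real data.
#
# example_article_dict = {
#     'title': 'Example title',
#     'abstract': '<p>Example abstract</p>',
#     'language': 'en',
#     'date_submitted': '2017-01-01',
#     'section': 'Articles',
#     'keywords': 'keyword one;keyword two',
#     'editors': ['editor@example.com'],
#     'correspondence_author': 'author@example.com',
#     'authors': [
#         {'email': 'author@example.com', 'first_name': 'Ann',
#          'last_name': 'Author', 'affiliation': 'Example University',
#          'bio': '', 'country': 'GB'},
#     ],
#     'reviews': [
#         {'email': 'reviewer@example.com', 'first_name': 'Rob',
#          'last_name': 'Reviewer', 'date_requested': '2017-01-02',
#          'date_due': '2017-02-01', 'date_complete': None,
#          'date_confirmed': None, 'declined': '0', 'recommendation': None,
#          'review_file_url': None, 'comments': None},
#     ],
#     'manuscript_file_url': 'https://journal.example.com/ms.docx',
#     'review_file_url': 'https://journal.example.com/rv.docx',
#     'supp_files': [],
# }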
def import_article(journal, user, url, thumb_path=None):
    """ Import a Ubiquity Press article.

    :param journal: the journal to import to
    :param user: the user who will own the file
    :param url: the URL of the article to import
    :param thumb_path: the base path for thumbnails
    :return: None
    """

    # retrieve the remote page and establish if it has a DOI
    already_exists, doi, domain, soup_object = shared.fetch_page_and_check_if_exists(
        url)
    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
    if already_exists:
        # if here then this article has already been imported
        return

    # fetch basic metadata
    new_article = shared.get_and_set_metadata(journal, soup_object, user,
                                              False, True)

    # try to do a license lookup
    pattern = re.compile(r'creativecommons')
    license_tag = soup_object.find(href=pattern)
    license_object = models.Licence.objects.filter(
        url=license_tag['href'].replace('http:', 'https:'), journal=journal)

    if len(license_object) > 0 and license_object[0] is not None:
        license_object = license_object[0]
        logger.info("Found a license for this article: {0}".format(
            license_object.short_name))
    else:
        license_object = models.Licence.objects.get(
            name='All rights reserved', journal=journal)
        logger.warning(
            "Did not find a license for this article. Using: {0}".format(
                license_object.short_name))

    new_article.license = license_object

    # determine if the article is peer reviewed
    peer_reviewed = soup_object.find(name='a',
                                     text='Peer Reviewed') is not None
    logger.debug("Peer reviewed: {0}".format(peer_reviewed))

    new_article.peer_reviewed = peer_reviewed

    # get PDF and XML galleys
    pdf = shared.get_pdf_url(soup_object)

    # rip XML out if found
    pattern = re.compile('.*?XML.*')
    xml = soup_object.find('a', text=pattern)
    html = None

    if xml:
        logger.info("Ripping XML")
        xml = xml.get('href', None).strip()
    else:
        # looks like there isn't any XML
        # instead we'll pull out any div with an id of "xml-article"
        # and add it as an HTML galley
        logger.info("Ripping HTML")
        html = soup_object.find('div', attrs={'id': 'xml-article'})
        if html:
            html = str(html.contents[0])

    # attach the galleys to the new article
    galleys = {'PDF': pdf, 'XML': xml, 'HTML': html}

    shared.set_article_galleys_and_identifiers(doi, domain, galleys,
                                               new_article, url, user)

    # fetch thumbnails
    if thumb_path is not None:
        logger.info("Attempting to assign thumbnail.")

        final_path_element = url.split('/')[-1]
        id_regex = re.compile(r'.*?(\d+)')
        matches = id_regex.match(final_path_element)
        article_id = matches.group(1)

        logger.info("Determined remote article ID as: {0}".format(article_id))
        logger.info("Thumbnail path: {thumb_path}, URL: {url}".format(
            thumb_path=thumb_path, url=url))

        try:
            filename, mime = shared.fetch_file(domain,
                                               thumb_path + "/" + article_id,
                                               "", 'graphic', new_article,
                                               user)
            shared.add_file(mime, 'graphic', 'Thumbnail', user, filename,
                            new_article, thumbnail=True)
        except Exception as e:
            logger.warning("Unable to import thumbnail: %s" % e)

    # lookup stats
    stats = soup_object.findAll('div', {'class': 'stat-number'})

    # save the article to the database
    new_article.save()

    try:
        if stats:
            from metrics import models as metrics_models
            views = stats[0].contents[0]
            if len(stats) > 1:
                downloads = stats[1].contents[0]
            else:
                downloads = 0

            metrics_models.HistoricArticleAccess.objects.create(
                article=new_article, views=views, downloads=downloads)
    except (IndexError, AttributeError):
        logger.info("No article metrics found")
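
# A minimal usage sketch, not part of the importer itself: how import_article()
# might be called for a single remote article. The journal code, user email and
# article URL below are illustrative assumptions, not values from this module.
#
# journal = journal_models.Journal.objects.get(code='example')
# user = core_models.Account.objects.get(email='importer@example.com')
# import_article(
#     journal,
#     user,
#     'https://www.example-journal.org/articles/10.100/example.1/',
#     thumb_path='/articles/thumbs',
# )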
def import_issue_images(journal, user, url, import_missing=False):
    """ Imports all issue images and other issue-related content.

    Currently also reorders all issues, articles and sections within issues,
    article thumbnails and issue titles.

    :param journal: a journal.models.Journal
    :param user: the owner of the imported content as a core.models.Account
    :param url: the base url of the journal to import from
    :param import_missing: Bool. If True, attempt to import missing articles
    """
    base_url = url

    if not url.endswith('/issue/archive/'):
        url += '/issue/archive/'

    requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

    resp, mime = utils_models.ImportCacheEntry.fetch(url=url)

    soup = BeautifulSoup(resp, 'lxml')

    from django.conf import settings
    import os
    from django.core.files import File

    for issue in journal.issues():
        issue_num = issue.issue
        pattern = re.compile(r'\/\d+\/volume\/{0}\/issue\/{1}'.format(
            issue.volume, issue_num))

        img_url_suffix = soup.find(src=pattern)

        if img_url_suffix:
            img_url = base_url + img_url_suffix.get('src')
            logger.info("Fetching {0}".format(img_url))

            resp, mime = utils_models.ImportCacheEntry.fetch(url=img_url)

            path = os.path.join(settings.BASE_DIR, 'files', 'journals',
                                str(journal.id))

            os.makedirs(path, exist_ok=True)

            path = os.path.join(
                path,
                'volume{0}_issue_{1}.graphic'.format(issue.volume, issue_num))

            with open(path, 'wb') as f:
                f.write(resp)

            with open(path, 'rb') as f:
                issue.cover_image.save(path, File(f))

            sequence_pattern = re.compile(
                r'.*?(\d+)\/volume\/{0}\/issue\/{1}.*'.format(
                    issue.volume, issue_num))

            issue.order = int(sequence_pattern.match(img_url).group(1))

            logger.info(
                "Setting Volume {0}, Issue {1} sequence to: {2}".format(
                    issue.volume, issue_num, issue.order))

            logger.info("Extracting section orders within the issue...")

            new_url = '/{0}/volume/{1}/issue/{2}/'.format(
                issue.order, issue.volume, issue_num)
            resp, mime = utils_models.ImportCacheEntry.fetch(
                url=base_url + new_url)

            soup_issue = BeautifulSoup(resp, 'lxml')

            # Find issue title
            try:
                issue_title = soup_issue.find("div", {
                    "class": "multi-inline"
                }).find("h1").string
                issue_title = issue_title.strip(" -\n")
                if issue.issue_title and issue_title not in issue.issue_title:
                    issue.issue_title = "{} - {}".format(
                        issue_title, issue.issue_title)
                else:
                    issue.issue_title = issue_title
            except AttributeError as e:
                logger.debug("Couldn't find an issue title: %s" % e)

            # Find issue description
            try:
                desc_parts = soup_issue.find("div", {
                    "class": "article-type-list-block"
                }).findAll("p", {"class": "p1"})
                issue.issue_description = "\n".join(
                    str(p) for p in desc_parts)
            except AttributeError as e:
                logger.debug("Couldn't extract an issue description %s" % e)

            sections_to_order = soup_issue.find_all(
                name='h2', attrs={'class': 'main-color-text'})

            # delete existing order models for sections for this issue
            journal_models.SectionOrdering.objects.filter(
                issue=issue).delete()

            for section_order, section in enumerate(sections_to_order):
                logger.info('[{0}] {1}'.format(section_order,
                                               section.getText()))
                order_section, c = models.Section.objects.language(
                    'en').get_or_create(name=section.getText().strip(),
                                        journal=journal)
                journal_models.SectionOrdering.objects.create(
                    issue=issue,
                    section=order_section,
                    order=section_order).save()

            logger.info("Extracting article orders within the issue...")

            # delete existing order models for issue
            journal_models.ArticleOrdering.objects.filter(
                issue=issue).delete()

            pattern = re.compile(r'\/articles\/(.+?)/(.+?)/')
            articles = soup_issue.find_all(href=pattern)

            article_order = 0

            processed = []
            for article_link in articles:
                # parse the URL into a DOI and prefix
                article_url = article_link["href"]
                match = pattern.match(article_url)
                prefix = match.group(1)
                doi = match.group(2)

                # get a proper article object
                article = models.Article.get_article(
                    journal, 'doi', '{0}/{1}'.format(prefix, doi))

                if not article and import_missing:
                    logger.debug("Article %s not found, importing...",
                                 article_url)
                    import_article(journal, user, base_url + article_url)

                if article and article not in processed:
                    thumb_img = article_link.find("img")
                    if thumb_img:
                        thumb_path = thumb_img["src"]
                        filename, mime = shared.fetch_file(
                            base_url, thumb_path, "", 'graphic', article,
                            user,
                        )
                        shared.add_file(
                            mime, 'graphic', 'Thumbnail', user, filename,
                            article, thumbnail=True,
                        )

                    journal_models.ArticleOrdering.objects.get_or_create(
                        issue=issue,
                        article=article,
                        section=article.section,
                        order=article_order,
                    )

                    article_order += 1
                    processed.append(article)

            issue.save()
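
# A minimal usage sketch (assumptions marked): import_issue_images() matches
# remote issue pages against journal.issues(), so it is normally run after the
# journal's issues and articles already exist locally. The base URL below is
# an illustrative assumption.
#
# import_issue_images(journal, user, 'https://www.example-journal.org',
#                     import_missing=True)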