def document(self, data):
    """Normalize selected metadata attributes of a raw article record.

    Builds a dict keyed by the normalized field names declared in
    ``NORM_DOCUMENT_ATTRS`` (mapping: normalized key -> Article attribute
    name).  Dates go through ``preprocess_date`` (as an int year), the
    original title through ``preprocess_default`` via its accessor method,
    and every other attribute through ``preprocess_default`` on the raw
    attribute value.

    :param data: raw article document accepted by ``Article``.
    :returns: dict of normalized key -> preprocessed value; attributes that
        are unavailable or malformed are silently omitted (best-effort).
    """
    norm_data = {}
    tmp_article = Article(data)
    for norm_key, attr_name in NORM_DOCUMENT_ATTRS.items():
        try:
            if attr_name == 'publication_date':
                norm_data[norm_key] = preprocess_date(
                    tmp_article.publication_date, return_int_year=True)
            elif attr_name == 'original_title':
                # original_title is a method on Article, not a plain attribute.
                norm_data[norm_key] = preprocess_default(
                    tmp_article.original_title())
            else:
                norm_data[norm_key] = preprocess_default(
                    getattr(tmp_article, attr_name))
        except (UnavailableMetadataException, AttributeError, TypeError):
            # Deliberate best-effort: skip any attribute that is missing,
            # unavailable, or of an unexpected type.  (Was three identical
            # `except X: pass` clauses; collapsed into one tuple clause.)
            pass
    return norm_data
def json2html(htmlout, config, urli=None, articles=None):
    """Write an HTML listing (section, title, authors, text/PDF links) of articles.

    :param htmlout: path of the HTML file to create (written as UTF-8).
    :param config: dict; must provide ``config['articlemeta']['host']``.
    :param urli: issue URL, resolved to (prefix, code, pid) tuples via
        ``getpidcode``.
    :param articles: iterable of article URLs; when given it takes
        precedence over ``urli`` (same precedence as the original code).
    """
    # get PID and Codes
    if urli:
        pid_code_list = getpidcode(urli)
    if articles:
        pid_code_list = [getpidcode(urla)[0] for urla in articles]
    print('Total documents: %s\n' % (len(pid_code_list)))

    # Loop-invariant tables hoisted out of the per-article loop.
    # Language priority to HTML
    lang_priority = ['en', 'pt', 'es']
    # Link text in english
    link_text = {
        'en': ('text in English', 'English'),
        'pt': ('text in Portuguese', 'Portuguese'),
        'es': ('text in Spanish', 'Spanish')
    }

    # Write the html file
    with open(htmlout, encoding='utf-8', mode='w') as f:
        # Start HTML output
        f.write(u'<html>\n<body>\n')

        # Request Issue
        reqissue = requestissue(config, pid_code_list[0][2][1:18])
        xissue = reqissue[0]
        seccode_list = reqissue[1]  # NOTE(review): unused below; kept for parity

        # JINJA
        jinja_env = Environment(loader=FileSystemLoader('template'))
        template = jinja_env.get_template('body.html')

        previous_sec = None
        # FIX: `section` and `lang` were unbound until first assigned, which
        # raised NameError on the first iteration when an article had no
        # section_code or no language in lang_priority.
        section = None
        lang = None
        for prefix, code, pid in pid_code_list:
            # Request Article (retry loop until a JSON document is parsed)
            uart = config['articlemeta'][
                'host'] + "/api/v1/article/?code=%s" % pid
            xart = None
            while xart is None:
                try:
                    rart = requests.get(uart)
                    xart = Article(rart.json())
                except requests.exceptions.Timeout as e:
                    # FIX: `as e` was missing, so the log call itself raised
                    # NameError whenever a timeout occurred.
                    logger.info('error: %s' % e)
                    print("Timeout - Try again")
                    leave()
                except requests.exceptions.RequestException as e:
                    logger.info('error: %s' % e)
                    print(
                        "Request Error - Check your connection and try again")
                    leave()
                except json.decoder.JSONDecodeError as e:
                    logger.info('error: %s' % e)
                    print("Request Error - Try again")
                    leave()

            # Sets the language of the template
            for l in lang_priority:
                if l in xart.languages():
                    lang = l
                    break

            # First section only
            if xart.section_code:
                if 'en' in xissue.sections[xart.section_code].keys():
                    section = xissue.sections[xart.section_code]['en'].upper()
                elif lang in xissue.sections[xart.section_code].keys():
                    section = xissue.sections[xart.section_code][lang].upper()
                else:
                    section = "*** ERROR SECTION ***"
                if section:
                    if previous_sec != section and section.upper(
                    ) not in invalid_sec:
                        print(section)
                        tsec = Template(
                            "<p><strong>{{ section }}</strong></p>\n\n")
                        outsec = tsec.render(section=section)
                        f.write(outsec)
                        previous_sec = section
            else:
                logger.info('Section Error: %s' % pid)
                print('Section Error: %s' % pid)

            # Article metadata
            if section:
                if section.upper() not in invalid_sec:
                    # Title
                    title_html = None
                    title = None
                    # Scraping HTML title
                    try:
                        # prioritizes english language
                        link = ('%s/a/%s/?lang=en' % (prefix, code))
                        r = requests.get(link)
                        soup = BeautifulSoup(r.content, 'html.parser')
                        arttitle = soup.find(
                            "h1", {"class": "article-title"})
                        # Clear tags and attributes (plain loops instead of
                        # side-effect list comprehensions)
                        for tag_name in ('a', 'sup', 'strong', 'span'):
                            for tag in arttitle.find_all(tag_name):
                                tag.decompose()
                        arttitle.attrs.clear()
                        arttitle.name = 'strong'
                        title_html = arttitle
                    except requests.exceptions.Timeout as e:
                        # FIX: same missing `as e` as above.
                        logger.info('error: %s' % e)
                        print("Timeout - Try again")
                        leave()

                    # HTML title or original_title
                    if title_html:
                        title = title_html
                    elif xart.original_language() == lang:
                        title = xart.original_title()
                    elif lang in xart.translated_titles().keys():
                        title = xart.translated_titles()[lang]
                    else:
                        title = xart.original_title()

                    # show PID title to user
                    # FIX: `title` is a plain string when it came from the
                    # Article accessors; only the scraped tag has `.text`.
                    title_str = title.text if hasattr(title, 'text') else title
                    print(pid, title_str.strip()[0:60])

                    # Authors
                    authors = []
                    if xart.authors:
                        authors = [
                            au['surname'] + ', ' + au['given_names']
                            for au in xart.authors
                        ]

                    # Full text links
                    ltxt = None
                    if xart.fulltexts() is not None:
                        ltxt = []
                        if 'html' in xart.fulltexts().keys():
                            for l in xart.languages():
                                if l in xart.fulltexts()['html']:
                                    utxt = '%s/a/%s/?lang=%s' % (
                                        prefix, code, l)
                                    ltxt.append(
                                        (link_text[l][0],
                                         link_text[l][1], utxt))

                    # PDF Links
                    lpdf = None
                    if xart.fulltexts() is not None:
                        lpdf = []
                        for l in xart.languages():
                            # and PDF in site ????
                            updf = '%s/a/%s/?format=pdf&lang=%s' % (
                                prefix, code, l)
                            lpdf.append((link_text[l][1], updf))

                    # Render HTML
                    output = template.render(title=title,
                                             authors=authors,
                                             lpdf=lpdf,
                                             ltxt=ltxt)
                    f.write(output)

        # Terminate HTML output
        f.write(u'</body>\n</html>')
def get_solr_args_from_article(document, indexed_date):
    """Build the Solr field dict for one article document.

    Also upserts the article's Magazine and Category rows as a side effect
    (Django ORM), and attaches cover-image fields when a ``CoverArticle``
    already exists for this id (reindex/update case).

    :param document: raw article record accepted by ``Article``.
    :param indexed_date: timestamp stored in the ``indexed_date`` field.
    :returns: dict of Solr field name -> value.
    """
    article = Article(document)
    original_title = article.original_title()

    # publication_date may be 'yyyy-mm-dd', 'yyyy-mm' or just 'yyyy';
    # pad the missing parts so strptime always parses a full date.
    try:
        publication_date = datetime.strptime(
            article.publication_date, '%Y-%m-%d').isoformat()
    except ValueError:
        try:
            publication_date = datetime.strptime(
                "{0}-01".format(article.publication_date),
                '%Y-%m-%d').isoformat()
        except ValueError:
            publication_date = datetime.strptime(
                "{0}-01-01".format(article.publication_date),
                '%Y-%m-%d').isoformat()

    # FIX: replaced a manual copy loop with list(); removed the dead
    # no-op `if original_title is not None: original_title = original_title`.
    languages = list(article.languages())

    article_authors = article.authors
    authors = []
    if article_authors is not None:
        authors = [
            remove_control_chars(u"{0} {1}".format(author["given_names"],
                                                   author["surname"]))
            for author in article_authors
        ]

    article_first_author = article.first_author
    if article_first_author is not None:
        first_author = remove_control_chars(
            u"{0} {1}".format(article_first_author["given_names"],
                              article_first_author["surname"]))
    else:
        first_author = ""

    # Start - Insert categories and magazines
    magazine_name = remove_control_chars(u"{0}".format(article.journal.title))
    magazine_issn = article.journal.scielo_issn
    magazine_abbreviated_title = remove_control_chars(
        article.journal.abbreviated_title)
    magazine_domain = article.scielo_domain
    magazine_acronym = article.journal.acronym
    try:
        magazine = Magazine.objects.get(magazine_name=magazine_name)
    except Magazine.DoesNotExist:
        magazine = Magazine.objects.create(
            magazine_name=magazine_name,
            magazine_abbreviated_title=magazine_abbreviated_title,
            magazine_issn=magazine_issn,
            magazine_domain=magazine_domain,
            magazine_acronym=magazine_acronym)
        magazine.save()

    category_ids = []
    if article.journal.subject_areas is not None:
        for item_category in article.journal.subject_areas:
            category_name = remove_control_chars(
                u"{0}".format(item_category)).title()
            try:
                category = Category.objects.get(
                    category_name_en=category_name)
            except Category.DoesNotExist:
                category = Category.objects.create(
                    category_name_en=category_name)
                category.save()
            category_ids.append(category.id)
            # Link the category to the magazine only if not already linked.
            category_publication_relationship = False
            for category_loop in magazine.categories.all():
                if category_loop.category_name_en == category_name:
                    category_publication_relationship = True
                    break
            if not category_publication_relationship:
                magazine.categories.add(category)
                magazine.save()
    # End - Insert categories and magazines

    args = {
        "id": u"{0}{1}".format(article.publisher_id,
                               article.collection_acronym),
        "any_issn": article.journal.any_issn(),
        "journal_title": remove_control_chars(article.journal.title),
        "journal_id": magazine.id,  # Magazine
        "journal_volume": article.volume,
        "journal_number": article.issue,
        "original_title": remove_control_chars(original_title),
        "original_abstract": remove_control_chars(
            article.original_abstract()),
        "publication_date": "{0}Z".format(publication_date),
        "subject_areas": article.journal.subject_areas,  # Categories
        "subject_areas_ids": category_ids,  # Category ids
        "wos_subject_areas": article.journal.wos_subject_areas,
        "original_language": article.original_language(),
        "languages": languages,
        "document_type": article.document_type,
        "authors": authors,
        "first_author": first_author,
        "corporative_authors": article.corporative_authors,
        "publisher_id": article.publisher_id,
        "collection_acronym": article.collection_acronym,
        "indexed_date": indexed_date
    }

    # Adding cover if reindexing or updating.
    try:
        cover_article = CoverArticle.objects.get(article_id=args[u"id"])
        args[u"image_upload_path"] = cover_article.image
        args[u"image_upload_date"] = cover_article.upload_time
        args[u"image_uploader"] = cover_article.administrator.name
    except CoverArticle.DoesNotExist:
        # No existing cover for this article id — nothing to attach.
        pass

    article_translated_abstracts = article.translated_abstracts()
    if article_translated_abstracts is not None:
        for language in article_translated_abstracts:
            args[u"translated_abstracts_{0}".format(language)] = \
                remove_control_chars(article_translated_abstracts[language])

    article_translated_titles = article.translated_titles()
    if article_translated_titles is not None:
        for language in article_translated_titles:
            args[u"translated_titles_{0}".format(language)] = \
                remove_control_chars(article_translated_titles[language])

    article_keywords = article.keywords()
    if article_keywords is not None:
        for language in article_keywords:
            args[u"keywords_{0}".format(language)] = [
                remove_control_chars(keyword)
                for keyword in article_keywords[language]
            ]

    return args