Example #1
0
    def document(self, data):
        """Normalize a raw article record into a flat dict.

        Maps each target key of NORM_DOCUMENT_ATTRS to a preprocessed
        value read from an Article wrapper around *data*.  Attributes
        that are missing or malformed are silently skipped (deliberate
        best-effort normalization).

        Args:
            data: raw document payload accepted by ``Article()``.

        Returns:
            dict: normalized key -> preprocessed value.
        """
        norm_data = {}
        tmp_article = Article(data)

        for key, attr in NORM_DOCUMENT_ATTRS.items():
            try:
                if attr == 'publication_date':
                    # Dates need dedicated parsing (year extracted as int).
                    norm_data[key] = preprocess_date(
                        tmp_article.publication_date, return_int_year=True)
                elif attr == 'original_title':
                    # original_title is a method, not a plain attribute.
                    norm_data[key] = preprocess_default(
                        tmp_article.original_title())
                else:
                    norm_data[key] = preprocess_default(
                        getattr(tmp_article, attr))
            except (UnavailableMetadataException, AttributeError, TypeError):
                # Best-effort: skip attributes that are absent or malformed.
                # (Was three identical except-pass clauses; collapsed into
                # one tuple clause.)
                pass

        return norm_data
Example #2
0
def json2html(htmlout, config, urli=None, articles=None):
    """Write a section-grouped HTML listing of articles to *htmlout*.

    Article PIDs come either from an issue URL (*urli*) or from a list of
    article URLs (*articles*); when both are given, *articles* wins
    (original precedence preserved).  Each article's metadata is fetched
    from the articlemeta API and rendered through the Jinja template
    ``template/body.html``.

    Args:
        htmlout: path of the HTML file to create.
        config: mapping holding the ``articlemeta`` ``host`` URL.
        urli: issue URL used to resolve PIDs.
        articles: list of article URLs used to resolve PIDs.

    Raises:
        ValueError: if neither *urli* nor *articles* is given.
    """
    # get PID and Codes
    pid_code_list = None
    if urli:
        pid_code_list = getpidcode(urli)

    if articles:
        pid_code_list = [getpidcode(urla)[0] for urla in articles]

    if pid_code_list is None:
        # BUG FIX: pid_code_list was previously referenced while unbound
        # (NameError further down) when both inputs were missing.
        raise ValueError('either urli or articles must be provided')

    print('Total documents: %s\n' % (len(pid_code_list)))

    # Write the html file
    with open(htmlout, encoding='utf-8', mode='w') as f:
        # Start HTML output
        f.write(u'<html>\n<body>\n')

        # Request Issue (journal/issue code slice of the first PID)
        reqissue = requestissue(config, pid_code_list[0][2][1:18])
        xissue = reqissue[0]

        # JINJA
        jinja_env = Environment(loader=FileSystemLoader('template'))
        template = jinja_env.get_template('body.html')

        previous_sec = None
        for prefix, code, pid in pid_code_list:
            # Request Article metadata; retry until a parseable response.
            uart = config['articlemeta'][
                'host'] + "/api/v1/article/?code=%s" % pid
            xart = None
            while xart is None:
                try:
                    rart = requests.get(uart)
                    xart = Article(rart.json())
                except requests.exceptions.Timeout as e:
                    # BUG FIX: 'e' was unbound here, so the logging call
                    # itself raised NameError instead of reporting the
                    # timeout.
                    logger.info('error: %s' % e)
                    print("Timeout - Try again")
                    leave()
                except requests.exceptions.RequestException as e:
                    logger.info('error: %s' % e)
                    print(
                        "Request Error - Check your connection and try again")
                    leave()
                except json.decoder.JSONDecodeError as e:
                    logger.info('error: %s' % e)
                    print("Request Error - Try again")
                    leave()

            # Language priority to HTML
            lang_priority = ['en', 'pt', 'es']
            # Sets the language of the template; default to English so
            # 'lang' is always bound even when no priority language
            # matches (it was previously left unbound in that case).
            lang = lang_priority[0]
            for l in lang_priority:
                if l in xart.languages():
                    lang = l
                    break

            # First section only.
            # BUG FIX: reset 'section' on every iteration; it previously
            # leaked from the prior article (or was unbound on the first
            # one) when the issue sections had neither an 'en' nor a
            # 'lang' entry for this section code.
            section = None
            if xart.section_code:
                if 'en' in xissue.sections[xart.section_code].keys():
                    section = xissue.sections[xart.section_code]['en'].upper()
                elif lang in xissue.sections[xart.section_code].keys():
                    section = xissue.sections[xart.section_code][lang].upper()
            else:
                section = "*** ERROR SECTION ***"

            if section:
                # Emit a heading once per run of same-section articles.
                if previous_sec != section and section.upper(
                ) not in invalid_sec:
                    print(section)
                    tsec = Template(
                        "<p><strong>{{ section }}</strong></p>\n\n")
                    outsec = tsec.render(section=section)
                    f.write(outsec)
                    previous_sec = section
            else:
                logger.info('Section Error: %s' % pid)
                print('Section Error: %s' % pid)

            # Article metadata
            if section:
                if section.upper() not in invalid_sec:
                    # Title
                    title_html = None
                    title = None
                    # Scraping HTML title
                    try:
                        # prioritizes english language
                        link = ('%s/a/%s/?lang=en' % (prefix, code))
                        r = requests.get(link)
                        soup = BeautifulSoup(r.content, 'html.parser')
                        arttitle = soup.find("h1", {"class": "article-title"})
                        # Clear tags and attributes (side effects only, so
                        # plain loops instead of throwaway comprehensions).
                        for a in arttitle.find_all('a'):
                            a.decompose()
                        for su in arttitle.find_all('sup'):
                            su.decompose()
                        for st in arttitle.find_all('strong'):
                            st.decompose()
                        for sp in arttitle.find_all('span'):
                            sp.decompose()
                        arttitle.attrs.clear()
                        arttitle.name = 'strong'
                        title_html = arttitle
                    except requests.exceptions.Timeout as e:
                        # BUG FIX: 'e' was unbound here as well.
                        logger.info('error: %s' % e)
                        print("Timeout - Try again")
                        leave()

                    # HTML title or original_title
                    if title_html:
                        title = title_html
                    elif xart.original_language() == lang:
                        title = xart.original_title()
                    elif lang in xart.translated_titles().keys():
                        title = xart.translated_titles()[lang]
                    else:
                        title = xart.original_title()
                    # show PID title to user
                    # NOTE(review): '.text' assumes a BeautifulSoup tag;
                    # confirm the Article title fallbacks also expose
                    # '.text' rather than plain str.
                    print(pid, title.text.strip()[0:60])

                    # Authors
                    authors = []
                    if xart.authors:
                        authors = [
                            au['surname'] + ', ' + au['given_names']
                            for au in xart.authors
                        ]

                    # Link text in english
                    link_text = {
                        'en': ('text in English', 'English'),
                        'pt': ('text in Portuguese', 'Portuguese'),
                        'es': ('text in Spanish', 'Spanish')
                    }

                    # Full text links
                    ltxt = None
                    if xart.fulltexts() is not None:
                        ltxt = []
                        if 'html' in xart.fulltexts().keys():
                            for l in xart.languages():
                                if l in xart.fulltexts()['html']:
                                    utxt = '%s/a/%s/?lang=%s' % (prefix, code,
                                                                 l)
                                    ltxt.append((link_text[l][0],
                                                 link_text[l][1], utxt))

                    # PDF Links
                    lpdf = None
                    if xart.fulltexts() is not None:
                        lpdf = []
                        for l in xart.languages():  # and PDF in site ????
                            updf = '%s/a/%s/?format=pdf&lang=%s' % (prefix,
                                                                    code, l)
                            lpdf.append((link_text[l][1], updf))

                    # Render HTML
                    output = template.render(title=title,
                                             authors=authors,
                                             lpdf=lpdf,
                                             ltxt=ltxt)
                    f.write(output)

        # Terminate HTML output
        f.write(u'</body>\n</html>')
Example #3
0
def get_solr_args_from_article(document, indexed_date):
    """Build the Solr indexing arguments for one article document.

    Wraps *document* in an Article, normalizes its metadata (publication
    date, titles, authors, languages), ensures the related Magazine and
    Category rows exist in the database, and returns the flat dict of
    fields to send to Solr.

    Args:
        document: raw article payload accepted by ``Article()``.
        indexed_date: timestamp recorded as the indexing date.

    Returns:
        dict: Solr field name -> value.
    """
    article = Article(document)

    # (Removed a dead no-op: "if original_title is not None:
    #  original_title = original_title".)
    original_title = article.original_title()

    # publication_date may be yyyy-mm-dd, yyyy-mm or just yyyy; try the
    # formats from most to least specific, padding missing parts with 01.
    try:
        publication_date = datetime.strptime(
            article.publication_date, '%Y-%m-%d').isoformat()
    except ValueError:
        try:
            publication_date = datetime.strptime(
                "{0}-01".format(article.publication_date),
                '%Y-%m-%d').isoformat()
        except ValueError:
            publication_date = datetime.strptime(
                "{0}-01-01".format(article.publication_date),
                '%Y-%m-%d').isoformat()

    # Copy so the Solr args own an independent list.
    languages = list(article.languages())

    article_authors = article.authors
    authors = []
    if article_authors is not None:
        authors = [
            remove_control_chars(
                u"{0} {1}".format(author["given_names"], author["surname"]))
            for author in article_authors
        ]

    article_first_author = article.first_author
    if article_first_author is not None:
        first_author = remove_control_chars(u"{0} {1}".format(
            article_first_author["given_names"],
            article_first_author["surname"]))
    else:
        first_author = ""

    # Start - Insert categories and magazines
    magazine_name = remove_control_chars(u"{0}".format(article.journal.title))
    magazine_issn = article.journal.scielo_issn
    magazine_abbreviated_title = remove_control_chars(
        article.journal.abbreviated_title)
    magazine_domain = article.scielo_domain
    magazine_acronym = article.journal.acronym

    try:
        magazine = Magazine.objects.get(magazine_name=magazine_name)
    except Magazine.DoesNotExist:
        # objects.create() already persists the row; the extra save()
        # call that followed it was redundant.
        magazine = Magazine.objects.create(
            magazine_name=magazine_name,
            magazine_abbreviated_title=magazine_abbreviated_title,
            magazine_issn=magazine_issn,
            magazine_domain=magazine_domain,
            magazine_acronym=magazine_acronym)

    category_ids = []
    if article.journal.subject_areas is not None:
        for item_category in article.journal.subject_areas:
            category_name = remove_control_chars(
                u"{0}".format(item_category)).title()

            try:
                category = Category.objects.get(
                    category_name_en=category_name)
            except Category.DoesNotExist:
                # create() persists; redundant save() removed here too.
                category = Category.objects.create(
                    category_name_en=category_name)

            category_ids.append(category.id)

            # Link the category to the magazine only once.
            already_related = any(
                existing.category_name_en == category_name
                for existing in magazine.categories.all())
            if not already_related:
                magazine.categories.add(category)
                magazine.save()
    # End - Insert categories and magazines

    args = {
        "id": u"{0}{1}".format(article.publisher_id,
                               article.collection_acronym),
        "any_issn": article.journal.any_issn(),
        "journal_title": remove_control_chars(article.journal.title),  # Magazine
        "journal_id": magazine.id,

        "journal_volume": article.volume,
        "journal_number": article.issue,

        "original_title": remove_control_chars(original_title),
        "original_abstract": remove_control_chars(article.original_abstract()),
        "publication_date": "{0}Z".format(publication_date),
        "subject_areas": article.journal.subject_areas,  # Categories
        "subject_areas_ids": category_ids,  # Category ids

        "wos_subject_areas": article.journal.wos_subject_areas,

        "original_language": article.original_language(),
        "languages": languages,
        "document_type": article.document_type,
        "authors": authors,
        "first_author": first_author,
        "corporative_authors": article.corporative_authors,
        "publisher_id": article.publisher_id,
        "collection_acronym": article.collection_acronym,

        "indexed_date": indexed_date
    }

    # Adding cover if reindexing or updating.
    try:
        cover_article = CoverArticle.objects.get(article_id=args[u"id"])

        args[u"image_upload_path"] = cover_article.image
        args[u"image_upload_date"] = cover_article.upload_time
        args[u"image_uploader"] = cover_article.administrator.name
    except CoverArticle.DoesNotExist:
        pass

    # Per-language translated abstracts/titles/keywords are flattened
    # into language-suffixed Solr fields.
    article_translated_abstracts = article.translated_abstracts()
    if article_translated_abstracts is not None:
        for language in article_translated_abstracts:
            args[u"translated_abstracts_{0}".format(language)] = \
                remove_control_chars(article_translated_abstracts[language])

    article_translated_titles = article.translated_titles()
    if article_translated_titles is not None:
        for language in article_translated_titles:
            args[u"translated_titles_{0}".format(language)] = \
                remove_control_chars(article_translated_titles[language])

    article_keywords = article.keywords()
    if article_keywords is not None:
        for language in article_keywords:
            args[u"keywords_{0}".format(language)] = [
                remove_control_chars(keyword)
                for keyword in article_keywords[language]
            ]

    return args