Example #1
def convert_journal_url_to_pdf_url(journal_url):
    pdf_url = None
    # Dispatch to the host-specific converter for the first known journal
    # host found in the URL.
    for host in journal_hosts:
        if host in journal_url:
            pdf_url = journal_url_to_pdf[host](journal_url)
            break
    # Only report success if the converted URL actually serves a PDF.
    if pdf_url is not None and check_url_contains_pdf(pdf_url):
        return pdf_url, True
    return journal_url, False
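The `journal_hosts` list and the `journal_url_to_pdf` dispatch dict are defined elsewhere in the project; a minimal sketch of what they might look like (the host and converter below are illustrative assumptions, not the project's actual values):

def _arxiv_to_pdf(journal_url):
    # Illustrative converter: https://arxiv.org/abs/1706.03762 -> https://arxiv.org/pdf/1706.03762
    return journal_url.replace("/abs/", "/pdf/")

journal_hosts = ["arxiv.org"]
journal_url_to_pdf = {"arxiv.org": _arxiv_to_pdf}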
Example #2
    def journal_url_to_pdf_url(cls, journal_url):
        # Extract the article uid by splitting on the journal-specific token.
        parts = journal_url.split(cls.journal_url_split_on)
        try:
            uid = parts[1]
            uid = cls.remove_query(uid)
            # Build the PDF URL from the class-level base and suffix.
            pdf_url = f'{cls.pdf_url_base}{uid}{cls.pdf_url_suffix}'

            if check_url_contains_pdf(pdf_url):
                return pdf_url
            return None
        except Exception as e:
            sentry.log_error(e, message=journal_url)
            return None
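This classmethod relies on per-journal class attributes (`journal_url_split_on`, `pdf_url_base`, `pdf_url_suffix`) and a `remove_query` helper defined on the host class. A minimal sketch of such a class, with made-up attribute values and without the `sentry` dependency; the method above would be attached to it as a `@classmethod`:

class BioRxiv:
    # Illustrative values only; the real class defines its own attributes.
    journal_url_split_on = "/content/"
    pdf_url_base = "https://www.biorxiv.org/content/"
    pdf_url_suffix = ".full.pdf"

    @classmethod
    def remove_query(cls, uid):
        # Strip any ?query=... portion from the extracted uid.
        return uid.split("?")[0]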
Example #3
def populate_pdf_url_from_journal_url(url, metadata):
    """
    Returns a tuple of (metadata, filled): the metadata dict with `pdf_url`
    and `file` set if a pdf is found, and whether this filled the metadata.
    """
    url, converted = convert_journal_url_to_pdf_url(url)
    if converted and check_url_contains_pdf(url):
        if metadata.get("file", None) is None:
            metadata["file"] = url
        if metadata.get("pdf_url", None) is None:
            metadata["pdf_url"] = url
    # Note: as written, the second element is always returned as False.
    return metadata, False
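A hedged usage sketch: starting from an empty metadata dict, the helper fills `file` and `pdf_url` only when the journal URL can be converted to a working PDF URL (the URL below is hypothetical):

metadata, _ = populate_pdf_url_from_journal_url(
    "https://www.example-journal.org/article/123", {}
)
print(metadata.get("pdf_url"))  # the derived pdf url, or None if no pdf was found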
Example #4
    def _add_url(self, file, validated_data):
        if check_file_is_url(file):
            validated_data["file"] = None
            contains_pdf = check_url_contains_pdf(file)
            is_journal_pdf = check_url_is_pdf(file)

            if contains_pdf:
                validated_data["url"] = file
                validated_data["pdf_url"] = file

            # check_url_is_pdf is tri-state: True for a journal pdf url,
            # False for a journal landing page, anything else is unrecognized.
            if is_journal_pdf is True:
                pdf_url = file
                journal_url, converted = convert_pdf_url_to_journal_url(file)
            elif is_journal_pdf is False:
                journal_url = file
                pdf_url, converted = convert_journal_url_to_pdf_url(file)
            else:
                validated_data["url"] = file
                return

            if converted:
                validated_data["url"] = journal_url
                validated_data["pdf_url"] = pdf_url
        return
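The branching above assumes `check_url_is_pdf` returns `True` for a recognized journal PDF URL, `False` for a recognized journal landing page, and something else (e.g. `None`) for an unrecognized URL. A hypothetical sketch of such a helper, purely to illustrate the tri-state contract; the real implementation lives elsewhere in the project:

def check_url_is_pdf(url):
    # Hypothetical: answer True/False only for hosts we know how to convert,
    # and None for everything else so _add_url falls through to the plain-url branch.
    known_hosts = ("arxiv.org",)  # illustrative
    if not any(host in url for host in known_hosts):
        return None
    return "/pdf/" in url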
Example #5
    def search_by_url(self, request):
        # TODO: Ensure we are saving data from here, license, title,
        # publish date, authors, pdf
        # handle pdf url, journal url, or pdf upload
        # TODO: Refactor
        """
        Retrieve bibliographic metadata and potential paper matches
        from the database for `url` (specified via request post data).
        """
        url = request.data.get("url", "").strip()
        data = {"url": url}

        if not url:
            return Response(
                "search_by_url requests must specify 'url'",
                status=status.HTTP_400_BAD_REQUEST,
            )
        try:
            URLValidator()(url)
        except (ValidationError, Exception) as e:
            print(e)
            return Response(
                f"Double check that URL is valid: {url}",
                status=status.HTTP_400_BAD_REQUEST,
            )

        url_is_pdf = check_url_contains_pdf(url)
        data["url_is_pdf"] = url_is_pdf

        duplicate_papers = Paper.objects.filter(
            Q(url__icontains=url) | Q(pdf_url__icontains=url))
        if duplicate_papers.exists():
            duplicate_paper = duplicate_papers.first()
            serializer_data = self.serializer_class(
                duplicate_paper,
                context={
                    "purchase_minimal_serialization": True
                }).data
            data = {"key": "url", "results": serializer_data}
            return Response(data, status=status.HTTP_403_FORBIDDEN)

        try:
            csl_item = get_csl_item(url)
        except Exception as error:
            data["warning"] = f"Generating csl_item failed with:\n{error}"
            log_error(error)
            csl_item = None

        if csl_item:
            # Cleaning csl data
            cleaned_title = csl_item.get("title", "").strip()
            csl_item["title"] = cleaned_title
            abstract = csl_item.get("abstract", "")
            cleaned_abstract = clean_abstract(abstract)
            csl_item["abstract"] = cleaned_abstract

            url_is_unsupported_pdf = url_is_pdf and csl_item.get("URL") == url
            data["url_is_unsupported_pdf"] = url_is_unsupported_pdf
            csl_item.url_is_unsupported_pdf = url_is_unsupported_pdf
            data["csl_item"] = csl_item
            data["oa_pdf_location"] = get_pdf_location_for_csl_item(csl_item)
            doi = csl_item.get("DOI", None)

            duplicate_papers = Paper.objects.exclude(doi=None).filter(doi=doi)
            if duplicate_papers.exists():
                duplicate_paper = duplicate_papers.first()
                serializer_data = self.serializer_class(
                    duplicate_paper,
                    context={
                        "purchase_minimal_serialization": True
                    }).data
                data = {"key": "doi", "results": serializer_data}
                return Response(data, status=status.HTTP_403_FORBIDDEN)

            data["paper_publish_date"] = csl_item.get_date("issued", fill=True)

        if csl_item and request.data.get("search", False):
            # search existing papers
            search = self.search_by_csl_item(csl_item)
            try:
                search = search.execute()
            except ConnectionError:
                return Response(
                    "Search failed due to an elasticsearch ConnectionError.",
                    status=status.HTTP_500_INTERNAL_SERVER_ERROR,
                )
            data["search"] = [hit.to_dict() for hit in search.hits]

        return Response(data, status=status.HTTP_200_OK)
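A hedged client-side sketch of calling this action; the route shown is an assumption, not necessarily the project's actual URL configuration:

import requests

# Hypothetical route for the search_by_url action.
resp = requests.post(
    "https://example.org/api/paper/search_by_url/",
    json={"url": "https://arxiv.org/abs/1706.03762", "search": True},
)
print(resp.status_code)  # 200 with csl_item/search data, 403 if a duplicate paper exists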
Example #6
    def check_url(self, request):
        url = request.data.get("url", None)
        url_is_pdf = check_url_contains_pdf(url)
        data = {"found_file": url_is_pdf}
        return Response(data, status=status.HTTP_200_OK)
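Every example above leans on `check_url_contains_pdf`, whose implementation is not shown here. A minimal sketch, assuming it issues a HEAD request and inspects the Content-Type header (the real helper may differ):

import requests

def check_url_contains_pdf(url):
    # Assumption: a URL "contains a pdf" if the server reports a PDF content type.
    try:
        resp = requests.head(url, allow_redirects=True, timeout=10)
        return "application/pdf" in resp.headers.get("Content-Type", "")
    except requests.RequestException:
        return False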
Example #7
def download_pdf(url):
    # Only download when the URL actually serves a PDF.
    if check_url_contains_pdf(url):
        pdf = get_pdf_from_url(url)
        # Use the last path segment as the filename.
        filename = url.split("/").pop()
        return pdf, filename
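Note that `download_pdf` returns `None` implicitly when the URL does not appear to serve a PDF, so callers should guard the unpacking (the URL below is illustrative):

result = download_pdf("https://arxiv.org/pdf/1706.03762")
if result is not None:
    pdf, filename = result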