Example #1
    def _validate_integrity_constraints(datapoint: SerializableArticleRecord):
        error = None
        error_msg = None
        if not datapoint.doi:
            error = "Missing DOI"
        elif not datapoint.title:
            error = "Missing title"
        elif len(datapoint.title) > Paper.max_length('title'):
            error = "Title too long"
            error_msg = error + f": {datapoint.title}"
        elif not datapoint.abstract:
            error = "Missing abstract"
        elif not datapoint.publication_date:
            error = "Missing publication date"

        if datapoint.doi and '\n' in datapoint.doi:
            error = "DOI has line breaks"

        author_count = 0
        for author in datapoint.authors:
            if ((author[1]
                 and len(author[1]) > Author.max_length("first_name"))
                    or (author[0]
                        and len(author[0]) > Author.max_length("last_name"))):
                error = "Author name too long"
                error_msg = error + f": {author[0]}, {author[1]}"
            if not AuthorNameResolution.objects.filter(
                    source_first_name=author[1],
                    source_last_name=author[0],
                    target_author=None).exists():
                # Count only authors that are not on the author ignore list
                author_count += 1

        if author_count == 0:
            error = "No authors"

        if error:
            if not error_msg:
                error_msg = error
            raise DatabaseUpdate.Error(error_msg)
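
A hedged usage sketch for the check above; the caller-side handling is an assumption, not part of the original code. Here record stands for any SerializableArticleRecord produced by a scraper, logger for a standard logging.Logger, and the call assumes the function is a static method of DatabaseUpdate, as the DatabaseUpdate.Error raise suggests.

# Sketch only: record, logger and the static-method call are assumptions.
try:
    DatabaseUpdate._validate_integrity_constraints(record)
except DatabaseUpdate.Error as exc:
    logger.warning("Rejected datapoint %s: %s", record.doi, exc)
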
Example #2
    def _update(self, db_article: Paper, datapoint: SerializableArticleRecord):
        db_article.title = datapoint.title
        db_article.abstract = datapoint.abstract
        db_article.published_at = datapoint.publication_date

        db_article.url = datapoint.url
        db_article.pdf_url = datapoint.pdf_url
        db_article.is_preprint = datapoint.is_preprint
        db_article.pubmed_id = datapoint.pubmed_id
        db_article.data_source_value = self.datasource
        db_article.covid_related = covid_related(db_article=db_article)

        if self.datasource.check_covid_related and not db_article.covid_related:
            raise DatabaseUpdate.Error("Article not covid related.")

        db_article.host, _ = PaperHost.objects.get_or_create(
            name=datapoint.paperhost)

        db_article.visualized = False
        db_article.vectorized = False
        db_article.save()

        AuthorPaperMembership.objects.filter(paper=db_article).delete()
        rank = 0
        for author in datapoint.authors:
            db_author, _ = Author.get_or_create_by_name(first_name=author[1],
                                                        last_name=author[0])
            if db_author is not None:
                AuthorPaperMembership.objects.create(paper=db_article,
                                                     author=db_author,
                                                     rank=rank)
                rank += 1

        if datapoint.journal:
            db_article.journal, _ = Journal.objects.get_or_create(
                name=datapoint.journal[:Journal.max_length("name")])

        db_article.version = datapoint.version
        db_article.last_scrape = timezone.now()

        db_article.categories.clear()
        db_article.scrape_hash = datapoint.md5
        db_article.save()
    def get_new_data(self, progress=None):
        self.statistics = UpdateStatistics()
        self.statistics.start()

        total = self._count()
        self.log(f"Check {total} publications")

        iterator = progress(
            self._get_data_points(),
            length=total) if progress else self._get_data_points()

        for data_point in iterator:
            self.get_or_create_db_article(data_point)

        self.log("Delete orphaned authors and journals")
        self.statistics.authors_deleted = Author.cleanup()
        self.statistics.journals_deleted = Journal.cleanup()

        self.statistics.stop()
        self.log(self.statistics)
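
The progress argument is assumed here to be a callable with the signature progress(iterable, length=...), as the call above implies. A minimal, self-contained stand-in could look like this (the name simple_progress is illustrative, not from the source):

def simple_progress(iterable, length=None):
    # Wraps any iterable and prints a running "current/total" counter.
    for i, item in enumerate(iterable, start=1):
        print(f"\r{i}/{length if length is not None else '?'}", end="", flush=True)
        yield item
    print()

It would be passed as something like updater.get_new_data(progress=simple_progress), where updater is an instance of the concrete data-source updater (again an assumption about the surrounding class).
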
    def update_existing_data(self, count=None, progress=None):
        """
        Updates the stored papers, starting with the one with the earliest last-scrape.
        Count is the total number of papers to update.
        """
        self.statistics = UpdateStatistics()
        self.statistics.start()

        total = self._count()

        if count is None:
            count = total

        self.log(f"Update {count} existing articles")
        if self.force_update:
            self.log("Force updating articles")

        filtered_articles = Paper.objects.all().filter(
            data_source_value=self.data_source).order_by(
                F('last_scrape').asc(nulls_first=True))

        iterator = ArticleDatapointIterator(filtered_articles, count,
                                            self._get_data_point)

        for article, data_point in progress(iterator, length=count):
            if (not self.force_update and data_point.update_timestamp
                    and article.last_scrape > data_point.update_timestamp):
                DataUpdater.set_last_scrape(data_point)
                continue
            self.get_or_create_db_article(data_point)

        self.log(
            f"{len(iterator.missing_dois)} missing Data Points: {iterator.missing_dois}"
        )
        self.statistics.n_missing_datapoints = len(iterator.missing_dois)
        self.log("Delete orphaned authors and journals")
        self.statistics.authors_deleted = Author.cleanup()
        self.statistics.journals_deleted = Journal.cleanup()

        self.statistics.stop()
        self.log(self.statistics)
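
Restated outside the loop, the skip rule above amounts to the following predicate; this is an illustrative rewrite, not code from the source:

def should_skip(force_update, last_scrape, update_timestamp):
    # Mirrors the condition above: skip when not force-updating, the datapoint
    # carries an update timestamp, and the stored article was scraped later.
    return bool(not force_update
                and update_timestamp
                and last_scrape > update_timestamp)
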
Example #5
def scrape_conflict(request):
    if request.method == 'GET':
        errors = []
        for error in ScrapeConflict.objects.all():
            datapoint = json.loads(error.datapoint)
            form = PaperForm(instance=error.paper)
            comparison = {
                'publication_date':
                datetime.strftime(error.paper.published_at,
                                  '%Y-%m-%d') == datapoint['publication_date'],
                'authors':
                sorted([[a.last_name, a.first_name]
                        for a in error.paper.authors.all()
                        ]) == sorted(datapoint['authors']),
                'journal':
                # Falls back to the raw datapoint value when the stored paper
                # has no journal assigned.
                error.paper.journal.name == datapoint['journal']
                if error.paper.journal else datapoint['journal'],
            }
            errors.append({
                'paper': error.paper,
                'form': form,
                'datapoint': datapoint,
                'comparison': comparison
            })

        return render(request,
                      'dashboard/scrape/scrape_conflicts_overview.html', {
                          'errors': errors,
                          'debug': settings.DEBUG
                      })

    elif request.method == 'POST':
        doi = request.POST.get('doi')
        paper = Paper.objects.get(doi=doi)
        conflict = ScrapeConflict.objects.get(paper=paper)

        if 'accept-button' in request.POST:
            form = PaperForm(request.POST, instance=paper)
            if not form.is_valid():
                messages.add_message(request, messages.ERROR, f"{form.errors}")
            else:
                with transaction.atomic():
                    try:
                        form.save()
                        if request.POST.get('manually_modified') == 'on':
                            paper.manually_modified = True
                        else:
                            paper.manually_modified = False

                        AuthorPaperMembership.objects.filter(
                            paper=paper).delete()
                        rank = 0
                        for author in request.POST.get('author_list').split(
                                ';'):
                            author, _ = Author.get_or_create_by_name(
                                author.split(',')[1],
                                author.split(',')[0])
                            if author is not None:
                                AuthorPaperMembership.objects.create(
                                    paper=paper, author=author, rank=rank)
                                rank += 1

                        journal_name = request.POST.get('journal_name', None)
                        if journal_name:
                            journal, _ = Journal.objects.get_or_create(
                                name=journal_name)
                        else:
                            journal = None
                        paper.journal = journal

                        paper.visualized = False
                        paper.vectorized = False

                        paper.scrape_hash = json.loads(
                            conflict.datapoint)['_md5']
                        paper.save()
                        conflict.delete()
                        messages.add_message(
                            request, messages.SUCCESS,
                            "Successfully saved the changes.")
                    except IntegrityError:
                        messages.add_message(request, messages.ERROR,
                                             "Integrity error.")

        return redirect('scrape_conflict')
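
The accept branch expects the author_list POST field to be a semicolon-separated list of last_name,first_name pairs. A small illustrative parse (the sample value is made up) shows the format the split calls above assume:

author_list = "Doe,Jane;Smith,John"  # illustrative value, not from the source
pairs = [entry.split(',') for entry in author_list.split(';')]
# pairs == [['Doe', 'Jane'], ['Smith', 'John']]; index 1 is the first name and
# index 0 the last name, matching the Author.get_or_create_by_name call.
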
Example #6
    def _import_papers(self, papers, paper_informations, authors,
                       import_locations, import_ml_categories, import_journals, tar):
        """
        Import papers and its associated authors. Also its relations with locations, categories and journals,
        depending on the bool parameters.
        The mapping of all things (except authors) must have been built before using this.
        """
        paper_title_max_len = Paper.max_length("title")
        author_firstname_max_len = Author.max_length("first_name")
        author_lastname_max_len = Author.max_length("last_name")

        papers_to_add = []
        category_memberships_to_create = []
        location_memberships_to_create = []

        for i, (paper, paper_info) in enumerate(zip(papers, paper_informations)):
            db_paper = paper_info["db_paper"]
            if not db_paper:
                continue

            if paper_info["will_update"]:
                db_paper.title = paper["title"][:paper_title_max_len]
                db_paper.abstract = paper["abstract"]
                db_paper.data_source_value = paper["datasource_id"]
                db_paper.version = paper["version"]
                db_paper.covid_related = paper["covid_related"]
                db_paper.url = paper["url"]
                db_paper.pdf_url = paper["pdf_url"]
                db_paper.is_preprint = paper["is_preprint"]
                db_paper.published_at = paper["published_at"]

                db_paper.last_scrape = make_aware(
                    datetime.strptime(paper["last_scrape"], "%Y-%m-%d %H:%M:%S")
                ) if paper["last_scrape"] else None

                if self.export_version > 4:
                    db_paper.scrape_hash = paper["scrape_hash"]
                if self.export_version > 5:
                    db_paper.manually_modified = paper["manually_modified"]
                db_paper.host = self._mappings.paperhost_mapping[paper["paperhost_id"]] if paper[
                    "paperhost_id"] else None
                db_paper.pubmed_id = paper["pubmed_id"] if "pubmed_id" in paper else None
                db_paper.journal = (
                    self._mappings.journal_mapping[paper["journal_id"]] if import_journals and paper[
                        "journal_id"] else None
                )
                db_paper.data = self._mappings.paperdata_mapping[
                    db_paper.doi] if db_paper.doi in self._mappings.paperdata_mapping else None

                if self.export_version >= 4:
                    db_paper.visualized = paper["visualized"]
                    db_paper.vectorized = paper["vectorized"]

                img_path = paper["image"]
                if img_path:
                    with tar.extractfile(img_path) as img_file:
                        image = Image.open(img_file)
                        buffer = BytesIO()
                        image.save(buffer, format="JPEG")
                        db_paper.add_preview_image(buffer, save=False)

                papers_to_add.append(db_paper)
                self.statistics.added_papers += 1

                self._mappings.doi_to_author_mapping[db_paper.doi] = []  # maps doi to a list of its db_authors

                for author_id in paper["author_ids"]:
                    author = authors[author_id]
                    author_tuple = (author["firstname"][:author_firstname_max_len],
                                    author["lastname"][:author_lastname_max_len])
                    try:
                        db_author = Author.objects.get(first_name=author["firstname"][:author_firstname_max_len],
                                                       last_name=author["lastname"][:author_lastname_max_len])
                        self._mappings.db_author_mapping[author_tuple] = {"db_author": db_author, "created": False}
                    except Author.DoesNotExist:
                        if author_tuple in self._mappings.db_author_mapping:
                            # author was already requested earlier
                            db_author = self._mappings.db_author_mapping[author_tuple]["db_author"]
                        else:
                            db_author = Author(first_name=author["firstname"][:author_firstname_max_len],
                                               last_name=author["lastname"][:author_lastname_max_len])
                            self._mappings.db_author_mapping[author_tuple] = {"db_author": db_author, "created": True}
                            self.statistics.authors_created += 1
                    self._mappings.doi_to_author_mapping[db_paper.doi].append(db_author)

            if import_ml_categories and not db_paper.categories.exists():
                # Set paper categories if they were not set (even on existing papers)
                if paper["category_memberships"]:
                    self.statistics.papers_w_new_category += 1
                for category in paper["category_memberships"]:
                    membership = CategoryMembership(paper=db_paper,
                                                    category=self._mappings.category_mapping[category["identifier"]],
                                                    score=category["score"])
                    category_memberships_to_create.append(membership)

            if import_locations and not db_paper.locations.exists():
                # Set paper locations if they were not set (even on existing papers)
                if paper["locations"]:
                    self.statistics.papers_w_new_location += 1
                    db_paper.location_modified = paper["location_modified"]
                for location in paper["locations"]:
                    membership = GeoLocationMembership(paper=db_paper,
                                                       location=self._mappings.location_mapping[location["id"]],
                                                       state=location["state"])
                    location_memberships_to_create.append(membership)

        Paper.objects.bulk_create(papers_to_add)
        Author.objects.bulk_create([author["db_author"] for author in self._mappings.db_author_mapping.values()
                                    if author["created"]])
        CategoryMembership.objects.bulk_create(category_memberships_to_create)
        GeoLocationMembership.objects.bulk_create(location_memberships_to_create)

        author_paper_memberships = []
        for doi, authors in self._mappings.doi_to_author_mapping.items():
            author_paper_memberships += [AuthorPaperMembership(paper_id=doi, author_id=author.pk, rank=i)
                                         for i, author in enumerate(authors)]
        AuthorPaperMembership.objects.bulk_create(author_paper_memberships)
        # recompute counts because post save signals are not triggered on bulk create
        GeoLocation.recompute_counts(GeoCity.objects.all(), GeoCountry.objects.all())
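
The author handling above follows a cache-then-bulk-create pattern: look up an existing row, otherwise reuse an author already queued in db_author_mapping, otherwise queue a new unsaved instance, and create everything later in one bulk_create. A stripped-down, framework-free sketch of the same idea (all names here are illustrative):

author_cache = {}      # (first, last) -> {"author": ..., "created": bool}

def get_or_queue_author(first, last, lookup_in_db):
    key = (first, last)
    found = lookup_in_db(first, last)        # stands in for Author.objects.get
    if found is not None:
        author_cache[key] = {"author": found, "created": False}
        return found
    if key in author_cache:                  # already queued by an earlier paper
        return author_cache[key]["author"]
    new_author = {"first_name": first, "last_name": last}    # unsaved stand-in
    author_cache[key] = {"author": new_author, "created": True}
    return new_author

Only entries flagged as created are then passed to bulk_create, which is why the flag is tracked alongside each object.
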
Example #7
    def _cleanup_models(self):
        self.statistics.authors_deleted = Author.cleanup()
        self.statistics.journals_deleted = Journal.cleanup()
        self.statistics.paperdata_deleted = PaperData.cleanup()