def _validate_integrity_constraints(datapoint: SerializableArticleRecord):
    error = None
    error_msg = None
    if not datapoint.doi:
        error = "Missing DOI"
    elif not datapoint.title:
        error = "Missing title"
    elif len(datapoint.title) > Paper.max_length('title'):
        error = "Title too long"
        error_msg = error + f": {datapoint.title}"
    elif not datapoint.abstract:
        error = "Missing abstract"
    elif not datapoint.publication_date:
        error = "Missing publication date"

    if datapoint.doi and '\n' in datapoint.doi:
        error = "DOI has line breaks"

    author_count = 0
    for author in datapoint.authors:
        if ((author[1] and len(author[1]) > Author.max_length("first_name"))
                or (author[0] and len(author[0]) > Author.max_length("last_name"))):
            error = "Author name too long"
            error_msg = error + f": {author[0]}, {author[1]}"
        if not AuthorNameResolution.objects.filter(
                source_first_name=author[1],
                source_last_name=author[0],
                target_author=None).exists():
            # Count only authors that are not on the author ignore list
            author_count += 1

    if author_count == 0:
        error = "No authors"

    if error:
        if not error_msg:
            error_msg = error
        raise DatabaseUpdate.Error(error_msg)
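# Usage sketch (hypothetical helper, not part of the original module): the validator raises
# DatabaseUpdate.Error with a human-readable message on the first violated constraint and
# treats every entry of `datapoint.authors` as a (last_name, first_name) pair. A caller that
# only needs a yes/no answer could wrap it like this:
def _record_is_importable(datapoint: SerializableArticleRecord) -> bool:
    """Return True if the record passes the integrity checks, False otherwise (sketch)."""
    try:
        _validate_integrity_constraints(datapoint)
        return True
    except DatabaseUpdate.Error:
        return False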
def _update(self, db_article: Paper, datapoint: SerializableArticleRecord):
    db_article.title = datapoint.title
    db_article.abstract = datapoint.abstract
    db_article.published_at = datapoint.publication_date
    db_article.url = datapoint.url
    db_article.pdf_url = datapoint.pdf_url
    db_article.is_preprint = datapoint.is_preprint
    db_article.pubmed_id = datapoint.pubmed_id
    db_article.data_source_value = self.datasource

    db_article.covid_related = covid_related(db_article=db_article)
    if self.datasource.check_covid_related and not db_article.covid_related:
        raise DatabaseUpdate.Error("Article not covid related.")

    db_article.host, _ = PaperHost.objects.get_or_create(name=datapoint.paperhost)

    db_article.visualized = False
    db_article.vectorized = False
    db_article.save()

    AuthorPaperMembership.objects.filter(paper=db_article).delete()
    rank = 0
    for author in datapoint.authors:
        db_author, _ = Author.get_or_create_by_name(first_name=author[1], last_name=author[0])
        if db_author is not None:
            AuthorPaperMembership.objects.create(paper=db_article, author=db_author, rank=rank)
            rank += 1

    if datapoint.journal:
        db_article.journal, _ = Journal.objects.get_or_create(
            name=datapoint.journal[:Journal.max_length("name")])

    db_article.version = datapoint.version
    db_article.last_scrape = timezone.now()

    db_article.categories.clear()
    db_article.scrape_hash = datapoint.md5
    db_article.save()
def get_new_data(self, progress=None):
    self.statistics = UpdateStatistics()
    self.statistics.start()

    total = self._count()
    self.log(f"Check {total} publications")

    iterator = progress(self._get_data_points(), length=total) if progress else self._get_data_points()
    for data_point in iterator:
        self.get_or_create_db_article(data_point)

    self.log("Delete orphaned authors and journals")
    self.statistics.authors_deleted = Author.cleanup()
    self.statistics.journals_deleted = Journal.cleanup()

    self.statistics.stop()
    self.log(self.statistics)
def update_existing_data(self, count=None, progress=None):
    """
    Update the stored papers, starting with the one with the earliest last scrape.
    `count` is the total number of papers to update.
    """
    self.statistics = UpdateStatistics()
    self.statistics.start()

    total = self._count()
    if count is None:
        count = total

    self.log(f"Update {count} existing articles")
    if self.force_update:
        self.log("Force updating articles")

    filtered_articles = Paper.objects.all().filter(
        data_source_value=self.data_source).order_by(
        F('last_scrape').asc(nulls_first=True))

    iterator = ArticleDatapointIterator(filtered_articles, count, self._get_data_point)
    # Guard against a missing progress callback, analogous to get_new_data().
    wrapped_iterator = progress(iterator, length=count) if progress else iterator

    for article, data_point in wrapped_iterator:
        if not self.force_update and data_point.update_timestamp and article.last_scrape > data_point.update_timestamp:
            # Article was scraped after the data point was last updated; no update needed.
            DataUpdater.set_last_scrape(data_point)
            continue
        self.get_or_create_db_article(data_point)

    self.log(f"{len(iterator.missing_dois)} missing data points: {iterator.missing_dois}")
    self.statistics.n_missing_datapoints = len(iterator.missing_dois)

    self.log("Delete orphaned authors and journals")
    self.statistics.authors_deleted = Author.cleanup()
    self.statistics.journals_deleted = Journal.cleanup()

    self.statistics.stop()
    self.log(self.statistics)
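# A sketch of a progress callback compatible with get_new_data() / update_existing_data().
# Its shape is inferred from how `progress` is invoked above (positional iterable plus a
# `length` keyword); the tqdm dependency and the names `tqdm_progress` and `updater` are
# assumptions, not part of this module.
def tqdm_progress(iterable, length=None):
    from tqdm import tqdm  # assumed to be installed; any iterable wrapper works
    return tqdm(iterable, total=length)
# Example call (hypothetical updater instance):
#     updater.update_existing_data(count=500, progress=tqdm_progress)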
def scrape_conflict(request):
    if request.method == 'GET':
        errors = []
        for error in ScrapeConflict.objects.all():
            datapoint = json.loads(error.datapoint)
            form = PaperForm(instance=error.paper)
            comparison = {
                'publication_date': datetime.strftime(error.paper.published_at, '%Y-%m-%d') == datapoint['publication_date'],
                'authors': sorted([[a.last_name, a.first_name] for a in error.paper.authors.all()]) == sorted(datapoint['authors']),
                'journal': (error.paper.journal.name == datapoint['journal'])
                           if error.paper.journal else not datapoint['journal'],
            }
            errors.append({
                'paper': error.paper,
                'form': form,
                'datapoint': datapoint,
                'comparison': comparison,
            })
        return render(request, 'dashboard/scrape/scrape_conflicts_overview.html',
                      {'errors': errors, 'debug': settings.DEBUG})
    elif request.method == 'POST':
        doi = request.POST.get('doi')
        paper = Paper.objects.get(doi=doi)
        conflict = ScrapeConflict.objects.get(paper=paper)
        if 'accept-button' in request.POST:
            form = PaperForm(request.POST, instance=paper)
            if not form.is_valid():
                messages.add_message(request, messages.ERROR, f"{form.errors}")
            else:
                with transaction.atomic():
                    try:
                        form.save()
                        paper.manually_modified = request.POST.get('manually_modified') == 'on'

                        AuthorPaperMembership.objects.filter(paper=paper).delete()
                        rank = 0
                        for author in request.POST.get('author_list').split(';'):
                            author, _ = Author.get_or_create_by_name(author.split(',')[1],
                                                                     author.split(',')[0])
                            if author is not None:
                                AuthorPaperMembership.objects.create(paper=paper, author=author, rank=rank)
                                rank += 1

                        journal_name = request.POST.get('journal_name', None)
                        if journal_name:
                            journal, _ = Journal.objects.get_or_create(name=journal_name)
                        else:
                            journal = None
                        paper.journal = journal

                        paper.visualized = False
                        paper.vectorized = False
                        paper.scrape_hash = json.loads(conflict.datapoint)['_md5']
                        paper.save()
                        conflict.delete()
                        messages.add_message(request, messages.SUCCESS,
                                             "Successfully saved the changes.")
                    except IntegrityError:
                        messages.add_message(request, messages.ERROR, "Integrity error.")
        return redirect('scrape_conflict')
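# The POST contract of the accept branch, inferred from the parsing above and shown as a
# sketch: `author_list` is a semicolon-separated list of "last_name,first_name" pairs.
# The helper name below is illustrative only and is not used by the view.
def _parse_author_list(author_list: str):
    """Split 'Doe,Jane;Smith,John' into [('Jane', 'Doe'), ('John', 'Smith')] as (first, last) pairs."""
    return [(entry.split(',')[1], entry.split(',')[0]) for entry in author_list.split(';')]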
def _import_papers(self, papers, paper_informations, authors, import_locations,
                   import_ml_categories, import_journals, tar):
    """
    Import papers and their associated authors, along with their relations to locations,
    categories and journals, depending on the boolean parameters.
    All mappings (except the author mapping) must have been built before calling this.
    """
    paper_title_max_len = Paper.max_length("title")
    author_firstname_max_len = Author.max_length("first_name")
    author_lastname_max_len = Author.max_length("last_name")

    papers_to_add = []
    category_memberships_to_create = []
    location_memberships_to_create = []

    for i, (paper, paper_info) in enumerate(zip(papers, paper_informations)):
        db_paper = paper_info["db_paper"]
        if not db_paper:
            continue

        if paper_info["will_update"]:
            db_paper.title = paper["title"][:paper_title_max_len]
            db_paper.abstract = paper["abstract"]
            db_paper.data_source_value = paper["datasource_id"]
            db_paper.version = paper["version"]
            db_paper.covid_related = paper["covid_related"]
            db_paper.url = paper["url"]
            db_paper.pdf_url = paper["pdf_url"]
            db_paper.is_preprint = paper["is_preprint"]
            db_paper.published_at = paper["published_at"]
            db_paper.last_scrape = make_aware(
                datetime.strptime(paper["last_scrape"], "%Y-%m-%d %H:%M:%S")
            ) if paper["last_scrape"] else None

            if self.export_version > 4:
                db_paper.scrape_hash = paper["scrape_hash"]
            if self.export_version > 5:
                db_paper.manually_modified = paper["manually_modified"]

            db_paper.host = self._mappings.paperhost_mapping[paper["paperhost_id"]] \
                if paper["paperhost_id"] else None
            db_paper.pubmed_id = paper["pubmed_id"] if "pubmed_id" in paper else None
            db_paper.journal = (
                self._mappings.journal_mapping[paper["journal_id"]]
                if import_journals and paper["journal_id"] else None
            )
            db_paper.data = self._mappings.paperdata_mapping[db_paper.doi] \
                if db_paper.doi in self._mappings.paperdata_mapping else None

            if self.export_version >= 4:
                db_paper.visualized = paper["visualized"]
                db_paper.vectorized = paper["vectorized"]

            img_path = paper["image"]
            if img_path:
                with tar.extractfile(img_path) as img_file:
                    image = Image.open(img_file)
                    buffer = BytesIO()
                    image.save(buffer, format="JPEG")
                    db_paper.add_preview_image(buffer, save=False)

            papers_to_add.append(db_paper)
            self.statistics.added_papers += 1

            self._mappings.doi_to_author_mapping[db_paper.doi] = []  # maps doi to a list of its db_authors
            for author_id in paper["author_ids"]:
                author = authors[author_id]
                author_tuple = (author["firstname"][:author_firstname_max_len],
                                author["lastname"][:author_lastname_max_len])
                try:
                    db_author = Author.objects.get(
                        first_name=author["firstname"][:author_firstname_max_len],
                        last_name=author["lastname"][:author_lastname_max_len])
                    self._mappings.db_author_mapping[author_tuple] = {"db_author": db_author, "created": False}
                except Author.DoesNotExist:
                    if author_tuple in self._mappings.db_author_mapping:
                        # author was already requested earlier
                        db_author = self._mappings.db_author_mapping[author_tuple]["db_author"]
                    else:
                        db_author = Author(first_name=author["firstname"][:author_firstname_max_len],
                                           last_name=author["lastname"][:author_lastname_max_len])
                        self._mappings.db_author_mapping[author_tuple] = {"db_author": db_author, "created": True}
                        self.statistics.authors_created += 1
                self._mappings.doi_to_author_mapping[db_paper.doi].append(db_author)

        if import_ml_categories and not db_paper.categories.exists():
            # Set paper categories if they were not set (even on existing papers)
            if paper["category_memberships"]:
                self.statistics.papers_w_new_category += 1
            for category in paper["category_memberships"]:
                membership = CategoryMembership(paper=db_paper,
                                                category=self._mappings.category_mapping[category["identifier"]],
                                                score=category["score"])
                category_memberships_to_create.append(membership)

        if import_locations and not db_paper.locations.exists():
            # Set paper locations if they were not set (even on existing papers)
            if paper["locations"]:
                self.statistics.papers_w_new_location += 1
            db_paper.location_modified = paper["location_modified"]
            for location in paper["locations"]:
                membership = GeoLocationMembership(paper=db_paper,
                                                   location=self._mappings.location_mapping[location["id"]],
                                                   state=location["state"])
                location_memberships_to_create.append(membership)

    Paper.objects.bulk_create(papers_to_add)
    Author.objects.bulk_create([author["db_author"]
                                for author in self._mappings.db_author_mapping.values()
                                if author["created"]])
    CategoryMembership.objects.bulk_create(category_memberships_to_create)
    GeoLocationMembership.objects.bulk_create(location_memberships_to_create)

    author_paper_memberships = []
    for doi, authors in self._mappings.doi_to_author_mapping.items():
        author_paper_memberships += [AuthorPaperMembership(paper_id=doi, author_id=author.pk, rank=i)
                                     for i, author in enumerate(authors)]
    AuthorPaperMembership.objects.bulk_create(author_paper_memberships)

    # recompute counts because post save signals are not triggered on bulk create
    GeoLocation.recompute_counts(GeoCity.objects.all(), GeoCountry.objects.all())
def _cleanup_models(self):
    self.statistics.authors_deleted = Author.cleanup()
    self.statistics.journals_deleted = Journal.cleanup()
    self.statistics.paperdata_deleted = PaperData.cleanup()