Example #1
    def _compute_updatable_papers(self, papers):
        """
        Computes which of the papers from the import will be touched for (re-)creation and creates model instances
        (without saving), if necessary.
        Returns a list of dicts of size len(papers) of format {db_paper, will_update}.
        db_paper=None indicates an error (possibly with the publication date), so the paper won't be created/updated.
        """
        paper_informations = []
        for i, paper in enumerate(papers):
            if not paper["published_at"]:
                self.log(f"Not importing {paper['doi']} because the date is missing.")
                paper_informations.append({"db_paper": None, "will_update": False})
                continue
            try:
                db_paper = Paper.objects.get(doi=paper["doi"])
                if DataSource.compare(db_paper.data_source_value, paper["datasource_id"]) >= 0:
                    paper_informations.append({"db_paper": db_paper, "will_update": False})
                    continue
                else:
                    # delete db_paper and recreate -> easier to handle using bulk create
                    db_paper.delete()
                    db_paper = Paper(doi=paper["doi"])
            except Paper.DoesNotExist:
                db_paper = Paper(doi=paper["doi"])
            paper_informations.append({"db_paper": db_paper, "will_update": True})
        return paper_informations
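A possible caller is sketched below; it is an assumption, not part of the example above, and only illustrates how the returned list could feed Django's bulk_create. The names `importer` and `papers` are placeholders.

# Sketch (assumed caller): consume the result of _compute_updatable_papers()
# and bulk-insert the instances that were marked for (re-)creation.
paper_infos = importer._compute_updatable_papers(papers)
to_create = [
    info["db_paper"]
    for info in paper_infos
    if info["will_update"] and info["db_paper"] is not None
]
# The remaining fields of each unsaved Paper would be filled in before this call.
Paper.objects.bulk_create(to_create)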
Example #2
    def handle(self, *args, **options):
        literature_results = json.load(options['literature'])
        variants_found_in_papers = literature_results['variants']
        papers = literature_results['papers']
        crawl_date = literature_results['date']

        # Soft delete all existing records (they will be undeleted if they're
        # in the new data)
        existing_papers = Paper.objects.all()
        existing_papers.update(deleted=True)
        existing_variant_papers = VariantPaper.objects.all()
        existing_variant_papers.update(deleted=True)

        paper_objects = {}
        for pmid, paper in papers.items():
            query = Paper.objects.filter(pmid=pmid)
            if query.count() > 0:
                # we already have this paper in the database
                paper_objects[pmid] = query[0]
                query.update(deleted=False, crawl_date=crawl_date)
            else:
                if not paper['year']:
                    paper['year'] = '0000'
                p = Paper(title=paper['title'], authors=paper['authors'], journal=paper['journal'],
                          keywords=paper['keywords'], abstract=paper['abstract'], year=paper['year'],
                          deleted=False, pmid=paper['pmid'], crawl_date=crawl_date)
                p.save()
                paper_objects[pmid] = p

        for variant_genomic_coordinate, variant_instances in variants_found_in_papers.items():
            for variant in variant_instances:
                pmid = variant['pmid']
                points = variant['points']
                mentions = variant['mentions']
                if pmid in paper_objects:
                    paper = paper_objects[pmid]
                    if mentions is None:
                        mentions = []
                    query = VariantPaper.objects.filter(
                        variant_hg38=variant_genomic_coordinate, paper=paper)
                    if query.count() > 0:
                        # we already have this variantpaper
                        query.update(mentions=mentions,
                                     points=points,
                                     deleted=False)
                    else:
                        vp = VariantPaper(
                            variant_hg38=variant_genomic_coordinate,
                            paper=paper,
                            points=points,
                            mentions=mentions,
                            deleted=False)
                        vp.save()
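A short follow-up sketch, not part of the original command, showing how the soft-delete flag set above could be inspected after the import; it assumes the same `deleted` field on Paper.

# Sketch (assumed): papers still flagged as deleted were absent from the new
# crawl; everything else was confirmed or newly created.
stale = Paper.objects.filter(deleted=True).count()
active = Paper.objects.filter(deleted=False).count()
print("{} papers no longer in the crawl, {} papers active".format(stale, active))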
Example #3
    def insert(self, datapoint: SerializableArticleRecord):
        self._validate_integrity_constraints(datapoint)

        if IgnoredPaper.objects.filter(doi=datapoint.doi).exists():
            raise DatabaseUpdate.SkipArticle("DOI is on ignore list")

        conflict = False
        try:
            with transaction.atomic():
                try:
                    db_article = Paper.objects.get(doi=datapoint.doi)
                    created = False
                except Paper.DoesNotExist:
                    db_article = Paper(doi=datapoint.doi)
                    created = True

                if not created:
                    datasource_comparison = DataSource.compare(
                        db_article.data_source_value, datapoint.datasource)
                    if datasource_comparison > 0:
                        datasource_name = DataSource(
                            db_article.data_source_value).name
                        raise DatabaseUpdate.SkipArticle(
                            f"Article already tracked by {datasource_name}")
                    elif not self.force_update and not self.update_existing and datasource_comparison == 0:
                        raise DatabaseUpdate.SkipArticle(
                            "Article already in database")

                    changed_externally = db_article.scrape_hash != datapoint.md5
                    changed_internally = db_article.manually_modified

                    if not self.force_update and not changed_externally:
                        db_article.last_scrape = timezone.now()
                        db_article.save()
                        return db_article, False, False  # Article was neither created nor updated

                    if changed_internally:
                        conflict = True
                        raise DatabaseUpdate.Error(
                            "Conflict: Manual modification and external change"
                        )

                self._update(db_article, datapoint)
        except DatabaseUpdate.Error as ex:
            if conflict:
                self._handle_conflict(db_article, datapoint)
            raise ex

        return db_article, created, True  # Article was created or updated
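A hedged usage sketch of a caller driving insert(); the `updater` object and the `records` iterable are assumptions, only the exceptions and the three-value return come from the method above.

# Sketch (assumed caller): tally the outcomes that insert() can signal.
created_count = updated_count = skipped = failed = 0
for record in records:  # assumed iterable of SerializableArticleRecord
    try:
        _, was_created, was_updated = updater.insert(record)
        if was_created:
            created_count += 1
        elif was_updated:
            updated_count += 1
    except DatabaseUpdate.SkipArticle:
        skipped += 1
    except DatabaseUpdate.Error:
        failed += 1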
Example #4
def index(request):
    global GDRIVE_DIR_ID

    bulk = BulkUploadForm()
    upl = UploadForm()
    if request.method == "POST":
        try:
            # A missing 'file' raises AssertionError, which routes the request
            # to the staff-only BulkUploadForm branch handled further below.
            assert request.FILES.get('file', None) is not None
            # UploadForm is submitted
            upl = UploadForm(request.POST, request.FILES)
            if upl.is_valid():
                path = STATICFILES_DIRS[0]
                uid = uuid.uuid4()
                path = os.path.join(path, "files", "{}.pdf".format(uid))
                file = request.FILES.get('file')
                with open(path, 'wb+') as dest:
                    for chunk in file.chunks():
                        dest.write(chunk)
                if not GDRIVE_DIR_ID:
                    GDRIVE_DIR_ID = get_or_create_folder(GDRIVE_DIRNAME,
                                                         public=True)
                paper = upl.save(commit=False)
                paper.link = upload_file(path,
                                         "{}.pdf".format(uid),
                                         folderId=GDRIVE_DIR_ID)
                keys_tmp = upl.cleaned_data.get("keywords")
                if upl.cleaned_data.get('custom_subject', '') != '':
                    paper.subject = upl.cleaned_data.get('custom_subject')
                paper.save()

                for key in keys_tmp:
                    paper.keywords.add(key)

                paper.save()
                LOG.info("New file uploaded: {}.pdf".format(uid))
                messages.success(request, "File Upload Successful")
                try:
                    del_key = request.POST.get('del_key', 0)
                    key = int(del_key)
                    if key > 0:
                        PaperRequest.objects.filter(pk=key).delete()
                        LOG.info("Request {} cleared".format(key))
                except Exception as e:
                    LOG.warning(e)

                os.remove(path)

        except AssertionError:
            if request.user.is_staff:
                # BulkUploadForm has been submitted
                bulk = BulkUploadForm(request.POST, request.FILES)
                processed = 0
                saved = 0
                if bulk.is_valid():
                    raw_papers = json.load(request.FILES.get('bulk_file'))
                    for paper in raw_papers:
                        processed += 1
                        dep_code = str(paper.get("Department", "Other"))
                        if dep_code == "":
                            dep_code = "Other"

                        dep, _ = Department.objects.get_or_create(code=dep_code)

                        p = Paper(department=dep,
                                  year=paper.get("Year", None),
                                  subject=paper.get("Paper", None),
                                  link=paper.get("Link", None),
                                  paper_type=paper.get("Semester", None))
                        try:
                            p.save()
                            saved += 1
                        except Exception as e:
                            LOG.warning(e)

                        LOG.info("%d entries processed, %d entries saved" %
                                 (processed, saved))
                    messages.success(
                        request,
                        "Bulk upload successful: {} entries saved".format(saved))

    return render(request, "upload.html", {
        "bulk_form": bulk,
        "crowd_form": upl
    })
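For completeness, a minimal URLconf sketch showing how a view like index() is typically wired up in Django; the module layout and route name are assumptions, not taken from the original project.

# urls.py sketch (assumed): expose the upload view.
from django.urls import path

from . import views

urlpatterns = [
    path("upload/", views.index, name="upload"),
]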