Beispiel #1
0
    def test_1_http_upload_fail(self):
        """
        Check that http_upload reports failure and leaves the upload marked as
        failed, first for the "http://fail" URL and then for the "http://except"
        URL (the latter is meant to exercise an actual exception).
        """
        # route the HTTP calls through the failing mocks
        requests.get = ResponseMockFactory.get_fail
        requests.head = ResponseMockFactory.head_fail

        def assert_upload_failed(outcome, upload):
            # the same expectations apply to both failure modes
            assert outcome is False
            assert upload.status == "failed"
            assert upload.error is not None and upload.error != ""
            assert upload.error_details is None
            assert list(upload.failure_reasons.keys()) == []

        upload = models.FileUpload()
        upload.set_id()
        upload.upload("testuser", "http://fail", status="exists")

        # register the local file location so it is tidied up after the test
        local_path = os.path.join(app.config.get("UPLOAD_DIR"), upload.local_filename)
        self.cleanup_paths.append(local_path)

        background_job = models.BackgroundJob()

        assert_upload_failed(
            ingestarticles.http_upload(background_job, local_path, upload),
            upload)

        # now try it with an actual exception
        upload.upload("testuser", "http://except", status="exists")

        assert_upload_failed(
            ingestarticles.http_upload(background_job, local_path, upload),
            upload)
Beispiel #2
0
    def test_3_submit_retry(self):
        """
        With the ingest_articles task configured for a single retry, submitting the
        job must raise RetryException each time: the first submission leaves the
        job "processing" with one attempt recorded, the second leaves it in
        "error" with two attempts recorded.
        """
        app.config["HUEY_TASKS"]["ingest_articles"]["retries"] = 1

        upload = models.FileUpload()
        upload.validated("doaj")
        upload.save()

        job = models.BackgroundJob()
        job.params = {
            "ingest_articles__file_upload_id": upload.id,
            "ingest_articles__attempts": 0,
        }
        job.save(blocking=True)

        # this assumes that huey is in always eager mode, so submit immediately calls
        # the async task, which in turn calls execute, which ultimately calls run
        self.assertRaises(
            RetryException,
            ingestarticles.IngestArticlesBackgroundTask.submit,
            job)

        after_first = models.BackgroundJob.pull(job.id)
        assert after_first.params.get("ingest_articles__attempts") == 1
        assert after_first.status == "processing"

        # submit again: per the config, the retry makes the job fail on the second attempt
        self.assertRaises(
            RetryException,
            ingestarticles.IngestArticlesBackgroundTask.submit,
            after_first)

        after_second = models.BackgroundJob.pull(after_first.id)
        assert after_second.params.get("ingest_articles__attempts") == 2
        assert after_second.status == "error"
Beispiel #3
0
# Work through the uploads in ascending order of modification time, visiting
# each distinct timestamp once.
lastmods = sorted(set(lastmods))

for lm in lastmods:
    for obj in lookup[lm]:
        attempted += 1

        publisher = obj["publisher"]
        filename = obj["filename"]
        id = obj["id"]

        # the file on disk is named after the record id
        f = id + ".xml"
        xml_file = os.path.join(xml_dir, f)

        # The trailing "Z" declares the timestamp to be UTC, so it has to be
        # derived in UTC; fromtimestamp() would yield server-local time.
        uploaded = datetime.utcfromtimestamp(lm).strftime("%Y-%m-%dT%H:%M:%SZ")

        upload = models.FileUpload()
        upload.set_schema(xwalk.format_name)
        upload.upload(publisher, filename)
        upload.set_created(uploaded)
        upload.set_id()

        # now try and parse the file
        doc = None
        try:
            # the context manager makes sure the handle is released for every
            # file, whether or not parsing succeeds
            with open(xml_file) as xml_handle:
                doc = etree.parse(xml_handle)
        except Exception:
            # Exception (rather than a bare except) so that Ctrl-C / SystemExit
            # abort the run instead of being logged as a malformed file
            failed += 1
            # single pre-formatted argument: prints the same on Python 2 and 3
            print("%s Malformed XML" % f)
            malformed_writer.writerow(
                [f, publisher, filename, uploaded, acc.email])
            upload.failed("Unable to parse file")
Beispiel #4
0
    def _url_upload(cls, username, url, schema, previous):
        """
        Record an upload-by-reference and check that the referenced file is retrievable.

        A FileUpload record is created for the URL, then the remote file is probed:
        HTTP(S) URLs with a HEAD request (falling back to a streamed GET), FTP
        URLs with a SIZE command.  On success the record is marked as existing;
        on failure it is marked as failed.  In both cases the record is saved
        and inserted at the front of ``previous``.

        :param username: passed, with the url, to ``FileUpload.upload``
        :param url: location of the file; also used as the record's filename
        :param schema: schema to set on the record (not verified here)
        :param previous: list of upload records; mutated in place
        :return: the id of the new record when the URL could be reached
        :raises BackgroundException: if the URL cannot be accessed, or its scheme
            is neither HTTP(S) nor FTP
        """

        # first define a few functions
        def __http_upload(record, previous, url):
            # first thing to try is a head request, supporting redirects
            head = requests.head(url, allow_redirects=True)
            if head.status_code == requests.codes.ok:
                return __ok(record, previous)

            # if we get to here, the head request failed.  This might be because the file
            # isn't there, but it might also be that the server doesn't support HEAD (a lot
            # of webapps [including this one] don't implement it)
            #
            # so we do an interruptable get request instead, so we don't download too much
            # unnecessary content
            get = requests.get(url, stream=True)
            get.close()
            if get.status_code == requests.codes.ok:
                return __ok(record, previous)
            return __fail(
                record,
                previous,
                error='error while checking submitted file reference: {0}'.
                format(get.status_code))

        def __ftp_upload(record, previous, parsed_url):
            # 1. find out whether the file exists
            # 2. that's it, return OK

            # We might as well check if the file exists using the SIZE command.
            # If the FTP server does not support SIZE, our article ingestion
            # script is going to refuse to process the file anyway, so might as
            # well get a failure now.
            # Also it's more of a faff to check file existence using LIST commands.
            ftp = None
            try:
                ftp = ftplib.FTP(parsed_url.hostname, parsed_url.username,
                                 parsed_url.password)
                # SIZE is not usually allowed in ASCII mode, so set to binary mode
                r = ftp.sendcmd('TYPE I')
                if not r.startswith('2'):
                    return __fail(
                        record,
                        previous,
                        error='could not set binary '
                        'mode in target FTP server while checking file exists')

                # size() normally raises if the file is missing, but it can also
                # return None (unexpected reply) or, very rarely, an invalid size
                size = ftp.size(parsed_url.path)
                if size is None or size < 0:
                    return __fail(
                        record,
                        previous,
                        error='file does not seem to exist on FTP server')

            except BackgroundException:
                # raised by __fail above: the failure has already been recorded,
                # so let it through rather than recording it a second time
                raise
            except Exception as e:
                return __fail(record,
                              previous,
                              error='error during FTP file existence check: ' +
                              str(e.args))
            finally:
                # always release the connection, whatever the outcome
                if ftp is not None:
                    try:
                        ftp.close()
                    except Exception:
                        # closing is best-effort and must not mask the result of the check
                        pass

            return __ok(record, previous)

        def __ok(record, previous):
            # mark the record as existing, persist it, and put it first in the list
            record.exists()
            record.save()
            previous.insert(0, record)
            return record.id

        def __fail(record, previous, error):
            # persist the failure and put it first in the list, then abort
            message = 'The URL could not be accessed; ' + error
            record.failed(message)
            record.save()
            previous.insert(0, record)
            raise BackgroundException(message)

        # prep a record to go into the index, to record this upload.  The filename is the url
        record = models.FileUpload()
        record.upload(username, url)
        record.set_id()
        # although the schema could be wrong, this will get checked later
        record.set_schema(schema)

        # now we attempt to verify that the file is retrievable
        try:
            # first, determine if ftp or http
            parsed_url = urlparse(url)
            if parsed_url.scheme in ['http', "https"]:
                return __http_upload(record, previous, url)
            elif parsed_url.scheme == 'ftp':
                return __ftp_upload(record, previous, parsed_url)
            else:
                return __fail(
                    record,
                    previous,
                    error=
                    'unsupported URL scheme "{0}". Only HTTP(s) and FTP are supported.'
                    .format(parsed_url.scheme))
        except BackgroundException:
            # already recorded by __fail; pass it straight on to the caller
            raise
        except Exception as e:
            # str(e) rather than e.message: the attribute does not exist on
            # Python 3 exceptions, so using it would make this handler itself fail
            return __fail(record,
                          previous,
                          error="please check it before submitting again; " +
                          str(e))
Beispiel #5
0
    def _file_upload(cls, username, f, schema, previous):
        """
        Store an uploaded file on disk, record it in the index and validate it.

        On successful validation the record is marked as validated against
        ``schema``, saved, inserted at the front of ``previous`` and its id is
        returned.  If validation fails the record is marked as failed, the file
        is handed to ``file_failed``, and the record is saved and inserted at
        the front of ``previous`` before a BackgroundException is raised.

        :param username: passed, with the file's name, to ``FileUpload.upload``
        :param f: the uploaded file; must provide ``filename`` and ``save(path)``
        :param schema: key into the ARTICLE_CROSSWALKS config selecting the validator
        :param previous: list of upload records; mutated in place
        :return: the id of the new upload record
        :raises BackgroundException: if the file cannot be stored, recorded or validated
        """
        # prep a record to go into the index, to record this upload
        upload_record = models.FileUpload()
        upload_record.upload(username, f.filename)
        upload_record.set_id()

        # the file path that we are going to write to
        target_path = os.path.join(app.config.get("UPLOAD_DIR", "."),
                                   upload_record.local_filename)

        def _discard_file():
            # best-effort: a problem disposing of the file must not hide the real error
            try:
                file_failed(target_path)
            except:
                pass

        def _register_failure(*failure_args):
            # mark the record as failed, dispose of the file, then persist the
            # record and put it first in the list
            upload_record.failed(*failure_args)
            _discard_file()
            upload_record.save()
            previous.insert(0, upload_record)

        # it's critical here that no errors cause files to get left behind unrecorded
        try:
            # write the incoming file out to disk, then save the index entry
            f.save(target_path)
            upload_record.save()
        except:
            # if we can't record either of these things, we need to back right off
            _discard_file()
            try:
                upload_record.delete()
            except:
                pass

            raise BackgroundException(
                "Failed to upload file - please contact an administrator")

        crosswalks = app.config.get("ARTICLE_CROSSWALKS", {})
        xwalk = plugin.load_class(crosswalks.get(schema))()

        # now we have the record in the index and on disk, we can attempt to
        # validate it
        try:
            with open(target_path) as handle:
                xwalk.validate_file(handle)
            upload_record.validated(schema)
            upload_record.save()
            previous.insert(0, upload_record)
            return upload_record.id

        except IngestException as e:
            _register_failure(e.message, e.inner_message)
            raise BackgroundException("Failed to upload file: " + e.message +
                                      "; " + str(e.inner_message))
        except Exception:
            _register_failure("File system error when reading file")
            raise BackgroundException(
                "Failed to upload file - please contact an administrator")
Beispiel #6
0
def _file_upload(f, schema, previous):
    """
    Store an uploaded file, record it in the index and check it against a schema.

    Every outcome flashes a message and renders the upload page.  Whenever a
    record was saved, it is shown at the front of the list of previous uploads.

    :param f: the uploaded file; must provide ``filename`` and ``save(path)``
    :param schema: schema hint passed on to ``article.check_schema``
    :param previous: list of earlier upload records; not mutated - a new list
        is built for the template when a record is added
    :return: the rendered 'publisher/uploadfile.html' template
    """

    def _remove_upload(path):
        # best-effort removal: failing to delete the file must never prevent
        # the outcome from being recorded and reported to the user
        try:
            os.remove(path)
        except Exception:
            pass

    def _respond(message, category, records):
        # every outcome ends the same way: flash a message and re-render the page
        flash(message, category)
        return render_template('publisher/uploadfile.html', previous=records)

    # prep a record to go into the index, to record this upload
    record = models.FileUpload()
    record.upload(current_user.id, f.filename)
    record.set_id()

    # the file path that we are going to write to
    xml = os.path.join(app.config.get("UPLOAD_DIR", "."), record.local_filename)

    # it's critical here that no errors cause files to get left behind unrecorded
    try:
        # write the incoming file out to the XML file
        f.save(xml)

        # save the index entry
        record.save()
    except Exception:
        # if we can't record either of these things, we need to back right off
        _remove_upload(xml)
        try:
            record.delete()
        except Exception:
            pass

        return _respond("Failed to upload file - please contact an administrator", "error", previous)

    # now we have the record in the index and on disk, we can attempt to
    # validate it
    try:
        with open(xml) as handle:
            actual_schema = article.check_schema(handle, schema)
    except Exception:
        # file is a dud, so remove it
        _remove_upload(xml)

        # if we're unable to validate the file, we should record this as
        # a file error.
        record.failed("Unable to parse file")
        record.save()
        return _respond(
            "Failed to parse file - it is invalid XML; please fix it before attempting to upload again.",
            "error",
            [record] + previous)

    if actual_schema:
        record.validated(actual_schema)
        record.save()
        # add the new record to the previous records
        return _respond(
            "File successfully uploaded - it will be processed shortly",
            "success",
            [record] + previous)

    record.failed("File could not be validated against a known schema")
    record.save()
    # guarded like every other removal here: the failure is already recorded,
    # so a problem deleting the file must not become an unhandled error
    _remove_upload(xml)
    return _respond(
        "File could not be validated against a known schema; please fix this before attempting to upload again",
        "error",
        [record] + previous)