Exemple #1
0
    def run(self, _dummy=None):
        """
        Import the uploaded file and record the upload on every target article set.

        The controller is run with this script as its argument. The articles it
        returns are passed to postprocess, after which a provenance line is
        appended to each article set in self.articlesets and the set is saved.
        If the script has a truthy `task` attribute, the number of uploaded
        articles is logged on it.

        @param _dummy: not used by this method
        @return: list with the id of every article set in self.articlesets
        @raise Exception: if the controller did not return any articles
        """
        upload = self.options['file']
        # If no file was given, the (falsy) option value itself is used as name
        upload_name = upload and upload.name
        template = (
            u"Importing {self.__class__.__name__} from {filename} into {self.project}")
        log.info(template.format(self=self, filename=upload_name))

        # NOTE(review): imported locally rather than at module level,
        # presumably to avoid a circular import -- confirm before moving it
        from amcat.scripts.article_upload.controller import Controller
        self.controller = Controller()
        articles = self.controller.run(self)
        if not articles:
            raise Exception("No articles were imported")

        self.postprocess(articles)

        for target in self.articlesets:
            entry = self.get_provenance(upload, articles)
            # strip() removes the leading newline left by an empty old provenance
            combined = "%s\n%s" % (target.provenance or "", entry)
            target.provenance = combined.strip()
            target.save()

        task = getattr(self, 'task', None)
        if task:
            task.log_usage("articles", "upload", n=len(articles))

        return [target.id for target in self.articlesets]
Exemple #2
0
    def run(self, _dummy=None):
        """
        Import the uploaded file and prepend a provenance line to the article set.

        The controller is run with this script as its argument. The articles it
        returns are passed to postprocess, after which the new provenance line
        is placed in front of the existing provenance of self.articleset and
        the article set is saved.

        @param _dummy: not used by this method
        @return: the article set that was updated
        @raise Exception: if the controller did not return any articles
        """
        file = self.options['file']
        # If no file was given, the (falsy) option value itself is used as name
        filename = file and file.name
        log.info(
            u"Importing {self.__class__.__name__} from {filename} into {self.project}"
            .format(self=self, filename=filename))
        # NOTE(review): imported locally rather than at module level,
        # presumably to avoid a circular import -- confirm before moving it
        from amcat.scripts.article_upload.controller import Controller
        self.controller = Controller()
        arts = self.controller.run(self)

        if not arts:
            raise Exception("No articles were imported")
        self.postprocess(arts)

        # Resolve the article set once, so that the object that is modified is
        # also the one that is saved and returned.
        aset = self.articleset
        provenance = [self.get_provenance(file, arts)]
        # Treat an empty provenance like a missing one: joining with an empty
        # string would leave a dangling newline after the new entry.
        if aset.provenance:
            provenance.append(aset.provenance)
        aset.provenance = "\n".join(provenance)
        aset.save()

        return aset
Exemple #3
0
    def run(self, _dummy=None):
        """
        Import the uploaded file and prepend a provenance line to the article set.

        The controller is run with this script as its argument. The articles it
        returns are passed to postprocess, after which the new provenance line
        is placed in front of the existing provenance of self.articleset and
        the article set is saved.

        @param _dummy: not used by this method
        @return: the article set that was updated
        @raise Exception: if the controller did not return any articles
        """
        file = self.options['file']
        # If no file was given, the (falsy) option value itself is used as name
        filename = file and file.name
        log.info(u"Importing {self.__class__.__name__} from {filename} into {self.project}"
                 .format(self=self, filename=filename))
        # NOTE(review): imported locally rather than at module level,
        # presumably to avoid a circular import -- confirm before moving it
        from amcat.scripts.article_upload.controller import Controller
        self.controller = Controller()
        arts = self.controller.run(self)

        if not arts:
            raise Exception("No articles were imported")
        self.postprocess(arts)

        # Resolve the article set once, so that the object that is modified is
        # also the one that is saved and returned.
        aset = self.articleset
        provenance = [self.get_provenance(file, arts)]
        # Treat an empty provenance like a missing one: joining with an empty
        # string would leave a dangling newline after the new entry.
        if aset.provenance:
            provenance.append(aset.provenance)
        aset.provenance = "\n".join(provenance)
        aset.save()

        return aset
Exemple #4
0
    def run(self, _dummy=None):
        """
        Import the uploaded file and record the upload on every target article set.

        The controller is run with this script as its argument. The articles it
        returns are passed to postprocess, after which a provenance line is
        appended to each article set in self.articlesets and the set is saved.

        @param _dummy: not used by this method
        @return: list with the id of every article set in self.articlesets
        @raise Exception: if the controller did not return any articles
        """
        upload = self.options['file']
        # If no file was given, the (falsy) option value itself is used as name
        upload_name = upload and upload.name
        template = u"Importing {self.__class__.__name__} from {filename} into {self.project}"
        log.info(template.format(self=self, filename=upload_name))

        # NOTE(review): imported locally rather than at module level,
        # presumably to avoid a circular import -- confirm before moving it
        from amcat.scripts.article_upload.controller import Controller
        self.controller = Controller()
        articles = self.controller.run(self)
        if not articles:
            raise Exception("No articles were imported")

        self.postprocess(articles)

        for target in self.articlesets:
            entry = self.get_provenance(upload, articles)
            # strip() removes the leading newline left by an empty old provenance
            combined = "%s\n%s" % (target.provenance or "", entry)
            target.provenance = combined.strip()
            target.save()

        return [target.id for target in self.articlesets]
Exemple #5
0
class UploadScript(script.Script):
    """Base class for Upload Scripts, which are scraper scripts driven by the
    script input.

    For legacy reasons, parse_document and split_file may be used instead of the standard
    get_units and scrape_unit.
    """

    input_type = None
    output_type = ArticleIterator
    options_form = UploadForm

    def __init__(self, *args, **kargs):
        """
        Initialise the script, remember the target project and decode the options.

        All arguments are passed on to the parent constructor. Byte string
        option values are decoded as UTF-8.
        """
        super(UploadScript, self).__init__(*args, **kargs)
        self.project = self.options['project']
        for k, v in self.options.items():
            if isinstance(v, str):
                self.options[k] = v.decode('utf-8')

        # avoid django problem/bug with repr(File(open(unicode-string)))
        # https://code.djangoproject.com/ticket/8156
        o2 = {k: v for k, v in self.options.iteritems() if k != 'file'}
        # Note: reading self.articlesets here creates the article set if only
        # a name was given in the options.
        log.debug(u"Articleset: {self.articlesets!r}, options: {o2}"
                  .format(self=self, o2=o2))

    @property
    def articleset(self):
        """The first of the target article sets (see articlesets)"""
        return self.articlesets[0]

    @property
    def articlesets(self):
        """
        The article sets to upload into.

        Uses the 'articlesets' option if it is set. Otherwise, if an
        'articleset_name' option is given, a new article set with that name is
        created in the project and stored in the options, so it is created only
        once. If neither option is set, an empty tuple is returned.
        """
        if self.options['articlesets']:
            return self.options['articlesets']

        if self.options['articleset_name']:
            aset = create_new_articleset(self.options['articleset_name'], self.project)
            self.options['articlesets'] = (aset,)
            return (aset,)

        return ()

    def get_errors(self):
        """
        Yield an explanation (see explain_error) for each error on the controller.

        If the errors cannot be retrieved from the controller (e.g. because run
        has not set self.controller yet), this is logged and nothing is yielded.
        """
        try:
            errors = self.controller.errors
        except AttributeError:
            log.exception("Cannot get controller errors")
            return

        for error in errors:
            yield self.explain_error(error)

    def explain_error(self, error):
        """Explain the error in the context of unit for the end user"""
        return "Error in element {error.i} : {error.error!r}".format(error=error)

    def decode(self, bytes):
        """Decode the bytes using the encoding from the form"""
        # the form returns an (encoding, text) pair; only the text is needed
        _enc, text = self.bound_form.decode(bytes)
        return text

    @property
    def uploaded_texts(self):
        """A cached sequence of UploadedFile objects"""
        try:
            return self._input_texts
        except AttributeError:
            # first access: ask the form and keep the result for later calls
            self._input_texts = self.bound_form.get_uploaded_texts()
            return self._input_texts

    def get_provenance(self, file, articles):
        """
        Create the provenance line describing this upload.

        @param file: the uploaded file, its name is included in the line
        @param articles: the uploaded articles, only their number is used
        @return: a string with timestamp, article count, file name and script class
        """
        n = len(articles)
        filename = file and file.name
        # keep only 'YYYY-MM-DD HH:MM' of the timestamp
        timestamp = unicode(datetime.datetime.now())[:16]
        return ("[{timestamp}] Uploaded {n} articles from file {filename!r} "
                "using {self.__class__.__name__}"
                .format(timestamp=timestamp, n=n, filename=filename, self=self))

    def run(self, _dummy=None):
        """
        Import the uploaded file and record the upload on every target article set.

        The controller is run with this script as its argument. The articles it
        returns are passed to postprocess, after which a provenance line is
        appended to each article set in self.articlesets and the set is saved.

        @param _dummy: not used by this method
        @return: list with the id of every article set in self.articlesets
        @raise Exception: if the controller did not return any articles
        """
        file = self.options['file']
        filename = file and file.name
        log.info(u"Importing {self.__class__.__name__} from {filename} into {self.project}"
                 .format(self=self, filename=filename))
        # NOTE(review): imported locally rather than at module level,
        # presumably to avoid a circular import -- confirm before moving it
        from amcat.scripts.article_upload.controller import Controller
        self.controller = Controller()
        arts = self.controller.run(self)

        if not arts:
            raise Exception("No articles were imported")

        self.postprocess(arts)

        for aset in self.articlesets:
            new_provenance = self.get_provenance(file, arts)
            # strip() removes the leading newline left by an empty old provenance
            aset.provenance = ("%s\n%s" % (aset.provenance or "", new_provenance)).strip()
            aset.save()

        return [aset.id for aset in self.articlesets]

    def postprocess(self, articles):
        """
        Optional postprocessing of articles. Removing articles from the list will exclude them from the
        article set (if needed, list should be changed in place)
        """
        pass

    def _get_units(self):
        """
        Upload form assumes that the form (!) has a get_entries method, which you get
        if you subclass your form from one of the fileupload forms. If not, please override
        this method.
        """
        for entry in self.bound_form.get_entries():
            for u in self.split_file(entry):
                yield u

    def _scrape_unit(self, document):
        """
        Yield the article(s) that parse_document creates for the given document.

        @param document: a fragment as returned by split_file
        """
        result = self.parse_document(document)
        if result is None:
            # parse_document is allowed to return None: nothing to yield
            return
        if isinstance(result, Article):
            result = [result]
        for art in result:
            yield art

    def parse_document(self, document):
        """
        Parse the document as one or more articles, provided for legacy purposes

        @param document: object received from split_file, e.g. a string fragment
        @return: None, an Article or a sequence of Article(s)
        """
        raise NotImplementedError()

    def split_file(self, file):
        """
        Split the file into one or more fragments representing individual documents.
        Default implementation returns a single fragment, which is the file itself.

        @type file: file like object
        @return: a sequence of objects (e.g. strings) to pass to parse_document
        """
        return [file]
Exemple #6
0
class UploadScript(script.Script):
    """Base class for Upload Scripts, which are scraper scripts driven by the
    script input.

    For legacy reasons, parse_document and split_file may be used instead of the standard
    get_units and scrape_unit.
    """

    input_type = None
    output_type = ArticleIterator
    options_form = UploadForm

    def __init__(self, *args, **kargs):
        """
        Initialise the script, remember the target project and decode the options.

        All arguments are passed on to the parent constructor. Byte string
        option values are decoded as UTF-8.
        """
        super(UploadScript, self).__init__(*args, **kargs)
        self.project = self.options['project']
        for k, v in self.options.items():
            if isinstance(v, str):
                self.options[k] = v.decode('utf-8')

        # avoid django problem/bug with repr(File(open(unicode-string)))
        # https://code.djangoproject.com/ticket/8156
        o2 = {k: v for k, v in self.options.iteritems() if k != 'file'}
        # Note: reading self.articleset here creates the article set if only
        # a name was given in the options.
        log.debug(u"Articleset: {self.articleset!r}, options: {o2}".format(
            self=self, o2=o2))

    @property
    def articleset(self):
        """
        The article set to upload into.

        Uses the 'articleset' option if it is set. Otherwise, if an
        'articleset_name' option is given, a new article set with that name is
        created in the project and stored in the options, so it is created only
        once. If neither option is set, None is returned.
        """
        if self.options['articleset']:
            return self.options['articleset']
        if self.options['articleset_name']:
            aset = create_new_articleset(self.options['articleset_name'],
                                         self.project)
            self.options['articleset'] = aset
            return aset
        return None

    def get_errors(self):
        """
        Yield an explanation (see explain_error) for each error on the controller.

        If the errors cannot be retrieved from the controller (e.g. because run
        has not set self.controller yet), this is logged and nothing is yielded.
        """
        try:
            errors = self.controller.errors
        except AttributeError:
            log.exception("Cannot get controller errors")
            return

        for error in errors:
            yield self.explain_error(error)

    def explain_error(self, error):
        """Explain the error in the context of unit for the end user"""
        return "Error in element {error.i} : {error.error!r}".format(
            error=error)

    def decode(self, bytes):
        """Decode the bytes using the encoding from the form"""
        # the form returns an (encoding, text) pair; only the text is needed
        _enc, text = self.bound_form.decode(bytes)
        return text

    @property
    def uploaded_texts(self):
        """A cached sequence of UploadedFile objects"""
        try:
            return self._input_texts
        except AttributeError:
            # first access: ask the form and keep the result for later calls
            self._input_texts = self.bound_form.get_uploaded_texts()
            return self._input_texts

    def get_provenance(self, file, articles):
        """
        Create the provenance line describing this upload.

        @param file: the uploaded file, its name is included in the line
        @param articles: the uploaded articles, only their number is used
        @return: a string with timestamp, article count, file name and script class
        """
        n = len(articles)
        filename = file and file.name
        # keep only 'YYYY-MM-DD HH:MM' of the timestamp
        timestamp = unicode(datetime.datetime.now())[:16]
        return ("[{timestamp}] Uploaded {n} articles from file {filename!r} "
                "using {self.__class__.__name__}"
                .format(timestamp=timestamp, n=n, filename=filename, self=self))

    def run(self, _dummy=None):
        """
        Import the uploaded file and prepend a provenance line to the article set.

        The controller is run with this script as its argument. The articles it
        returns are passed to postprocess, after which the new provenance line
        is placed in front of the existing provenance of self.articleset and
        the article set is saved.

        @param _dummy: not used by this method
        @return: the article set that was updated
        @raise Exception: if the controller did not return any articles
        """
        file = self.options['file']
        filename = file and file.name
        log.info(
            u"Importing {self.__class__.__name__} from {filename} into {self.project}"
            .format(self=self, filename=filename))
        # NOTE(review): imported locally rather than at module level,
        # presumably to avoid a circular import -- confirm before moving it
        from amcat.scripts.article_upload.controller import Controller
        self.controller = Controller()
        arts = self.controller.run(self)

        if not arts:
            raise Exception("No articles were imported")
        self.postprocess(arts)

        # Resolve the article set once instead of going through the property
        # for every access.
        aset = self.articleset
        provenance = [self.get_provenance(file, arts)]
        # Treat an empty provenance like a missing one: joining with an empty
        # string would leave a dangling newline after the new entry.
        if aset.provenance:
            provenance.append(aset.provenance)
        aset.provenance = "\n".join(provenance)
        aset.save()

        return aset

    def postprocess(self, articles):
        """
        Optional postprocessing of articles. Removing articles from the list will exclude them from the
        article set (if needed, list should be changed in place)
        """
        pass

    def _get_units(self):
        """
        Upload form assumes that the form (!) has a get_entries method, which you get
        if you subclass your form from one of the fileupload forms. If not, please override
        this method.
        """
        for entry in self.bound_form.get_entries():
            for u in self.split_file(entry):
                yield u

    def _scrape_unit(self, document):
        """
        Yield the article(s) that parse_document creates for the given document.

        @param document: a fragment as returned by split_file
        """
        result = self.parse_document(document)
        if result is None:
            # parse_document is allowed to return None: nothing to yield
            return
        if isinstance(result, Article):
            result = [result]
        for art in result:
            yield art

    def parse_document(self, document):
        """
        Parse the document as one or more articles, provided for legacy purposes

        @param document: object received from split_file, e.g. a string fragment
        @return: None, an Article or a sequence of Article(s)
        """
        raise NotImplementedError()

    def split_file(self, file):
        """
        Split the file into one or more fragments representing individual documents.
        Default implementation returns a single fragment, which is the file itself.

        @param file: the entry to split, as provided by the form's get_entries
        @return: a sequence of objects (e.g. strings) to pass to parse_document
        """
        return [file]