def run(self, _dummy=None):
    """Import the uploaded file and record provenance on every article set.

    Runs the upload controller on this script, post-processes the imported
    articles, appends a provenance entry to each set in ``self.articlesets``
    and saves it. If the script has a truthy ``task`` attribute, the upload
    is also reported through ``task.log_usage``.

    @param _dummy: unused
    @return: list with the id of every set in ``self.articlesets``
    @raise Exception: if the controller returned no articles
    """
    upload = self.options['file']
    filename = upload and upload.name
    message = u"Importing {self.__class__.__name__} from {filename} into {self.project}"
    log.info(message.format(self=self, filename=filename))

    # Function-level import, kept from the original code
    # (presumably to avoid a circular import -- TODO confirm).
    from amcat.scripts.article_upload.controller import Controller
    self.controller = Controller()
    arts = self.controller.run(self)
    if not arts:
        raise Exception("No articles were imported")
    self.postprocess(arts)

    for aset in self.articlesets:
        entry = self.get_provenance(upload, arts)
        history = aset.provenance or ""
        # strip() removes the leading newline when there was no history yet
        combined = "%s\n%s" % (history, entry)
        aset.provenance = combined.strip()
        aset.save()

    task = getattr(self, 'task', None)
    if task:
        task.log_usage("articles", "upload", n=len(arts))

    return [aset.id for aset in self.articlesets]
def run(self, _dummy=None):
    """Import the uploaded file into ``self.articleset`` and record provenance.

    Runs the upload controller on this script, post-processes the imported
    articles, puts a new provenance entry in front of the set's existing
    provenance (if any) and saves the set.

    @param _dummy: unused
    @return: ``self.articleset``
    @raise Exception: if the controller returned no articles
    """
    upload = self.options['file']
    filename = upload and upload.name
    message = u"Importing {self.__class__.__name__} from {filename} into {self.project}"
    log.info(message.format(self=self, filename=filename))

    # Function-level import, kept from the original code
    # (presumably to avoid a circular import -- TODO confirm).
    from amcat.scripts.article_upload.controller import Controller
    self.controller = Controller()
    arts = self.controller.run(self)
    if not arts:
        raise Exception("No articles were imported")
    self.postprocess(arts)

    # Read the existing provenance before building the new entry, so the
    # order of attribute access matches the original implementation.
    previous = self.articleset.provenance
    entry = self.get_provenance(upload, arts)
    if previous is None:
        lines = [entry]
    else:
        lines = [entry, previous]  # newest entry first
    self.articleset.provenance = "\n".join(lines)
    self.articleset.save()
    return self.articleset
def run(self, _dummy=None):
    """Run the upload and prepend a provenance entry to ``self.articleset``.

    The upload controller is run on this script; the resulting articles are
    passed to ``postprocess``. The new provenance entry is placed before any
    existing provenance text, joined by a newline, and the set is saved.

    @param _dummy: unused
    @return: ``self.articleset``
    @raise Exception: if the controller returned no articles
    """
    source = self.options['file']
    filename = source and source.name
    log.info(
        u"Importing {self.__class__.__name__} from {filename} into {self.project}"
        .format(self=self, filename=filename))

    # Function-level import, kept from the original code
    # (presumably to avoid a circular import -- TODO confirm).
    from amcat.scripts.article_upload.controller import Controller
    self.controller = Controller()
    arts = self.controller.run(self)
    if not arts:
        raise Exception("No articles were imported")
    self.postprocess(arts)

    # Existing provenance is read first, then the new entry is generated.
    existing = self.articleset.provenance
    parts = [self.get_provenance(source, arts)]
    if existing is not None:
        parts.append(existing)
    self.articleset.provenance = "\n".join(parts)
    self.articleset.save()
    return self.articleset
def run(self, _dummy=None):
    """Import the uploaded file and record provenance on every article set.

    Runs the upload controller on this script, post-processes the imported
    articles, then appends a provenance entry to each set in
    ``self.articlesets`` and saves it.

    @param _dummy: unused
    @return: list with the id of every set in ``self.articlesets``
    @raise Exception: if the controller returned no articles
    """
    upload = self.options['file']
    filename = upload and upload.name
    message = u"Importing {self.__class__.__name__} from {filename} into {self.project}"
    log.info(message.format(self=self, filename=filename))

    # Function-level import, kept from the original code
    # (presumably to avoid a circular import -- TODO confirm).
    from amcat.scripts.article_upload.controller import Controller
    self.controller = Controller()
    arts = self.controller.run(self)
    if not arts:
        raise Exception("No articles were imported")
    self.postprocess(arts)

    for aset in self.articlesets:
        entry = self.get_provenance(upload, arts)
        history = aset.provenance or ""
        # strip() removes the leading newline when there was no history yet
        combined = "%s\n%s" % (history, entry)
        aset.provenance = combined.strip()
        aset.save()

    return [aset.id for aset in self.articlesets]
class UploadScript(script.Script):
    """Base class for Upload Scripts, which are scraper scripts driven by
    the script input.

    For legacy reasons, parse_document and split_file may be used instead of
    the standard get_units and scrape_unit.
    """

    input_type = None
    output_type = ArticleIterator
    options_form = UploadForm

    def __init__(self, *args, **kargs):
        """Initialise the script and decode byte-string options to unicode."""
        super(UploadScript, self).__init__(*args, **kargs)
        self.project = self.options['project']
        for k, v in self.options.items():
            if isinstance(v, str):
                self.options[k] = v.decode('utf-8')
        # avoid django problem/bug with repr(File(open(unicode-string)))
        # https://code.djangoproject.com/ticket/8156
        o2 = {k: v for k, v in self.options.iteritems() if k != 'file'}
        # NOTE: formatting self.articlesets evaluates the property, which may
        # create a new article set (see articlesets below).
        log.debug(u"Articleset: {self.articlesets!r}, options: {o2}"
                  .format(self=self, o2=o2))

    @property
    def articleset(self):
        """The first of the target article sets (see articlesets)."""
        return self.articlesets[0]

    @property
    def articlesets(self):
        """The article sets to upload into.

        Returns the 'articlesets' option if set. Otherwise, if an
        'articleset_name' option is given, a new article set is created in
        the project, stored back into the options and returned as a
        one-element tuple. With neither option an empty tuple is returned.
        """
        if self.options['articlesets']:
            return self.options['articlesets']

        if self.options['articleset_name']:
            aset = create_new_articleset(self.options['articleset_name'],
                                         self.project)
            # Cache the new set so later accesses do not create another one
            self.options['articlesets'] = (aset,)
            return (aset,)

        return ()

    def get_errors(self):
        """Yield an explanation (see explain_error) for each controller error.

        If the controller or its errors cannot be accessed, the problem is
        logged and nothing is yielded.
        """
        try:
            errors = self.controller.errors
        except AttributeError:
            log.exception("Cannot get controller errors")
            return

        for error in errors:
            yield self.explain_error(error)

    def explain_error(self, error):
        """Explain the error in the context of unit for the end user"""
        return "Error in element {error.i} : {error.error!r}".format(
            error=error)

    def decode(self, bytes):
        """Decode the bytes using the encoding from the form"""
        enc, text = self.bound_form.decode(bytes)
        return text

    @property
    def uploaded_texts(self):
        """A cached sequence of UploadedFile objects"""
        try:
            return self._input_texts
        except AttributeError:
            self._input_texts = self.bound_form.get_uploaded_texts()
            return self._input_texts

    def get_provenance(self, file, articles):
        """Return a one-line provenance entry describing this upload.

        @param file: the uploaded file (may be falsy), its name is reported
        @param articles: the imported articles, only their count is used
        """
        n = len(articles)
        filename = file and file.name
        # First 16 characters of the timestamp: date plus hours and minutes
        timestamp = unicode(datetime.datetime.now())[:16]
        return ("[{timestamp}] Uploaded {n} articles from file {filename!r} "
                "using {self.__class__.__name__}".format(
                    timestamp=timestamp, n=n, filename=filename, self=self))

    def run(self, _dummy=None):
        """Import the uploaded file and record provenance on every article set.

        @param _dummy: unused
        @return: list with the id of every set in self.articlesets
        @raise Exception: if the controller returned no articles
        """
        file = self.options['file']
        filename = file and file.name
        log.info(u"Importing {self.__class__.__name__} from {filename} into {self.project}"
                 .format(self=self, filename=filename))
        from amcat.scripts.article_upload.controller import Controller
        self.controller = Controller()
        arts = self.controller.run(self)

        if not arts:
            raise Exception("No articles were imported")

        self.postprocess(arts)

        for aset in self.articlesets:
            new_provenance = self.get_provenance(file, arts)
            # strip() removes the leading newline when provenance was empty
            aset.provenance = ("%s\n%s" % (aset.provenance or "",
                                           new_provenance)).strip()
            aset.save()

        return [aset.id for aset in self.articlesets]

    def postprocess(self, articles):
        """
        Optional postprocessing of articles. Removing articles from the list
        will exclude them from the article set (if needed, list should be
        changed in place)
        """
        pass

    def _get_units(self):
        """
        Upload form assumes that the form (!) has a get_entries method, which
        you get if you subclass your form from one of the fileupload forms.
        If not, please override this method.
        """
        for entry in self.bound_form.get_entries():
            for u in self.split_file(entry):
                yield u

    def _scrape_unit(self, document):
        """Yield the article(s) that parse_document produces for document.

        A None result yields nothing, a single Article is yielded as-is and
        any other result is iterated over.
        """
        result = self.parse_document(document)
        if result is None:
            # parse_document is documented to allow None; iterating it would
            # raise TypeError, so treat it as "no articles".
            return
        if isinstance(result, Article):
            result = [result]
        for art in result:
            yield art

    def parse_document(self, document):
        """
        Parse the document as one or more articles, provided for legacy
        purposes

        @param document: object received from split_file, e.g. a string
                         fragment
        @return: None, an Article or a sequence of Article(s)
        """
        raise NotImplementedError()

    def split_file(self, file):
        """
        Split the file into one or more fragments representing individual
        documents. The default implementation returns a single-element list
        containing the file itself.

        @type file: file like object
        @return: a sequence of objects (e.g. strings) to pass to
                 parse_document
        """
        return [file]
class UploadScript(script.Script):
    """Base class for Upload Scripts, which are scraper scripts driven by
    the script input.

    For legacy reasons, parse_document and split_file may be used instead of
    the standard get_units and scrape_unit.
    """

    input_type = None
    output_type = ArticleIterator
    options_form = UploadForm

    def __init__(self, *args, **kargs):
        """Initialise the script and decode byte-string options to unicode."""
        super(UploadScript, self).__init__(*args, **kargs)
        self.project = self.options['project']
        for k, v in self.options.items():
            if isinstance(v, str):
                self.options[k] = v.decode('utf-8')
        # avoid django problem/bug with repr(File(open(unicode-string)))
        # https://code.djangoproject.com/ticket/8156
        o2 = {k: v for k, v in self.options.iteritems() if k != 'file'}
        # NOTE: formatting self.articleset evaluates the property, which may
        # create a new article set (see articleset below).
        log.debug(u"Articleset: {self.articleset!r}, options: {o2}".format(
            self=self, o2=o2))

    @property
    def articleset(self):
        """The article set to upload into.

        Returns the 'articleset' option if set. Otherwise, if an
        'articleset_name' option is given, a new article set is created in
        the project, stored back into the options and returned. With neither
        option None is returned.
        """
        if self.options['articleset']:
            return self.options['articleset']

        if self.options['articleset_name']:
            aset = create_new_articleset(self.options['articleset_name'],
                                         self.project)
            # Cache the new set so later accesses do not create another one
            self.options['articleset'] = aset
            return aset

        return None

    def get_errors(self):
        """Yield an explanation (see explain_error) for each controller error.

        If the controller or its errors cannot be accessed, the problem is
        logged and nothing is yielded.
        """
        try:
            errors = self.controller.errors
        except AttributeError:
            log.exception("Cannot get controller errors")
            return

        for error in errors:
            yield self.explain_error(error)

    def explain_error(self, error):
        """Explain the error in the context of unit for the end user"""
        return "Error in element {error.i} : {error.error!r}".format(
            error=error)

    def decode(self, bytes):
        """Decode the bytes using the encoding from the form"""
        enc, text = self.bound_form.decode(bytes)
        return text

    @property
    def uploaded_texts(self):
        """A cached sequence of UploadedFile objects"""
        try:
            return self._input_texts
        except AttributeError:
            self._input_texts = self.bound_form.get_uploaded_texts()
            return self._input_texts

    def get_provenance(self, file, articles):
        """Return a one-line provenance entry describing this upload.

        @param file: the uploaded file (may be falsy), its name is reported
        @param articles: the imported articles, only their count is used
        """
        n = len(articles)
        filename = file and file.name
        # First 16 characters of the timestamp: date plus hours and minutes
        timestamp = unicode(datetime.datetime.now())[:16]
        return ("[{timestamp}] Uploaded {n} articles from file {filename!r} "
                "using {self.__class__.__name__}".format(
                    timestamp=timestamp, n=n, filename=filename, self=self))

    def run(self, _dummy=None):
        """Import the uploaded file into self.articleset and record provenance.

        @param _dummy: unused
        @return: self.articleset
        @raise Exception: if the controller returned no articles
        """
        file = self.options['file']
        filename = file and file.name
        log.info(
            u"Importing {self.__class__.__name__} from {filename} into {self.project}"
            .format(self=self, filename=filename))
        from amcat.scripts.article_upload.controller import Controller
        self.controller = Controller()
        arts = self.controller.run(self)

        if not arts:
            raise Exception("No articles were imported")

        self.postprocess(arts)

        old_provenance = [] if self.articleset.provenance is None else [
            self.articleset.provenance
        ]
        new_provenance = self.get_provenance(file, arts)
        # Newest entry first, followed by the existing provenance (if any)
        self.articleset.provenance = "\n".join([new_provenance] +
                                               old_provenance)
        self.articleset.save()

        return self.articleset

    def postprocess(self, articles):
        """
        Optional postprocessing of articles. Removing articles from the list
        will exclude them from the article set (if needed, list should be
        changed in place)
        """
        pass

    def _get_units(self):
        """
        Upload form assumes that the form (!) has a get_entries method, which
        you get if you subclass your form from one of the fileupload forms.
        If not, please override this method.
        """
        for entry in self.bound_form.get_entries():
            for u in self.split_file(entry):
                yield u

    def _scrape_unit(self, document):
        """Yield the article(s) that parse_document produces for document.

        A None result yields nothing, a single Article is yielded as-is and
        any other result is iterated over.
        """
        result = self.parse_document(document)
        if result is None:
            # parse_document is documented to allow None; iterating it would
            # raise TypeError, so treat it as "no articles".
            return
        if isinstance(result, Article):
            result = [result]
        for art in result:
            yield art

    def parse_document(self, document):
        """
        Parse the document as one or more articles, provided for legacy
        purposes

        @param document: object received from split_file, e.g. a string
                         fragment
        @return: None, an Article or a sequence of Article(s)
        """
        raise NotImplementedError()

    def split_file(self, file):
        """
        Split the file into one or more fragments representing individual
        documents. The default implementation returns a single-element list
        containing the file itself.

        @type file: file like object
        @return: a sequence of objects (e.g. strings) to pass to
                 parse_document
        """
        return [file]