class ArchiveSearch(ArchiveTask):
    """
    Run a search against archive.org through the `ia` command line tool.

    The tool requires an archive.org account:

        https://archive.org/account/login.createaccount.php

    The command `ia configure` will set you up, e.g.:

        $ cat ~/.config/ia.ini
        [s3]
        access = asudiasd77xsdlds
        secret = oasdu888s8x9a0sd

        [cookies]
        logged-in-user = user@example.com
        logged-in-sig = secret

    Refs #8000.
    """
    date = ClosestDateParameter(default=datetime.date.today())
    query = luigi.Parameter(default='collection:prelinger')

    def requires(self):
        """
        Depend on the `ia` executable; the setup of siskin should install it
        automatically.
        """
        return Executable(name='ia',
                          message='https://pypi.org/project/internetarchive/')

    def run(self):
        """
        Run `ia search` with the configured query and move the file returned
        by shellout to the output path.
        """
        template = "ia search '{query}' > {output}"
        result = shellout(template, query=self.query)
        luigi.LocalTarget(result).move(self.output().path)

    def output(self):
        """
        Local target; the path is built with ext='ldj' and digest=True.
        """
        destination = self.path(ext='ldj', digest=True)
        return luigi.LocalTarget(path=destination)
class DegruyterDOIList(DegruyterTask):
    """
    A list of Degruyter DOIs, one per line, sorted and deduplicated.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the intermediate schema file and the `jq` executable.
        """
        return {
            'input': DegruyterIntermediateSchema(date=self.date),
            'jq': Executable(name='jq', message='https://github.com/stedolan/jq')
        }

    @timed
    def run(self):
        """
        Extract the `.doi` field of every record, drop "null" lines, keep the
        part starting at "10", then sort uniquely into the output file.
        """
        # mkstemp returns an open descriptor; close it right away, we only
        # need the path (the shell redirects into it).
        fd, stopover = tempfile.mkstemp(prefix='siskin-')
        os.close(fd)
        try:
            shellout(
                """jq -r '.doi' <(unpigz -c {input}) | grep -v "null" | grep -o "10.*" 2> /dev/null > {output} """,
                input=self.input().get('input').path,
                output=stopover)
            output = shellout("""sort -u {input} > {output} """, input=stopover)
        finally:
            # The unsorted intermediate is not needed after sorting (or on failure).
            os.remove(stopover)
        luigi.LocalTarget(output).move(self.output().path)

    def output(self):
        """
        Local TSV target at the default task path.
        """
        return luigi.LocalTarget(path=self.path(), format=TSV)
class NLFetch(NLTask):
    """
    Stream from SOLR.
    """
    date = ClosestDateParameter(default=datetime.date.today())
    query = luigi.Parameter(
        default="collection_details:GBV_NL_EBOOK",
        description="to test: id:NLEB006936695 OR id:NLEB006936733")

    def run(self):
        """
        Dump the `fullrecord` field via solrdump, restore the MARC separator
        bytes (0x1D, 0x1E, 0x1F) from their '#29;', '#30;', '#31;'
        placeholders and finally join records that were split after a 0x1D.

        cf. https://github.com/stedolan/jq/issues/787, "Warning: replace is
        deprecated and will be removed in a future version." There's jq -j,
        but replace cannot run on a single 1GB line.
        """
        # The pipeline is assembled from adjacent literals so that the command
        # stays on one logical line, without shell line continuations.
        dumped = shellout(
            'solrdump -verbose -server {server} -q "{query}" -fl fullrecord | '
            "jq -r '.fullrecord' | "
            """replace '#29;' $(printf "\\x1D") '#30;' $(printf "\\x1E") '#31;' $(printf "\\x1F") > {output} """,
            server=self.config.get('nl', 'solr'),
            query=self.query)
        try:
            # Raw string: sed has to see the escapes \x1d and \x0a itself. In
            # a regular literal Python would turn \x0a into a real newline
            # inside the s/// pattern, which sed does not accept there.
            # NOTE(review): relies on sed supporting \xHH escapes (GNU sed).
            output = shellout(
                r"sed ':a;N;$!ba;s/\x1d\x0a/\x1d/g' {input} > {output}",
                input=dumped)
        finally:
            # Drop the intermediate dump, whether or not sed succeeded.
            os.remove(dumped)
        luigi.LocalTarget(output).move(self.output().path)

    def output(self):
        """
        Local target; the path is built with ext="mrc" and digest=True.
        """
        return luigi.LocalTarget(path=self.path(ext="mrc", digest=True))
class CrossrefDOIList(CrossrefTask):
    """
    A list of Crossref DOIs, one per line, sorted and deduplicated.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the intermediate schema file and the `jq` executable.
        """
        return {
            'input': CrossrefIntermediateSchema(date=self.date),
            'jq': Executable(name='jq', message='https://github.com/stedolan/jq')
        }

    @timed
    def run(self):
        """
        Decompress the input, extract DOIs with jq and grep, sort, then sort
        uniquely into the output file.
        """
        # mkstemp hands back an open descriptor; only the path is needed.
        fd, stopover = tempfile.mkstemp(prefix='siskin-')
        os.close(fd)
        try:
            # process substitution sometimes results in a broken pipe, so extract beforehand
            extracted = shellout("unpigz -c {input} > {output}",
                                 input=self.input().get('input').path)
            try:
                shellout(
                    """jq -r '.doi?' {input} | grep -o "10.*" 2> /dev/null | LC_ALL=C sort -S50% > {output} """,
                    input=extracted,
                    output=stopover)
            finally:
                # Free the decompressed copy before the final sort, also on failure.
                os.remove(extracted)
            output = shellout("""sort -S50% -u {input} > {output} """, input=stopover)
        finally:
            os.remove(stopover)
        luigi.LocalTarget(output).move(self.output().path)

    def output(self):
        """
        Local TSV target at the default task path.
        """
        return luigi.LocalTarget(path=self.path(), format=TSV)
class DegruyterXML(DegruyterTask):
    """
    Single file version: concatenates the XML members of all matching zip
    files into one file.
    """
    date = ClosestDateParameter(default=datetime.date.today())
    group = luigi.Parameter(default='SSH', description='main subdirectory')
    ts = luigi.Parameter(default=DegruyterTask.TIMESTAMP)

    def requires(self):
        """
        Needs the list of file paths.
        """
        return DegruyterPaths(date=self.date)

    @timed
    def run(self):
        """
        For every listed path that contains '/<group>/' and '-<ts>.zip',
        append the output of `unzip -p <path> *.xml` to a temporary file,
        then move that file to the output path.
        """
        # Close the descriptor returned by mkstemp; the shell appends by path.
        fd, stopover = tempfile.mkstemp(prefix='siskin-')
        os.close(fd)

        # Loop-invariant substrings a path must contain to be selected.
        group_marker = '/%s/' % self.group
        zip_marker = '-%s.zip' % self.ts

        with self.input().open() as handle:
            for row in handle.iter_tsv(cols=('path', )):
                if group_marker not in row.path:
                    continue
                if zip_marker not in row.path:
                    continue
                # Exit codes 1 and 9 of unzip are tolerated via ignoremap.
                shellout(r"unzip -p {path} \*.xml 2> /dev/null >> {output}",
                         output=stopover,
                         path=row.path,
                         ignoremap={
                             1: 'OK',
                             9: 'skip corrupt file'
                         })
        luigi.LocalTarget(stopover).move(self.output().path)

    def output(self):
        """
        Local target with the `xml` extension.
        """
        return luigi.LocalTarget(path=self.path(ext='xml'))
class IJOCFincSolr(IJOCTask):
    """
    Export to finc solr schema by using span-export, after tagging the
    records with span-tag. Tag with ISIL for FID and change record type.
    """
    format = luigi.Parameter(default='solr5vu3', description='export format')
    isil = luigi.Parameter(default='DE-15-FID', description='isil FID')
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the filter configuration and the intermediate schema file.
        """
        dependencies = {}
        dependencies['config'] = AMSLFilterConfig(date=self.date)
        dependencies['file'] = IJOCIntermediateSchema(date=self.date)
        return dependencies

    def run(self):
        """
        Pipe the decompressed input through span-tag and span-export (with
        full record), then move the result to the output path.
        """
        inputs = self.input()
        template = """span-tag -c {config} <(unpigz -c {input}) | span-export -o {format} -with-fullrecord > {output}"""
        exported = shellout(template,
                            config=inputs.get('config').path,
                            input=inputs.get('file').path,
                            format=self.format)
        luigi.LocalTarget(exported).move(self.output().path)

    def output(self):
        """
        Local target with the `fincsolr.ndj` extension.
        """
        destination = self.path(ext='fincsolr.ndj')
        return luigi.LocalTarget(path=destination)
class CrossrefExport(CrossrefTask):
    """
    Tag with ISILs, then export to various formats.
    """
    date = ClosestDateParameter(default=datetime.date.today())
    format = luigi.Parameter(default='solr5vu3', description='export format')

    def requires(self):
        """
        Needs the intermediate schema file and the filter configuration.
        """
        return {
            'file': CrossrefIntermediateSchema(date=self.date),
            'config': AMSLFilterConfig(date=self.date),
        }

    def run(self):
        """
        Two steps: span-tag writes a compressed tagged file, span-export
        converts that file into the requested format (compressed as well).
        """
        tagged = shellout(
            "span-tag -c {config} <(unpigz -c {input}) | pigz -c > {output}",
            config=self.input().get('config').path,
            input=self.input().get('file').path)
        try:
            output = shellout(
                "span-export -o {format} <(unpigz -c {input}) | pigz -c > {output}",
                format=self.format,
                input=tagged)
        finally:
            # The tagged intermediate is only an input to the export step;
            # remove it instead of leaving it behind in the temp directory.
            os.remove(tagged)
        luigi.LocalTarget(output).move(self.output().path)

    def output(self):
        """
        Local target; the extension depends on the export format and falls
        back to `gz` for formats not listed here.
        """
        extensions = {
            'solr5vu3': 'ldj.gz',
            'formeta': 'form.gz',
        }
        return luigi.LocalTarget(path=self.path(
            ext=extensions.get(self.format, 'gz')))
class DOAJIntermediateSchema(DOAJTask):
    """
    Respect whitelist: keep only lines of the "dirty" intermediate schema
    that match an entry of the whitelist file.
    """
    date = ClosestDateParameter(default=datetime.date.today())
    format = luigi.Parameter(
        default="doaj-oai",
        description="kind of source document, doaj-oai (defunkt: doaj, doaj-api)")

    def requires(self):
        """
        Needs the unfiltered data and the whitelist.
        """
        dirty = DOAJIntermediateSchemaDirty(date=self.date, format=self.format)
        whitelist = DOAJWhitelist(date=self.date)
        return {'data': dirty, 'whitelist': whitelist}

    @timed
    def run(self):
        """
        Decompress, filter with a fixed-string grep against the whitelist
        file, recompress and move the result to the output path.
        """
        inputs = self.input()
        template = """unpigz -c {input} | LC_ALL=C grep -Ff {whitelist} | pigz -c > {output}"""
        filtered = shellout(template,
                            whitelist=inputs.get('whitelist').path,
                            input=inputs.get('data').path)
        luigi.LocalTarget(filtered).move(self.output().path)

    def output(self):
        """
        Local target with the `ldj.gz` extension.
        """
        destination = self.path(ext='ldj.gz')
        return luigi.LocalTarget(path=destination)
class DOAJISSNList(DOAJTask):
    """
    A list of DOAJ ISSNs (from `rft.issn` and `rft.eissn`), sorted and
    deduplicated.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the intermediate schema file and the `jq` executable.
        """
        return {
            'input': DOAJIntermediateSchema(date=self.date),
            'jq': Executable(name='jq', message='http://git.io/NYpfTw')
        }

    @timed
    def run(self):
        """
        Append all `rft.issn` values, then all `rft.eissn` values to a
        temporary file and sort that file uniquely into the output.
        """
        # mkstemp returns an open descriptor; close it, only the path is used.
        fd, stopover = tempfile.mkstemp(prefix='siskin-')
        os.close(fd)
        try:
            shellout(
                """jq -r '.["rft.issn"][]?' <(unpigz -c {input}) >> {output} """,
                input=self.input().get('input').path,
                output=stopover)
            shellout(
                """jq -r '.["rft.eissn"][]?' <(unpigz -c {input}) >> {output} """,
                input=self.input().get('input').path,
                output=stopover)
            output = shellout("""sort -u {input} > {output} """, input=stopover)
        finally:
            # The unsorted intermediate is not needed after sorting (or on failure).
            os.remove(stopover)
        luigi.LocalTarget(output).move(self.output().path)

    def output(self):
        """
        Local TSV target at the default task path.
        """
        return luigi.LocalTarget(path=self.path(), format=TSV)
class MediarepIntermediateSchema(MediarepTask):
    """
    Single file dump: harvest with metha, convert with span-import.
    """
    date = ClosestDateParameter(default=datetime.date.today())
    url = luigi.Parameter(default="https://mediarep.org/oai/request",
                          significant=False)
    prefix = luigi.Parameter(default="dim", significant=False)

    def requires(self):
        """
        Needs the `metha-sync` and `span-import` executables.
        """
        metha = Executable(name='metha-sync',
                           message='https://github.com/miku/metha')
        span = Executable(name='span-import',
                          message='https://github.com/miku/span')
        return [metha, span]

    def run(self):
        """
        Sync the endpoint first, then concatenate the harvested records,
        convert them and write a compressed result to the output path.
        """
        metha_dir = self.config.get('core', 'metha-dir')

        sync_template = "METHA_DIR={dir} metha-sync -format {prefix} {url}"
        shellout(sync_template, prefix=self.prefix, url=self.url, dir=metha_dir)

        convert_template = """METHA_DIR={dir} metha-cat -root Records -format {prefix} {url} | span-import -i mediarep-dim | pigz -c > {output}"""
        converted = shellout(convert_template,
                             prefix=self.prefix,
                             url=self.url,
                             dir=metha_dir)
        luigi.LocalTarget(converted).move(self.output().path)

    def output(self):
        """
        Local target with the `ldj.gz` extension.
        """
        destination = self.path(ext="ldj.gz")
        return luigi.LocalTarget(path=destination)
class LyndaIntermediateSchema(LyndaTask):
    """
    XXX: Workaround SOLR, refs #11477.

    Takes the first listed path ending in "latest" and rewrites its records
    with jq.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the list of file paths.
        """
        return LyndaPaths(date=self.date)

    def run(self):
        """
        Extract `.fullrecord`, delete the `x.labels` key and add `finc.id`
        (copied from `finc.record_id`); raises RuntimeError if no path ends
        in "latest".
        """
        # Same command text as a single literal, split up for readability.
        template = (
            " gunzip -c {input} | "
            "jq -rc '.fullrecord' | "
            """jq -rc 'del(.["x.labels"])' | """
            """jq -rc '. + {{"finc.id": .["finc.record_id"]}}' | """
            "gzip -c > {output} "
        )
        with self.input().open() as handle:
            candidates = (row.path for row in handle.iter_tsv(cols=('path', ))
                          if row.path.endswith("latest"))
            latest = next(candidates, None)
            if latest is None:
                raise RuntimeError("no latest symlink found in folder")
            converted = shellout(template, input=latest)
            luigi.LocalTarget(converted).move(self.output().path)

    def output(self):
        """
        Local Gzip target with the `ldj.gz` extension.
        """
        destination = self.path(ext="ldj.gz")
        return luigi.LocalTarget(path=destination, format=Gzip)
class MediarepMARC(MediarepTask):
    """
    Harvest and convert to MARC.
    """
    date = ClosestDateParameter(default=datetime.date.today())
    url = luigi.Parameter(default="https://mediarep.org/oai/request",
                          significant=False)
    prefix = luigi.Parameter(default="oai_dc", significant=False)

    def requires(self):
        """
        Needs the `metha-sync` executable (returned as a one-element tuple,
        as before).
        """
        return (Executable(name='metha-sync',
                           message='https://github.com/miku/metha'), )

    def run(self):
        """
        Sync the endpoint, dump the harvested records into a temporary file
        and run the conversion script on it.
        """
        metha_dir = self.config.get('core', 'metha-dir')
        shellout("METHA_DIR={dir} metha-sync -format {prefix} {url}",
                 prefix=self.prefix,
                 url=self.url,
                 dir=metha_dir)
        data = shellout(
            """METHA_DIR={dir} metha-cat -root Records -format {prefix} {url} > {output}""",
            dir=metha_dir,
            prefix=self.prefix,
            url=self.url)
        try:
            output = shellout("""python {script} {input} {output}""",
                              script=self.assets('170/170_marcbinary.py'),
                              input=data)
        finally:
            # Remove the harvested dump even if the conversion script fails,
            # so failed runs do not pile up temporary files.
            os.remove(data)
        luigi.LocalTarget(output).move(self.output().path)

    def output(self):
        """
        Local target with the `fincmarc.mrc` extension.
        """
        return luigi.LocalTarget(path=self.path(ext="fincmarc.mrc"))
class IMSLPConvert(IMSLPTask):
    """
    Extract and transform.

    TODO, refs #13055 -- see IMSLPDownloadNext and IMSLPConvertNext and
    IMSLPLegacyMapping.
    """
    date = ClosestDateParameter(default=datetime.date.today())
    debug = luigi.BoolParameter(description='do not delete temporary folder',
                                significant=False)

    def requires(self):
        """
        Needs the downloaded archive.
        """
        return IMSLPDownload(date=self.date)

    def run(self):
        """
        Unpack the archive into a temporary directory and run the conversion
        script on it. The directory is removed afterwards unless `debug` is
        set.
        """
        tempdir = tempfile.mkdtemp(prefix='siskin-')
        try:
            shellout("tar -xzf {archive} -C {tempdir}",
                     archive=self.input().path,
                     tempdir=tempdir)
            output = shellout("python {script} {tempdir} {output} {fieldmap}",
                              script=self.assets('15/15_marcbinary.py'),
                              tempdir=tempdir,
                              fieldmap=self.assets('15/15_fieldmap.json'))
        finally:
            # Clean up on failure as well, so a broken archive or script
            # does not leave extracted data behind.
            if self.debug:
                self.logger.debug("not deleting temporary folder at %s", tempdir)
            else:
                shutil.rmtree(tempdir)
        luigi.LocalTarget(output).move(self.output().path)

    def output(self):
        """
        Local target with the `fincmarc.mrc` extension.
        """
        return luigi.LocalTarget(path=self.path(ext='fincmarc.mrc'))
class IMSLPConvertNext(IMSLPTask):
    """
    Take a current version of the data plus legacy mapping and convert.

    WIP, refs #12288, refs #13055. May merge with 15_marcbinary.py.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the legacy mapping and the downloaded data.
        """
        dependencies = {}
        dependencies['legacy-mapping'] = IMSLPLegacyMapping()
        dependencies['data'] = IMSLPDownload(date=self.date)
        return dependencies

    def run(self):
        """
        Load mapping, convert, write.
        """
        inputs = self.input()
        with inputs.get("legacy-mapping").open() as handle:
            legacy = json.load(handle)
        tarball = inputs.get("data").path
        converted = imslp_tarball_to_marc(tarball, legacy_mapping=legacy)
        luigi.LocalTarget(converted).move(self.output().path)

    def output(self):
        """
        Output lives in the task directory; the name is the basename of the
        latest link with "tar.gz" replaced by "fincmarc.mrc".
        """
        basename = os.path.basename(self.latest_link())
        target_name = basename.replace("tar.gz", "fincmarc.mrc")
        return luigi.LocalTarget(path=os.path.join(self.taskdir(), target_name))
class SpringerCleanup(SpringerTask):
    """
    2017-11-28: finc.mega_collection is now multi-valued; AIAccessFacet remains.
    2017-12-12: new finc.id, refs #11821, #11960, #11961.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the list of file paths.
        """
        return SpringerPaths(date=self.date)

    def run(self):
        """
        Pick the first listed path ending in "total_tpu.ldj.gz" (RuntimeError
        if there is none), delete two fields from each record and add
        `finc.record_id`, `finc.format` and `url`.
        """
        with self.input().open() as handle:
            matches = (row.path for row in handle.iter_tsv(cols=('path', ))
                       if row.path.endswith("total_tpu.ldj.gz"))
            realpath = next(matches, None)
        if realpath is None:
            raise RuntimeError('FTP site does not contain total_tpu.ldj.gz')

        # Same command text as a single literal, split up for readability.
        template = (
            " unpigz -c {input} | "
            """jq -rc 'del(.["finc.AIRecordType"]) | del(.["AIAccessFacet"])' | """
            """jq -c '. + {{ "finc.record_id": .doi, "finc.format": "ElectronicArticle", "url": ["https://doi.org/" + .doi] }}' | """
            "pigz -c > {output} "
        )
        cleaned = shellout(template, input=realpath)
        luigi.LocalTarget(cleaned).move(self.output().path)

    def output(self):
        """
        Local Gzip target with the `ldj.gz` extension.
        """
        destination = self.path(ext='ldj.gz')
        return luigi.LocalTarget(path=destination, format=Gzip)
class CambridgeDropbox(CambridgeTask):
    """
    Pull down content from FTP dropbox, in Dec '18 about 10K zips.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the `rsync` executable.
        """
        return Executable('rsync', message='https://rsync.samba.org/')

    def run(self):
        """
        Mirror the configured source into `<taskdir>/mirror` with rsync, then
        write one line per mirrored file to the output.
        """
        mirror = os.path.join(self.taskdir(), 'mirror')
        options = self.config.get('cambridge', 'rsync-options', fallback='-avzP')
        source = self.config.get('cambridge', 'scp-src')
        shellout("mkdir -p {target} && rsync {rsync_options} {src} {target}",
                 rsync_options=options,
                 src=source,
                 target=mirror)

        taskdir = self.taskdir()
        if not os.path.exists(taskdir):
            os.makedirs(taskdir)

        with self.output().open('w') as listing:
            for filepath in iterfiles(mirror):
                listing.write_tsv(filepath)

    def output(self):
        """
        Local TSV target with the `filelist` extension.
        """
        destination = self.path(ext='filelist')
        return luigi.LocalTarget(path=destination, format=TSV)
class DDNLPaths(DDNLTask):
    """
    Mirror.
    """
    date = ClosestDateParameter(default=datetime.date.today())
    max_retries = luigi.IntParameter(default=10, significant=False)
    timeout = luigi.IntParameter(default=20,
                                 significant=False,
                                 description='timeout in seconds')

    def requires(self):
        """
        An FTP mirror task configured from the `ddnl` config section.
        """
        section = 'ddnl'
        settings = self.config
        return FTPMirror(host=settings.get(section, 'ftp-host'),
                         base=settings.get(section, 'ftp-base'),
                         username=settings.get(section, 'ftp-username'),
                         password=settings.get(section, 'ftp-password'),
                         pattern=settings.get(section, 'ftp-pattern'),
                         max_retries=self.max_retries,
                         timeout=self.timeout)

    def run(self):
        """
        Move the mirror's output to this task's output path.
        """
        destination = self.output().path
        self.input().move(destination)

    def output(self):
        """
        Local TSV target at the default task path.
        """
        destination = self.path()
        return luigi.LocalTarget(path=destination, format=TSV)
class VKFilmFFMARC(VKFilmFFTask):
    """
    Find MARC XML, uncompress, clean, remove "Nichtsortierzeichen" on the
    fly, convert via Python.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the list of file paths.
        """
        return VKFilmFFPaths(date=self.date)

    def run(self):
        """
        Process the first listed path that ends in
        'film_theater_marc_<YYYYMMDD>.xml.gz' (date taken from
        `self.closest()`); raises RuntimeError if no such path exists.
        """
        filename = 'film_theater_marc_%s.xml.gz' % (self.closest().strftime("%Y%m%d"))
        with self.input().open() as handle:
            for row in handle.iter_tsv(cols=('path', )):
                if not row.path.endswith(filename):
                    continue
                # Raw string: sed receives the escapes and produces the bytes
                # C2 98 / C2 9C itself. In a regular Python 3 literal each
                # \xNN is a separate code point, so the command would carry
                # different bytes than intended.
                # NOTE(review): relies on sed supporting \xHH escapes (GNU sed).
                cleaned = shellout(
                    r"unpigz -c {file} | sed 's/\xC2\x98//g;s/\xC2\x9C//g' > {output}",
                    file=row.path)
                try:
                    binary = shellout(
                        "yaz-marcdump -i marcxml -o marc {input} > {output}",
                        input=cleaned)
                finally:
                    # Intermediate files are removed as soon as they are consumed.
                    os.remove(cleaned)
                try:
                    output = shellout(
                        "python {script} {input} {output}",
                        script=self.assets("119/119_marcbinary.py"),
                        input=binary)
                finally:
                    os.remove(binary)
                luigi.LocalTarget(output).move(self.output().path)
                break
            else:
                raise RuntimeError('not found: %s' % filename)

    def output(self):
        """
        Local target with the `fincmarc.mrc` extension.
        """
        return luigi.LocalTarget(path=self.path(ext='fincmarc.mrc'))
class CrossrefRawItems(CrossrefTask):
    """
    Concatenate all harvested items.
    """
    begin = luigi.DateParameter(default=datetime.date(2006, 1, 1))
    date = ClosestDateParameter(default=datetime.date.today())
    update = luigi.Parameter(default='months',
                             description='days, weeks or months')

    def requires(self):
        """
        One chunk task per pair of consecutive dates between `begin` and
        `date`. Raises RuntimeError for an unsupported `update` value.
        """
        if self.update not in ('days', 'weeks', 'months'):
            raise RuntimeError('update can only be: days, weeks or months')
        dates = list(date_range(self.begin, self.date, 1, self.update))
        # Consecutive pairs (d0, d1), (d1, d2), ... form the chunk windows.
        return [
            CrossrefChunkItems(begin=start, end=stop)
            for start, stop in zip(dates, dates[1:])
        ]

    def run(self):
        """
        Append every input file to a temporary file, then move it to the
        output path.
        """
        # mkstemp returns an open descriptor; close it, `cat` appends by path.
        fd, stopover = tempfile.mkstemp(prefix='siskin-')
        os.close(fd)
        for target in self.input():
            shellout("cat {input} >> {output}",
                     input=target.path,
                     output=stopover)
        luigi.LocalTarget(stopover).move(self.output().path)

    def output(self):
        """
        Local Gzip target with the `ldj.gz` extension.
        """
        return luigi.LocalTarget(path=self.path(ext='ldj.gz'), format=Gzip)
class PQDTCombine(PQDTTask):
    """
    Combine files.
    """
    date = ClosestDateParameter(default=datetime.date.today())
    prefix = luigi.Parameter(default="oai_dc")

    def requires(self):
        """
        Needs the `metha-sync` executable.
        """
        return Executable(name='metha-sync',
                          message='https://github.com/miku/metha')

    def run(self):
        """
        Sync the configured OAI endpoint, then concatenate the harvested
        records into a single compressed file.
        """
        endpoint = self.config.get('pqdt', 'oai')
        metha_dir = self.config.get('core', 'metha-dir')

        sync_template = "METHA_DIR={dir} metha-sync -format {prefix} {url}"
        shellout(sync_template, prefix=self.prefix, url=endpoint, dir=metha_dir)

        cat_template = "METHA_DIR={dir} metha-cat -root Records -format {prefix} {url} | pigz -c > {output}"
        combined = shellout(cat_template,
                            prefix=self.prefix,
                            url=endpoint,
                            dir=metha_dir)
        luigi.LocalTarget(combined).move(self.output().path)

    def output(self):
        """
        Local target with the `xml.gz` extension.
        """
        destination = self.path(ext="xml.gz")
        return luigi.LocalTarget(path=destination)
class CrossrefCollectionsCount(CrossrefTask):
    """
    Report collections and the number of titles per collection.
    """
    begin = luigi.DateParameter(default=datetime.date(2006, 1, 1))
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the intermediate schema file and the `jq` executable.
        """
        return {
            'input': CrossrefIntermediateSchema(begin=self.begin, date=self.date),
            'jq': Executable(name='jq', message='https://github.com/stedolan/jq')
        }

    @timed
    def run(self):
        """
        Write every `finc.mega_collection` value to a sorted temporary file,
        count runs of equal lines and dump the resulting name-to-count
        mapping as JSON.
        """
        sorted_names = shellout(
            """jq -rc '.["finc.mega_collection"][]?' <(unpigz -c {input}) | LC_ALL=C sort -S35% > {output}""",
            input=self.input().get('input').path)

        groups = {}  # Map collection name to its size.
        try:
            with open(sorted_names) as handle:
                # Lines are sorted, so equal names are adjacent; count each
                # run without materialising it as a list.
                for line, run in itertools.groupby(handle):
                    groups[line.strip()] = sum(1 for _ in run)
        finally:
            # The sorted intermediate is not needed once counted.
            os.remove(sorted_names)

        with self.output().open('w') as output:
            json.dump(groups, output)

    def output(self):
        """
        Local target with the `json` extension.
        """
        return luigi.LocalTarget(path=self.path(ext='json'))
class DataciteCombine(DataciteTask):
    """
    Single file dump.
    """
    date = ClosestDateParameter(default=datetime.date.today())
    url = luigi.Parameter(default="http://oai.datacite.org/oai",
                          significant=False)
    prefix = luigi.Parameter(default="oai_dc", significant=False)

    def requires(self):
        """
        Needs the `metha-sync` executable.
        """
        return Executable(name='metha-sync',
                          message='https://github.com/miku/metha')

    def run(self):
        """
        Sync the endpoint, then concatenate the harvested records into a
        single compressed file.
        """
        metha_dir = self.config.get('core', 'metha-dir')

        sync_template = "METHA_DIR={dir} metha-sync -format {prefix} {url}"
        shellout(sync_template, prefix=self.prefix, url=self.url, dir=metha_dir)

        cat_template = "METHA_DIR={dir} metha-cat -root Records -format {prefix} {url} | pigz -c > {output}"
        combined = shellout(cat_template,
                            prefix=self.prefix,
                            url=self.url,
                            dir=metha_dir)
        luigi.LocalTarget(combined).move(self.output().path)

    def output(self):
        """
        Local target with the `xml.gz` extension.
        """
        destination = self.path(ext="xml.gz")
        return luigi.LocalTarget(path=destination)
class CrossrefDOIAndISSNList(CrossrefTask):
    """
    A list of Crossref DOIs with their ISSNs.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the intermediate schema file and the `jq` executable.
        """
        return {
            'input': CrossrefIntermediateSchema(date=self.date),
            'jq': Executable(name='jq', message='https://github.com/stedolan/jq')
        }

    @timed
    def run(self):
        """
        Decompress the input, emit one CSV row per record (DOI followed by
        its `rft.issn` and `rft.eissn` values) and sort the rows.
        """
        # mkstemp returns an open descriptor; close it, only the path is used.
        fd, stopover = tempfile.mkstemp(prefix='siskin-')
        os.close(fd)
        temp = shellout("unpigz -c {input} > {output}",
                        input=self.input().get('input').path)
        try:
            output = shellout(
                """jq -r '[.doi?, .["rft.issn"][]?, .["rft.eissn"][]?] | @csv' {input} | LC_ALL=C sort -S50% > {output} """,
                input=temp,
                output=stopover)
        finally:
            # Remove the decompressed copy even when jq or sort fails.
            os.remove(temp)
        luigi.LocalTarget(output).move(self.output().path)

    def output(self):
        """
        Local target with the `csv` extension.
        """
        return luigi.LocalTarget(path=self.path(ext='csv'))
class GeniosIntermediateSchema(GeniosTask):
    """
    Intermediate schema by kind. May be incomplete, since the database
    mapping is derived from dozens of XLS sheets and manual guesses.

    Related: "Neue Quellen bzw. Austausch", Mon, Dec 5, 2016 at 12:23 PM,
    ba54ea7d396a41a2a1281f51bba5d33f

    See also: #9534.
    """
    kind = luigi.Parameter(default='fachzeitschriften',
                           description='or: ebooks, literaturnachweise_...')
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the latest file of the given kind.
        """
        return GeniosLatest(kind=self.kind, date=self.date)

    def run(self):
        """
        Make sure the task directory exists, convert the decompressed input
        with span-import and move the compressed result to the output path.
        """
        taskdir = self.taskdir()
        if not os.path.exists(taskdir):
            os.makedirs(taskdir)
        template = "span-import -i genios <(unpigz -c {input}) | pigz -c >> {output}"
        converted = shellout(template, input=self.input().path)
        luigi.LocalTarget(converted).move(self.output().path)

    def output(self):
        """
        Local target with the `ldj.gz` extension.
        """
        destination = self.path(ext='ldj.gz')
        return luigi.LocalTarget(path=destination)
class DegruyterISSNList(DegruyterTask):
    """
    List of ISSNs (from `rft.issn` and `rft.eissn`), sorted and deduplicated.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the intermediate schema file.
        """
        return DegruyterIntermediateSchema(date=self.date)

    @timed
    def run(self):
        """
        Append all `rft.issn` values, then all `rft.eissn` values to a
        temporary file and sort that file uniquely into the output.
        """
        # mkstemp returns an open descriptor; close it, only the path is used.
        fd, stopover = tempfile.mkstemp(prefix='siskin-')
        os.close(fd)
        try:
            shellout(
                """jq -r '.["rft.issn"][]?' <(unpigz -c {input}) 2> /dev/null >> {output} """,
                input=self.input().path,
                output=stopover)
            shellout(
                """jq -r '.["rft.eissn"][]?' <(unpigz -c {input}) 2> /dev/null >> {output} """,
                input=self.input().path,
                output=stopover)
            output = shellout("""sort -u {input} > {output} """, input=stopover)
        finally:
            # The unsorted intermediate is not needed after sorting (or on failure).
            os.remove(stopover)
        luigi.LocalTarget(output).move(self.output().path)

    def output(self):
        """
        Local TSV target at the default task path.
        """
        return luigi.LocalTarget(path=self.path(), format=TSV)
class GeniosISSNList(GeniosTask):
    """
    A list of Genios ISSNs (from `rft.issn` and `rft.eissn`), sorted and
    deduplicated.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the combined intermediate schema file and the `jq` executable.
        """
        return {
            'input': GeniosCombinedIntermediateSchema(date=self.date),
            'jq': Executable(name='jq', message='https://github.com/stedolan/jq')
        }

    def run(self):
        """
        Append all `rft.issn` values, then all `rft.eissn` values to a
        temporary file and sort that file uniquely into the output.
        """
        # mkstemp returns an open descriptor; close it, only the path is used.
        fd, stopover = tempfile.mkstemp(prefix='siskin-')
        os.close(fd)
        try:
            shellout(
                """jq -c -r '.["rft.issn"][]?' <(unpigz -c {input}) >> {output} """,
                input=self.input().get('input').path,
                output=stopover)
            shellout(
                """jq -c -r '.["rft.eissn"][]?' <(unpigz -c {input}) >> {output} """,
                input=self.input().get('input').path,
                output=stopover)
            output = shellout("""LC_ALL=C sort -S35% -u {input} > {output} """,
                              input=stopover)
        finally:
            # The unsorted intermediate is not needed after sorting (or on failure).
            os.remove(stopover)
        luigi.LocalTarget(output).move(self.output().path)

    def output(self):
        """
        Local TSV target at the default task path.
        """
        return luigi.LocalTarget(path=self.path(), format=TSV)
class DegruyterPaths(DegruyterTask):
    """
    A list of Degruyter file paths (via FTP).
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        An FTP mirror task configured from the `degruyter` config section;
        the exclude glob defaults to an empty string.
        """
        section = 'degruyter'
        settings = self.config
        return FTPMirror(
            host=settings.get(section, 'ftp-host'),
            username=settings.get(section, 'ftp-username'),
            password=settings.get(section, 'ftp-password'),
            base=settings.get(section, 'ftp-path'),
            pattern=settings.get(section, 'ftp-pattern'),
            exclude_glob=settings.get(section, 'ftp-exclude-glob', fallback=''))

    @timed
    def run(self):
        """
        Move the mirror's output to this task's output path.
        """
        destination = self.output().path
        self.input().move(destination)

    def output(self):
        """
        Local TSV target with the `filelist` extension.
        """
        destination = self.path(ext="filelist")
        return luigi.LocalTarget(path=destination, format=TSV)
class MarburgCombine(MarburgTask):
    """
    Harvest and combine a given set into a single file.

    NLM format has been discontinued as of 2018-01-01, refs #5486. Using
    datacite.
    """
    date = ClosestDateParameter(default=datetime.date.today())
    format = luigi.Parameter(default='datacite')
    set = luigi.Parameter(default='issn:2196-4270')

    def run(self):
        """
        Sync the given set from the fixed endpoint, then concatenate it into
        one file and move that file to the output path.
        """
        options = {
            'set': self.set,
            'format': self.format,
            'endpoint': "http://archiv.ub.uni-marburg.de/ubfind/OAI/Server",
        }
        shellout("metha-sync -set {set} -format {format} {endpoint}", **options)
        combined = shellout(
            "metha-cat -set {set} -format {format} {endpoint} > {output}",
            **options)
        luigi.LocalTarget(combined).move(self.output().path)

    def output(self):
        """
        Local target; the path is built with ext="xml" and digest=True.
        """
        destination = self.path(ext="xml", digest=True)
        return luigi.LocalTarget(destination)
class B3KatDownload(B3KatTask):
    """
    Download snapshot. Output is a single (large) MARC binary file.
    Typically the downloads are provided in May and November.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        Needs the list of download links.
        """
        return B3KatLinks(date=self.date)

    def run(self):
        """
        Download every listed URL, convert it from MARCXML to MARC with
        yaz-marcdump and append the result to one file, which is finally
        moved to the output path.
        """
        # mkstemp returns an open descriptor; close it, the shell appends by path.
        fd, stopover = tempfile.mkstemp(prefix='siskin-')
        os.close(fd)
        with self.input().open() as handle:
            for row in handle.iter_tsv(cols=('url', )):
                downloaded = shellout("""curl -sL --fail "{url}" > {output} """,
                                      url=row.url)
                # The conversion appends to `stopover`; the path returned by
                # shellout is only a by-product and gets removed below.
                output = shellout(
                    """yaz-marcdump -i marcxml -o marc "{input}" >> {stopover}""",
                    input=downloaded,
                    stopover=stopover)
                # Best-effort cleanup: try both files independently, so a
                # failure on the first does not skip the second.
                for leftover in (downloaded, output):
                    try:
                        os.remove(leftover)
                    except OSError as err:
                        self.logger.error(err)
        luigi.LocalTarget(stopover).move(self.output().path)

    def output(self):
        """
        Local target with the `mrc` extension.
        """
        return luigi.LocalTarget(path=self.path(ext='mrc'))
class ArchiveMARC(ArchiveTask):
    """
    Convert. Hard-coded collections, currently.
    """
    date = ClosestDateParameter(default=datetime.date.today())

    def requires(self):
        """
        One metadata search task per hard-coded collection query.
        """
        collections = (
            'prelinger',
            'classic_cartoons',
            'feature_films',
            'more_animation',
            'vintage_cartoons',
        )
        tasks = []
        for name in collections:
            tasks.append(
                ArchiveSearchMetadata(date=self.date, query='collection:' + name))
        return tasks

    def run(self):
        """
        Pass all input paths, separated by spaces, to the conversion script
        and move its output to the output path.
        """
        joined = ' '.join(target.path for target in self.input())
        converted = shellout("python {script} {output} {inputs}",
                             script=self.assets("153/153_marcbinary.py"),
                             inputs=joined)
        luigi.LocalTarget(converted).move(self.output().path)

    def output(self):
        """
        Local target with the `fincmarc.mrc` extension.
        """
        destination = self.path(ext='fincmarc.mrc')
        return luigi.LocalTarget(path=destination)