def output(self):
    """Return the ``state_file`` result recording this submission.

    The file name has the form ``<prefix>-submitted-<stamp>.txt`` where
    ``<stamp>`` is ``self.timestamp.isoformat()`` with every ':' replaced
    by '-', and ``<prefix>`` is the base name of ``self.input_file`` with
    its final extension removed.  The call passes ``self.timestamp``,
    ``'warcs2cdx'`` and ``on_hdfs=True``.
    """
    # NOTE(review): presumably ':' is swapped out because it is awkward in
    # file names -- confirm.
    stamp = self.timestamp.isoformat().replace(':', '-')
    # Drop the directory part first, then the trailing extension.
    input_name = os.path.basename(self.input_file)
    stem, _extension = os.path.splitext(input_name)
    filename = '%s-submitted-%s.txt' % (stem, stamp)
    return state_file(self.timestamp, 'warcs2cdx', filename, on_hdfs=True)
def output(self):
    """Return ``state_file(self.date, 'hdfs', 'block-scanner-reports.json')``."""
    filename = 'block-scanner-reports.json'
    return state_file(self.date, 'hdfs', filename)
def _state_file(self, state_date, ext):
    """Return the ``state_file`` result for ``state_date`` and extension ``ext``.

    The file name is ``<self.name>.<ext>``; ``self.tag`` is passed as the
    second positional argument and ``self.on_hdfs`` as the ``on_hdfs``
    keyword.
    """
    filename = '%s.%s' % (self.name, ext)
    return state_file(
        state_date,
        self.tag,
        filename,
        on_hdfs=self.on_hdfs,
    )
def output(self):
    """Return ``state_file`` called for 'duplicate-files-list.tsv'.

    ``self.date`` and ``'hdfs'`` are the first two positional arguments.
    """
    filename = 'duplicate-files-list.tsv'
    return state_file(self.date, 'hdfs', filename)
def output(self):
    """Return ``state_file(self.date, 'warc', 'warc-filesets.txt')``."""
    filename = 'warc-filesets.txt'
    return state_file(self.date, 'warc', filename)
def output(self):
    """Return the ``state_file`` result for 'all-files-list.csv'.

    Unlike a dated lookup, ``None`` is passed as the first argument; the
    call also uses ``'access-hdfs'`` and an explicit ``on_hdfs=False``.
    """
    filename = 'all-files-list.csv'
    return state_file(None, 'access-hdfs', filename, on_hdfs=False)
def output(self):
    """Return a dict of two ``state_file`` results keyed 'owb' and 'pywb'.

    Both calls use ``self.date`` and ``'access-data'``; 'owb' maps to
    'access-whitelist-beta.txt' and 'pywb' to 'access-whitelist-beta.aclj'.
    """
    targets = {}
    targets['owb'] = state_file(
        self.date, 'access-data', 'access-whitelist-beta.txt')
    targets['pywb'] = state_file(
        self.date, 'access-data', 'access-whitelist-beta.aclj')
    return targets
def output(self):
    """Return ``state_file(self.date, 'w3act-csv', 'all.json')``."""
    filename = 'all.json'
    return state_file(self.date, 'w3act-csv', filename)
def output(self):
    """Return the ``state_file`` result for this collection's duplicates list.

    The file name is 'warc-<collection>-duplicate-files-list.tsv', with
    ``self.collection`` substituted via ``%s``; ``self.date`` and ``'hdfs'``
    are the other arguments.
    """
    filename = 'warc-%s-duplicate-files-list.tsv' % self.collection
    return state_file(self.date, 'hdfs', filename)
def output(self):
    """Return the ``state_file`` result for this subset's file list.

    The file name is 'ukwa-<subset>-files-list.csv', with ``self.subset``
    substituted via ``%s``; ``self.date`` and ``'hdfs'`` are the other
    arguments.
    """
    filename = 'ukwa-%s-files-list.csv' % self.subset
    return state_file(self.date, 'hdfs', filename)
def output(self):
    """Return ``state_file(self.date, 'hdfs', 'warc-ukwa-files-list.csv')``."""
    filename = 'warc-ukwa-files-list.csv'
    return state_file(self.date, 'hdfs', filename)
def output(self):
    """Return the ``state_file`` result for 'all-files-list.csv.gz'.

    Called with ``self.date``, ``'hdfs'`` and ``on_hdfs=True``.
    """
    filename = 'all-files-list.csv.gz'
    return state_file(self.date, 'hdfs', filename, on_hdfs=True)
def output(self):
    """Return the ``state_file`` result for 'updated-collections-solr.json'.

    ``self.date`` and ``'access-data'`` are the first two positional
    arguments.
    """
    filename = 'updated-collections-solr.json'
    return state_file(self.date, 'access-data', filename)
def output(self):
    """Return the ``state_file`` result for 'indexer-annotations.json'.

    ``self.date`` and ``'access-data'`` are the first two positional
    arguments.
    """
    filename = 'indexer-annotations.json'
    return state_file(self.date, 'access-data', filename)
def output(self):
    """Log a warning, then return the ``state_file`` result for the XML file.

    The call uses ``self.date``, ``'access-data'`` and
    'title-level-metadata-w3act.xml'.
    """
    # NOTE(review): this WARNING fires on every call and looks like leftover
    # debugging output -- confirm whether it should stay.
    logger.warning('in output')
    filename = 'title-level-metadata-w3act.xml'
    return state_file(self.date, 'access-data', filename)
def output(self):
    """Return ``state_file(self.date, 'w3act-csv', 'db-csv.zip')``."""
    filename = 'db-csv.zip'
    return state_file(self.date, 'w3act-csv', filename)
def output(self):
    """Return the ``state_file`` result for 'db-csv.zip' with ``on_hdfs=True``.

    ``self.date`` and ``'w3act-csv'`` are the first two positional arguments.
    """
    filename = 'db-csv.zip'
    return state_file(self.date, 'w3act-csv', filename, on_hdfs=True)
def state_file(self, state_date, ext='csv'):
    """Return the outer ``state_file`` result for 'all-files-list.<ext>'.

    ``state_date`` is forwarded as the first argument, followed by
    ``'hdfs'`` and an explicit ``on_hdfs=False``.  ``ext`` defaults to
    'csv'.
    """
    # NOTE(review): this method shares its name with the callable it invokes;
    # assuming it lives in a class body, the bare name below resolves to the
    # module-level ``state_file`` rather than to this method -- confirm.
    filename = 'all-files-list.%s' % ext
    return state_file(state_date, 'hdfs', filename, on_hdfs=False)
def output(self):
    """Return the ``state_file`` result for 'access-whitelist-updated.txt'.

    ``self.date`` and ``'access-data'`` are the first two positional
    arguments.
    """
    filename = 'access-whitelist-updated.txt'
    return state_file(self.date, 'access-data', filename)
def output(self):
    """Return the ``state_file`` result for this feed/frequency pair.

    The file name is 'crawl-feed-<feed>.<frequency>.json', built from
    ``self.feed`` and ``self.frequency``; ``self.date`` and ``'w3act-csv'``
    are the other arguments.
    """
    name_parts = (self.feed, self.frequency)
    filename = 'crawl-feed-%s.%s.json' % name_parts
    return state_file(self.date, 'w3act-csv', filename)
def output(self):
    """Return the ``state_file`` result keyed on ``self.target_date``.

    The file name is '<file_count>-warc-files-for-date.txt', with
    ``self.file_count`` substituted via ``%s``; ``'warcs'`` is the second
    positional argument.
    """
    filename = '%s-warc-files-for-date.txt' % self.file_count
    return state_file(self.target_date, 'warcs', filename)
def output(self):
    """Return ``state_file(self.date, 'w3act-csv', 'crawl-feed-but-all-oa.json')``."""
    filename = 'crawl-feed-but-all-oa.json'
    return state_file(self.date, 'w3act-csv', filename)
def dated_state_file(self):
    """Return the ``state_file`` result for 'all-files-list.csv.gz'.

    Called with ``self.date``, ``'access-hdfs'`` and an explicit
    ``on_hdfs=False``.
    """
    filename = 'all-files-list.csv.gz'
    return state_file(self.date, 'access-hdfs', filename, on_hdfs=False)
def output(self):
    """Return ``state_file(self.date, 'w3act-collections', 'collections.json')``."""
    filename = 'collections.json'
    return state_file(self.date, 'w3act-collections', filename)
def output(self):
    """Return ``state_file(self.date, 'hdfs', 'empty-files-list.csv')``."""
    filename = 'empty-files-list.csv'
    return state_file(self.date, 'hdfs', filename)
def output(self):
    """Return ``state_file(self.date, 'w3act-subjects', 'subject-list.json')``."""
    filename = 'subject-list.json'
    return state_file(self.date, 'w3act-subjects', filename)
def output(self):
    """Return ``state_file(self.date, 'hdfs', 'crawl-file-lists.txt')``."""
    filename = 'crawl-file-lists.txt'
    return state_file(self.date, 'hdfs', filename)
def output(self):
    """Return ``state_file(self.date, 'w3act-target-list', 'target-list.json')``."""
    filename = 'target-list.json'
    return state_file(self.date, 'w3act-target-list', filename)
def output(self):
    """Return the ``state_file`` result for this frequency's target list.

    The file name is 'target-list-<frequency>.json', with
    ``self.frequency`` substituted via ``%s``; ``self.date`` and
    ``'w3act-target-list'`` are the other arguments.
    """
    filename = 'target-list-%s.json' % self.frequency
    return state_file(self.date, 'w3act-target-list', filename)
def output(self):
    """Return the ``state_file`` result for 'title-level-metadata-w3act.xml'.

    ``self.date`` and ``'access-data'`` are the first two positional
    arguments.
    """
    filename = 'title-level-metadata-w3act.xml'
    return state_file(self.date, 'access-data', filename)