Beispiel #1
0
 def output(self):
     """Return the ``state_file`` result for this submission marker.

     The file name is ``<input stem>-submitted-<timestamp>.txt``, where the
     stem is the base name of ``self.input_file`` without its extension and
     the timestamp is ``self.timestamp`` in ISO format with every ':'
     replaced by '-'.
     """
     stamp = self.timestamp.isoformat().replace(':', '-')
     base_name = os.path.basename(self.input_file)
     stem, _extension = os.path.splitext(base_name)
     filename = '%s-submitted-%s.txt' % (stem, stamp)
     return state_file(self.timestamp, 'warcs2cdx', filename, on_hdfs=True)
Beispiel #2
0
 def output(self):
     """Return ``state_file(self.date, 'hdfs', 'block-scanner-reports.json')``."""
     report_name = 'block-scanner-reports.json'
     return state_file(self.date, 'hdfs', report_name)
Beispiel #3
0
 def _state_file(self, state_date, ext):
     """Return the ``state_file`` result for ``state_date`` and extension ``ext``.

     The file is named ``<self.name>.<ext>``; ``self.tag`` is passed as the
     second argument and ``self.on_hdfs`` is forwarded as ``on_hdfs``.
     """
     tag = self.tag
     filename = '%s.%s' % (self.name, ext)
     return state_file(state_date, tag, filename, on_hdfs=self.on_hdfs)
Beispiel #4
0
 def output(self):
     """Return ``state_file(self.date, 'hdfs', 'duplicate-files-list.tsv')``."""
     listing_name = 'duplicate-files-list.tsv'
     return state_file(self.date, 'hdfs', listing_name)
Beispiel #5
0
 def output(self):
     """Return ``state_file(self.date, 'warc', 'warc-filesets.txt')``."""
     filesets_name = 'warc-filesets.txt'
     return state_file(self.date, 'warc', filesets_name)
Beispiel #6
0
 def output(self):
     """Return the ``state_file`` result for 'all-files-list.csv'.

     ``None`` is passed as the first argument, 'access-hdfs' as the second,
     and ``on_hdfs`` is explicitly ``False``.
     """
     listing_name = 'all-files-list.csv'
     return state_file(None, 'access-hdfs', listing_name, on_hdfs=False)
Beispiel #7
0
 def output(self):
     """Return a dict of two ``state_file`` results keyed 'owb' and 'pywb'.

     Both entries use ``self.date`` and 'access-data'; they differ only in
     file name ('access-whitelist-beta.txt' vs 'access-whitelist-beta.aclj').
     """
     outputs = {}
     outputs['owb'] = state_file(
         self.date, 'access-data', 'access-whitelist-beta.txt')
     outputs['pywb'] = state_file(
         self.date, 'access-data', 'access-whitelist-beta.aclj')
     return outputs
Beispiel #8
0
 def output(self):
     """Return ``state_file(self.date, 'w3act-csv', 'all.json')``."""
     json_name = 'all.json'
     return state_file(self.date, 'w3act-csv', json_name)
Beispiel #9
0
 def output(self):
     """Return the ``state_file`` result for this collection's duplicates list.

     The file name is ``warc-<self.collection>-duplicate-files-list.tsv``,
     passed along with ``self.date`` and 'hdfs'.
     """
     when = self.date
     filename = 'warc-%s-duplicate-files-list.tsv' % self.collection
     return state_file(when, 'hdfs', filename)
Beispiel #10
0
 def output(self):
     """Return the ``state_file`` result for this subset's file list.

     The file name is ``ukwa-<self.subset>-files-list.csv``, passed along
     with ``self.date`` and 'hdfs'.
     """
     when = self.date
     filename = 'ukwa-%s-files-list.csv' % self.subset
     return state_file(when, 'hdfs', filename)
Beispiel #11
0
 def output(self):
     """Return ``state_file(self.date, 'hdfs', 'warc-ukwa-files-list.csv')``."""
     listing_name = 'warc-ukwa-files-list.csv'
     return state_file(self.date, 'hdfs', listing_name)
Beispiel #12
0
 def output(self):
     """Return the ``state_file`` result for 'all-files-list.csv.gz'.

     Called with ``self.date``, 'hdfs' and ``on_hdfs=True``.
     """
     listing_name = 'all-files-list.csv.gz'
     return state_file(self.date, 'hdfs', listing_name, on_hdfs=True)
Beispiel #13
0
 def output(self):
     """Return ``state_file`` for 'updated-collections-solr.json' under 'access-data'."""
     json_name = 'updated-collections-solr.json'
     return state_file(self.date, 'access-data', json_name)
Beispiel #14
0
 def output(self):
     """Return ``state_file`` for 'indexer-annotations.json' under 'access-data'."""
     json_name = 'indexer-annotations.json'
     return state_file(self.date, 'access-data', json_name)
Beispiel #15
0
 def output(self):
     """Log a warning, then return ``state_file`` for the title-level metadata XML.

     The warning text is 'in output'; the file name is
     'title-level-metadata-w3act.xml' under 'access-data', with ``self.date``
     as the first argument.
     """
     # NOTE(review): this looks like a leftover debug trace -- confirm
     # whether WARNING is the intended level before changing it.
     logger.warning('in output')
     xml_name = 'title-level-metadata-w3act.xml'
     return state_file(self.date, 'access-data', xml_name)
Beispiel #16
0
 def output(self):
     """Return ``state_file(self.date, 'w3act-csv', 'db-csv.zip')``."""
     archive_name = 'db-csv.zip'
     return state_file(self.date, 'w3act-csv', archive_name)
Beispiel #17
0
 def output(self):
     """Return ``state_file`` for 'db-csv.zip' under 'w3act-csv' with ``on_hdfs=True``."""
     archive_name = 'db-csv.zip'
     return state_file(self.date, 'w3act-csv', archive_name, on_hdfs=True)
Beispiel #18
0
 def state_file(self, state_date, ext='csv'):
     """Return the ``state_file`` result for ``all-files-list.<ext>``.

     ``state_date`` is passed as the first argument, 'hdfs' as the second,
     and ``on_hdfs`` is ``False``. ``ext`` defaults to 'csv'.
     """
     # Inside the body the bare name ``state_file`` is looked up at module
     # level, not on the instance, so this call does not recurse into
     # this method.
     filename = 'all-files-list.%s' % ext
     return state_file(state_date, 'hdfs', filename, on_hdfs=False)
Beispiel #19
0
 def output(self):
     """Return ``state_file`` for 'access-whitelist-updated.txt' under 'access-data'."""
     whitelist_name = 'access-whitelist-updated.txt'
     return state_file(self.date, 'access-data', whitelist_name)
Beispiel #20
0
 def output(self):
     """Return the ``state_file`` result for this crawl feed.

     The file name is ``crawl-feed-<self.feed>.<self.frequency>.json``,
     passed along with ``self.date`` and 'w3act-csv'.
     """
     when = self.date
     name_parts = (self.feed, self.frequency)
     filename = 'crawl-feed-%s.%s.json' % name_parts
     return state_file(when, 'w3act-csv', filename)
Beispiel #21
0
 def output(self):
     """Return the ``state_file`` result for the per-date WARC file list.

     The file name is ``<self.file_count>-warc-files-for-date.txt``, passed
     along with ``self.target_date`` and 'warcs'.
     """
     when = self.target_date
     filename = '%s-warc-files-for-date.txt' % self.file_count
     return state_file(when, 'warcs', filename)
Beispiel #22
0
 def output(self):
     """Return ``state_file`` for 'crawl-feed-but-all-oa.json' under 'w3act-csv'."""
     feed_name = 'crawl-feed-but-all-oa.json'
     return state_file(self.date, 'w3act-csv', feed_name)
Beispiel #23
0
 def dated_state_file(self):
     """Return the ``state_file`` result for 'all-files-list.csv.gz'.

     Called with ``self.date``, 'access-hdfs' and ``on_hdfs=False``.
     """
     listing_name = 'all-files-list.csv.gz'
     return state_file(self.date, 'access-hdfs', listing_name, on_hdfs=False)
Beispiel #24
0
 def output(self):
     """Return ``state_file(self.date, 'w3act-collections', 'collections.json')``."""
     json_name = 'collections.json'
     return state_file(self.date, 'w3act-collections', json_name)
Beispiel #25
0
 def output(self):
     """Return ``state_file(self.date, 'hdfs', 'empty-files-list.csv')``."""
     listing_name = 'empty-files-list.csv'
     return state_file(self.date, 'hdfs', listing_name)
Beispiel #26
0
 def output(self):
     """Return ``state_file(self.date, 'w3act-subjects', 'subject-list.json')``."""
     json_name = 'subject-list.json'
     return state_file(self.date, 'w3act-subjects', json_name)
Beispiel #27
0
 def output(self):
     """Return ``state_file(self.date, 'hdfs', 'crawl-file-lists.txt')``."""
     listing_name = 'crawl-file-lists.txt'
     return state_file(self.date, 'hdfs', listing_name)
Beispiel #28
0
 def output(self):
     """Return ``state_file(self.date, 'w3act-target-list', 'target-list.json')``."""
     json_name = 'target-list.json'
     return state_file(self.date, 'w3act-target-list', json_name)
Beispiel #29
0
 def output(self):
     """Return the ``state_file`` result for the per-frequency target list.

     The file name is ``target-list-<self.frequency>.json``, passed along
     with ``self.date`` and 'w3act-target-list'.
     """
     when = self.date
     filename = 'target-list-%s.json' % self.frequency
     return state_file(when, 'w3act-target-list', filename)
Beispiel #30
0
 def output(self):
     """Return ``state_file`` for 'title-level-metadata-w3act.xml' under 'access-data'."""
     xml_name = 'title-level-metadata-w3act.xml'
     return state_file(self.date, 'access-data', xml_name)