def startReplay(warcFilename):
    """Start the replay system in a child process and index a sample WARC.

    An empty ``.cdxj`` placeholder file is created in the system temp
    directory, ``replay.start`` is launched on it in a separate process
    (stored in the module-level global ``p``), and after a fixed delay the
    CDXJ index produced for the sample WARC is written into that file.

    Args:
        warcFilename: File name of a WARC located in ``../samples/warcs/``
            relative to this module's directory.
    """
    global p
    # Join the components individually so an empty dirname (module run from
    # its own directory) yields a relative path, not one anchored at '/'.
    pathOfWARC = os.path.join(
        os.path.dirname(__file__), '..', 'samples', 'warcs', warcFilename)

    # Create placeholder file for replay; mkstemp creates a uniquely named
    # file atomically, so the name cannot collide with an existing one.
    fd, tempFilePath = tempfile.mkstemp(suffix='.cdxj')
    os.close(fd)

    p = Process(target=replay.start, args=[tempFilePath])
    p.start()
    sleep(5)  # fixed pause before the index is written to the placeholder

    cdxjList = indexer.indexFileAt(pathOfWARC, quiet=True)
    cdxj = '\n'.join(cdxjList)

    with open(tempFilePath, 'w') as f:
        f.write(cdxj)
def test_push():
    """
    Read WARC, manipulate content to ensure uniqueness, push to IPFS
      WARC should result in two CDXJ entries with three space-delimited
      fields each: surt URI, datetime, JSON
      JSON should contain AT LEAST locator, mime_type, and status fields
    """
    newWARCPath = ipwbTest.createUniqueWARC()
    # use ipwb indexer to push
    cdxjList = indexer.indexFileAt(newWARCPath, quiet=True)
    cdxj = '\n'.join(cdxjList)

    # Lines beginning with '!' are treated as metadata and skipped. Blank
    # lines are skipped too, so empty output fails the assertion below
    # instead of raising IndexError on line[0].
    firstNonMetadataEntry = next(
        (line for line in cdxj.split('\n')
         if line and not line.startswith('!')),
        '')
    assert firstNonMetadataEntry, 'indexer produced no non-metadata entry'

    assert checkCDXJFields(firstNonMetadataEntry)
    # The third space-separated field is handed to the JSON field check
    firstEntryLastField = firstNonMetadataEntry.split(' ', 2)[2]
    assert checkIPWBJSONFieldPresesence(firstEntryLastField)
def test_warc_ipwbIndexerBrokenWARCRecord():
    """Index the sample ``broken.warc`` and expect exactly one CDXJ entry."""
    # Join the components individually so an empty dirname (module run from
    # its own directory) yields a relative path, not one anchored at '/'.
    pathOfBrokenWARC = os.path.join(
        os.path.dirname(__file__), 'samples', 'warcs', 'broken.warc')
    cdxjList = indexer.indexFileAt(pathOfBrokenWARC, quiet=True)
    cdxj = '\n'.join(cdxjList)
    assert ipwbTest.countCDXJEntries(cdxj) == 1
def test_cdxj_warc_responseRecordCount():
    """Index the WARC from ``ipwbTest.createUniqueWARC`` and expect the
    CDXJ entry count reported by ``ipwbTest.countCDXJEntries`` to be two.
    """
    uniqueWARC = ipwbTest.createUniqueWARC()
    # Run the ipwb indexer over the newly created WARC
    indexLines = indexer.indexFileAt(uniqueWARC, quiet=True)
    entryCount = ipwbTest.countCDXJEntries('\n'.join(indexLines))
    assert entryCount == 2