def build_collection(self, chunk):
    """Create an mg4j document collection from a chunk's ``.txt`` files.

    A platform-specific listing command is run against
    ``<self.temp>/<chunk>/*.txt`` and its output is piped into mg4j's
    ``TRECDocumentCollection`` tool, which is given
    ``<self.temp>/<chunk>.collection`` as its final argument.

    Args:
        chunk: Name of the chunk directory located under ``self.temp``.
    """
    chunk_dir = os.path.join(self.temp, chunk)
    print(" Creating mg4j collection from " + chunk_dir)

    txt_pattern = os.path.join(self.temp, chunk, "*.txt")
    collection_path = os.path.join(self.temp, chunk + ".collection")

    if platform.system() == 'Windows':
        list_command = "dir /s/b"
    else:
        # This command is meaningless and might as well be ls when used
        # this way: the shell expands the *.txt pattern before find runs.
        list_command = "find "

    # The file listing is fed to the collection builder through a pipe.
    args = ("{0} {1} | "
            "java -cp {2} "
            "it.unimi.di.big.mg4j.document.TRECDocumentCollection "
            "-f HtmlDocumentFactory "
            "-p encoding=iso-8859-1 "
            "{3}").format(list_command, txt_pattern, self.classpath,
                          collection_path)
    print(args)
    # NOTE(review): the second argument to run() looks like the working
    # directory -- confirm against run()'s definition.
    run(args, self.temp)
def decompress(self, chunk):
    """Extract the 7z archive for ``chunk`` with ``7z x``.

    The archive is expected at ``<self.gov2>/<chunk>.7z``. The command is
    echoed and then passed to ``run`` together with ``self.temp``
    (presumably the directory the extraction runs in -- confirm against
    ``run``).

    Args:
        chunk: Base name of the archive, without the ``.7z`` extension.
    """
    archive_name = chunk + ".7z"
    archive_path = os.path.join(self.gov2, archive_name)

    extract_command = "7z x {0}".format(archive_path)
    print(extract_command)
    run(extract_command, self.temp)
def create_filtered_chunk(self, chunk):
    """Produce the final chunk file for ``chunk``, filtering if configured.

    When ``self.min_postings`` is ``None`` the unfiltered
    ``<self.temp>/<chunk>.chunk`` file is simply renamed to
    ``self.chunk_name(chunk)``. Otherwise a one-line manifest naming the
    unfiltered file is written, the BitFunnel ``filter`` command is run
    over it, and ``<self.temp>/Chunk-0.chunk`` is renamed instead.

    Args:
        chunk: Base name of the chunk being processed.
    """
    raw_chunk_path = os.path.join(self.temp, chunk + ".chunk")

    if self.min_postings is None:
        # No posting-count bounds configured: use the unfiltered file as-is.
        source_path = raw_chunk_path
    else:
        # The filter tool reads its inputs from a manifest file, so write
        # one that lists only the unfiltered chunk.
        manifest_path = os.path.join(self.temp, "UnfilteredChunks.txt")
        with open(manifest_path, 'w') as manifest_file:
            manifest_file.write(raw_chunk_path + '\n')

        filter_command = "{0} filter {1} {2} -size {3} {4}".format(
            self.bitfunnel,
            manifest_path,
            self.temp,
            self.min_postings,
            self.max_postings)
        print(filter_command)
        run(filter_command, self.temp)

        # The filtered output is picked up under this fixed name
        # (presumably what the filter tool emits -- confirm).
        source_path = os.path.join(self.temp, "Chunk-0.chunk")

    os.rename(source_path, self.chunk_name(chunk))
def create_filtered_chunk(self, chunk):
    """Run the BitFunnel ``filter`` command over the existing manifest.

    The manifest ``<self.temp>/UnfilteredChunks.txt`` is passed to the
    filter along with ``self.temp`` and the ``-size`` bounds
    ``self.min_postings`` / ``self.max_postings``. This variant neither
    writes the manifest nor renames any output.

    Args:
        chunk: Not used by this implementation.
    """
    # NOTE(review): another definition named create_filtered_chunk appears
    # just before this one; if both live in the same class, this later
    # definition is the one that takes effect -- confirm which is intended.
    manifest_path = os.path.join(self.temp, "UnfilteredChunks.txt")

    template = "{0} filter {1} {2} -size {3} {4}"
    filter_command = template.format(
        self.bitfunnel,
        manifest_path,
        self.temp,
        self.min_postings,
        self.max_postings)

    print(filter_command)
    run(filter_command, self.temp)
def create_chunk(self, chunk):
    """Convert a chunk's mg4j collection into a BitFunnel chunk file.

    Invokes the Java class
    ``org.bitfunnel.reproducibility.GenerateBitFunnelChunks`` with
    ``<self.temp>/<chunk>.collection`` and ``<self.temp>/<chunk>.chunk``
    as its two positional arguments after ``-S``.

    Args:
        chunk: Base name shared by the collection and the chunk file.
    """
    collection_path = os.path.join(self.temp, chunk + ".collection")
    chunk_path = os.path.join(self.temp, chunk + ".chunk")

    template = ("java -cp {0} "
                "org.bitfunnel.reproducibility.GenerateBitFunnelChunks "
                "-S {1} {2}")
    java_command = template.format(self.classpath, collection_path,
                                   chunk_path)

    print(java_command)
    run(java_command, self.temp)
def measure_quadwords(experiment, iterations):
    """Run the filtered query log ``iterations`` times through the repl.

    Everything is staged under ``<experiment.bf_index_path>/quadwords``:
    an empty ``ShardDefinition.csv``, a ``repl-script`` file, one
    ``run-<i>`` directory per iteration, and ``run-log.txt``, which is
    passed to ``run`` as its third argument.

    Args:
        experiment: Object supplying ``bf_index_path``, ``bf_executable``,
            ``manifest``, ``filtered_query_file``,
            ``ingestion_thread_count`` and ``max_thread_count``.
        iterations: Number of times the query log is replayed.
    """
    quadwords_dir = os.path.join(experiment.bf_index_path, "quadwords")

    def run_dir(index):
        # Each iteration gets its own results directory.
        return os.path.join(quadwords_dir, "run-{}".format(index))

    def ensure_dir(path):
        if not os.path.exists(path):
            os.makedirs(path)

    ensure_dir(quadwords_dir)

    # We're currently restricted to a single shard,
    # so create an empty ShardDefinition file.
    with open(os.path.join(quadwords_dir, "ShardDefinition.csv"), "w"):
        pass

    # Make the repl script. (A previous variant read the queries from
    # "single-term-queries.txt" inside the quadwords directory instead.)
    query_log = experiment.filtered_query_file
    repl_script = os.path.join(quadwords_dir, "repl-script")
    print(repl_script)
    with open(repl_script, "w") as script:
        script.writelines([
            "threads {0}\n".format(experiment.ingestion_thread_count),
            "load manifest {0}\n".format(experiment.manifest),
            "status\n",
            "compiler\n",
            "threads {0}\n".format(experiment.max_thread_count),
        ])
        # Switch into the iteration's directory before each replay so the
        # replays do not share a working directory.
        for index in range(iterations):
            script.write("cd {0}\n".format(run_dir(index)))
            script.write("query log {0}\n".format(query_log))
        script.write("quit\n")

    # Make the directories for the results.
    for index in range(iterations):
        ensure_dir(run_dir(index))

    # Finally, run the queries. Note that the repl is pointed at the
    # experiment's index directory, not at the quadwords subdirectory.
    repl_command = "{} repl {} -script {}".format(
        experiment.bf_executable,
        experiment.bf_index_path,
        repl_script)
    repl_log = os.path.join(quadwords_dir, "run-log.txt")
    print(repl_command)
    run(repl_command, quadwords_dir, repl_log)
def measure_innovations(experiment, treatments, densities):
    """Replay the query log for every (treatment, density) combination.

    Work happens under ``<experiment.bf_index_path>/innovations``. The
    statistics builder is run once, a repl script is written once, and
    then for each treatment/density pair a term table is built and the
    repl script is executed. Each run's log file name is passed to
    ``run`` as its third argument.

    Args:
        experiment: Object supplying ``bf_index_path``, ``bf_executable``,
            ``manifest``, ``filtered_query_file``,
            ``ingestion_thread_count`` and ``max_thread_count``.
        treatments: Iterable of treatment names passed to ``termtable``.
        densities: Iterable of density values passed to ``termtable``.
    """
    innovations_dir = os.path.join(experiment.bf_index_path, "innovations")
    if not os.path.exists(innovations_dir):
        os.makedirs(innovations_dir)

    def in_dir(name):
        # Resolve a file name inside the innovations directory.
        return os.path.join(innovations_dir, name)

    # We're currently restricted to a single shard,
    # so create an empty ShardDefinition file.
    with open(in_dir("ShardDefinition.csv"), "w"):
        pass

    # Run statistics builder
    statistics_command = "{0} statistics {1} {2} -text".format(
        experiment.bf_executable,
        experiment.manifest,
        innovations_dir)
    statistics_log = in_dir("statistics-log.txt")
    print(statistics_command)
    run(statistics_command, innovations_dir, statistics_log)

    # Make the repl script
    repl_script = in_dir("repl-script")
    print(repl_script)
    with open(repl_script, "w") as script:
        script.writelines([
            "threads {0}\n".format(experiment.ingestion_thread_count),
            "load manifest {0}\n".format(experiment.manifest),
            "status\n",
            "compiler\n",
            "threads {0}\n".format(experiment.max_thread_count),
            "cd {0}\n".format(innovations_dir),
            "query log {0}\n".format(experiment.filtered_query_file),
            "quit\n",
        ])

    # The repl command line is the same for every combination; only the
    # term table built beforehand and the log file differ.
    for treatment in treatments:
        for density in densities:
            # Build the termtable
            termtable_command = "{} termtable {} {} {}".format(
                experiment.bf_executable,
                innovations_dir,
                density,
                treatment)
            termtable_log = in_dir(
                "termtable-log-{}-{}.txt".format(treatment, density))
            print(termtable_command)
            run(termtable_command, innovations_dir, termtable_log)

            # Replay the queries against the freshly built term table.
            repl_command = "{} repl {} -script {}".format(
                experiment.bf_executable,
                innovations_dir,
                repl_script)
            repl_log = in_dir(
                "repl-log-{}-{}.txt".format(treatment, density))
            print(repl_command)
            run(repl_command, innovations_dir, repl_log)
def execute(command, log_file=None):
    """Echo ``command``, run it from the current directory, and report done.

    Args:
        command: Command line handed to ``run``.
        log_file: Optional value forwarded to ``run`` as its third
            argument (presumably a path where output is logged -- confirm
            against ``run``).
    """
    print(command)
    working_dir = os.getcwd()
    run(command, working_dir, log_file)
    print("Finished")
    # Blank line to separate consecutive commands in the console output.
    print()