예제 #1
0
    def build_collection(self, chunk):
        """Build an mg4j TREC document collection from a chunk's .txt files.

        A listing of ``<self.temp>/<chunk>/*.txt`` is piped into mg4j's
        ``TRECDocumentCollection`` tool, which is asked to write
        ``<self.temp>/<chunk>.collection``. The shell command is echoed and
        then handed to ``run`` together with ``self.temp`` (presumably the
        working directory -- confirm against ``run``).

        Args:
            chunk: Name of the chunk; also the name of the sub-directory of
                ``self.temp`` that holds its ``.txt`` files.
        """
        chunk_dir = os.path.join(self.temp, chunk)
        print("  Creating mg4j collection from " + chunk_dir)

        # The wildcard is left for the shell (or `dir`) to expand.
        pattern = os.path.join(chunk_dir, "*.txt")
        output = os.path.join(self.temp, chunk + ".collection")
        if platform.system() == 'Windows':
            command = "dir /s/b"
        else:
            # This command is meaningless and might as well be ls when used
            # this way.
            command = "find "
        args = ("{0} {1} | "
                "java -cp {2} "
                "it.unimi.di.big.mg4j.document.TRECDocumentCollection "
                "-f HtmlDocumentFactory "
                "-p encoding=iso-8859-1 "
                "{3}").format(command, pattern, self.classpath, output)

        print(args)

        run(args, self.temp)
예제 #2
0
    def decompress(self, chunk):
        """Extract ``<self.gov2>/<chunk>.7z`` with the ``7z`` command-line tool.

        The command is echoed and then passed to ``run`` along with
        ``self.temp`` (presumably the directory the archive is extracted
        into -- confirm against ``run``).
        """
        archive = os.path.join(self.gov2, chunk + ".7z")
        command = "7z x {0}".format(archive)

        print(command)
        run(command, self.temp)
예제 #3
0
    def create_filtered_chunk(self, chunk):
        """Move a chunk file to its final name, filtering it first if requested.

        When ``self.min_postings`` is ``None`` the unfiltered
        ``<self.temp>/<chunk>.chunk`` is renamed directly. Otherwise a
        one-line manifest naming that file is written, the ``filter`` command
        of ``self.bitfunnel`` is run over it, and the resulting
        ``Chunk-0.chunk`` in ``self.temp`` is renamed instead. The
        destination is always ``self.chunk_name(chunk)``.
        """
        source_chunk = os.path.join(self.temp, chunk + ".chunk")

        if self.min_postings is None:
            # No filtering requested: the unfiltered file is the result.
            os.rename(source_chunk, self.chunk_name(chunk))
            return

        # The manifest lists the single chunk file to be filtered.
        manifest = os.path.join(self.temp, "UnfilteredChunks.txt")
        with open(manifest, 'w') as manifest_file:
            manifest_file.write(source_chunk + '\n')

        # Run the filter over everything named in the manifest.
        command = "{0} filter {1} {2} -size {3} {4}".format(
            self.bitfunnel,
            manifest,
            self.temp,
            self.min_postings,
            self.max_postings,
        )
        print(command)
        run(command, self.temp)

        # The filtered output is picked up under this fixed name.
        filtered_chunk = os.path.join(self.temp, "Chunk-0.chunk")
        os.rename(filtered_chunk, self.chunk_name(chunk))
예제 #4
0
    def create_filtered_chunk(self, chunk):
        """Run the ``filter`` command of ``self.bitfunnel`` over the manifest.

        The manifest is ``<self.temp>/UnfilteredChunks.txt``; ``self.temp``
        is also passed on the command line, followed by ``-size`` with
        ``self.min_postings`` and ``self.max_postings``. The ``chunk``
        argument is not used by this method.
        """
        manifest_path = os.path.join(self.temp, "UnfilteredChunks.txt")
        template = "{0} filter {1} {2} -size {3} {4}"
        command = template.format(
            self.bitfunnel,
            manifest_path,
            self.temp,
            self.min_postings,
            self.max_postings,
        )

        print(command)
        run(command, self.temp)
예제 #5
0
    def create_chunk(self, chunk):
        """Convert ``<chunk>.collection`` into ``<chunk>.chunk`` via Java.

        Both files live in ``self.temp``. The conversion is done by the
        ``GenerateBitFunnelChunks`` class, launched with ``self.classpath``.
        The command is echoed and then passed to ``run`` with ``self.temp``.
        """
        collection_path = os.path.join(self.temp, chunk + ".collection")
        chunk_path = os.path.join(self.temp, chunk + ".chunk")

        command = (
            "java -cp {0} "
            "org.bitfunnel.reproducibility.GenerateBitFunnelChunks "
            "-S {1} {2}"
        ).format(self.classpath, collection_path, chunk_path)

        print(command)
        run(command, self.temp)
예제 #6
0
def measure_quadwords(experiment, iterations):
    """Run the filtered query log through the BitFunnel REPL several times.

    Everything produced here lives under ``<experiment.bf_index_path>/quadwords``:
    an empty ``ShardDefinition.csv``, the generated ``repl-script``, one
    ``run-<i>`` directory per iteration, and ``run-log.txt``.

    Args:
        experiment: Object supplying ``bf_index_path``, ``bf_executable``,
            ``manifest``, ``filtered_query_file``, ``ingestion_thread_count``
            and ``max_thread_count``.
        iterations: Number of ``cd`` / ``query log`` pairs written to the
            script, and of ``run-<i>`` directories created.
    """
    bf_index_path = os.path.join(experiment.bf_index_path, "quadwords")

    def results_path(iteration):
        return os.path.join(bf_index_path, "run-{}".format(iteration))

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(bf_index_path, exist_ok=True)

    # We're currently restricted to a single shard,
    # so create an empty ShardDefinition file.
    with open(os.path.join(bf_index_path, "ShardDefinition.csv"), "w"):
        pass

    query_log = experiment.filtered_query_file
    results_dirs = [results_path(iteration) for iteration in range(iterations)]

    # Make the repl script: ingest once, then one query pass per results
    # directory.
    repl_script = os.path.join(bf_index_path, "repl-script")
    print(repl_script)
    with open(repl_script, "w") as file:
        file.write("threads {0}\n".format(experiment.ingestion_thread_count))
        file.write("load manifest {0}\n".format(experiment.manifest))
        file.write("status\n")
        file.write("compiler\n")
        file.write("threads {0}\n".format(experiment.max_thread_count))
        for results_dir in results_dirs:
            file.write("cd {0}\n".format(results_dir))
            file.write("query log {0}\n".format(query_log))
        file.write("quit\n")

    # Make the directories for the results.
    for results_dir in results_dirs:
        os.makedirs(results_dir, exist_ok=True)

    # Finally, run the queries.
    # NOTE(review): the REPL is pointed at experiment.bf_index_path, not the
    # "quadwords" subdirectory where ShardDefinition.csv was just created --
    # confirm this is intended.
    args = ("{} repl {} -script {}").format(experiment.bf_executable,
                                            experiment.bf_index_path,
                                            repl_script)
    repl_log = os.path.join(bf_index_path, "run-log.txt")
    print(args)
    run(args, bf_index_path, repl_log)
예제 #7
0
def measure_innovations(experiment, treatments, densities):
    """Run the query log once per (treatment, density) term-table build.

    Works under ``<experiment.bf_index_path>/innovations``. It first runs the
    ``statistics`` command over the experiment manifest and writes a REPL
    script. Then, for every treatment/density pair, it runs ``termtable``
    followed by ``repl`` with that script, each with its own log file named
    after the pair.

    Args:
        experiment: Object supplying ``bf_index_path``, ``bf_executable``,
            ``manifest``, ``filtered_query_file``, ``ingestion_thread_count``
            and ``max_thread_count``.
        treatments: Iterable of treatment values passed to ``termtable``.
        densities: Iterable of density values passed to ``termtable``.
    """
    bf_index_path = os.path.join(experiment.bf_index_path, "innovations")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(bf_index_path, exist_ok=True)

    # We're currently restricted to a single shard,
    # so create an empty ShardDefinition file.
    with open(os.path.join(bf_index_path, "ShardDefinition.csv"), "w"):
        pass

    # Run statistics builder
    args = ("{0} statistics {1} {2} -text").format(experiment.bf_executable,
                                                   experiment.manifest,
                                                   bf_index_path)
    statistics_log = os.path.join(bf_index_path, "statistics-log.txt")
    print(args)
    run(args, bf_index_path, statistics_log)

    # Make the repl script. It does not depend on treatment or density, so
    # it is written once and reused for every pair below.
    repl_script = os.path.join(bf_index_path, "repl-script")
    print(repl_script)
    with open(repl_script, "w") as file:
        file.write("threads {0}\n".format(experiment.ingestion_thread_count))
        file.write("load manifest {0}\n".format(experiment.manifest))
        file.write("status\n")
        file.write("compiler\n")
        file.write("threads {0}\n".format(experiment.max_thread_count))
        file.write("cd {0}\n".format(bf_index_path))
        file.write("query log {0}\n".format(experiment.filtered_query_file))
        file.write("quit\n")

    for treatment in treatments:
        for density in densities:
            # Build the termtable
            args = ("{} termtable {} {} {}").format(experiment.bf_executable,
                                                    bf_index_path, density,
                                                    treatment)
            termtable_log = os.path.join(
                bf_index_path,
                "termtable-log-{}-{}.txt".format(treatment, density))
            print(args)
            run(args, bf_index_path, termtable_log)

            # Run the queries against the term table that was just built.
            args = ("{} repl {} -script {}").format(experiment.bf_executable,
                                                    bf_index_path, repl_script)
            repl_log = os.path.join(
                bf_index_path, "repl-log-{}-{}.txt".format(treatment, density))
            print(args)
            run(args, bf_index_path, repl_log)
예제 #8
0
def execute(command, log_file=None):
    """Echo *command*, run it from the current directory, and report completion.

    ``log_file`` is forwarded unchanged as the third argument to ``run``.
    A blank line is printed after the "Finished" marker.
    """
    print(command)
    working_dir = os.getcwd()
    run(command, working_dir, log_file)
    print("Finished")
    print()