Example #1
def process(self, temp_folder, n_jobs=2, verbose=False, do_notify=False):
    # Load every input the workers will need before they fork.
    inputs = Inputs(self.inputs_folder)
    inputs.load_kb()
    inputs.load_taxonomy()
    inputs.load_similarity_matrix()
    inputs.load_detective()
    queue_in = mp.JoinableQueue()
    queue_out = mp.Queue()
    jobs = [
        BulkPipelineWorker(inputs, queue_in, queue_out, False)
        for _ in range(n_jobs)
    ]
    for job in jobs:
        job.start()
    # One task per partition element, each writing to its own sub-folder
    # of temp_folder.
    remaining = 0
    for i, part in enumerate(self.partition):
        remaining += 1
        queue_in.put(
            BulkPipelineTask(os.path.join(temp_folder, str(i)),
                             self.parameters, part))
    # One None sentinel per worker tells it to shut down.
    for job in jobs:
        queue_in.put(None)
    # Drain results, advancing the progress bar once per finished task.
    bar = tqdm(total=remaining, disable=not verbose)
    while remaining > 0:
        queue_out.get()
        remaining -= 1
        bar.update(1)
    bar.close()
    if do_notify:
        notify("Finished Processing", str(temp_folder))
Example #2
def __init__(self, inputs_folder, parameters):
    Inputs.__init__(self, inputs_folder)
    # self.load()
    # Override only parameters that already exist among the defaults.
    for key, value in parameters.items():
        if key in self.parameters:
            self.parameters[key] = value
    Parameters.process(**parameters)
    # logger_path is accepted even when it is not a default parameter.
    if "logger_path" in parameters:
        self.parameters["logger_path"] = parameters["logger_path"]
    if self.parameters["log"]:
        self.logger = Output(
            os.path.join(inputs_folder, self.parameters["logger_path"]))
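Assuming this constructor belongs to dice's Parameters class (it chains to Inputs.__init__ and runs Parameters.process on the overrides), a hedged usage sketch follows; the "log" and "logger_path" keys are taken from the checks above, and the folder path is illustrative:

# Illustrative only: overrides are merged into the defaults set up by Inputs.
params = Parameters("data/inputs", {"log": True, "logger_path": "run.log"})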
Example #3
def clone_inputs(argv):
    """clone_inputs
    arguments:  <inputs-folder> <clone-folder>
    """
    from dice import Inputs
    inputs_folder, clone_folder = argv
    Inputs(inputs_folder, load=True).clone(clone_folder).save()
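These CLI helpers all receive argv as a list of positional strings, already stripped of the program and subcommand names, so they can also be called directly; the paths here are illustrative:

# Equivalent to something like: <cli> clone_inputs data/inputs data/inputs_clone
clone_inputs(["data/inputs", "data/inputs_clone"])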
Example #4
def bulk_gatherer(argv):
    """bulk_gatherer
    arguments:  <inputs-folder> <bulk-folder>
    """
    from dice.process import BulkGatherer
    from dice import Inputs
    inputs_folder, bulk_folder = argv
    Inputs(inputs_folder).set_assignment(BulkGatherer(bulk_folder).gather())
Example #5
def tracker(argv):
    """tracker
    arguments:  <inputs-folder> <save-path>
    """
    from dice import Inputs
    from dice.evaluation import Tracker
    inputs_folder, save_path = argv
    tracker = Tracker()
    tracker.build(Inputs(inputs_folder, load=True))
    tracker.save(save_path)
Example #6
def partitioner(argv):
    """partitioner
    arguments:  <inputs-folder> <save-path> <max-cluster-size> <max-neighborhood-size> <max-siblings> <n-jobs>
    """
    from dice.process import Partitioner
    from dice import Inputs
    inputs_folder, save_path, max_cluster_size, max_neighborhood_size, max_siblings, n_jobs = argv
    partitioner = Partitioner(Inputs(inputs_folder), int(max_cluster_size),
                              int(max_neighborhood_size), int(max_siblings))
    partition = partitioner.process(int(n_jobs))
    partitioner.save(partition, save_path)
Example #7
def bulk_kb_extender(argv):
    """bulk_kb_extender
    arguments:  <inputs-folder> <ratio> <verbose> <n_jobs>
    """
    from dice.process import BulkKbExtender
    from dice import Inputs
    inputs_folder, ratio, verbose, n_jobs = argv
    inputs = Inputs(inputs_folder)
    bulk_kb_extender = BulkKbExtender(inputs)
    bulk_kb_extender.process(float(ratio), verbose == "True", int(n_jobs))
Example #8
def bulk_process(argv):
    """bulk_process
    arguments:  <inputs-folder> <bulk-folder> <n-jobs> <partition-file> <tracker-path> <verbose> <notify>
    """
    from dice.process import BulkPipeline
    from dice.process import BulkGatherer
    from dice import Inputs
    from dice.evaluation import Tracker
    from dice.misc import notify
    inputs_folder, bulk_folder, n_jobs, partition_file, tracker_path, verbose, do_notify = argv
    # Each line of the partition file is a tab-separated list of integers.
    partition = list()
    with open(partition_file) as file:
        for line in file:
            partition.append(list(map(int, line.strip().split("\t"))))
    verbose = verbose == "True"
    pipeline = BulkPipeline(inputs_folder, partition)
    # Notification, if requested, is sent below, so the pipeline itself
    # does not notify.
    pipeline.process(bulk_folder, int(n_jobs), verbose, False)
    if verbose:
        print("Gathering assignment...")
    inputs = Inputs(inputs_folder)
    inputs.set_assignment(BulkGatherer(bulk_folder).gather())
    if verbose:
        print("Gathering tracker...")
    tracker = Tracker()
    tracker.build(inputs)
    tracker.save(tracker_path)
    if do_notify == "True":
        notify(
            "Bulk Process", """Finished processing for the following arguments:
        inputs folder: {inputs_folder}
        bulk folder: {bulk_folder}
        n jobs: {n_jobs}
        partition file: {partition_file}
        tracker path: {tracker_path}
        """.format(inputs_folder=inputs_folder,
                   bulk_folder=bulk_folder,
                   n_jobs=n_jobs,
                   partition_file=partition_file,
                   tracker_path=tracker_path))
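bulk_process expects partition_file to contain one cluster per line as tab-separated integers; judging from demo() in Example #11 below, the first field is the cluster's own fact count and the rest are fact indices, though that layout is an inference, not a documented format. A minimal sketch of writing such a file:

# Toy clusters, one per line: count followed by fact indices (assumed layout).
rows = [[2, 10, 11, 57], [1, 42, 99]]
with open("partition.tsv", "w") as fh:
    for row in rows:
        fh.write("\t".join(map(str, row)) + "\n")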
Example #9
def kb_extender(argv):
    """kb_extender
    arguments:  <inputs-folder> <subjects>+
    """
    from dice.process import KbExtender
    from dice import Inputs
    import os
    inputs_folder, *subjects = argv
    inputs = Inputs(inputs_folder)
    kb_extender = KbExtender(inputs)
    for subject in subjects:
        tracker = kb_extender.extend(
            subject, os.path.join(inputs_folder, "dummy", subject))
        tracker.save(
            os.path.join(inputs_folder, "dummy", subject, "tracker.tsv"))
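kb_extender takes a variadic subject list after the inputs folder; an illustrative direct call (the subject names are made up):

kb_extender(["data/inputs", "elephant", "lion"])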
Example #10
"""Kb extension computation.

usage: <script> <inputs-folder>
"""
from tqdm import tqdm
from scipy import sparse
import sys
import os
sys.path.insert(0, "../")
from dice import Inputs

if __name__ == "__main__":

    print("Kb extension computation")
    if len(sys.argv) != 2:
        print(__doc__)
        sys.exit()
    inputs_folder = sys.argv[1]

    print("Loading...")
    inputs = Inputs(inputs_folder)
    print("\tLoading KB...")
    kb = inputs.get_kb()
    print("\tLoading taxonomy...")
    taxonomy = inputs.get_taxonomy()
    print("\tLoading probability...")
    probability = inputs.get_probability()

    print("Gathering properties...")
    properties = dict()
    for fact in tqdm(kb.values()):
        properties.setdefault(fact.subject, set())
        properties[fact.subject].add(probability.index_P[fact.property])

    print("Gathering scores...")
    scores = sparse.lil_matrix((len(probability.C), len(probability.P)))
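The snippet is truncated here, but the choice of scipy's lil_matrix fits the incremental, cell-by-cell fills that presumably follow; a toy illustration of assignment into a LIL matrix:

from scipy import sparse

m = sparse.lil_matrix((3, 4))  # LIL: cheap incremental writes
m[0, 1] = 0.5
m[2, 3] = 1.0
print(m.toarray())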
Example #11
def demo(argv):
    """demo
    arguments:  <inputs-folder> <partition-file> <max-facts-per-subjects> <clean-source> <save-folder>
    """
    inputs_folder, partition_file, maximum_facts_per_subject, clean_source, save_path = argv
    from dice import Inputs
    from dice.misc import Output
    from dice.constants import Dimensions
    from dice.reason import Variable
    from dice.evidence.cues import JointCue
    from dice.evidence.cues import NecessityCue
    from dice.evidence.cues import SufficiencyCue
    from dice.evidence.cues import ImplicationCue
    from dice.evidence.cues import EntailmentCue
    from dice.evidence.cues import ContradictionCue
    from dice.evidence.cues import EntropyCue
    from tqdm import tqdm
    import pandas as pd
    output = Output(save_path)
    inputs = Inputs(inputs_folder)
    print("Loading inputs...")
    kb = inputs.get_kb()
    taxonomy = inputs.get_taxonomy()
    detective = inputs.get_detective()
    assignment = inputs.get_assignment()
    similarity = inputs.get_similarity_matrix()
    data = list()
    selected_indices = set()
    subjects_representation = dict()
    print("Selecting indices...")
    for fact in tqdm(inputs.get_kb().values()):
        subjects_representation.setdefault(fact.subject, list())
        subjects_representation[fact.subject].append(fact.index)
    print("Thresholding number of facts per subject...")
    for subject, indices in tqdm(subjects_representation.items()):
        # if len(indices) > 20:
        selected_indices = selected_indices.union(
            indices[:int(maximum_facts_per_subject)])
    print("Gathering facts...")
    for fact in tqdm(inputs.get_kb().values()):
        if fact.index not in selected_indices:
            continue
        data.append({
            "index":
            fact.index,
            "source":
            clean_source,
            "subject":
            fact.subject,
            "property":
            fact.property,
            "score":
            fact.score,
            "evidence_plausible":
            detective[fact.index].plausible,
            "evidence_typical":
            detective[fact.index].typical,
            "evidence_remarkable":
            detective[fact.index].remarkable,
            "evidence_salient":
            detective[fact.index].salient,
            "cue_joint":
            detective.cues[JointCue][fact.index],
            "cue_necessity":
            detective.cues[NecessityCue][fact.index],
            "cue_sufficiency":
            detective.cues[SufficiencyCue][fact.index],
            "cue_implication":
            detective.cues[ImplicationCue][fact.index],
            "cue_entailment":
            detective.cues[EntailmentCue][fact.index],
            "cue_contradiction":
            detective.cues[ContradictionCue][fact.index],
            "cue_entropy":
            detective.cues[EntropyCue][fact.index],
            "plausible":
            assignment.confidence.get(
                Variable(fact.index, Dimensions.PLAUSIBLE), 0),
            "typical":
            assignment.confidence.get(Variable(fact.index, Dimensions.TYPICAL),
                                      0),
            "remarkable":
            assignment.confidence.get(
                Variable(fact.index, Dimensions.REMARKABLE), 0),
            "salient":
            assignment.confidence.get(Variable(fact.index, Dimensions.SALIENT),
                                      0),
            "plausible_percentile":
            assignment.confidence.get(
                Variable(fact.index, Dimensions.PLAUSIBLE), 0),
            "typical_percentile":
            assignment.confidence.get(Variable(fact.index, Dimensions.TYPICAL),
                                      0),
            "remarkable_percentile":
            assignment.confidence.get(
                Variable(fact.index, Dimensions.REMARKABLE), 0),
            "salient_percentile":
            assignment.confidence.get(Variable(fact.index, Dimensions.SALIENT),
                                      0),
        })
    df_facts = pd.DataFrame(data)
    del data
    n = df_facts.shape[0]
    print("Normalizing columns...")
    pbar = tqdm(total=19)
    for column in [
            "plausible_percentile", "typical_percentile",
            "remarkable_percentile", "salient_percentile",
            "evidence_plausible", "evidence_typical", "evidence_remarkable",
            "evidence_salient", "cue_joint", "cue_necessity",
            "cue_sufficiency", "cue_implication", "cue_entailment",
            "cue_contradiction", "cue_entropy"
    ]:
        # Replace each column with its rank scaled into [0, 1].
        df_facts = df_facts.sort_values(by=column)
        df_facts[column] = [i / (n - 1) for i in range(n)]
        pbar.update(1)
    for column in ["plausible", "typical", "remarkable", "salient"]:
        values = list()
        a, b = df_facts[column].min(), df_facts[column].max()
        for index, row in df_facts.iterrows():
            values.append((row[column] - a) / (b - a))
        df_facts[column] = values
        pbar.update(1)
    pbar.close()
    print("Gathering partition...")
    data = list()
    with open(partition_file) as file:
        for line in tqdm(file.readlines()):
            # First field: the cluster's own fact count; remaining fields:
            # fact indices into the kb.
            count, *indices = list(map(int, line.strip().split("\t")))
            subjects = set([kb[j].subject for j in indices])
            properties_all = list(set([kb[j].property for j in indices]))
            local_indices = [similarity.index[p] for p in properties_all]
            local_matrix = similarity.matrix[local_indices][:, local_indices]
            for i in range(count):
                fact = kb[indices[i]]
                if indices[i] not in selected_indices:
                    continue
                property_index_self = similarity.index[fact.property]
                parents = list()
                children = list()
                siblings = list()
                if fact.subject in taxonomy.nodes:
                    parents = [
                        "{neighbor}:{weight}".format(
                            neighbor=neighbor,
                            weight=taxonomy.weight(fact.subject, neighbor),
                        ) for neighbor in subjects.intersection(
                            taxonomy.predecessors(fact.subject))
                    ]
                    children = [
                        "{neighbor}:{weight}".format(
                            neighbor=neighbor,
                            weight=taxonomy.weight(fact.subject, neighbor),
                        ) for neighbor in subjects.intersection(
                            taxonomy.successors(fact.subject))
                    ]
                    siblings = [
                        "{neighbor}:{weight}".format(
                            neighbor=neighbor,
                            weight=taxonomy.weight(fact.subject, neighbor),
                        ) for neighbor in subjects.intersection(
                            taxonomy.siblings(fact.subject))
                    ]
                # Collect properties similar to this fact's property,
                # annotated with their similarity weights.
                properties = list()
                for j, k in zip(*local_matrix.nonzero()):
                    if local_indices[j] != property_index_self:
                        continue
                    properties.append(properties_all[k] + ":" +
                                      str(local_matrix[j, k]))
                data.append({
                    "index": indices[i],
                    "parents": ";".join(parents),
                    "children": ";".join(children),
                    "siblings": ";".join(siblings),
                    "properties": ";".join(properties),
                })
    df_partition = pd.DataFrame(data)
    df = df_facts.set_index("index").join(df_partition.set_index("index"),
                                          how="outer")
    df.to_csv(output.path("demo.csv"), index=False)
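The percentile pass in demo() replaces each selected column with its rank scaled into [0, 1]; a self-contained toy version of that transform:

import pandas as pd

df = pd.DataFrame({"score": [0.9, 0.1, 0.5, 0.3]})
n = df.shape[0]
df = df.sort_values(by="score")
df["score"] = [i / (n - 1) for i in range(n)]  # ranks 0, 1/3, 2/3, 1
print(df)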