Example 1
0
# Script: build a sparse subject x property indicator matrix from the KB.
#
# usage: <script> <inputs-folder>
import sys

from scipy import sparse
from tqdm import tqdm

from dice import Inputs

if __name__ == "__main__":

    print("Kb extension computation")
    # Exactly one CLI argument (the inputs folder) is required.
    if len(sys.argv) != 2:
        print(__doc__)
        exit()
    inputs_folder = sys.argv[1]

    print("Loading...")
    inputs = Inputs(inputs_folder)
    print("\tLoading KB...")
    kb = inputs.get_kb()
    print("\tLoading taxonomy...")
    taxonomy = inputs.get_taxonomy()
    print("\tLoading probability...")
    probability = inputs.get_probability()

    print("Gathering properties...")
    # Map each subject to the set of property indices asserted for it in the KB.
    properties = dict()
    for fact in tqdm(kb.values()):
        properties.setdefault(fact.subject, set())
        properties[fact.subject].add(probability.index_P[fact.property])

    print("Gathering scores...")
    # Binary indicator matrix: rows are concepts (C), columns are properties (P).
    # lil_matrix is used because it supports cheap incremental assignment.
    scores = sparse.lil_matrix((len(probability.C), len(probability.P)))
    for subject, indices in tqdm(properties.items()):
        index_c = probability.index_C[subject]
        for index_p in indices:
            scores[index_c, index_p] = 1
Example 2
0
def demo(argv):
    """demo
    arguments:  <inputs-folder> <partition-file> <max-facts-per-subjects> <clean-source> <save-folder>

    Builds a per-fact CSV that joins evidence cues and confidence scores
    (rank- and min-max-normalized) with taxonomy-neighborhood and
    property-similarity information taken from a partition file, then
    writes the result to <save-folder>/demo.csv.
    """
    inputs_folder, partition_file, maximum_facts_per_subject, clean_source, save_path = argv
    from dice import Inputs
    from dice.misc import Output
    from dice.constants import Dimensions
    from dice.reason import Variable
    from dice.evidence.cues import JointCue
    from dice.evidence.cues import NecessityCue
    from dice.evidence.cues import SufficiencyCue
    from dice.evidence.cues import ImplicationCue
    from dice.evidence.cues import EntailmentCue
    from dice.evidence.cues import ContradictionCue
    from dice.evidence.cues import EntropyCue
    from tqdm import tqdm
    import pandas as pd
    output = Output(save_path)
    inputs = Inputs(inputs_folder)
    print("Loading inputs...")
    kb = inputs.get_kb()
    taxonomy = inputs.get_taxonomy()
    detective = inputs.get_detective()
    assignment = inputs.get_assignment()
    similarity = inputs.get_similarity_matrix()
    data = list()
    selected_indices = set()
    subjects_representation = dict()
    print("Selecting indices...")
    # Group fact indices by subject so facts per subject can be capped below.
    # NOTE: reuse the already-loaded `kb` instead of calling get_kb() again.
    for fact in tqdm(kb.values()):
        subjects_representation.setdefault(fact.subject, list())
        subjects_representation[fact.subject].append(fact.index)
    print("Thresholding number of facts per subject...")
    limit = int(maximum_facts_per_subject)
    for subject, indices in tqdm(subjects_representation.items()):
        # Keep at most `limit` facts per subject.
        selected_indices.update(indices[:limit])
    print("Gathering facts...")
    for fact in tqdm(kb.values()):
        if fact.index not in selected_indices:
            continue
        cues = detective.cues
        confidence = assignment.confidence
        evidence = detective[fact.index]
        data.append({
            "index": fact.index,
            "source": clean_source,
            "subject": fact.subject,
            "property": fact.property,
            "score": fact.score,
            "evidence_plausible": evidence.plausible,
            "evidence_typical": evidence.typical,
            "evidence_remarkable": evidence.remarkable,
            "evidence_salient": evidence.salient,
            "cue_joint": cues[JointCue][fact.index],
            "cue_necessity": cues[NecessityCue][fact.index],
            "cue_sufficiency": cues[SufficiencyCue][fact.index],
            "cue_implication": cues[ImplicationCue][fact.index],
            "cue_entailment": cues[EntailmentCue][fact.index],
            "cue_contradiction": cues[ContradictionCue][fact.index],
            "cue_entropy": cues[EntropyCue][fact.index],
            "plausible": confidence.get(
                Variable(fact.index, Dimensions.PLAUSIBLE), 0),
            "typical": confidence.get(
                Variable(fact.index, Dimensions.TYPICAL), 0),
            "remarkable": confidence.get(
                Variable(fact.index, Dimensions.REMARKABLE), 0),
            "salient": confidence.get(
                Variable(fact.index, Dimensions.SALIENT), 0),
            # The *_percentile columns start as copies of the raw confidences
            # and are converted to percentile ranks below.
            "plausible_percentile": confidence.get(
                Variable(fact.index, Dimensions.PLAUSIBLE), 0),
            "typical_percentile": confidence.get(
                Variable(fact.index, Dimensions.TYPICAL), 0),
            "remarkable_percentile": confidence.get(
                Variable(fact.index, Dimensions.REMARKABLE), 0),
            "salient_percentile": confidence.get(
                Variable(fact.index, Dimensions.SALIENT), 0),
        })
    df_facts = pd.DataFrame(data)
    del data
    n = df_facts.shape[0]
    print("Normalizing columns...")
    # FIX: the original list contained "cue_implication" twice, rank-normalizing
    # that column a second time; the hard-coded pbar total of 20 hid this.
    rank_columns = [
        "plausible_percentile", "typical_percentile",
        "remarkable_percentile", "salient_percentile",
        "evidence_plausible", "evidence_typical", "evidence_remarkable",
        "evidence_salient", "cue_joint", "cue_necessity",
        "cue_sufficiency", "cue_implication",
        "cue_entailment", "cue_contradiction", "cue_entropy"
    ]
    minmax_columns = ["plausible", "typical", "remarkable", "salient"]
    pbar = tqdm(total=len(rank_columns) + len(minmax_columns))
    # Guard against division by zero when the frame has 0 or 1 rows.
    denominator = max(n - 1, 1)
    for column in rank_columns:
        # Replace each value by its rank, scaled into [0, 1].
        df_facts = df_facts.sort_values(by=column)
        df_facts[column] = [i / denominator for i in range(n)]
        pbar.update(1)
    for column in minmax_columns:
        # Vectorized min-max normalization (the original iterated row by row).
        a, b = df_facts[column].min(), df_facts[column].max()
        spread = (b - a) or 1  # guard: constant column would divide by zero
        df_facts[column] = (df_facts[column] - a) / spread
        pbar.update(1)
    pbar.close()
    print("Gathering partition...")
    data = list()
    with open(partition_file) as file:
        for line in tqdm(file.readlines()):
            # Each line: <count>\t<index>\t<index>... — the first `count`
            # indices are the facts to emit; all indices define the context.
            count, *indices = list(map(int, line.strip().split("\t")))
            subjects = set(kb[j].subject for j in indices)
            properties_all = list(set(kb[j].property for j in indices))
            local_indices = [similarity.index[p] for p in properties_all]
            # Similarity sub-matrix restricted to this line's properties.
            local_matrix = similarity.matrix[local_indices][:, local_indices]
            for i in range(count):
                fact = kb[indices[i]]
                if indices[i] not in selected_indices:
                    continue
                property_index_self = similarity.index[fact.property]
                parents, children, siblings = [], [], []
                if fact.subject in taxonomy.nodes:

                    def _neighbors(candidates):
                        # "neighbor:weight" for each candidate neighbor that
                        # also appears as a subject on this partition line.
                        return [
                            "{neighbor}:{weight}".format(
                                neighbor=neighbor,
                                weight=taxonomy.weight(fact.subject, neighbor),
                            ) for neighbor in subjects.intersection(candidates)
                        ]

                    parents = _neighbors(taxonomy.predecessors(fact.subject))
                    children = _neighbors(taxonomy.successors(fact.subject))
                    siblings = _neighbors(taxonomy.siblings(fact.subject))
                # "property:similarity" for properties similar to this fact's.
                properties = list()
                for j, k in zip(*local_matrix.nonzero()):
                    if local_indices[j] != property_index_self:
                        continue
                    properties.append(properties_all[k] + ":" +
                                      str(local_matrix[j, k]))
                data.append({
                    "index": indices[i],
                    "parents": ";".join(parents),
                    "children": ";".join(children),
                    "siblings": ";".join(siblings),
                    "properties": ";".join(properties),
                })
    df_partition = pd.DataFrame(data)
    # Both frames are indexed by the fact index, so join on the index directly.
    df = df_facts.set_index("index").join(df_partition.set_index("index"),
                                          how="outer")
    df.to_csv(output.path("demo.csv"), index=False)