Esempio n. 1
0
    def pickle_vt(self):
        """Collect every JSON document from the globbed files into one pickle.

        Expands ``self.json_path`` with ``glob``, parses each line of every
        matched file with ``json.loads`` (one document per line), and writes
        the resulting list to ``self.pickle_path`` in a single
        ``pickle.dump`` call. Progress is reported on stdout per file.

        All documents are held in memory at once before being pickled.
        """
        matched_paths = glob.glob(self.json_path)
        print("\n This may take some time")

        records = []
        for position, path in enumerate(matched_paths, start=1):
            # utils.fopen is a project helper; presumably a transparent
            # opener yielding text lines -- confirm against utils.
            with utils.fopen(path) as handle:
                records.extend(json.loads(raw) for raw in handle)
            print("\n JSON DOC #", str(position))
            print("DOC NAME: ", path)

        with open(self.pickle_path, "wb") as sink:
            print("Starting pickling ")
            pickle.dump(records, sink)
        print("Done Pickling All VT JSON to One Doc ")
Esempio n. 2
0
from json_flattener import JsonFlattener

# Glob pattern for the compact JSON files (one directory level below compact/).
data_dir = "../../../data/static_vt/compact/*/*.json"
files = glob.glob(data_dir)

# Flattener instance and the list that accumulates every flattened document.
flattener = JsonFlattener()
docs = []
total_files = len(files)
print("\n Reading all Json into memory")

# Progress is measured in files processed, not in documents read: a file may
# hold any number of JSON lines, so len(docs) / total_files can drift past
# 100% (or lag behind) whenever a file does not contain exactly one line.
for files_done, file in enumerate(files, start=1):
    # Each line of a file is parsed as its own JSON document, flattened,
    # and appended to docs.
    with utils.fopen(file) as f:
        for line in f:
            doc = json.loads(line)
            flat_doc = flattener.flatten_json_iterative_solution(doc)
            docs.append(flat_doc)
    # 50-cell bar redrawn in place via the leading carriage return.
    done = 50 * files_done // total_files
    sys.stdout.write("\r[{}{}] {}%".format("█" * done, "." * (50 - done),
                                           100 * files_done // total_files))
    sys.stdout.flush()

# initializing JsonVectorizer
vectorizer = JsonVectorizer()

# Counter for the vectorizer-extension step; the code that uses it lies
# beyond this excerpt.
processed = 0
print("\nExtending with docs")
Esempio n. 3
0
def generate_dict(src, idx_list, target_dict, cnt):
    """Append ``cnt`` to the ``target_dict`` bucket named by a nested value.

    Walks ``src`` along the keys in ``idx_list``. If the whole path exists
    and the value found there equals one of ``target_dict``'s keys, ``cnt``
    is appended to that key's list. Otherwise nothing happens.

    Args:
        src: Nested mapping to look into (e.g. a parsed JSON document).
        idx_list: Sequence of keys describing the path into ``src``.
        target_dict: Mapping from expected values to lists; mutated in place.
        cnt: Item to append to the matching list.

    Returns:
        None. The result is the in-place mutation of ``target_dict``.
    """
    from collections.abc import Mapping

    cur_level = src
    for item in idx_list:
        # A missing key or a non-mapping intermediate value (string, list,
        # number, None) both mean the path does not exist in this document.
        # Stop immediately instead of continuing on a stale level.
        if not isinstance(cur_level, Mapping) or item not in cur_level:
            return
        cur_level = cur_level[item]

    try:
        known = cur_level in target_dict
    except TypeError:
        # Unhashable leaf (dict/list): it can never equal a dictionary key.
        return
    if known:
        target_dict[cur_level].append(cnt)


with fopen(osp.join(DATA_PATH, 'sample10000.json')) as f:
    for line in f:
        doc = json.loads(line)

        generate_dict(doc, location_idx_list, location_dict, cnt)
        generate_dict(doc, certificate_idx_list, certificate_dict, cnt)

        cur_level = doc
        flag = True
        for item in server_idx_list:
            if item in cur_level.keys():
                cur_level = cur_level[item]
            else:
                flag = False

        if flag: