def _process_json():
    """Export the ``wikidata`` collection to a JSON dump and Catalan lists.

    Every document that has labels, an ``id`` starting with ``"Q"`` and an
    English label is dumped to ``allwords-wikidata.json``.  Items that also
    have a Catalan label are listed in ``words-ca.txt``; when they have a
    Catalan description too, a line is added to ``descriptions-ca.txt``.

    Per-claim usage counts are handed to ``_write_claims`` and the overall
    counters to ``_show_statistics``.  The three output files are created
    (or truncated) relative to the current working directory.
    """
    cnt = 0
    selected = 0
    en_labels = 0
    ca_labels = 0
    en_descs = 0
    ca_descs = 0
    claims_stats = {}

    db = _create_collection()
    mongo_records = MongoRecords(db)

    # The context managers close every file even when a record raises
    # mid-run.  The two Catalan lists are opened in binary mode and receive
    # UTF-8 bytes encoded once at the write boundary, which avoids mixing
    # bytes and text and behaves the same on Python 2 and Python 3.
    with open("allwords-wikidata.json", "w") as json_file, \
            open("words-ca.txt", "wb") as words_file_ca, \
            open("descriptions-ca.txt", "wb") as descriptions_file_ca:
        for item in db.wikidata.find({}):
            labels = item.get("labels")
            if labels is None:
                continue

            # .get() makes the None check also cover documents that have
            # no "id" key at all, instead of failing with KeyError.
            item_id = item.get("id")
            if item_id is None or not item_id.startswith("Q"):
                continue

            cnt += 1
            en_label, ca_label = mongo_records.get_en_ca_labels(labels)
            if en_label is None:
                continue

            descriptions = item.get("descriptions")
            en_description, ca_description = (
                mongo_records.get_en_ca_descriptions(descriptions)
            )

            data = {"en": en_label}
            en_labels += 1

            if ca_label is not None:
                data["ca"] = ca_label
                ca_labels += 1

            if en_description is not None:
                data["en_description"] = en_description
                en_descs += 1

            if ca_description is not None:
                data["ca_description"] = ca_description
                ca_descs += 1

            data["comment"] = item_id

            claims = item.get("claims")
            if claims is not None:
                claim_names = []
                for claim in claims:
                    claim_names.append(claim)
                    claims_stats[claim] = claims_stats.get(claim, 0) + 1
                # Every claim is followed by a single space, the last one
                # included, so the stored text keeps its trailing space.
                data["claims"] = "".join(name + " " for name in claim_names)

            selected += 1
            # Objects are written back to back with no separator, so the
            # file is a sequence of JSON objects, not one JSON document.
            json.dump(data, json_file, indent=4, separators=(",", ": "))

            if ca_label is not None:
                # The u"" prefix keeps formatting in text on Python 2 too;
                # the line is encoded exactly once, right before writing.
                word_line = u"{0} id:{1}\r\n".format(ca_label, item_id)
                words_file_ca.write(word_line.encode("utf-8"))

                # A description line starts with the Catalan label, so it
                # is only written when that label exists.
                if ca_description is not None:
                    description_line = u"{0} id: {1} - {2}\r\n".format(
                        ca_label, item_id, ca_description
                    )
                    descriptions_file_ca.write(
                        description_line.encode("utf-8")
                    )

        _write_claims(mongo_records, claims_stats)

        stats = {
            "entries": cnt,
            "selected": selected,
            "ca_labels": ca_labels,
            "en_labels": en_labels,
            "ca_descs": ca_descs,
            "en_descs": en_descs,
        }
        # Called inside the with-block because it receives the still-open
        # JSON file object.
        _show_statistics(stats, json_file)