def convert_rdf_to_dic(ids, data_type):
    """Stream the gzipped Freebase RDF dump and emit matching entities as JSON.

    ids       -- sorted sequence of subject ids to extract (looked up with
                 binary_search)
    data_type -- entity category; the special value "all" accepts every
                 subject id

    Prints the opening "[" of a JSON array, flushes serialized
    {"itemInfo": {...}} objects in batches through print_data, and finally
    prints a dummy trailing element that closes the array.
    """
    item = {}
    data = {}
    old_item_id = ""
    item_id = ""
    find_index = -1
    results = []
    print("[")  # opening bracket of the output JSON array
    with gzip.open(global_gzip_filename, 'r') as infile:
        for line in infile:
            # only triples whose subject is a Freebase machine id matter
            if not line.startswith("<http://rdf.freebase.com/ns/m."):
                continue
            splited_line = line.split('\t')
            # flush a batch of results to file (print_data is presumed to
            # drain/handle the list -- TODO confirm against print_data)
            if len(results) >= 100000:
                print_data(results, data_type)
            old_item_id = item_id
            item_id = splited_line[0]
            # subject changed --> finish the previous entity and start a new
            # saving dict for the next one
            if old_item_id != item_id:
                # determine whether this id belongs to an entity we want
                find_index = binary_search(ids, item_id)
                if data_type == "all":
                    find_index = 1
                if find_index != -1:
                    # was `len(data) is not 0` -- identity comparison against
                    # an int literal; use truthiness instead
                    if data and "name" in data:
                        item["itemInfo"] = data
                        results.append(json.dumps(item, separators=(',', ':')))
                    item.clear()
                    data.clear()
                    data["id"] = item_id.replace(
                        "<http://rdf.freebase.com/ns/m.", "")[:-1]
            # NOTE(review): the last wanted entity in the file is never
            # appended to results (it is only flushed when the NEXT wanted
            # subject appears) -- preserved as in the original.
            if find_index != -1:
                # bind all predicates belonging to the current subject id
                determine_content(splited_line, data, data_type)
    print_data(results, data_type)
    print("{\"itemInfo\":{}}]")  # dummy trailing element closes the array
def convert_input_to_knowledge_base_format(input_data):
    """Convert geonames dump rows into GeoName objects.

    input_data -- iterable of tab-separated geonames records

    Filters rows by feature class/code and by the module-level ``data_type``
    (not a parameter here -- presumably a global; verify against callers),
    resolves the best wikipedia url via the links/statistics tables, collects
    alternate names, and returns a list of GeoName instances.
    """
    geonames_part = []
    for l in input_data:
        fields = l.strip().split("\t")
        geonameid = fields[0]
        population = fields[14]
        wikipedia_url = -1
        # Feature-class / feature-code + data_type filter.  The original used
        # `is` / `is not` for string comparison, which only worked by accident
        # of CPython's interning of short strings -- replaced with == / !=.
        if not ((fields[6] != "S"
                 or (fields[6] == "S"
                     and fields[7] in global_S_wanted_freature_codes))
                and ((data_type == "location" and fields[7] != "MUS")
                     or (data_type == "museum" and (fields[7] == "MUS")
                         or data_type == "all"))):
            continue
        # WIKIPEDIA URL
        wikipedia_url_all_occur = binary_search_all_occur(
            global_links, geonameid, cross_columns=True, col_sep="\t",
            finding_column=0, return_column=1)
        if len(wikipedia_url_all_occur) == 0:
            continue
        # Pick the better-rated url from the geonames links file based on
        # wikipedia statistics (e.g. Istanbul instead of Constantinople).
        maxscore = -1
        go_next = False
        if len(wikipedia_url_all_occur) > 1:
            for url in wikipedia_url_all_occur:
                if global_allowed_url not in url:
                    # any url outside the allowed domain skips the entity
                    go_next = True
                    continue
                url = get_wikipedia_url(url)
                score = binary_search(
                    global_wikipedia_statistic,
                    url.replace(global_allowed_url, ""),
                    cross_columns=True, col_sep="\t",
                    finding_column=0, return_column=2)
                if int(score) > int(maxscore):
                    maxscore = score
                    wikipedia_url = url
        if go_next:
            continue
        if maxscore == -1:
            # single candidate (or none scored) -- take the first occurrence
            wikipedia_url = wikipedia_url_all_occur[0]
        wikipedia_url = urllib.unquote(wikipedia_url)  # Python 2 urllib
        entity_type = "museum" if "museum" == data_type else "location"
        name = fields[1]
        # alternative names: keep only those present in the allowed set
        names = fields[3].split(",")
        alternatenames = set()
        for n in names:
            n = n.strip()
            if n and n.lower() in global_alternative_names:
                alternatenames.add(n)
        state = binary_search(
            global_country_codes, fields[8], cross_columns=True,
            col_sep="\t", finding_column=0, return_column=1)
        if state != -1 and state:
            alternatenames.add(name + ", " + state)
            country = state
        else:
            country = ""
        # substate (USA states)
        if fields[10]:
            substate = binary_search(
                subcountry_code, fields[8] + "." + fields[10],
                cross_columns=True, col_sep="\t",
                finding_column=0, return_column=1)
            if substate != -1 and substate:
                alternatenames.add(name + ", " + substate)
        latitude = fields[4]
        longitude = fields[5]
        class_code = fields[6]
        feature_code = fields[7]
        elevation = fields[15]
        new_geoname = GeoName(
            geonameid, name, alternatenames, latitude, longitude,
            class_code, feature_code, country, population, elevation,
            wikipedia_url, entity_type)
        geonames_part.append(new_geoname)
    return geonames_part
def load_data(_filename, data_type):
    """Build Freebase* entity objects from a JSON dump.

    _filename -- an open file object holding a JSON array of
                 {"itemInfo": {...}} records (passed straight to json.load)
    data_type -- which entity class to build: "artist", "person", "artwork",
                 "location", "museum", "event", "visual_art_form",
                 "visual_art_genre", "art_period_movement",
                 "visual_art_medium", "nationalities" or "all"

    Returns a list of constructed entity objects.
    """
    data = json.load(_filename)
    results = []
    for d in data:
        d = d["itemInfo"]
        if "name" not in d:
            continue
        if d["id"] in global_unwanted_ids:
            continue
        # Drop "locations" that are really buildings/organizations/transit
        # stops, unless they are also a citytown or country.
        if "object_type" in d and data_type == "location":
            if (not any("location.citytown" in ot for ot in d["object_type"])
                    and not any("location.country" in ot
                                for ot in d["object_type"])):
                if any("organization.organization>" in ot
                       or "metropolitan_transit.transit_stop" in ot
                       or "architecture.building" in ot
                       or "architecture.structure" in ot
                       for ot in d["object_type"]):
                    continue
        name = ""
        nationality_name = ""
        if data_type == "nationalities":
            nationality_name = format_string(d["name"])
        else:
            name = format_string(d["name"])
        freebase_id = format_string(d["id"])
        # all
        alias = []
        description = ""
        image = []
        wikipedia_url = ""
        # artist, person
        period_or_movement = []
        influenced = []
        influenced_by = []
        place_of_birth = ""
        place_of_death = ""
        date_of_birth = ""
        date_of_death = ""
        profession = []
        art_form = []
        places_lived = []
        gender = ""
        nationality = []
        # artwork (art_form / period_or_movement deliberately re-bound to the
        # artwork defaults here, exactly as in the original)
        artist = []
        art_subject = []
        art_form = ""
        art_genre = []
        media = []
        support = []
        period_or_movement = []
        location = []
        date_begun = ""
        date_completed = ""
        owner = []
        dimensions = {"height": "", "width": "", "depth": ""}
        # location
        latitude = ""
        longitude = ""
        loc_type = ""
        country = ""
        population = []
        # museum
        type_of_museum = []
        established = ""
        director = ""
        visitors = ""
        address = {"citytown": "", "postal_code": "",
                   "state_province_region": "", "street_address": ""}
        # event
        start_date = ""
        end_date = ""
        locations = []
        notable_types = ""
        # country
        short_name = ""
        adjectival_form = []

        # ---- all entity types ----
        if "alias" in d:
            for f in d["alias"]:
                alias.append(re.sub(global_replace_backslash, '\'',
                                    format_string(f)))
        if "description" in d:
            description = re.sub(
                global_replace_new_line, ' ',
                re.sub(global_replace_backslash, '\'',
                       format_string(d["description"])))
        if "article" in d:
            # fall back to the full article text when description is empty
            if description.strip() == "":
                description = re.sub(
                    global_replace_new_line, ' ',
                    re.sub(global_replace_backslash, '\'',
                           format_string(d["article"])))
        if "image" in d:
            for f in d["image"]:
                image.append("freebase/" + format_string(f["path"]))
        if "key_wikipedia_en" in d:
            wikipedia_url = format_string(d["key_wikipedia_en"])
        # ---- artist, person ----
        if "associated_periods_or_movements" in d:
            for f in d["associated_periods_or_movements"]:
                period_or_movement.append(format_string(f))
        if "influenced" in d:
            for f in d["influenced"]:
                if "value" in f:
                    influenced.append(format_string(f["value"]))
        if "influenced_by" in d:
            for f in d["influenced_by"]:
                if "value" in f:
                    influenced_by.append(format_string(f["value"]))
        if "place_of_birth" in d:
            if "value" in d["place_of_birth"]:
                place_of_birth = format_string(d["place_of_birth"]["value"])
        if "place_of_death" in d:
            if "value" in d["place_of_death"]:
                place_of_death = format_string(d["place_of_death"]["value"])
        if "date_of_birth" in d:
            date_of_birth = format_string(d["date_of_birth"])
        if "date_of_death" in d:
            date_of_death = format_string(d["date_of_death"])
        if "profession" in d:
            for f in d["profession"]:
                profession.append(format_string(f).replace("-GB", ""))
        if "places_lived" in d:
            for f in d["places_lived"]:
                if "value" in f:
                    places_lived.append(format_string(f["value"]))
        if "nationality" in d:
            for f in d["nationality"]:
                # map nationality adjective -> country name when possible
                nationality_string = binary_search(
                    global_nationality_country, format_string(f),
                    cross_columns=True, col_sep="\t")
                if nationality_string == -1:
                    nationality_string = f
                try:
                    nationality.append(format_string(nationality_string))
                except Exception:
                    # was a bare except; format_string apparently can fail on
                    # some lookup results -- fall back to the raw field
                    nationality.append(format_string(f))
        if "gender" in d:
            gender = format_string(d["gender"])
        # ---- artwork ----
        if "artist" in d:
            for f in d["artist"]:
                artist.append(format_string(f["value"]))
        if "art_subject" in d:
            for f in d["art_subject"]:
                art_subject.append(format_string(f))
        if "art_form" in d:
            art_form = format_string(d["art_form"])
        if "art_genre" in d:
            for f in d["art_genre"]:
                art_genre.append(format_string(f))
        if "media" in d:
            for f in d["media"]:
                media.append(format_string(f))
        if "support" in d:
            for f in d["support"]:
                support.append(format_string(f))
        if "period_or_movement" in d:
            for f in d["period_or_movement"]:
                period_or_movement.append(format_string(f))
        if "location" in d:
            for f in d["location"]:
                location.append(format_string(f["value"]))
        if "date_begun" in d:
            date_begun = format_string(d["date_begun"])
        if "date_completed" in d:
            date_completed = format_string(d["date_completed"])
        if "wikipedia_url" in d:
            # BUG FIX: wikipedia_url is a string -- the original called
            # .append() on it, which raised AttributeError.  Keep the last
            # url, stripped of its surrounding <...> brackets.
            for f in d["wikipedia_url"]:
                wikipedia_url = format_string(f[1:-1])
        if "owner" in d:
            for f in d["owner"]:
                owner.append(format_string(f["value"]))
        if "dimensions" in d:
            # BUG FIX: the original read from the stale loop variable `f`
            # and wrote into an undefined name `dimension`.
            fields = d["dimensions"]
            if "height" in fields:
                dimensions["height"] = format_string(fields["height"])
            if "width" in fields:
                dimensions["width"] = format_string(fields["width"])
            if "depth" in fields:
                dimensions["depth"] = format_string(fields["depth"])
        # ---- location ----
        if "population" in d:
            for f in d["population"]:
                population.append(format_string(f))
        if "latitude" in d:
            latitude = format_string(d["latitude"])
        if "longitude" in d:
            longitude = format_string(d["longitude"])
        if "country" in d:
            if "value" in d["country"]:
                country = format_string(d["country"]["value"])
        if "loc_type" in d:
            loc_type = format_string(d["loc_type"])
        # ---- museum ----
        if "type_of_museum" in d:
            for f in d["type_of_museum"]:
                type_of_museum.append(format_string(f))
        if "established" in d:
            established = format_string(d["established"])
        if "director" in d:
            if "value" in d["director"]:
                director = format_string(d["director"]["value"])
        if "visitors" in d:
            visitors = format_string(d["visitors"])
        if "address" in d:
            if "citytown" in d["address"]:
                if "value" in d["address"]["citytown"]:
                    # BUG FIX: the original indexed d["address"]["citytmwn"]
                    # (typo), raising KeyError whenever this branch ran.
                    address["citytown"] = format_string(
                        d["address"]["citytown"]["value"])
            if "postal_code" in d["address"]:
                address["postal_code"] = format_string(
                    d["address"]["postal_code"])
            if "state_province_region" in d["address"]:
                address["state_province_region"] = format_string(
                    d["address"]["state_province_region"])
            if "street_address" in d["address"]:
                address["street_address"] = format_string(
                    d["address"]["street_address"])
        # ---- event ----
        if "start_date" in d:
            start_date = format_string(d["start_date"])
        if "end_date" in d:
            end_date = format_string(d["end_date"])
        if "locations" in d:
            for f in d["locations"]:
                if "value" in f:
                    locations.append(format_string(f["value"]))
        if "notable_types" in d:
            notable_types = format_string(d["notable_types"][0])
        # ---- country ----
        if "short_name" in d:
            short_name = format_string(d["short_name"][0])
        if "adjectival_form" in d and data_type == "nationalities":
            fields = d["adjectival_form"]
            if len(fields) > 0 and fields[0]:
                name = format_string(fields[0])
            for f in fields:
                adjectival_form.append(format_string(f))
        if name.strip() == "":
            continue

        new_entity = None
        if data_type == "artist":
            new_entity = FreebaseArtist(
                "a:" + freebase_id, name, alias, description, image,
                period_or_movement, influenced, influenced_by,
                place_of_birth, place_of_death, date_of_birth, date_of_death,
                wikipedia_url, profession, art_form, places_lived, gender,
                nationality)
        elif data_type == "person":
            new_entity = FreebasePerson(
                "p:" + freebase_id, name, alias, description, image,
                period_or_movement, place_of_birth, place_of_death,
                date_of_birth, date_of_death, wikipedia_url, profession,
                places_lived, gender, nationality)
        elif data_type == "artwork":
            new_entity = FreebaseArtwork(
                "w:" + freebase_id, name, alias, description, image, artist,
                art_subject, art_form, art_genre, media, support,
                period_or_movement, location, date_begun, date_completed,
                wikipedia_url, owner, dimensions)
        elif data_type == "location":
            new_entity = FreebaseLocation(
                "l:" + freebase_id, name, alias, description, image,
                wikipedia_url, latitude, longitude, loc_type, population,
                adjectival_form)
        elif data_type == "museum":
            new_entity = FreebaseMuseum(
                "c:" + freebase_id, name, alias, description, image,
                wikipedia_url, type_of_museum, established, director,
                visitors, address, latitude, longitude)
        elif data_type == "event":
            new_entity = FreebaseEvent(
                "e:" + freebase_id, name, alias, description, image,
                wikipedia_url, start_date, end_date, locations,
                notable_types)
        elif data_type == "visual_art_form":
            new_entity = FreebaseEntity(
                "f:" + freebase_id, name, alias, description, image,
                wikipedia_url)
            new_entity.set_type("visual_art_form")
        elif data_type == "visual_art_genre":
            new_entity = FreebaseEntity(
                "g:" + freebase_id, name, alias, description, image,
                wikipedia_url)
            new_entity.set_type("visual_art_genre")
        elif data_type == "art_period_movement":
            new_entity = FreebaseEntity(
                "m:" + freebase_id, name, alias, description, image,
                wikipedia_url)
            new_entity.set_type("art_period_movement")
        elif data_type == "visual_art_medium":
            new_entity = FreebaseEntity(
                "d:" + freebase_id, name, alias, description, image,
                wikipedia_url)
            new_entity.set_type("visual_art_medium")
        elif data_type == "nationalities":
            new_entity = FreebaseNationality(
                "n:" + freebase_id, name, alias, description, image,
                wikipedia_url, short_name, adjectival_form,
                nationality_name)
        elif data_type == "all":
            # NOTE(review): the id prefix is "f" WITHOUT ":" here, unlike
            # every other branch ("f:", "g:", ...) -- looks suspicious but
            # preserved as-is; confirm against consumers of these ids.
            new_entity = FreebaseEntity(
                "f" + freebase_id, name, alias, description, image,
                wikipedia_url)
        if new_entity is not None:
            results.append(new_entity)
    return results
def bind(
        original_key, key, splited_line, data, save_id=False,
        save_like_array=True, only_first_letter=False, is_foreign_key=True,
        group_name=None, language_data=False):
    """Bind one RDF triple's object onto the entity dict `data` under `key`.

    original_key      -- full Freebase predicate URI the triple must match;
                         mismatches return immediately without touching data
    key               -- name under which the value is stored in `data`
    splited_line      -- [subject, predicate, object] fields of the triple
    data              -- entity dict being built (mutated in place)
    save_id           -- also store the raw Freebase id next to the label
    save_like_array   -- store values in a list under `key` (else a scalar)
    only_first_letter -- keep only the first character of the label
    is_foreign_key    -- object is an m-id that must be resolved to a label
    group_name        -- when set, append {key: value} dicts into
                         data[group_name] instead of data[key]
    language_data     -- unused; kept for call-site compatibility
    """
    if splited_line[1] != original_key:
        return
    FK = str(splited_line[2]).replace('"', '')
    # ---- grouped properties (e.g. address parts, dimensions) ----
    if group_name is not None:
        if group_name not in data:
            data[group_name] = []
        if is_foreign_key:
            FK = FK.replace("<http://rdf.freebase.com/ns/m.", "")[:-1]
            if key == "height":
                FK = binary_search(global_height_meters, FK,
                                   cross_columns=True, col_sep="\t")
            elif key == "width":
                FK = binary_search(global_width_meters, FK,
                                   cross_columns=True, col_sep="\t")
            elif key == "depth":
                FK = binary_search(global_depth_meters, FK,
                                   cross_columns=True, col_sep="\t")
            elif key == "citytown":
                # two-step resolution: id -> linked id -> label
                # NOTE(review): on lookup failure FK keeps the raw stripped
                # id -- confirm this is the intended fallback
                next_FK = binary_search(global_citytowns, FK,
                                        cross_columns=True, col_sep="\t")
                if next_FK != -1:
                    next_FK = next_FK.replace(
                        "<http://rdf.freebase.com/ns/m.", "")[:-1]
                    FK = search_in_labels(str(next_FK))
            elif key == "postal_code":
                next_FK = binary_search(global_postal_codes, FK,
                                        cross_columns=True, col_sep="\t")
                if next_FK != -1:
                    next_FK = next_FK.replace(
                        "<http://rdf.freebase.com/ns/m.", "")[:-1]
                    FK = search_in_labels(str(next_FK))
            elif key == "state_province_region":
                next_FK = binary_search(global_state_province_regions, FK,
                                        cross_columns=True, col_sep="\t")
                if next_FK != -1:
                    next_FK = next_FK.replace(
                        "<http://rdf.freebase.com/ns/m.", "")[:-1]
                    FK = search_in_labels(str(next_FK))
            else:
                FK = search_in_labels(FK)
            if FK == -1:
                return  # unresolved id -- drop the triple
        FK = remove_language_tags(FK)
        data[group_name].append({key: FK})
        return
    if key not in data and save_like_array:
        data[key] = []
    if is_foreign_key:
        label = -1
        next_FK = -1
        FK = FK.replace("<http://rdf.freebase.com/ns/m.", "")[:-1]
        if key == "latitude":
            label = binary_search(global_latitudes, FK,
                                  cross_columns=True, col_sep="\t")
        elif key == "longitude":
            label = binary_search(global_longitudes, FK,
                                  cross_columns=True, col_sep="\t")
        elif key == "country":
            next_FK = binary_search(global_countries, FK,
                                    cross_columns=True, col_sep="\t")
            if next_FK != -1:
                next_FK = next_FK.replace(
                    "<http://rdf.freebase.com/ns/m.", "")[:-1]
                label = search_in_labels(str(next_FK))
        elif key == "population":
            label = binary_search(global_numbers, FK,
                                  cross_columns=True, col_sep="\t")
        elif key == "places_lived":
            next_FK = binary_search(global_locations, FK,
                                    cross_columns=True, col_sep="\t")
            if next_FK != -1:
                next_FK = next_FK.replace(
                    "<http://rdf.freebase.com/ns/m.", "")[:-1]
                label = search_in_labels(str(next_FK))
        elif key == "owner":
            next_FK = binary_search(global_owners, FK,
                                    cross_columns=True, col_sep="\t")
            if next_FK != -1:
                next_FK = next_FK.replace(
                    "<http://rdf.freebase.com/ns/m.", "")[:-1]
                label = search_in_labels(str(next_FK))
        elif original_key == "<http://rdf.freebase.com/ns/visual_art.artwork.locations>":
            next_FK = binary_search(global_artwork_location_relationship, FK,
                                    cross_columns=True, col_sep="\t")
            if next_FK != -1:
                next_FK = next_FK.replace(
                    "<http://rdf.freebase.com/ns/m.", "")[:-1]
                label = search_in_labels(str(next_FK))
        else:
            label = search_in_labels(FK)
        if label == -1:
            return  # unresolved id -- drop the triple
        label = remove_language_tags(label)
        if save_id:
            if save_like_array:
                data[key].append({
                    'value': label,
                    'id': splited_line[2].replace(
                        "<http://rdf.freebase.com/ns/m.", "")[:-1]})
            else:
                data[key] = {
                    'value': label,
                    'id': splited_line[2].replace(
                        "<http://rdf.freebase.com/ns/m.", "")[:-1]}
        else:
            if only_first_letter:
                label = label[0]
            if save_like_array:
                data[key].append(label)
            else:
                data[key] = label
        return
    else:
        # literal value (not an m-id)
        # NOTE(review): "%13" -> en dash looks like an ad-hoc escape used by
        # an upstream step; preserved as-is
        value = FK.replace("%13", "–")
        value = remove_language_tags(value)
        if save_like_array:
            if "en.wikipedia.org" in value and "%" in value:
                try:
                    # percent-decode wikipedia urls (Python 2 urllib);
                    # was a bare except -- narrowed to Exception
                    value = urllib.unquote(value).decode('cp1250')
                except Exception:
                    pass  # best-effort: keep the raw url on decode failure
            data[key].append(value)
        else:
            # strip XMLSchema datatype suffixes from date/year literals
            value = value.replace(
                "^^<http://www.w3.org/2001/XMLSchema", "").replace(
                "#gYear>", "").replace(
                "#date>", "").replace(
                "#gYearMonth>", "").replace(
                ">", "")
            data[key] = value
def search_in_labels(label_id):
    """Resolve a Freebase id to its label via the global labels table.

    Returns the matching label string, or -1 when binary_search finds
    no entry for label_id.
    """
    labels_table = global_labels
    return binary_search(labels_table, label_id,
                         cross_columns=True, col_sep="\t")