def convert_rdf_to_dic(ids, data_type):
  """Stream a gzipped Freebase RDF dump and emit matching entities as JSON.

  Reads global_gzip_filename line by line, groups consecutive triples that
  share a subject mid, and for every subject found in *ids* (or every
  subject when data_type == "all") collects its predicates via
  determine_content(). Completed entities are buffered in *results* and
  flushed with print_data(). A JSON array wrapper is printed around the
  whole output.

  Args:
    ids: sorted sequence of Freebase mids, searched with binary_search().
    data_type: entity category forwarded to determine_content() and
      print_data(); the special value "all" accepts every subject.
  """
  item = {}
  data = {}
  old_item_id = ""
  item_id = ""
  find_index = -1
  results = []

  print("[")

  with gzip.open(global_gzip_filename, 'r') as infile:
    for line in infile:
      # Only subject lines in the /ns/m. namespace are of interest.
      if not line.startswith("<http://rdf.freebase.com/ns/m."):
        continue

      splited_line = line.split('\t')

      # Flush a partial batch of results to keep memory bounded.
      # NOTE(review): *results* is not cleared here; this assumes
      # print_data() empties the list it receives -- confirm, otherwise
      # the same entities are written repeatedly.
      if len(results) >= 100000:
        print_data(results, data_type)

      old_item_id = item_id
      item_id = splited_line[0]
      # Subject changed --> close the previous entity, start a new one.
      if old_item_id != item_id:
        # Determine whether this id is one of the requested entities.
        find_index = binary_search(ids, item_id)

        if data_type == "all":
          find_index = 1

        if find_index != -1:
          # Only keep entities that at least have a name.
          if len(data) != 0 and "name" in data:
            item["itemInfo"] = data
            results.append(json.dumps(item, separators=(',', ':')))
          item.clear()
          data.clear()
          # Strip the "<http://rdf.freebase.com/ns/m." prefix and the
          # trailing ">" to obtain the bare mid.
          data["id"] = item_id.replace("<http://rdf.freebase.com/ns/m.", "")[:-1]

      if find_index != -1:
        # Bind all predicates belonging to the current subject (item_id).
        determine_content(splited_line, data, data_type)

  print_data(results, data_type)
  print("{\"itemInfo\":{}}]")
# Example #2
# 0
def convert_input_to_knowledge_base_format(input_data):
  """Turn tab-separated geonames dump lines into a list of GeoName objects.

  Filters rows by feature class/code according to the module-level
  data_type ("location", "museum" or "all"), resolves the best wikipedia
  url for each geonameid using the wikipedia statistics table, and
  enriches the record with alternative names, country and (for the USA)
  state names.

  Args:
    input_data: iterable of raw geonames lines (tab-separated fields).

  Returns:
    list of GeoName instances; rows with no wikipedia link, or with a link
    outside the allowed domain, are skipped.
  """
  geonames_part = []
  for l in input_data:
    fields = l.strip().split("\t")

    geonameid = fields[0]
    population = fields[14]

    wikipedia_url = -1

    # Keep only wanted feature classes/codes for the requested data_type.
    # BUG FIX: the original compared strings with `is`/`is not`, which
    # tests identity, not equality. (Original note: "dont touch this,
    # its crazy :( but works :)")
    if not ((fields[6] != "S" or (fields[6] == "S"
            and fields[7] in global_S_wanted_freature_codes))
            and (((data_type == "location" and fields[7] != "MUS")
          or (data_type == "museum" and (fields[7] == "MUS")
            or data_type == "all")))):
        continue

    # WIKIPEDIA URL
    wikipedia_url_all_occur = binary_search_all_occur(
        global_links, geonameid, cross_columns=True, col_sep="\t",
        finding_column=0, return_column=1)

    if len(wikipedia_url_all_occur) == 0:
      continue

    # Pick the better rated url from the geonames links file based on
    # wikipedia statistics (e.g. Istanbul instead of Constantinople).
    maxscore = -1
    go_next = False
    if len(wikipedia_url_all_occur) > 1:
      for url in wikipedia_url_all_occur:
        if global_allowed_url not in url:
          go_next = True
          continue
        url = get_wikipedia_url(url)
        score = binary_search(
            global_wikipedia_statistic, url.replace(global_allowed_url, ""),
            cross_columns=True, col_sep="\t", finding_column=0,
            return_column=2)

        if int(score) > int(maxscore):
          maxscore = score
          wikipedia_url = url

    # Skip entities that had any url outside the allowed domain.
    if go_next:
      continue

    # No scored candidate -> fall back to the first occurrence.
    if maxscore == -1:
      wikipedia_url = wikipedia_url_all_occur[0]
    wikipedia_url = urllib.unquote(wikipedia_url)

    entity_type = "museum" if "museum" == data_type else "location"

    name = fields[1]

    # Alternative names, restricted to the globally allowed set.
    names = fields[3].split(",")
    alternatenames = set()

    for n in names:
      n = n.strip()
      if n and n.lower() in global_alternative_names:
        alternatenames.add(n)

    state = binary_search(
        global_country_codes, fields[8], cross_columns=True, col_sep="\t",
        finding_column=0, return_column=1)

    if state != -1 and state:
      alternatenames.add(name + ", " + state)
      country = state
    else:
      country = ""

    # substate (USA states)
    if fields[10]:
      substate = binary_search(
          subcountry_code, fields[8] + "." + fields[10], cross_columns=True,
          col_sep="\t", finding_column=0, return_column=1)
      if substate != -1 and substate:
        alternatenames.add(name + ", " + substate)

    latitude = fields[4]
    longitude = fields[5]
    class_code = fields[6]
    feature_code = fields[7]
    elevation = fields[15]

    new_geoname = GeoName(
        geonameid, name, alternatenames, latitude,
        longitude, class_code, feature_code, country,
        population, elevation, wikipedia_url, entity_type)

    geonames_part.append(new_geoname)
  return geonames_part
def load_data(_filename, data_type):
  """Build typed Freebase entity objects from a previously extracted dump.

  Args:
    _filename: open file-like object containing a JSON array of
      {"itemInfo": {...}} records. It is passed straight to json.load()
      despite its name -- NOTE(review): confirm callers pass a file
      object, not a path string.
    data_type: which entity class to build; one of "artist", "person",
      "artwork", "location", "museum", "event", "visual_art_form",
      "visual_art_genre", "art_period_movement", "visual_art_medium",
      "nationalities" or "all".

  Returns:
    list of constructed entity objects. Records without a name, with an
    id in global_unwanted_ids, or filtered out by object_type are skipped.
  """
  data = json.load(_filename)

  results = []

  for d in data:
    d = d["itemInfo"]

    if "name" not in d:
      continue

    if d["id"] in global_unwanted_ids:
      continue

    # For locations, drop organizations/transit stops/buildings that are
    # neither a citytown nor a country.
    if "object_type" in d and data_type == "location":
      if (not any("location.citytown" in ot for ot in d["object_type"]) and
          not any("location.country" in ot for ot in d["object_type"])):
        if any("organization.organization>" in ot or
             "metropolitan_transit.transit_stop" in ot or
             "architecture.building" in ot or
             "architecture.structure" in ot
             for ot in d["object_type"]):
          continue

    name = ""
    nationality_name = ""
    if data_type == "nationalities":
      nationality_name = format_string(d["name"])
    else:
      name = format_string(d["name"])

    freebase_id = format_string(d["id"])

    # all
    alias = []
    description = ""
    image = []
    wikipedia_url = ""

    # artist, person
    period_or_movement = []
    influenced = []
    influenced_by = []
    place_of_birth = ""
    place_of_death = ""
    date_of_birth = ""
    date_of_death = ""
    profession = []
    art_form = []
    places_lived = []
    gender = ""
    nationality = []

    # artwork (deliberately re-initializes art_form/period_or_movement)
    artist = []
    art_subject = []
    art_form = ""
    art_genre = []
    media = []
    support = []
    period_or_movement = []
    location = []
    date_begun = ""
    date_completed = ""
    owner = []
    dimensions = {"height": "", "width": "", "depth": ""}

    # location
    latitude = ""
    longitude = ""
    loc_type = ""
    country = ""
    population = []

    # museum
    type_of_museum = []
    established = ""
    director = ""
    visitors = ""
    address = {"citytown": "", "postal_code": "",
               "state_province_region": "", "street_address": ""}

    # event
    start_date = ""
    end_date = ""
    locations = []
    notable_types = ""

    # country
    short_name = ""
    adjectival_form = []

    # all
    if "alias" in d:
      fields = d["alias"]
      for f in fields:
        alias.append(re.sub(global_replace_backslash, '\'', format_string(f)))
    if "description" in d:
      description = re.sub(global_replace_new_line, ' ', re.sub(global_replace_backslash, '\'', format_string(d["description"])))
    if "article" in d:
      # The full article text is only a fallback for a missing description.
      if description.strip() == "":
        description = re.sub(global_replace_new_line, ' ', re.sub(global_replace_backslash, '\'', format_string(d["article"])))
    if "image" in d:
      fields = d["image"]
      for f in fields:
        image.append("freebase/"+format_string(f["path"]))
    if "key_wikipedia_en" in d:
      wikipedia_url = format_string(d["key_wikipedia_en"])

    # artist, person
    if "associated_periods_or_movements" in d:
      fields = d["associated_periods_or_movements"]
      for f in fields:
        period_or_movement.append(format_string(f))
    if "influenced" in d:
      fields = d["influenced"]
      for f in fields:
        if "value" in f:
          influenced.append(format_string(f["value"]))
    if "influenced_by" in d:
      fields = d["influenced_by"]
      for f in fields:
        if "value" in f:
          influenced_by.append(format_string(f["value"]))
    if "place_of_birth" in d:
      if "value" in d["place_of_birth"]:
        place_of_birth = format_string(d["place_of_birth"]["value"])
    if "place_of_death" in d:
      if "value" in d["place_of_death"]:
        place_of_death = format_string(d["place_of_death"]["value"])
    if "date_of_birth" in d:
      date_of_birth = format_string(d["date_of_birth"])
    if "date_of_death" in d:
      date_of_death = format_string(d["date_of_death"])
    if "profession" in d:
      fields = d["profession"]
      for f in fields:
        profession.append(format_string(f).replace("-GB", ""))
    if "places_lived" in d:
      fields = d["places_lived"]
      for f in fields:
        if "value" in f:
          places_lived.append(format_string(f["value"]))
    if "nationality" in d:
      fields = d["nationality"]
      for f in fields:
        # Map a country name to its nationality; fall back to the raw value.
        nationality_string = binary_search(global_nationality_country, format_string(f), cross_columns=True, col_sep="\t")
        if nationality_string == -1:
          nationality_string = f
        try:
          nationality.append(format_string(nationality_string))
        except Exception:
          # format_string may fail on non-string lookup results.
          nationality.append(format_string(f))
    if "gender" in d:
      gender = format_string(d["gender"])

    # artwork
    if "artist" in d:
        fields = d["artist"]
        for f in fields:
          artist.append(format_string(f["value"]))
    if "art_subject" in d:
      fields = d["art_subject"]
      for f in fields:
        art_subject.append(format_string(f))
    if "art_form" in d:
      art_form = format_string(d["art_form"])
    if "art_genre" in d:
      fields = d["art_genre"]
      for f in fields:
        art_genre.append(format_string(f))
    if "media" in d:
      fields = d["media"]
      for f in fields:
        media.append(format_string(f))
    if "support" in d:
      fields = d["support"]
      for f in fields:
        support.append(format_string(f))
    if "period_or_movement" in d:
      fields = d["period_or_movement"]
      for f in fields:
        period_or_movement.append(format_string(f))
    if "location" in d:
      fields = d["location"]
      for f in fields:
        location.append(format_string(f["value"]))
    if "date_begun" in d:
      date_begun = format_string(d["date_begun"])
    if "date_completed" in d:
      date_completed = format_string(d["date_completed"])
    if "wikipedia_url" in d:
      # BUG FIX: wikipedia_url is a string; the original called .append()
      # on it and crashed. Keep the last url found instead (f[1:-1] strips
      # the surrounding angle brackets/quotes).
      fields = d["wikipedia_url"]
      for f in fields:
        wikipedia_url = format_string(f[1:-1])
    if "owner" in d:
      fields = d["owner"]
      for f in fields:
        owner.append(format_string(f["value"]))
    if "dimensions" in d:
      # BUG FIX: the original read from the stale loop variable `f` and
      # wrote into the undefined name `dimension`.
      fields = d["dimensions"]
      if "height" in fields:
        dimensions["height"] = format_string(fields["height"])
      if "width" in fields:
        dimensions["width"] = format_string(fields["width"])
      if "depth" in fields:
        dimensions["depth"] = format_string(fields["depth"])

    # location
    if "population" in d:
      fields = d["population"]
      for f in fields:
        population.append(format_string(f))
    if "latitude" in d:
      latitude = format_string(d["latitude"])
    if "longitude" in d:
      longitude = format_string(d["longitude"])
    if "country" in d:
      if "value" in d["country"]:
        country = format_string(d["country"]["value"])
    if "loc_type" in d:
      loc_type = (format_string(d["loc_type"]))

    # museum
    if "type_of_museum" in d:
      fields = d["type_of_museum"]
      for f in fields:
        type_of_museum.append(format_string(f))
    if "established" in d:
      established = format_string(d["established"])
    if "director" in d:
      if "value" in d["director"]:
        director = format_string(d["director"]["value"])
    if "visitors" in d:
      visitors = format_string(d["visitors"])
    if "address" in d:
      if "citytown" in d["address"]:
        if "value" in d["address"]["citytown"]:
          # BUG FIX: the key was misspelled "citytmwn" and raised KeyError.
          address["citytown"] = format_string(d["address"]["citytown"]["value"])
      if "postal_code" in d["address"]:
        address["postal_code"] = format_string(d["address"]["postal_code"])
      if "state_province_region" in d["address"]:
        address["state_province_region"] = format_string(d["address"]["state_province_region"])
      if "street_address" in d["address"]:
        address["street_address"] = format_string(d["address"]["street_address"])

    # event
    if "start_date" in d:
      start_date = format_string(d["start_date"])
    if "end_date" in d:
      end_date = format_string(d["end_date"])
    if "locations" in d:
      fields = d["locations"]
      for f in fields:
        if "value" in f:
          locations.append(format_string(f["value"]))
    if "notable_types" in d:
      notable_types = format_string(d["notable_types"][0])

    # country
    if "short_name" in d:
      short_name = format_string(d["short_name"][0])
    if "adjectival_form" in d and data_type == "nationalities":
      fields = d["adjectival_form"]
      # The first adjectival form becomes the display name.
      if len(fields) > 0 and fields[0]:
        name = format_string(fields[0])
      for f in fields:
        adjectival_form.append(format_string(f))

    if name.strip() == "":
      continue

    # Dispatch to the right entity class; the id prefix encodes the type.
    new_entity = None
    if data_type == "artist":
      new_entity = FreebaseArtist("a:" + freebase_id, name, alias, description, image, period_or_movement, influenced, influenced_by, place_of_birth, place_of_death, date_of_birth, date_of_death, wikipedia_url, profession, art_form, places_lived, gender, nationality)
    elif data_type == "person":
      new_entity = FreebasePerson("p:" + freebase_id, name, alias, description, image, period_or_movement, place_of_birth, place_of_death, date_of_birth, date_of_death, wikipedia_url, profession, places_lived, gender, nationality)
    elif data_type == "artwork":
      new_entity = FreebaseArtwork("w:" + freebase_id, name, alias, description, image, artist, art_subject, art_form, art_genre, media, support, period_or_movement, location, date_begun, date_completed, wikipedia_url, owner, dimensions)
    elif data_type == "location":
      new_entity = FreebaseLocation("l:" + freebase_id, name, alias, description, image, wikipedia_url, latitude, longitude, loc_type, population, adjectival_form)
    elif data_type == "museum":
      new_entity = FreebaseMuseum("c:" + freebase_id, name, alias, description, image, wikipedia_url, type_of_museum, established, director, visitors, address, latitude, longitude)
    elif data_type == "event":
      new_entity = FreebaseEvent("e:" + freebase_id, name, alias, description, image, wikipedia_url, start_date, end_date, locations, notable_types)
    elif data_type == "visual_art_form":
      new_entity = FreebaseEntity("f:" + freebase_id, name, alias, description, image, wikipedia_url)
      new_entity.set_type("visual_art_form")
    elif data_type == "visual_art_genre":
      new_entity = FreebaseEntity("g:" + freebase_id, name, alias, description, image, wikipedia_url)
      new_entity.set_type("visual_art_genre")
    elif data_type == "art_period_movement":
      new_entity = FreebaseEntity("m:" + freebase_id, name, alias, description, image, wikipedia_url)
      new_entity.set_type("art_period_movement")
    elif data_type == "visual_art_medium":
      new_entity = FreebaseEntity("d:" + freebase_id, name, alias, description, image, wikipedia_url)
      new_entity.set_type("visual_art_medium")
    elif data_type == "nationalities":
      new_entity = FreebaseNationality("n:" + freebase_id, name, alias, description, image, wikipedia_url, short_name, adjectival_form, nationality_name)
    elif data_type == "all":
      # NOTE(review): "f" has no colon unlike every other prefix ("f:",
      # "g:", ...) -- confirm whether this is intentional.
      new_entity = FreebaseEntity("f" + freebase_id, name, alias, description, image, wikipedia_url)

    if new_entity is not None:
      results.append(new_entity)

  return results
def bind(
    original_key, key, splited_line, data, save_id=False, 
    save_like_array=True, only_first_letter=False, is_foreign_key=True,
    group_name=None, language_data=False):
  """Extract one predicate's value from an RDF triple into *data*.

  Matches splited_line[1] against *original_key* and, when it matches,
  stores the (possibly dereferenced) object value under *key* in *data*.

  Args:
    original_key: full RDF predicate URI this call is responsible for.
    key: short field name used as the key in *data*.
    splited_line: the triple as [subject, predicate, object, ...].
    data: dict collecting the current entity's fields (mutated in place).
    save_id: also store the raw Freebase mid next to the resolved value.
    save_like_array: append to a list under *key* instead of assigning.
    only_first_letter: keep only the first character of the label.
    is_foreign_key: the object is a Freebase mid that must be resolved via
      the global lookup tables / labels; otherwise it is a literal value.
    group_name: when set, store {key: value} dicts inside data[group_name]
      (used for grouped fields such as dimensions or address parts).
    language_data: unused in this body -- kept for interface compatibility.
  """

  # Not the predicate we are looking for -> nothing to do.
  if splited_line[1] != original_key:
    return
    
  FK = splited_line[2]
  
  FK = str(FK).replace('"', '')
  # Grouped fields: resolve the value and append {key: value} to the group.
  if group_name is not None:
    if group_name not in data:
      data[group_name] = []
    if is_foreign_key:
      # Strip the Freebase namespace prefix and the trailing ">".
      FK = FK.replace("<http://rdf.freebase.com/ns/m.", "")[:-1]
      # Dimension fields resolve through their dedicated lookup tables;
      # address parts need a second hop (mid -> mid -> label).
      if key == "height":
        FK = binary_search(global_height_meters, FK, cross_columns=True, col_sep="\t")
      elif key == "width":
        FK = binary_search(global_width_meters, FK, cross_columns=True, col_sep="\t")
      elif key == "depth":
        FK = binary_search(global_depth_meters, FK, cross_columns=True, col_sep="\t")
      elif key == "citytown":
        next_FK = binary_search(global_citytowns, FK, cross_columns=True, col_sep="\t")
        if next_FK != -1:
          next_FK = next_FK.replace("<http://rdf.freebase.com/ns/m.", "")[:-1]
          FK = search_in_labels(str(next_FK))
      elif key == "postal_code":
        next_FK = binary_search(global_postal_codes, FK, cross_columns=True, col_sep="\t")
        if next_FK != -1:
          next_FK = next_FK.replace("<http://rdf.freebase.com/ns/m.", "")[:-1]
          FK = search_in_labels(str(next_FK))
      elif key == "state_province_region":
        next_FK = binary_search(global_state_province_regions, FK, cross_columns=True, col_sep="\t")
        if next_FK != -1:
          next_FK = next_FK.replace("<http://rdf.freebase.com/ns/m.", "")[:-1]
          FK = search_in_labels(str(next_FK))
      else:
        FK = search_in_labels(FK)
    # -1 means the lookup failed; drop the value silently.
    if FK == -1:
      return
    
    FK = remove_language_tags(FK)
    data[group_name].append({key : FK})
    return

  # Ungrouped fields: ensure the list exists when storing as an array.
  if key not in data and save_like_array:
    data[key] = []

  if is_foreign_key: 
    label = -1
    next_FK = -1
    FK = FK.replace("<http://rdf.freebase.com/ns/m.", "")[:-1]
    # Numeric/value fields resolve in one lookup; relationship fields
    # (country, places_lived, owner, artwork locations) need a second hop
    # through an intermediate mid before the label lookup.
    if key == "latitude":
      label = binary_search(global_latitudes, FK, cross_columns=True, col_sep="\t")
    elif key == "longitude":
      label = binary_search(global_longitudes, FK, cross_columns=True, col_sep="\t")
    elif key == "country":
      next_FK = binary_search(global_countries, FK, cross_columns=True, col_sep="\t")
      if next_FK != -1:
        next_FK = next_FK.replace("<http://rdf.freebase.com/ns/m.", "")[:-1]
        label = search_in_labels(str(next_FK))
    elif key == "population":
      label = binary_search(global_numbers, FK, cross_columns=True, col_sep="\t")
    elif key == "places_lived":
      next_FK = binary_search(global_locations, FK, cross_columns=True, col_sep="\t")
      if next_FK != -1:
        next_FK = next_FK.replace("<http://rdf.freebase.com/ns/m.", "")[:-1]
        label = search_in_labels(str(next_FK))
    elif key == "owner":
      next_FK = binary_search(global_owners, FK, cross_columns=True, col_sep="\t")
      if next_FK != -1:
        next_FK = next_FK.replace("<http://rdf.freebase.com/ns/m.", "")[:-1]
        label = search_in_labels(str(next_FK))
    elif original_key == "<http://rdf.freebase.com/ns/visual_art.artwork.locations>":
      next_FK = binary_search(global_artwork_location_relationship, FK, cross_columns=True, col_sep="\t")
      if next_FK != -1:
        next_FK = next_FK.replace("<http://rdf.freebase.com/ns/m.", "")[:-1]
        label = search_in_labels(str(next_FK))
    else:
      label = search_in_labels(FK)
    # -1 means the lookup failed; drop the value silently.
    if label == -1:
      return

    label = remove_language_tags(label)
    if save_id:
      # Store both the resolved label and the bare mid taken from the
      # original (unstripped) object column.
      if save_like_array:
        data[key].append({'value': label, 'id': splited_line[2].replace("<http://rdf.freebase.com/ns/m.", "")[:-1]})
      else:
        data[key] = {'value': label, 'id': splited_line[2].replace("<http://rdf.freebase.com/ns/m.", "")[:-1]}
    else:
      if only_first_letter:
        label = label[0]
      if save_like_array:
        data[key].append(label)
      else:
        data[key] = label
    return
  else:
    # Literal value: clean it up and store directly.
    value = FK.replace("%13", "–")
    value = remove_language_tags(value)
    if save_like_array:
      # Percent-decode wikipedia urls; best effort, keep raw on failure.
      if "en.wikipedia.org" in value and "%" in value:
        try:
          value = urllib.unquote(value).decode('cp1250')
        except:
          pass
      data[key].append(value)
    else:
      # Strip XMLSchema datatype suffixes from dates/years.
      value = value.replace("^^<http://www.w3.org/2001/XMLSchema", "").replace("#gYear>", "").replace("#date>", "").replace("#gYearMonth>", "").replace(">","")
      data[key] = value
def search_in_labels(label_id):
  """Resolve a Freebase mid to its label via the global_labels table."""
  label = binary_search(
      global_labels, label_id, cross_columns=True, col_sep="\t")
  return label