def get_few_img_class_dataset(dataset, min_s_img, min_g_img, min_f_img,
                              out_dataset=None):
    # type: (ArtworkDataset, int, int, int, ArtworkDataset) -> ArtworkDataset
    if out_dataset is None:
        out_dataset = ArtworkDataset()
    for artwork in dataset.artworks:
        files = os.listdir(
            os.path.join(OLD_DATASET_PATH, artwork.getFolderName()))
        # if len(files) > MIN_G_IMG + MIN_F_IMG + MIN_S_IMG:
        #     continue
        gimg = 0
        fimg = 0
        seed = 0
        for f in files:
            if f.startswith("google"):
                gimg += 1
            elif f.startswith("flickr"):
                fimg += 1
            elif f.startswith("seed"):
                seed += 1
        # Use the thresholds passed as parameters (the caller provides them).
        if gimg < min_g_img or fimg < min_f_img or seed < min_s_img:
            out_dataset.artworks.append(artwork)
    return out_dataset

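# Usage sketch (hedged): collecting the classes that still lack images.
# RESULT_DATASET and the MIN_* thresholds are assumed to be the module-level
# configuration constants used elsewhere in these scripts.
#
#   data = ArtworkDataset()
#   data.loadJson(RESULT_DATASET)
#   sparse = get_few_img_class_dataset(data,
#                                      min_s_img=MIN_S_IMG,
#                                      min_g_img=MIN_G_IMG,
#                                      min_f_img=MIN_F_IMG)
#   print("Classes below the thresholds: " + str(len(sparse.artworks)))
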
def artwork_dataset_ordering(class_name_ordered_file, unordered_dataset_path,
                             ordered_dataset_out_path=None):
    ordering = txt_to_list(class_name_ordered_file)
    not_ordered_dataset = ArtworkDataset().loadJson(unordered_dataset_path)
    ordered_artwork_dict = {}
    for artwork in not_ordered_dataset.artworks:
        try:
            index = ordering.index(artwork.getFolderName())
        except ValueError as v:
            print "Artwork not found in the ordering list"
            raise v
        ordered_artwork_dict[index] = artwork
    ordered_dataset = copy.deepcopy(not_ordered_dataset)
    # Sort explicitly by index: dict.values() does not guarantee key order.
    ordered_dataset._artworks = [ordered_artwork_dict[i]
                                 for i in sorted(ordered_artwork_dict.keys())]
    if ordered_dataset_out_path is not None:
        ordered_dataset.saveJson(ordered_dataset_out_path)
    return ordered_dataset

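# Usage sketch (hedged): reordering a dataset so that its artworks follow the
# class order of a training set. "class_names.txt" and the output path below
# are placeholder names, not files shipped with this code.
#
#   ordered = artwork_dataset_ordering(
#       "class_names.txt",
#       unordered_dataset_path=ARTWORK_DATASET,
#       ordered_dataset_out_path="artwork_dataset_ordered.json")
#   print("First class: " + ordered.artworks[0].getFolderName())
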
def main(sparql_query=False,
         json_processing=False,
         make_artwork_dataset=False,
         filter_artwork_dataset=False,
         gps_request_in_dataset=False,
         dbpedia=False,
         dbpedia_width=800,
         google=0,
         flickr=0,
         sleep_between_artwork_download=0.5):
    if sparql_query:
        sparqlQueryJson(QUERY,
                        output_json_file_path=SPARQL_JSON_RESULT,
                        endpoint=ENDPOINT,
                        offset_limit=1000)
        print "Query executed!"

    if json_processing:
        split_string_lists(SPARQL_JSON_RESULT,
                           splitter_symbol=SPLITTER_SYMBOL,
                           output_json_path=SPARQL_JSON_RESULT_PROCESSED)
        remove_key_from_dict(SPARQL_JSON_RESULT_PROCESSED,
                             "value", ["type"],
                             output_json_path=SPARQL_JSON_RESULT_PROCESSED)
        print "Conversion done!"

    if make_artwork_dataset:
        dbpediaJSON_to_artwork_list(
            json_file_or_string=SPARQL_JSON_RESULT_PROCESSED,
            out_json_file=ARTWORK_DATASET,
            doGpsToAddressQuery=gps_request_in_dataset)

    if filter_artwork_dataset:
        artwork_dataset = ArtworkDataset().loadJson(ARTWORK_DATASET)
        filtered_dataset = artwork_dataset_filtering(artwork_dataset)
        filtered_dataset.saveJson(ARTWORK_DATASET_FILTERED)

    if dbpedia or google > 0 or flickr > 0:
        ad = ArtworkDataset(img_path=DATASET_PATH)
        ad.loadJson(ARTWORK_DATASET)
        if dbpedia:
            ad.downloadDbPediaThumbs(
                preferred_dim=dbpedia_width,
                sleep_between_artwork=sleep_between_artwork_download)
        if google > 0:
            ad.downloadGoogleImages(
                images_per_artwork=google,
                sleep_between_artwork=sleep_between_artwork_download,
                google_localization=".it")
        if flickr > 0:
            ad.downloadFlickrImages(
                api_key="flickr.apikey",
                images_per_artwork=flickr,
                sleep_between_artwork=sleep_between_artwork_download)

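# Example invocation (hedged sketch): running the pipeline end to end, from the
# SPARQL query to the image downloads. The argument values here are purely
# illustrative, not the ones used to build any released dataset.
#
#   if __name__ == "__main__":
#       main(sparql_query=True,
#            json_processing=True,
#            make_artwork_dataset=True,
#            filter_artwork_dataset=True,
#            dbpedia=True,
#            google=20,
#            flickr=20)
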
def show_uri_info(uri):
    ad = ArtworkDataset()
    ad.loadJson(RESULT_DATASET)
    art_dict = dict((x.getFolderName(), x) for x in ad.artworks)
    print unicode(art_dict[uri].getStrRepr(True, True, True))

def main():
    old_data = ArtworkDataset()
    old_data.loadJson(RESULT_DATASET)

    new_data = ArtworkDataset(img_path=NEW_DATASET_PATH)
    new_data = get_few_img_class_dataset(old_data,
                                         min_s_img=MIN_S_IMG,
                                         min_f_img=MIN_F_IMG,
                                         min_g_img=MIN_G_IMG,
                                         out_dataset=new_data)
    few_img_classes = len(new_data.artworks)

    if SHOW_STATS:
        print("Classes with few images: " + str(few_img_classes))

    if DOWNLOAD:
        if G_NEW_DOWN > 0:
            new_data.downloadGoogleImages(
                images_per_artwork=G_NEW_DOWN,
                sleep_between_artwork=SLEEP_BETWEEN_ARTWORK)
        if F_NEW_DOWN > 0:
            new_data.downloadFlickrImages(
                api_key="flickr.apikey",
                images_per_artwork=F_NEW_DOWN,
                sleep_between_artwork=SLEEP_BETWEEN_ARTWORK)
        if SEED_NEW_DOWN:
            new_data.downloadDbPediaThumbs(
                preferred_dim=DBPEDIA_WIDTH,
                sleep_between_artwork=SLEEP_BETWEEN_ARTWORK)

    if SHOW_DOWNLOAD_STATS:
        # Re-check which classes are still below the thresholds after the
        # new downloads, and keep the result so it can be reported.
        still_few_data = get_few_img_class_dataset(new_data,
                                                   min_s_img=MIN_S_IMG,
                                                   min_f_img=MIN_F_IMG,
                                                   min_g_img=MIN_G_IMG)
        print("Classes with few images before redownload: " +
              str(few_img_classes))
        print("Classes with few images after redownload: " +
              str(len(still_few_data.artworks)))

def prepare_docs(out_file,
                 lambda_on_doc_str=None,
                 include_classes_filter=None,
                 class_filter_per_name=None):
    # type: (basestring, callable(basestring), list[int], list[basestring]) -> None
    ad = ArtworkDataset()
    ad.loadJson(ARTWORK_DATASET)
    outf = file(out_file, mode='w')

    for class_index, artwork in enumerate(ad.artworks):
        if include_classes_filter is not None:
            if class_index not in include_classes_filter:
                continue  # skip this class!
            else:
                print(u"Included class: " + unicode(class_index))
                i = include_classes_filter.index(class_index)
                print(u"With class name: " + unicode(class_filter_per_name[i]))
                print("")

                # just for check:
                if class_filter_per_name is not None:
                    folder_uri = artwork.getFolderName()
                    if folder_uri != class_filter_per_name[i]:
                        raise ValueError(
                            "Class index mismatch: the dataset class does not "
                            "correspond to the training-set class name.")

        title = artwork.title
        descr = artwork.description
        comment = artwork.comment
        authors = artwork.authors
        # locations = artwork.currentLocations

        if comment is None:
            comment = u''
        if descr is None:
            descr = u''
        if title is None:
            title = u''

        authors_str = u""
        if authors is not None:
            for author in authors:
                if author.name is not None:
                    authors_str += u"Author Name: " + author.name + u". "
                if author.comment is not None:
                    authors_str += author.comment + u". "
                if author.abstract is not None:
                    authors_str += author.abstract + u". "
                if author.birthDate is not None:
                    authors_str += u"Author Birth: " + str(author.birthDate)
                    if author.deathDate is not None:
                        authors_str += u", Author Death: " + str(
                            author.deathDate) + u'. '
                    else:
                        authors_str += '. '
                if author.movement is not None:
                    authors_str += u"Author Movement: " + author.movement + u'. '
                if author.nationality is not None:
                    authors_str += u"Author Nationality: " + author.nationality + u'. '

        locations_str = u""
        # if locations is not None:
        #     for loc in locations:
        #         authors_str += str(loc)

        doc = title + u'. ' + authors_str + '. ' + comment + '. ' + descr + '. ' + locations_str
        # doc.replace('\n', '')
        if lambda_on_doc_str is not None:
            doc = lambda_on_doc_str(doc)
        doc += u'\n'
        outf.write(doc.encode("UTF-8"))

    outf.close()

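# Usage sketch (hedged): writing one text document per artwork, normalized via
# the optional lambda. The output file name is a placeholder.
#
#   prepare_docs("artwork_docs.txt",
#                lambda_on_doc_str=lambda d: d.lower().replace(u'\n', u' '))
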
def dbpediaJSON_to_artwork_list(json_file_or_string,
                                out_json_file=None,
                                doGpsToAddressQuery=False):
    if os.path.isfile(json_file_or_string):
        json_str = open(json_file_or_string).read()
    else:
        json_str = json_file_or_string
    jdata = json.loads(json_str)

    keeplist = ListRefactor(
        list_rule=ListRule.keep_list,
        one_element_rule=OneElementListRule.follow_list_rule,
        bad_rule=BadOrNoneInListRule.remove,
        empty_rule=EmptyListRule.none_value,
        bad_value=u"")
    pickfirst = ListRefactor(
        list_rule=ListRule.first_value,
        one_element_rule=OneElementListRule.follow_list_rule,
        bad_rule=BadOrNoneInListRule.remove,
        empty_rule=EmptyListRule.none_value,
        bad_value=u"")
    concatparag = ListRefactor(
        list_rule=ListRule.reduce_value,
        one_element_rule=OneElementListRule.follow_list_rule,
        bad_rule=BadOrNoneInListRule.remove,
        empty_rule=EmptyListRule.none_value,
        list_reducer_func=lambda x, y: str(x) + "\n\n" + str(y),
        bad_value=u"")
    # concatlist = ListRefactor(list_rule=ListRule.reduce_value,
    #                           one_element_rule=OneElementListRule.follow_list_rule,
    #                           bad_rule=BadOrNoneInListRule.remove,
    #                           empty_rule=EmptyListRule.none_value,
    #                           list_reducer_func=lambda x, y: str(x) + ", " + str(y))
    # avg = ListRefactor(list_rule=ListRule.reduce_value,
    #                    one_element_rule=OneElementListRule.follow_list_rule,
    #                    bad_rule=BadOrNoneInListRule.remove,
    #                    empty_rule=EmptyListRule.none_value,
    #                    list_reducer_func=lambda x, y: avg(float(x), float(y)))

    artworkList = ArtworkDataset()
    for r in jdata:
        artwork = Artwork()
        # out = r["artwork"]["value"]
        # data = dict()

        # * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
        # Try to get artwork name, artist name, ...                            *
        # * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
        artwork.uri = r[QK_ARTWORK]  # this field is not optional!
        artwork.title = list_in_dict_refactor(r, QK_TITLE, pickfirst)
        artwork.description = list_in_dict_refactor(r, QK_DESCRIPTION, concatparag)
        artwork.comment = list_in_dict_refactor(r, QK_COMMENT, concatparag)
        artwork.thumb_link = list_in_dict_refactor(r, QK_THUMB, pickfirst)
        artwork.img_link = list_in_dict_refactor(r, QK_IMG, pickfirst)

        author = Author()
        author.uri = list_in_dict_refactor(r, QK_AUTHOR, pickfirst)
        author.abstract = list_in_dict_refactor(r, QK_AUTHOR_ABSTRACT, pickfirst)
        author.comment = list_in_dict_refactor(r, QK_AUTHOR_COMMENT, pickfirst)
        author.movement = list_in_dict_refactor(r, QK_AUTHOR_MOVEMENT, pickfirst)
        author.birthDate = list_in_dict_refactor(r, QK_AUTHOR_BIRTH_DATE, pickfirst)
        author.deathDate = list_in_dict_refactor(r, QK_AUTHOR_DEATH_DATE, pickfirst)
        author.name = list_in_dict_refactor(r, QK_AUTHOR_BIRTH_NAME, pickfirst)
        if author.name is None:
            author.name = list_in_dict_refactor(r, QK_AUTHOR_NAME, pickfirst)
        if author.name is None:
            author.name = list_in_dict_refactor(r, QK_AUTHOR_NAME_2, pickfirst)
        if author.name is None:
            author.name = list_in_dict_refactor(r, QK_AUTHOR_NAME_3, pickfirst)
        artwork.addAuthor(author)

        # * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
        # Try to get museum info                                                *
        # * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
        mus = Museum()
        mus.uri = list_in_dict_refactor(r, QK_MUS, pickfirst)
        mus.name = list_in_dict_refactor(r, QK_MUS_NAME, pickfirst)
        if QK_MUS_LAT in r.keys() and QK_MUS_LNG in r.keys():
            mus.setLatLng(list_in_dict_refactor(r, QK_MUS_LAT, pickfirst),
                          list_in_dict_refactor(r, QK_MUS_LNG, pickfirst))
        elif QK_MUS_LATLNG in r.keys():
            i = 0
            lat_sum = 0
            lng_sum = 0
            list_of_latlng = list_in_dict_refactor(r, QK_MUS_LATLNG, keeplist)
            if list_of_latlng is not None:
                for ll in list_of_latlng:
                    latlng = ll.split(',')
                    lat_sum += float(latlng[0])
                    lng_sum += float(latlng[1])
                    i += 1
                if i > 0:
                    mus.setLatLng(lat_sum / i, lng_sum / i)
        artwork.addMuseum(mus)

        # * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
        # Try to get GPS location of the artwork or, if not available,         *
        # of the museum:                                                        *
        # * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
        # ARTWORK GPS:
        if QK_LAT in r.keys() and QK_LNG in r.keys():
            artwork.setLatLng(list_in_dict_refactor(r, QK_LAT, pickfirst),
                              list_in_dict_refactor(r, QK_LNG, pickfirst))
        elif QK_LATLNG in r.keys():
            i = 0
            lat_sum = 0
            lng_sum = 0
            list_of_latlng = list_in_dict_refactor(r, QK_LATLNG, keeplist)
            if list_of_latlng is not None:
                for ll in list_of_latlng:
                    latlng = ll.split(',')
                    lat_sum += float(latlng[0])
                    lng_sum += float(latlng[1])
                    i += 1
                if i > 0:
                    artwork.currentLatLng.lat = lat_sum / i
                    artwork.currentLatLng.lng = lng_sum / i

        # MUSEUM GPS:
        if artwork._currentLatLng is None and mus._latLng is not None:
            artwork.currentLatLng = mus._latLng

        # * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
        # Try to get location (city name) from GPS or JSON data                *
        # * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
        address = None
        if (doGpsToAddressQuery and gps_to_address and
                artwork.currentLatLng.lat is not None and
                artwork.currentLatLng.lng is not None):
            retry = True
            gotAddress = False
            while retry:
                try:
                    time.sleep(GPS_SERVER_REQUEST_DELAY)
                    address = gpsToAddress(artwork.currentLatLng.lat,
                                           artwork.currentLatLng.lng)
                    if address is not None:
                        retry = False
                        gotAddress = True
                    else:
                        print("\nNone address returned.")
                        ch = raw_input("\nTry again? [Y|n]\n> ")
                        if ch == "n" or ch == "N":
                            retry = False
                            gotAddress = False
                        else:
                            retry = True
                except GeocoderTimedOut as error:
                    retry = True
                except GeocoderServiceError as error:
                    if "429" in error.message:  # too many requests error
                        if request_user_action_if_errror_429:
                            print("\nServer says too many requests: " + error.message)
                            ch = raw_input("\nRetry to connect? [Y|n]\n> ")
                            if ch == "n" or ch == "N":
                                retry = False
                                return
                            else:
                                retry = True
                        else:
                            time.sleep(GPS_SERVER_REQUEST_DELAY_IF_ERROR_429)
                            retry = True
                    elif "Errno -2" in error.message:
                        print("\nCan't reach server: " + error.message)
                        ch = raw_input("\nRetry to connect? [Y|n]\n> ")
                        if ch == "n" or ch == "N":
                            retry = False
                            return
                        else:
                            retry = True
                    else:
                        print("\nServer unknown error: " + error.message)
                        ch = raw_input("\nRetry to connect? [Y|n]\n> ")
                        if ch == "n" or ch == "N":
                            retry = False
                            return
                        else:
                            retry = True

            if gotAddress:
                if address.city is not None:
                    artwork.currentLocations.append(address.city)
                elif address.county_province is not None:
                    artwork.currentLocations.append(address.county_province)
                elif address.state_region is not None:
                    artwork.currentLocations.append(address.state_region)
                elif address.country is not None:
                    artwork.currentLocations.append(address.country)
                else:
                    address = None

        if address is None:
            lambda_locations_list = [
                list_in_dict_refactor(r, QK_MUS_LOCATION, keeplist),
                list_in_dict_refactor(r, QK_MUS_LOCATION, keeplist),
                list_in_dict_refactor(r, QK_LOCATION_B, keeplist),
                list_in_dict_refactor(r, QK_MUS_LOCATION_B, keeplist)
            ]
            for locations in lambda_locations_list:
                if locations is not None:
                    artwork.currentLocations = []
                    for loc in locations:
                        artwork.currentLocations.append(loc)
                    break

        # * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
        if print_gps_address_info:
            if address is None:
                print "-- address missing -- (no gps info)"
            else:
                print(address.toString())
        if print_location:
            if not artwork.currentLocations:
                print("<no-location>")
            else:
                for loc in artwork.currentLocations:
                    print(loc + "\t")
        if print_artwork:
            print vars(artwork)

        artworkList.artworks.append(artwork)

    if isinstance(out_json_file, str):
        artworkList.saveJson(out_json_file)
    return artworkList

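# Usage sketch (hedged): turning the processed SPARQL JSON into an
# ArtworkDataset and saving it. SPARQL_JSON_RESULT_PROCESSED and
# ARTWORK_DATASET are assumed to be the module-level paths used by main().
#
#   dataset = dbpediaJSON_to_artwork_list(
#       json_file_or_string=SPARQL_JSON_RESULT_PROCESSED,
#       out_json_file=ARTWORK_DATASET,
#       doGpsToAddressQuery=False)
#   print("Parsed artworks: " + str(len(dataset.artworks)))
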