예제 #1
0
def get_few_img_class_dataset(dataset,
                              min_s_img,
                              min_g_img,
                              min_f_img,
                              out_dataset=None):
    # Type(ArtworkDataset, int, int, int, ArtworkDataset) -> ArtworkDataset
    """Collect the artworks whose image folder holds too few images.

    For every artwork in ``dataset`` the folder
    ``OLD_DATASET_PATH/<artwork.getFolderName()>`` is listed and its files
    are counted by file-name prefix ("google", "flickr", "seed").  An
    artwork is selected when at least one of the three counts is below its
    minimum.

    Args:
        dataset: object exposing an ``artworks`` iterable.
        min_s_img: minimum number of files starting with "seed".
        min_g_img: minimum number of files starting with "google".
        min_f_img: minimum number of files starting with "flickr".
        out_dataset: dataset the selected artworks are appended to.  When
            omitted, a new ``ArtworkDataset`` is created for this call.

    Returns:
        ``out_dataset`` with the selected artworks appended.
    """
    # A default of ArtworkDataset() in the signature would be built once and
    # shared (and therefore accumulate artworks) across every call.
    if out_dataset is None:
        out_dataset = ArtworkDataset()

    for artwork in dataset.artworks:
        files = os.listdir(
            os.path.join(OLD_DATASET_PATH, artwork.getFolderName()))

        gimg = 0
        fimg = 0
        seed = 0
        for f in files:
            if f.startswith("google"):
                gimg += 1
            elif f.startswith("flickr"):
                fimg += 1
            elif f.startswith("seed"):
                seed += 1

        # Compare against the thresholds the caller passed in, not against
        # module-level constants.
        if gimg < min_g_img or fimg < min_f_img or seed < min_s_img:
            out_dataset.artworks.append(artwork)

    return out_dataset
예제 #2
0
def artwork_dataset_ordering(class_name_ordered_file, unordered_dataset_path, ordered_dataset_out_path=None):
    """Reorder a dataset's artworks to follow a list of folder names.

    Args:
        class_name_ordered_file: passed to ``txt_to_list`` to obtain the
            desired sequence of folder names.
        unordered_dataset_path: passed to ``ArtworkDataset().loadJson`` to
            load the dataset to reorder.
        ordered_dataset_out_path: when not None, the reordered dataset is
            written there with ``saveJson``.

    Returns:
        A deep copy of the loaded dataset whose ``_artworks`` list is sorted
        by the position of each artwork's folder name in the ordering list.
        If several artworks map to the same position, the last one wins.

    Raises:
        ValueError: an artwork's folder name is missing from the ordering.
    """
    ordering = txt_to_list(class_name_ordered_file)
    not_ordered_dataset = ArtworkDataset().loadJson(unordered_dataset_path)

    # Map each name to its FIRST position (what list.index would return),
    # built once instead of scanning the list for every artwork.
    position_of = {}
    for position, folder_name in enumerate(ordering):
        position_of.setdefault(folder_name, position)

    ordered_artwork_dict = {}
    for artwork in not_ordered_dataset.artworks:
        folder_name = artwork.getFolderName()
        if folder_name not in position_of:
            print("Artwork not find in the ordering list")
            raise ValueError(
                "%r is not in the ordering list" % (folder_name,))
        ordered_artwork_dict[position_of[folder_name]] = artwork

    ordered_dataset = copy.deepcopy(not_ordered_dataset)
    # dict.values() gives no ordering guarantee by key; sort explicitly so
    # the result really follows the ordering file.
    ordered_dataset._artworks = [
        ordered_artwork_dict[position]
        for position in sorted(ordered_artwork_dict)
    ]
    if ordered_dataset_out_path is not None:
        ordered_dataset.saveJson(ordered_dataset_out_path)
    return ordered_dataset
예제 #3
0
def main(sparql_query=False,
         json_processing=False,
         make_artwork_dataset=False,
         filter_artwork_dataset=False,
         gps_request_in_dataset=False,
         dbpedia=False,
         dbpedia_width=800,
         google=0,
         flickr=0,
         sleep_between_artwork_download=0.5):
    """Run the selected stages of the dataset-building pipeline, in order.

    Each flag enables one stage; stages that are switched off are skipped.

    Args:
        sparql_query: run ``sparqlQueryJson`` and store its output in
            ``SPARQL_JSON_RESULT``.
        json_processing: post-process ``SPARQL_JSON_RESULT`` into
            ``SPARQL_JSON_RESULT_PROCESSED``.
        make_artwork_dataset: build ``ARTWORK_DATASET`` from the processed
            JSON via ``dbpediaJSON_to_artwork_list``.
        filter_artwork_dataset: filter ``ARTWORK_DATASET`` and save the
            result to ``ARTWORK_DATASET_FILTERED``.
        gps_request_in_dataset: forwarded as ``doGpsToAddressQuery`` when
            building the artwork dataset.
        dbpedia: download the DBpedia thumbnails.
        dbpedia_width: forwarded as ``preferred_dim`` for the thumbnails.
        google: images per artwork to fetch from Google (0 disables).
        flickr: images per artwork to fetch from Flickr (0 disables).
        sleep_between_artwork_download: forwarded as
            ``sleep_between_artwork`` to every download call.
    """
    # --- Stage 1: query the endpoint ---------------------------------
    if sparql_query:
        sparqlQueryJson(
            QUERY,
            output_json_file_path=SPARQL_JSON_RESULT,
            endpoint=ENDPOINT,
            offset_limit=1000,
        )
        print("Query executed!")

    # --- Stage 2: clean up the raw JSON (second step rewrites in place)
    if json_processing:
        processed_path = SPARQL_JSON_RESULT_PROCESSED
        split_string_lists(
            SPARQL_JSON_RESULT,
            splitter_symbol=SPLITTER_SYMBOL,
            output_json_path=processed_path,
        )
        remove_key_from_dict(
            processed_path, "value", ["type"],
            output_json_path=processed_path,
        )
        print("Conversion done!")

    # --- Stage 3: JSON -> artwork dataset -----------------------------
    if make_artwork_dataset:
        dbpediaJSON_to_artwork_list(
            json_file_or_string=SPARQL_JSON_RESULT_PROCESSED,
            out_json_file=ARTWORK_DATASET,
            doGpsToAddressQuery=gps_request_in_dataset,
        )

    # --- Stage 4: filtering -------------------------------------------
    if filter_artwork_dataset:
        loaded = ArtworkDataset().loadJson(ARTWORK_DATASET)
        artwork_dataset_filtering(loaded).saveJson(ARTWORK_DATASET_FILTERED)

    # --- Stage 5: image downloads -------------------------------------
    if not (dbpedia or google > 0 or flickr > 0):
        return

    downloader = ArtworkDataset(img_path=DATASET_PATH)
    downloader.loadJson(ARTWORK_DATASET)
    pause = sleep_between_artwork_download

    if dbpedia:
        downloader.downloadDbPediaThumbs(
            preferred_dim=dbpedia_width,
            sleep_between_artwork=pause,
        )

    if google > 0:
        downloader.downloadGoogleImages(
            images_per_artwork=google,
            sleep_between_artwork=pause,
            google_localization=".it",
        )

    if flickr > 0:
        downloader.downloadFlickrImages(
            api_key="flickr.apikey",
            images_per_artwork=flickr,
            sleep_between_artwork=pause,
        )
예제 #4
0
def show_uri_info(uri):
    """Print the string representation of the artwork stored under ``uri``.

    The dataset is loaded from ``RESULT_DATASET`` and indexed by each
    artwork's folder name; ``uri`` is looked up in that index, so an
    unknown value raises ``KeyError``.
    """
    dataset = ArtworkDataset()
    dataset.loadJson(RESULT_DATASET)

    # Index every artwork by folder name (later duplicates overwrite
    # earlier ones).
    by_folder = {}
    for art in dataset.artworks:
        by_folder[art.getFolderName()] = art

    selected = by_folder[uri]
    print(unicode(selected.getStrRepr(True, True, True)))
예제 #5
0
def main():
    """Find the classes with too few images and optionally re-download them.

    Loads ``RESULT_DATASET``, selects with ``get_few_img_class_dataset`` the
    artworks that are below the ``MIN_*_IMG`` thresholds, and, when
    ``DOWNLOAD`` is set, downloads new Google / Flickr / DBpedia images for
    them according to ``G_NEW_DOWN``, ``F_NEW_DOWN`` and ``SEED_NEW_DOWN``.
    With ``SHOW_DOWNLOAD_STATS`` the selection is run again afterwards and
    the before/after counts are printed.
    """
    old_data = ArtworkDataset()
    old_data.loadJson(RESULT_DATASET)

    new_data = get_few_img_class_dataset(
        old_data,
        min_s_img=MIN_S_IMG,
        min_f_img=MIN_F_IMG,
        min_g_img=MIN_G_IMG,
        out_dataset=ArtworkDataset(img_path=NEW_DATASET_PATH))

    few_img_classes = len(new_data.artworks)
    if SHOW_STATS:
        print("Classes with few images: " + str(few_img_classes))

    if not DOWNLOAD:
        return

    if G_NEW_DOWN > 0:
        new_data.downloadGoogleImages(
            images_per_artwork=G_NEW_DOWN,
            sleep_between_artwork=SLEEP_BETWEEN_ARTWORK)

    if F_NEW_DOWN > 0:
        new_data.downloadFlickrImages(
            api_key="flickr.apikey",
            images_per_artwork=F_NEW_DOWN,
            sleep_between_artwork=SLEEP_BETWEEN_ARTWORK)

    if SEED_NEW_DOWN:
        new_data.downloadDbPediaThumbs(
            preferred_dim=DBPEDIA_WIDTH,
            sleep_between_artwork=SLEEP_BETWEEN_ARTWORK)

    if SHOW_DOWNLOAD_STATS:
        # Keep the result of the re-check: new_data itself is not modified
        # by it, so len(new_data.artworks) would just repeat the "before"
        # figure.  An explicit, empty output dataset keeps the count
        # independent of any earlier call.
        # NOTE(review): get_few_img_class_dataset lists folders under
        # OLD_DATASET_PATH while new_data was created with
        # img_path=NEW_DATASET_PATH - confirm the re-check looks at the
        # folders the downloads were written to.
        still_few = get_few_img_class_dataset(
            new_data,
            min_s_img=MIN_S_IMG,
            min_f_img=MIN_F_IMG,
            min_g_img=MIN_G_IMG,
            out_dataset=ArtworkDataset())
        print("Classes with few images before redownload: " +
              str(few_img_classes))
        print("Classes with few images after redownload: " +
              str(len(still_few.artworks)))
예제 #6
0
def _authors_to_text(authors):
    """Concatenate the non-None fields of every author into one unicode string."""
    if authors is None:
        return u""

    parts = []
    for author in authors:
        if author.name is not None:
            parts.append(u"Author Name: " + author.name + u". ")
        if author.comment is not None:
            parts.append(author.comment + u". ")
        if author.abstract is not None:
            parts.append(author.abstract + u". ")

        # unicode() rather than str(): str() raises UnicodeEncodeError on a
        # unicode value containing non-ASCII characters.
        if author.birthDate is not None:
            parts.append(u"Author Birth: " + unicode(author.birthDate))
        if author.deathDate is not None:
            parts.append(u", Author Death: " + unicode(author.deathDate) +
                         u'. ')
        else:
            parts.append(u'. ')

        if author.movement is not None:
            parts.append(u"Author Movement: " + author.movement + u'. ')
        if author.nationality is not None:
            parts.append(u"Author Nationality: " + author.nationality + u'. ')

    return u"".join(parts)


def prepare_docs(out_file,
                 lambda_on_doc_str=None,
                 include_classes_filter=None,
                 class_filter_per_name=None):
    # type: (basestring, callable(basestring), list[int], list[basestring]) -> None
    """Write one text document per artwork of ``ARTWORK_DATASET`` to a file.

    Each document is a single line made of the artwork title, the authors'
    information, the comment and the description, encoded as UTF-8.

    Args:
        out_file: path of the file to (over)write.
        lambda_on_doc_str: optional callable applied to every document
            string before the trailing newline is added.
        include_classes_filter: optional list of class indexes (positions
            in the dataset); artworks at other positions are skipped.
        class_filter_per_name: optional list parallel to
            ``include_classes_filter`` with the folder name expected for
            each included index; used as a consistency check.

    Raises:
        ValueError: an included artwork's folder name differs from the name
            given for it in ``class_filter_per_name``.
    """
    ad = ArtworkDataset()
    ad.loadJson(ARTWORK_DATASET)

    # The with-block closes the file even when the consistency check raises.
    with open(out_file, 'w') as outf:
        for class_index, artwork in enumerate(ad.artworks):

            if include_classes_filter is not None:
                if class_index not in include_classes_filter:
                    continue  # skip this class!

                print(u"Included class:  " + unicode(class_index))
                # Only touch class_filter_per_name when it was provided;
                # indexing None would raise TypeError.
                expected_name = None
                if class_filter_per_name is not None:
                    i = include_classes_filter.index(class_index)
                    expected_name = class_filter_per_name[i]
                    print(u"With class name: " + unicode(expected_name))
                print("")

                # Just for check:
                if (class_filter_per_name is not None
                        and artwork.getFolderName() != expected_name):
                    raise ValueError(
                        "Wow... stop it! Probably class index not corresponding from dataset to training set?"
                    )

            title = artwork.title if artwork.title is not None else u''
            descr = (artwork.description
                     if artwork.description is not None else u'')
            comment = artwork.comment if artwork.comment is not None else u''
            authors_str = _authors_to_text(artwork.authors)

            # Locations are not part of the document; the final separator
            # is kept so the output format is unchanged.
            doc = (title + u'. ' + authors_str + u'. ' + comment + u'. ' +
                   descr + u'. ')
            if lambda_on_doc_str is not None:
                doc = lambda_on_doc_str(doc)
            doc += u'\n'
            outf.write(doc.encode("UTF-8"))
예제 #7
0
def _mean_latlng(latlng_strings):
    """Return the mean (lat, lng) of "lat,lng" strings, or None if there are none."""
    if latlng_strings is None:
        return None

    lat_sum = 0
    lng_sum = 0
    count = 0
    for entry in latlng_strings:
        latlng = entry.split(',')
        lat_sum += float(latlng[0])
        lng_sum += float(latlng[1])
        count += 1

    if count == 0:
        return None
    return lat_sum / count, lng_sum / count


def _user_declines_retry(prompt):
    """Show ``prompt`` on the console; True only if the user answers "n" or "N"."""
    answer = raw_input(prompt)
    return answer == "n" or answer == "N"


def dbpediaJSON_to_artwork_list(json_file_or_string,
                                out_json_file=None,
                                doGpsToAddressQuery=False):
    """Convert processed query results into an ``ArtworkDataset``.

    Every record of the JSON input becomes one ``Artwork`` with one
    ``Author`` and one ``Museum`` attached.  Coordinates are taken from the
    artwork's own keys when present, otherwise from the museum.  Location
    names come from ``gpsToAddress`` when that lookup is enabled and
    succeeds, otherwise from the location keys of the record.

    Args:
        json_file_or_string: path of a JSON file, or the JSON text itself
            when it does not name an existing file.
        out_json_file: when it is a string, the resulting dataset is saved
            there with ``saveJson``.
        doGpsToAddressQuery: enable the ``gpsToAddress`` lookup (also
            requires the module-level ``gps_to_address`` flag).

    Returns:
        The populated ``ArtworkDataset``, or ``None`` if the user answers
        "n" to a "Retry to connect?" prompt after a geocoder service error
        (the conversion is abandoned at that point).
    """
    if os.path.isfile(json_file_or_string):
        # Context manager so the handle is closed once the text is read.
        with open(json_file_or_string) as json_file:
            json_str = json_file.read()
    else:
        json_str = json_file_or_string
    jdata = json.loads(json_str)

    keeplist = ListRefactor(
        list_rule=ListRule.keep_list,
        one_element_rule=OneElementListRule.follow_list_rule,
        bad_rule=BadOrNoneInListRule.remove,
        empty_rule=EmptyListRule.none_value,
        bad_value=u"")

    pickfirst = ListRefactor(
        list_rule=ListRule.first_value,
        one_element_rule=OneElementListRule.follow_list_rule,
        bad_rule=BadOrNoneInListRule.remove,
        empty_rule=EmptyListRule.none_value,
        bad_value=u"")

    # NOTE(review): str() on unicode text with non-ASCII characters raises
    # UnicodeEncodeError under the default ASCII codec - confirm the inputs
    # or the interpreter's default encoding make this safe.
    concatparag = ListRefactor(
        list_rule=ListRule.reduce_value,
        one_element_rule=OneElementListRule.follow_list_rule,
        bad_rule=BadOrNoneInListRule.remove,
        empty_rule=EmptyListRule.none_value,
        list_reducer_func=lambda x, y: str(x) + "\n\n" + str(y),
        bad_value=u"")

    artworkList = ArtworkDataset()

    for r in jdata:
        artwork = Artwork()

        # ------------------------------------------------------------------
        # Artwork fields and author
        # ------------------------------------------------------------------
        artwork.uri = r[QK_ARTWORK]  # this field is not optional!
        artwork.title = list_in_dict_refactor(r, QK_TITLE, pickfirst)
        artwork.description = list_in_dict_refactor(r, QK_DESCRIPTION,
                                                    concatparag)
        artwork.comment = list_in_dict_refactor(r, QK_COMMENT, concatparag)
        artwork.thumb_link = list_in_dict_refactor(r, QK_THUMB, pickfirst)
        artwork.img_link = list_in_dict_refactor(r, QK_IMG, pickfirst)

        author = Author()
        author.uri = list_in_dict_refactor(r, QK_AUTHOR, pickfirst)
        author.abstract = list_in_dict_refactor(r, QK_AUTHOR_ABSTRACT,
                                                pickfirst)
        author.comment = list_in_dict_refactor(r, QK_AUTHOR_COMMENT, pickfirst)
        author.movement = list_in_dict_refactor(r, QK_AUTHOR_MOVEMENT,
                                                pickfirst)
        author.birthDate = list_in_dict_refactor(r, QK_AUTHOR_BIRTH_DATE,
                                                 pickfirst)
        # Was assigned to the misspelled attribute "deathDateate", which
        # left author.deathDate unset.
        author.deathDate = list_in_dict_refactor(r, QK_AUTHOR_DEATH_DATE,
                                                 pickfirst)

        # First name key that yields a value wins; later keys are only
        # queried while the name is still missing.
        author.name = list_in_dict_refactor(r, QK_AUTHOR_BIRTH_NAME, pickfirst)
        for name_key in (QK_AUTHOR_NAME, QK_AUTHOR_NAME_2, QK_AUTHOR_NAME_3):
            if author.name is not None:
                break
            author.name = list_in_dict_refactor(r, name_key, pickfirst)
        artwork.addAuthor(author)

        # ------------------------------------------------------------------
        # Museum
        # ------------------------------------------------------------------
        mus = Museum()
        mus.uri = list_in_dict_refactor(r, QK_MUS, pickfirst)
        mus.name = list_in_dict_refactor(r, QK_MUS_NAME, pickfirst)

        if QK_MUS_LAT in r and QK_MUS_LNG in r:
            mus.setLatLng(list_in_dict_refactor(r, QK_MUS_LAT, pickfirst),
                          list_in_dict_refactor(r, QK_MUS_LNG, pickfirst))
        elif QK_MUS_LATLNG in r:
            mean = _mean_latlng(
                list_in_dict_refactor(r, QK_MUS_LATLNG, keeplist))
            if mean is not None:
                mus.setLatLng(mean[0], mean[1])

        artwork.addMuseum(mus)

        # ------------------------------------------------------------------
        # GPS position: the artwork's own, else the museum's
        # ------------------------------------------------------------------
        if QK_LAT in r and QK_LNG in r:
            artwork.setLatLng(list_in_dict_refactor(r, QK_LAT, pickfirst),
                              list_in_dict_refactor(r, QK_LNG, pickfirst))
        elif QK_LATLNG in r:
            mean = _mean_latlng(list_in_dict_refactor(r, QK_LATLNG, keeplist))
            if mean is not None:
                artwork.currentLatLng.lat = mean[0]
                artwork.currentLatLng.lng = mean[1]

        if artwork._currentLatLng is None and mus._latLng is not None:
            artwork.currentLatLng = mus._latLng

        # ------------------------------------------------------------------
        # Location names: from the GPS lookup, else from the record
        # ------------------------------------------------------------------
        address = None
        if (doGpsToAddressQuery and gps_to_address == True
                and artwork.currentLatLng.lat is not None
                and artwork.currentLatLng.lng is not None):

            # Loop until an address is obtained or the user gives up.
            while True:
                try:
                    time.sleep(GPS_SERVER_REQUEST_DELAY)
                    address = gpsToAddress(artwork.currentLatLng.lat,
                                           artwork.currentLatLng.lng)
                    if address is not None:
                        break
                    print("\nNone address returned.")
                    if _user_declines_retry("\nTry again? [Y|n]\n> "):
                        break
                except GeocoderTimedOut:
                    continue  # timeouts are always retried
                except GeocoderServiceError as error:
                    message = error.message
                    if "429" in message:  # too many request error
                        if not request_user_action_if_errror_429:
                            # Unattended mode: wait, then retry.
                            time.sleep(GPS_SERVER_REQUEST_DELAY_IF_ERROR_429)
                            continue
                        print("\nServer says too many request: " + message)
                    elif "Errno -2" in message:
                        print("\nCan't reach server: " + message)
                    else:
                        print("\nServer unknown error: " + message)

                    if _user_declines_retry("\nRetry to connect? [Y|n]\n> "):
                        # Abandons the whole conversion, not just this
                        # artwork.
                        return None

            if address is not None:
                # Most specific place name available.
                if address.city is not None:
                    artwork.currentLocations.append(address.city)
                elif address.county_province is not None:
                    artwork.currentLocations.append(address.county_province)
                elif address.state_region is not None:
                    artwork.currentLocations.append(address.state_region)
                elif address.country is not None:
                    artwork.currentLocations.append(address.country)

        if address is None:
            # NOTE(review): the first two candidates used to be the same
            # QK_MUS_LOCATION lookup; the first was probably meant to be an
            # artwork-level location key - confirm which one.
            candidate_locations = [
                list_in_dict_refactor(r, QK_MUS_LOCATION, keeplist),
                list_in_dict_refactor(r, QK_LOCATION_B, keeplist),
                list_in_dict_refactor(r, QK_MUS_LOCATION_B, keeplist)
            ]
            for locations in candidate_locations:
                if locations is not None:
                    artwork.currentLocations = list(locations)
                    break

        # ------------------------------------------------------------------
        # Optional console output
        # ------------------------------------------------------------------
        if print_gps_address_info:
            if address is None:
                print("-- address missing -- (no gps info)")
            else:
                print(address.toString())

        if print_location:
            if not artwork.currentLocations:
                print("<no-location>")
            else:
                for loc in artwork.currentLocations:
                    print(loc + "\t")

        if print_artwork:
            print(vars(artwork))

        artworkList.artworks.append(artwork)

    # basestring so a unicode path is saved too instead of being skipped.
    if isinstance(out_json_file, basestring):
        artworkList.saveJson(out_json_file)

    return artworkList