def _check_valid(self, parameters):
    """
    Checks that every parameter in the query is valid; any that are not
    are folded into the 'keywords' string instead.

    The method compensates for fields that exist in some searchers but not
    others. At the moment it will not search for a 'without' ('-') modifier
    attached to keywords.
    """
    keywords = ""
    del_list = []
    add_list = []
    parameters_temp = parameters.copy()
    for p in parameters:
        if p not in query_lang and p[0] not in query_mods:
            # unknown field: treat its value as plain keywords
            keywords += list_to_str(parameters[p])
            del_list.append(p)
        elif p[0] in query_mods and len(p) > 1:  # guard so p[1:] below is non-empty
            if self.searcher_identity == "nga":
                # nga understands the bare modifier, so re-key the value under it
                new_key = p[0]
                add_to_dict(parameters_temp, new_key, parameters[p])
                del_list.append(p)
            elif p[1:] not in query_lang:
                # modifier on a field this searcher does not know about
                if not p[0] == "-":
                    keywords += list_to_str(parameters[p])
                del_list.append(p)
    parameters = parameters_temp
    for p in del_list:
        if p in parameters:
            del parameters[p]
    for p in add_list:  # add_list is never filled above, so this loop is currently a no-op
        parameters[self._translate(p[0])] = p[1]
    return keywords, parameters
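
A minimal standalone sketch of the same partitioning idea, using made-up query_lang and query_mods tables rather than the project's real ones:

# Illustration only: a standalone version of the partitioning idea above, with
# made-up query_lang / query_mods tables (not the project's real ones).
query_lang = {"title", "artist"}
query_mods = {"-"}

def partition(parameters):
    keywords, valid = "", {}
    for key, value in parameters.items():
        if key in query_lang or (key[:1] in query_mods and key[1:] in query_lang):
            valid[key] = value          # recognised field, possibly with a modifier
        elif not key.startswith("-"):
            keywords += " " + value     # unknown field: fold its value into keywords
    return keywords.strip(), valid

print(partition({"title": "sunflowers", "colour": "yellow"}))
# -> ('yellow', {'title': 'sunflowers'})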
Example #2
def build_parameters(query, params):
    """ Builds the parameters dictionary to search by, plus the advanced-search URL. """
    if not params:
        # `identifier` is assumed to be defined at module level for this searcher
        translator = Query_Language(identifier)
        params = translator.searcher_translator(query)
    all_words = getValue(params, 'all words')
    exact_phrase = list_to_str(getValue(params, 'exact phrase'))
    exclude = getValue(params, 'exclude words')
    not_in = getValue(params, '-')
    # merge the '-' (without) modifier into the excluded words
    if exclude and not_in:
        exclude += "+" + not_in
    elif not_in:
        exclude = not_in
    if exclude:
        params.update({"exclude words": [exclude]})
    artist = getValue(params, 'artist')
    keywords = getValue(params, 'title')
    accession_number = getValue(params, 'accession number')
    school = getValue(params, 'school')
    classification = getValue(params, 'classification')
    medium = getValue(params, 'medium')
    year1 = getValue(params, 'start date')
    year2 = getValue(params, 'end date')
    access = getValue(params, 'access')
    # build up the url
    url_base = BASE_ADVANCED_SEARCH_URL + "&all_words=" + list_to_str(all_words) + "&exact_phrase=" + list_to_str(exact_phrase) + "&exclude_words=" + list_to_str(exclude)
    url_base += "&artist_last_name=" + artist + "&keywords_in_title=" + keywords + "&accession_num=" + accession_number
    url_base += "&school=" + list_to_str(school) + "&classification=" + list_to_str(classification) + "&medium=" + list_to_str(medium) + "&year=" + list_to_str(year1) + "&year2=" + list_to_str(year2)
    url_base += "&open_access=" + list_to_str(access)
    # replace all whitespace in the parameters with '+'
    url_base = re.sub(" ", "+", url_base)
    return params, url_base
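
The concatenation above relies on every value being URL-safe apart from spaces; as an alternative, the same kind of query string could be assembled with Python 2's urllib.urlencode, which also handles the space-to-'+' escaping that re.sub does above (a sketch with a hypothetical fields dict, not the project's code):

# A minimal alternative sketch, not the project's code: building the same kind of
# query string with Python 2's urllib.urlencode, which also escapes spaces as '+'.
# `fields` is a hypothetical dict of plain string values.
from urllib import urlencode

def build_url_sketch(base_url, fields):
    # e.g. fields = {"all_words": "night watch", "artist_last_name": "rembrandt"}
    return base_url + "&" + urlencode(fields)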
Example #3
def build_parameters(query, params):
    """ Builds the parameters dictionary to search by, plus the advanced-search URL. """
    if not params:
        # `identifier` is assumed to be defined at module level for this searcher
        translator = Query_Language(identifier)
        params = translator.searcher_translator(query)
    all_words = getValue(params, 'all words')
    exact_phrase = list_to_str(getValue(params, 'exact phrase'))
    exclude = getValue(params, 'exclude words')
    not_in = getValue(params, '-')
    if exclude and not_in:
        exclude += "+" + not_in
    elif not_in:
        exclude = not_in
    if exclude:
        params.update({"exclude words": [exclude]})
    artist = getValue(params, 'artist')
    keywords = getValue(params, 'title')
    accession_number = getValue(params, 'accession number')
    school = getValue(params, 'school')
    classification = getValue(params, 'classification')
    medium = getValue(params, 'medium')
    year1 = getValue(params, 'start date')
    year2 = getValue(params, 'end date')
    access = getValue(params, 'access')
    # build up the url
    url_base = BASE_ADVANCED_SEARCH_URL + "&all_words=" + list_to_str(
        all_words) + "&exact_phrase=" + list_to_str(
            exact_phrase) + "&exclude_words=" + list_to_str(exclude)
    url_base += "&artist_last_name=" + artist + "&keywords_in_title=" + keywords + "&accession_num=" + accession_number
    url_base += "&school=" + list_to_str(
        school) + "&classification=" + list_to_str(
            classification) + "&medium=" + list_to_str(
                medium) + "&year=" + list_to_str(
                    year1) + "&year2=" + list_to_str(year2)
    url_base += "&open_access=" + list_to_str(access)
    # replace all whitespace in the parameters with '+'
    url_base = re.sub(" ", "+", url_base)
    return params, url_base
Example #4
def search(query, params, off, num_results_wanted):
    """
    Gets search results - method must be called `search`.
    query -- search query
    params -- parameters received from sidebar - if not sidebar they are empty
    off -- offset - number of images to offset the result by
    num_results_wanted -- images per page
    """
    try:
        if not query and params == {}:
            return Result(0, off), get_empty_params()
        arg = get_empty_params()
        off = int(off)
        params, url_base = build_parameters(query, params)
        no_query = True
        if "query_string" in params:
            arg["query_string"] = fix_query_string(params["query_string"])
            del params["query_string"]
        else:
            # rebuild the query string from the remaining parameters
            query_string = ""
            if "all words" in params:
                query_string = params["all words"]
            for key in params:
                if not key == "all words":
                    if not query_string == "":
                        query_string += ","
                    value = list_to_str(params[key])
                    query_string += query_dict[key] + "=" + value
            arg["query_string"] = fix_query_string(query_string)

        for key in params:
            value = params[key]
            if isinstance(value, list):
                value = list_to_str(value)
            no_query = False
            arg.update({key: value})
        if no_query:
            return Result(0, off), arg
        # get the image details
        searchhtml, firstIdIndex = __getHTMLPage_Containing_SearchResult(url_base, off)
        website_search_results_parser = BeautifulSoup(searchhtml)
        if not any_results(website_search_results_parser):
            return Result(0, off), arg
        list_of_image_ids, thumbnail_urls, image_descriptions = __parse_html_for_image_details(website_search_results_parser, num_results_wanted, firstIdIndex)
        # ensure the correct number of images found, adjusted by how many there are to have
        count = __count(website_search_results_parser)
        if off > count:
            # offset is past the end of the results, so start again from the first page
            return search(query, params, 0, 50)
        num_results_wanted = min(num_results_wanted, count - off)
        if len(list_of_image_ids) < num_results_wanted:    # need more results and the next page has some
            tmp = 0
            while len(list_of_image_ids) < num_results_wanted and tmp < 1:
                searchhtml, firstIdIndex = __getHTMLPage_Containing_SearchResult(url_base, off + len(list_of_image_ids))
                website_search_results_parser = BeautifulSoup(searchhtml)
                results = __parse_html_for_image_details(website_search_results_parser, num_results_wanted, firstIdIndex)
                if len(results[0]) == 0:
                    break
                if len(results[0]) < 25:
                    tmp = 1
                for i in range(0, len(results[0])):
                    list_of_image_ids.append(results[0][i])
                    thumbnail_urls.append(results[1][i])
                    image_descriptions.append(results[2][i])
        if len(list_of_image_ids) > num_results_wanted:    # we've found too many, so remove some. Note, thumbs and image_descriptions self-regulate to never be more
            list_of_image_ids = list_of_image_ids[:num_results_wanted]
        # make Result that the rest of UnitedSearch can deal with
        resulting_images = Result(__count(website_search_results_parser), off + num_results_wanted)
        for i in range(len(list_of_image_ids)):
            resulting_images.addImage(__createImage(list_of_image_ids[i], thumbnail_urls[i], image_descriptions[i]))
        if is_simple_search(arg):
            arg.update({"simple_keywords": str(arg["all words"])})
            arg.update({"all words": []})
        return resulting_images, arg
    except:
        # on any failure, fall back to an empty result
        return Result(0, off), get_empty_params()
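
The while loop above keeps requesting further result pages until enough image ids have been gathered; a minimal standalone sketch of that fetch-until-full pattern, with a hypothetical fetch_page callable standing in for the page fetch and parse steps:

# A minimal standalone sketch of the fetch-until-full pattern used above.
# `fetch_page` is a hypothetical callable returning one (possibly short) page
# of items for a given offset; it is not part of this searcher.
def collect_results(fetch_page, offset, wanted):
    items = []
    while len(items) < wanted:
        batch = fetch_page(offset + len(items))
        if not batch:            # no more results available
            break
        items.extend(batch)
    return items[:wanted]        # trim any overshoot from the last page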
Example #5
def _build_simple_URL(query_terms, per_page, page):
    """ Returns a search url with all the given keywords, at the given page and with the specified number of results per page. """
    facets = ""
    keywords = ""
    arg = get_empty_params()
    facet_arg = []
    query_string = ""
    sidebar = True
    if 'query_string' in query_terms:
        query_string = query_terms['query_string']
        arg.update({"query_string": query_string})
        sidebar = False
        del query_terms['query_string']
    if 'keywords' in query_terms:
        keywords = list_to_str(query_terms['keywords'])
        arg.update({"keywords": keywords})
        if sidebar:
            query_string = list_to_str(query_terms['keywords'])
        del query_terms['keywords']

    for q in query_terms:
        # keys look like '<modifier>_<facet>'; bare keys default to the 'and' modifier
        q_split = q.split('_')
        if len(q_split) > 1:
            query_mod = q_split[0]
            facet = q[len(query_mod) + 1:]
        else:
            query_mod = 'and'
            facet = q

        if facet == "keywords":
            keywords += " " + list_to_str(query_terms[q])
        else:
            value_list = query_terms[q]
            if not isinstance(value_list, list):
                value_list = [value_list]
            for value in value_list:
                facets += '&' + query_mod + '[' + facet + '][]=' + value
                if sidebar:
                    query_string += "," + query_dict[query_mod] + query_dict[facet] + "=" + value
                facet_arg.append([query_mod, [facet, value]])

    keywords = keywords.replace(" ", "+")
    if "query_string" not in arg:
        # collapse duplicate separators and strip leading/trailing commas
        while ",," in query_string:
            query_string = query_string.replace(",,", ",")
        if query_string.startswith(","):
            query_string = query_string[1:]
        while query_string.endswith(","):
            query_string = query_string[:-1]
        arg['query_string'] = query_string
    url = (BASE_SEARCH_API_URL + "&text=" + keywords + BLOCKED_CONTENT_PARTNERS + facets +
           CATEGORY_VALUE + "&per_page=" + str(per_page) + "&page=" + str(page))
    if DEBUG:
        print("DIGITAL NZ URL = " + url)
    # pad the facet list out to five entries
    while len(facet_arg) < 5:
        facet_arg.append([])
    arg.update({"field": facet_arg})

    return url, arg
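
For illustration only, a tiny standalone version of the '<modifier>_<facet>' key convention the facet loop above relies on, with made-up term names:

# Illustration only: how '<modifier>_<facet>' keys are split, with made-up terms.
def split_term(key):
    parts = key.split('_', 1)
    return (parts[0], parts[1]) if len(parts) > 1 else ('and', key)

print(split_term('or_category'))   # ('or', 'category')
print(split_term('keywords'))      # ('and', 'keywords')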
Example #6
def _build_simple_URL(query_terms, per_page, page):
    """ returns a search url with all the given keywords, at the given page and with the number or specified results per page """
    facets = ""
    keywords = ""
    arg = get_empty_params()
    facet_arg = []
    query_string = ""
    sidebar = True
    if 'query_string' in query_terms:
        query_string = query_terms['query_string']
        arg.update({"query_string": query_string})
        sidebar = False
        del query_terms['query_string']
    if 'keywords' in query_terms:
        keywords = list_to_str(query_terms['keywords'])
        arg.update({"keywords": keywords})
        if sidebar:
            query_string = list_to_str(query_terms['keywords'])
        del query_terms['keywords']

    for q in query_terms:
        q_split = q.split('_')
        if len(q_split) > 1:
            query_mod = q_split[0]
            facet = q[len(query_mod) + 1:]
        else:
            query_mod = 'and'
            facet = q

        if facet == "keywords":
            keywords += " " + list_to_str(query_terms[q])
        else:
            value_list = query_terms[q]
            if not isinstance(value_list, list):
                value_list = [value_list]
            for value in value_list:
                facets += '&' + query_mod + '[' + facet + '][]=' + value

        value_list = query_terms[q]
        if not isinstance(value_list, list):
            value_list = [value_list]
        for value in value_list:
            if sidebar and not facet == "keywords":
                query_string += "," + query_dict[query_mod] + query_dict[
                    facet] + "=" + value

            if facet != "keywords":
                facet_arg.append([query_mod, [facet, value]])

    keywords = keywords.replace(" ", "+")
    if not "query_string" in arg:
        while "''" in query_string:
            query_string = query_string.replace(",,", ",")
        if query_string.startswith(","):
            query_string = query_string[1:]
        while query_string.endswith(","):
            query_string.pop()
        arg['query_string'] = query_string
    url = (BASE_SEARCH_API_URL + "&text=" + keywords +
           BLOCKED_CONTENT_PARTNERS + facets + CATEGORY_VALUE + "&per_page=" +
           str(per_page) + "&page=" + str(page))
    if DEBUG:
        print("DIGITAL NZ URL = " + url)
    while len(facet_arg) < 5:
        facet_arg.append([])
    arg.update({"field": facet_arg})

    return url, arg
Example #7
def search(query, params, off, num_results_wanted):
    """
    Gets search results - method must be called `search`.
    query -- search query
    params -- parameters received from sidebar - if not sidebar they are empty
    off -- offset - number of images to offset the result by
    num_results_wanted -- images per page
    """
    try:
        if not query and params == {}:
            return Result(0, off), get_empty_params()
        arg = get_empty_params()
        off = int(off)
        params, url_base = build_parameters(query, params)
        no_query = True
        if "query_string" in params:
            arg["query_string"] = fix_query_string(params["query_string"])
            del params["query_string"]
        else:
            query_string = ""
            if "all words" in params:
                query_string = params["all words"]
            for key in params:
                if not key == "all words":
                    if not query_string == "":
                        query_string += ","
                    value = list_to_str(params[key])

                    query_string += query_dict[key] + "=" + value
            arg["query_string"] = fix_query_string(query_string)

        for key in params:
            value = params[key]
            if isinstance(value, list):
                value = list_to_str(value)

            no_query = False
            arg.update({key: value})
        if no_query:
            return Result(0, off), arg
        # get the image details
        searchhtml, firstIdIndex = __getHTMLPage_Containing_SearchResult(
            url_base, off)
        website_search_results_parser = BeautifulSoup(searchhtml)
        if not any_results(website_search_results_parser):
            return Result(0, off), arg
        list_of_image_ids, thumbnail_urls, image_descriptions = __parse_html_for_image_details(
            website_search_results_parser, num_results_wanted, firstIdIndex)
        # ensure the correct number of images found, adjusted by how many there are to have
        count = __count(website_search_results_parser)
        if off > count:
            # offset is past the end of the results, so start again from the first page
            return search(query, params, 0, 50)
        num_results_wanted = min(num_results_wanted, count - off)
        if len(list_of_image_ids) < num_results_wanted:  # need more results and the next page has some
            tmp = 0
            while len(list_of_image_ids) < num_results_wanted and tmp < 1:
                searchhtml, firstIdIndex = __getHTMLPage_Containing_SearchResult(
                    url_base, off + len(list_of_image_ids))
                website_search_results_parser = BeautifulSoup(searchhtml)
                results = __parse_html_for_image_details(
                    website_search_results_parser, num_results_wanted,
                    firstIdIndex)
                if len(results[0]) == 0:
                    break
                if len(results[0]) < 25:
                    tmp = 1
                for i in range(0, len(results[0])):
                    list_of_image_ids.append(results[0][i])
                    thumbnail_urls.append(results[1][i])
                    image_descriptions.append(results[2][i])
        if len(list_of_image_ids) > num_results_wanted:  # we've found too many, so remove some. Note, thumbs and image_descriptions self-regulate to never be more
            list_of_image_ids = list_of_image_ids[:num_results_wanted]
        # make Result that the rest of UnitedSearch can deal with
        resulting_images = Result(__count(website_search_results_parser),
                                  off + num_results_wanted)
        for i in range(len(list_of_image_ids)):
            resulting_images.addImage(
                __createImage(list_of_image_ids[i], thumbnail_urls[i],
                              image_descriptions[i]))
        if is_simple_search(arg):
            arg.update({"simple_keywords": str(arg["all words"])})
            arg.update({"all words": []})
        return resulting_images, arg
    except:
        # on any failure, fall back to an empty result
        return Result(0, off), get_empty_params()