def _check_valid(self, parameters):
    """Split *parameters* into supported query terms and loose keywords.

    Any key that is not in ``query_lang`` and carries no recognised
    modifier prefix is folded into the returned keyword string; this
    compensates for values that are present in some searchers but not
    others.  Keys with a '-' (without) modifier on an unknown field are
    dropped without being searched as keywords at the moment.

    parameters -- dict of query-key -> value(s)
    Returns (keywords, parameters) where ``parameters`` retains only the
    keys this searcher can handle directly.
    """
    keywords = ""
    del_list = []                       # keys to strip from the dict afterwards
    parameters_temp = parameters.copy()
    for p in parameters:
        if not p:
            # BUGFIX: an empty key used to crash the p[0] modifier checks
            # below (the old `len(p)>0` test ran only *after* indexing p[0]).
            continue
        if p not in query_lang and p[0] not in query_mods:
            # Unknown field with no modifier: search its value as keywords.
            keywords += list_to_str(parameters[p])
            del_list.append(p)
        elif p[0] in query_mods:
            if self.searcher_identity == "nga":
                # NGA only understands the bare modifier, so re-file the
                # value under the modifier character itself.
                add_to_dict(parameters_temp, p[0], parameters[p])
                del_list.append(p)
            elif p[1:] not in query_lang:
                # Modifier on a field the searcher does not know.
                if not p[0] == "-":
                    keywords += list_to_str(parameters[p])
                # '-' (without) terms are dropped entirely: negated
                # keyword search is not supported yet.
                del_list.append(p)
    # NOTE: the old `add_list` / `self._translate` loop was dead code --
    # nothing ever appended to add_list -- and has been removed.
    parameters = parameters_temp
    for p in del_list:
        if p in parameters:
            del parameters[p]
    return keywords, parameters
def build_parameters(query, params):
    """Build the search-parameter dict and the advanced-search URL for it.

    query  -- free-text search query; only translated when *params* is empty
    params -- parameter dict from the sidebar (may be empty)
    Returns (params, url_base) where url_base has spaces encoded as '+'.
    """
    if not params:
        # Nothing from the sidebar: translate the raw query text instead.
        translator = Query_Language(identifier)
        params = translator.searcher_translator(query)
    all_words = getValue(params, 'all words')
    exact_phrase = list_to_str(getValue(params, 'exact phrase'))
    exclude = getValue(params, 'exclude words')
    not_in = getValue(params, '-')
    # Merge '-' (without) terms into the excluded words.
    if not_in:
        if exclude:
            exclude += "+" + not_in
        else:
            exclude = not_in
    if exclude:
        params.update({"exclude words": [exclude]})
    artist = getValue(params, 'artist')
    keywords = getValue(params, 'title')
    accession_number = getValue(params, 'accession number')
    school = getValue(params, 'school')
    classification = getValue(params, 'classification')
    medium = getValue(params, 'medium')
    year1 = getValue(params, 'start date')
    year2 = getValue(params, 'end date')
    access = getValue(params, 'access')
    # Assemble the URL from its pieces in the order the API expects.
    # NOTE(review): artist/keywords/accession_number skip list_to_str unlike
    # the other fields -- presumably they are always plain strings; confirm.
    url_base = "".join([
        BASE_ADVANCED_SEARCH_URL,
        "&all_words=", list_to_str(all_words),
        "&exact_phrase=", list_to_str(exact_phrase),
        "&exclude_words=", list_to_str(exclude),
        "&artist_last_name=", artist,
        "&keywords_in_title=", keywords,
        "&accession_num=", accession_number,
        "&school=", list_to_str(school),
        "&classification=", list_to_str(classification),
        "&medium=", list_to_str(medium),
        "&year=", list_to_str(year1),
        "&year2=", list_to_str(year2),
        "&open_access=", list_to_str(access),
    ])
    # spaces are not url-safe; the search API expects '+'
    url_base = url_base.replace(" ", "+")
    return params, url_base
def build_parameters(query, params):
    """Assemble the parameter dict and the matching advanced-search URL.

    query  -- raw query text, only consulted when *params* is empty
    params -- sidebar parameter dict (may be empty)
    Returns (params, url_base) with spaces in the URL replaced by '+'.
    """
    if not params:
        # No sidebar input -- translate the free-text query instead.
        translator = Query_Language(identifier)
        params = translator.searcher_translator(query)
    all_words = getValue(params, 'all words')
    exact_phrase = list_to_str(getValue(params, 'exact phrase'))
    exclude = getValue(params, 'exclude words')
    not_in = getValue(params, '-')
    # Fold any '-' (without) terms into the excluded-words value.
    if not_in:
        exclude = (exclude + "+" + not_in) if exclude else not_in
    if exclude:
        params.update({"exclude words": [exclude]})
    artist = getValue(params, 'artist')
    keywords = getValue(params, 'title')
    accession_number = getValue(params, 'accession number')
    school = getValue(params, 'school')
    classification = getValue(params, 'classification')
    medium = getValue(params, 'medium')
    year1 = getValue(params, 'start date')
    year2 = getValue(params, 'end date')
    access = getValue(params, 'access')
    # Build the URL one field at a time, in the order the API expects.
    # NOTE(review): artist/keywords/accession_number are concatenated without
    # list_to_str, unlike the other fields -- presumably plain strings; verify.
    segments = (
        ("&all_words=", list_to_str(all_words)),
        ("&exact_phrase=", list_to_str(exact_phrase)),
        ("&exclude_words=", list_to_str(exclude)),
        ("&artist_last_name=", artist),
        ("&keywords_in_title=", keywords),
        ("&accession_num=", accession_number),
        ("&school=", list_to_str(school)),
        ("&classification=", list_to_str(classification)),
        ("&medium=", list_to_str(medium)),
        ("&year=", list_to_str(year1)),
        ("&year2=", list_to_str(year2)),
        ("&open_access=", list_to_str(access)),
    )
    url_base = BASE_ADVANCED_SEARCH_URL
    for label, value in segments:
        url_base += label + value
    # replace all whitespace from the parameters
    url_base = re.sub(" ", "+", url_base)
    return params, url_base
def search(query, params, off, num_results_wanted) :
    # NOTE(review): the whole body runs inside a bare `except` that maps any
    # failure to an empty result -- real errors are invisible from here.
    try:
        """ Gets search results - method must be called `search`
        query -- search query
        params -- parameters received from sidebar - if not sidebar they are empty
        off -- offset - number of images to offset the result by
        num_results_wanted -- images per page
        """
        # Nothing to search for at all: empty result, empty sidebar params.
        if not query and params == {}:
            return Result(0, off), get_empty_params()
        arg = get_empty_params()
        off = (int)(off)  # C-style spelling; equivalent to int(off)
        params, url_base = build_parameters(query, params)
        no_query = True;
        if "query_string" in params:
            # A prebuilt query string was passed through -- reuse it verbatim.
            arg["query_string"] = fix_query_string(params["query_string"])
            del params["query_string"]
        else:
            # Rebuild a display query string from the individual parameters,
            # starting from 'all words' and appending 'key=value' pairs.
            query_string = ""
            if "all words" in params:
                query_string = params["all words"]
            for key in params:
                if not key == "all words":
                    if not query_string == "":
                        query_string += ","
                    value = list_to_str(params[key])
                    query_string += query_dict[key] + "=" + value
            arg["query_string"] = fix_query_string(query_string)
        # Copy every parameter into the sidebar args, flattening lists; seeing
        # at least one key proves we actually have a query to run.
        for key in params:
            value = params[key]
            if isinstance(value,list):
                value = list_to_str(value)
            no_query=False
            arg.update({key:value})
        if no_query:
            return Result(0, off), arg
        # get the image details
        searchhtml, firstIdIndex = __getHTMLPage_Containing_SearchResult(url_base, off)
        website_search_results_parser = BeautifulSoup(searchhtml)
        if not any_results(website_search_results_parser) :
            return Result(0, off), arg
        list_of_image_ids, thumbnail_urls, image_descriptions = __parse_html_for_image_details(website_search_results_parser, num_results_wanted, firstIdIndex)
        # ensure the correct number of images found
        num_results_wanted = min(num_results_wanted, __count(website_search_results_parser))  # adjusted by how many there are to have
        count = __count(website_search_results_parser)
        if off>count:
            # Offset ran past the last result: restart from the first page.
            # NOTE(review): the recursion hard-codes 50 results per page here.
            return search(query,params,0,50)
        else:
            num_results_wanted = min(num_results_wanted, __count(website_search_results_parser)-off)
        if len(list_of_image_ids) < num_results_wanted:  # need more results and the next page has some
            # tmp becomes 1 once a short (<25 item) page signals no more data.
            tmp = 0
            while len(list_of_image_ids) < num_results_wanted and tmp<1:
                searchhtml, firstIdIndex = __getHTMLPage_Containing_SearchResult(url_base, off+len(list_of_image_ids))
                website_search_results_parser = BeautifulSoup(searchhtml)
                results = __parse_html_for_image_details(website_search_results_parser, num_results_wanted, firstIdIndex)
                if len(results[0])==0:
                    break
                if len(results[0])<25 :
                    tmp=1
                for i in range(0, len(results[0])) :
                    list_of_image_ids.append(results[0][i])
                    thumbnail_urls.append(results[1][i])
                    image_descriptions.append(results[2][i])
        if (len(list_of_image_ids) > num_results_wanted) :  # we've found too many, so remove some. Note, thumbs and image_descriptions self-regulate to never be more
            while (len(list_of_image_ids) > num_results_wanted) :
                list_of_image_ids.pop()
        # make Result that the rest of UnitedSearch can deal with
        resulting_images = Result(__count(website_search_results_parser), off+num_results_wanted)
        for i in range(len(list_of_image_ids)) :
            resulting_images.addImage(__createImage(list_of_image_ids[i], thumbnail_urls[i], image_descriptions[i]))
        if is_simple_search(arg):
            # Simple searches report keywords separately and clear 'all words'.
            arg.update({"simple_keywords":str(arg["all words"])})
            arg.update({"all words":[]})
        return resulting_images, arg
    except:
        # NOTE(review): bare except -- any error becomes an empty result set.
        return Result(0, off), get_empty_params()
def _build_simple_URL(query_terms, per_page, page): """ returns a search url with all the given keywords, at the given page and with the number or specified results per page """ facets="" keywords="" arg = get_empty_params() facet_arg = [] query_string = "" sidebar = True if 'query_string' in query_terms: query_string=query_terms['query_string'] arg.update({"query_string":query_string}) sidebar = False del query_terms['query_string'] if 'keywords' in query_terms: keywords= list_to_str(query_terms['keywords']) arg.update({"keywords":keywords}) if sidebar: query_string = list_to_str(query_terms['keywords']) del query_terms['keywords'] for q in query_terms: q_split = q.split('_') if len(q_split)>1: query_mod = q_split[0] facet = q[len(query_mod)+1:] else: query_mod = 'and' facet = q if facet == "keywords": keywords += " "+list_to_str(query_terms[q]) else: value_list = query_terms[q] if not isinstance(value_list,list): value_list = [value_list] for value in value_list: facets += '&'+query_mod+'['+facet+'][]='+value value_list = query_terms[q] if not isinstance(value_list,list): value_list = [value_list] for value in value_list: if sidebar and not facet=="keywords": query_string += ","+query_dict[query_mod]+query_dict[facet]+"="+value if facet!= "keywords": facet_arg.append([query_mod,[facet,value]]) keywords = keywords.replace(" ","+") if not "query_string" in arg: while "''" in query_string: query_string = query_string.replace(",,",",") if query_string.startswith(","): query_string = query_string[1:] while query_string.endswith(","): query_string.pop() arg['query_string'] = query_string url =( BASE_SEARCH_API_URL+"&text="+keywords+BLOCKED_CONTENT_PARTNERS+facets+ CATEGORY_VALUE+"&per_page="+str(per_page)+"&page="+str(page)) if DEBUG: print "DIGITAL NZ URL = "+ url while len(facet_arg)<5: facet_arg.append([]) arg.update({"field":facet_arg}) return url, arg
def _build_simple_URL(query_terms, per_page, page): """ returns a search url with all the given keywords, at the given page and with the number or specified results per page """ facets = "" keywords = "" arg = get_empty_params() facet_arg = [] query_string = "" sidebar = True if 'query_string' in query_terms: query_string = query_terms['query_string'] arg.update({"query_string": query_string}) sidebar = False del query_terms['query_string'] if 'keywords' in query_terms: keywords = list_to_str(query_terms['keywords']) arg.update({"keywords": keywords}) if sidebar: query_string = list_to_str(query_terms['keywords']) del query_terms['keywords'] for q in query_terms: q_split = q.split('_') if len(q_split) > 1: query_mod = q_split[0] facet = q[len(query_mod) + 1:] else: query_mod = 'and' facet = q if facet == "keywords": keywords += " " + list_to_str(query_terms[q]) else: value_list = query_terms[q] if not isinstance(value_list, list): value_list = [value_list] for value in value_list: facets += '&' + query_mod + '[' + facet + '][]=' + value value_list = query_terms[q] if not isinstance(value_list, list): value_list = [value_list] for value in value_list: if sidebar and not facet == "keywords": query_string += "," + query_dict[query_mod] + query_dict[ facet] + "=" + value if facet != "keywords": facet_arg.append([query_mod, [facet, value]]) keywords = keywords.replace(" ", "+") if not "query_string" in arg: while "''" in query_string: query_string = query_string.replace(",,", ",") if query_string.startswith(","): query_string = query_string[1:] while query_string.endswith(","): query_string.pop() arg['query_string'] = query_string url = (BASE_SEARCH_API_URL + "&text=" + keywords + BLOCKED_CONTENT_PARTNERS + facets + CATEGORY_VALUE + "&per_page=" + str(per_page) + "&page=" + str(page)) if DEBUG: print "DIGITAL NZ URL = " + url while len(facet_arg) < 5: facet_arg.append([]) arg.update({"field": facet_arg}) return url, arg
def search(query, params, off, num_results_wanted):
    # NOTE(review): the whole body runs inside a bare `except` that maps any
    # failure to an empty result -- real errors are invisible from here.
    try:
        """ Gets search results - method must be called `search`
        query -- search query
        params -- parameters received from sidebar - if not sidebar they are empty
        off -- offset - number of images to offset the result by
        num_results_wanted -- images per page
        """
        # Nothing to search for at all: empty result, empty sidebar params.
        if not query and params == {}:
            return Result(0, off), get_empty_params()
        arg = get_empty_params()
        off = (int)(off)  # C-style spelling; equivalent to int(off)
        params, url_base = build_parameters(query, params)
        no_query = True
        if "query_string" in params:
            # A prebuilt query string was passed through -- reuse it verbatim.
            arg["query_string"] = fix_query_string(params["query_string"])
            del params["query_string"]
        else:
            # Rebuild a display query string from the individual parameters,
            # starting from 'all words' and appending 'key=value' pairs.
            query_string = ""
            if "all words" in params:
                query_string = params["all words"]
            for key in params:
                if not key == "all words":
                    if not query_string == "":
                        query_string += ","
                    value = list_to_str(params[key])
                    query_string += query_dict[key] + "=" + value
            arg["query_string"] = fix_query_string(query_string)
        # Copy every parameter into the sidebar args, flattening lists; seeing
        # at least one key proves we actually have a query to run.
        for key in params:
            value = params[key]
            if isinstance(value, list):
                value = list_to_str(value)
            no_query = False
            arg.update({key: value})
        if no_query:
            return Result(0, off), arg
        # get the image details
        searchhtml, firstIdIndex = __getHTMLPage_Containing_SearchResult(
            url_base, off)
        website_search_results_parser = BeautifulSoup(searchhtml)
        if not any_results(website_search_results_parser):
            return Result(0, off), arg
        list_of_image_ids, thumbnail_urls, image_descriptions = __parse_html_for_image_details(
            website_search_results_parser, num_results_wanted, firstIdIndex)
        # ensure the correct number of images found
        num_results_wanted = min(num_results_wanted,
                                 __count(website_search_results_parser)
                                 )  # adjusted by how many there are to have
        count = __count(website_search_results_parser)
        if off > count:
            # Offset ran past the last result: restart from the first page.
            # NOTE(review): the recursion hard-codes 50 results per page here.
            return search(query, params, 0, 50)
        else:
            num_results_wanted = min(
                num_results_wanted,
                __count(website_search_results_parser) - off)
        if len(
                list_of_image_ids
        ) < num_results_wanted:  # need more results and the next page has some
            # tmp becomes 1 once a short (<25 item) page signals no more data.
            tmp = 0
            while len(list_of_image_ids) < num_results_wanted and tmp < 1:
                searchhtml, firstIdIndex = __getHTMLPage_Containing_SearchResult(
                    url_base, off + len(list_of_image_ids))
                website_search_results_parser = BeautifulSoup(searchhtml)
                results = __parse_html_for_image_details(
                    website_search_results_parser, num_results_wanted,
                    firstIdIndex)
                if len(results[0]) == 0:
                    break
                if len(results[0]) < 25:
                    tmp = 1
                for i in range(0, len(results[0])):
                    list_of_image_ids.append(results[0][i])
                    thumbnail_urls.append(results[1][i])
                    image_descriptions.append(results[2][i])
        if (
                len(list_of_image_ids) > num_results_wanted
        ):  # we've found too many, so remove some. Note, thumbs and image_descriptions self-regulate to never be more
            while (len(list_of_image_ids) > num_results_wanted):
                list_of_image_ids.pop()
        # make Result that the rest of UnitedSearch can deal with
        resulting_images = Result(__count(website_search_results_parser),
                                  off + num_results_wanted)
        for i in range(len(list_of_image_ids)):
            resulting_images.addImage(
                __createImage(list_of_image_ids[i], thumbnail_urls[i],
                              image_descriptions[i]))
        if is_simple_search(arg):
            # Simple searches report keywords separately and clear 'all words'.
            arg.update({"simple_keywords": str(arg["all words"])})
            arg.update({"all words": []})
        return resulting_images, arg
    except:
        # NOTE(review): bare except -- any error becomes an empty result set.
        return Result(0, off), get_empty_params()