def parseLocal(self):
    """Parse a locally saved result page and print its first ten entries.

    Reads ``response.html`` from the current working directory, runs it
    through the same item parsing / result building steps as a live
    search, then prints the search header followed by at most ten
    results.  Intended as an offline debugging aid.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open("response.html") as responseFile:
        doc = Request.s_unicodeResponse(responseFile.read())

    # Fresh accumulator in the same layout the parser fills in.
    brokers = {"totalItems": 0, "results": {}}
    items = doc.xpath("//div[contains(@class, 'item result')]")
    result = self.parseItems(items, brokers)
    result = self.createResultItem(result)

    self.printSearchHeader(result)
    for idx, item in enumerate(result.get("results")[:10]):
        self.printBroker(idx, item)
def parseLocal(self):
    """Parse a locally saved result page and print its first ten entries.

    Reads ``response.html`` from the current working directory, runs it
    through the same item parsing / result building steps as a live
    search, then prints the search header followed by at most ten
    results.  Intended as an offline debugging aid.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open("response.html") as responseFile:
        doc = Request.s_unicodeResponse(responseFile.read())

    # Fresh accumulator in the same layout the parser fills in.
    brokers = {"totalItems": 0, "results": {}}
    items = doc.xpath("//div[contains(@class, 'item result')]")
    result = self.parseItems(items, brokers)
    result = self.createResultItem(result)

    self.printSearchHeader(result)
    for idx, item in enumerate(result.get("results")[:10]):
        self.printBroker(idx, item)
def __init__(self):
    """Set up the logger, the HTTP helper, endpoint URLs and lookup tables."""
    self.log = Logger("Hemnet")
    self.request = Request()

    # Endpoints, all derived from the site root.
    siteRoot = "http://www.hemnet.se"
    self.baseUrl = siteRoot
    self.baseSearch = "%s/sok/create" % siteRoot
    self.baseLocation = "%s/locations/show?" % siteRoot
    self.baseResult = "%s/resultat" % siteRoot
    self.searchQuery = {}

    # Location type (English key) -> Swedish display label.
    self.translatedTypes = {
        "municipality": "Kommun",
        "district": u"Område",
        "postal_city": "Stadsdel",
        "region": u"Län",
        "street": "Gata",
        "city": "Stad",
    }

    # Averaged field (English key) -> Swedish display label.
    # NOTE(review): "Prissäkning" looks like a typo for "Prissänkning";
    # confirm before changing the label since it is user-facing text.
    self.translatedAverageTypes = {
        "age": u"List ålder",
        "price": "Medelpris",
        "price_m2": u"Pris per m²",
        "size": u"Storlek (m²)",
        "rooms": "Antal rum",
        "fee": u"Månadsavgift",
        "price_change_up": u"Prisökning (%)",
        "price_change_down": u"Prissäkning (%)",
    }

    # One-letter search type code -> item type value sent in the search form.
    self.searchTypes = {
        "f": "fritidshus",
        "v": "villa",
        "t": "tomt",
        "r": "radhus",
        "g": "gard",
        "b": "bostadsratt",
        "o": "other",
        "a": "all",
    }

    # Fields for which an average is computed, each starting at zero.
    self.itemAverageTypes = {
        "age": 0,
        "price": 0,
        "price_m2": 0,
        "size": 0,
        "rooms": 0,
        "fee": 0,
        "price_change_up": 0,
        "price_change_down": 0,
    }

    # Empty skeleton of a search result.
    self.resultFormat = {"totalItems": 0, "results": {}}

    self.log.info("Initiated Hemnet")
class Hemnet():
    """Client that looks up locations on hemnet.se and runs listing searches.

    Location lookups and searches are cached, both through the
    ``cache.methodcache`` decorator and through explicit entries in
    ``cache.locations`` / ``cache.storage`` keyed by an MD5 digest.
    """

    def __init__(self):
        """Set up the logger, the HTTP helper, endpoint URLs and lookup tables."""
        self.log = Logger("Hemnet")
        self.request = Request()

        # Base objects for searches and results
        self.baseUrl = "http://www.hemnet.se"
        self.baseSearch = self.baseUrl + "/sok/create"
        self.baseLocation = self.baseUrl + "/locations/show?"
        self.baseResult = self.baseUrl + "/resultat"
        self.searchQuery = {}

        # Location type (English key) -> Swedish display label
        self.translatedTypes = {
            "municipality": "Kommun",
            "district": u"Område",
            "postal_city": "Stadsdel",
            "region": u"Län",
            "street": "Gata",
            "city": "Stad"
        }

        # Averaged field (English key) -> Swedish display label
        # NOTE(review): "Prissäkning" looks like a typo for "Prissänkning";
        # confirm before changing the label since it is user-facing text.
        self.translatedAverageTypes = {
            "age": u"List ålder",
            "price": "Medelpris",
            "price_m2": u"Pris per m²",
            "size": u"Storlek (m²)",
            "rooms": "Antal rum",
            "fee": u"Månadsavgift",
            "price_change_up": u"Prisökning (%)",
            "price_change_down": u"Prissäkning (%)"
        }

        # One-letter search type code -> item type value sent in the form
        self.searchTypes = {
            "f": "fritidshus",
            "v": "villa",
            "t": "tomt",
            "r": "radhus",
            "g": "gard",
            "b": "bostadsratt",
            "o": "other",
            "a": "all"
        }

        # Fields to compute an average for, each starting at zero
        self.itemAverageTypes = {
            "age": 0,
            "price": 0,
            "price_m2": 0,
            "size": 0,
            "rooms": 0,
            "fee": 0,
            "price_change_up": 0,
            "price_change_down": 0
        }

        # Empty skeleton of a search result; copied before each search
        self.resultFormat = {
            "totalItems": 0,
            "results": {}
        }

        self.log.info("Initiated Hemnet")

    def createSearchFormData(self, data, specificType='a'):
        """Build the form-post payload for a search on one location.

        The search endpoint expects a form post in a very specific format.

        data         -- location dict with "id", "name" and a
                        "parent_location" dict holding "id" and "name".
        specificType -- one-letter key into ``self.searchTypes``
                        (default 'a'); an unknown key raises KeyError.

        Returns the dict of form fields.
        """
        parent = data.get("parent_location")
        locationData = [{
            "id": data.get("id"),
            "name": data.get("name"),
            "parent_id": parent.get("id"),
            "parent_name": parent.get("name")
        }]

        return {
            "search[location_search]": locationData,
            "search[location_ids][]": data.get("id"),
            "search[region_id]": -1,
            "search[municipality_ids][]": -1,
            "search[country_id]": 0,
            "search[item_types][]": "%s" % self.searchTypes[specificType],
            "search[price_min]": '',
            "search[price_max]": '',
            "search[fee_max]": '',
            "search[rooms_min]": '',
            "search[living_area_min]": '',
            "search[keywords]": '',
            "commit": ''
        }

    def searchRequest(self, query):
        """Post ``query`` to the search endpoint and return the helper's result."""
        return self.request.postRequest(self.baseSearch, query)

    def avgByKey(self, keys, data):
        """Calculate the average value for each key in ``keys``.

        keys -- container of key names to average.
        data -- sequence of dicts; only keys present in ``keys`` are summed.

        Each sum is divided by ``len(data)`` (the number of dicts, not the
        number of dicts that contain the key).  Returns a dict mapping each
        key that occurred at least once to its average; empty input gives
        an empty dict.
        """
        totals = {}
        for entry in data:
            for key in entry:
                if key in keys:
                    totals[key] = totals.get(key, 0) + entry[key]

        # totals is empty whenever data is empty, so no division by zero.
        return {key: total / len(data) for key, total in totals.items()}

    def getLocationQueryURL(self, query):
        """Return the location lookup URL for ``query`` (UTF-8, URL-quoted)."""
        return "%sq=%s" % (self.baseLocation,
                           urllib.quote(query.encode('utf-8')))

    @cache.methodcache.cache('findLocations', expire=72000)
    def findLocations(self, query, extra, area=None):
        """Look up locations matching ``query`` and build search form data.

        query -- free-text location name.
        extra -- search type code forwarded to ``createSearchFormData``.
        area  -- optional substring; when given, only locations whose
                 parent location name contains it are kept.

        Returns a dict with 'search' (form payloads), 'area' and
        'locations' (matching items sorted by descending name similarity).
        The result is also stored in ``cache.locations``.
        """
        queryURL = self.getLocationQueryURL(query)
        cacheKey = hashlib.md5(queryURL).hexdigest()

        cacheResult = cache.locations.get(cacheKey)
        if cacheResult is not None:
            print("Found cached loc")
            return cacheResult

        locResponse = self.request.getResponse(queryURL, None)
        jdata = json.loads(locResponse)
        print(json.dumps(jdata, indent=4))

        locFormData = []
        locations = []
        for item in jdata:
            location = item.get("location")
            item["score"] = Levenshtein.ratio(location.get("name"), query)

            if area is not None:
                parentName = location.get("parent_location").get("name")
                if parentName.find(area) == -1:
                    # Outside the requested area: skip this location.
                    continue

            formData = self.createSearchFormData(location, extra)
            locations.append(item)
            locFormData.append(formData)

        # NOTE(review): only 'locations' is sorted by score; 'search' keeps
        # the response order, so the two lists are not index-aligned.
        # Confirm callers do not pair them by index.
        locations = sorted(locations, key=itemgetter('score'), reverse=True)

        result = {
            'search': locFormData,
            'area': area,
            'locations': locations
        }
        cache.locations[cacheKey] = result
        return result

    @cache.methodcache.cache('performSearch', expire=72000)
    def performSearch(self, searchData):
        """Run a search for ``searchData`` and return the built result item.

        The cache key is the MD5 of the JSON-serialised search data.  On a
        cache hit the stored entry is returned; otherwise the search is
        posted, all result pages are parsed, and every key of the result
        except 'list' is stored in ``cache.storage``.
        """
        import copy

        hashkey = hashlib.md5(json.dumps(searchData,
                                         sort_keys=True)).hexdigest()

        cachedResult = cache.storage.get(hashkey)
        if cachedResult is not None:
            print("Found cached searchResponse")
            return cachedResult

        print("Performing search on " + json.dumps(searchData, indent=4))
        searchRequest = self.searchRequest(searchData)
        searchResponse = self.request.getUnicodeDoc(searchRequest)

        # Parse into a private copy: the parser accumulates into the dict
        # it is given, and the shared template must stay empty so results
        # from one search do not leak into the next.
        resultData = self.parseResult(searchResponse,
                                      copy.deepcopy(self.resultFormat))
        result = self.createResultItem(resultData)

        print("Storing hash " + hashkey)
        # Cache everything except the 'list' entry.
        cache.storage[hashkey] = {
            key: result[key] for key in result.keys() if key != 'list'
        }
        return result

    def parseResult(self, doc, brokers=None):
        """Parse the result items of ``doc`` and of every following page.

        doc     -- parsed document exposing ``xpath``.
        brokers -- accumulator passed on to ``parseItems``; a new dict is
                   created when omitted (a shared ``{}`` default would
                   carry results over between calls).

        Follows the "next_page" link recursively.  A failure while fetching
        or parsing a later page is logged at debug level and ends paging;
        whatever was collected so far is returned.
        """
        if brokers is None:
            brokers = {}

        brokers = self.parseItems(
            doc.xpath("//div[contains(@class, 'item result')]"), brokers)

        nextpage = doc.xpath('//a[@class="next_page"]')
        if not nextpage:
            # Last page reached: the normal way out, not an error.
            return brokers

        url = nextpage[0].attrib.get("href")
        if url is None:
            return brokers

        try:
            self.log.info("Parsing %s" % url)
            nextDoc = self.request.requestUnicodeDoc(self.baseUrl + url)
            brokers = self.parseResult(nextDoc, brokers)
        except Exception as e:
            # Best effort: keep the pages already parsed.
            self.log.debug("ParseResult %s" % e)

        return brokers
def __init__(self):
    """Set up the logger, the HTTP helper, endpoint URLs and lookup tables."""
    self.log = Logger("Hemnet")
    self.request = Request()

    # Endpoints, all derived from the site root.
    siteRoot = "http://www.hemnet.se"
    self.baseUrl = siteRoot
    self.baseSearch = "%s/sok/create" % siteRoot
    self.baseLocation = "%s/locations/show?" % siteRoot
    self.baseResult = "%s/resultat" % siteRoot
    self.searchQuery = {}

    # Location type (English key) -> Swedish display label.
    self.translatedTypes = {
        "municipality": "Kommun",
        "district": u"Område",
        "postal_city": "Stadsdel",
        "region": u"Län",
        "street": "Gata",
        "city": "Stad",
    }

    # Averaged field (English key) -> Swedish display label.
    # NOTE(review): "Prissäkning" looks like a typo for "Prissänkning";
    # confirm before changing the label since it is user-facing text.
    self.translatedAverageTypes = {
        "age": u"List ålder",
        "price": "Medelpris",
        "price_m2": u"Pris per m²",
        "size": u"Storlek (m²)",
        "rooms": "Antal rum",
        "fee": u"Månadsavgift",
        "price_change_up": u"Prisökning (%)",
        "price_change_down": u"Prissäkning (%)",
    }

    # One-letter search type code -> item type value sent in the search form.
    self.searchTypes = {
        "f": "fritidshus",
        "v": "villa",
        "t": "tomt",
        "r": "radhus",
        "g": "gard",
        "b": "bostadsratt",
        "o": "other",
        "a": "all",
    }

    # Fields for which an average is computed, each starting at zero.
    self.itemAverageTypes = {
        "age": 0,
        "price": 0,
        "price_m2": 0,
        "size": 0,
        "rooms": 0,
        "fee": 0,
        "price_change_up": 0,
        "price_change_down": 0,
    }

    # Empty skeleton of a search result.
    self.resultFormat = {"totalItems": 0, "results": {}}

    self.log.info("Initiated Hemnet")
class Hemnet():
    """Client that looks up locations on hemnet.se and runs listing searches.

    Location lookups and searches are cached, both through the
    ``cache.methodcache`` decorator and through explicit entries in
    ``cache.locations`` / ``cache.storage`` keyed by an MD5 digest.
    """

    def __init__(self):
        """Set up the logger, the HTTP helper, endpoint URLs and lookup tables."""
        self.log = Logger("Hemnet")
        self.request = Request()

        # Base objects for searches and results
        self.baseUrl = "http://www.hemnet.se"
        self.baseSearch = self.baseUrl + "/sok/create"
        self.baseLocation = self.baseUrl + "/locations/show?"
        self.baseResult = self.baseUrl + "/resultat"
        self.searchQuery = {}

        # Location type (English key) -> Swedish display label
        self.translatedTypes = {
            "municipality": "Kommun",
            "district": u"Område",
            "postal_city": "Stadsdel",
            "region": u"Län",
            "street": "Gata",
            "city": "Stad"
        }

        # Averaged field (English key) -> Swedish display label
        # NOTE(review): "Prissäkning" looks like a typo for "Prissänkning";
        # confirm before changing the label since it is user-facing text.
        self.translatedAverageTypes = {
            "age": u"List ålder",
            "price": "Medelpris",
            "price_m2": u"Pris per m²",
            "size": u"Storlek (m²)",
            "rooms": "Antal rum",
            "fee": u"Månadsavgift",
            "price_change_up": u"Prisökning (%)",
            "price_change_down": u"Prissäkning (%)"
        }

        # One-letter search type code -> item type value sent in the form
        self.searchTypes = {
            "f": "fritidshus",
            "v": "villa",
            "t": "tomt",
            "r": "radhus",
            "g": "gard",
            "b": "bostadsratt",
            "o": "other",
            "a": "all"
        }

        # Fields to compute an average for, each starting at zero
        self.itemAverageTypes = {
            "age": 0,
            "price": 0,
            "price_m2": 0,
            "size": 0,
            "rooms": 0,
            "fee": 0,
            "price_change_up": 0,
            "price_change_down": 0
        }

        # Empty skeleton of a search result; copied before each search
        self.resultFormat = {
            "totalItems": 0,
            "results": {}
        }

        self.log.info("Initiated Hemnet")

    def createSearchFormData(self, data, specificType='a'):
        """Build the form-post payload for a search on one location.

        The search endpoint expects a form post in a very specific format.

        data         -- location dict with "id", "name" and a
                        "parent_location" dict holding "id" and "name".
        specificType -- one-letter key into ``self.searchTypes``
                        (default 'a'); an unknown key raises KeyError.

        Returns the dict of form fields.
        """
        parent = data.get("parent_location")
        locationData = [{
            "id": data.get("id"),
            "name": data.get("name"),
            "parent_id": parent.get("id"),
            "parent_name": parent.get("name")
        }]

        return {
            "search[location_search]": locationData,
            "search[location_ids][]": data.get("id"),
            "search[region_id]": -1,
            "search[municipality_ids][]": -1,
            "search[country_id]": 0,
            "search[item_types][]": "%s" % self.searchTypes[specificType],
            "search[price_min]": '',
            "search[price_max]": '',
            "search[fee_max]": '',
            "search[rooms_min]": '',
            "search[living_area_min]": '',
            "search[keywords]": '',
            "commit": ''
        }

    def searchRequest(self, query):
        """Post ``query`` to the search endpoint and return the helper's result."""
        return self.request.postRequest(self.baseSearch, query)

    def avgByKey(self, keys, data):
        """Calculate the average value for each key in ``keys``.

        keys -- container of key names to average.
        data -- sequence of dicts; only keys present in ``keys`` are summed.

        Each sum is divided by ``len(data)`` (the number of dicts, not the
        number of dicts that contain the key).  Returns a dict mapping each
        key that occurred at least once to its average; empty input gives
        an empty dict.
        """
        totals = {}
        for entry in data:
            for key in entry:
                if key in keys:
                    totals[key] = totals.get(key, 0) + entry[key]

        # totals is empty whenever data is empty, so no division by zero.
        return {key: total / len(data) for key, total in totals.items()}

    def getLocationQueryURL(self, query):
        """Return the location lookup URL for ``query`` (UTF-8, URL-quoted)."""
        return "%sq=%s" % (self.baseLocation,
                           urllib.quote(query.encode('utf-8')))

    @cache.methodcache.cache('findLocations', expire=72000)
    def findLocations(self, query, extra, area=None):
        """Look up locations matching ``query`` and build search form data.

        query -- free-text location name.
        extra -- search type code forwarded to ``createSearchFormData``.
        area  -- optional substring; when given, only locations whose
                 parent location name contains it are kept.

        Returns a dict with 'search' (form payloads), 'area' and
        'locations' (matching items sorted by descending name similarity).
        The result is also stored in ``cache.locations``.
        """
        queryURL = self.getLocationQueryURL(query)
        cacheKey = hashlib.md5(queryURL).hexdigest()

        cacheResult = cache.locations.get(cacheKey)
        if cacheResult is not None:
            print("Found cached loc")
            return cacheResult

        locResponse = self.request.getResponse(queryURL, None)
        jdata = json.loads(locResponse)
        print(json.dumps(jdata, indent=4))

        locFormData = []
        locations = []
        for item in jdata:
            location = item.get("location")
            item["score"] = Levenshtein.ratio(location.get("name"), query)

            if area is not None:
                parentName = location.get("parent_location").get("name")
                if parentName.find(area) == -1:
                    # Outside the requested area: skip this location.
                    continue

            formData = self.createSearchFormData(location, extra)
            locations.append(item)
            locFormData.append(formData)

        # NOTE(review): only 'locations' is sorted by score; 'search' keeps
        # the response order, so the two lists are not index-aligned.
        # Confirm callers do not pair them by index.
        locations = sorted(locations, key=itemgetter('score'), reverse=True)

        result = {
            'search': locFormData,
            'area': area,
            'locations': locations
        }
        cache.locations[cacheKey] = result
        return result

    @cache.methodcache.cache('performSearch', expire=72000)
    def performSearch(self, searchData):
        """Run a search for ``searchData`` and return the built result item.

        The cache key is the MD5 of the JSON-serialised search data.  On a
        cache hit the stored entry is returned; otherwise the search is
        posted, all result pages are parsed, and every key of the result
        except 'list' is stored in ``cache.storage``.
        """
        import copy

        hashkey = hashlib.md5(json.dumps(searchData,
                                         sort_keys=True)).hexdigest()

        cachedResult = cache.storage.get(hashkey)
        if cachedResult is not None:
            print("Found cached searchResponse")
            return cachedResult

        print("Performing search on " + json.dumps(searchData, indent=4))
        searchRequest = self.searchRequest(searchData)
        searchResponse = self.request.getUnicodeDoc(searchRequest)

        # Parse into a private copy: the parser accumulates into the dict
        # it is given, and the shared template must stay empty so results
        # from one search do not leak into the next.
        resultData = self.parseResult(searchResponse,
                                      copy.deepcopy(self.resultFormat))
        result = self.createResultItem(resultData)

        print("Storing hash " + hashkey)
        # Cache everything except the 'list' entry.
        cache.storage[hashkey] = {
            key: result[key] for key in result.keys() if key != 'list'
        }
        return result

    def parseResult(self, doc, brokers=None):
        """Parse the result items of ``doc`` and of every following page.

        doc     -- parsed document exposing ``xpath``.
        brokers -- accumulator passed on to ``parseItems``; a new dict is
                   created when omitted (a shared ``{}`` default would
                   carry results over between calls).

        Follows the "next_page" link recursively.  A failure while fetching
        or parsing a later page is logged at debug level and ends paging;
        whatever was collected so far is returned.
        """
        if brokers is None:
            brokers = {}

        brokers = self.parseItems(
            doc.xpath("//div[contains(@class, 'item result')]"), brokers)

        nextpage = doc.xpath('//a[@class="next_page"]')
        if not nextpage:
            # Last page reached: the normal way out, not an error.
            return brokers

        url = nextpage[0].attrib.get("href")
        if url is None:
            return brokers

        try:
            self.log.info("Parsing %s" % url)
            nextDoc = self.request.requestUnicodeDoc(self.baseUrl + url)
            brokers = self.parseResult(nextDoc, brokers)
        except Exception as e:
            # Best effort: keep the pages already parsed.
            self.log.debug("ParseResult %s" % e)

        return brokers