コード例 #1
0
 def get_room_info_from_web_site(self, flag):
     """Fetch this room's page from the web site and parse its properties.

     Returns True when a response was received and passed to the tree
     parser, and False when the request produced no response. Any
     exception raised along the way is logged and re-raised.
     """
     try:
         logger.info("-" * 70)
         logger.info("Room %s: getting from Airbnb web site", self.room_id)
         response = airbnb_ws.ws_request_with_repeats(
             self.config, self.config.URL_ROOM_ROOT + str(self.room_id))
         # Guard clause: nothing to parse without a response.
         if response is None:
             logger.info("Room %s: not found", self.room_id)
             return False
         document = html.fromstring(response.text)
         self.__get_room_info_from_tree(document, flag)
         logger.info("Room %s: found", self.room_id)
         return True
     except (KeyboardInterrupt, SystemExit):
         # Let interrupts and exits propagate without being logged.
         raise
     except Exception as ex:
         logger.exception("Room %s: failed to retrieve from web site.",
                          self.room_id)
         logger.error("Exception: %s", type(ex))
         raise
コード例 #2
0
 def get_search_page_info_zipcode(self, room_type, zipcode, guests,
                                  section_offset, flag):
     """Search one results page for a zipcode and process its listings.

     Sends a search request for the given room type, zipcode, guest count
     and page offset, then builds a listing from every search result.
     With FLAGS_ADD each listing that has a host_id is saved and
     log_progress is called; with FLAGS_PRINT each such listing is
     printed.

     Returns the number of search results on the page, 0 when no
     response was received (log_progress is not called in that case), or
     None when a UnicodeEncodeError was caught. Any other exception is
     logged and re-raised.
     """
     try:
         logger.info("-" * 70)
         logger.info(room_type + ", zipcode " + str(zipcode) + ", " +
                     str(guests) + " guests, " + "page " +
                     str(section_offset + 1))
         params = {
             "guests": str(guests),
             "section_offset": str(section_offset),
             "source": "filter",
             "location": zipcode,
             "room_types[]": room_type,
         }
         response = airbnb_ws.ws_request_with_repeats(
             self.config, self.config.URL_API_SEARCH_ROOT, params)
         if response is None:
             # The request helper can come back empty-handed; calling
             # .json() on None would fail with an AttributeError.
             logger.warning(
                 "No response received from request despite multiple attempts: {p}"
                 .format(p=params))
             return 0
         search_results = response.json()["results_json"]["search_results"]
         room_count = 0
         for result in search_results:
             room_id = int(result["listing"]["id"])
             # Every search result counts, even if no listing is built.
             room_count += 1
             listing = self.listing_from_search_page_json(result, room_id)
             if listing is None or listing.host_id is None:
                 continue
             listing.deleted = 0
             if flag == self.config.FLAGS_ADD:
                 listing.save(self.config.FLAGS_INSERT_NO_REPLACE)
             elif flag == self.config.FLAGS_PRINT:
                 print(room_type, listing.room_id)
         if flag == self.config.FLAGS_ADD:
             has_rooms = 1 if room_count > 0 else 0
             self.log_progress(room_type, zipcode, guests, section_offset,
                               has_rooms)
         elif room_count == 0:
             # Only report an empty page when the page really was empty.
             logger.info("No rooms found")
         return room_count
     except UnicodeEncodeError:
         logger.error(
             "UnicodeEncodeError: you may want to set PYTHONIOENCODING=utf-8"
         )
         # Not handled further at the moment: the caller receives None.
         return None
     except Exception as e:
         logger.error("Exception type: " + type(e).__name__)
         raise
コード例 #3
0
 def __search_neighborhood_page(self, room_type, neighborhood, guests,
                                section_offset, flag):
     """Search one results page for a neighborhood and process its listings.

     Sends a search request for the given room type, neighborhood and
     page within self.search_area_name, then builds a listing from every
     search result. With FLAGS_ADD each listing that has a host_id is
     saved and log_progress is called with the neighborhood id; with
     FLAGS_PRINT each such listing is printed.

     Returns the number of search results on the page, 0 when no
     response was received (log_progress is not called in that case), or
     None when a UnicodeEncodeError was caught. Other exceptions
     propagate to the caller.
     """
     try:
         logger.info("-" * 70)
         logger.info(room_type + ", " + str(neighborhood) + ", " +
                     str(guests) + " guests, " + "page " +
                     str(section_offset))
         params = {
             "page": str(section_offset),
             "source": "filter",
             "location": self.search_area_name,
             "room_types[]": room_type,
             "neighborhoods[]": neighborhood,
         }
         response = airbnb_ws.ws_request_with_repeats(
             self.config, self.config.URL_API_SEARCH_ROOT, params)
         if response is None:
             # The request helper can come back empty-handed; calling
             # .json() on None would fail with an AttributeError.
             logger.warning(
                 "No response received from request despite multiple attempts: {p}"
                 .format(p=params))
             return 0
         search_results = response.json()["results_json"]["search_results"]
         room_count = 0
         for result in search_results:
             room_id = int(result["listing"]["id"])
             # Every search result counts, even if no listing is built.
             room_count += 1
             listing = self.listing_from_search_page_json(result, room_id)
             if listing is None or listing.host_id is None:
                 continue
             listing.deleted = 0
             if flag == self.config.FLAGS_ADD:
                 listing.save(self.config.FLAGS_INSERT_NO_REPLACE)
             elif flag == self.config.FLAGS_PRINT:
                 print(room_type, listing.room_id)
         if flag == self.config.FLAGS_ADD:
             has_rooms = 1 if room_count > 0 else 0
             neighborhood_id = self.get_neighborhood_id(neighborhood)
             self.log_progress(room_type, neighborhood_id, guests,
                               section_offset, has_rooms)
         return room_count
     except UnicodeEncodeError:
         logger.error("UnicodeEncodeError: set PYTHONIOENCODING=utf-8")
         # Not handled further at the moment: the caller receives None.
         return None
コード例 #4
0
    def get_room_info_from_web_site(self, flag):
        """Get the room properties from a cached page or from the web site.

        When config.USE_CACHE is set and a cached HTML file exists for
        this room under CACHE_PATH/SURVEY_ID, that file is parsed.
        Otherwise the page is requested from the web site and, when
        caching is enabled and the page is non-empty, written to the
        cache.

        Returns True when a page was parsed, and False when no response
        was received or the page text was empty. Any exception is logged
        and re-raised.
        """
        try:
            cache_dir = Path(self.config.CACHE_PATH + "/" +
                             str(self.config.SURVEY_ID))
            cache_file = cache_dir / (str(self.room_id) + ".html")
            if self.config.USE_CACHE and cache_file.exists():
                page = cache_file.read_text()
            else:
                logger.info("-" * 70)
                logger.info("Room " + str(self.room_id) +
                            ": getting from Airbnb web site")
                room_url = self.config.URL_ROOM_ROOT + str(self.room_id)
                response = airbnb_ws.ws_request_with_repeats(
                    self.config, room_url)
                if response is None:
                    logger.info("Room %s: not found", self.room_id)
                    return False
                page = response.text
                # An empty page is not cached: a later cached run would
                # read it back and have nothing to parse.
                if self.config.USE_CACHE and page:
                    # exist_ok avoids the check-then-create race.
                    cache_dir.mkdir(parents=True, exist_ok=True)
                    cache_file.write_text(page)
            if not page:
                # Previously this path fell off the end and returned None.
                logger.info("Room %s: empty page, nothing to parse",
                            self.room_id)
                return False
            tree = html.fromstring(page)
            self.__get_room_info_from_tree(tree, flag)
            logger.info("Room %s: found", self.room_id)
            return True
        except (KeyboardInterrupt, SystemExit):
            raise
        except Exception as ex:
            logger.exception("Room " + str(self.room_id) +
                             ": failed to retrieve from web site.")
            logger.error("Exception: " + str(type(ex)))
            raise
コード例 #5
0
 def search_node(self, room_type, guests, price_range, quadtree_node,
                 median_node, flag):
     """
     Search the rectangle of one quadtree node, page by page.

     rectangle is (n_lat, e_lng, s_lat, w_lng)

     Requests up to config.SEARCH_MAX_PAGES result pages for the given
     room type, guest count and price range. Paging stops early once a
     page returns fewer than config.SEARCH_LISTINGS_ON_FULL_PAGE
     results. With FLAGS_ADD, listings that have a host_id are saved;
     with FLAGS_PRINT, the first page is printed and the process exits.

     Returns (new_rooms, page_number, median_leaf), where median_leaf is
     [median latitude, median longitude] of the listings seen, or [0, 0]
     when no coordinates were collected. Returns None when a
     UnicodeEncodeError is caught. Other exceptions are logged and
     re-raised.
     """
     try:
         logger.info("-" * 70)
         rectangle = self.get_rectangle_from_quadtree_node(
             quadtree_node, median_node)
         logger.info(
             ("Searching rectangle: {room_type}, guests = {guests}, "
              "prices in [{p1}, {p2}], zoom factor = {z}").format(
                  room_type=room_type,
                  guests=guests,
                  p1=price_range[0],
                  p2=price_range[1],
                  z=len(quadtree_node)))
         logger.debug("quadtree_node = {quadtree_node}".format(
             quadtree_node=str(quadtree_node)))
         logger.debug(
             "Rectangle: N={n:+.5f}, E={e:+.5f}, S={s:+.5f}, W={w:+.5f}".
             format(n=rectangle[0],
                    e=rectangle[1],
                    s=rectangle[2],
                    w=rectangle[3]))
         new_rooms = 0
         # Bound up front so the summary below works even if the page
         # loop never runs (SEARCH_MAX_PAGES == 0).
         page_number = 0
         # Coordinates are collected from the results on each page and
         # used to calculate the median values, which will be used to
         # divide the volume into optimal "quadrants".
         latitudes = []
         longitudes = []
         for page_number in range(1, self.config.SEARCH_MAX_PAGES + 1):
             room_count = 0
             # set up the parameters for the request
             params = {
                 "guests": str(guests),
                 "page": str(page_number),
                 "source": "filter",
                 "room_types[]": room_type,
                 "sw_lat": str(rectangle[2]),
                 "sw_lng": str(rectangle[3]),
                 "ne_lat": str(rectangle[0]),
                 "ne_lng": str(rectangle[1]),
                 "search_by_map": str(True),
                 "price_min": str(price_range[0]),
                 "price_max": str(price_range[1]),
             }
             # make the http request
             response = airbnb_ws.ws_request_with_repeats(
                 self.config, self.config.URL_API_SEARCH_ROOT, params)
             if response is None:
                 logger.warning(
                     "No response received from request despite multiple attempts: {p}"
                     .format(p=params))
                 continue
             search_results = response.json()["results_json"][
                 "search_results"]
             for result in search_results:
                 room_id = int(result["listing"]["id"])
                 # Every search result counts toward the page total.
                 room_count += 1
                 listing = self.listing_from_search_page_json(
                     result, room_id, room_type)
                 # Check for a missing listing BEFORE touching its
                 # attributes.
                 if listing is None:
                     continue
                 # Skip missing coordinates: None values cannot be
                 # sorted together with numbers.
                 if listing.latitude is not None:
                     latitudes.append(listing.latitude)
                 if listing.longitude is not None:
                     longitudes.append(listing.longitude)
                 if listing.host_id is None:
                     continue
                 listing.deleted = 0
                 if flag == self.config.FLAGS_ADD:
                     if listing.save(self.config.FLAGS_INSERT_NO_REPLACE):
                         new_rooms += 1
                 elif flag == self.config.FLAGS_PRINT:
                     print(room_type, listing.room_id)
             # Log page-level results
             logger.info(
                 "Page {page_number:02d} returned {room_count:02d} listings"
                 .format(page_number=page_number, room_count=room_count))
             if flag == self.config.FLAGS_PRINT:
                 # for FLAGS_PRINT, fetch one page and print it
                 sys.exit(0)
             if room_count < self.config.SEARCH_LISTINGS_ON_FULL_PAGE:
                 # If a full page of listings is not returned by Airbnb,
                 # this branch of the search is complete.
                 logger.debug("Final page of listings for this search")
                 break
         # Log rectangle-level results
         logger.info(
             ("Results:  {page_count} pages, {new_rooms} new rooms, "
              "{room_type}, {g} guests, prices in [{p1}, {p2}]").format(
                  room_type=room_type,
                  g=str(guests),
                  p1=str(price_range[0]),
                  p2=str(price_range[1]),
                  new_rooms=str(new_rooms),
                  page_count=str(page_number)))
         parent_leaf = median_node[-1] if len(median_node) > 0 else "[]"
         logger.info(
             "Results: rect = {median_leaf}, node = {quadtree_node}".format(
                 quadtree_node=str(quadtree_node),
                 median_leaf=str(parent_leaf)))
         # Calculate medians from the coordinates actually collected,
         # not from the listing count of whichever page came last.
         if latitudes and longitudes:
             median_lat = sorted(latitudes)[len(latitudes) // 2]
             median_lng = sorted(longitudes)[len(longitudes) // 2]
             median_leaf = [median_lat, median_lng]
         else:
             # values not needed, but we need to fill in an item anyway
             median_leaf = [0, 0]
         # log progress
         self.log_progress(room_type, guests, price_range[0],
                           price_range[1], quadtree_node, median_node)
         return (new_rooms, page_number, median_leaf)
     except UnicodeEncodeError:
         logger.error("UnicodeEncodeError: set PYTHONIOENCODING=utf-8")
         # Not handled further at the moment: the caller receives None.
         return None
     except Exception:
         logger.exception("Exception in get_search_page_info_rectangle")
         raise
コード例 #6
0
def ws_get_city_info(config, city, flag):
    """Look up a city on the web site and print or store what is found.

    Requests the search page for ``city`` and extracts the search area
    name (the value of the "location" input) and the neighborhood filter
    values from it.

    With FLAGS_PRINT the search area and its neighborhoods are printed.
    With FLAGS_ADD the search area, the city and the neighborhoods are
    inserted into the database and committed, unless a search area of
    that name already exists.

    Returns False when no response was received, and None in every other
    non-raising case. Exceptions other than UnicodeEncodeError are logged
    and re-raised.
    """
    try:
        url = config.URL_SEARCH_ROOT + city
        response = airbnb_ws.ws_request_with_repeats(config, url)
        if response is None:
            return False
        tree = html.fromstring(response.text)
        try:
            citylist = tree.xpath("//input[@name='location']/@value")
            neighborhoods = tree.xpath(
                "//input[contains(@id, 'filter-option-neighborhoods')]/@value")
            if flag not in (config.FLAGS_PRINT, config.FLAGS_ADD):
                return None
            if not citylist:
                # Without a search area there is nothing to print or to
                # attach neighborhoods to. Indexing citylist, or using a
                # cursor that was never created, would raise here.
                logger.warning("No search area found for " + city)
                return None
            search_area_name = citylist[0]
            if flag == config.FLAGS_PRINT:
                print("\n", search_area_name)
                print("Neighborhoods:")
                for neighborhood in neighborhoods:
                    print("\t", neighborhood)
                return None
            # flag == config.FLAGS_ADD
            conn = config.connect()
            cur = conn.cursor()
            try:
                # check if it exists
                sql_check = """
                    select name
                    from search_area
                    where name = %s"""
                cur.execute(sql_check, (search_area_name, ))
                if cur.fetchone() is not None:
                    logger.info("City already exists: " + search_area_name)
                    return None
                sql_search_area = """insert
                            into search_area (name)
                            values (%s)"""
                cur.execute(sql_search_area, (search_area_name, ))
                # Read back the id the sequence generated for the row
                # just inserted.
                sql_identity = """select
                currval('search_area_search_area_id_seq')
                """
                cur.execute(sql_identity, ())
                search_area_id = cur.fetchone()[0]
                sql_city = """insert
                        into city (name, search_area_id)
                        values (%s,%s)"""
                cur.execute(sql_city, (
                    city,
                    search_area_id,
                ))
                logger.info("Added city " + city)
                logger.debug(str(len(neighborhoods)) + " neighborhoods")
                if len(neighborhoods) > 0:
                    sql_neighborhood = """
                        insert into neighborhood(name, search_area_id)
                        values(%s, %s)
                        """
                    for neighborhood in neighborhoods:
                        cur.execute(sql_neighborhood, (
                            neighborhood,
                            search_area_id,
                        ))
                        logger.info("Added neighborhood " + neighborhood)
                else:
                    logger.info("No neighborhoods found for " + city)
                conn.commit()
            finally:
                # Release the cursor on every exit path.
                cur.close()
        except UnicodeEncodeError:
            # Still not handled, but no longer silent.
            logger.error("UnicodeEncodeError: set PYTHONIOENCODING=utf-8")
        except Exception:
            logger.error("Error collecting city and neighborhood information")
            raise
    except Exception:
        logger.error("Error getting city info from website")
        raise
    return None
コード例 #7
0
    def search_node(self, quadtree_node, median_node, flag):
        """
        Search the rectangle of one quadtree node, page by page.

        rectangle is (n_lat, e_lng, s_lat, w_lng)

        Requests up to config.SEARCH_MAX_PAGES result pages, extracts the
        JSON document embedded in each page and builds a listing from
        every entry found under a "listings" key. With FLAGS_ADD, listings
        that have a host_id are saved; with FLAGS_PRINT, the first page is
        printed and the process exits.

        Returns (zoomable, median_leaf). zoomable is False when a page
        returned fewer than config.SEARCH_LISTINGS_ON_FULL_PAGE listings.
        median_leaf is [median latitude, median longitude] rounded to 5
        places, or [0, 0] when no coordinates were collected. Returns None
        when a page lacks the results script tag or when a
        UnicodeEncodeError is caught. Other exceptions are logged and
        re-raised.
        """

        # Adapted from
        # https://stackoverflow.com/questions/14048948/how-to-find-a-particular-json-value-by-key
        # Defined once per call rather than once per page.
        def search_json_keys(key, json_doc):
            """ Return a list of the values for each occurrence of key
            in json_doc, at all levels. In particular, "listings"
            occurs more than once, and we need to get them all.
            A dict that contains key is not searched any deeper."""
            found = []
            if isinstance(json_doc, dict):
                if key in json_doc:
                    found.append(json_doc[key])
                else:
                    for value in json_doc.values():
                        found.extend(search_json_keys(key, value))
            elif isinstance(json_doc, list):
                for item in json_doc:
                    found.extend(search_json_keys(key, item))
            return found

        try:
            logger.info("-" * 70)
            rectangle = self.get_rectangle_from_quadtree_node(
                quadtree_node, median_node)
            logger.info(
                "Searching rectangle: zoom factor = {z}, node = {node}".format(
                    z=len(quadtree_node), node=str(quadtree_node)))
            logger.debug(
                "Rectangle: N={n:+.5f}, E={e:+.5f}, S={s:+.5f}, W={w:+.5f}".
                format(n=rectangle[0],
                       e=rectangle[1],
                       s=rectangle[2],
                       w=rectangle[3]))
            new_rooms = 0
            # Bound up front so the summary below works even if the page
            # loop never runs (SEARCH_MAX_PAGES == 0).
            page_number = 0
            # set zoomable to false if the search finishes without
            # returning a full complement of pages and listings per page
            zoomable = True

            # Coordinates are collected from the results on each page and
            # used to calculate the median values, which will be used to
            # divide the volume into optimal "quadrants".
            latitudes = []
            longitudes = []
            for section_offset in range(0, self.config.SEARCH_MAX_PAGES):
                # section_offset is the zero-based counter used on the site
                # page number is convenient for logging, etc
                page_number = section_offset + 1
                # set up the parameters for the request
                params = {
                    "source": "filter",
                    "refinement_paths[]": "homes",
                    "sw_lat": str(rectangle[2]),
                    "sw_lng": str(rectangle[3]),
                    "ne_lat": str(rectangle[0]),
                    "ne_lng": str(rectangle[1]),
                    "search_by_map": str(True),
                }
                if section_offset > 0:
                    params["section_offset"] = str(section_offset)
                # make the http request
                response = airbnb_ws.ws_request_with_repeats(
                    self.config, self.config.URL_API_SEARCH_ROOT, params)
                # If no response, maybe it's a network problem rather than
                # a lack of data, so to be conservative go to the next page
                # rather than the next rectangle
                if response is None:
                    logger.warning(
                        "No response received from request despite multiple attempts: {p}"
                        .format(p=params))
                    continue
                soup = BeautifulSoup(
                    response.content.decode("utf-8", "ignore"), "lxml")
                # Dump of the most recent page; the context manager closes
                # the file even if the write fails.
                with open("test.html", mode="w",
                          encoding="utf-8") as html_file:
                    html_file.write(soup.prettify())
                # The returned page includes a script tag that encloses a
                # comment. The comment in turn includes a complex json
                # structure as a string, which has the data we need
                spaspabundlejs_set = soup.find_all(
                    "script", {
                        "type": "application/json",
                        "data-hypernova-key": "spaspabundlejs"
                    })
                if len(spaspabundlejs_set) == 0:
                    logger.warning(
                        "json results-containing script node "
                        "(spaspabundlejs) not found in the web page: "
                        "go to next page")
                    # NOTE(review): the message says "go to next page" but
                    # the search is abandoned and None is returned instead
                    # of the usual tuple. Kept as-is for callers; confirm
                    # which behavior is intended.
                    return None
                logger.debug("Found spaspabundlejs tag")
                comment = spaspabundlejs_set[0].contents[0]
                # strip out the comment tags (everything outside the
                # outermost curly braces)
                json_doc = json.loads(
                    comment[comment.find("{"):comment.rfind("}") + 1])
                logger.debug("results-containing json found")
                # Dump of the most recent json document.
                with open("listing_json.json", mode="w",
                          encoding="utf-8") as json_file:
                    json_file.write(
                        json.dumps(json_doc, indent=4, sort_keys=True))

                # Get all items with tags "listings". Each json_listings is
                # a list, and each json_listing is a
                # {listing, pricing_quote, verified} dict for the listing
                # in question. There may be multiple lists of listings.
                json_listings_lists = search_json_keys("listings", json_doc)

                room_count = 0
                for json_listings in json_listings_lists:
                    for json_listing in json_listings:
                        room_id = int(json_listing["listing"]["id"])
                        # Every entry counts toward the page total.
                        room_count += 1
                        listing = self.listing_from_search_page_json(
                            json_listing, room_id)
                        if listing is None:
                            continue
                        if listing.latitude is not None:
                            latitudes.append(listing.latitude)
                        if listing.longitude is not None:
                            longitudes.append(listing.longitude)
                        if listing.host_id is None:
                            continue
                        listing.deleted = 0
                        if flag == self.config.FLAGS_ADD:
                            if listing.save(
                                    self.config.FLAGS_INSERT_NO_REPLACE):
                                new_rooms += 1
                        elif flag == self.config.FLAGS_PRINT:
                            # This search has no room_type argument, so
                            # only the room id is printed.
                            print(listing.room_id)

                # Log page-level results
                logger.info(
                    "Page {page_number:02d} returned {room_count:02d} listings"
                    .format(page_number=page_number, room_count=room_count))
                if flag == self.config.FLAGS_PRINT:
                    # for FLAGS_PRINT, fetch one page and print it
                    sys.exit(0)
                if room_count < self.config.SEARCH_LISTINGS_ON_FULL_PAGE:
                    # If a full page of listings is not returned by Airbnb,
                    # this branch of the search is complete.
                    logger.debug("Final page of listings for this search")
                    zoomable = False
                    break
            # Log node-level results
            logger.info(
                "Results: {page_count} pages, {new_rooms} new rooms".format(
                    new_rooms=str(new_rooms), page_count=str(page_number)))

            # Calculate medians from the coordinates actually collected;
            # indexing an empty sorted list would raise IndexError.
            if latitudes and longitudes:
                median_lat = round(
                    sorted(latitudes)[len(latitudes) // 2], 5)
                median_lng = round(
                    sorted(longitudes)[len(longitudes) // 2], 5)
                median_leaf = [median_lat, median_lng]
            else:
                # values not needed, but we need to fill in an item anyway
                median_leaf = [0, 0]
            # log progress
            self.log_progress(quadtree_node, median_node)
            return (zoomable, median_leaf)
        except UnicodeEncodeError:
            logger.error("UnicodeEncodeError: set PYTHONIOENCODING=utf-8")
            # Not handled further at the moment: the caller receives None.
            return None
        except Exception:
            logger.exception("Exception in get_search_page_info_rectangle")
            raise