import datetime
import re
import traceback

import requests

# Shared parsing helpers (to_soup, as_dict, table_into_json, retrieve_meta,
# parse_parking_places, get_request, ...) are defined elsewhere in the repo.


def parse_details_html(url):
    try:
        soup = to_soup(url)
        basic_data = as_dict(
            table_into_json(soup.find_all('section', {'id': 'szczegoly-oferty'})[0].table))
        extended_data_sections = soup.find_all('section', {'id': 'dodatkowe-oplaty'})
        extended_data = {}
        if extended_data_sections:
            extended_data = as_dict(table_into_json(extended_data_sections[0].table))
        return {
            u"Link": url,
            u"Region": retrieve_meta(soup, 'dimension-region'),
            # str.strip("ul.") would strip the *characters* 'u', 'l', '.' from
            # both ends; remove the "ul." street prefix explicitly instead.
            u"Ulica": re.sub(r'^ul\.?\s*', '',
                             retrieve_meta(soup, "streetAddress", "itemprop").lower()).strip(),
            u"Cena mieszkania": as_int(retrieve_meta(soup, 'dimension-price')),
            u"Cena za metr": as_int(retrieve_meta(soup, 'dimension-price-m2')),
            u"Powierzchnia": as_float(retrieve_meta(soup, 'dimension-area')),
            u"Pokoje": as_int(retrieve_meta(soup, 'dimension-rooms')),
            # The dict value for this label is a sequence; index 1 holds the text.
            u'Cena parkingu': parse_parking_places(basic_data['Miejsca postojowe:'][1]),
            u'Piętro': as_int(retrieve_meta(soup, 'dimension-floor')),
            u"Koszty dodatkowe": sum([as_int(value) for value in extended_data.values()]) or None,
            u"Długość geograficzna": as_float(retrieve_meta(soup, 'longitude', 'itemprop')),
            u"Szerokość geograficzna": as_float(retrieve_meta(soup, 'latitude', 'itemprop')),
            # The slice picks the 10-character dd.mm.yyyy date out of the label.
            u"Termin": as_date(
                one_of(basic_data,
                       ['Realizacja inwestycji:', u'Realizacja nieruchomości:'])[-16:-6])
        }
    except requests.exceptions.ConnectionError:
        return url
    except Exception as e:
        raise Exception("Failed to fetch %s; %s" % (url, traceback.format_exc()), e)
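# The scalar helpers used above live elsewhere in the repo; a minimal sketch
# of what they are assumed to do, inferred from the call sites rather than
# taken from the original implementations:


def as_int(value):
    # Pull the digits out of a string like "420 000 zł"; 0 when none are found,
    # so sums over missing fees stay well-defined.
    digits = re.sub(r"[^\d]", "", value or "")
    return int(digits) if digits else 0


def as_float(value):
    # Parse "56,5" / "56.5" style numbers; None when nothing parses.
    match = re.search(r"\d+(?:[.,]\d+)?", value or "")
    return float(match.group(0).replace(",", ".")) if match else None


def as_date(value):
    # The call site slices out a dd.mm.yyyy fragment before calling this.
    return datetime.datetime.strptime(value.strip(), "%d.%m.%Y").date()


def one_of(mapping, keys):
    # Return the value for the first key present in the mapping.
    for key in keys:
        if key in mapping:
            return mapping[key]
    raise KeyError(keys)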
def scrap_content(url):
    res = get_request(url)
    soup = to_soup(res)
    content = soup.find("div", {"id": "inner-block"})
    content = remove_attributes(content)
    return clear_content(content)
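# remove_attributes and clear_content are defined elsewhere; a plausible
# sketch of the cleanup they are assumed to perform (attribute stripping plus
# whitespace-normalised serialisation), not the original code:


def remove_attributes(tag):
    # Drop all attributes (class, style, ids, trackers) from the tag tree.
    for element in tag.find_all(True):
        element.attrs = {}
    tag.attrs = {}
    return tag


def clear_content(tag):
    # Serialise the cleaned tree back to markup, collapsing excess whitespace.
    return " ".join(str(tag).split())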
def parse_search_page(page, region=11158):
    url = ("https://rynekpierwotny.pl/oferty/?type=&region={region}"
           "&distance=0&price_0=&price_1=&area_0=&area_1=&rooms_0=&rooms_1="
           "&construction_end_date=&price_m2_0=&price_m2_1=&floor_0=&floor_1="
           "&offer_size=&keywords=&is_luxury=&page={page}"
           "&is_mdm=&is_holiday=&lat=&lng=&sort=").format(
        region=region, page=page
    )
    soup = to_soup(url)
    links = []
    for result in soup.find_all('h2', {'class': 'offer-item-name'}):
        links.append(make_link(result.a))
    return links
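# A minimal usage sketch: walk the first few result pages and de-duplicate
# the offer links. The page count (5) is an arbitrary example value.


def collect_offer_links(pages=5, region=11158):
    links = set()
    for page in range(1, pages + 1):
        links.update(parse_search_page(page, region=region))
    return sorted(links)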
def parse_invest_html(url):
    try:
        soup = to_soup(url)
        body = soup.find_all('div', {"data-config-table-container": "propertyListFull"})
        links = []
        if body:
            for row in body[0].tbody.find_all('tr'):
                potential_links = row.find_all('a', href=True)
                if not potential_links:
                    continue
                # When a row carries two anchors, the offer link is the second.
                if len(potential_links) == 2:
                    links.append(make_link(potential_links[1]))
                else:
                    links.append(make_link(potential_links[0]))
        return links
    except Exception as e:
        raise Exception("Failed to fetch %s; %s" % (url, traceback.format_exc()), e)
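# make_link is defined elsewhere in the repo; it presumably resolves the
# anchor's relative href against the site root, roughly like this
# (BASE_URL is an assumption):

try:
    from urlparse import urljoin  # Python 2
except ImportError:
    from urllib.parse import urljoin  # Python 3

BASE_URL = "https://rynekpierwotny.pl"


def make_link(anchor):
    # Turn the href of a parsed <a> tag into an absolute URL.
    return urljoin(BASE_URL, anchor["href"])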
def get_public_transport_time(gps_from, gps_to, time=None):
    # Evaluate "now" at call time; a datetime.now() default argument would be
    # frozen once at import time.
    if time is None:
        time = datetime.datetime.now()
    # %w is 0 (Sunday) .. 6 (Saturday); shift to a Monday-based 0..6 index,
    # which is what rozkladzik.pl appears to expect as its "day" parameter.
    # (The original "% 6" collapsed Saturday and Sunday onto the same value.)
    weekday = (int(time.strftime("%w")) + 6) % 7
    url = (
        "http://www.m.rozkladzik.pl/krakow/wyszukiwarka_polaczen.html?"
        "from={from_x};{from_y}|c|{from_x}|{from_y}&"
        "to={to_x};{to_y}|c|{to_x}|{to_y}&profile=opt&maxWalkChange=400"
        "&minChangeTime=2&time={time}&day={day}".format(
            from_x=gps_from[0], from_y=gps_from[1],
            to_x=gps_to[0], to_y=gps_to[1],
            time=time.strftime("%H:%M"),
            day=weekday,
        )
    )
    soup = to_soup(url)
    times = []
    for sum_row in soup.find_all("div", {"class": "route_sum_row"}):
        time_td = sum_row.find_all("td", {"class": "time"})[0]
        times.append(as_int(stringify_child(time_td)[1]))
    # No results on the page means no connection; avoid min() on an empty list.
    return min(times) if times else None
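# Usage sketch: compute the commute from a parsed offer to a fixed point.
# `offer` is assumed to be a dict returned by parse_details_html, and the
# default destination below is an arbitrary example point in Krakow.


def commute_minutes(offer, destination=(50.0647, 19.9450)):
    origin = (offer[u"Szerokość geograficzna"], offer[u"Długość geograficzna"])
    return get_public_transport_time(origin, destination)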
def crawl_articles(url):
    res = get_request(url)
    soup = to_soup(res)
    divs = soup.find_all("div", {"class": "vest_container"})
    news_data = []
    for div in divs:
        # Attribute filters must be dicts; {"class", "meta"} was a set literal.
        date = div.find("div", {"class": "meta"}).text
        picture = div.find("img")["src"]
        description = div.find("p").text
        title = div.find("h2").text
        tip = div.find("div", {"class": "img_desc"}).text
        # Use a distinct name so the listing-page `url` parameter isn't shadowed.
        article_url = div.find("a")["href"]
        content = scrap_content(article_url)
        news_data.append({
            "title": title,
            "dateInfo": date,
            "picture": picture,
            "description": description,
            "type": tip,
            "url": article_url,
            "content": content,
        })
    return news_data
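# Usage sketch: dump the scraped articles to disk as UTF-8 JSON. The output
# path is a placeholder; the landing-page URL depends on the site crawled.

import io
import json


def save_articles(url, path="articles.json"):
    articles = crawl_articles(url)
    with io.open(path, "w", encoding="utf-8") as handle:
        handle.write(json.dumps(articles, ensure_ascii=False, indent=2))
    return len(articles)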