def parse(listitem):
    """Extract one directory record from *listitem* and store it in the
    shared ``data`` dict, keyed by the record's origin href.

    Writes a progress dot to stdout; both the dot and the dict update
    happen under ``LOCK`` so concurrent workers do not interleave.
    """
    title = ident = web = short_address = phone = None
    lat = lng = None  # never populated on this page type; kept in the record
    tags = []

    anchor = css(listitem, 'h1 a')
    if anchor:
        title = anchor[0].contents[0]
        ident = anchor[0]['href']

    tel = css(listitem, '.tel-fax .record-detail')
    if tel:
        phone = tel[0].contents[1].strip()

    site = css(listitem, '.web a[href^=http]')
    if site:
        web = site[0]['href']

    pcode = css(listitem, '.p-code .record-detail')
    if pcode:
        short_address = str(pcode[0].contents[1]).strip()

    record = {
        'title': title,
        'lat': lat,
        'lng': lng,
        'url': web,
        'phone': phone,
        'short_address': short_address,
        'tags': tags,
        'origin': ident,
    }

    with LOCK:
        sys.stdout.write('.')
        data[ident] = record
def do_work(*args):
    """Fetch one search-results page and parse every result row into
    the shared ``data`` dict.

    args[0] is the page URL.  Google Analytics markup and all <script>
    blocks are stripped before building the soup, because the pages
    contain markup that trips up the parser otherwise.
    """
    url = args[0]
    with LOCK:
        print(url)

    html = urllib2.urlopen(url, timeout=TIMEOUT).read()
    # NOTE: the page really does contain this malformed comment ('<!-').
    html = html.replace('<!- Google Analytics -->', '')
    html = re.sub(r'<script.*?>[\s\S]*?</.*?script>', '', html)
    soup = BeautifulSoup(html)

    # Result rows alternate between grey and white wrappers.  The
    # module-level parse() (identical to the nested copy this function
    # used to carry) stores each row into the shared data dict.
    for listitem in css(soup, '.search-row-grey-wrapper'):
        parse(listitem)
    for listitem in css(soup, '.search-row-white-wrapper'):
        parse(listitem)
def parse(self, response):
    """Parse one results page (scrapy-style entry point) into items.

    Strips Google Analytics markup and <script> blocks -- the pages
    contain malformed HTML -- before handing the body to BeautifulSoup,
    then collects one item per result row.  Rows alternate between
    grey and white wrappers.

    Returns a list of whatever self.soup_parse() produces per row.
    """
    html = response.body
    # NOTE: the page really does contain this malformed comment ('<!-').
    html = html.replace('<!- Google Analytics -->', '')
    # Raw string so \s / \S are genuine regex escapes, not string ones.
    html = re.sub(r'<script.*?>[\s\S]*?</.*?script>', '', html)
    soup = BeautifulSoup(html)

    items = [self.soup_parse(row)
             for row in css(soup, '.search-row-grey-wrapper')]
    items.extend(self.soup_parse(row)
                 for row in css(soup, '.search-row-white-wrapper'))
    return items
def do_work(*args):
    """Reverse-geocode one location via the WOEID lookup service and
    append the resulting Location record to the shared ``data`` list.

    args[0] is a dict with 'lat', 'lng' and 'short_address' keys.
    """
    location = args[0]
    lat, lng = location['lat'], location['lng']
    address = location['short_address']

    url = URL_TEMPLATE % (lat, lng)
    xml = urllib2.urlopen(url, timeout=TIMEOUT).read()
    soup = BeautifulSoup(xml)

    # Raises IndexError when the response lacks <woeid>/<type>; the
    # original behaved the same way, so no guard is added here.
    woeid = css(soup, 'woeid')[0].contents[0]
    placetype = css(soup, 'type')[0].contents[0]

    item = {
        "lat_lon": [lat, lng],
        "latitude": lat,
        "longitude": lng,
        "_types": ["Location"],
        "name": address,
        "woeid": woeid,
        "placetype": placetype,
        "_cls": "Location",
    }

    with lock:
        print('.')
        data.append(item)
def soup_parse(self, listitem):
    """Parse one GCD search-result row into a GCDItem.

    Extracts title, website URL, short address and covered area.
    Fields present in earlier revisions (lat/lng, phone, tags, origin
    href) were commented out of the emitted item, so their extraction
    has been dropped entirely.
    """
    title = web = short_address = area = None

    t = css(listitem, 'h1 a')
    if t:
        title = t[0].contents[0]
    # TODO: this could look for span class=bold, extract the heading,
    # and take the content as the next sibling (cf. extract_span_heading).

    t = css(listitem, '.web a[href^=http]')
    if t:
        web = t[0]['href']

    t = css(listitem, '.p-code .record-detail')
    if t:
        short_address = str(t[0].contents[1]).strip()

    t = css(listitem, '.p-code p')
    if t:
        area = self.extract_span_heading(t[0], 'Area Covered:')

    return GCDItem(
        title=title,
        url=web,
        short_address=short_address,
        area=area,
    )
def extract_span_heading(self, node, heading):
    """Return the text that follows a bold heading inside *node*.

    GCD pages mark up data as, e.g.,
    ``<p><span class="bold">heading:</span>some text</p>``
    (the container can be a <p> or a <div>).  Returns the stripped
    text after the span when its label equals *heading*, else None.
    """
    spans = css(node, 'span.bold')
    if not spans:
        return None
    if str(spans[0].contents[0].strip()) != heading:
        return None
    if len(node.contents) <= 1:
        return None
    return str(node.contents[1]).strip()
def do_work(*args):
    """Fetch one listing page and store every parsed entry in ``data``.

    args[0] is the page URL.  Each li.listitem becomes a dict stored in
    ``data`` keyed by the entry's own detail-page href (may be None if
    the row has no h3 link, matching the original behaviour).
    """
    page_url = args[0]  # renamed: the original reused 'url' for each row's link
    html = urllib2.urlopen(page_url, timeout=5).read()
    soup = BeautifulSoup(html)

    for listitem in css(soup, 'li.listitem'):
        title = link = short_address = phone = lat = lng = None
        activities = []

        t = css(listitem, 'h3 a')
        if t:
            title = t[0].contents[0]
            link = t[0]['href']

        sa = css(listitem, 'li.shortaddress')
        if sa:
            short_address = sa[0].contents[0]

        pn = css(listitem, 'li.phonenumber')
        if pn:
            phone = pn[0].contents[0]

        for im in css(listitem, 'div.activityicons img'):
            activities.append(im['title'])

        # The map thumbnail encodes lat/lng in its src query string.
        img = css(listitem, 'div.listmap img[alt^=Map]')
        if img:
            ll = parse_qs(urlparse(img[0]['src']).query)
            lat, lng = ll['lat'][0], ll['lng'][0]

        item = {
            'title': title,
            'lat': lat,
            'lng': lng,
            'url': link,
            'phone': phone,
            'short_address': short_address,
            'tags': activities,
        }

        # 'with' releases the lock even if the update or print raises;
        # the original bare acquire()/release() pair did not.
        with data_lock:
            data[item['url']] = item
            print('.')