def get(self):
    """Fetch the listing page and store the raw HTML of posts not yet saved.

    Every anchor on the listing page whose direct parent is a ``<p>`` tag is
    treated as a link to a post. Each post is downloaded and saved as a
    ``MisHTML`` entity keyed by the id parsed from its URL. Unless the
    ``force_fetch`` request parameter is non-empty, the scan stops at the
    first post that is already stored. The number of posts saved is written
    to the response as ``Found <n>``.
    """
    # Any non-empty value (even "0" or "false") switches force_fetch on.
    force_fetch = bool(self.request.get('force_fetch'))

    response = urllib2.urlopen('http://sfbay.craigslist.org/sfc/mis/')
    try:
        html = response.read()
    finally:
        # Release the connection even when the read fails.
        response.close()

    soup = BeautifulSoup(html)
    # href=True skips anchors that carry no href attribute; indexing such an
    # anchor with ['href'] would raise KeyError and abort the whole run.
    all_the_post_urls = [
        anchor['href']
        for anchor in soup.findAll('a', href=True)
        if anchor.parent.name == 'p'
    ]

    count = 0
    for url in all_the_post_urls:
        post_id = helper.parse_id_from_url(url)

        # Decide before downloading: the id depends only on the URL, so an
        # already-stored post does not need to be fetched just to be skipped.
        # When force_fetch is on, the datastore lookup is skipped entirely.
        if not force_fetch and MisHTML.get_by_key_name(post_id):
            # NOTE(review): stopping at the first known post presumably
            # relies on the listing being ordered newest-first -- confirm.
            break

        res = urllib2.urlopen(url)
        try:
            post_html = res.read()
        finally:
            res.close()

        post_soup = BeautifulSoup(post_html)
        dt = helper.parse_date(post_soup)

        # NOTE(review): unicode() is called without an explicit encoding, so
        # the interpreter default is used and undecodable bytes are silently
        # dropped (errors='ignore'). Kept as-is so stored data is unchanged.
        m_h = MisHTML(
            html=unicode(post_html, errors='ignore'),
            posted=dt,
            key_name=post_id,
        )
        m_h.put()
        count += 1

    self.response.out.write('Found ' + str(count))
def get(self):
    """Re-parse stored post HTML whose parser version is out of date.

    Iterates over every ``MisHTML`` entity whose ``parsed_version`` is below
    ``PARSER_VERSION``, runs ``helper.parse_post`` on its HTML, saves the
    result as a ``Mis`` entity keyed by the parsed id, and then stamps the
    ``MisHTML`` entity with the current ``PARSER_VERSION``. The number of
    entities processed is written to the response as ``Parsed <n>``.
    """
    outdated = MisHTML.all().filter('parsed_version <', PARSER_VERSION)

    parsed_total = 0
    for raw in outdated:
        fields = helper.parse_post(raw.html, raw.key().name())

        record = Mis(
            key_name=fields['id'],
            html=raw.html,
            post_id=fields['id'],
            url=fields['url'],
            title=fields['title'],
            body=fields['body'],
            location=fields['location'],
            age=fields['age'],
            me_gender=fields['me_gender'],
            you_gender=fields['you_gender'],
            posted=raw.posted,
        )
        record.put()

        # Mark the raw entity only after the parsed record has been saved.
        raw.parsed_version = PARSER_VERSION
        raw.put()

        parsed_total += 1

    self.response.out.write('Parsed ' + str(parsed_total))