def scrape(self, targeturl):
    target = get_target(targeturl)
    log.info('Scraping user %s at URL: %s', target, targeturl)
    # do posts first because we're already on the timeline
    if self.settings['posts']:
        self.mapping['posts'](targeturl)
    for key, value in self.settings.items():
        if value and key != 'posts':
            self.mapping[key](targeturl)
    log.info('Finished scraping user %s', target)

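# `self.settings` and `self.mapping` are defined elsewhere in the class; the
# dispatch above assumes they are parallel dicts keyed by scrape type, along
# these lines (a hypothetical sketch, not the actual constructor):
#
#     self.settings = {'posts': True, 'about': True, 'friends': False}
#     self.mapping = {
#         'posts': self.scrape_posts_by_year,
#         'about': self.scrape_about,
#         'friends': self.scrape_friends,
#     }
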
def scrape_about(self, targeturl):
    target = get_target(targeturl)
    rec = record.Record(self._output_file(target, 'about'),
                        ['section', 'text'])

    def callback(section, content):
        rec.add_record({'section': section, 'text': content})
        log.info('Scraped section %s with the following text:\n'
                 '#### START ####\n%s\n#### END ####', section, content)

    self.crawl_about(targeturl, callback)

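# `record.Record` is not shown in this section; the calls above assume it is
# a small writer that takes an output path and a list of field names, then
# appends one row per `add_record` call. A minimal sketch consistent with
# that assumption, built on csv.DictWriter:
#
#     import csv
#
#     class Record(object):
#         def __init__(self, filename, fields):
#             self.filename = filename
#             self._handle = open(filename, 'w')
#             self._writer = csv.DictWriter(self._handle, fieldnames=fields)
#             self._writer.writeheader()
#
#         def add_record(self, row):
#             self._writer.writerow(row)
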
def scrape_checkins(self, targeturl):
    target = get_target(targeturl)
    rec = record.Record(self._output_file(target, 'checkins'),
                        ['name', 'url'])

    def callback(name, url, i):
        rec.add_record({'name': name, 'url': url})
        log.info('Scraped check-in %d: %s', i, name)

    scraped = self.crawl_checkins(targeturl, callback)
    log.info('Scraped %d check-ins into %s', scraped, rec.filename)

def scrape_likes(self, targeturl):
    target = get_target(targeturl)
    rec = record.Record(self._output_file(target, 'likes'),
                        ['name', 'url'])
    log.info('Scraping likes into %s', rec.filename)

    def callback(name, page_url, i):
        rec.add_record({'name': name, 'url': page_url})
        log.info('Scraped like %d: %s', i, name)

    likes_scraped = self.crawl_likes(targeturl, callback)
    log.info('Scraped %d likes into %s', likes_scraped, rec.filename)

def scrape_friends(self, targeturl):
    target = get_target(targeturl)
    rec = record.Record(self._output_file(target, 'friends'),
                        ['name', 'profile'])
    log.info('Scraping friends into %s', rec.filename)

    def callback(name, url, imgurl, i):
        # strip the query string so the stored profile URL is stable
        friend_url = strip_query(url)
        rec.add_record({'name': name, 'profile': friend_url})
        log.info('Scraped friend %d: %s', i, name)

    friends_scraped = self.crawl_friends(targeturl, callback)
    log.info('Scraped %d friends into %s', friends_scraped, rec.filename)

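# `strip_query` is defined elsewhere; a minimal sketch of the assumed
# behaviour (drop the query string and fragment from a URL), using Python 3's
# urllib:
#
#     from urllib.parse import urlsplit, urlunsplit
#
#     def strip_query(url):
#         parts = urlsplit(url)
#         return urlunsplit((parts.scheme, parts.netloc, parts.path, '', ''))
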
def scrape_photos(self, targeturl):
    """Scrape the target's photos, but only the photos on the target's
    photos page. Photos in albums are not scraped.
    """
    target = get_target(targeturl)
    # scrape main photos
    photo_album = record.Album(self._output_file(target, 'photos'), True)

    def photo_cb(photourl, description, perma, _):
        self._save_to_album(photourl, description, perma, photo_album)

    photos_scraped = self.crawl_photos(targeturl, photo_cb)
    log.info('Scraped %d photos into %s', photos_scraped, photo_album.name)

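# `record.Album` and `self._save_to_album` are defined outside this section.
# The calls here and in `scrape_all_albums` below assume an Album wraps a
# directory of downloaded images plus a metadata record, and that
# `_save_to_album` downloads `photourl` and files it under the album with its
# description and permalink; both are assumptions about code not shown here.
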
def scrape_all_albums(self, targeturl):
    target = get_target(targeturl)

    def album_cb(name, url, _):
        """Create a record for each album and download every photo in it."""
        album_name = 'album-' + path_safe(name)
        album = record.Album(self._output_file(target, album_name), True)

        def album_download_cb(photourl, perma, _):
            self._save_to_album(photourl, '', perma, album)

        scraped = self.crawl_one_album(url, album_download_cb)
        log.info('Scraped %d photos into %s', scraped, album.name)

    self.crawl_albums(targeturl, album_cb)

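# `path_safe` is defined elsewhere; a minimal sketch of the assumed behaviour
# (replace characters that are unsafe in file names), not the project's
# actual implementation:
#
#     import re
#
#     def path_safe(name):
#         return re.sub(r'[^A-Za-z0-9._-]+', '_', name)
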
def scrape_posts_by_year(self, targeturl, year=None):
    target = get_target(targeturl)
    rec_name = 'posts'
    if year:
        rec_name += '_' + str(year)
    rec = record.Record(self._output_file(target, rec_name),
                        ['date', 'post', 'translation', 'permalink'])
    log.info('Scraping posts into %s', rec.filename)

    def callback(p_time, post_text, p_link, translation, i):
        rec.add_record({
            'date': timestring(p_time),
            'post': post_text,
            'translation': translation,
            'permalink': p_link,
        })
        # build a unicode banner for the log; the record above already holds
        # the raw translation, and an empty banner avoids logging a literal
        # 'None' when there is no translation
        banner = u''
        if translation:
            banner = u'==== TRANSLATION ====\n{}\n'.format(translation)
        log.info(('Scraped post %d\n\n#### START POST ####\n%s\n%s'
                  '#### END POST ####\n'), i, post_text, banner)

    posts_scraped = self.crawl_posts(targeturl, callback, year)
    log.info('Scraped %d posts into %s', posts_scraped, rec.filename)

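# `timestring` is defined elsewhere; the callback above assumes it renders a
# post timestamp as a human-readable date. A sketch under the assumption that
# `p_time` is a Unix epoch value:
#
#     import time
#
#     def timestring(epoch):
#         return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(epoch))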