def crawl_nexon_board():
    """Crawl the 10 most recent posts on the Nexon board.

    Opens the board, clicks through to the newest post to discover its
    numeric id, then visits the 10 most recent posts by decrementing
    that id.

    Returns:
        list[str]: a flat list containing, for each post, its title,
        its content, then the text of each of its comments.

    Raises:
        ValueError: if the post id cannot be parsed from the URL of the
        newest post.
    """
    raw_data = []
    browser = Browser()

    # Open the board and click the newest post so its id appears in the URL.
    browser.get_page(URL.BASE_URL)
    browser.click_element_by_xpath(ElementXpath.POST_TITLES_ON_BOARD)

    # Raw string for the regex (non-raw '\d' is a deprecated escape).
    # NOTE(review): the parameter name 'n4articlesn=' looks garbled —
    # confirm it matches the actual article-id query parameter.
    current_url = browser.get_current_url()
    match = re.search(r'n4articlesn=(\d+)', current_url)
    if match is None:
        # Fail with a clear message instead of an opaque AttributeError
        # on .group() when the URL layout changes.
        raise ValueError('Could not extract post id from URL: %s' % current_url)
    recent_post_id = int(match.group(1))

    for offset in range(10):
        url = PathUtils.fromTemplate(URL.POST_URL, str(recent_post_id - offset))
        browser.get_page(url)
        title = browser.get_element_by_xpath(ElementXpath.TITLE_ON_POST).text
        # Post body lives in the element's "content" attribute, not .text.
        content = browser.get_element_by_xpath(
            ElementXpath.CONTENT_ON_POST).get_attribute("content")
        comments = [element.text
                    for element in browser.get_elements_by_xpath(
                        ElementXpath.COMMENTS_ON_POST)]
        raw_data.append(title)
        raw_data.append(content)
        raw_data.extend(comments)
    return raw_data
class FarooSearch(object):
    """Query the Faroo web-search JSON API through a Browser session."""

    # %-template filled with query/start/length in _get_results.
    SEARCH_URL = ("http://www.faroo.com/api?q=%(query)s&start=%(start)s"
                  "&length=%(length)s&l=en&src=web&f=json")

    def __init__(self, query, start, length):
        """Prepare a search for `query`, paged by `start`/`length`.

        Only spaces are URL-escaped here; other reserved characters in
        `query` are passed through unchanged.
        """
        self.query = query.replace(' ', '%20')
        self.start = start
        self.length = length
        self.browser = Browser()

    def _get_results(self):
        """Fetch one result page and return it decoded as UTF-8.

        Raises:
            SearchError: if the underlying Browser request fails.
        """
        actual_url = FarooSearch.SEARCH_URL % {
            'query': self.query,
            'start': self.start,
            'length': self.length,
        }
        try:
            page = self.browser.get_page(actual_url)
        except BrowserError as e:
            # Original handler referenced an unbound `e` and applied `%`
            # to the exception object instead of the message string,
            # which would itself raise. Bind the exception and format
            # the message before raising.
            raise SearchError("Failed getting %s: %s" % (e.url, e.error))
        return page.decode("utf-8")