def main():
    """Read config.ini, fetch quotes for the CSV-listed stocks, stream them to
    Kinesis, and print one CSV row per (stock, response) pair.
    """
    # SafeConfigParser has been deprecated since Python 3.2 and was removed in
    # 3.12; ConfigParser is the drop-in replacement.
    from configparser import ConfigParser

    parser = ConfigParser()
    parser.read("config.ini")
    # getint() replaces the string get + repeated int() conversions.
    capacity = parser.getint('stocks', 'number')

    folder = parser.get('csv', 'folder')
    csv_file = parser.get('csv', 'filename')
    # Renamed from `csv` to avoid shadowing the stdlib csv module.
    reader = CsvReader(folder, csv_file, capacity)
    stocks = iter(reader.get_stocks())

    url = parser.get('stocks', 'url')
    request = Request(url, stocks, capacity)
    stock_info = request.get_url_info()

    stream = parser.get('kinesis', 'stream_name')
    # Shard count/config is stored as a JSON value in the ini file.
    shards = json.loads(parser.get('kinesis', 'shards'))
    kinesis = Kinesis(stream, shards)
    responses = kinesis.stream_stock(stock_info)

    print('{},{},{},{},{},{}'.format('Timestamp', 'StatusCode', 'ShardId',
                                     'Sequence Number', 'Stock', 'Stock Price'))
    # Pair each stock record with its Kinesis response and emit one CSV line.
    for s, r in zip(stock_info, responses):
        t = (s + r)
        print(','.join(str(i) for i in t))
def list_parse(self, res):
    """Parse a listing page: print each article's title and absolute URL,
    then follow the pagination link if one exists.
    """
    soup = BeautifulSoup(res.text, 'lxml')
    # Walk the article entries on the listing page.
    for entry in soup.select('div.listNews.whtPD.columns'):
        print(entry.find('h2', 'titleNews').get_text(strip=True))
        print(urljoin(res.url, entry.a['href']))
        print()
        # NOTE(review): stops after the first entry — confirm this break
        # is intentional and not leftover debugging.
        break
    # Follow the "next page" link when present.
    pagination = soup.select('.navigation-page a.jp-last')
    if pagination:
        next_url = urljoin(res.url, pagination[0]['href'])
        yield Request(next_url, cb=self.list_parse)
def start(self):
    """Seed the crawl with a Twitter timeline search for the hard-coded keyword."""
    keyword = 'Trump'
    search_params = {
        'vertical': 'news',
        'q': keyword,
        'src': 'typd',
        'include_available_features': '1',
        'include_entities': '1',
        'reset_error_state': 'false',
    }
    search_headers = {
        'accept-language': 'zh-CN,zh-HK;q=0.9,zh;q=0.8,zh-TW;q=0.7',
    }
    yield Request('https://twitter.com/i/search/timeline',
                  params=search_params,
                  headers=search_headers,
                  cb=self.list_parse)
def list_parse(self, res):
    """Parse a search-results payload: emit an item for each tweet, routing
    tweets that have replies through a follow-up conversation request first.
    """
    soup = BeautifulSoup(res.json()['items_html'], 'lxml')
    for tweet_node in soup.select('li.stream-item'):
        data = self.information(tweet_node, res.url)
        # No replies: the tweet is complete, emit it directly.
        if data['footer']['reply_count'] <= 0:
            yield TwitterItem(data)
            continue
        # Has replies: fetch the conversation page before emitting the item.
        conv_url = 'https://twitter.com/i/%s/conversation/%s' % (
            data['user']['username'], data['id'])
        yield Request(conv_url, params={
            'include_available_features': '1',
            'include_entities': '1',
            'max_position': '',
            'reset_error_state': 'false'
        }, headers={
            'accept-language': 'zh-CN,zh-HK;q=0.9,zh;q=0.8,zh-TW;q=0.7'
        }, cb=self.reply_parse, item=data)
def start(self):
    """Seed requests for the first 20 index pages of The Jakarta Post."""
    url_template = 'https://www.thejakartapost.com/index/page/%d'
    for page in range(1, 21):
        yield Request(url_template % page, cb=self.list_parse)
def start(self):
    """Seed the crawl with the site's index page."""
    index_url = 'http://www.thejakartapost.com/index'
    yield Request(index_url, cb=self.list_parse)