def get_data(token_name, url, api_url):
    """Fetch star/fork/watch counts for a token's GitHub repo and upsert them.

    Args:
        token_name: token identifier; part of the Mongo lookup key.
        url: the repository's GitHub page URL; stored and part of the key.
        api_url: GitHub REST API endpoint for the repository.
    """
    collection = Mongo().github
    # NOTE(review): OAuth client credentials are hard-coded and exposed in
    # source control; they should move to configuration and be rotated.
    # Credentials are passed via `params=` instead of hand-building the
    # query string (also URL-encodes values correctly).
    result = requests.get(api_url, params={
        'client_id': 'dcc3734066251548c999',
        'client_secret': '89d90ad41f32b18d2ed689cb21875b75e88a2d82',
    }).json()
    if 'forks_count' not in result:
        # API error payload (rate limit, missing repo, ...): report it
        # instead of the silent return the original left as a TODO.
        print('github api error for {}: {}'.format(api_url, result))
        return
    token = collection.find_one({
        'token_name': token_name,
        'github_url': url
    })
    insert_data = {
        'token_name': token_name,
        'github_url': url,
        'star': result['stargazers_count'],
        'fork': result['forks_count'],
        'watch': result['subscribers_count'],
        'spider_time': time.time(),
        'update_time': result['updated_at'],
        'create_time': result['created_at']
    }
    if token:
        # Existing record: merge the fresh stats and save in place.
        token.update(insert_data)
        collection.save(token)
    else:
        collection.insert(insert_data)
def get_single_article(url):
    """Scrape one Sina article page and store it in the news collection.

    Returns False when no title could be parsed from the page, True
    otherwise (including when the article was already stored).
    """
    page = PyQuery(url=url, encoding="utf-8")
    body = page('#artibody')
    body.remove('#left_hzh_ad')  # strip the inline ad block from the body
    content = body.text()
    title = page('.main-title').text()
    if not title:
        return False
    source = page('.source').text()
    collection = Mongo().news
    if collection.find_one({'spider_from': 'sina', 'url': url}):
        # Already scraped on an earlier run.
        return True
    collection.insert({
        'type': 'articles',
        'created_at': int(time.time()),
        'author': '',
        'spider_from': 'sina',
        'source': source,
        'source_id': -1,
        'title': title,
        'content': content,
        'url': url,
        'images': [],
        'keywords': [],
        'has_send': 0,
    })
    return True
def get_erc_transaction():
    """Scrape etherscan's ERC-20 token list and upsert per-token transaction counts.

    Only tokens whose name appears in the module-level ``key_words`` set are
    stored. Covers the top 150 tokens (取前面150位): 50 per page, pages 1-3.
    """
    collection = Mongo().token
    # BUG FIX: the original incremented a page counter but never put it into
    # the URL, so it fetched page 1 three times. etherscan paginates the
    # token list with the ``p`` query parameter.
    for page in range(1, 4):
        list_page = PyQuery(url='https://etherscan.io/tokens?p={}'.format(page))
        for token in list_page('tbody')('tr').items():
            token_name = token('h5')('a').text()
            # Last word of the displayed name is the ticker, e.g. "Tether (USDT)".
            token_name = re.findall(r'\w+', token_name)[-1].lower()
            href = 'https://etherscan.io' + token('h5')('a').attr('href')
            contract_address = href.split('/')[-1]
            if token_name not in key_words:
                continue
            try:
                transaction = get_single_erc_transaction(contract_address)
                db_result = collection.find_one({'token_name': token_name})
                if db_result:
                    db_result.update({'transaction': transaction})
                    collection.save(db_result)
                else:
                    collection.insert({
                        'token_name': token_name,
                        'transaction': transaction
                    })
            except Exception:
                # Was a bare ``except:``; keep the best-effort behaviour but
                # stop swallowing SystemExit/KeyboardInterrupt.
                print(contract_address)
def get_google_trend(key, token_id):
    """Fetch Google Trends daily interest data for ``key`` and upsert it by token_id.

    Returns True on success; network/JSON errors propagate to the caller.
    """
    # SOCKS proxy setup kept for reference (disabled):
    # socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", 1086)
    # temp_socket = socket.socket
    # socket.socket = socks.socksocket
    token, search_time = get_google_token(key)
    # BUG FIX: the header names were malformed ('User_Agent', 'Referfer',
    # lowercase 'host'); real HTTP headers are 'User-Agent', 'Referer' and
    # 'Host', so the originals were transmitted as unknown headers.
    headers = {
        'Host': 'trends.google.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
        'Referer': ('https://trends.google.com/trends/explore?q=' + key).encode('utf-8'),
        'x-client-data': 'CJa2yQEIo7bJAQjBtskBCKmdygEIqKPKAQ=='
    }
    request_url = 'https://trends.google.com/trends/api/widgetdata/multiline?hl=zh-CN&tz=-480&req=%7B%22time%22:%22{}%22,%22resolution%22:%22DAY%22,%22locale%22:%22zh-CN%22,%22comparisonItem%22:%5B%7B%22geo%22:%7B%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22{}%22%7D%5D%7D%7D%5D,%22requestOptions%22:%7B%22property%22:%22%22,%22backend%22:%22IZG%22,%22category%22:0%7D%7D&token={}&tz=-480'.format(
        search_time, key, token)
    # The first 5 characters are Google's anti-JSON-hijacking prefix
    # (")]}'\n"); strip them before parsing the JSON body.
    result = requests.get(request_url, headers=headers).text[5:]
    result = json.loads(result)
    data = result['default']['timelineData']
    # socket.socket = temp_socket
    collection = Mongo().google_trends
    db_result = collection.find_one({'token_id': token_id})
    if db_result:
        db_result.update({'trends': data})
        collection.save(db_result)
        return True
    collection.insert({
        'token_id': token_id,
        'token_name': key,
        'trends': data
    })
    return True
def start_spider(self):
    """Pull the bishijie newsflash feed and insert any items not yet stored.

    On any failure the exception is handed to ``self.retry``.
    """
    result = None
    try:
        result = requests.get(conf['news']['bishijie']).json()
        collection = Mongo().news
        if result['error'] != 0:
            # NOTE(review): execution continues after retry(); if retry()
            # does not raise, the code below runs against an error payload.
            # Confirm whether a `return` is missing here.
            self.retry()
        result = result['data']
        for date in result:
            entries = result[date]['buttom']
            id_list = [entry['newsflash_id'] for entry in entries]
            # One batched query for everything we already have on this date.
            known = collection.find({
                'spider_from': 'bishijie',
                'source_id': {'$in': id_list},
            })
            known_ids = [row['source_id'] for row in known]
            for entry in entries:
                if entry['newsflash_id'] in known_ids:
                    continue
                content = entry['content']
                # Titles are embedded as 【title】 at the head of the content.
                try:
                    head = content.index('【')
                    tail = content.index('】')
                    title = content[head + 1:tail]
                    content = content[tail + 1:]
                except Exception:
                    title = ''
                collection.insert({
                    'type': 'news',
                    'created_at': entry['issue_time'],
                    'author': entry['source'],
                    'spider_from': 'bishijie',
                    'source': 'bishijie',
                    'source_id': entry['newsflash_id'],
                    'title': title,
                    'content': content,
                    'url': 'http://www.bishijie.com/home/newsflashpc/detail?id=' + str(entry['newsflash_id']),
                    'images': [],
                    'keywords': [],
                    'has_send': 0,
                })
    except Exception as e:
        self.retry(e)
def start_spider():
    """Scrape the wallstreetcn blockchain live feed and store new items.

    Returns True when the page has been processed.
    """
    collection = Mongo().news
    # html = requests.get('http://www.jinse.com/lives').text
    dom = PyQuery(url='https://wallstreetcn.com/live/blockchain')
    panes = dom(".wscn-tab-pane").items()
    next(panes)          # skip the first tab pane
    pane = next(panes)   # the pane carrying the live items
    for li in pane('.live-item').items():
        content = li('.live-item__main__content')('p').text()
        if not content:
            continue
        content_more = li('.live-item__main__content-more')('p').html()
        # Titles are embedded as 【title】 at the head of the content.
        try:
            head = content.index('【')
            tail = content.index('】')
            title = content[head + 1:tail]
            content = content[tail + 1:]
        except Exception:
            title = ''
        if content_more:
            content += content_more
        images = [img.attr('src')
                  for img in li('.live-item__main__images')('.zoomer__img').items()]
        # 查询记录是否已经存在 (dedupe on exact content match)
        db_count = collection.find({
            'spider_from': 'wallstreetcn',
            'content': content
        }).count()
        if db_count > 0:
            continue
        collection.insert({
            'type': 'news',
            'created_at': int(time.time()),
            'author': "华尔街见闻",
            'spider_from': 'wallstreetcn',
            'source': 'wallstreetcn',
            'source_id': -1,
            'title': title,
            'content': content,
            'url': '',
            # BUG FIX: the scraped image URLs were collected and then
            # discarded ('images': []); store them instead.
            'images': images,
            'keywords': [],
            'has_send': 0,
        })
    return True
def get_btc_holders():
    """Record recent BTC wallet-count datapoints from blockchain.info."""
    collection = Mongo().token_address
    response = requests.get('https://api.blockchain.info/charts/my-wallet-n-users?format=json')
    if response.status_code != 200:
        return
    # Keep the four most recent complete datapoints (the final one may be partial).
    for point in response.json()['values'][-5:-1]:
        already_stored = collection.find_one({
            'token_name': 'btc',
            'time': point['x'],
        })
        if already_stored:
            continue
        collection.insert({
            'token_name': 'btc',
            'time': point['x'],
            'address': point['y'],
        })
def get_user_info(token_name, username, token_id):
    """Fetch a Twitter profile via the API and upsert it keyed by (token_id, user_name).

    Twitter API failures (TweepError) are treated as best-effort and ignored.
    """
    try:
        collection = Mongo().twitter
        profile = api.get_user(screen_name=username)
        payload = profile._json  # raw API dict, augmented with our own keys
        payload['token_name'] = token_name
        payload['user_name'] = username
        payload['token_id'] = token_id
        existing = collection.find_one({
            "token_id": token_id,
            "user_name": username
        })
        if existing is None:
            collection.insert(payload)
        else:
            existing.update(payload)
            collection.save(existing)
    except TweepError:
        # Missing/suspended accounts are expected; skip silently.
        pass
def get_erc20():
    """Snapshot holder counts for watched ERC-20 tokens from etherscan.

    Covers the top 150 tokens (取前面150位): 50 per page, pages 1-3. Only
    tokens in the module-level ``key_words`` set are stored.
    """
    collection = Mongo().token_address
    # BUG FIX: the original loop counter never reached the URL, so page 1
    # was scraped three times; etherscan paginates with ``p``.
    for page in range(1, 4):
        list_page = PyQuery(url='https://etherscan.io/tokens?p={}'.format(page))
        for token in list_page('tbody')('tr').items():
            token_name = token('h5')('a').text()
            # Last word of the displayed name is the ticker, e.g. "Tether (USDT)".
            token_name = re.findall(r'\w+', token_name)[-1].lower()
            href = 'https://etherscan.io' + token('h5')('a').attr('href') + '#balances'
            if token_name not in key_words:
                continue
            collection.insert({
                'token_name': token_name,
                'address': get_erc20_holders(href),
                'time': int(time.time()),
            })
def get_eth_holders():
    """Record recent ETH unique-address datapoints from etherscan's CSV chart export."""
    collection = Mongo().token_address
    response = requests.get('https://etherscan.io/chart/address?output=csv')
    if response.status_code != 200:
        return
    # Keep the four most recent complete rows (the trailing line is dropped).
    # Field 1 is taken as the unix timestamp, field 2 as the address count.
    for row in response.text.split('\r\n')[-5:-1]:
        fields = row.replace('"', '').split(',')
        ts = int(fields[1])
        count = int(fields[2])
        if collection.find_one({'token_name': 'eth', 'time': ts}):
            continue
        collection.insert({
            'token_name': 'eth',
            'time': ts,
            'address': count,
        })
def get_transaction():
    """Scrape blocktivity.info transaction counts per chain, upsert them, then refresh ERC-20 counts."""
    collection = Mongo().token
    dom = PyQuery(url='http://www.blocktivity.info/')
    for row in dom('.font_size_row').items():
        name = row('td').eq(2)('a').text().lower()
        # The count cell contains separators/extra characters; keep digits only.
        raw_count = row('td').eq(3).text()
        count = int(''.join(ch for ch in raw_count if ch.isdigit()))
        record = collection.find_one({'token_name': name})
        if record:
            record.update({'transaction': count})
            collection.save(record)
        else:
            collection.insert({
                'token_name': name,
                'transaction': count,
            })
    get_erc_transaction()
def cryptopanic_spider():
    """Pull cryptopanic items, translate title/body, and insert unseen ones.

    Returns False when the upstream fetch yielded nothing.
    """
    collection = Mongo().news
    news = get_cryptopanic()
    if not news:
        return False
    for new in news:
        source_id = new['pk']
        db_count = collection.find({
            'spider_from': 'cryptopanic',
            'source_id': source_id
        }).count()
        if db_count > 0:
            continue
        title, content = new.get('title'), html2text(new.get('body'))
        title_cn, content_cn = google_translate_list([title, content])
        insert_data = {
            'type': new['kind'],
            'created_at': int(time.time()),
            'author': new.get('domain'),
            'spider_from': 'cryptopanic',
            'source': new['source']['domain'],
            'source_id': source_id,
            # FIX: reuse the values computed above instead of re-reading the
            # title and running html2text a second time over the same body.
            'title': title,
            'content': content,
            'url': new.get('url'),
            'images': new.get('image'),
            'has_keywords': 0,
            'has_send': 0,
            'repeat': -1,
            'has_translated': 1,
            'translated_title': title_cn,
            'translated_text': content_cn
        }
        currencies = new.get('currencies')
        if currencies:
            insert_data['keywords'] = [c['code'] for c in currencies]
            insert_data['has_keywords'] = 1
        collection.insert(insert_data)
def statistic_tokens_address():
    """Compute address count/increase for every tracked token and upsert into the token collection."""
    collection = Mongo().token
    for token in get_tokens():
        name = token['ticker'].lower()
        ok, address, increase = statistic_token_address(name)
        if not ok:
            # Statistic unavailable for this token: store zeros.
            address, increase = 0, 0
        record = collection.find_one({'token_name': name})
        if record:
            record.update({
                'address': address,
                'address_increase': increase,
            })
            collection.save(record)
        else:
            collection.insert({
                'token_name': name,
                'address': address,
                'address_increase': increase,
            })
def start_spider():
    """Pull the jinse live feed and insert flashes not yet in the news collection.

    Returns True when the feed has been processed.
    """
    collection = Mongo().news
    response = requests.get(
        'https://api.jinse.com/v4/live/list?limit=20&reading=false')
    for day in response.json()['list']:
        for item in day['lives']:
            source_id = item['id']
            content = item['content']
            # 查询记录是否已经存在 (skip records already stored)
            existing = collection.find({
                'spider_from': 'jinse',
                'source_id': source_id
            }).count()
            if existing > 0:
                continue
            # Titles are embedded as 【title】 at the head of the content.
            try:
                head = content.index('【')
                tail = content.index('】')
                title = content[head + 1:tail]
                content = content[tail + 1:]
            except Exception:
                title = ''
            collection.insert({
                'type': 'news',
                'created_at': int(time.time()),
                'author': "金色快讯",
                'spider_from': 'jinse',
                'source': 'jinse',
                'source_id': source_id,
                'title': title,
                'content': content,
                'url': 'http://www.jinse.com/lives/' + str(source_id) + '.htm',
                'images': [],
                'keywords': [],
                'has_send': 0,
            })
    return True
class TestMongo(TestCase):
    """Integration tests for the Mongo wrapper against a local 'monitor' database."""

    def setUp(self):
        # Connect and seed one html record plus one diff record.
        config = {
            "hostname": "localhost",
            "port": 27017,
            "username": "******",
            "password": "******",
            "database": "monitor",
        }
        self.client = Mongo(config)
        self.client.insert(
            {
                "html": "<html><body><h1>TEST</h1></body></html>",
                "id": 1
            },
            {
                "id": 1,
                "diff": "html > body > h1",
                "count": 1
            })

    def test_get_exec_count(self):
        self.assertEqual(1, self.client.get_exec_count())

    def test_get_previous_html(self):
        stored = self.client.get_previous_html()
        self.assertEqual("<html><body><h1>TEST</h1></body></html>",
                         stored["html"])

    def test_update_exec_count(self):
        self.client.update_exec_count()
        self.assertEqual(2, self.client.get_exec_count())

    def test_update_previous_html(self):
        # NOTE(review): "</2>" looks like a typo for "</h2>", but the fixture
        # is used consistently on both sides of the assertion, so it is kept.
        self.client.update_previous_html(
            "<html><body><h1>TEST</h1><h2>TEST2</2></body></html>")
        self.assertEqual(
            "<html><body><h1>TEST</h1><h2>TEST2</2></body></html>",
            self.client.get_previous_html())

    def test_find_diff_from_previous(self):
        found = self.client.find_diff_from_previous("html > body > h1")
        self.assertEqual(1, found["id"])
        self.assertEqual("html > body > h1", found["diff"])
        self.assertEqual(1, found["count"])

    def test_find_diff_from_previous_empty(self):
        self.assertIsNone(self.client.find_diff_from_previous("invalid key"))

    def test_insert_previous_diff(self):
        # The key must not exist before the insert and must exist after.
        self.assertIsNone(self.client.find_diff_from_previous("key1"))
        self.client.insert_previous_diff("key1")
        self.assertIsNotNone(self.client.find_diff_from_previous("key1"))

    def test_update_previous_diff(self):
        before = self.client.find_diff_from_previous("html > body > h1")
        self.assertEqual(1, before["count"])
        self.client.update_previous_diff("html > body > h1")
        after = self.client.find_diff_from_previous("html > body > h1")
        self.assertEqual(2, after["count"])

    def test_insert_or_update_diff_with_new_diff(self):
        created = self.client.insert_or_update_diff("key1")
        self.assertEqual("key1", created["diff"])
        self.assertEqual(1, created["count"])

    def test_insert_or_update_diff_with_exist_diff(self):
        updated = self.client.insert_or_update_diff("html > body > h1")
        self.assertEqual("html > body > h1", updated["diff"])
        self.assertEqual(2, updated["count"])

    def tearDown(self):
        # Drop the test database so every test starts from a clean slate.
        self.client.drop()