def get_single_article(url):
    """Scrape one Sina article page and store it in the news collection.

    Returns False when the page has no ``.main-title`` text. Returns True
    otherwise, whether the article was inserted now or a document with the
    same ``spider_from``/``url`` already existed.
    """
    page = PyQuery(url=url, encoding="utf-8")

    # Article body, with the '#left_hzh_ad' node stripped before reading text.
    body = page('#artibody')
    body.remove('#left_hzh_ad')
    content = body.text()

    title = page('.main-title').text()
    if not title:
        return False
    source = page('.source').text()

    collection = Mongo().news
    already_stored = collection.find_one({'spider_from': 'sina', 'url': url})
    if not already_stored:
        collection.insert({
            'type': 'articles',
            'created_at': int(time.time()),
            'author': '',
            'spider_from': 'sina',
            'source': source,
            'source_id': -1,
            'title': title,
            'content': content,
            'url': url,
            'images': [],
            'keywords': [],
            'has_send': 0
        })
    return True
def get_database(config, setting):
    """Build the storage backend selected by *config*.

    ``"file"`` selects ``File``; ``"mongo"`` and every other value select
    ``Mongo``. The chosen class is instantiated with *setting*.
    """
    backend = File if config == "file" else Mongo
    return backend(setting)
def find_repeat_news():
    """Flag news from the last hour as duplicate or not.

    Sets ``repeat`` on each document that does not already carry a truthy
    ``repeat`` value: 1 means duplicate, -1 means not duplicate.
    Every fetched document is saved back to the collection.

    :return: None
    """
    collection = Mongo().news
    one_hour_ago_query = {'created_at': {'$gt': time.time() - 3600}}
    news = list(collection.find(one_hour_ago_query))

    # Temporary per-document keywords, removed again before saving.
    for item in news:
        item['keywords_temp'] = get_keywords(item['content'])

    for candidate in news:
        if candidate.get('repeat'):
            # Already classified earlier (1 and -1 are both truthy).
            candidate['state'] = 1
            continue

        for other in news:
            if other['_id'] == candidate['_id']:
                continue
            if is_sim(candidate, other):
                candidate['repeat'] = 1
                candidate['state'] = 1
                break

        # 'state' is only a "handled" marker for this pass.
        if candidate.get('state') != 1:
            candidate['state'] = 1
            candidate['repeat'] = -1

    for item in news:
        item.pop('state')
        item.pop('keywords_temp')
        collection.save(item)
def get_github_data(token_name):
    """Return the most recently crawled GitHub record for *token_name*.

    The newest document (by ``spider_time``) is wrapped with
    ``return_success`` after its ``_id`` is removed. Any exception raised
    while querying or wrapping yields ``return_error()`` instead.
    """
    collection = Mongo().github
    try:
        cursor = collection.find({'token_name': token_name})
        newest_first = cursor.sort('spider_time', pymongo.DESCENDING)
        latest = newest_first.limit(1)[0]
        latest.pop('_id')
        return return_success(data=latest)
    except Exception:
        return return_error()
def start_spider(self):
    """Fetch bishijie news flashes and insert the ones not yet stored.

    Items whose ``newsflash_id`` already exists for ``spider_from ==
    'bishijie'`` are skipped. A title wrapped in 【…】 is split off the
    content; when that fails the title is left empty. Any exception leads
    to ``self.retry(e)``.
    """
    try:
        payload = requests.get(conf['news']['bishijie']).json()
        collection = Mongo().news
        if payload['error'] != 0:
            self.retry()

        by_date = payload['data']
        for date in by_date:
            flashes = by_date[date]['buttom']

            # One query per date to learn which ids are already stored.
            id_list = [flash['newsflash_id'] for flash in flashes]
            stored = collection.find({
                'spider_from': 'bishijie',
                'source_id': {
                    '$in': id_list
                }
            })
            db_id_list = [doc['source_id'] for doc in stored]

            for flash in by_date[date]['buttom']:
                if flash['newsflash_id'] in db_id_list:
                    continue

                content = flash['content']
                try:
                    title_start = content.index('【')
                    title_end = content.index('】')
                    title = content[title_start + 1:title_end]
                    content = content[title_end + 1:]
                except Exception:
                    title = ''

                collection.insert({
                    'type': 'news',
                    'created_at': flash['issue_time'],
                    'author': flash['source'],
                    'spider_from': 'bishijie',
                    'source': 'bishijie',
                    'source_id': flash['newsflash_id'],
                    'title': title,
                    'content': content,
                    'url': 'http://www.bishijie.com/home/newsflashpc/detail?id=' + str(flash['newsflash_id']),
                    'images': [],
                    'keywords': [],
                    'has_send': 0
                })
    except Exception as e:
        # NOTE(review): the exception is passed positionally; if `self` is a
        # task object whose retry() takes `exc=` as a keyword, this is
        # probably not what was intended -- confirm against the task class.
        self.retry(e)
def news_send_finish(self, news):
    """Mark every document in *news* as sent and save it.

    Each item gets ``has_send = 1`` and is written back to the news
    collection. If anything fails, ``self.retry()`` is called.

    :param news: iterable of news documents (dicts) to flag as sent.
    """
    try:
        collection = Mongo().news
        for item in news:
            item['has_send'] = 1
            collection.save(item)
    except Exception:
        # Not a bare ``except``: SystemExit / KeyboardInterrupt must be
        # allowed to propagate instead of triggering a retry.
        self.retry()
def deal_content():
    """Extract keywords for news documents that do not have them yet.

    Documents where ``has_keywords`` is not 1 are processed; those with an
    empty title or content are skipped. The keywords are computed from
    ``title + ';' + content`` and saved together with ``has_keywords = 1``.
    """
    collection = Mongo().news
    pending = collection.find({'has_keywords': {'$ne': 1}})
    for item in pending:
        if not (item['title'] and item['content']):
            continue
        text = item['title'] + ';' + item['content']
        item.update({'keywords': get_keywords(text), 'has_keywords': 1})
        collection.save(item)
def sync_google_trends():
    """Queue the latest Google Trends value of every stored token.

    For each document with a non-empty ``trends`` list, the first element
    of the last entry's ``value`` is handed to ``send_google_trend.delay``
    together with the ``token_id``.
    """
    for trend in Mongo().google_trends.find({}):
        if not trend.get('trends'):
            continue
        latest_point = trend['trends'][-1]
        send_google_trend.delay({
            'token_id': trend['token_id'],
            'search_number': latest_point['value'][0]
        })
def start_spider():
    """Scrape the wallstreetcn blockchain live feed into the news collection.

    The second ``.wscn-tab-pane`` element of the page is used. For each
    ``.live-item`` with text, a 【…】 title is split off the content, any
    "content-more" HTML is appended, and image ``src`` values are collected.
    Items whose final content is already stored for ``wallstreetcn`` are
    skipped.

    :return: True once the page has been processed.
    """
    collection = Mongo().news
    dom = PyQuery(url='https://wallstreetcn.com/live/blockchain')

    panes = dom(".wscn-tab-pane").items()
    next(panes)  # the first pane is skipped; the second one is scraped
    pane = next(panes)

    for li in pane('.live-item').items():
        content = li('.live-item__main__content')('p').text()
        if not content:
            continue
        content_more = li('.live-item__main__content-more')('p').html()

        try:
            front_title_index = content.index('【')
            tail_title_index = content.index('】')
            title = content[front_title_index + 1: tail_title_index]
            content = content[tail_title_index + 1:]
        except ValueError:
            # No complete 【…】 marker pair: keep the content, leave title empty.
            title = ''
        if content_more:
            content += content_more

        image_nodes = li('.live-item__main__images')('.zoomer__img')
        images = [image.attr('src') for image in image_nodes.items()]

        # Check whether the record already exists.
        db_count = collection.find({
            'spider_from': 'wallstreetcn',
            'content': content
        }).count()
        if db_count > 0:
            continue

        collection.insert({
            'type': 'news',
            'created_at': int(time.time()),
            'author': "华尔街见闻",
            'spider_from': 'wallstreetcn',
            'source': 'wallstreetcn',
            'source_id': -1,
            'title': title,
            'content': content,
            'url': '',
            # Previously the collected image URLs were dropped and an empty
            # list was always stored.
            'images': images,
            'keywords': [],
            'has_send': 0
        })
    return True
def send_single_token_github(token_id, token_name):
    """POST the stored GitHub statistics of one token to the sync service.

    Does nothing when no GitHub record exists for *token_name*. Otherwise
    the record's url/star/fork/watch values are posted to
    ``conf['sync']['host'] + conf['sync']['git_update']`` and the JSON
    response is printed.
    """
    record = Mongo().github.find_one({
        'token_name': token_name,
    })
    if not record:
        return

    payload = {
        "token_id": token_id,
        'url': record['github_url'],
        'star': record['star'],
        'fork': record['fork'],
        'user_count': record['watch'],
        'code_hot': record['star']
    }
    endpoint = conf['sync']['host'] + conf['sync']['git_update']
    response = requests.post(endpoint, data=payload)
    print(response.json())
def get_btc_holders():
    """Store recent blockchain.info wallet-user counts as BTC address data.

    On an HTTP 200 response, the slice ``[-5:-1]`` of the ``values`` series
    (the four points before the last one) is examined; points whose ``x``
    timestamp is not yet stored for ``btc`` are inserted with ``y`` as the
    ``address`` value.
    """
    collection = Mongo().token_address
    response = requests.get('https://api.blockchain.info/charts/my-wallet-n-users?format=json')
    if response.status_code != 200:
        return

    for point in response.json()['values'][-5:-1]:
        existing = collection.find_one({
            'token_name': 'btc',
            'time': point['x']
        })
        if existing:
            continue
        collection.insert({
            'token_name': 'btc',
            'time': point['x'],
            'address': point['y']
        })
def get_erc_transaction():
    """Update the transaction count of tracked ERC tokens from etherscan.

    Every table row of the etherscan token listing whose (lower-cased) last
    word is in ``key_words`` gets its transaction count fetched through
    ``get_single_erc_transaction`` and upserted into the token collection.
    A failure for one token is printed and does not stop the others.
    """
    collection = Mongo().token
    # Take the top 150.
    # NOTE(review): three passes are made but the page number is never put
    # into the URL, so the same listing page is fetched each time --
    # presumably a page query parameter was intended; confirm.
    for _page in range(3):
        list_page = PyQuery(url='https://etherscan.io/tokens')
        for token in list_page('tbody')('tr').items():
            link = token('h5')('a')
            name_words = re.findall(r'\w+', link.text())
            if not name_words:
                # Row without a usable name; indexing [-1] would raise.
                continue
            token_name = name_words[-1].lower()
            if token_name not in key_words:
                continue

            href = 'https://etherscan.io' + link.attr('href')
            contract_address = href.split('/')[-1]
            try:
                transaction = get_single_erc_transaction(contract_address)
                db_result = collection.find_one({'token_name': token_name})
                if db_result:
                    db_result.update({
                        'transaction': transaction
                    })
                    collection.save(db_result)
                else:
                    collection.insert({
                        'token_name': token_name,
                        'transaction': transaction
                    })
            except Exception as exc:
                # Best effort per token, but let SystemExit/KeyboardInterrupt
                # through and show what actually went wrong.
                print(contract_address, exc)
def get_data(token_name, url, api_url):
    """Fetch repository statistics from *api_url* and upsert them.

    :param token_name: token the repository belongs to.
    :param url: repository URL stored as ``github_url``.
    :param api_url: API endpoint returning the repository JSON.
    :return: None. Nothing is stored when the response lacks
        ``forks_count``.
    """
    collection = Mongo().github

    # NOTE(review): credentials are hard-coded in source; they should be
    # moved to configuration and rotated.
    client_id = 'dcc3734066251548c999'
    client_secret = '89d90ad41f32b18d2ed689cb21875b75e88a2d82'

    # Send the OAuth app credentials via HTTP basic auth rather than as
    # ``client_id``/``client_secret`` query parameters: GitHub no longer
    # accepts them in the query string, and this keeps the secret out of URLs.
    result = requests.get(api_url, auth=(client_id, client_secret)).json()
    if 'forks_count' not in result:
        # TODO record error result
        return

    token = collection.find_one({
        'token_name': token_name,
        'github_url': url
    })
    insert_data = {
        'token_name': token_name,
        'github_url': url,
        'star': result['stargazers_count'],
        'fork': result['forks_count'],
        'watch': result['subscribers_count'],
        'spider_time': time.time(),
        'update_time': result['updated_at'],
        'create_time': result['created_at']
    }
    if token:
        token.update(insert_data)
        collection.save(token)
    else:
        collection.insert(insert_data)
def get_google_trend(key, token_id):
    """Download the Google Trends timeline for *key* and store it.

    :param key: search keyword; also stored as ``token_name`` on insert.
    :param token_id: id used to look up / create the trends document.
    :return: True after the document was updated or inserted.
    """
    # socks.set_default_proxy(socks.SOCKS5, "127.0.0.1", 1086)
    # temp_socket = socket.socket
    # socket.socket = socks.socksocket
    token, search_time = get_google_token(key)
    headers = {
        'host': 'trends.google.com',
        # Header names fixed: they were misspelled 'User_Agent' and
        # 'Referfer', so neither value was sent under its intended name.
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
        'Referer': ('https://trends.google.com/trends/explore?q=' + key).encode('utf-8'),
        'x-client-data': 'CJa2yQEIo7bJAQjBtskBCKmdygEIqKPKAQ=='
    }
    request_url = 'https://trends.google.com/trends/api/widgetdata/multiline?hl=zh-CN&tz=-480&req=%7B%22time%22:%22{}%22,%22resolution%22:%22DAY%22,%22locale%22:%22zh-CN%22,%22comparisonItem%22:%5B%7B%22geo%22:%7B%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22{}%22%7D%5D%7D%7D%5D,%22requestOptions%22:%7B%22property%22:%22%22,%22backend%22:%22IZG%22,%22category%22:0%7D%7D&token={}&tz=-480'.format(
        search_time, key, token)

    # The first 5 characters of the body are dropped before JSON parsing
    # (presumably a non-JSON prefix added by the endpoint -- confirm).
    raw_body = requests.get(request_url, headers=headers).text[5:]
    data = json.loads(raw_body)['default']['timelineData']
    # socket.socket = temp_socket

    collection = Mongo().google_trends
    db_result = collection.find_one({
        'token_id': token_id
    })
    if db_result:
        db_result.update({
            'trends': data
        })
        collection.save(db_result)
        return True

    collection.insert({
        'token_id': token_id,
        'token_name': key,
        'trends': data
    })
    return True
def setUp(self):
    """Create a Mongo client for the local test database and seed it.

    The client is built from a localhost configuration and one
    ``insert`` call is made with two fixture dictionaries.
    """
    connection_settings = {
        "hostname": "localhost",
        "port": 27017,
        "username": "******",
        "password": "******",
        "database": "monitor",
    }
    self.client = Mongo(connection_settings)

    html_fixture = {
        "html": "<html><body><h1>TEST</h1></body></html>",
        "id": 1
    }
    diff_fixture = {
        "id": 1,
        "diff": "html > body > h1",
        "count": 1
    }
    self.client.insert(html_fixture, diff_fixture)
def statistic_token_address(token_name):
    """Compute the current address count and its change for a token.

    The newest record is compared with the oldest record from the last
    86400 seconds. When that difference is zero, the newest record that is
    at least 86400 seconds older is used as the baseline instead, if one
    exists.

    :param token_name: token whose address records are examined.
    :return: ``(False, False, False)`` when the token has no records,
        otherwise ``(True, current_address, increase)``.
    """
    collection = Mongo().token_address

    newest = list(collection.find({
        'token_name': token_name
    }).sort('time', pymongo.DESCENDING).limit(1))
    if not newest:
        return False, False, False
    current_info = newest[0]
    day_ago = current_info['time'] - 86400

    # Always matches at least the current record itself.
    last_info = collection.find({
        'time': {'$gt': day_ago},
        'token_name': token_name
    }).sort('time', pymongo.ASCENDING).limit(1)[0]

    if current_info['address'] - last_info['address'] == 0:
        older = list(collection.find({
            'time': {'$lte': day_ago},
            'token_name': token_name
        }).sort('time', pymongo.DESCENDING).limit(1))
        # A token with less than a day of history has no older record;
        # indexing [0] here used to raise IndexError. Keep the in-window
        # baseline (increase 0) in that case.
        if older:
            last_info = older[0]

    increase = current_info['address'] - last_info['address']
    return True, current_info['address'], increase
def send_test_token_info():
    """POST stored token statistics for every test token.

    Tokens without a stored record are skipped. Missing fields default to
    0. Sending is best effort: a failure for one token is reported and the
    loop continues with the next one.
    """
    collection = Mongo().token
    for token in get_test_tokens():
        token_name = token['ticker'].lower()
        db_result = collection.find_one({
            'token_name': token_name
        })
        if not db_result:
            continue

        data = {
            'token_id': token['token_id'],
            'transaction': db_result.get('transaction', 0),
            'holders': db_result.get('address', 0),
            'holders_increase': db_result.get('address_increase', 0)
        }
        try:
            requests.post('http://47.52.103.240:18189' + conf['sync']['token_info'], data)
        except Exception as exc:
            # Still best effort, but no longer silent, and no longer
            # swallowing SystemExit/KeyboardInterrupt as the bare except did.
            print('send test token info failed', token_name, exc)
def sync_test_token_github():
    """POST stored GitHub statistics of every test token.

    For each test token with a GitHub record, the payload is printed,
    posted to the hard-coded host plus ``conf['sync']['git_update']``, and
    the JSON response is printed.
    """
    tokens = get_test_tokens()
    collection = Mongo().github
    for token in tokens:
        record = collection.find_one({
            'token_name': token['ticker'].lower(),
        })
        if not record:
            continue

        payload = {
            "token_id": token['token_id'],
            'url': record['github_url'],
            'star': record['star'],
            'fork': record['fork'],
            'user_count': record['watch'],
            'code_hot': record['star']
        }
        print('send test environment github')
        print(payload)
        response = requests.post('http://47.52.103.240:18189' + conf['sync']['git_update'], data=payload)
        print(response.json())
def get_erc20():
    """Record holder counts of tracked ERC20 tokens from etherscan.

    For each listing row whose (lower-cased) last word is in ``key_words``,
    the holder count from ``get_erc20_holders`` is inserted into the
    token_address collection with the current timestamp.
    """
    collection = Mongo().token_address
    page_number = 1
    # Take the top 150.
    # NOTE(review): the page counter is never applied to the URL, so the
    # same listing is fetched (and inserted) on each of the three passes --
    # confirm whether pagination was intended.
    while page_number <= 3:
        listing = PyQuery(url='https://etherscan.io/tokens')
        for row in listing('tbody')('tr').items():
            name_words = re.findall(r'\w+', row('h5')('a').text())
            token_name = name_words[-1].lower()
            balances_url = 'https://etherscan.io' + row('h5')('a').attr('href') + '#balances'
            if token_name not in key_words:
                continue
            holders = get_erc20_holders(balances_url)
            collection.insert({
                'token_name': token_name,
                'address': holders,
                'time': int(time.time())
            })
        page_number += 1
def get_eth_holders():
    """Store recent rows of the etherscan address chart as ETH address data.

    On an HTTP 200 response the CSV text is split on ``\\r\\n`` and the
    slice ``[-5:-1]`` is processed. After removing double quotes, column 1
    is read as the time and column 2 as the address count; rows whose time
    is not yet stored for ``eth`` are inserted.
    """
    collection = Mongo().token_address
    response = requests.get('https://etherscan.io/chart/address?output=csv')
    if response.status_code != 200:
        return

    for line in response.text.split('\r\n')[-5:-1]:
        columns = line.replace('"', '').split(',')
        address_time = int(columns[1])
        address = int(columns[2])

        existing = collection.find_one({
            'token_name': 'eth',
            'time': address_time
        })
        if existing:
            continue
        collection.insert({
            'token_name': 'eth',
            'time': address_time,
            'address': address
        })
def cryptopanic_spider():
    """Store new cryptopanic items together with their translations.

    Returns False when ``get_cryptopanic()`` yields nothing. Items whose
    ``pk`` is already stored for ``cryptopanic`` are skipped. Title and
    html2text-converted body are translated with ``google_translate_list``;
    currency codes, when present, become the item's keywords.
    """
    collection = Mongo().news
    items = get_cryptopanic()
    if not items:
        return False

    for item in items:
        source_id = item['pk']
        stored = collection.find({
            'spider_from': 'cryptopanic',
            'source_id': source_id
        }).count()
        if stored > 0:
            continue

        title = item.get('title')
        content = html2text(item.get('body'))
        title_cn, content_cn = google_translate_list([title, content])

        document = {
            'type': item['kind'],
            'created_at': int(time.time()),
            'author': item.get('domain'),
            'spider_from': 'cryptopanic',
            'source': item['source']['domain'],
            'source_id': source_id,
            'title': title,
            'content': content,
            'url': item.get('url'),
            'images': item.get('image'),
            'has_keywords': 0,
            'has_send': 0,
            'repeat': -1,
            'has_translated': 1,
            'translated_title': title_cn,
            'translated_text': content_cn
        }

        currencies = item.get('currencies')
        if currencies:
            document['keywords'] = [currency['code'] for currency in currencies]
            document['has_keywords'] = 1
        collection.insert(document)
def start_spider():
    """Fetch the jinse live list and insert items not yet stored.

    Items whose id already exists for ``spider_from == 'jinse'`` are
    skipped. A title wrapped in 【…】 is split off the content; when that
    fails the title is left empty.

    :return: True once the list has been processed.
    """
    collection = Mongo().news
    response = requests.get(
        'https://api.jinse.com/v4/live/list?limit=20&reading=false')

    for day in response.json()['list']:
        for live in day['lives']:
            source_id = live['id']
            content = live['content']

            # Check whether the record already exists.
            already_stored = collection.find({
                'spider_from': 'jinse',
                'source_id': source_id
            }).count()
            if already_stored > 0:
                continue

            try:
                title_start = content.index('【')
                title_end = content.index('】')
                title = content[title_start + 1:title_end]
                content = content[title_end + 1:]
            except Exception:
                title = ''

            collection.insert({
                'type': 'news',
                'created_at': int(time.time()),
                'author': "金色快讯",
                'spider_from': 'jinse',
                'source': 'jinse',
                'source_id': source_id,
                'title': title,
                'content': content,
                'url': 'http://www.jinse.com/lives/' + str(source_id) + '.htm',
                'images': [],
                'keywords': [],
                'has_send': 0
            })
    return True
def setUp(self):
    """Create the Series model and two populated Film fixtures.

    ``self.a_film`` and ``self.other_film`` are built in that order, each
    receiving id, description, director, name, seasons and year.
    """
    self.series = Series(Mongo('movies'))

    fixtures = (
        ('a_film', (
            ('id', 1),
            ('description', "description"),
            ('director', "director"),
            ('name', "name"),
            ('seasons', "season"),
            ('year', "2016"),
        )),
        ('other_film', (
            ('id', 2),
            ('description', "description_other"),
            ('director', "director_other"),
            ('name', "name_other"),
            ('seasons', "season_other"),
            ('year', "2017"),
        )),
    )
    for attribute, fields in fixtures:
        film = Film()
        setattr(self, attribute, film)
        for field, value in fields:
            setattr(film, field, value)
def get_user_info(token_name, username, token_id):
    """Fetch a Twitter user's profile and upsert it for a token.

    The profile JSON from ``api.get_user`` is tagged with token_name,
    user_name and token_id, then merged into the existing document for
    that ``token_id``/``user_name`` or inserted as a new one. A
    ``TweepError`` is ignored.
    """
    try:
        collection = Mongo().twitter
        profile = api.get_user(screen_name=username)._json
        profile['token_name'] = token_name
        profile['user_name'] = username
        profile['token_id'] = token_id

        existing = collection.find_one({
            "token_id": token_id,
            "user_name": username
        })
        if not existing:
            collection.insert(profile)
        else:
            existing.update(profile)
            collection.save(existing)
    except TweepError:
        pass
def get_transaction():
    """Update token transaction counts from blocktivity.info.

    For each ``.font_size_row`` row, the link text of the third cell is the
    (lower-cased) token name and the digits of the fourth cell form the
    transaction count, which is upserted into the token collection.
    Afterwards ``get_erc_transaction()`` is run.
    """
    collection = Mongo().token
    dom = PyQuery(url='http://www.blocktivity.info/')

    for row in dom('.font_size_row').items():
        token_name = row('td').eq(2)('a').text().lower()
        raw_count = row('td').eq(3).text()
        # Keep digits only (drops separators and other characters).
        transaction = int(''.join(filter(str.isdigit, raw_count)))

        existing = collection.find_one({'token_name': token_name})
        if not existing:
            collection.insert({
                'token_name': token_name,
                'transaction': transaction
            })
        else:
            existing.update({
                'transaction': transaction
            })
            collection.save(existing)

    get_erc_transaction()
def statistic_tokens_address():
    """Store address count and increase for every token.

    Uses ``statistic_token_address`` per token; when it reports failure,
    both values are stored as 0. The values are upserted into the token
    collection under ``address`` and ``address_increase``.
    """
    collection = Mongo().token
    for token in get_tokens():
        token_name = token['ticker'].lower()

        ok, address, increase = statistic_token_address(token_name)
        if not ok:
            address, increase = 0, 0

        existing = collection.find_one({'token_name': token_name})
        if not existing:
            collection.insert({
                'token_name': token_name,
                'address': address,
                'address_increase': increase
            })
            continue
        existing.update({
            'address': address,
            'address_increase': increase
        })
        collection.save(existing)
def setUp(self):
    """Create empty movie data dicts and the Movies model under test."""
    # Two separate dict objects, one per fixture.
    self.a_movie_data, self.other_movie_data = {}, {}

    storage = Mongo('movies')
    self.movie_model = Movies(3600, storage)
# -*- coding: utf-8 -*- from model.mongo import Mongo from variables.general import config # ============================================================================== # GLOBAL # ============================================================================== mongo_host = config.get_env("MONGO_HOST") mongo_port = 27017 mongo_user = config.get_env("MONGO_INITDB_ROOT_USERNAME") mongo_password = config.get_env("MONGO_INITDB_ROOT_PASSWORD") mongo = Mongo(mongo_host, mongo_user, mongo_password, port=mongo_port)
# coding=utf-8 from flask import Flask import os from blueprints.movies import movies from blueprints.series import series from model.movies import Movies from model.series import Series from model.mongo import Mongo app = Flask(__name__) app.cachetime = 3600 # 1 óra app.movies = Movies(app.cachetime, Mongo('movies')) app.series = Series(Mongo('series')) @app.route('/') def hello_world(): return """ <!DOCTYPE html> <html> <head> <title>SZTE - PIANK - @Numichi</title> </head> <body> <h1>SZTE - PIANK - @Numichi</h1> <h2>Homework list:</h2> <ul> <li>1. Homework: Git / GitHub</li> <li>2. Homework: Heroku -> Hello World</li>
# -*- coding: utf-8 -*- """ @author: maozhufeng @file: send_news_to_test @time: 2018/6/16 下午12:41 """ import pymongo import requests from common import conf from model.mongo import Mongo collection = Mongo().news news_to_send = collection.find({ 'has_keywords': 1, 'repeat': -1, 'title': { '$ne': '' }, 'content': { '$ne': '' } }) news_to_send = list(news_to_send) news_to_send = [ new for new in news_to_send if new['title'] is not None and new['content'] is not None ] all_count = len(news_to_send) send_count = 0 start = 0