コード例 #1
0
def get_single_article(url):
    """Download one Sina article page and store it in the ``news`` collection.

    The body text is taken from ``#artibody`` (after ``#left_hzh_ad`` has been
    removed from it), the headline from ``.main-title`` and the source label
    from ``.source``.

    :param url: address of the article page; also used as de-duplication key
        together with ``spider_from == 'sina'``.
    :return: ``False`` when the page has no title, ``True`` when the article
        is already stored or has just been inserted.
    """
    page = PyQuery(url=url, encoding="utf-8")
    body = page('#artibody')
    body.remove('#left_hzh_ad')
    content = body.text()

    title = page('.main-title').text()
    if not title:
        return False
    source = page('.source').text()

    collection = Mongo().news
    already_stored = collection.find_one({'spider_from': 'sina', 'url': url})
    if not already_stored:
        collection.insert({
            'type': 'articles',
            'created_at': int(time.time()),
            'author': '',
            'spider_from': 'sina',
            'source': source,
            'source_id': -1,
            'title': title,
            'content': content,
            'url': url,
            'images': [],
            'keywords': [],
            'has_send': 0
        })
    return True
コード例 #2
0
 def get_database(config, setting):
     """Build the storage backend selected by *config*.

     ``"file"`` selects ``File``; ``"mongo"`` and every other value select
     ``Mongo``.  *setting* is passed unchanged to the chosen constructor.
     """
     backend = File if config == "file" else Mongo
     return backend(setting)
コード例 #3
0
ファイル: repeat.py プロジェクト: Fern9/newsSpider
def find_repeat_news():
    """Flag news created within the last hour as duplicated or not.

    ``repeat`` is set to 1 for a duplicate and to -1 for a non-duplicate.
    Documents that already carry a truthy ``repeat`` value are left as they
    are.  Two temporary keys (``keywords_temp`` and ``state``) are attached
    while comparing and removed again before every document is saved.

    :return: None
    """
    collection = Mongo().news
    recent = list(collection.find({'created_at': {'$gt': time.time() - 3600}}))

    # Keywords are computed once per document, before any comparison.
    for item in recent:
        item['keywords_temp'] = get_keywords(item['content'])

    for candidate in recent:
        if candidate.get('repeat'):
            candidate['state'] = 1
            continue
        # Stops at the first similar document other than the candidate itself.
        duplicated = any(
            other['_id'] != candidate['_id'] and is_sim(candidate, other)
            for other in recent
        )
        if duplicated:
            candidate['repeat'] = 1
            candidate['state'] = 1
        elif candidate.get('state') != 1:
            candidate['state'] = 1
            candidate['repeat'] = -1

    for item in recent:
        del item['state']
        del item['keywords_temp']
        collection.save(item)
コード例 #4
0
def get_github_data(token_name):
    """Return the most recently crawled GitHub record of *token_name*.

    Records are ordered by ``spider_time`` descending and the first one is
    taken; its ``_id`` is dropped before it is wrapped with
    ``return_success``.  Any exception raised while querying (including an
    empty result) yields ``return_error()`` instead.
    """
    collection = Mongo().github
    try:
        newest_first = (
            collection
            .find({'token_name': token_name})
            .sort('spider_time', pymongo.DESCENDING)
            .limit(1)
        )
        record = newest_first[0]
        del record['_id']
        return return_success(data=record)
    except Exception:
        return return_error()
コード例 #5
0
def start_spider(self):
    """Pull the bishijie news-flash feed and store unseen entries in ``news``.

    The feed URL comes from ``conf['news']['bishijie']``.  A non-zero
    ``error`` field in the reply calls ``self.retry()``; any exception raised
    while processing calls ``self.retry(e)``.  For every date bucket the
    entries under ``buttom`` are inserted unless their ``newsflash_id`` is
    already stored for ``spider_from == 'bishijie'``.
    """
    try:
        payload = requests.get(conf['news']['bishijie']).json()
        collection = Mongo().news
        if payload['error'] != 0:
            self.retry()
        by_date = payload['data']
        for date in by_date:
            flashes = by_date[date]['buttom']
            stored = collection.find({
                'spider_from': 'bishijie',
                'source_id': {
                    '$in': [flash['newsflash_id'] for flash in flashes]
                }
            })
            known_ids = [row['source_id'] for row in stored]
            for flash in flashes:
                if flash['newsflash_id'] in known_ids:
                    continue
                content = flash['content']
                # The headline is the text between the first '【' and '】';
                # without both markers the title stays empty.
                try:
                    open_at = content.index('【')
                    close_at = content.index('】')
                    title = content[open_at + 1:close_at]
                    content = content[close_at + 1:]
                except Exception:
                    title = ''
                detail_url = ('http://www.bishijie.com/home/newsflashpc/detail?id=' +
                              str(flash['newsflash_id']))
                collection.insert({
                    'type': 'news',
                    'created_at': flash['issue_time'],
                    'author': flash['source'],
                    'spider_from': 'bishijie',
                    'source': 'bishijie',
                    'source_id': flash['newsflash_id'],
                    'title': title,
                    'content': content,
                    'url': detail_url,
                    'images': [],
                    'keywords': [],
                    'has_send': 0
                })
    except Exception as e:
        self.retry(e)
コード例 #6
0
ファイル: sync_data.py プロジェクト: Fern9/newsSpider
def news_send_finish(self, news):
    """Mark every document in *news* as sent and save it to ``news``.

    ``has_send`` is set to 1 on each item before it is saved.  Any failure
    along the way calls ``self.retry()``.
    """
    try:
        collection = Mongo().news
        for item in news:
            item['has_send'] = 1
            collection.save(item)
    except:
        self.retry()
コード例 #7
0
ファイル: parse.py プロジェクト: Fern9/newsSpider
def deal_content():
    """Extract keywords for news documents not yet flagged ``has_keywords``.

    Documents with an empty title or empty content are skipped.  For the
    others the keywords of ``title + ';' + content`` are stored under
    ``keywords`` and ``has_keywords`` is set to 1.
    """
    collection = Mongo().news
    for doc in collection.find({'has_keywords': {'$ne': 1}}):
        if doc['title'] and doc['content']:
            doc['keywords'] = get_keywords(doc['title'] + ';' + doc['content'])
            doc['has_keywords'] = 1
            collection.save(doc)
コード例 #8
0
ファイル: sync_data.py プロジェクト: Fern9/newsSpider
def sync_google_trends():
    """Queue a ``send_google_trend`` task for every stored trend with data.

    Documents without a non-empty ``trends`` list are skipped.  The submitted
    search number is the first element of the last point's ``value``.
    """
    for trend in Mongo().google_trends.find({}):
        points = trend.get('trends')
        if not points:
            continue
        send_google_trend.delay({
            'token_id': trend['token_id'],
            'search_number': points[-1]['value'][0]
        })
コード例 #9
0
def start_spider():
    """Scrape the wallstreetcn blockchain live feed into the ``news`` collection.

    The live items are read from the second ``.wscn-tab-pane`` element of the
    page.  For each ``.live-item`` the paragraph text is split into a title
    (the text between the first ``【`` and ``】``) and the remaining content;
    the HTML of the "content-more" paragraph, if any, is appended to the
    content.  Image URLs found in the item are stored with it.  Items whose
    content is already stored for ``spider_from == 'wallstreetcn'`` are
    skipped.

    :return: ``True`` once the feed has been processed, ``False`` when the
        page does not contain the expected second tab pane.
    """
    collection = Mongo().news
    dom = PyQuery(url='https://wallstreetcn.com/live/blockchain')
    panes = list(dom(".wscn-tab-pane").items())
    if len(panes) < 2:
        # Unexpected page layout: report failure instead of leaking a bare
        # StopIteration from the iterator.
        return False
    lives = panes[1]('.live-item')
    for li in lives.items():
        content = li('.live-item__main__content')('p').text()
        if not content:
            continue
        content_more = li('.live-item__main__content-more')('p').html()
        try:
            front_title_index = content.index('【')
            tail_title_index = content.index('】')
            title = content[front_title_index + 1: tail_title_index]
            content = content[tail_title_index + 1:]
        except ValueError:
            # No bracketed headline in this item.
            title = ''
        if content_more:
            content += content_more

        # Keep only images that actually carry a source address.
        image_nodes = li('.live-item__main__images')('.zoomer__img')
        images = [
            src
            for src in (node.attr('src') for node in image_nodes.items())
            if src
        ]

        # Check whether the record already exists.
        db_count = collection.find({
            'spider_from': 'wallstreetcn',
            'content': content
        }).count()
        if db_count > 0:
            continue

        collection.insert({
            'type': 'news',
            'created_at': int(time.time()),
            'author': "华尔街见闻",
            'spider_from': 'wallstreetcn',
            'source': 'wallstreetcn',
            'source_id': -1,
            'title': title,
            'content': content,
            'url': '',
            # The scraped list was previously built and then discarded.
            'images': images,
            'keywords': [],
            'has_send': 0
        })
    return True
コード例 #10
0
ファイル: sync_data.py プロジェクト: Fern9/newsSpider
def send_single_token_github(token_id, token_name):
    """POST the stored GitHub statistics of *token_name* to the sync service.

    Nothing happens when no ``github`` record exists for the token.
    Otherwise the record is sent to
    ``conf['sync']['host'] + conf['sync']['git_update']`` and the JSON reply
    is printed.
    """
    record = Mongo().github.find_one({
        'token_name': token_name,
    })
    if not record:
        return
    payload = {
        "token_id": token_id,
        'url': record['github_url'],
        'star': record['star'],
        'fork': record['fork'],
        'user_count': record['watch'],
        'code_hot': record['star']
    }
    endpoint = conf['sync']['host'] + conf['sync']['git_update']
    response = requests.post(endpoint, data=payload)
    print(response.json())
コード例 #11
0
def get_btc_holders():
    """Store recent blockchain.info wallet-user counts as ``btc`` samples.

    Only a 200 response is processed.  Of the returned ``values`` the slice
    ``[-5:-1]`` is examined; a point is inserted into ``token_address`` when
    no ``btc`` sample with the same ``time`` exists yet.
    """
    collection = Mongo().token_address
    response = requests.get('https://api.blockchain.info/charts/my-wallet-n-users?format=json')
    if response.status_code != 200:
        return
    for point in response.json()['values'][-5:-1]:
        sample_key = {
            'token_name': 'btc',
            'time': point['x']
        }
        if collection.find_one(sample_key):
            continue
        collection.insert(dict(sample_key, address=point['y']))
コード例 #12
0
ファイル: transaction.py プロジェクト: Fern9/newsSpider
def get_erc_transaction():
    """Refresh the stored transaction count of tracked ERC-20 tokens.

    Walks the first three pages of the etherscan token list (the top 150
    tokens).  For every row whose token name is listed in ``key_words`` the
    count returned by ``get_single_erc_transaction`` is written to the
    ``token`` collection, updating the existing document or inserting a new
    one.  A failure for one token prints its contract address and moves on.
    """
    collection = Mongo().token
    # Take the top 150 tokens: three list pages.
    for page in range(1, 4):
        # The page number was previously never sent, so the same page was
        # fetched three times.
        # NOTE(review): assumes etherscan paginates this list through the
        # ``p`` query parameter — confirm against the live site.
        list_page = PyQuery(url='https://etherscan.io/tokens?p={}'.format(page))
        for token in list_page('tbody')('tr').items():
            link = token('h5')('a')
            words = re.findall(r'\w+', link.text())
            href = link.attr('href')
            if not words or not href:
                # Row without a token link: nothing to parse.
                continue
            token_name = words[-1].lower()
            if token_name not in key_words:
                continue
            contract_address = href.split('/')[-1]
            try:
                transaction = get_single_erc_transaction(contract_address)
                db_result = collection.find_one({'token_name': token_name})
                if db_result:
                    db_result.update({
                        'transaction': transaction
                    })
                    collection.save(db_result)
                else:
                    collection.insert({
                        'token_name': token_name,
                        'transaction': transaction
                    })
            except Exception:
                # Best effort per token; the address identifies the failure.
                print(contract_address)
コード例 #13
0
 def get_data(token_name, url, api_url):
     """Fetch repository statistics from *api_url* and upsert them into ``github``.

     The reply is ignored when it has no ``forks_count`` field.  Otherwise the
     star, fork and watcher counts plus the timestamps are written to the
     document matching *token_name* and *url*, or inserted as a new document.

     :param token_name: token the repository belongs to.
     :param url: repository address stored as ``github_url``.
     :param api_url: endpoint that is requested for the statistics.
     """
     collection = Mongo().github
     # NOTE(review): the client id/secret are hard-coded in source — consider
     # moving them to configuration.
     request_url = '{}?client_id={}&client_secret={}'.format(
         api_url,
         'dcc3734066251548c999',
         '89d90ad41f32b18d2ed689cb21875b75e88a2d82',
     )
     repo = requests.get(request_url).json()
     if 'forks_count' not in repo:
         # TODO record error result
         return
     existing = collection.find_one({
         'token_name': token_name,
         'github_url': url
     })
     record = {
         'token_name': token_name,
         'github_url': url,
         'star': repo['stargazers_count'],
         'fork': repo['forks_count'],
         'watch': repo['subscribers_count'],
         'spider_time': time.time(),
         'update_time': repo['updated_at'],
         'create_time': repo['created_at']
     }
     if not existing:
         collection.insert(record)
         return
     existing.update(record)
     collection.save(existing)
コード例 #14
0
def get_google_trend(key, token_id):
    """Download the Google Trends timeline for *key* and store it.

    A widget token and time range are obtained from ``get_google_token``; the
    multiline widget endpoint is then requested and its ``timelineData`` is
    written to the ``google_trends`` document of *token_id* (updated when it
    exists, inserted otherwise).

    :param key: search keyword; also stored as ``token_name`` on insert.
    :param token_id: identifier of the token the trend belongs to.
    :return: always ``True``.
    """
    token, search_time = get_google_token(key)
    headers = {
        'host': 'trends.google.com',
        # Header names were misspelled ('User_Agent', 'Referfer') and
        # therefore never recognised as the standard fields.
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
        # Sent as UTF-8 bytes so that non-ASCII keywords can be transmitted.
        'Referer': ('https://trends.google.com/trends/explore?q=' + key).encode('utf-8'),
        'x-client-data': 'CJa2yQEIo7bJAQjBtskBCKmdygEIqKPKAQ=='
    }
    request_url = 'https://trends.google.com/trends/api/widgetdata/multiline?hl=zh-CN&tz=-480&req=%7B%22time%22:%22{}%22,%22resolution%22:%22DAY%22,%22locale%22:%22zh-CN%22,%22comparisonItem%22:%5B%7B%22geo%22:%7B%7D,%22complexKeywordsRestriction%22:%7B%22keyword%22:%5B%7B%22type%22:%22BROAD%22,%22value%22:%22{}%22%7D%5D%7D%7D%5D,%22requestOptions%22:%7B%22property%22:%22%22,%22backend%22:%22IZG%22,%22category%22:0%7D%7D&token={}&tz=-480'.format(
        search_time, key, token)
    # The first five characters of the body are dropped before parsing
    # (presumably a non-JSON guard prefix — confirm against a live response).
    result = requests.get(request_url, headers=headers).text[5:]
    data = json.loads(result)['default']['timelineData']

    collection = Mongo().google_trends
    db_result = collection.find_one({
        'token_id': token_id
    })
    if db_result:
        db_result.update({
            'trends': data
        })
        collection.save(db_result)
    else:
        collection.insert({
            'token_id': token_id,
            'token_name': key,
            'trends': data
        })
    return True
コード例 #15
0
 def setUp(self):
     """Create a ``Mongo`` client for the local ``monitor`` database and insert one fixture pair."""
     connection_settings = dict(
         hostname="localhost",
         port=27017,
         username="******",
         password="******",
         database="monitor",
     )
     page_record = {
         "html": "<html><body><h1>TEST</h1></body></html>",
         "id": 1
     }
     diff_record = {
         "id": 1,
         "diff": "html > body > h1",
         "count": 1
     }
     self.client = Mongo(connection_settings)
     self.client.insert(page_record, diff_record)
コード例 #16
0
def statistic_token_address(token_name):
    """Return the latest holder count of *token_name* and its recent change.

    The newest ``token_address`` sample is compared with the oldest sample
    taken less than 86400 seconds before it.  When those two show no
    difference, the newest sample at least 86400 seconds older is used as the
    baseline instead, if one exists.

    :param token_name: token whose samples are examined.
    :return: ``(False, False, False)`` when the token has no sample at all,
        otherwise ``(True, current_address, increase)``.
    """
    collection = Mongo().token_address
    current_info = collection.find({
        'token_name': token_name
    }).sort('time', pymongo.DESCENDING).limit(1)
    if current_info.count() == 0:
        return False, False, False
    current_info = current_info[0]
    window_start = current_info['time'] - 86400
    # The current sample itself always satisfies this filter, so indexing the
    # first result is safe here.
    last_info = collection.find({
        'time': {'$gt': window_start},
        'token_name': token_name
    }).sort('time', pymongo.ASCENDING).limit(1)[0]
    if current_info['address'] - last_info['address'] == 0:
        try:
            last_info = collection.find({
                'time': {'$lte': window_start},
                'token_name': token_name
            }).sort('time', pymongo.DESCENDING).limit(1)[0]
        except IndexError:
            # No older sample (e.g. a token tracked for less than a day):
            # keep the in-window baseline, i.e. report an increase of 0,
            # instead of crashing.
            pass
    return True, current_info['address'], current_info['address'] - last_info['address']
コード例 #17
0
ファイル: sync_data.py プロジェクト: Fern9/newsSpider
def send_test_token_info():
    """Push stored transaction/holder figures of the test tokens to the test host.

    For every token returned by ``get_test_tokens`` that has a ``token``
    document, the transaction count, holder count and holder increase
    (each defaulting to 0) are POSTed to the test environment.  Sending is
    best effort: a failure for one token is reported and the loop continues.
    """
    collection = Mongo().token
    for token in get_test_tokens():
        token_name = token['ticker'].lower()
        db_result = collection.find_one({
            'token_name': token_name
        })
        if not db_result:
            continue
        data = {
            'token_id': token['token_id'],
            'transaction': db_result.get('transaction', 0),
            'holders': db_result.get('address', 0),
            'holders_increase': db_result.get('address_increase', 0)
        }
        try:
            requests.post('http://47.52.103.240:18189' + conf['sync']['token_info'], data)
        except Exception as exc:
            # Previously a bare ``except: pass`` hid every failure (and also
            # swallowed KeyboardInterrupt/SystemExit).
            print('send test token info failed for {}: {}'.format(token_name, exc))
コード例 #18
0
ファイル: sync_data.py プロジェクト: Fern9/newsSpider
def sync_test_token_github():
    """POST the stored GitHub statistics of every test token to the test host.

    Tokens without a ``github`` record are skipped.  The payload and the JSON
    reply of each request are printed.
    """
    tokens = get_test_tokens()
    collection = Mongo().github
    for token in tokens:
        record = collection.find_one({
            'token_name': token['ticker'].lower(),
        })
        if not record:
            continue
        payload = {
            "token_id": token['token_id'],
            'url': record['github_url'],
            'star': record['star'],
            'fork': record['fork'],
            'user_count': record['watch'],
            'code_hot': record['star']
        }
        print('send test environment github')
        print(payload)
        endpoint = 'http://47.52.103.240:18189' + conf['sync']['git_update']
        response = requests.post(endpoint, data=payload)
        print(response.json())
コード例 #19
0
def get_erc20():
    """Record the current holder count of tracked ERC-20 tokens.

    Walks the first three pages of the etherscan token list (the top 150
    tokens).  For every row whose token name is listed in ``key_words`` the
    holder count returned by ``get_erc20_holders`` is inserted into
    ``token_address`` together with the current timestamp.
    """
    collection = Mongo().token_address
    # Take the top 150 tokens: three list pages.  The page counter used to be
    # incremented once per table row, so an empty page looped forever and
    # later pages were never requested.
    for page in range(1, 4):
        # NOTE(review): assumes etherscan paginates this list through the
        # ``p`` query parameter — confirm against the live site.
        list_page = PyQuery(url='https://etherscan.io/tokens?p={}'.format(page))
        for token in list_page('tbody')('tr').items():
            link = token('h5')('a')
            words = re.findall(r'\w+', link.text())
            href = link.attr('href')
            if not words or not href:
                # Row without a token link: nothing to parse.
                continue
            token_name = words[-1].lower()
            if token_name in key_words:
                address = get_erc20_holders('https://etherscan.io' + href + '#balances')
                collection.insert({
                    'token_name': token_name,
                    'address': address,
                    'time': int(time.time())
                })
コード例 #20
0
def get_eth_holders():
    """Store recent etherscan address-count rows as ``eth`` samples.

    Only a 200 response is processed.  The CSV text is split on ``\\r\\n`` and
    the slice ``[-5:-1]`` of the lines is examined; the second and third
    column of each line are read as timestamp and address count.  A row is
    inserted into ``token_address`` when no ``eth`` sample with the same
    ``time`` exists yet.
    """
    collection = Mongo().token_address
    response = requests.get('https://etherscan.io/chart/address?output=csv')
    if response.status_code != 200:
        return
    for row in response.text.split('\r\n')[-5:-1]:
        fields = row.replace('"', '').split(',')
        address_time = int(fields[1])
        address = int(fields[2])
        sample_key = {
            'token_name': 'eth',
            'time': address_time
        }
        if collection.find_one(sample_key):
            continue
        collection.insert(dict(sample_key, address=address))
コード例 #21
0
ファイル: news_sprider.py プロジェクト: Fern9/newsSpider
def cryptopanic_spider():
    """Store unseen cryptopanic posts, with a translation, in ``news``.

    Posts come from ``get_cryptopanic``; one is skipped when its ``pk`` is
    already stored for ``spider_from == 'cryptopanic'``.  Title and body
    (converted with ``html2text``) are translated via
    ``google_translate_list``.  When the post lists currencies, their codes
    become the keywords and ``has_keywords`` is set to 1.

    :return: ``False`` when no posts were fetched, otherwise ``None``.
    """
    collection = Mongo().news
    posts = get_cryptopanic()
    if not posts:
        return False
    for post in posts:
        source_id = post['pk']
        stored = collection.find({
            'spider_from': 'cryptopanic',
            'source_id': source_id
        })
        if stored.count() > 0:
            continue
        title = post.get('title')
        content = html2text(post.get('body'))
        title_cn, content_cn = google_translate_list([title, content])
        record = {
            'type': post['kind'],
            'created_at': int(time.time()),
            'author': post.get('domain'),
            'spider_from': 'cryptopanic',
            'source': post['source']['domain'],
            'source_id': source_id,
            'title': title,
            'content': content,
            'url': post.get('url'),
            'images': post.get('image'),
            'has_keywords': 0,
            'has_send': 0,
            'repeat': -1,
            'has_translated': 1,
            'translated_title': title_cn,
            'translated_text': content_cn
        }
        currencies = post.get('currencies')
        if currencies:
            record['keywords'] = [currency['code'] for currency in currencies]
            record['has_keywords'] = 1
        collection.insert(record)
コード例 #22
0
ファイル: sprider.py プロジェクト: Fern9/newsSpider
def start_spider():
    """Pull the jinse live feed and store unseen entries in ``news``.

    Every entry under ``lives`` of each element of the reply's ``list`` is
    inserted unless its ``id`` is already stored for
    ``spider_from == 'jinse'``.  The title is the text between the first
    ``【`` and ``】`` of the content; the rest becomes the content.

    :return: always ``True``.
    """
    collection = Mongo().news
    response = requests.get(
        'https://api.jinse.com/v4/live/list?limit=20&reading=false')
    for day in response.json()['list']:
        for live in day['lives']:
            source_id = live['id']
            content = live['content']

            # Check whether the record already exists.
            stored = collection.find({
                'spider_from': 'jinse',
                'source_id': source_id
            })
            if stored.count() > 0:
                continue

            # Without both bracket markers the title stays empty.
            try:
                open_at = content.index('【')
                close_at = content.index('】')
                title = content[open_at + 1:close_at]
                content = content[close_at + 1:]
            except Exception:
                title = ''

            collection.insert({
                'type': 'news',
                'created_at': int(time.time()),
                'author': "金色快讯",
                'spider_from': 'jinse',
                'source': 'jinse',
                'source_id': source_id,
                'title': title,
                'content': content,
                'url': 'http://www.jinse.com/lives/' + str(source_id) + '.htm',
                'images': [],
                'keywords': [],
                'has_send': 0
            })
    return True
コード例 #23
0
 def setUp(self):
     """Create a ``Series`` model on the ``movies`` store and two sample films."""
     self.series = Series(Mongo('movies'))

     self.a_film = Film()
     for field, value in (
             ('id', 1),
             ('description', "description"),
             ('director', "director"),
             ('name', "name"),
             ('seasons', "season"),
             ('year', "2016"),
     ):
         setattr(self.a_film, field, value)

     self.other_film = Film()
     for field, value in (
             ('id', 2),
             ('description', "description_other"),
             ('director', "director_other"),
             ('name', "name_other"),
             ('seasons', "season_other"),
             ('year', "2017"),
     ):
         setattr(self.other_film, field, value)
コード例 #24
0
ファイル: sprider.py プロジェクト: Fern9/newsSpider
def get_user_info(token_name, username, token_id):
    """Fetch the Twitter profile of *username* and upsert it into ``twitter``.

    The profile JSON is tagged with ``token_name``, ``user_name`` and
    ``token_id`` and written to the document matching *token_id* and
    *username*, or inserted when none exists.  A ``TweepError`` is ignored.
    """
    try:
        collection = Mongo().twitter
        profile = api.get_user(screen_name=username)._json
        profile['token_name'] = token_name
        profile['user_name'] = username
        profile['token_id'] = token_id
        existing = collection.find_one({
            "token_id": token_id,
            "user_name": username
        })
        if not existing:
            collection.insert(profile)
        else:
            existing.update(profile)
            collection.save(existing)
    except TweepError:
        pass
コード例 #25
0
ファイル: transaction.py プロジェクト: Fern9/newsSpider
def get_transaction():
    """Refresh token transaction counts from blocktivity.info, then from etherscan.

    For every ``.font_size_row`` row the token name is read from the link in
    the third cell and the transaction count from the digits of the fourth
    cell.  The count is written to the ``token`` collection (existing
    document updated, otherwise inserted).  Rows whose count cell contains no
    digit are skipped.  Finally ``get_erc_transaction`` is run.
    """
    collection = Mongo().token
    dom = PyQuery(url='http://www.blocktivity.info/')
    for row in dom('.font_size_row').items():
        cells = row('td')
        token_name = cells.eq(2)('a').text().lower()
        digits = ''.join(filter(str.isdigit, cells.eq(3).text()))
        if not digits:
            # int('') raises ValueError and would abort the whole run for a
            # single row without a numeric count.
            continue
        transaction = int(digits)
        db_result = collection.find_one({'token_name': token_name})
        if db_result:
            db_result.update({
                'transaction': transaction
            })
            collection.save(db_result)
        else:
            collection.insert({
                'token_name': token_name,
                'transaction': transaction
            })
    get_erc_transaction()
コード例 #26
0
def statistic_tokens_address():
    """Write holder count and holder increase of every token to ``token``.

    The figures come from ``statistic_token_address``; both are stored as 0
    when it reports failure.  The token's document is updated when it exists
    and inserted otherwise.
    """
    collection = Mongo().token
    for token in get_tokens():
        token_name = token['ticker'].lower()
        found, address, increase = statistic_token_address(token_name)
        if not found:
            address = increase = 0
        stats = {
            'address': address,
            'address_increase': increase
        }
        existing = collection.find_one({'token_name': token_name})
        if existing:
            existing.update(stats)
            collection.save(existing)
        else:
            collection.insert(dict({'token_name': token_name}, **stats))
コード例 #27
0
 def setUp(self):
     """Start with two empty movie payloads and a ``Movies`` model on the ``movies`` store."""
     self.a_movie_data, self.other_movie_data = {}, {}
     self.movie_model = Movies(3600, Mongo('movies'))
コード例 #28
0
# -*- coding: utf-8 -*-

from model.mongo import Mongo
from variables.general import config

# ==============================================================================
# GLOBAL
# ==============================================================================

# Connection settings: host and credentials are read through
# ``config.get_env``; the port is fixed in code.
mongo_host = config.get_env("MONGO_HOST")
mongo_port = 27017
mongo_user = config.get_env("MONGO_INITDB_ROOT_USERNAME")
mongo_password = config.get_env("MONGO_INITDB_ROOT_PASSWORD")

# Module-level ``Mongo`` instance built from the settings above at import time.
mongo = Mongo(mongo_host, mongo_user, mongo_password, port=mongo_port)
コード例 #29
0
# coding=utf-8
from flask import Flask
import os

from blueprints.movies import movies
from blueprints.series import series
from model.movies import Movies
from model.series import Series
from model.mongo import Mongo

# Flask application object; the models are attached to it as attributes.
app = Flask(__name__)

app.cachetime = 3600  # 1 hour
# ``Movies`` receives the cache time and a ``Mongo`` store named 'movies'.
app.movies = Movies(app.cachetime, Mongo('movies'))
# ``Series`` receives a ``Mongo`` store named 'series'.
app.series = Series(Mongo('series'))


@app.route('/')
def hello_world():
    return """
    <!DOCTYPE html>
    <html>
    <head>
        <title>SZTE - PIANK - @Numichi</title>
    </head>
    <body>
        <h1>SZTE - PIANK - @Numichi</h1>
        <h2>Homework list:</h2>
        <ul>
            <li>1. Homework: Git / GitHub</li>
            <li>2. Homework: Heroku -> Hello World</li>
コード例 #30
0
ファイル: send_full_data.py プロジェクト: Fern9/newsSpider
# -*- coding: utf-8 -*-
"""
@author: maozhufeng
@file: send_news_to_test
@time: 2018/6/16 下午12:41
"""
import pymongo
import requests

from common import conf
from model.mongo import Mongo

# Load every news document that has keywords, is flagged as not repeated and
# has a non-empty title and content; documents whose title or content is
# None are dropped as well.
collection = Mongo().news
news_to_send = [
    new
    for new in collection.find({
        'has_keywords': 1,
        'repeat': -1,
        'title': {'$ne': ''},
        'content': {'$ne': ''},
    })
    if not (new['title'] is None or new['content'] is None)
]
# Counters for the sending phase.
all_count = len(news_to_send)
send_count = start = 0