Code Example #1
File: api.py  Project: zlv2s/proxy-pool
def get_many():
    # e.g. http://127.0.0.1:5000/many?count=2
    count = request.args.get('count', 1, type=int)  # number of proxies requested
    proxies = MongoDB().get(count)
    result = [proxy['proxy'] for proxy in proxies]
    return jsonify({'result': result})
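If the Flask app is running locally on port 5000, the endpoint can be exercised with requests; the URL and the response shape shown in the comment are illustrative, not taken from the project:

import requests

# Ask the pool API for two proxies (assumes the Flask app is running locally).
resp = requests.get('http://127.0.0.1:5000/many', params={'count': 2})
print(resp.json())  # e.g. {'result': ['1.2.3.4:8080', '5.6.7.8:3128']}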
Code Example #2
File: crawler.py  Project: zlv2s/proxy-pool
def check():
    '''
    Periodically check the availability of the proxies stored in the database.
    :return:
    '''
    while True:
        m = MongoDB()
        count = m.get_count()
        if count != 0:
            logging.info('Checking availability of proxies in the database >>>>>>>>')
            proxies = m.get(count)
            Validate().valid_many(proxies, 'check')
        time.sleep(10 * 60)  # re-check every 10 minutes
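Because check() never returns, the crawler presumably runs it off the main thread. A minimal sketch of one way to do that, assuming a daemon thread is acceptable here:

import threading

# Run the periodic availability check in the background so the
# crawler's main loop is not blocked by the infinite loop above.
threading.Thread(target=check, daemon=True).start()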
Code Example #3
class MongoTwitterConsumer:
    def __init__(self, collection_name, delay=5):
        self.db = MongoDB()
        self.kinesis = boto3.client("kinesis")
        self.collection_name = collection_name
        self.delay = delay
        self.shard_id = "shardId-000000000000"  # only one shard!

    def run(self, stream_name):
        print(
            f'Starting MongoDB consumer, db: {DB_NAME}, collection: {self.collection_name}'
        )

        # Connect to db. This must happen inside process otherwise there can be a problem with
        # locking: http://api.mongodb.com/python/current/faq.html#multiprocessing.
        self.db.connect(DB_NAME)

        pre_shard_it = self.kinesis.get_shard_iterator(
            StreamName=stream_name,
            ShardId=self.shard_id,
            ShardIteratorType="LATEST")
        shard_it = pre_shard_it["ShardIterator"]

        while True:
            out = self.kinesis.get_records(ShardIterator=shard_it, Limit=1)
            shard_it = out["NextShardIterator"]
            if len(out['Records']) > 0:
                for rec in out['Records']:
                    print('Processing: ', rec['SequenceNumber'])
                    bytes_data = rec['Data']
                    json_obj = json.loads(bytes_data.decode('utf8'))
                    json_obj['tweet_id'] = json_obj['id']
                    del json_obj['id']
                    self.db.add_document(self.collection_name, json_obj)
            time.sleep(self.delay)

    def start(self, stream_name):
        print(stream_name, flush=True)
        self.process = Process(target=self.run, args=(stream_name,))
        self.process.start()

    def stop(self):
        print('Stopping consumer process.')
        self.process.terminate()
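For context, records reach the consumer above via a matching Kinesis producer. A minimal sketch using boto3's put_record; the stream name and the tweet payload are illustrative assumptions, not the project's code:

import json
import boto3

kinesis = boto3.client("kinesis")

# Push one tweet-like record onto the stream the consumer reads from.
# 'twitter-stream' and the payload fields are hypothetical examples.
payload = {'id': 123456789, 'text': 'hello world'}
kinesis.put_record(
    StreamName='twitter-stream',
    Data=json.dumps(payload).encode('utf8'),
    PartitionKey='tweets')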
Code Example #4
    def valid_one(self, proxy, method, url='https://baidu.com'):

        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X x.y; rv:42.0) Gecko/20100101 Firefox/42.0'
        }
        proxies = {'http': proxy['proxy'], 'https': proxy['proxy']}

        try:
            start_time = time.time()
            resp = requests.get(url,
                                headers=headers,
                                proxies=proxies,
                                timeout=8)
            delay = round(time.time() - start_time, 2)  # response time, rounded to two decimal places
            if resp.status_code == 200:
                proxy['delay'] = delay
                if method == 'insert':
                    MongoDB().insert(proxy)
                elif method == 'check':
                    MongoDB().update({'proxy': proxy['proxy']},
                                     {'delay': proxy['delay']})

            else:
                logging.info(f'Invalid proxy: {proxy}')
                if method == 'check':
                    MongoDB().delete({'proxy': proxy['proxy']})
        except (ProxyError, ConnectTimeout):
            logging.info(f'Invalid proxy: {proxy}')
            if method == 'check':
                MongoDB().delete({'proxy': proxy['proxy']})

        except Exception:
            # Swallow any other error; the proxy is simply skipped this round.
            pass
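valid_many is called in Code Example #2 but not shown in this listing. A plausible sketch, assuming it simply fans valid_one out over a thread pool (the worker count is an arbitrary choice):

from concurrent.futures import ThreadPoolExecutor

def valid_many(self, proxies, method):
    # Validate proxies concurrently; 20 workers is an arbitrary choice.
    with ThreadPoolExecutor(max_workers=20) as pool:
        for proxy in proxies:
            pool.submit(self.valid_one, proxy, method)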
Code Example #5
File: test_us16.py  Project: BenjiTheC/SSW555
    def test_same_lastname(self):
        """ Positive test, males' last names are the same"""
        mongo_instance = MongoDB()
        mongo_instance.drop_collection("family")
        mongo_instance.drop_collection("individual")

        ged = Gedcom('./GEDCOM_files/us16/us16_male_last_name_same.ged')
        ged.insert_to_mongo()

        self.assertEqual(ged.us16_male_last_name(debug=True), [])
Code Example #6
File: test_us16.py  Project: BenjiTheC/SSW555
    def test_diff_lastname(self):
        """ Negative test, males' last names are different"""
        mongo_instance = MongoDB()
        mongo_instance.drop_collection("family")
        mongo_instance.drop_collection("individual")

        ged = Gedcom('./GEDCOM_files/us16/us16_male_last_name_diff.ged')
        ged.insert_to_mongo()

        self.assertEqual(ged.us16_male_last_name(debug=True),
                         [('@F1@', '@I2@, @I3@', 'LastName,Test')])
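Both tests repeat the same collection reset. If more US16 cases were added, that boilerplate could move into unittest's setUp hook; a sketch using only the calls shown above (the class name here is hypothetical):

class TestUS16(unittest.TestCase):
    def setUp(self):
        # Start every test from empty collections.
        mongo_instance = MongoDB()
        mongo_instance.drop_collection("family")
        mongo_instance.drop_collection("individual")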
Code Example #7
File: api.py  Project: zlv2s/proxy-pool
def get_one():
    # Pick one proxy at random from everything currently in the pool.
    proxies = MongoDB().get(MongoDB().get_count())
    result = [proxy['proxy'] for proxy in proxies]
    return jsonify(dict(proxy=random.choice(result)))
Code Example #8
File: api.py  Project: zlv2s/proxy-pool
def delete():
    args = request.args
    MongoDB().delete({'proxy': args['proxy']})
    return 'Deleted successfully: {}'.format(args)
Code Example #9
class ProvinceSpider(scrapy.Spider):
    '''
        get province
    '''
    name = 'province_spider'
    allowed_domains = ['weather.com.cn']
    start_urls = ['http://www.weather.com.cn/province/']
    
    def __init__(self):
        file_name = time.strftime('%Y-%m-%d-%H-%M-%S', time.localtime(time.time())) + '.log'
        self.log = Logger('province_spider', console=False, file_name=file_name).getLogger()
        self.db = MongoDB(
            auth=True,
            host='localhost',
            user='******',
            password='******',
            authSource='admin',
            authMechanism='SCRAM-SHA-1')

        self.db.remove('weather', 'wea', {})
        super(ProvinceSpider, self).__init__()

    def parse(self, response):
        '''
            Parse provinces.
        '''
        provinces = []
        for li in response.xpath('//div[@class="sheng_rukou"]/ul/li'):
            name = li.xpath('.//text()').extract_first()
            if name not in constant.PIG_ZONE:
                provinces.append({
                    'url': li.xpath('a/@href').extract_first(),
                    'province': name
                })
        for p in provinces:
            yield scrapy.Request(p['url'], callback=self.parse_city, meta=p)

    def parse_city(self, response):
        '''
            Parse cities/districts.
        '''
        # parent province / municipality
        province_info = response.meta

        cities = []
        for a in response.xpath('//div[@class="navbox"]/span/a'):
            cities.append({
                'url': response.urljoin(a.xpath('@href').extract_first()),
                'city': a.xpath('.//text()').extract_first()
            })
        # Note: Guangdong's province page uses a different layout
        if not cities:
            for a in response.xpath('//div[@class="area_Weather"]/ul/li'):
                cities.append({
                    'url': response.urljoin(a.xpath('./a/@href').extract_first()),
                    'city': a.xpath('./a/text()').extract_first()
                })
        for c in cities:
            yield scrapy.Request(c['url'], callback=self.parse_county, meta={
                'province': province_info['province'],
                'city': c['city']
            })
        
        
    def parse_county(self, response):
        '''
            Parse counties.
        '''
        city_info = response.meta

        # Municipalities have no subordinate counties; parse the weather data directly
        if city_info['province'] in constant.DIRECT_CITY:
            self.parse_direct_weather(response, city_info)
        
        else:
            counties = []
            for a in response.xpath('//div[@class="navbox"]/span/a'):
                counties.append({
                    'url': response.urljoin(a.xpath('@href').extract_first()),
                    'county': a.xpath('.//text()').extract_first()
                })
            for c in counties:
                city_info['county'] = c['county']
                yield scrapy.Request(c['url'], callback=self.parse_county_weather, meta=city_info)
        
    def parse_county_weather(self, response):
        '''
            Parse county-level weather data.
        '''
        meta = response.meta
        self._parse_weather(response, meta)


    def parse_direct_weather(self, response, meta):
        '''
            Parse weather data for a municipality.
        '''
        self._parse_weather(response, meta)


    def _parse_weather(self, response, meta):
        seven_day_weather = []
        for li in response.xpath('//div[@id="7d"]/ul[@class="t clearfix"]/li'):
            # relative date
            h1 = li.xpath('./h1/text()').extract_first()
            # weather description
            desc = li.xpath('./p[@class="wea"]/text()').extract_first()
            # max / min temperature
            max_tem = li.xpath('./p[@class="tem"]/span/text()').extract_first()
            min_tem = li.xpath('./p[@class="tem"]/i/text()').extract_first()
            # wind direction
            wind_direction = li.xpath('.//em/span/@title').extract()
            # wind force (this selector may be fragile)
            wf = li.xpath('.//i/text()').extract()
            wind_force = wf[-1] if len(wf) >= 2 else 'unknown'

            seven_day_weather.append({
                'day': h1,
                'desc': desc,
                'max_tem': max_tem,
                'min_tem': min_tem,
                'wind_direction': wind_direction,
                'wind_force': wind_force
            })
        self.log.info("========province:%s=======city:%s========county:%s", meta['province'], meta['city'], meta.get('county', None))

        data = {
            'province': meta['province'],
            'city': meta['city'],
            'county': meta.get('county', None),
            'data': seven_day_weather
        }
        self.db.insert('weather', 'wea', data)
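The MongoDB wrapper class itself is not shown in any of these examples. A hypothetical reconstruction of the interface the spider relies on (insert and remove taking a database name, a collection name, and a document/query), built on pymongo; the project's actual implementation may differ:

from pymongo import MongoClient

class MongoDB:
    # Hypothetical sketch: only the methods used by ProvinceSpider.
    def __init__(self, host='localhost', port=27017, **kwargs):
        # Auth kwargs (auth, user, password, ...) omitted in this sketch.
        self.client = MongoClient(host, port)

    def insert(self, db, collection, document):
        # Used above as: self.db.insert('weather', 'wea', data)
        return self.client[db][collection].insert_one(document)

    def remove(self, db, collection, query):
        # Used above as: self.db.remove('weather', 'wea', {}) clears the collection
        return self.client[db][collection].delete_many(query)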
Code Example #10
""" US26: Less than 150 years old
    Benji, Feb 24th, 2019
    Death should be less than 150 years after birth for dead people, and
    current date should be less than 150 years after birth for all living people
"""

import os
import unittest
from gedcom_ajry import Gedcom
from mongo_db import MongoDB

MONGO = MongoDB()


class test_us26(unittest.TestCase):
    """ Test cases for US26"""
    def test_indi_entry_bleach(self):
        """ Individual data missed in family collection."""
        self.assertEqual(
            Gedcom('GEDCOM_files/us26/us26_indi_entry_bleach.ged').
            us26_corrspnding_entries(debug=True), [('Individual', '@I4@')])

    def test_no_err(self):
        """ Positive test for US26."""
        self.assertEqual(
            Gedcom('GEDCOM_files/us26/us26_no_err.ged').
            us26_corrspnding_entries(debug=True), [])


if __name__ == '__main__':
    unittest.main(exit=False, verbosity=2)