Example #1
def get_plant_info(name: str, page_size: int) -> List[Dict[str, Any]]:

    try:
        trefle_token = os.environ["TREFLE_ACCESS_TOKEN"]
        api = Shamrock(trefle_token, page_size=page_size)
    except KeyError as e:
        logger.error(
            "TREFLE_ACCESS_TOKEN environment variable not set, cannot fetch plants data"
        )
        raise e
    except Exception as e:
        logger.error(
            f"Could not create Trefle API resource due to {type(e)}, args: {e.args}"
        )
        raise e

    logger.info(f"Fetching data for plant name: {name}")

    # Look up the full plant record for each search result returned by Trefle
    batch = api.search(name)
    search_results = []
    for d in batch:
        plant = api.plants(d["id"])
        search_results.append(plant)

    return search_results
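
A minimal sketch of how this helper might be called; the plant name, the printed fields, and the token value below are illustrative, and TREFLE_ACCESS_TOKEN must be set in the environment before the call:

# Hypothetical call site; assumes logging is configured elsewhere in the app.
import os

os.environ.setdefault("TREFLE_ACCESS_TOKEN", "<your-trefle-token>")

plants = get_plant_info("coconut", page_size=20)
for plant in plants:
    print(plant.get("common_name"), plant.get("scientific_name"))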
Example #2
 async def crawl(self):
     logger.info(f'{self._name} starting crawl...')
     try:
         return await self.do_crawl()
     except Exception as e:
         logger.exception(f'{self._name} crawl failed: {e}')
     return []
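
The wrapper above delegates to do_crawl(), which each concrete spider is expected to implement. A minimal, hypothetical subclass (the base class name, the URL, and the parse_rows helper are placeholders for illustration, not part of the original project) might look like:

# Hypothetical spider; BaseSpider, ProxyEntity, HEADERS and parse_rows
# are assumptions made for illustration.
import aiohttp

@spider_register
class ExampleSpider(BaseSpider):

    def __init__(self):
        super().__init__()
        self._name = 'ExampleSpider'

    async def do_crawl(self):
        async with aiohttp.ClientSession() as session:
            async with session.get('http://example.com/free-proxies',
                                   headers=HEADERS) as resp:
                html = await resp.text()
        # parse_rows() would extract (ip, port) pairs from the page
        return [ProxyEntity(f'http://{ip}:{port}') for ip, port in parse_rows(html)]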
Example #3
def post_sensors_data() -> Response:
    """
    Endpoint receiving sensors data

    Db table:

    CREATE TABLE events.sensors (
        uuid UUID DEFAULT uuid_generate_v4 (),
        created_at TIMESTAMP WITH TIME ZONE DEFAULT CURRENT_TIMESTAMP,
        measure_type VARCHAR(32) NOT NULL,
        unit VARCHAR(32) NOT NULL,
        value NUMERIC NOT NULL
    );

    curl -X POST http://localhost:5000/pi/sensors -H "Content-Type: application/json" -d @tests/resources/post_pi_sensor.json
    """

    try:
        payload = request.get_json()
        with get_cursor(db_creds, commit=True) as cur:
            logger.info(request.data)
            # Use a parameterised query so payload values cannot inject SQL
            q = (
                "INSERT INTO events.sensors (measure_type, unit, value) "
                "VALUES (%s, %s, %s)"
            )
            cur.execute(
                q, (payload['measure_type'], payload['unit'], payload['value']))
        return jsonify(status_code=200)
    except KeyError as e:
        return jsonify(
            message=f"Missing key in payload: {type(e)}",
            args=e.args,
            status_code=400,
            error_type="Bad Request",
        )
    except UniqueViolation as e:
        return jsonify(
            message=f"Entity already exists: {type(e)}",
            args=e.args,
            status_code=409,
            error_type="Conflict",
        )
    except psycopg2.Error as e:
        return jsonify(
            message=f"Psycopg2 driver error: {type(e)}",
            args=e.args,
            status_code=500,
            error_type="Internal Server Error",
        )
    except Exception as e:
        return jsonify(
            message=f"Internal Server Error: {type(e)}",
            args=e.args,
            status_code=500,
            error_type="Internal Server Error",
        )
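
For reference, the JSON body this endpoint expects mirrors the three NOT NULL columns of events.sensors; a hypothetical client call (the field values are illustrative, and the real fixture lives in tests/resources/post_pi_sensor.json) could be:

# Hypothetical client; the Flask app from the example above must be running locally.
import requests

payload = {"measure_type": "temperature", "unit": "celsius", "value": 21.5}
resp = requests.post("http://localhost:5000/pi/sensors", json=payload)
print(resp.json())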
Example #4
def run():
    logger.info('Initializing the sqlite database...')
    sqlite_opt.init_db()
    scheduler = BackgroundScheduler()
    scheduler.add_job(crawl, 'interval', seconds=SPIDER['crawl_interval'])
    scheduler.add_job(validator.run,
                      'interval',
                      seconds=VALIDATOR['validate_interval'])
    scheduler.add_job(anonymity_validator.run,
                      'interval',
                      seconds=ANONYMITY_VALIDATOR['interval'])
    scheduler.start()
    app.run(host=WEB_SERVER['host'], port=WEB_SERVER['port'])
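
The intervals and server address come from a settings module that is not shown; judging from the keys used here and in the other examples, it presumably contains dictionaries shaped roughly like this (all values below are illustrative):

# Hypothetical configuration; only the key names are taken from the code.
SPIDER = {'crawl_interval': 600, 'list': ['ExampleSpider']}
VALIDATOR = {'validate_interval': 300}
ANONYMITY_VALIDATOR = {'interval': 1800, 'request_timeout': 10}
WEB_SERVER = {'host': '0.0.0.0', 'port': 5000}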
Example #5
 def add_proxy(self, proxy):
     session = self._DBSession()
     session.add(proxy)
     result = 0
     # Committing persists the proxy to the database:
     try:
         session.commit()
         result = 1
     except IntegrityError:
         logger.info(f'ip: {proxy.url} already exists')
     finally:
         # Close the session:
         session.close()
     return result
 async def crawl(self):
     logger.info(f'{self._name} starting crawl...')
     res = []
     for url in self._urls:
         try:
             for page in self.get_page_range():
                 async with aiohttp.ClientSession() as session:
                     async with session.get(self.get_page_url(url, page),
                                            headers=HEADERS) as resp:
                         # Decode with the spider-specific encoding, parse the
                         # page, then wait for the configured crawl interval.
                         temp = self.do_crawl(
                             await resp.text(encoding=self.get_encoding()))
                         res.extend(temp)
                         await asyncio.sleep(self.get_interval())
         except Exception as e:
             logger.exception(f'{self._name} crawl failed, url: {url}, e: {e}')
     return res
 async def valid_proxy(self, proxy_url, proxy_type):
     async with aiohttp.ClientSession() as session:
         try:
             async with session.get(
                     self.urls[proxy_type],
                     proxy=proxy_url,
                     headers=HEADERS,
                     timeout=ANONYMITY_VALIDATOR['request_timeout']
             ) as resp:
                 if resp.status == 200:
                     # Check the proxy's anonymity level
                     r_dict = json.loads(await resp.text())
                     headers = r_dict.get('headers', {})
                     ip = r_dict.get('origin')
                     proxy_connection = headers.get('Proxy-Connection',
                                                    None)
                     flag = True
                     if ',' in ip:
                         ips = str.split(ip, ',')
                         first = ips[0]
                         for p in ips:
                             if first != p.lstrip():
                                 proxy_cover = ProxyCoverEnum.TRANSPARENT.value  # transparent
                                 flag = False
                                 break
                     if flag:
                         if proxy_connection:
                             proxy_cover = ProxyCoverEnum.NORMAL_COVER.value  # anonymous
                         else:
                             proxy_cover = ProxyCoverEnum.HIGH_COVER.value  # elite (high anonymity)
                     # Update the stored anonymity level
                     sqlite_opt.update_anonymity(proxy_url, proxy_cover)
                     logger.info(
                         f'Anonymity check succeeded: url: {proxy_url}, coverValue: {proxy_cover}'
                     )
                 else:
                     logger.warning(
                         f'Anonymity check failed, proxy_url: {proxy_url}, status: {resp.status}'
                     )
         except asyncio.TimeoutError:
             logger.warning(f'Anonymity check request timed out, proxy_url: {proxy_url}')
         except ConnectionRefusedError:
             logger.warning(f'Anonymity check request was refused, proxy_url: {proxy_url}')
         except Exception as e:
             # logger.exception(e)
             logger.warning(f'Anonymity check failed, proxy_url: {proxy_url}, e: {e}')
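
ProxyCoverEnum itself is not shown in these examples; based on the members referenced above it is presumably an enum along these lines (the numeric values are guesses):

# Hypothetical definition; only the member names appear in the code above.
from enum import Enum

class ProxyCoverEnum(Enum):
    TRANSPARENT = 0    # target site still sees the real client IP
    NORMAL_COVER = 1   # anonymous, but headers such as Proxy-Connection reveal a proxy
    HIGH_COVER = 2     # elite: no indication that a proxy is in use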
Example #8
def crawl():
    # crawl() is the job registered with the scheduler below; the function
    # header and the proxies list are restored here so the snippet runs.
    proxies = []
    tasks = []
    for spider_name in SPIDER['list']:
        tasks.append(spider_collection[spider_name].crawl())
        # proxies.extend(spider_collection[spider_name].crawl())
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    results = loop.run_until_complete(asyncio.gather(*tasks))
    loop.close()
    for proxies_list in results:
        proxies.extend(proxies_list)
    # proxies = loop.run_until_complete(asyncio.gather(*tasks))
    # Persist the collected proxies
    save(proxies)


def save(proxies: typing.List[ProxyEntity]):
    for proxy in proxies:
        sqlite_opt.add_proxy(proxy)


if __name__ == '__main__':
    logger.info('Initializing the sqlite database...')
    sqlite_opt.init_db()
    scheduler = BackgroundScheduler()
    scheduler.add_job(crawl, 'interval', seconds=SPIDER['crawl_interval'])
    scheduler.add_job(validator.run,
                      'interval',
                      seconds=VALIDATOR['validate_interval'])
    scheduler.start()
    app.run(host=WEB_SERVER['host'], port=WEB_SERVER['port'])
Example #9
 def run(self):
     logger.info('Removing unusable proxies...')
     sqlite_opt.remove_all_zero_reliability()
     logger.info('Finished removing unusable proxies')
def spider_register(cls):
    spider_collection.update({cls.__name__: cls()})
    logger.info(f'Registered {cls.__name__}')
    return cls
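
spider_register is a class decorator: importing a decorated spider module instantiates the class and stores the instance in spider_collection under its class name, which is how Example #8 resolves the names listed in SPIDER['list']. A short, hypothetical lookup:

# Hypothetical lookup; ExampleSpider is the illustrative spider sketched earlier.
import asyncio

name = 'ExampleSpider'
if name in spider_collection:
    proxies = asyncio.run(spider_collection[name].crawl())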
Example #11
def identify_plant():
    """
    Endpoint receiving an image and querying Plant.id to identify the plant species

    curl -X POST http://localhost:5000/identify_plant -F "file=@./tests/resources/test_flower_img.jpeg" -H "Content-Type: multipart/form-data"
    """

    if "file" not in request.files:
        return jsonify(
            message=f"Bad Request: file missing",
            status_code=400,
            error_type="Bad Request",
        )
    file = request.files["file"]
    # If the user does not select a file, the browser may also
    # submit an empty part without a filename
    if file.filename == "" or not file \
            or not allowed_file(file.filename, ALLOWED_IMG_EXTENSIONS):
        return jsonify(
            message=f"Bad Request: file missing",
            status_code=400,
            error_type="Bad Request",
        )

    filename = secure_filename(file.filename)
    tmp_filename = os.path.join(app.config['UPLOAD_FOLDER'], filename)

    try:
        logger.info(f"Saving tmp image file at {tmp_filename}")
        # Save temp file, clean afterwards
        file.save(tmp_filename)

        # Build plant.id request
        with open(tmp_filename, "rb") as f:
            images = [base64.b64encode(f.read()).decode("ascii")]
        plant_id_payload = {
            "images": images,
            "modifiers": ["similar_images"],
            "plant_details": [
                "name_authority", "common_names", "url",
                "wiki_description", "taxonomy"
            ],
        }

        # Return the plant.id response
        response = requests.post(
            "https://api.plant.id/v2/identify",
            json=plant_id_payload,
            headers={
                "Content-Type": "application/json",
                "Api-Key": os.environ["PLANT_ID_API_ACCESS_TOKEN"],
            },
        ).json()
        return response
    except Exception as e:
        return jsonify(
            message=f"Internal Server Error: {type(e)}",
            args=e.args,
            status_code=500,
            error_type="Internal Server Error",
        )
    finally:
        if os.path.exists(tmp_filename):
            logger.info(f"Removing tmp image file at {tmp_filename}")
            os.remove(tmp_filename)
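
allowed_file and ALLOWED_IMG_EXTENSIONS are defined elsewhere in the module; a plausible implementation, following the common Flask upload-validation pattern (the exact extension set is an assumption), would be:

# Hypothetical helper; only its name and the extension-set argument appear above.
ALLOWED_IMG_EXTENSIONS = {"png", "jpg", "jpeg", "gif"}

def allowed_file(filename: str, allowed_extensions: set) -> bool:
    return "." in filename and filename.rsplit(".", 1)[1].lower() in allowed_extensions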