Example #1
0
def main():
    """Report which companies listed in ``conf_name`` exist in ``table_name``.

    Reads one company name per line, looks each one up by its ``company``
    field and logs a found / not-found line per company followed by three
    summary counters.

    ``mongo_db_conf``, ``conf_name``, ``table_name`` and ``log`` are
    module-level names defined outside this function.
    """
    source_db = MongDb(mongo_db_conf['host'],
                       mongo_db_conf['port'],
                       mongo_db_conf['db'],
                       mongo_db_conf['username'],
                       mongo_db_conf['password'],
                       log=log)

    count = 0    # companies without a stored document
    total = 0    # non-blank company names processed
    already = 0  # companies that already have a document
    with open(conf_name) as p_file:
        for line in p_file:
            # strip() removes every kind of surrounding whitespace,
            # including tabs that the chained strip calls used to miss.
            company = line.strip()
            if not company:
                # A blank line is not a company: skip it so it is neither
                # queried as an empty name nor counted in the totals.
                continue
            total += 1
            item = source_db.find_one(table_name, {'company': company})
            if item is None:
                log.error("当前企业没有抓到: {company}".format(company=company))
                count += 1
            else:
                log.info("已抓到企业: {}".format(company))
                already += 1

    log.info("总共企业数目为: {}".format(total))
    log.info("当前已抓到的个数: {}".format(already))
    log.info("当前总共没有抓到企业数目为: {}".format(count))
 def __init__(self, page, log):
     """Keep the page limit and logger, then create the collaborators.

     :param page: page number stored on the instance and echoed to the log
     :param log: logger used by this object and handed to the Mongo wrapper
     """
     self.log = log
     self.__page = page
     self.log.info("获得 {} 页之后的数据...".format(page))

     # Connection settings all come from the shared local Mongo config.
     mongo_settings = (
         LocalMongoConfig.HOST,
         LocalMongoConfig.PORT,
         LocalMongoConfig.DB,
         LocalMongoConfig.USER,
         LocalMongoConfig.PASSWD,
     )
     self.mongo = MongDb(*mongo_settings, log=self.log)
     self.table = "douban"

     self.request = self.__init_reqeust()
     self.douban_handler = DouBanInfoHandler()
Example #3
0
    def __init__(self, secrite_key, token, user_id, log):
        """Store the credentials, then create the HTTP session and Mongo handle.

        :param secrite_key: key stored on the instance as ``secrite_key``
        :param token: token stored on the instance as ``token``
        :param user_id: user id stored on the instance as ``user_id``
        :param log: logger used by this object and handed to the Mongo wrapper
        """
        self.log = log
        self.secrite_key, self.user_id, self.token = secrite_key, user_id, token

        # The session is built only after the credentials are on the instance.
        self.request = self.__init_reqeust()

        mongo_settings = (
            LocalMongoConfig.HOST,
            LocalMongoConfig.PORT,
            LocalMongoConfig.DB,
            LocalMongoConfig.USER,
            LocalMongoConfig.PASSWD,
        )
        self.cp_mongo = MongDb(*mongo_settings, log=self.log)
        self.cp_table = "yizhou_cp"
Example #4
0
def search_task():
    """Log each company in ``data_list`` that is missing or incomplete.

    A company with no document in ``enterprise_data_gov`` is logged at
    error level; a document without a ``shareholder_information`` key is
    logged at warn level. ``data_list`` is a module-level name defined
    outside this function.
    """
    log = Gsxtlogger('hunan.log').get_logger()
    mongo_db_conf = dict(
        host='172.16.215.16',
        port=40042,
        db='app_data',
        username='******',
        password='******',
    )

    # Store that holds the search-list documents.
    source_db = MongDb(
        mongo_db_conf['host'],
        mongo_db_conf['port'],
        mongo_db_conf['db'],
        mongo_db_conf['username'],
        mongo_db_conf['password'],
        log=log,
    )

    for company in data_list:
        item = source_db.find_one('enterprise_data_gov', {'company': company})
        if item is None:
            log.error(company)
        elif 'shareholder_information' not in item:
            log.warn(company)
Example #5
0
    '重庆市': 'chongqing',
    '陕西省': 'shanxi',
    '总局': 'gsxt'
}

# Connection settings for the `company_data` database.
# The username/password values appear redacted in this copy of the file.
mongo_db_company_data = {
    'host': '172.16.215.2',
    'port': 40042,
    'db': 'company_data',
    'username': '******',
    'password': '******'
}

# Project Mongo wrapper bound to `company_data`; `log` is defined outside
# this snippet.
source_db = MongDb(mongo_db_company_data['host'],
                   mongo_db_company_data['port'],
                   mongo_db_company_data['db'],
                   mongo_db_company_data['username'],
                   mongo_db_company_data['password'],
                   log=log)

# Raw pymongo handle on the `schedule_data` database of the same host.
# NOTE(review): the credentials below are hard-coded in plain text.
# NOTE(review): Database.authenticate() no longer exists in PyMongo 4.x —
# confirm which pymongo version this script is pinned to.
db_query = pymongo.MongoClient('172.16.215.2', 40042)['schedule_data']
db_query.authenticate('work', 'haizhi')

# Raw pymongo handle on the `app_data` database of a second host.
db_query_app_data = pymongo.MongoClient('172.16.215.16', 40042)['app_data']
db_query_app_data.authenticate('work', 'haizhi')

#
def main():
    try:
        count = 0
        log.info('开始读取数据...')
    'port': 40042,
    'db': 'company_data',
    'username': '******',
    'password': '******'
}

# Connection settings for the copy destination (`company_data` database).
# The username/password values appear redacted in this copy of the file.
mongo_db_target = {
    'host': "103.36.136.211",
    'port': 40042,
    'db': 'company_data',
    "username": '******',
    "password": '******',
}

# Logger shared by both Mongo wrappers below.
log = Gsxtlogger('copy_data_to_beihai.log').get_logger()
# Handle on the source database; `mongo_db_source` is defined above this
# snippet.
source_db = MongDb(mongo_db_source['host'], mongo_db_source['port'], mongo_db_source['db'],
                   mongo_db_source['username'], mongo_db_source['password'], log=log)

# Handle on the destination database described by `mongo_db_target`.
target_db = MongDb(mongo_db_target['host'], mongo_db_target['port'], mongo_db_target['db'],
                   mongo_db_target['username'], mongo_db_target['password'], log=log)


def main():
    """Gather every ``offline_all_list`` document with ``crawl_online`` reset to 0.

    ``log`` and ``source_db`` are module-level names.
    """
    log.info("开始导入数据..")

    count = 0
    result_list = []
    for record in source_db.traverse('offline_all_list'):
        # Mark the record as not yet crawled online before collecting it.
        record['crawl_online'] = 0
        result_list.append(record)
Example #7
0
class YizhoucpCrawl(object):
    """Walk the recommended moment feed and like posts by matching authors.

    Every request is signed by an external signing service, and every post
    that has been handled is recorded in the ``yizhou_cp`` Mongo table so
    that it is not processed twice.
    """

    __START_URL = "https://api.myrightone.com/api/feed/moment-list"
    __LIKE_PID_URL = "https://api.myrightone.com/api/feed/like"

    __CRACK_SIGN_URL = "http://wx.zxiaoji.com/cp"

    __HOST = "api.myrightone.com"

    def __init__(self, secret_key, token, user_id, check_code, log):
        """Store the credentials and create the HTTP session and Mongo handle.

        :param secret_key: value sent to the signing service as ``secret_key``
        :param token: API token; the part before the first ``_`` becomes the
            ``App-Id`` header
        :param user_id: user id sent with every feed / like request
        :param check_code: value sent to the signing service as ``check_code``
        :param log: logger used by this object and by the Mongo wrapper
        """
        self.log = log
        self.secret_key = secret_key
        self.user_id = user_id
        self.token = token
        self.check_code = check_code
        self.request = self.__init_reqeust()
        self.cp_mongo = MongDb(LocalMongoConfig.HOST,
                               LocalMongoConfig.PORT,
                               LocalMongoConfig.DB,
                               LocalMongoConfig.USER,
                               LocalMongoConfig.PASSWD,
                               log=self.log)

        self.cp_table = "yizhou_cp"

    def __init_reqeust(self):
        """Create the ``requests`` session carrying the API headers.

        :return: the session, which is also stored on ``self.request``
        """
        headers = {
            "Host": self.__HOST,
            "App-Id": self.token.split("_")[0],
            "Platform": "ios",
            "Token": self.token,
            "User-Agent":
            "Right-iOS/3.33.2 (com.myrightone.datecha; build:224; iOS 12.1.2) Alamofire/4.8.0",
            "Accept": "*/*",
            "Accept-Encoding": "gzip;q=1.0, compress;q=0.5",
            "Accept-Language": "zh-Hans-CN;q=1.0, en-CN;q=0.9",
        }
        self.request = requests.Session()
        self.request.headers = headers
        return self.request

    def __get_sign(self, params):
        """Ask the signing service for the signature of ``params``.

        :param params: request parameters, sent to the service JSON-encoded
        :return: the ``data`` field of the reply when its ``status`` is 1,
            otherwise ``None`` (the failure is logged)
        """
        req = requests.get(self.__CRACK_SIGN_URL,
                           params={
                               "secret_key": self.secret_key,
                               "check_code": self.check_code,
                               "params": json.dumps(params)
                           },
                           timeout=30)
        req_json = req.json()
        if req_json.get("status") != 1:
            self.log.error("提取sign发生错误,错误原因是:")
            self.log.error(req_json.get("data"))
            return None
        return req_json.get("data")

    def get_moment_list(self):
        """Fetch one page (20 entries) of the recommended moment feed.

        :return: the decoded JSON reply, or ``None`` when no signature could
            be obtained
        """
        self.log.info("开始采集动态页")
        params = {
            "num": 20,
            "start": 0,
            "timestamp": int(time.time()),
            "type": "recommend",
            "user_id": self.user_id,
            "last_object_id": "",
        }

        sign = self.__get_sign(params)
        if not sign:
            return None
        params["sign"] = sign
        # NOTE(review): TLS certificate verification is disabled for this call.
        resp = self.request.get(self.__START_URL,
                                params=params,
                                verify=False,
                                timeout=30)
        return resp.json()

    def like_sex(self, post_data, sex=2, exclude_cp=True):
        """Like one feed post when its author's ``sex`` field equals ``sex``.

        The post id is recorded in Mongo before the sex check, so a post is
        handled at most once whether or not it ends up being liked.

        :param post_data: one entry of the feed's ``data.list``
        :param sex: required value of the author's ``sex`` field
        :param exclude_cp: skip posts that carry a truthy ``left_user`` field
        :return: ``True`` only when the like request reported success,
            ``False`` in every other case
        """
        if exclude_cp and post_data.get('left_user', None):
            self.log.info("过滤掉cp组")
            return False
        if post_data.get("category") == "topic":
            self.log.info("过滤掉话题..")
            return False

        fid = post_data.get("fid")
        nick_name = post_data["user"].get("nickname")
        post_text = post_data["payload"].get("text")

        if self.__update_like_mongo(fid, nick_name, post_text) == -1:
            self.log.info("之前已对这条数据点过赞了,跳过...")
            return False

        if post_data["user"].get('sex') != sex:
            # Author does not match: explicit False instead of falling off
            # the end of the function with an implicit None.
            return False

        fid_params = {
            "cancel": "0",
            "fid": fid,
            "timestamp": "0",
            "user_id": self.user_id,
        }
        sign = self.__get_sign(fid_params)
        if not sign:
            return False
        fid_params["sign"] = sign
        # NOTE(review): TLS certificate verification is disabled for this call.
        resp = self.request.get(self.__LIKE_PID_URL,
                                params=fid_params,
                                verify=False,
                                timeout=30)
        if resp.json().get("message") != "success":
            return False

        self.log.info("给用户({})发布的【{}】点赞成功".format(
            nick_name, post_text))
        return True

    def start(self, *args, **kwargs):
        """Run the endless fetch-and-like loop.

        Each round fetches the feed, tries to like every post, then sleeps
        in proportion to the number of likes sent in that round. Between
        02:00 and 05:59 local time each round is followed by a pause of
        roughly an hour.
        """
        count = 0
        like_count = 0
        while True:
            count += 1
            moment_data = self.get_moment_list()
            # get_moment_list() returns None when signing fails; the reply
            # may also lack the "data"/"list" keys.
            posts = ((moment_data or {}).get("data") or {}).get("list") or []
            if not posts:
                self.log.error("动态列表为空或获取失败, 稍后重试...")
                # Back off instead of hammering the API in a tight loop.
                time.sleep(random.randint(7, 10))

            like_count_batch = 0
            for per_post in posts:
                if self.like_sex(per_post):
                    like_count_batch += 1
                    like_count += 1
                    # Report a milestone only when this like reached it; the
                    # old check also fired on every post while the count
                    # was still 0.
                    if like_count % 100 == 0:
                        self.log.info(
                            "当前已经对 {} 位小姐姐点过赞了...".format(like_count))
                time.sleep(random.randint(1, 2))

            self.log.info("当前已经遍历了第 {} 次动态".format(count))
            time.sleep(
                random.randint(7 * like_count_batch, 10 * like_count_batch))
            now = datetime.datetime.now()
            if 2 <= now.hour < 6:
                time.sleep(random.randint(3600, 4000))

    def __update_like_mongo(self, fid, nick_name, post_text):
        """Record that post ``fid`` has been handled.

        :param fid: post id, used as the Mongo ``_id``
        :param nick_name: author's nickname stored with a new record
        :param post_text: post text stored with a new record
        :return: ``-1`` when the post was already recorded (its ``count`` is
            incremented), ``1`` when a new record was inserted
        """
        exist_data = self.cp_mongo.find_one(self.cp_table, {"_id": fid})
        if exist_data:
            self.log.info(">>>找到相同的数据啦...")
            # Tolerate records that were written without a counter.
            exist_data["count"] = exist_data.get("count", 0) + 1
            self.cp_mongo.insert_batch_data(self.cp_table, [exist_data])
            return -1
        new_data = {
            "_id": fid,
            "nick_name": nick_name,
            "post_text": post_text,
            "count": 1
        }
        self.cp_mongo.insert_batch_data(self.cp_table, [new_data], insert=True)
        return 1
Example #8
0
from logger import Gsxtlogger

# Logger for this script; it writes under the name 'find_in_gsxt.log'.
log = Gsxtlogger('find_in_gsxt.log').get_logger()

# Connection settings for the `app_data` database.
# The username/password values appear redacted in this copy of the file.
db_conf = {
    'host': '172.16.215.16',
    'port': 40042,
    'db': 'app_data',
    'username': '******',
    'password': '******',
}

# Project Mongo wrapper bound to the settings above.
source_db = MongDb(db_conf['host'],
                   db_conf['port'],
                   db_conf['db'],
                   db_conf['username'],
                   db_conf['password'],
                   log=log)


def classify():
    with open("company_invalid.txt", "w") as invalid_file:
        with open("company_valid.txt", "w") as valid_file:
            with open("company_list.txt") as p_file:
                for line in p_file:
                    company = line.strip("\r").strip("\n").strip()
                    # if source_db.find_one("enterprise_data_gov", {"company": company}) is None:
                    #     log.warn("当前企业不存在: {}".format(company))
                    # else:
                    #     log.info("找到企业信息: {}".format(company))
                    if len(company) <= 15 or len(company) > 90:
    'db': 'crawl_data',
    'username': '******',
    'password': '******'
}

# Connection settings for the `app_data_test` database.
# The username/password values appear redacted in this copy of the file.
mongo_db_gov = {
    'host': '172.16.215.16',
    'port': 40042,
    'db': 'app_data_test',
    'username': '******',
    'password': '******'
}

# Handle built from `mongo_db_crawl_data`, which is defined above this
# snippet; `log` is also defined outside it.
crawl_data_db = MongDb(mongo_db_crawl_data['host'],
                       mongo_db_crawl_data['port'],
                       mongo_db_crawl_data['db'],
                       mongo_db_crawl_data['username'],
                       mongo_db_crawl_data['password'],
                       log=log)

# Handle on the `app_data_test` database described by `mongo_db_gov`.
gov_db = MongDb(mongo_db_gov['host'],
                mongo_db_gov['port'],
                mongo_db_gov['db'],
                mongo_db_gov['username'],
                mongo_db_gov['password'],
                log=log)

# while True:
#     data = {'province': 'yunnan', 'company_name': '国网'}
#     producer.produce(json.dumps(data))
#     log.info(count)
#     # time.sleep()
Example #10
0
    'password': '******'
}

# Connection settings for the `app_data` database.
# The username/password values appear redacted in this copy of the file.
app_data_conf = {
    'host': '172.16.215.16',
    'port': 40042,
    'db': 'app_data',
    'username': '******',
    'password': '******'
}

# Logger shared by both Mongo wrappers below.
log = Gsxtlogger('count_gansu.log').get_logger()

# Handle built from `company_data_conf`, which is defined above this snippet.
company_data_db = MongDb(company_data_conf['host'],
                         company_data_conf['port'],
                         company_data_conf['db'],
                         company_data_conf['username'],
                         company_data_conf['password'],
                         log=log)

# Handle on the `app_data` database described by `app_data_conf`.
app_data_db = MongDb(app_data_conf['host'],
                     app_data_conf['port'],
                     app_data_conf['db'],
                     app_data_conf['username'],
                     app_data_conf['password'],
                     log=log)


def get_now_time():
    """Return the current local time as a ``YYYY-MM-DD HH:MM:SS`` string."""
    from datetime import datetime

    current = datetime.now()
    return "{:%Y-%m-%d %H:%M:%S}".format(current)
class DoubanCrawl(object):
    """Scrape Douban group discussion lists and store the rows in MongoDB."""

    __START_URL = "https://www.douban.com/group/luohuzufang/discussion?start={}"

    __HOST = "www.douban.com"

    def __init__(self, page, log):
        """Store the page limit and create the Mongo, HTTP and parsing helpers.

        :param page: index of the last list page to crawl (pages 0..page)
        :param log: logger used by this object and by the Mongo wrapper
        """
        self.__page = page
        self.log = log
        self.log.info("获得 {} 页之后的数据...".format(self.__page))
        self.mongo = MongDb(LocalMongoConfig.HOST,
                            LocalMongoConfig.PORT,
                            LocalMongoConfig.DB,
                            LocalMongoConfig.USER,
                            LocalMongoConfig.PASSWD,
                            log=self.log)
        self.table = "douban"
        self.request = self.__init_reqeust()
        self.douban_handler = DouBanInfoHandler()

    def __init_reqeust(self):
        """Create the ``requests`` session carrying browser-like headers.

        :return: the session, which is also stored on ``self.request``
        """
        headers = {
            "Host": self.__HOST,
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            "DNT": "1",
            "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7,ja;q=0.6",
        }
        self.request = requests.Session()
        self.request.headers = headers
        return self.request

    def __get_page_data(self, page_num=0, start_url=None):
        """Download one list page, parse its rows and store them.

        :param page_num: value substituted into the URL's ``{}`` placeholder
        :param start_url: URL template to use instead of the class default
        :return: ``-1`` when the page could not be fetched, otherwise ``None``
        """
        url = (start_url or self.__START_URL).format(page_num)
        try:
            # A timeout keeps one stalled connection from hanging the crawl.
            resp = self.request.get(url, timeout=30)
        except requests.RequestException:
            self.log.error("请求列表页出错...")
            return -1
        # A failed request still yields a response object, so the status
        # code has to be checked as well.
        if resp is None or resp.status_code != 200:
            self.log.error("请求列表页出错...")
            return -1

        html_resp = html.fromstring(resp.text)

        # Walk over every discussion row of the list table.
        discussion_extract = html_resp.xpath(
            '//div[@class="article"]//tr[@class=""]')

        item_list = []
        for per_discussion in discussion_extract:
            titles = per_discussion.xpath('./td[@class="title"]/a/@title')
            detail_urls = per_discussion.xpath('./td[@class="title"]/a/@href')
            authors = per_discussion.xpath('./td[2]/a/text()')
            author_urls = per_discussion.xpath('./td[2]/a/@href')
            comment_dates = per_discussion.xpath('./td[4]/text()')
            if not (titles and detail_urls and authors and author_urls
                    and comment_dates):
                # Rows missing a required cell are skipped instead of
                # aborting the whole page with an IndexError.
                continue

            title = titles[0]
            comment_count_raw = per_discussion.xpath('./td[3]/text()')
            comment_count = comment_count_raw[0] if comment_count_raw else 0

            extract_info = self.douban_handler.clean_data(title) or {}

            item = {
                "title": title,
                "detail_url": detail_urls[0],
                "author": authors[0],
                "author_url": author_urls[0],
                "comment_count": comment_count,
                "comment_date": comment_dates[0],
            }

            # Scraped columns win over cleaned fields with the same key.
            item_list.append({**extract_info, **item})

        if item_list:
            self.mongo.insert_batch_data(self.table, item_list, key="detail_url")

    def start(self, *args, **kwargs):
        """Crawl pages ``0..page`` of every URL template in ``init_urls``.

        ``init_urls`` is a module-level name defined outside this class.
        Pages that fail to download are logged and skipped.
        """
        for url in init_urls:
            self.log.info("当前采集小组的链接是:{}".format(url))
            for i in tqdm(range(0, self.__page + 1)):
                self.log.info("当前即将采集第 {} 页".format(i))
                # The list shows 25 rows per page; the URL takes a row offset.
                grab_list_page_status = self.__get_page_data(i * 25, url)
                if grab_list_page_status == -1:
                    self.log.info("当前采集列表页出错, 当前页面是第 {} 页".format(i))
                    continue
                self.log.info("当前页面采集完成: page = {}".format(i))
        self.log.info("成功退出采集程序...")
Example #12
0
    "db": "crawl_data_new",
    "username": "******",
    "password": "******",
}

# Connection settings for the `crawl_data` database (the "old" web-page
# store, going by the variable name).
# The username/password values appear redacted in this copy of the file.
mongo_db_webpage_old = {
    "host": "172.16.215.2",
    "port": 40042,
    "db": "crawl_data",
    "username": "******",
    "password": "******",
}

# Logger shared by both Mongo wrappers below.
log = Gsxtlogger('find_equity_field.log').get_logger()

# Handle built from `mongo_db_webpage_new`, which is defined above this
# snippet.
target_db_new = MongDb(mongo_db_webpage_new['host'], mongo_db_webpage_new['port'], mongo_db_webpage_new['db'],
                       mongo_db_webpage_new['username'], mongo_db_webpage_new['password'], log=log)

# Handle on the database described by `mongo_db_webpage_old`.
target_db_old = MongDb(mongo_db_webpage_old['host'], mongo_db_webpage_old['port'], mongo_db_webpage_old['db'],
                       mongo_db_webpage_old['username'], mongo_db_webpage_old['password'], log=log)

# Mail account and recipient list; the values appear redacted in this copy.
mail_from_addr = '*****@*****.**'
mail_password = '******'
mail_to_addrs = ['*****@*****.**']


def send_email(from_addr, password, to_addrs, subject, msg, smtp_host="smtp.weibangong.com", smtp_port=465):
    """Log in to the SMTP server and fill in the headers of ``msg``.

    :param from_addr: sender address, also used as the SMTP login name
    :param password: SMTP login password
    :param to_addrs: one recipient address, or a list/tuple/set of them
    :param subject: subject text, encoded as UTF-8 in the header
    :param msg: message object whose Subject / From / To headers are set
    :param smtp_host: SMTP server host name
    :param smtp_port: SMTP server port
    """
    email_client = SMTP(smtp_host, smtp_port)
    email_client.login(from_addr, password)
    msg['Subject'] = Header(subject, 'utf-8')
    msg['From'] = from_addr
    if isinstance(to_addrs, (list, tuple, set)):
        # str() on a collection would put its Python repr ("['a@b.c']")
        # into the header; a To header is a comma-separated address list.
        msg['To'] = ', '.join(to_addrs)
    else:
        msg['To'] = str(to_addrs)
# Connection settings for the `company_data` database.
# The username/password values appear redacted in this copy of the file.
mongo_db_source = {
    'host': '172.16.215.2',
    'port': 40042,
    'db': 'company_data',
    'username': '******',
    'password': '******'
}

# Script-wide logger.
global_logger = Gsxtlogger('insert_company.log')
global_log = global_logger.get_logger()

# Store that holds the search-list documents.
source_db = MongDb(mongo_db_source['host'],
                   mongo_db_source['port'],
                   mongo_db_source['db'],
                   mongo_db_source['username'],
                   mongo_db_source['password'],
                   log=global_log)

# Beanstalk queue client created from the host/port pair below.
beanstalk_consumer_conf = {'host': 'cs0.sz-internal.haizhi.com', 'port': 11400}
beanstalk = PyBeanstalk(beanstalk_consumer_conf['host'],
                        beanstalk_consumer_conf['port'])


def main(search_name, province, unified_social_credit_code, param):
    item = {
        "_id":
        "9c9d8f8b848514f240f54a40b0a0c6f02622b3d87d54d353e525ca58d9dbe312",
        "province": province,
        "crawl_online": 0,
        "error_times": 0,
    length = len(sys.argv)
    if length > 2:
        search_list = re.findall('config/(.*?)\.conf', sys.argv[1])
        if len(search_list) > 0:
            log_name = search_list[0] + '_' + sys.argv[2] + '.log'

    return log_name


# Script-wide logger; the log file name is produced by get_log_name().
global_logger = Gsxtlogger(get_log_name())
global_log = global_logger.get_logger()

# Old web-page store. `mongo_db_target` is defined outside this snippet.
target_db = MongDb(mongo_db_target['host'],
                   mongo_db_target['port'],
                   mongo_db_target['db'],
                   mongo_db_target['username'],
                   mongo_db_target['password'],
                   log=global_log)

# New web-page store. `mongo_db_target_new` is defined outside this snippet.
target_db_new = MongDb(mongo_db_target_new['host'],
                       mongo_db_target_new['port'],
                       mongo_db_target_new['db'],
                       mongo_db_target_new['username'],
                       mongo_db_target_new['password'],
                       log=global_log)

# 搜索列表存储表
source_db = MongDb(mongo_db_source['host'],
                   mongo_db_source['port'],
                   mongo_db_source['db'],