def save_member_rels(source_company_id, members, SOURCE):
    """Link already-saved source members to a source company.

    For each member dict, look up its source_member row by
    (source, sourceId); if the row exists and no
    source_company_member_rel links it to `source_company_id` yet,
    insert the relation with a type derived from the position title.

    :param source_company_id: id of the source_company row
    :param members: iterable of dicts with "key" (source member id)
        and "position" (title string)
    :param SOURCE: source identifier used for source_member lookups
    """
    conn = db.connect_torndb()
    try:
        for m in members:
            member_key = m["key"]
            source_member = conn.get(
                "select * from source_member where source=%s and sourceId=%s order by id limit 1",
                SOURCE, member_key)
            if source_member is None:
                # member was never saved for this source; nothing to link
                continue
            source_member_id = source_member["id"]
            source_company_member_rel = conn.get(
                "select * from source_company_member_rel where \
                sourceCompanyId=%s and sourceMemberId=%s",
                source_company_id, source_member_id)
            # renamed from `type` to avoid shadowing the builtin
            rel_type = name_helper.position_check(m["position"])
            logger.info("position %s, type %s", m["position"], rel_type)
            if source_company_member_rel is None:
                conn.insert(
                    "insert source_company_member_rel(sourceCompanyId, sourceMemberId, \
                    position,type,createTime,modifyTime) \
                    values(%s,%s,%s,%s, now(),now())",
                    source_company_id, source_member_id, m["position"],
                    rel_type)
    finally:
        # original leaked the connection when a query raised
        conn.close()
def parseMember_save(source_company_id, item, download_crawler):
    """Parse 36kr startup-member HTML and persist each member.

    :param source_company_id: id of the source_company the members belong to
    :param item: crawled doc with "key" (company key) and "content"
        (utf-8 encoded HTML)
    :param download_crawler: passed through to save_member_standard
        for photo downloading
    """
    logger.info("parseMember_save")
    companyKey = item["key"]
    d = pq(html.fromstring(item['content'].decode("utf-8")))
    members = d('.startups-member')
    for m in members:
        name = d(m)('.media-heading').text()
        logger.info(name)
        desc = d(m)('.desc').text()
        position = d(m)('.title').text()
        logo = 'http:' + d(m)(".media-object").attr('src').replace(
            '@!logom', '')
        # the site serves placeholder images under both the misspelled
        # "deafult" and the correct "default" names — treat both as no photo
        if logo.find('deafult') >= 0 or logo.find('default') >= 0:
            logo = None
        if logo:
            logo = logo.replace("https://", "http://")
        sourceId = d(m)('.media-body a').attr('href')
        if sourceId is not None:
            sourceId = str(companyKey) + '_' + sourceId.split(
                'person/')[-1].strip()
        else:
            # no personal page link; derive a stable code from the name
            sourceId = str(
                companyKey) + '_' + kr36_company_parser_2.get_company_code(
                    name)
        source_member = {
            "source": SOURCE,
            "sourceId": sourceId,
            "name": name,
            "photo_url": logo,
            "weibo": None,
            "location": 0,
            "role": position[:50],
            "description": desc,
            "education": None,
            "work": None
        }
        ptype = name_helper.position_check(position)
        source_company_member_rel = {
            "sourceCompanyId": source_company_id,
            "position": position[:50],
            "joinDate": None,
            "leaveDate": None,
            "type": ptype
        }
        try:
            parser_db_util.save_member_standard(source_member,
                                                download_crawler,
                                                source_company_member_rel)
        except Exception as ex:
            # abort the whole run on a save failure (as before), but log
            # the full traceback instead of the bare exception class
            logger.exception(ex)
            exit()
def save_member(r, SOURCE, download_crawler):
    """Insert or update a source_member row and, when a company key is
    given, link the member to its source_company.

    :param r: flat tuple of member fields followed by the company key
        and position (order fixed by the unpacking below)
    :param SOURCE: source identifier for source_member / source_company
        lookups
    :param download_crawler: used to fetch the member photo when a new
        one is needed
    """
    member_key, name, weibo, introduction, education, work, location, role, pictureUrl, company_key, position = r
    conn = db.connect_torndb()
    source_member = conn.get(
        "select * from source_member where source=%s and sourceId=%s order by id limit 1",
        SOURCE, member_key)
    logo_id = None
    # only (re)download the photo when no photo is stored yet; otherwise
    # keep the existing photo id
    if source_member == None or source_member[
            "photo"] == None or source_member["photo"] == "":
        if pictureUrl is not None and pictureUrl != "":
            # image_value = download_crawler.get_image(pictureUrl)
            # if image_value is not None:
            #     logo_id = imgfs.put(image_value, content_type='jpeg', filename='member_%s_%s.jpg' % (SOURCE, member_key))
            #     logger.info("gridfs logo_id=%s" % logo_id)
            (logo_id, w, h) = parser_mysql_util.get_logo_id_new(
                pictureUrl, download_crawler, SOURCE, member_key, "member")
    else:
        logo_id = source_member["photo"]
    if source_member is None:
        # first time we see this member for this source: insert a new row
        # (processStatus reset to 0 so downstream processing picks it up)
        sql = "insert source_member(name,photo,weibo,location,role,description,\
            education,work,source,sourceId,createTime,modifyTime,processStatus) \
            values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,now(),now(),0)"
        source_member_id = conn.insert(sql, name, logo_id, weibo, location,
                                       role, introduction, education, work,
                                       SOURCE, member_key)
    else:
        # existing member: refresh all fields in place
        source_member_id = source_member["id"]
        sql = "update source_member set name=%s,photo=%s,weibo=%s,location=%s,role=%s,description=%s,\
            education=%s,work=%s,modifyTime=now(),processStatus=0 where id=%s"
        conn.update(sql, name, logo_id, weibo, location, role, introduction,
                    education, work, source_member_id)
    if company_key is not None:
        source_company = conn.get(
            "select * from source_company where source=%s and sourceId=%s",
            SOURCE, company_key)
        if source_company is not None:
            source_company_id = source_company["id"]
            source_company_member_rel = conn.get(
                "select * from source_company_member_rel where \
                sourceCompanyId=%s and sourceMemberId=%s",
                source_company_id, source_member_id)
            if source_company_member_rel is None:
                # NOTE: `type` shadows the builtin here (kept as-is)
                type = name_helper.position_check(position)
                logger.info("position %s, type %s", position, type)
                conn.insert(
                    "insert source_company_member_rel(sourceCompanyId, sourceMemberId, \
                    position,type,createTime,modifyTime) \
                    values(%s,%s,%s,%s, now(),now())",
                    source_company_id, source_member_id, position, type)
    conn.close()
def parseMember_save(source_company_id, item, download_crawler):
    """Parse the team-list HTML of a crawled company page and persist
    each member via parser_db_util.save_member_standard.

    :param source_company_id: id of the source_company the members belong to
    :param item: crawled doc with "key" and "content" (HTML), or None
    :param download_crawler: passed through for photo downloading
    """
    if item is None:
        return None
    html = item["content"]
    d = pq(html)
    # members
    logger.info("*** member ****")
    lis = d('ul.team-list> li')
    for li in lis:
        try:
            l = pq(li)
            member_name = l('div.per-name> a').text().strip()
            # the member key is the last path segment of the profile link
            member_key = l('div.per-name> a').attr("href").split("/")[-1]
            position = l('div.per-position').text().strip()
            logo = l('a.avatar> img').attr("src")
            desc = l('div.per-des').text().strip()
            logger.info(
                "member_key: %s, member_name: %s, position: %s, desc: %s"
                % (member_key, member_name, position, desc))
            source_member = {
                "source": SOURCE,
                "sourceId": str(member_key),
                "name": member_name,
                "photo_url": logo,
                "weibo": None,
                "location": 0,
                "role": position,
                "description": desc,
                "education": None,
                "work": None
            }
            ptype = name_helper.position_check(position)
            source_company_member_rel = {
                "sourceCompanyId": source_company_id,
                "position": position,
                "joinDate": None,
                "leaveDate": None,
                "type": ptype
            }
            parser_db_util.save_member_standard(source_member,
                                                download_crawler,
                                                source_company_member_rel)
        except Exception as ex:
            # one malformed <li> must not stop the rest of the list
            logger.exception(ex)
def parseMember_save(source_company_id, item, download_crawler):
    """Persist members from a pre-parsed JSON crawl result.

    Expects item["content"]["member"]["data"]["members"] to be a list of
    member dicts with at least "id"; members without a "name" are skipped.

    :param source_company_id: id of the source_company the members belong to
    :param item: crawled doc whose "content" is the decoded JSON payload
    :param download_crawler: passed through for photo downloading
    """
    logger.info("parseMember_save")
    members = item["content"]["member"]["data"]["members"]
    for m in members:
        # `in` instead of the Python-2-only dict.has_key()
        if "name" not in m:
            continue
        logger.info(m["name"])
        desc = m.get("intro")
        position = m.get("position", "")
        logo = m.get("avatar")
        if logo:
            logo = logo.replace("https://", "http://")
        source_member = {
            "source": SOURCE,
            "sourceId": str(m["id"]),
            "name": m["name"],
            "photo_url": logo,
            "weibo": None,
            "location": 0,
            "role": position,
            "description": desc,
            "education": None,
            "work": None
        }
        ptype = name_helper.position_check(position)
        source_company_member_rel = {
            "sourceCompanyId": source_company_id,
            "position": position,
            "joinDate": None,
            "leaveDate": None,
            "type": ptype
        }
        try:
            parser_db_util.save_member_standard(source_member,
                                                download_crawler,
                                                source_company_member_rel)
        except Exception:
            # still best-effort per member, but record the failure instead
            # of swallowing every exception (incl. KeyboardInterrupt) silently
            logger.exception("save_member_standard failed for sourceId %s",
                             m.get("id"))
def parse_member(item):
    """Extract member records from a crawled company page.

    :param item: crawled doc with "key" and "content" (HTML), or None
    :return: list of dicts with "_memberId", "name", "position", "type";
        members with no mongo memberId are skipped
    """
    if item is None:
        return []
    html = item["content"]
    d = pq(html)
    members = []
    # members
    logger.info("*** member ****")
    lis = d('ul.list-prodcase> li')
    for li in lis:
        try:
            l = pq(li)
            member_name = l('h4> a> b> span.c').text().strip()
            position = l('h4> a> b> span.c-gray').text().strip()
            # renamed from `str` to avoid shadowing the builtin
            href = l('h4> a').attr("href").strip()
            (member_key, ) = util.re_get_result(r'person/(\d*?)$', href)
            logger.info("member_key: %s, member_name: %s, position: %s" %
                        (member_key, member_name, position))
            memberId = parser_mongo_util.find_mongo_memberId(
                SOURCE, member_key)
            if memberId is None:
                continue
            member_type = name_helper.position_check(position)
            member = {
                "_memberId": memberId,
                "name": member_name,
                "position": position,
                "type": member_type
            }
            members.append(member)
        except Exception as ex:
            logger.exception(ex)
    # the None-item path returns a list, so return one here too instead of
    # falling off the end and returning None
    return members
#logger loghelper.init_logger("Member_data_clean", stream=True) logger = loghelper.get_logger("Member_data_clean") if __name__ == "__main__": conn = db.connect_torndb() start = 0 while True: membertitles = list( conn.query( "select * from company_member_rel where (active is null or active='Y') and (type not in (5010,5020,5030,5040) or type is null) order by id limit %s, 1000", start)) if len(membertitles) == 0: break for title in membertitles: type = name_helper.position_check(title["position"]) logger.info("%s->%s", title["position"], type) conn.update( "update company_member_rel set type=%s,modifyTime=now(),modifyUser=139 where id=%s", type, title["id"]) #break while True: membertitles = list( conn.query( "select * from source_company_member_rel where type not in (5010,5020,5030,5040) or type is null order by id limit %s, 1000", start)) if len(membertitles) == 0: break for title in membertitles:
def parseMember_save(source_company_id, item):
    """Parse the manager-list HTML of a crawled company page and persist
    each member via parser_db_util.save_member_standard.

    NOTE(review): relies on a module-level `download_crawler` global —
    unlike the sibling parsers it does not take it as a parameter.

    :param source_company_id: id of the source_company the members belong to
    :param item: crawled doc with "key" and "content" (HTML), or None
    """
    if item is None:
        return
    logger.info("*** member ***")
    html = item["content"]
    d = pq(html)
    lis = d('.manager_list > li')
    # members have no per-page id here, so sourceId is the company key
    # plus the member's 1-based position in the list
    member_rank = 0
    if len(lis) > 0:
        for li in lis:
            mem = pq(li)
            try:
                logo_url = mem('img').attr('src')
                # prepend the scheme to protocol-relative image URLs
                # (the "https" test is redundant: "https..." already
                # startswith "http" — kept as-is)
                if logo_url.startswith("http") or logo_url.startswith("https"):
                    pass
                else:
                    logo_url = "http:" + logo_url
                member_rank += 1
                member_key = str(item["key"]) + '_' + str(member_rank)
                member_name = mem('p.item_manager_name > span').text()
                member_link = mem('p.item_manager_name > a').attr('href')
                member_position = mem('p.item_manager_title').text()
                member_desc = mem('div.item_manager_content').text()
                # print member_position
                # print member_name
                # print member_desc
                # a weibo.com profile link is the only link type captured
                weibo = None
                if member_link is not None:
                    if 'weibo.com' in member_link:
                        weibo = member_link
                source_member = {
                    'name': member_name,
                    'photo_url': logo_url,
                    'weibo': weibo,
                    'location': None,
                    'role': member_position,
                    'description': member_desc,
                    'education': None,
                    'work': None,
                    'source': SOURCE,
                    'sourceId': member_key,
                }
                ptype = name_helper.position_check(member_position)
                source_company_member_rel = {
                    'sourceCompanyId': source_company_id,
                    'position': member_position,
                    'joinDate': None,
                    'leaveDate': None,
                    'type': ptype
                }
                logger.info(
                    json.dumps(source_member,
                               ensure_ascii=False,
                               cls=util.CJsonEncoder))
                parser_db_util.save_member_standard(
                    source_member, download_crawler,
                    source_company_member_rel)
            except Exception, ex:
                # one malformed <li> must not stop the rest of the list
                logger.exception(ex)