def insert_to_campusTalk(entrys):
    if len(entrys) == 0:
        return
    LOG.info("Starting to store to multiple tables")
    table = T_campusTalk
    id = None
    count = 0
    for entry in entrys:
        if count % 2000 == 0:
            LOG.info("[%s] have been inserted!" % count)
        collegeID = insert_subtable(entry, T_collegeInfo)
        if collegeID is None:
            LOG.error("INSERT TO [CollegeInfo] FAILS:%s"
                      % ("|".join(entry.values())))
            return None
        companyID = insert_subtable(entry, T_companyInfo)
        if companyID is None:
            LOG.error("INSERT TO [CompanyInfo] FAILS:%s"
                      % ("|".join(entry.values())))
            return None
        new_entry = {}
        new_entry[COLLEGE_ID] = collegeID
        new_entry[COMPANY_ID] = companyID
        # keep only the fields that map into the campusTalk table
        for key, value in entry.items():
            if value is None:
                continue
            if ct_field_map[key][0] == table:
                new_entry[ct_field_map[key][1]] = value
        if len(new_entry) > 0:
            id = db_helper.insert_entry(new_entry, table)
        count += 1
    return id
def analyse_data(entrys):
    c_name_map = collections.defaultdict(list)
    c_name_infoid_list = []
    for i in range(len(entrys)):
        name_i = str(entrys[i][COMPANY_NAME])
        id_i = str(entrys[i][INFO_ID])
        if name_i.strip() == "" or name_i == "None":
            continue
        if i % 10 == 0:
            LOG.info("[%s] have been dealt with!" % i)
        print "[%s:%s]" % (name_i, id_i), id_i + ",",
        # compare against every later entry to find similar company names
        for j in range(i + 1, len(entrys)):
            name_j = str(entrys[j][COMPANY_NAME])
            if name_j.strip() == "" or name_j == "None":
                continue
            id_j = str(entrys[j][INFO_ID])
            if name_j in name_i or \
               name_i in name_j or \
               StrSim.get_sim(name_i, name_j) > 0.8:
                #print "(%s:%s)" % (name_j, id_j), '\t',
                print id_j + ",",
        print
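# A minimal stand-in sketch for the StrSim.get_sim() helper used above,
# assuming it returns a similarity score in [0, 1]; the real implementation
# may use a different metric (edit distance, term overlap, etc.).
import difflib

class StrSim(object):
    @staticmethod
    def get_sim(name_a, name_b):
        # SequenceMatcher ratio: 1.0 for identical strings, 0.0 for disjoint ones
        return difflib.SequenceMatcher(None, name_a, name_b).ratio()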
def load_lasttime_failed_entrys(self):
    LOG.info("Loading last time's failed entrys")
    lasttime_info_ids = self.load_ids_from_rec()
    if len(lasttime_info_ids) == 0:
        return []
    db_helper = DBHelper()
    entrys = db_helper.get_data_by_infoids(lasttime_info_ids, self.table,
                                           self.db, isdict=True)
    LOG.info("Loaded [%s] of last time's failed entrys" % (len(entrys)))
    return entrys
def update_info_table(entrys, table=T_info, database=DB_1):
    if len(entrys) == 0:
        return
    LOG.info("Begin to update table [%s.%s]" % (database, table))
    count = 0
    for entry in entrys:
        # del entry[INFO_ID]
        id = db_helper.insert_entry(entry, table, database)
        # insert succeeded
        if id is not None:
            count += 1
    LOG.info("Successfully inserted [%s] entries into the table." % count)
def send_request(self, retry_time=3):
    count = 0
    res = None
    resp = None
    while count < retry_time:
        LOG.info("Send Request Round: [%s]" % (count))
        try:
            resp = urllib2.urlopen(self.url)
            res = resp.read()
            break
        except Exception, e:
            LOG.error(e)
            time.sleep(1)
        count += 1
    # return the response body (None if all retries failed)
    return res
def deal_old_list(self, cmp_columns, entrys):
    campus_entrys = []
    recruit_entrys = []
    LOG.info("Dealing Old List...")
    entry_idx = 0
    for entry in entrys:
        # campus-talk (info session) data goes to one list, job-fair data to the other
        if entry[INFO_TYPE] == INFO_TYPE_CAM:
            campus_entrys.append(entry)
        else:
            recruit_entrys.append(entry)
        entry_idx += 1
    LOG.info("Finish Dealing Old List.")
    return campus_entrys, recruit_entrys
def rec_protobuf(self, res):
    if not os.path.isdir(RECORD_DIR):
        os.mkdir(RECORD_DIR)
    rec_file = os.path.join(RECORD_DIR, self.make_file_name())
    file = open(rec_file, 'wb')
    LOG.info("Recording the proto to file [%s]" % (rec_file))
    for protobuf in res:
        # write an unsigned-int length prefix, then the raw protobuf bytes
        arr = array.array('I')
        arr.fromlist([len(protobuf)])
        file.write(arr.tostring())
        file.write(protobuf)
    file.close()
    LOG.info("Recording to [%s] succeeded!" % (rec_file))
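# A minimal reader sketch for the record file written by rec_protobuf()
# above: each record is a native-endian unsigned-int length prefix followed
# by that many bytes of serialized protobuf. The helper name load_protobufs
# is an assumption, not part of the original module.
import array

def load_protobufs(rec_file):
    protobufs = []
    file = open(rec_file, 'rb')
    try:
        while True:
            arr = array.array('I')
            size = arr.itemsize  # matches the prefix width rec_protobuf wrote
            head = file.read(size)
            if len(head) < size:
                break
            arr.fromstring(head)
            # read exactly arr[0] bytes of protobuf payload
            payload = file.read(arr[0])
            if len(payload) < arr[0]:
                break  # truncated record: stop rather than return garbage
            protobufs.append(payload)
    finally:
        file.close()
    return protobufs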
def rm_college_from_loc(self, entrys):
    LOG.info("Dealing with Location field to remove college from loc")
    count = 0
    for entry in entrys:
        info_type = entry.get(INFO_TYPE)
        if info_type == 0:
            continue
        college = entry.get(COLLEGE_NAME)
        loc = entry.get(MEETING_LOCATION)
        # guard against missing fields before the prefix check
        if college is None or loc is None:
            continue
        if loc.startswith(college):
            count += 1
            entry[MEETING_LOCATION] = loc.replace(college, '')
    LOG.info("Removed [%s] college names from the meeting_location field!" % count)
    return entrys
def get_entrys(self, seconds_before=1800, is_full=True):
    """
    Get data from the content base that was updated between
    `seconds_before` seconds ago and now.
    """
    LOG.info("Begin To Request Data From DMG.")
    entrys = []
    # when is_full is True, fetch all the entries from DMG
    if is_full:
        now_str = None
        before_str = None
    else:
        now_str, before_str = self.make_time(seconds_before)
    self.construct_args(before_str, now_str, is_full)
    resp = self.send_request()
    if resp is not None:
        entrys = self.deal_resp(resp)
    return entrys
def serilize_entrys(self, entrys):
    protobufs = []
    LOG.info("Begin Serializing Entries, [%s] Entries To Be Serialized!"
             % (len(entrys)))
    for entry in entrys:
        mdoc = merged_doc_pb2.MergedDocInfo()
        new_entry = self.convertor.dmg_to_proto_entry(entry)
        for key, value in new_entry.items():
            try:
                # repeated protobuf fields take extend(); scalar fields take setattr()
                if isinstance(value, list):
                    getattr(mdoc, key).extend(value)
                else:
                    setattr(mdoc, key, value)
            except Exception, e:
                LOG.error("[%s]:%s" % (key, value))
                LOG.error(e)
        protobuf = mdoc.SerializeToString()
        protobufs.append(protobuf)
    return protobufs
def load_columns(self, filepath=None):
    columns_list = []
    try:
        dom_tree = ElementTree.parse(filepath)
        root = dom_tree.getroot()
        columns = root.findall("field")
        if columns is None or len(columns) == 0:
            LOG.error("No columns found in xml conf [%s]" % (filepath))
            sys.exit(-1)
        columns_count = 0
        for column in columns:
            columns_list.append(column.text)
            columns_count += 1
        LOG.info("Total Load [%d] columns in conf [%s]"
                 % (columns_count, filepath))
    except Exception as e:
        LOG.error(e)
        sys.exit(-1)
    return columns_list
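# A hypothetical conf file shape that load_columns() above can parse: the
# parser only requires <field> children whose text is a column name. The
# root tag and the concrete field names here are illustrative assumptions.
#
# <fields>
#     <field>company_name</field>
#     <field>recruit_title</field>
#     <field>meeting_time</field>
# </fields>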
def write_entrys(entrys, filepath):
    file = open(filepath, 'w')
    count = 0
    for entry in entrys:
        if isinstance(entry, dict):
            for key, value in entry.items():
                # strip newlines/tabs so one entry stays on one line
                value = str(value).replace('\n', '').replace('\t', '')
                file.write("%s\t" % value)
            file.write("\n")
            count += 1
        if isinstance(entry, list):
            for value in entry:
                value = str(value).replace('\n', '').replace('\t', '')
                file.write("%s***\t" % value)
            file.write("\n")
            count += 1
    file.close()
    LOG.info("Wrote [%s] entries" % (count))
def index_list_string(self, protobufs):
    if not check_list(protobufs):
        LOG.info("Do Not Call Indexing, For the input list is:%s" % (protobufs))
        return
    count = 0
    try:
        self.transport.open()
        is_ready = self.start_index()
        if not is_ready:
            LOG.error("Index Server Is Not Ready!")
            return 0
        count = self.client.put_list_string(protobufs)
        LOG.info("[%s] Entries Have Been Successfully Indexed." % (count))
        self.stop_index()
        self.transport.close()
    except Exception, e:
        LOG.error(e)
    return count
def write_ids_to_rec(self):
    LOG.info("Recording Failed IDs Into File [%s]" % (FAILED_INFOID_REC))
    if len(self.failed_ids) == 0:
        return
    # make sure the record file's directory exists; create it if not
    dir = os.path.dirname(FAILED_INFOID_REC)
    file = None
    if not os.path.isdir(dir):
        os.mkdir(dir)
    try:
        file = open(FAILED_INFOID_REC, 'w')
        file.write(";".join([str(id) for id in self.failed_ids]))
        LOG.info("Recorded a total of [%s] entries successfully"
                 % (len(self.failed_ids)))
    except Exception as e:
        LOG.error(e)
    finally:
        if file:
            file.close()
def submit_to_RB(self, entrys, posturl=POSTURL):
    """
    Submit data to DMG.
    """
    failed_count = 0
    suc_count = 0
    failed_ids = []
    failed_entrys = []
    count = 0
    state = ""
    res = None
    for entry in entrys:
        count += 1
        info_id = entry[INFO_ID]
        entry_json = self.construct_json_data(entry)
        resjson = {"resjson": json.dumps(entry_json)}
        try:
            state = self.post(posturl, resjson)
            res = json.loads(state)['result']
            if res == 0:
                LOG.warning("[%s] Submitted Error!" % (info_id))
                LOG.warning(state)
                failed_count += 1
            if res == 1:
                suc_count += 1
        except Exception as e:
            # only transport-level failures are queued for resubmission;
            # server rejections (result 0) are just counted
            LOG.error(e)
            failed_count += 1
            LOG.error(state)
            failed_ids.append(info_id)
            failed_entrys.append(entry)
        if (suc_count + 1) % 1000 == 0 or (failed_count + 1) % 100 == 0:
            LOG.info("Post Entries To DMG, Suc:[%s] Failed:[%s]"
                     % (suc_count, failed_count))
    self.failed_ids = failed_ids
    self.failed_entrys = failed_entrys
    LOG.info("Successfully Submitted To DMG [%s], Failed:[%s]"
             % (suc_count, failed_count))
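# A minimal sketch of the post() helper that submit_to_RB() calls above,
# assuming it form-encodes the payload dict and returns the response body
# as a string; the real method may add headers, timeouts, or retries.
import urllib
import urllib2

def post(self, url, data):
    req = urllib2.Request(url, urllib.urlencode(data))
    resp = urllib2.urlopen(req)
    return resp.read()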
def get_term_info(res):
    socket = TSocket.TSocket(host, port)
    transport = TTransport.TBufferedTransport(socket)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = TextProcessServer.Client(protocol)
    transport.open()
    file = open("output.protobuffer", 'wb')
    resps = client.word_seg_list(res)
    for resp in resps:
        # write an unsigned-int length prefix followed by the raw response bytes
        arr = array.array('I')
        length = len(resp)
        arr.fromlist([length])
        LOG.info("The response length is : [%s]" % length)
        file.write(arr.tostring())
        file.write(resp)
    file.close()
    transport.close()
    return resps
def repair_data(self, entrys, cmp_entrys=None):
    if cmp_entrys is None:
        cmp_entrys = self.cmp_entrys
    LOG.info("Repairing Data...")
    LOG.info("Entries to repair: [%s], cmp_entrys size: [%s]"
             % (len(entrys), len(cmp_entrys)))
    # index the complete entries by origin_url
    cmple_info_dict = collections.defaultdict(dict)
    for entry in cmp_entrys:
        origin_url = entry.get(ORIGIN_URL)
        if origin_url is not None:
            cmple_info_dict[origin_url].update(entry)
    # fill missing fields from the complete entry with the same origin_url
    for entry in entrys:
        origin_url = entry.get(ORIGIN_URL)
        if origin_url in cmple_info_dict:
            for clm in cmple_info_dict[origin_url]:
                if entry.get(clm) is None:
                    entry[clm] = cmple_info_dict[origin_url][clm]
    return entrys
def deal_new_list(self, cmp_columns, entrys):
    # split entries into campus-talk data and job-fair data
    campus_entrys = []
    recruit_entrys = []
    fields_set = set()
    title_set = set()
    LOG.info("Dealing New List...")
    entry_idx = 0
    key_fields = cmp_columns + [ORIGIN_URL]
    for entry in entrys:
        if entry_idx % 5000 == 0:
            LOG.info("Dealing new_entrys In Progress:[%s]" % entry_idx)
        # campus-talk data: dedup same-time entries by their key fields
        if entry[INFO_TYPE] == INFO_TYPE_CAM:
            key = make_key(entry, key_fields)
            if key not in fields_set:
                fields_set.add(key)
                # handle pages (e.g. yingjiesheng bulk postings) where one
                # page corresponds to several pieces of information
                entry = self.deal_special_url(entry)
                campus_entrys.append(entry)
        # job-fair data: dedup by recruit title
        else:
            key = entry.get(RECRUIT_TITLE)
            if key not in title_set:
                title_set.add(key)
                recruit_entrys.append(entry)
        entry_idx += 1
    LOG.info("Finish Dealing New List.")
    return campus_entrys, recruit_entrys
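# A minimal sketch of the make_key() helper used above, assuming it joins
# the entry's values for the given key fields into a single dedup key; the
# real implementation may normalize or hash the values differently.
def make_key(entry, key_fields):
    parts = []
    for field in key_fields:
        value = entry.get(field)
        # use an empty string for missing fields so keys stay comparable
        parts.append(u"" if value is None else unicode(value))
    return u"|".join(parts)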
def filter_illegal(self, entrys):
    LOG.info("Begin to filter illegal entrys..")
    not_null_fields = [ORIGIN_URL, ORIGIN_WEBSITE_NAME, RECRUIT_TITLE]
    legal_entrys = []
    illegal_entrys = []
    while len(entrys) > 0:
        entry = entrys.pop()
        info_type = entry.get(INFO_TYPE)
        flag = True
        # every entry must carry the not-null fields
        for field in not_null_fields:
            if entry.get(field) is None:
                illegal_entrys.append(entry)
                flag = False
                break
        # info_type 0 additionally requires a release date
        if info_type == 0 and flag:
            if entry.get(RELEASE_DATE) is None:
                illegal_entrys.append(entry)
                flag = False
        # info_type 1 additionally requires a meeting time
        if info_type == 1 and flag:
            if entry.get(MEETING_TIME) is None:
                illegal_entrys.append(entry)
                flag = False
        if flag:
            legal_entrys.append(entry)
    LOG.info("Finish filtering entrys. [%s] entrys are illegal" % (len(illegal_entrys)))
    db_illegal = DB_1
    table_illegal = TB_ILLEGAL
    LOG.info("Insert illegal entrys into [%s.%s]" % (db_illegal, table_illegal))
    fields = list(INFO_FIELDS)
    fields.remove(GROUP_ID)
    fields.extend(['author', 'tmp_path'])
    self.db_helper.batch_insert(illegal_entrys, table_illegal, db_illegal, fields)
    return legal_entrys
def deal_resp(self, resp):
    entrys = resp.strip('\n').split("\n")
    new_entrys = []
    LOG.info("[%s] Entries We Got In Total From DMG" % len(entrys))
    for entry in entrys:
        new_entry = {}
        try:
            entry = json.loads(entry)
            specials = entry.get('specials')
            origins = entry.get('contentOrigns')
            if origins is not None:
                origins = origins[0]
            if specials is not None:
                new_entry.update(specials)
            if origins is not None:
                new_entry.update(origins)
            #LOG.debug(new_entry)
            new_entry = self.convertor.dmg_to_db(new_entry)
        except Exception, e:
            LOG.debug(e)
            LOG.debug("broken json is:[%s]" % entry)
        if len(new_entry) != 0:
            new_entrys.append(new_entry)
    return new_entrys
def extract(recruit_title):
    if not recruit_title:
        return recruit_title
    # keep a copy of the original title
    origin_recruit_title = recruit_title.strip()
    recruit_title = pre_clean(recruit_title)
    if len(recruit_title) < 2:
        # if cleaning left nothing, fall back to the original title
        recruit_title = origin_recruit_title
    recruit_title = recruit_title.strip(u',、-—')
    # try the tail patterns from most to least specific
    match_result = pat_company_pre_tail.search(recruit_title)
    if match_result is not None:
        LOG.info('extract %s by pat_company_pre_tail' % (recruit_title))
        return match_result.group(0)
    match_result = pat_company_tail.search(recruit_title)
    if match_result is not None:
        return match_result.group(0)
    match_result = pat_company_tail_sec.search(recruit_title)
    if match_result is not None:
        return match_result.group(0)
    match_result = pat_company_tail_third.search(recruit_title)
    if match_result is not None:
        return match_result.group(0)
    # matches "...xx中心" (center) style names
    match_result = pat_company_tail_forth.search(recruit_title)
    if match_result is not None:
        return match_result.group(0)
    company_name = clean(recruit_title)
    if len(company_name) < 2:
        return recruit_title
    return company_name
def word_seg_list(self, protobufs):
    res = []
    if not check_list(protobufs):
        LOG.info("Do Not Call Word Segging, For the input list is:%s" % (protobufs))
        return res
    try:
        self.transport.open()
        LOG.info("Begin RPC Word Segging, [%s] To Be Segged!" % (len(protobufs)))
        res = self.client.word_seg_list(protobufs)
        self.transport.close()
        LOG.info("Finish RPC Word Segging, [%s] Entries Have Been Segged!" % (len(res)))
        self.rec_protobuf(res)
    except Exception, e:
        LOG.error(e)
    return res
def resubmit_failed_data(self, url=POSTURL, retry_time=3):
    """
    Resubmit the failed data to RB. Up to `retry_time` attempts are made,
    sleeping one second before each new try. Whatever still fails after
    that will be submitted again on the next program run.
    """
    db_helper = DBHelper()
    try_count = 0
    while len(self.failed_ids) > 0 and try_count < retry_time:
        time.sleep(1)
        LOG.info("Resubmit Retry Round [%s]." % try_count)
        LOG.info("[%s] Entries Are To Be Re-Submitted" % (len(self.failed_ids)))
        self.submit_to_RB(self.failed_entrys, url)
        try_count += 1
    LOG.info("Finish Resubmitting, [%s] Remain Unsubmitted." % len(self.failed_ids))
    if len(self.failed_ids) != 0:
        self.write_ids_to_rec()
def rm_field(self, entrys, field):
    LOG.info("Removing Field: [%s]" % (field))
    for entry in entrys:
        # delete the field whenever the key is present, even for falsy values
        if field in entry:
            del entry[field]
def diff_tables(self, table_old, table_new, db_old, db_new):
    LOG.info("Begin To Remove Duplicate entrys...")
    entrys = []
    group_checker = GroupChecker()
    # group_id_offset = db_helper.get_max_group_id(table_old, db_old)
    group_id_offset = 0
    group_checker.set_group_id_offset(group_id_offset)
    # the key fields loaded here are the basis for duplicate detection
    cmp_columns = self.load_columns(KEY_FIELDS_PATH)
    # published data additionally needs the group_id field
    old_columns = cmp_columns + [ORIGIN_URL, GROUP_ID, RECRUIT_TITLE]
    old_list = self.get_list_fromDB(old_columns, table_old, db=db_old, isdict=True)[:]
    LOG.info("Get Published Data From Table[%s.%s] [%s] " % (db_old, table_old, len(old_list)))
    # newly crawled data additionally needs info_id, which uniquely identifies an entry
    new_columns = cmp_columns + [INFO_ID, ORIGIN_URL, RECRUIT_TITLE]
    new_list = self.get_list_fromDB(new_columns, table_new, db=db_new, isdict=True)[:]
    LOG.info("Get New Data From Table[%s.%s] [%s] " % (db_new, table_new, len(new_list)))
    # preprocess the new data: classify it and do a first-pass dedup
    cam_entrys_new, rec_entrys_new = group_checker.deal_new_list(cmp_columns, new_list)
    # classify the published data
    cam_entrys_old, rec_entrys_old = group_checker.deal_old_list(cmp_columns, old_list)
    # dedup the campus-talk data; returns the deduped info_ids mapped to their group_ids
    cam_updates_infoids_dict = group_checker.deal_campus_list(cmp_columns, cam_entrys_new, cam_entrys_old)
    LOG.info("In total we got [%s] updated campus entry_ids" % len(cam_updates_infoids_dict.keys()))
    # dedup the job-fair data; returns its info_id list
    rec_updates_infoids = group_checker.get_rec_updates_ids(rec_entrys_new, rec_entrys_old)
    LOG.info("In total we got [%s] updated recruit entry_ids" % len(rec_updates_infoids))
    # fetch all attributes of the campus-talk entries by info_id
    cam_updates_infoids = cam_updates_infoids_dict.keys()
    cam_full_entrys = self.fetch_data_by_ids(cam_updates_infoids, table_new, db_new, isdict=True)
    LOG.info("Updated Campus Entrys No. Is [%s]" % (len(cam_full_entrys)))
    # attach group_id info to the deduped new data
    group_checker.add_cam_groupid_info(cam_full_entrys, cam_updates_infoids_dict)
    # fetch all attributes of the job-fair entries by info_id
    rec_full_entrys = self.fetch_data_by_ids(rec_updates_infoids, table_new, db_new, isdict=True)
    group_checker.add_rec_groupid_info(rec_full_entrys)
    LOG.info("Updated Recruit Entrys No. Is [%s]" % (len(rec_full_entrys)))
    # the final set of updated data
    entrys = cam_full_entrys + rec_full_entrys
    # filter out illegal entries
    entrys = self.filter.deal_entrys(entrys)
    LOG.info("Finish Removing Duplicate Entrys!")
    LOG.info("In Total We Got Updated Entrys No.: [%s]." % len(entrys))
    return entrys
def start_index(self):
    LOG.info("Prepare To Index")
    res = self.client.send_start_sig()
    if res:
        LOG.info("Server %s Get Ready To Index" % (self.host))
    return res
def stop_index(self):
    LOG.info("Stop Index")
    self.client.send_stop_sig()
def serilize_entrys(entrys):
    res = []
    for entry in entrys:
        mdoc = merged_doc_pb2.MergedDocInfo()
        for key, value in entry.items():
            # skip empty values
            if value is None:
                continue
            # longs read from the database become ints, except for info_id
            if isinstance(value, long):
                if key != INFO_ID:
                    value = int(value)
            # datetimes become int timestamps
            if isinstance(value, datetime.datetime):
                value = get_timestamp(value)
            # map recruit_cities names to their numeric ids
            if key == RECRUIT_CITIES:
                for city in value.strip().split(","):
                    if len(city) > 0:
                        id = city_id.get(city)
                        if id is None:
                            LOG.error("City Name [%s] Not Found!" % city)
                        else:
                            mdoc.recruit_cities.append(id)
                continue
            # map collegeName to its numeric id
            if key == COLLEGE_NAME:
                value = value.strip()
                if len(value) > 0:
                    id = college_id.get(value)
                    if id is None:
                        LOG.error("College Name [%s] Not Found!" % value)
                        missed_colleges.add(value)
                        # default id for unknown colleges
                        mdoc.college_ID = 1
                    else:
                        # fill the college_ID field of the protobuf
                        mdoc.college_ID = id
                continue
            # split work_place into its repeated field
            if key == 'work_place':
                for place in value.strip().split(","):
                    if len(place) > 0:
                        mdoc.work_place.append(place)
                LOG.info(";".join(mdoc.work_place))
                continue
            try:
                # term-weight computation needs gbk-encoded values
                if isinstance(value, str):
                    value = value.decode('utf-8').encode('gbk')
                setattr(mdoc, key, value)
            except Exception, e:
                LOG.error(e)
                LOG.error("key is [%s] and value is [%s]" % (key, value))
        # the sign and group fields are filled with placeholders here
        mdoc.origin_url_sign = 0
        mdoc.recruit_title_sign = 0
        mdoc.info_text_sign = 0
        mdoc.group_id = 1
        serilized = mdoc.SerializeToString()
        res.append(serilized)
    return res
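# A minimal sketch of the get_timestamp() helper used above, assuming it
# converts a local datetime into an int epoch timestamp for the protobuf
# field; the real helper may treat time zones differently.
import time

def get_timestamp(value):
    # seconds since the epoch, truncated to an int
    return int(time.mktime(value.timetuple()))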