def load_city_map(filepath):
    """Load a tab-separated city list file into a {name: city_id} dict.

    Each line is ``id<TAB>name<TAB>alias1;alias2;...``; the primary name and
    every non-empty alias map to the same integer id.  Returns an empty dict
    when the file is missing or unreadable.
    """
    city_id_map = {}
    try:
        # 'with' guarantees the handle is closed (the original leaked it).
        with codecs.open(filepath, encoding='utf-8') as city_file:
            lines = city_file.readlines()
    except Exception as e:
        LOG.error("Doesn't find the city list file!")
        LOG.error(e)
        return city_id_map
    for line in lines:
        line = line.strip("\n")
        # fields: id, city name, aliases (was a Chinese comment: 编号,城市名,别名)
        (city_id, name, alias) = line.split("\t")
        city_id = int(city_id)
        # primary name first, then every ';'-separated alias
        names = [name] + alias.split(";")
        for candidate in names:
            candidate = candidate.strip('\"')
            if len(candidate.strip()) == 0:
                continue
            city_id_map.update({candidate: city_id})
    return city_id_map
def deal_data(self,raw_data):
    """Convert raw per-site rows into report entries.

    Each row is expected as (site, tt, cam_tt, rec_tt, fut_cam_tt); the site
    key is mapped to a college name via self.site_college_dict (falling back
    to the raw site key when unmapped).  Per-column totals are accumulated.

    NOTE(review): the totals and ``entrys`` are built but neither returned
    nor used in the visible code — the function may continue beyond this
    excerpt; confirm against the full source.
    """
    entrys = []
    # running totals over all rows
    tt_sum = 0
    cam_tt_sum = 0
    rec_tt_sum = 0
    fut_cam_tt_sum = 0
    for row in raw_data:
        try:
            entry = []
            site = row[0]
            tt = row[1]
            cam_tt = row[2]
            rec_tt = row[3]
            fut_cam_tt = row[4]
            tt_sum += tt
            cam_tt_sum += cam_tt
            rec_tt_sum += rec_tt
            fut_cam_tt_sum += fut_cam_tt
            college = self.site_college_dict.get(site)
            if college is None:
                # unmapped site: log it and keep the raw key as the label
                LOG.error('[%s] name not found!'%site)
                college = site
            entry.append(college)
            entry.append(tt)
            entry.append(cam_tt)
            entry.append(rec_tt)
            entry.append(fut_cam_tt)
            entrys.append(entry)
        except Exception,e:
            # a malformed row (e.g. too few columns) is skipped, not fatal
            LOG.error(e)
def analyse_data(entrys):
    """Print groups of entries whose company names look like duplicates.

    For each pair (i, j>i) the names are considered the same company when one
    contains the other or StrSim.get_sim exceeds 0.8; matching info ids are
    printed on one comma-separated line per entry i.

    NOTE(review): ``c_name_map``, ``c_name_infoid_list`` and the initial
    ``i``/``j`` assignments are unused in the visible code, and nothing is
    returned — this may be a truncated excerpt; confirm against the full file.
    """
    c_name_map = collections.defaultdict(list)
    c_name_infoid_list = []
    i = 0
    j = 0
    # O(n^2) pairwise comparison over all entries
    for i in range(len(entrys)):
        j = i + 1
        name_i = str(entrys[i][COMPANY_NAME])
        id_i = str(entrys[i][INFO_ID])
        # skip entries with an empty or literal-"None" company name
        if name_i.strip() == "" or name_i == "None":
            continue
        if i % 10 == 0:
            LOG.info("[%s] has been dealed!"%i)
        print "[%s:%s]"%(name_i,id_i),id_i + ",",
        # only compare with later entries to avoid duplicate pairs
        for j in range(len(entrys))[i+1:]:
            name_j = str(entrys[j][COMPANY_NAME])
            if name_j.strip() == "" or name_j == "None":
                continue
            id_j = str(entrys[j][INFO_ID])
            # containment either way, or similarity above 0.8, counts as a match
            if name_j in name_i or name_i in name_j or StrSim.get_sim(name_i,name_j) > 0.8:
                #print "(%s:%s)"%(name_j,id_j),'\t',
                print id_j + ",",
        print
def deal_data(self,raw_data):
    """Build report entries with a refined/full success rate per site.

    Each row is (site, full_tt, refined); the site is mapped to a college via
    self.site_college_dict (raw key kept when unmapped) and a success rate
    refined/full_tt is appended via self.get_rate.

    NOTE(review): ``full_sum``/``refined_sum`` and ``entrys`` are built but
    neither returned nor used here — likely a truncated excerpt; confirm.
    """
    entrys = []
    full_sum = 0
    refined_sum = 0
    for row in raw_data:
        try:
            entry = []
            site = row[0]
            full_tt = row[1]
            refined = row[2]
            full_sum += full_tt
            refined_sum += refined
            college = self.site_college_dict.get(site)
            if college is None:
                LOG.error('[%s] name not found!'%site)
                college = site
            entry.append(college)
            entry.append(full_tt)
            entry.append(refined)
            # full_tt is assumed never zero here, so no divide-by-zero guard
            suc_rate = self.get_rate(refined,full_tt)
            entry.append(suc_rate)
            entrys.append(entry)
        except Exception,e:
            # malformed rows are logged and skipped
            LOG.error(e)
def deal_data(self,raw_data):
    """Build report entries with an error-adjusted success rate per site.

    Each row is (site, site_tt, err_1, err_2); the success rate is
    (site_tt - err_1 - err_2) / site_tt via self.get_rate.

    NOTE(review): the accumulated sums and ``entrys`` are neither returned
    nor used in the visible code — likely a truncated excerpt; confirm.
    """
    entrys = []
    site_tt_sum = 0
    err_1_sum = 0
    err_2_sum = 0
    for row in raw_data:
        try:
            entry = []
            site = row[0]
            site_tt = row[1]
            err_1 = row[2]
            err_2 = row[3]
            site_tt_sum += site_tt
            err_1_sum += err_1
            err_2_sum += err_2
            college = self.site_college_dict.get(site)
            if college is None:
                LOG.error('[%s] name not found!'%site)
                college = site
            entry.append(college)
            entry.append(site_tt)
            entry.append(err_1)
            entry.append(err_2)
            # site_tt is assumed never zero here, so no divide-by-zero guard
            suc_rate = self.get_rate(site_tt - err_1 - err_2,site_tt)
            entry.append(suc_rate)
            entrys.append(entry)
        except Exception,e:
            # malformed rows are logged and skipped
            LOG.error(e)
def load_lasttime_failed_entrys(self):
    """Reload the DB entries whose info-ids were recorded as failed last run.

    Returns [] when no failed ids were recorded.
    """
    LOG.info("Loading lasttime failed entrys")
    failed_ids = self.load_ids_from_rec()
    if not failed_ids:
        return []
    helper = DBHelper()
    entrys = helper.get_data_by_infoids(failed_ids, self.table,
                                        self.db, isdict=True)
    LOG.info("Loaded [%s] lasttime failed entrys" %(len(entrys)))
    return entrys
def make_key(entry, key_fields=None):
    """Build a dedup key for *entry* by concatenating the string values of
    key_fields; fields that are absent or None are skipped.  When key_fields
    is None a warning is logged and the default comparison fields
    (meeting time/location + recruit url) are used."""
    if key_fields is None:
        LOG.warning("Compared Fileds Are None!")
        key_fields = [MEETING_TIME, MEETING_LOCATION, RECRUIT_URL]
    parts = [str(entry[field]) for field in key_fields
             if entry.get(field) is not None]
    return "".join(parts)
def update_info_table(entrys,table=T_info,database=DB_1):
    """Insert every entry into database.table, logging how many inserts
    succeeded (insert_entry returning a non-None id counts as success)."""
    if not entrys:
        return
    LOG.info("Begin to update to table [%s.%s]"%(database,table))
    inserted = 0
    for entry in entrys:
        #del entry[INFO_ID]
        new_id = db_helper.insert_entry(entry,table,database)
        # a non-None id means the row was inserted
        if new_id is not None:
            inserted += 1
    LOG.info("Successfully insert [%s] Entrys To Table."%inserted)
def load_ids_from_rec(self):
    """Read the failed-info-id record file and return the ids as a list
    (the file content split on ';'); returns [] when the file cannot be
    read."""
    info_ids = []
    rec = None
    try:
        rec = open(FAILED_INFOID_REC,'r')
        info_ids = rec.read().split(";")
    except Exception as e:
        LOG.error(e)
    finally:
        # close the handle whether or not the read succeeded
        if rec:
            rec.close()
    return info_ids
def map_job_type(self, value):
    """Map a job-type label to its numeric id: 兼职 (part-time) -> 1,
    实习 (internship) -> 2, 全职 (full-time) -> 0.  Empty input returns 0;
    any other non-empty value is logged as invalid and also maps to 0."""
    label_to_id = {"兼职": 1, "实习": 2, "全职": 0}
    value = value.strip()
    if len(value) == 0:
        return 0
    try:
        return label_to_id[value]
    except KeyError:
        LOG.error("The Job Type Value is not valid,[%s]" % (value))
        return 0
def dumpjson_to_file(self, item):
    """Append str(item) as one line to a timestamped file in self.output_dir.

    The file name is 'json_<month>_<day>_<HHMM>', so items dumped within the
    same minute share one file (opened in append mode).  On IOError the error
    is logged and the process exits, matching the original fail-fast
    behaviour.
    """
    res = str(item)
    output_file = "json_" + datetime.datetime.now().strftime("%m_%d_%H%M")
    output = os.path.join(self.output_dir, output_file)
    try:
        # 'with' closes the handle even when a write raises (the original
        # leaked the handle in that case); 'as e' replaces the py2-only
        # 'except IOError, e' spelling.
        with open(output, "a") as json_file:
            json_file.write(res)
            json_file.write("\n")
    except IOError as e:
        LOG.error(e)
        sys.exit(-1)
def send_request(self,retry_time=3):
    """Fetch self.url with urllib2, retrying up to retry_time times with a
    1-second pause between attempts.

    NOTE(review): the response body is read into ``res`` but the visible code
    never returns it — the function likely continues beyond this excerpt
    (callers such as get_entrys use its return value); confirm against the
    full source.
    """
    count = 0
    res = None
    resp = None
    while(count < retry_time):
        LOG.info("Send Request Round: [%s]" %(count))
        try:
            resp = urllib2.urlopen(self.url)
            res = resp.read()
            # success: stop retrying
            break
        except Exception,e:
            # log, back off briefly, and try again
            LOG.error(e)
            time.sleep(1)
            count += 1
def deal_old_list(self, cmp_columns, entrys):
    """Split previously-stored entries into (campus_entrys, recruit_entrys)
    by INFO_TYPE.  cmp_columns is accepted for signature parity with
    deal_new_list but is not used here."""
    LOG.info("Dealing Old List...")
    # campus-talk records on one side, everything else on the other
    campus_entrys = [e for e in entrys if e[INFO_TYPE] == INFO_TYPE_CAM]
    recruit_entrys = [e for e in entrys if e[INFO_TYPE] != INFO_TYPE_CAM]
    LOG.info("Finish Dealing Old List.")
    return campus_entrys, recruit_entrys
def insert_to_campusTalk(entrys):
    """Store campus-talk entries across multiple tables.

    For each entry the college and company sub-records are inserted first
    (via insert_subtable); their ids plus the entry's campusTalk-mapped
    fields (per ct_field_map) form the row inserted into T_campusTalk.
    Returns the id of the last campusTalk insert, or None when a sub-table
    insert fails (processing stops at the first failure, so earlier inserts
    are NOT rolled back).
    """
    if len(entrys) == 0:
        return
    LOG.info("Startging to Store to multi tables")
    table = T_campusTalk
    id = None
    count = 0
    for entry in entrys:
        # progress heartbeat every 2000 entries
        if count % 2000 == 0:
            LOG.info("[%s] Has been inserted!"%count)
        collegeID = insert_subtable(entry,T_collegeInfo)
        if collegeID is None:
            # abort the whole batch on the first sub-table failure
            LOG.error("INSERT TO [CollegeInfo] FAILS:%s"%("|".join(entry.values())))
            return None
        campanyID = insert_subtable(entry,T_companyInfo)
        if campanyID is None:
            LOG.error("INSERT TO [CampanyInfo] FAILS:%s"%("|".join(entry.values())))
            return None
        # build the campusTalk row: foreign keys + mapped fields
        new_entry = {}
        new_entry[COLLEGE_ID] = collegeID
        new_entry[COMPANY_ID] = campanyID
        for key,value in entry.items():
            if value is None:
                continue
            # ct_field_map maps a source field to (target table, target column)
            if ct_field_map[key][0] == table:
                new_entry[ct_field_map[key][1]] = value
        if len(new_entry.keys()) > 0:
            id = db_helper.insert_entry(new_entry,table)
        count += 1
    return id
def dmg_to_proto_entry(self, entry):
    """Normalise a DMG entry's values for protobuf serialisation.

    Converts int-like fields to int, COLLEGE_NAME to a COLLEGE_ID (stored in
    new_entry), city/job/company-type fields via the map_* helpers,
    WORK_PLACE to a list, and time fields to timestamps.

    NOTE(review): the visible code builds ``new_entry`` but never stores the
    converted ``value`` into it nor returns it — this excerpt is almost
    certainly truncated; confirm against the full source.
    NOTE(review): the branch structure looks suspect: the final
    ``elif key == LAST_MOD_TIME`` can never run because LAST_MOD_TIME is in
    convert_time_keys and is caught by the preceding elif; and the bare
    ``if key == JOB_TYPE`` starts a new chain, so WORK_PLACE/time handling
    is skipped for COMPANY_TYPE keys only — verify this is intended.
    """
    new_entry = {}
    # fields that must be coerced to int
    convert_int_keys = set([INFO_ID, GROUP_ID, HAS_HUKOU, HAS_EXAM, HAS_RESUME, INFO_TYPE, CLICK_RATE])
    # fields that must be converted from DATEFORMAT strings to timestamps
    convert_time_keys = set(
        [MEETING_TIME, RELEASE_DATE, RESUME_START_DATE, RESUME_END_DATE, EXAM_TIME, INTERVIEW_TIME, LAST_MOD_TIME]
    )
    for key, value in entry.items():
        # skip missing and blank values
        if value is None:
            continue
        if isinstance(value, basestring):
            if value.strip() == "":
                continue
        # convert to int count:9
        if key in convert_int_keys:
            try:
                value = int(value)
            except ValueError:
                LOG.error("ValueError,key is [%s] value is [%s]" % (key, value))
        # convert to id count:2
        elif key == COLLEGE_NAME:
            college_id = self.map_college(value)
            new_entry[COLLEGE_ID] = college_id
        elif key == RECRUIT_CITIES:
            try:
                value = self.map_cities(value)
            except:
                LOG.error("In Map cities[%s]:%s" % (key, value))
        # convert to type:
        if key == JOB_TYPE:
            value = self.map_job_type(value)
        if key == COMPANY_TYPE:
            value = self.map_company_type(value)
        # convert to list count:1
        elif key == WORK_PLACE:
            value = self.deal_work_place(value)
        # convert time to timestamp count:6
        elif key in convert_time_keys:
            try:
                if not isinstance(value, datetime.datetime):
                    value = datetime.datetime.strptime(value, DATEFORMAT)
                value = get_timestamp(value)
            except Exception, e:
                LOG.error("[%s]:[%s]" % (key, e))
        # last_mod_time has micro-second scale
        elif key == LAST_MOD_TIME:
            try:
                value = datetime.datetime.strptime(value, DATEFORMAT)
                value = get_timestamp(value)
            except Exception, e:
                LOG.error("[%s]:[%s]" % (key, e))
def rm_college_from_loc(self,entrys):
    """Strip a leading college name from each entry's MEETING_LOCATION.

    Entries with info_type == 0 (recruit info, no meeting) are skipped.
    Mutates the entries in place and returns the same list; logs how many
    locations were rewritten.
    """
    LOG.info("Dealing Location field to remove college from loc")
    count = 0
    for entry in entrys:
        info_type = entry.get(INFO_TYPE)
        if info_type == 0:
            continue
        college = entry.get(COLLEGE_NAME)
        loc = entry.get(MEETING_LOCATION)
        # Guard against missing fields: the original raised
        # TypeError/AttributeError when either value was None or absent.
        if not college or not loc:
            continue
        if loc.startswith(college):
            count += 1
            entry[MEETING_LOCATION] = loc.replace(college,'')
    LOG.info("Removing [%s] College Name from meeting_location field!"%count)
    return entrys
def rec_protobuf(self,res):
    """Record each protobuf blob in *res* to a new file under RECORD_DIR.

    Every blob is written as a 4-byte native unsigned-int length prefix
    followed by the payload, so the stream can be re-read record by record.
    """
    if not os.path.isdir(RECORD_DIR):
        os.mkdir(RECORD_DIR)
    rec_file = os.path.join(RECORD_DIR,self.make_file_name())
    LOG.info("Recording the proto to file [%s]" %(rec_file))
    # 'with' closes the handle even if a write raises (the original leaked
    # the open file in that case).
    # NOTE(review): mode 'w' is text mode although the payload is binary —
    # works under py2 on POSIX; confirm before porting to py3/Windows.
    with open(rec_file,'w') as rec:
        for protobuf in res:
            arr = array.array('I')
            arr.fromlist([len(protobuf)])
            rec.write(arr.tostring())
            rec.write(protobuf)
    LOG.info("Recording to [%s] successfully!"%(rec_file))
def is_similar(self, columns, new, old):
    """Return True when *new* and *old* agree on every column in *columns*.

    A column agrees when the values are equal, when one string contains the
    other, or when StrSim similarity reaches THRESHOLD.  A None on exactly
    one side makes the entries dissimilar.
    """
    for key in columns:
        LOG.debug("Comparing[%s],new is [%s],old is [%s]" % (key, new[key], old[key]))
        lhs, rhs = new[key], old[key]
        if lhs == rhs:
            continue
        if lhs is None or rhs is None:
            return False
        # containment either way counts as a match
        if lhs in rhs or rhs in lhs:
            continue
        if StrSim.get_sim(str(lhs), str(rhs)) < THRESHOLD:
            return False
    return True
def load_table(filepath=None):
    """Parse an XML schema file into {table_name: [column, ...]}.

    Expected layout: <root><table><name>..</name><columns><column>..</column>
    ...</columns></table>...</root>.  Any parse error is logged and the
    process exits.

    NOTE(review): ``table_info`` is built but never returned in the visible
    code — the ``return table_info`` is likely beyond this excerpt; confirm
    against the full source.
    """
    try:
        domtree = ElementTree.parse(filepath)
        root = domtree.getroot()
        tables = root.findall('table')
        table_info = {}
        for table in tables:
            columns = []
            table_name = table.find('name').text
            columns_list = table.find('columns').findall('column')
            for column in columns_list:
                columns.append(column.text)
            table_info[table_name] = columns
    except Exception,e:
        # fail fast: the table schema is required for everything downstream
        LOG.error(e)
        sys.exit(-1)
def write_entrys(entrys,filepath):
    """Write entries to *filepath*, one entry per line.

    dict entries: each value is written tab-terminated; list entries: each
    value is written with a '***' suffix and tab-terminated.  Embedded
    newlines/tabs are stripped from values.  Entries that are neither dict
    nor list are skipped (unchanged behaviour).  Logs the number written.
    """
    count = 0
    # 'with' closes the file even if str()/write() raises mid-loop (the
    # original leaked the handle in that case); also avoids shadowing the
    # 'file' builtin.
    with open(filepath,'w') as out:
        for entry in entrys:
            if isinstance(entry,dict):
                for key,value in entry.items():
                    value = str(value).replace('\n','').replace('\t','')
                    out.write("%s\t"%value)
                out.write("\n")
                count += 1
            if isinstance(entry,list):
                for value in entry:
                    value = str(value).replace('\n','').replace('\t','')
                    out.write("%s***\t"%value)
                out.write("\n")
                count += 1
    LOG.info("write [%s] entrys"%(count))
def get_entrys(self,seconds_before=1800,is_full=True):
    """Fetch entries from the content base (DMG).

    When is_full is True every entry is requested; otherwise only those
    updated between seconds_before ago and now.  Returns [] when the
    request yields no response.
    """
    LOG.info("Begin To Requset Data From DMG.")
    if is_full:
        # full dump: no time window at all
        now_str = None
        before_str = None
    else:
        now_str, before_str = self.make_time(seconds_before)
    self.construct_args(before_str, now_str, is_full)
    resp = self.send_request()
    if resp is None:
        return []
    return self.deal_resp(resp)
def get_term_info(res):
    """Segment *res* via the TextProcessServer thrift service.

    The responses are appended to 'output.protobuffer' as 4-byte native
    unsigned-int length prefixes followed by the payload, and the list of
    response blobs is returned.
    """
    socket = TSocket.TSocket(host, port)
    transport = TTransport.TBufferedTransport(socket)
    protocol = TBinaryProtocol.TBinaryProtocol(transport)
    client = TextProcessServer.Client(protocol)
    transport.open()
    try:
        resps = client.word_seg_list(res)
        # 'with' closes the output file even on a failed write; the original
        # leaked both the file and the transport when the RPC raised.
        with open("output.protobuffer",'wb') as out:
            for resp in resps:
                arr = array.array('I')
                length = len(resp)
                arr.fromlist([length])
                LOG.info("The response length is : [%s]"%length)
                out.write(arr.tostring())
                out.write(resp)
    finally:
        transport.close()
    return resps
def repair_data(self,entrys,cmp_entrys=None):
    """Fill missing (None) fields of *entrys* from *cmp_entrys*, matching
    records by ORIGIN_URL.  cmp_entrys defaults to self.cmp_entrys.
    Mutates the entries in place and returns the list."""
    if cmp_entrys == None:
        cmp_entrys = self.cmp_entrys
    LOG.info("repairing Data...")
    LOG.info("Entrys to Repair size is [%s],cmp_entrys size is [%s]"%(len(entrys),len(cmp_entrys)))
    # index the reference entries by origin url; later entries with the
    # same url merge over earlier ones
    by_url = collections.defaultdict(dict)
    for reference in cmp_entrys:
        url = reference.get(ORIGIN_URL)
        if url != None:
            by_url[url].update(reference)
    for entry in entrys:
        url = entry.get(ORIGIN_URL)
        if url not in by_url:
            continue
        reference = by_url[url]
        # copy over only the fields the entry is missing
        for clm in reference:
            if entry.get(clm) is None:
                entry[clm] = reference[clm]
            else:
                entry[clm] = entry[clm]
    return entrys
def filter_illegal(self,entrys):
    """Partition *entrys* into legal and illegal entries.

    Rules: ORIGIN_URL, ORIGIN_WEBSITE_NAME and RECRUIT_TITLE must all be
    present; info_type 0 additionally requires RELEASE_DATE and info_type 1
    requires MEETING_TIME.  The input list is consumed (popped empty).
    Illegal entries are archived into DB_1.TB_ILLEGAL; the legal ones are
    returned.
    """
    LOG.info("Begin to Filter illegal entrys..")
    not_null_fields = [ORIGIN_URL,ORIGIN_WEBSITE_NAME,RECRUIT_TITLE]
    legal_entrys = []
    illegal_entrys = []
    # pop until empty: deliberately drains the caller's list
    while entrys:
        entry = entrys.pop()
        info_type = entry.get(INFO_TYPE)
        ok = all(entry.get(field) is not None for field in not_null_fields)
        if ok and info_type == 0 and entry.get(RELEASE_DATE) is None:
            ok = False
        if ok and info_type == 1 and entry.get(MEETING_TIME) is None:
            ok = False
        if ok:
            legal_entrys.append(entry)
        else:
            illegal_entrys.append(entry)
    LOG.info("Finish filering entrys.[%s] entrys are illegal"%(len(illegal_entrys)))
    db_illegal = DB_1
    table_illegal = TB_ILLEGAL
    LOG.info("Insert illegal Entrys into[%s.%s]"%(db_illegal,table_illegal))
    # archive the rejects with the extra provenance columns
    fields = list(INFO_FIELDS)
    fields.remove(GROUP_ID)
    fields.extend(['author','tmp_path'])
    self.db_helper.batch_insert(illegal_entrys,table_illegal,db_illegal,fields)
    return legal_entrys
def deal_new_list(self, cmp_columns, entrys):
    """Split freshly-crawled entries into (campus_entrys, recruit_entrys).

    Campus-talk entries are de-duplicated by a composite key built from
    cmp_columns + ORIGIN_URL; recruit entries are de-duplicated by their
    title alone.
    """
    campus_entrys = []
    recruit_entrys = []
    seen_keys = set()
    seen_titles = set()
    LOG.info("Dealing New List...")
    key_fields = cmp_columns + [ORIGIN_URL]
    for idx, entry in enumerate(entrys):
        if idx % 5000 == 0:
            LOG.info("Dealing new_entrys In Progress:[%s]" % idx)
        if entry[INFO_TYPE] == INFO_TYPE_CAM:
            # campus talk: dedup entries sharing the same composite key
            key = make_key(entry, key_fields)
            if key not in seen_keys:
                seen_keys.add(key)
                # split pages that carry several records in one url
                entry = self.deal_special_url(entry)
                campus_entrys.append(entry)
        else:
            # recruit info: dedup by title
            title = entry.get(RECRUIT_TITLE)
            if title not in seen_titles:
                seen_titles.add(title)
                recruit_entrys.append(entry)
    LOG.info("Finish Dealing New List.")
    return campus_entrys, recruit_entrys
def word_seg_list(self,protobufs):
    """Send *protobufs* to the word-segmentation RPC service and record the
    segmented results to disk via self.rec_protobuf.

    Returns [] early when the input fails check_list.

    NOTE(review): on the success path nothing is returned after the try
    block, so callers get None — the trailing ``return res`` is likely
    beyond this excerpt; confirm against the full source.
    """
    res = []
    if not check_list(protobufs):
        LOG.info("Do Not Call Word Segging,For the input list is:%s"%(protobufs))
        return res
    try:
        self.transport.open()
        LOG.info("Begin RPC Word Segging,[%s] To Be Segged!" %(len(protobufs)))
        res = self.client.word_seg_list(protobufs)
        self.transport.close()
        LOG.info("Finish RPC Word Segging,[%s] Entrys Have Been Segged!" %(len(res)))
        # persist the segmented blobs for replay/debugging
        self.rec_protobuf(res)
    except Exception,e:
        LOG.error(e)
def deal_data(self, raw_data):
    """Build per-domain report entries with a response/request success rate.

    Each row is (domain, rq, rp, uniq_url); the domain is mapped to a
    college via self.domain_college_dict (raw key kept when unmapped) and
    rp/rq is appended via self.get_rate.

    NOTE(review): ``rows = self.prepare_data()`` is never used, and the
    function ends right after computing ``suc_rate_aver`` without returning
    anything — likely a truncated excerpt; confirm against the full source.
    """
    entrys = []
    rq_sum = 0
    rp_sum = 0
    uniq_url_sum = 0
    rows = self.prepare_data()
    for row in raw_data:
        # TODO try catch the index exceed the row length
        try:
            entry = []
            dm = row[0]
            rq = row[1]
            rp = row[2]
            uniq_url = row[3]
            rq_sum += int(rq)
            rp_sum += int(rp)
            uniq_url_sum += int(uniq_url)
            college = self.domain_college_dict.get(dm)
            if college is None:
                LOG.error("[%s] domain not found" % dm)
                college = dm
            entry.append(college)
            entry.append(rq)
            entry.append(rp)
            # get_rate guards against a zero denominator
            suc_rate = self.get_rate(rp, rq)
            entry.append(suc_rate)
            entry.append(uniq_url)
            entrys.append(entry)
        except Exception, e:
            # malformed rows are logged and skipped
            LOG.error(e)
    # overall success rate across all rows
    suc_rate_aver = self.get_rate(rp_sum, rq_sum)
def extract(recruit_title):
    """Extract a company name from a recruit title.

    The title is pre-cleaned, then matched against the tail patterns from
    most to least specific; the first match wins.  When no pattern matches,
    the cleaned title is returned, falling back to the (stripped) title
    itself if cleaning leaves fewer than 2 characters.
    """
    if not recruit_title:
        return recruit_title
    # keep the untouched title so we can fall back to it
    origin_recruit_title = recruit_title.strip()
    recruit_title = pre_clean(recruit_title)
    if len(recruit_title) < 2:
        # cleaning removed (almost) everything: restart from the original
        recruit_title = origin_recruit_title
    recruit_title = recruit_title.strip(u',、-—')
    tail_patterns = (
        pat_company_pre_tail,
        pat_company_tail,
        pat_company_tail_sec,
        pat_company_tail_third,
        pat_company_tail_forth,
    )
    for pattern in tail_patterns:
        match_result = pattern.search(recruit_title)
        if match_result is not None:
            # only the first pattern logs, matching the original behaviour
            if pattern is pat_company_pre_tail:
                LOG.info('extract %s by pat_company_pre_tail' %(recruit_title))
            return match_result.group(0)
    company_name = clean(recruit_title)
    if len(company_name) < 2:
        return recruit_title
    return company_name
def index_list_string(self, protobufs):
    """Push *protobufs* to the index server over thrift.

    Returns early (None) when the input fails check_list, and 0 when the
    index server is not ready.

    NOTE(review): ``count`` holds the number of indexed entries but the
    success path returns None — a trailing ``return count`` is likely
    beyond this excerpt; confirm against the full source.
    """
    if not check_list(protobufs):
        LOG.info("Do Not Call Indexing, For the input list is:%s" % (protobufs))
        return
    count = 0
    try:
        self.transport.open()
        # the server must acknowledge start_index before entries are pushed
        is_ready = self.start_index()
        if not is_ready:
            LOG.error("Index Server Is Not Ready!")
            return 0
        count = self.client.put_list_string(protobufs)
        LOG.info("[%s] Entrys Has Been Successfully Indexed." % (count))
        self.stop_index()
        self.transport.close()
    except Exception, e:
        LOG.error(e)
def serilize_entrys(self,entrys):
    """Serialise DMG entries to MergedDocInfo protobuf strings.

    Each entry is converted via self.convertor.dmg_to_proto_entry, its
    key/value pairs copied onto a MergedDocInfo message (lists via extend,
    scalars via setattr), and the message serialised to a string.  Fields
    that fail to set are logged and skipped.

    NOTE(review): ``protobufs`` is built but never returned in the visible
    code — a trailing ``return protobufs`` is likely beyond this excerpt;
    confirm against the full source.
    """
    protobufs = []
    LOG.info("Begin Serilizing Entrys,[%s] Entrys To Be Serilized!" %(len(entrys)))
    for entry in entrys:
        mdoc = merged_doc_pb2.MergedDocInfo()
        new_entry = self.convertor.dmg_to_proto_entry(entry)
        for key,value in new_entry.items():
            try:
                # repeated fields must be extended, scalar fields assigned
                if isinstance(value,list):
                    getattr(mdoc,key).extend(value)
                else:
                    setattr(mdoc,key,value)
            except Exception,e:
                LOG.error("[%s]:%s" % (key,value))
                LOG.error(e)
        protobuf = mdoc.SerializeToString()
        protobufs.append(protobuf)