def deal_data(self, raw_data):
    entrys = []
    site_tt_sum = 0
    err_1_sum = 0
    err_2_sum = 0
    for row in raw_data:
        try:
            entry = []
            site = row[0]
            site_tt = row[1]
            err_1 = row[2]
            err_2 = row[3]
            site_tt_sum += site_tt
            err_1_sum += err_1
            err_2_sum += err_2
            college = self.site_college_dict.get(site)
            if college is None:
                LOG.error('[%s] name not found!' % site)
                college = site
            entry.append(college)
            entry.append(site_tt)
            entry.append(err_1)
            entry.append(err_2)
            # site_tt is never zero here, so no division guard is needed
            suc_rate = self.get_rate(site_tt - err_1 - err_2, site_tt)
            entry.append(suc_rate)
            entrys.append(entry)
        except Exception as e:
            LOG.error(e)

def deal_data(self, raw_data):
    entrys = []
    full_sum = 0
    refined_sum = 0
    for row in raw_data:
        try:
            entry = []
            site = row[0]
            full_tt = row[1]
            refined = row[2]
            full_sum += full_tt
            refined_sum += refined
            college = self.site_college_dict.get(site)
            if college is None:
                LOG.error('[%s] name not found!' % site)
                college = site
            entry.append(college)
            entry.append(full_tt)
            entry.append(refined)
            # full_tt is never zero here, so no division guard is needed
            suc_rate = self.get_rate(refined, full_tt)
            entry.append(suc_rate)
            entrys.append(entry)
        except Exception as e:
            LOG.error(e)

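# NOTE: get_rate is not shown in these snippets. Below is a minimal sketch of
# what the deal_data callers appear to assume: a percentage string, with a
# defensive guard for a zero denominator (even though the callers above treat
# that case as impossible).
def get_rate(self, part, total):
    if total == 0:  # defensive guard; not expected to trigger
        return '0.00%'
    return '%.2f%%' % (float(part) * 100 / total)
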
def insert_to_campusTalk(entrys):
    if len(entrys) == 0:
        return
    LOG.info("Starting to store to multi tables")
    table = T_campusTalk
    id = None
    count = 0
    for entry in entrys:
        if count % 2000 == 0:
            LOG.info("[%s] has been inserted!" % count)
        collegeID = insert_subtable(entry, T_collegeInfo)
        if collegeID is None:
            LOG.error("INSERT TO [CollegeInfo] FAILS:%s"
                      % ("|".join(str(v) for v in entry.values())))
            return None
        companyID = insert_subtable(entry, T_companyInfo)
        if companyID is None:
            LOG.error("INSERT TO [CompanyInfo] FAILS:%s"
                      % ("|".join(str(v) for v in entry.values())))
            return None
        new_entry = {}
        new_entry[COLLEGE_ID] = collegeID
        new_entry[COMPANY_ID] = companyID
        for key, value in entry.items():
            if value is None:
                continue
            if ct_field_map[key][0] == table:
                new_entry[ct_field_map[key][1]] = value
        if len(new_entry) > 0:
            id = db_helper.insert_entry(new_entry, table)
        count += 1
    return id

def load_city_map(filepath):
    city_id_map = {}
    lines = []
    try:
        file = codecs.open(filepath, encoding='utf-8')
        lines = file.readlines()
    except Exception as e:
        LOG.error("Can't find the city list file!")
        LOG.error(e)
    if len(lines) == 0:
        return city_id_map
    for line in lines:
        line = line.strip("\n")
        names = []
        # each line: id <TAB> city name <TAB> aliases (separated by ';')
        (id, name, alias) = line.split("\t")
        id = int(id)
        names.append(name)
        alias = alias.split(";")
        for name in alias:
            names.append(name)
        for name in names:
            name = name.strip('\"')
            if len(name.strip()) == 0:
                continue
            city_id_map[name] = id
    return city_id_map

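# NOTE: illustrative only -- the city list layout is inferred from the parsing
# above (tab-separated: id, primary name, ';'-joined quoted aliases):
#
#   1<TAB>北京<TAB>"Beijing";"Peking"
#   2<TAB>上海<TAB>"Shanghai"
#
# city_id_map = load_city_map('conf/city_list.txt')  # hypothetical path
# assert city_id_map.get('Beijing') == 1
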
def deal_data(self, raw_data):
    entrys = []
    tt_sum = 0
    cam_tt_sum = 0
    rec_tt_sum = 0
    fut_cam_tt_sum = 0
    for row in raw_data:
        try:
            entry = []
            site = row[0]
            tt = row[1]
            cam_tt = row[2]
            rec_tt = row[3]
            fut_cam_tt = row[4]
            tt_sum += tt
            cam_tt_sum += cam_tt
            rec_tt_sum += rec_tt
            fut_cam_tt_sum += fut_cam_tt
            college = self.site_college_dict.get(site)
            if college is None:
                LOG.error('[%s] name not found!' % site)
                college = site
            entry.append(college)
            entry.append(tt)
            entry.append(cam_tt)
            entry.append(rec_tt)
            entry.append(fut_cam_tt)
            entrys.append(entry)
        except Exception as e:
            LOG.error(e)

def load_ids_from_rec(self):
    file = None
    info_ids = []
    try:
        file = open(FAILED_INFOID_REC, 'r')
        info_ids_str = file.read()
        info_ids = info_ids_str.split(";")
    except Exception as e:
        LOG.error(e)
    finally:
        if file:
            file.close()
    return info_ids

def map_job_type(self, value):
    value = value.strip()
    job_type_id = 0
    if len(value) > 0:
        if value == "兼职":    # part-time
            job_type_id = 1
        elif value == "实习":  # internship
            job_type_id = 2
        elif value == "全职":  # full-time
            job_type_id = 0
        else:
            LOG.error("The Job Type Value is not valid, [%s]" % (value))
    return job_type_id

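# NOTE: illustrative usage of map_job_type (the labels are the raw Chinese
# values seen in the source data; `convertor` is a hypothetical instance name):
#
#   convertor.map_job_type("全职")  ->  0  (full-time, also the default)
#   convertor.map_job_type("兼职")  ->  1  (part-time)
#   convertor.map_job_type("实习")  ->  2  (internship)
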
def dumpjson_to_file(self, item):
    # res = json.dumps(json_list)
    res = str(item)
    output_file = "json_" + datetime.datetime.now().strftime("%m_%d_%H%M")
    output = os.path.join(self.output_dir, output_file)
    try:
        json_file = open(output, "a")
        json_file.write(res)
        json_file.write("\n")
        json_file.close()
    except IOError as e:
        LOG.error(e)
        sys.exit(-1)

def send_request(self, retry_time=3):
    count = 0
    res = None
    resp = None
    while count < retry_time:
        LOG.info("Send Request Round: [%s]" % (count))
        try:
            resp = urllib2.urlopen(self.url)
            res = resp.read()
            break
        except Exception as e:
            LOG.error(e)
            time.sleep(1)
            count += 1
    return res

def dmg_to_proto_entry(self, entry):
    new_entry = {}
    convert_int_keys = set([INFO_ID, GROUP_ID, HAS_HUKOU, HAS_EXAM,
                            HAS_RESUME, INFO_TYPE, CLICK_RATE])
    convert_time_keys = set([MEETING_TIME, RELEASE_DATE, RESUME_START_DATE,
                             RESUME_END_DATE, EXAM_TIME, INTERVIEW_TIME,
                             LAST_MOD_TIME])
    for key, value in entry.items():
        if value is None:
            continue
        if isinstance(value, basestring):
            if value.strip() == "":
                continue
        # convert to int, count: 9
        if key in convert_int_keys:
            try:
                value = int(value)
            except ValueError:
                LOG.error("ValueError, key is [%s] value is [%s]" % (key, value))
        # convert name to id, count: 2
        elif key == COLLEGE_NAME:
            college_id = self.map_college(value)
            new_entry[COLLEGE_ID] = college_id
        elif key == RECRUIT_CITIES:
            try:
                value = self.map_cities(value)
            except:
                LOG.error("In Map cities[%s]:%s" % (key, value))
        # convert to type id
        elif key == JOB_TYPE:
            value = self.map_job_type(value)
        elif key == COMPANY_TYPE:
            value = self.map_company_type(value)
        # convert to list, count: 1
        elif key == WORK_PLACE:
            value = self.deal_work_place(value)
        # convert time to timestamp, count: 6
        elif key in convert_time_keys:
            try:
                if not isinstance(value, datetime.datetime):
                    value = datetime.datetime.strptime(value, DATEFORMAT)
                value = get_timestamp(value)
            except Exception as e:
                LOG.error("[%s]:[%s]" % (key, e))
        # last_mod_time has micro-second scale
        elif key == LAST_MOD_TIME:
            try:
                value = datetime.datetime.strptime(value, DATEFORMAT)
                value = get_timestamp(value)
            except Exception as e:
                LOG.error("[%s]:[%s]" % (key, e))
        # else convert to string
        else:
            try:
                value = str(value)
            except Exception as e:
                LOG.error("[%s]:[%s]" % (key, e))
        new_entry[key] = value
    # set the default key and value
    # new_entry['normalized_url_sign'] = 0
    # new_entry['recruit_title_sign'] = 0
    # new_entry['info_text_sign'] = 0
    return new_entry

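# NOTE: illustrative input/output for dmg_to_proto_entry -- the exact
# DATEFORMAT and constant values are assumptions, not shown in these snippets:
#
# entry = {INFO_ID: "12345", COLLEGE_NAME: u"清华大学",
#          RELEASE_DATE: "2014-05-01 12:00:00"}
# new_entry = convertor.dmg_to_proto_entry(entry)  # convertor is hypothetical
# # INFO_ID becomes int 12345, COLLEGE_ID holds the mapped college id, and
# # RELEASE_DATE becomes a unix timestamp
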
def word_seg_list(self, protobufs):
    res = []
    if not check_list(protobufs):
        LOG.info("Do Not Call Word Segging, For the input list is: %s" % (protobufs))
        return res
    try:
        self.transport.open()
        LOG.info("Begin RPC Word Segging, [%s] To Be Segged!" % (len(protobufs)))
        res = self.client.word_seg_list(protobufs)
        self.transport.close()
        LOG.info("Finish RPC Word Segging, [%s] Entrys Have Been Segged!" % (len(res)))
        self.rec_protobuf(res)
    except Exception as e:
        LOG.error(e)
    return res

def load_table(filepath=None):
    try:
        domtree = ElementTree.parse(filepath)
        root = domtree.getroot()
        tables = root.findall('table')
        table_info = {}
        for table in tables:
            columns = []
            table_name = table.find('name').text
            columns_list = table.find('columns').findall('column')
            for column in columns_list:
                columns.append(column.text)
            table_info[table_name] = columns
        return table_info
    except Exception as e:
        LOG.error(e)
        sys.exit(-1)

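# NOTE: illustrative only -- the conf layout load_table expects, inferred from
# the find/findall calls above (root element name and column names assumed):
#
#   <tables>
#     <table>
#       <name>T_campusTalk</name>
#       <columns>
#         <column>info_id</column>
#         <column>college_id</column>
#       </columns>
#     </table>
#   </tables>
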
def index_list_string(self, protobufs):
    if not check_list(protobufs):
        LOG.info("Do Not Call Indexing, For the input list is: %s" % (protobufs))
        return
    count = 0
    try:
        self.transport.open()
        is_ready = self.start_index()
        if not is_ready:
            LOG.error("Index Server Is Not Ready!")
            return 0
        count = self.client.put_list_string(protobufs)
        LOG.info("[%s] Entrys Have Been Successfully Indexed." % (count))
        self.stop_index()
        self.transport.close()
    except Exception as e:
        LOG.error(e)
    return count

def serilize_entrys(self, entrys):
    protobufs = []
    LOG.info("Begin Serializing Entrys, [%s] Entrys To Be Serialized!" % (len(entrys)))
    for entry in entrys:
        mdoc = merged_doc_pb2.MergedDocInfo()
        new_entry = self.convertor.dmg_to_proto_entry(entry)
        for key, value in new_entry.items():
            try:
                if isinstance(value, list):
                    getattr(mdoc, key).extend(value)
                else:
                    setattr(mdoc, key, value)
            except Exception as e:
                LOG.error("[%s]:%s" % (key, value))
                LOG.error(e)
        protobuf = mdoc.SerializeToString()
        protobufs.append(protobuf)
    return protobufs

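# NOTE: illustrative pipeline, inferred from the functions in this dump --
# entries are serialized to protobufs, sent to the RPC word-segmentation
# service, then indexed (the instance names are hypothetical):
#
# protobufs = serializer.serilize_entrys(entrys)
# segged = segger.word_seg_list(protobufs)
# indexer.index_list_string(segged)
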
def load_columns(self, filepath=None):
    columns_list = []
    try:
        dom_tree = ElementTree.parse(filepath)
        root = dom_tree.getroot()
        columns = root.findall("field")
        if columns is None or len(columns) == 0:
            LOG.error("No columns found in xml conf [%s]" % (filepath))
            sys.exit(-1)
        columns_count = 0
        for column in columns:
            columns_list.append(column.text)
            columns_count += 1
        LOG.info("Loaded [%d] columns in total from conf [%s]" % (columns_count, filepath))
    except Exception as e:
        LOG.error(e)
        sys.exit(-1)
    return columns_list

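# NOTE: illustrative only -- load_columns expects <field> elements directly
# under the root; the root element name and field values below are assumed:
#
#   <fields>
#     <field>info_id</field>
#     <field>college_name</field>
#     <field>recruit_cities</field>
#   </fields>
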
def write_ids_to_rec(self):
    LOG.info("Recording Failed IDs Into File [%s]" % (FAILED_INFOID_REC))
    if len(self.failed_ids) == 0:
        return
    # record the failed_ids to file;
    # create the record file's directory first if it does not exist
    dir = os.path.dirname(FAILED_INFOID_REC)
    file = None
    if not os.path.isdir(dir):
        os.mkdir(dir)
    try:
        file = open(FAILED_INFOID_REC, 'w')
        file.write(";".join([str(id) for id in self.failed_ids]))
        LOG.info("Recorded Successfully, Totally [%s] Entrys" % (len(self.failed_ids)))
    except Exception as e:
        LOG.error(e)
    finally:
        if file:
            file.close()

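# NOTE: illustrative round trip with load_ids_from_rec above -- failed ids are
# persisted as a single ';'-joined line and read back as a list of strings, so
# callers must convert them back to int if needed (`submitter` is a
# hypothetical instance name):
#
# submitter.failed_ids = [101, 102]
# submitter.write_ids_to_rec()
# submitter.load_ids_from_rec()  # -> ['101', '102']
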
def submit_to_RB(self, entrys, posturl=POSTURL):
    """submit data to dmg"""
    failed_count = 0
    suc_count = 0
    suc_ids = []
    failed_ids = []
    failed_entrys = []
    count = 0
    state = ""
    res = None
    for entry in entrys[:]:  # just for test, should delete after testing
        count += 1
        info_id = entry[INFO_ID]
        entry_json = self.construct_json_data(entry)
        resjson = {"resjson": json.dumps(entry_json)}
        try:
            state = self.post(posturl, resjson)
            res = json.loads(state)['result']
            if res == 0:
                LOG.warning("[%s] Submitted Error!" % (info_id))
                LOG.warning(state)
                failed_count += 1
            if res == 1:
                suc_count += 1
        except Exception as e:
            LOG.error(e)
            failed_count += 1
            LOG.error(state)
            failed_ids.append(info_id)
            failed_entrys.append(entry)
        # LOG.debug("Post one entry into RB, the result is: [%s], Suc_Count=[%s], Fail_count=[%s]" % (res, suc_count, failed_count))
        if (suc_count + 1) % 1000 == 0 or (failed_count + 1) % 100 == 0:
            LOG.info("Post Entrys To DMG, Suc:[%s] Failed:[%s]" % (suc_count, failed_count))
    self.failed_ids = failed_ids
    self.failed_entrys = failed_entrys
    LOG.info("Successfully Submitted To DMG [%s], Failed:[%s]" % (suc_count, failed_count))

def deal_data(self, raw_data):
    entrys = []
    rq_sum = 0
    rp_sum = 0
    uniq_url_sum = 0
    rows = self.prepare_data()
    for row in raw_data:
        # TODO: catch IndexError when a row has fewer columns than expected
        try:
            entry = []
            dm = row[0]
            rq = row[1]
            rp = row[2]
            uniq_url = row[3]
            rq_sum += int(rq)
            rp_sum += int(rp)
            uniq_url_sum += int(uniq_url)
            college = self.domain_college_dict.get(dm)
            if college is None:
                LOG.error("[%s] domain not found" % dm)
                college = dm
            entry.append(college)
            entry.append(rq)
            entry.append(rp)
            # rq may be zero, so the rate helper must guard the division
            suc_rate = self.get_rate(rp, rq)
            entry.append(suc_rate)
            entry.append(uniq_url)
            entrys.append(entry)
        except Exception as e:
            LOG.error(e)
    suc_rate_aver = self.get_rate(rp_sum, rq_sum)

def serilize_entrys(entrys):
    res = []
    for entry in entrys:
        mdoc = merged_doc_pb2.MergedDocInfo()
        for key, value in entry.items():
            # skip empty values
            if value is None:
                continue
            # longs read from the database become int, except the info_id field
            if isinstance(value, long):
                if key != INFO_ID:
                    value = int(value)
            # convert datetimes to int timestamps
            if isinstance(value, datetime.datetime):
                value = get_timestamp(value)
            # map recruit_cities names to their ids
            if key == RECRUIT_CITIES:
                for city in value.strip().split(","):
                    if len(city) > 0:
                        id = city_id.get(city)
                        if id is None:
                            LOG.error("City Name [%s] Not Found!" % city)
                        else:
                            mdoc.recruit_cities.append(id)
                continue
            # map collegeName to its id
            if key == COLLEGE_NAME:
                value = value.strip()
                if len(value) > 0:
                    id = college_id.get(value)
                    if id is None:
                        LOG.error("College Name [%s] Not Found!" % value)
                        missed_colleges.add(value)
                        # fall back to the default college id
                        mdoc.college_ID = 1
                    else:
                        # fill the college_id field of the protobuf
                        mdoc.college_ID = id
                continue
            # handle the work_place value
            if key == 'work_place':
                for place in value.strip().split(","):
                    if len(place) > 0:
                        mdoc.work_place.append(place)
                LOG.info(";".join(mdoc.work_place))
                continue
            try:
                # term-weight computation needs GBK-encoded values
                if isinstance(value, str):
                    value = value.decode('utf-8').encode('gbk')
                setattr(mdoc, key, value)
                mdoc.origin_url_sign = 0
                mdoc.recruit_title_sign = 0
                mdoc.info_text_sign = 0
                mdoc.group_id = 1
            except Exception as e:
                LOG.error(e)
                LOG.error("key is [%s] and value is [%s]" % (key, value))
        serilized = mdoc.SerializeToString()
        res.append(serilized)
    return res

# convert a cities string to a list of city ids
def map_cities(self, value):
    city_ids = []
    for city in value.strip().split(","):
        if len(city) > 0:
            # assumed completion: mirrors the city lookup in serilize_entrys
            # above; the attribute holding the city map is hypothetical
            id = self.city_id_map.get(city)
            if id is None:
                LOG.error("City Name [%s] Not Found!" % city)
            else:
                city_ids.append(id)
    return city_ids