def dmg_to_proto_entry(self, entry):
    """Build a new dict from ``entry`` with per-key value conversions applied.

    Keys whose value is ``None`` or a blank string are skipped. For the
    remaining keys the value is converted depending on the key:

    * keys in ``convert_int_keys`` are passed through ``int()``;
    * ``COLLEGE_NAME`` additionally stores ``self.map_college(value)`` under
      ``COLLEGE_ID``;
    * ``RECRUIT_CITIES``, ``JOB_TYPE``, ``COMPANY_TYPE`` and ``WORK_PLACE``
      go through the matching ``self.map_*`` / ``self.deal_work_place``
      helper;
    * keys in ``convert_time_keys`` are parsed with ``DATEFORMAT`` (unless
      already a ``datetime``) and passed to ``get_timestamp``.

    A failed conversion is logged and the unconverted value is kept.

    Args:
        entry: dict mapping field keys to raw values.

    Returns:
        dict: the converted entry.
    """
    new_entry = {}
    convert_int_keys = set([INFO_ID, GROUP_ID, HAS_HUKOU, HAS_EXAM,
                            HAS_RESUME, INFO_TYPE, CLICK_RATE])
    convert_time_keys = set([MEETING_TIME, RELEASE_DATE, RESUME_START_DATE,
                             RESUME_END_DATE, EXAM_TIME, INTERVIEW_TIME,
                             LAST_MOD_TIME])

    for key, value in entry.items():
        # Skip missing and blank values entirely.
        if value is None:
            continue
        if isinstance(value, basestring) and value.strip() == "":
            continue

        # Integer fields, and name -> id mappings.
        if key in convert_int_keys:
            try:
                value = int(value)
            except ValueError:
                LOG.error("ValueError,key is [%s] value is [%s]" % (key, value))
        elif key == COLLEGE_NAME:
            new_entry[COLLEGE_ID] = self.map_college(value)
        elif key == RECRUIT_CITIES:
            try:
                value = self.map_cities(value)
            except Exception:
                # Best effort: keep the raw value, but do not swallow
                # KeyboardInterrupt/SystemExit as a bare ``except`` would.
                LOG.error("In Map cities[%s]:%s" % (key, value))

        # Type mappings.
        if key == JOB_TYPE:
            value = self.map_job_type(value)

        if key == COMPANY_TYPE:
            value = self.map_company_type(value)
        elif key == WORK_PLACE:
            # Converted to a list by the helper.
            value = self.deal_work_place(value)
        elif key in convert_time_keys:
            # LAST_MOD_TIME is a member of convert_time_keys, so it is
            # handled here as well; a separate branch for it would never run.
            # NOTE(review): an earlier comment said last_mod_time has
            # micro-second scale -- confirm DATEFORMAT can parse it.
            try:
                if not isinstance(value, datetime.datetime):
                    value = datetime.datetime.strptime(value, DATEFORMAT)
                value = get_timestamp(value)
            except Exception as e:
                LOG.error("[%s]:[%s]" % (key, e))

        # NOTE(review): storing the converted value under its own key
        # (COLLEGE_NAME included, next to COLLEGE_ID) is the assumed
        # contract -- confirm against the callers.
        new_entry[key] = value

    return new_entry
def sort_func(self, entry):
    """Return the sort key of ``entry`` as computed by ``get_timestamp``.

    Entries whose ``INFO_TYPE`` equals 1 are keyed by ``MEETING_TIME``; all
    others are keyed by ``RELEASE_DATE``. When the chosen field is missing
    or ``None`` the key is 0.
    """
    date_key = MEETING_TIME if entry.get(INFO_TYPE) == 1 else RELEASE_DATE
    moment = entry.get(date_key)
    return 0 if moment is None else get_timestamp(moment)
def test_timestamp(self):
    """Print ``get_timestamp`` of 1970-01-01 as a manual sanity check."""
    # Only the date-only format is used. The dead assignment of the
    # "%Y-%m-%d %H:%M:%S.%f" format was dropped, and the locals no longer
    # shadow the ``format`` builtin.
    date_format = "%Y-%m-%d"
    date_text = "1970-01-01"
    day = datetime.datetime.strptime(date_text, date_format)
    # Call form prints the same on Python 2 for a single argument.
    print(get_timestamp(day))
def serilize_entrys(entrys):
    """Serialize each entry dict into a ``MergedDocInfo`` protobuf string.

    For every entry a fresh ``merged_doc_pb2.MergedDocInfo`` is filled field
    by field and serialized with ``SerializeToString()``.

    Args:
        entrys: iterable of dicts mapping field names to values.

    Returns:
        list: one serialized message per entry, in input order.
    """
    res = []
    for entry in entrys:
        mdoc = merged_doc_pb2.MergedDocInfo()
        for key, value in entry.items():
            # Skip empty values.
            if value is None:
                continue

            # Longs read from the database become int, except for info_id.
            if isinstance(value, long) and key != INFO_ID:
                value = int(value)

            # Datetimes are converted to timestamps.
            if isinstance(value, datetime.datetime):
                value = get_timestamp(value)

            # Map every comma-separated city name to its id.
            if key == RECRUIT_CITIES:
                for city in value.strip().split(","):
                    if not city:
                        continue
                    # Look up the individual city, not the whole
                    # comma-separated string.
                    mapped_city = city_id.get(city)
                    if mapped_city is None:
                        LOG.error("City Name [%s] Not Found!" % city)
                    else:
                        mdoc.recruit_cities.append(mapped_city)
                continue

            # Map the college name to its id. There is no ``continue`` here,
            # so the stripped name itself is still set on the message below.
            if key == COLLEGE_NAME:
                value = value.strip()
                if value:
                    mapped_college = college_id.get(value)
                    if mapped_college is None:
                        LOG.error("College Name [%s] Not Found!" % value)
                        missed_colleges.add(value)
                        # NOTE(review): the previous comment said the default
                        # is 0, but the code assigns 1 -- confirm which one
                        # is intended.
                        mdoc.college_ID = 1
                    else:
                        # Fill the college id field of the protobuf message.
                        mdoc.college_ID = mapped_college

            # work_place is a comma-separated list of places.
            if key == 'work_place':
                for place in value.strip().split(","):
                    if place:
                        mdoc.work_place.append(place)
                LOG.info(";".join(mdoc.work_place))
                continue

            try:
                # Term-weight computation needs the value gbk-encoded.
                if isinstance(value, str):
                    value = value.decode('utf-8').encode('gbk')
                setattr(mdoc, key, value)
                # Re-applied after every field that was set, so values
                # supplied by the entry for these four fields never survive.
                mdoc.origin_url_sign = 0
                mdoc.recruit_title_sign = 0
                mdoc.info_text_sign = 0
                mdoc.group_id = 1
            except Exception as e:
                # Best effort: log the offending field and carry on.
                LOG.error(e)
                LOG.error("key is [%s] and value is[%s]" % (key, value))

        res.append(mdoc.SerializeToString())
    return res