Exemple #1
0
    def dmg_to_proto_entry(self, entry):
        """Convert the values of a raw ``entry`` dict, key by key.

        ``None`` values and blank strings are skipped. Every other value is
        converted according to its key: integer fields via ``int``, the
        city / job-type / company-type / work-place fields via the matching
        ``self`` helpers, and time fields via ``get_timestamp``. Failures of
        the int, city and time conversions are logged rather than raised.

        NOTE(review): in the visible part of this method only
        ``new_entry[COLLEGE_ID]`` is ever written and nothing is returned;
        the remainder of the method is presumably outside this excerpt.
        """
        new_entry = {}

        # Keys whose values are coerced with int().
        convert_int_keys = set([INFO_ID, GROUP_ID, HAS_HUKOU, HAS_EXAM, HAS_RESUME, INFO_TYPE, CLICK_RATE])

        # Keys whose values are parsed with DATEFORMAT (unless already a
        # datetime) and then passed to get_timestamp().
        convert_time_keys = set(
            [MEETING_TIME, RELEASE_DATE, RESUME_START_DATE, RESUME_END_DATE, EXAM_TIME, INTERVIEW_TIME, LAST_MOD_TIME]
        )

        for key, value in entry.items():

            # Skip missing values and strings that are empty after stripping.
            if value is None:
                continue
            if isinstance(value, basestring):
                if value.strip() == "":
                    continue

            # Integer-valued fields; on ValueError the original value is kept
            # and the failure is logged.
            if key in convert_int_keys:
                try:
                    value = int(value)
                except ValueError:
                    LOG.error("ValueError,key is [%s] value is [%s]" % (key, value))

            # Name-to-id mappings. The college id is stored under COLLEGE_ID;
            # ``value`` itself is left unchanged in that branch.
            elif key == COLLEGE_NAME:
                college_id = self.map_college(value)
                new_entry[COLLEGE_ID] = college_id
            elif key == RECRUIT_CITIES:
                try:
                    value = self.map_cities(value)
                # NOTE(review): bare except - any failure in map_cities is
                # logged and the unmapped value is kept.
                except:
                    LOG.error("In Map cities[%s]:%s" % (key, value))
            # Type mappings.
            # NOTE(review): this is a plain ``if``, so it starts a new chain
            # separate from the if/elif chain above.
            if key == JOB_TYPE:
                value = self.map_job_type(value)
            # NOTE(review): another plain ``if``; all the ``elif`` branches
            # below belong to this chain, not to the first one.
            if key == COMPANY_TYPE:
                value = self.map_company_type(value)

            # Work place is handed to deal_work_place().
            elif key == WORK_PLACE:
                value = self.deal_work_place(value)

            # Time fields become timestamps; values that are not already
            # datetime objects are parsed with DATEFORMAT first.
            elif key in convert_time_keys:
                try:
                    if not isinstance(value, datetime.datetime):
                        value = datetime.datetime.strptime(value, DATEFORMAT)
                    value = get_timestamp(value)
                except Exception, e:
                    LOG.error("[%s]:[%s]" % (key, e))

            # last_mod_time has micro-second scale
            # NOTE(review): LAST_MOD_TIME is also a member of
            # convert_time_keys, so the preceding ``elif`` always matches
            # first and this branch is never reached.
            elif key == LAST_MOD_TIME:
                try:
                    value = datetime.datetime.strptime(value, DATEFORMAT)
                    value = get_timestamp(value)
                except Exception, e:
                    LOG.error("[%s]:[%s]" % (key, e))
Exemple #2
0
 def sort_func(self, entry):
     """Return the sort key for ``entry``: a timestamp, or 0 when no time is set.

     Entries whose INFO_TYPE equals 1 use MEETING_TIME; all other entries
     use RELEASE_DATE. The chosen value is converted with ``get_timestamp``.
     """
     time_key = MEETING_TIME if entry.get(INFO_TYPE) == 1 else RELEASE_DATE
     moment = entry.get(time_key)
     if moment is None:
         return 0
     return get_timestamp(moment)
Exemple #3
0
 def test_timestamp(self):
     """Print what ``get_timestamp`` yields for the date 1970-01-01."""
     # Day-resolution pattern: the date string carries no time component.
     day_pattern = "%Y-%m-%d"
     epoch_day = datetime.datetime.strptime("1970-01-01", day_pattern)
     print(get_timestamp(epoch_day))
Exemple #4
0
def serilize_entrys(entrys):
    """Serialise each entry dict into a ``MergedDocInfo`` protobuf string.

    For every entry a fresh ``merged_doc_pb2.MergedDocInfo`` is filled
    field by field and its ``SerializeToString()`` output is appended to
    ``res``. Per-key handling:

    * ``None`` values are skipped.
    * ``long`` values are narrowed to ``int`` (except for INFO_ID).
    * ``datetime`` values are converted with ``get_timestamp``.
    * RECRUIT_CITIES is split on commas and each city name is mapped to an
      id through the module-level ``city_id`` lookup; unknown names are
      logged.
    * COLLEGE_NAME is stripped and mapped through ``college_id``; unknown
      names are logged and recorded in ``missed_colleges``.
    * ``work_place`` is split on commas into the repeated field.
    * Everything else (including the stripped college name) is assigned
      with ``setattr``; failures are logged and the key is skipped.

    Args:
        entrys: iterable of dicts mapping protobuf field names to values.

    NOTE(review): no ``return`` statement is visible in this excerpt;
    callers presumably expect ``res`` - confirm against the full file.
    """
    res = []
    for entry in entrys:
        mdoc = merged_doc_pb2.MergedDocInfo()
        for key, value in entry.items():

            # Skip empty values.
            if value is None:
                continue

            # Longs read from the database become ints, except for info_id.
            if isinstance(value, long) and key != INFO_ID:
                value = int(value)

            # Datetimes are converted to integer timestamps.
            if isinstance(value, datetime.datetime):
                value = get_timestamp(value)

            # Map each recruit city name to its numeric id.
            if key == RECRUIT_CITIES:
                for city in value.strip().split(","):
                    if len(city) > 0:
                        # Look up the individual city, not the whole
                        # comma-separated string, so multi-city values
                        # resolve correctly.
                        mapped_city = city_id.get(city)
                        if mapped_city is None:
                            LOG.error("City Name [%s] Not Found!" % city)
                        else:
                            mdoc.recruit_cities.append(mapped_city)
                continue

            # Map the college name to its numeric id.
            if key == COLLEGE_NAME:
                value = value.strip()
                if len(value) > 0:
                    mapped_college = college_id.get(value)
                    if mapped_college is None:
                        LOG.error("College Name [%s] Not Found!" % value)
                        missed_colleges.add(value)

                        # NOTE(review): the original comment said the
                        # default is 0, but the code assigns 1 - confirm
                        # which one is intended.
                        mdoc.college_ID = 1
                    else:
                        # Fill the college_ID field of the protobuf.
                        mdoc.college_ID = mapped_college
                # No ``continue`` here: the stripped name is still assigned
                # to its own field by the generic setattr below.

            # Split work_place into the repeated field.
            if key == 'work_place':
                for place in value.strip().split(","):
                    if len(place) > 0:
                        mdoc.work_place.append(place)
                LOG.info(";".join(mdoc.work_place))
                continue

            try:
                # Term-weight computation needs GBK-encoded values.
                if isinstance(value, str):
                    value = value.decode('utf-8').encode('gbk')
                setattr(mdoc, key, value)

                # These fields are reset after every successful assignment,
                # so a group_id supplied by the entry is overwritten with 1.
                mdoc.origin_url_sign = 0
                mdoc.recruit_title_sign = 0
                mdoc.info_text_sign = 0
                mdoc.group_id = 1
            except Exception as e:
                LOG.error(e)
                LOG.error("key is [%s] and value is[%s]" % (key, value))
        serilized = mdoc.SerializeToString()
        res.append(serilized)