def deal_data(self,raw_data):
        """Build one report entry per raw row.

        Rows are read positionally as ``(site, total, err_1, err_2)``. Each
        entry is ``[college, total, err_1, err_2, success_rate]``: the college
        comes from ``self.site_college_dict`` (the raw site key is used, and an
        error logged, when there is no mapping) and the rate comes from
        ``self.get_rate(total - err_1 - err_2, total)``.

        A row that raises anything is logged and skipped; the running totals
        keep whatever was added before the failure.

        NOTE(review): the visible body never returns or stores ``entrys`` or
        the totals -- presumably the tail of the method is missing here;
        confirm against the full source.
        """
        entrys = []
        site_tt_sum = 0
        err_1_sum = 0
        err_2_sum = 0
        for row in raw_data:
            try:
                site, site_tt, err_1, err_2 = row[0], row[1], row[2], row[3]

                site_tt_sum += site_tt
                err_1_sum += err_1
                err_2_sum += err_2

                college = self.site_college_dict.get(site)
                if college is None:
                    LOG.error('[%s] name not found!'%site)
                    college = site

                # The original author treats a zero total as impossible, so
                # get_rate is called without a guard.
                suc_rate = self.get_rate(site_tt - err_1 - err_2,site_tt)
                entrys.append([college, site_tt, err_1, err_2, suc_rate])
            except Exception as e:
                # "as" form: valid on Python 2.6+ and 3, unlike "except X, e".
                LOG.error(e)
Beispiel #2
0
    def deal_data(self,raw_data):
        """Build one report entry per raw row.

        Rows are read positionally as ``(site, full_total, refined)``. Each
        entry is ``[college, full_total, refined, rate]``: the college comes
        from ``self.site_college_dict`` (the raw site key is used, and an error
        logged, when there is no mapping) and the rate comes from
        ``self.get_rate(refined, full_total)``.

        A row that raises anything is logged and skipped.

        NOTE(review): the visible body never returns or stores ``entrys`` or
        the totals -- presumably the tail of the method is missing here;
        confirm against the full source.
        """
        entrys = []
        full_sum = 0
        refined_sum = 0

        for row in raw_data:
            try:
                site, full_tt, refined = row[0], row[1], row[2]

                full_sum += full_tt
                refined_sum += refined

                college = self.site_college_dict.get(site)
                if college is None:
                    LOG.error('[%s] name not found!'%site)
                    college = site

                # The original author treats a zero total as impossible, so
                # get_rate is called without a guard.
                suc_rate = self.get_rate(refined,full_tt)
                entrys.append([college, full_tt, refined, suc_rate])
            except Exception as e:
                # "as" form: valid on Python 2.6+ and 3, unlike "except X, e".
                LOG.error(e)
Beispiel #3
0
def insert_to_campusTalk(entrys):
    """Insert every entry into the college, company and campus-talk tables.

    For each entry the college and company sub-rows are inserted first via
    ``insert_subtable``; their ids, plus every non-None field that
    ``ct_field_map`` assigns to ``T_campusTalk``, form the campus-talk row.

    Returns the value of the last ``db_helper.insert_entry`` call, or None
    when ``entrys`` is empty or a sub-table insert fails (processing stops at
    the first such failure).
    """
    if len(entrys) == 0:
        return
    LOG.info("Startging to Store to multi tables")
    table = T_campusTalk
    last_id = None
    count = 0
    for entry in entrys:
        if count % 2000 == 0:
            LOG.info("[%s] Has been inserted!"%count)
        collegeID = insert_subtable(entry,T_collegeInfo)
        if collegeID is None:
            # Values may be None or numbers, so format each one before
            # joining; a plain join() would raise TypeError inside this
            # error path and hide the real failure.
            LOG.error("INSERT TO [CollegeInfo] FAILS:%s"\
                        %("|".join("%s" % (v,) for v in entry.values())))
            return None

        campanyID = insert_subtable(entry,T_companyInfo)
        if campanyID is None:
            LOG.error("INSERT TO [CampanyInfo] FAILS:%s"\
                        %("|".join("%s" % (v,) for v in entry.values())))
            return None

        new_entry = {}
        new_entry[COLLEGE_ID] = collegeID
        new_entry[COMPANY_ID] = campanyID
        for key,value in entry.items():
            if value is None:
                continue
            # ct_field_map[key] is indexed as (table, column); only fields
            # that belong to the campus-talk table are copied.
            target = ct_field_map[key]
            if target[0] == table:
                new_entry[target[1]] = value
        if new_entry:
            last_id = db_helper.insert_entry(new_entry,table)
        count += 1
    return last_id
Beispiel #4
0
def load_city_map(filepath):
    """Load a city-name -> city-id mapping from a tab-separated UTF-8 file.

    Each line has the form ``id<TAB>name[<TAB>alias1;alias2;...]``. The name
    and every alias (with surrounding double quotes removed) map to the
    integer id; names that are empty after trimming are ignored.

    Blank lines are skipped silently. Lines with the wrong number of fields
    or a non-integer id are logged and skipped instead of aborting the load.

    Returns an empty dict when the file cannot be read.
    """
    city_id_map = {}
    lines = []
    try:
        # "with" closes the handle even when reading fails part-way.
        with codecs.open(filepath,encoding='utf-8') as city_file:
            lines = city_file.readlines()
    except Exception as e:
        LOG.error("Doesn't find the city list file!")
        LOG.error(e)

    for line in lines:
        # Also drop "\r" so CRLF files do not leak it into the last alias.
        line = line.strip("\r\n")
        if not line.strip():
            continue

        # Fields: id, city name, optional aliases.
        fields = line.split("\t")
        if len(fields) not in (2, 3):
            LOG.error("Malformed city line skipped: [%s]" % line)
            continue
        try:
            city_id = int(fields[0])
        except ValueError:
            LOG.error("Invalid city id, line skipped: [%s]" % line)
            continue

        names = [fields[1]]
        if len(fields) == 3:
            names.extend(fields[2].split(";"))

        for name in names:
            name = name.strip('\"')
            if len(name.strip()) == 0:
                continue
            city_id_map[name] = city_id
    return city_id_map
Beispiel #5
0
    def deal_data(self,raw_data):
        """Build one report entry per raw row.

        Rows are read positionally as
        ``(site, tt, cam_tt, rec_tt, fut_cam_tt)``. Each entry is
        ``[college, tt, cam_tt, rec_tt, fut_cam_tt]``; the college comes from
        ``self.site_college_dict`` (the raw site key is used, and an error
        logged, when there is no mapping).

        A row that raises anything is logged and skipped.

        NOTE(review): the visible body never returns or stores ``entrys`` or
        the totals -- presumably the tail of the method is missing here;
        confirm against the full source.
        """
        entrys = []
        tt_sum = 0
        cam_tt_sum = 0
        rec_tt_sum = 0
        fut_cam_tt_sum = 0

        for row in raw_data:
            try:
                site = row[0]
                tt, cam_tt, rec_tt, fut_cam_tt = row[1], row[2], row[3], row[4]

                tt_sum += tt
                cam_tt_sum += cam_tt
                rec_tt_sum += rec_tt
                fut_cam_tt_sum += fut_cam_tt

                college = self.site_college_dict.get(site)
                if college is None:
                    LOG.error('[%s] name not found!'%site)
                    college = site
                entrys.append([college, tt, cam_tt, rec_tt, fut_cam_tt])
            except Exception as e:
                # "as" form: valid on Python 2.6+ and 3, unlike "except X, e".
                LOG.error(e)
Beispiel #6
0
 def load_ids_from_rec(self):
     """Read the failed-id record file and return its ';'-separated items.

     Returns the list produced by splitting the file content on ";", or an
     empty list when the file cannot be opened or read (the error is logged).
     """
     rec_file = None
     try:
         rec_file = open(FAILED_INFOID_REC,'r')
         return rec_file.read().split(";")
     except Exception as e:
         LOG.error(e)
         return []
     finally:
         # Runs before either return above takes effect.
         if rec_file is not None:
             rec_file.close()
Beispiel #7
0
 def map_job_type(self, value):
     """Translate a job-type label into its numeric id.

     The label is trimmed first. "兼职" (part-time) maps to 1, "实习"
     (internship) to 2 and "全职" (full-time) to 0. An empty label yields 0;
     any other label is logged as invalid and also yields 0.
     """
     label = value.strip()
     if len(label) == 0:
         return 0
     # Compared in the original order: part-time, internship, full-time.
     for known_label, type_id in (("兼职", 1), ("实习", 2), ("全职", 0)):
         if label == known_label:
             return type_id
     LOG.error("The Job Type Value is not valid,[%s]" % (label))
     return 0
Beispiel #8
0
 def dumpjson_to_file(self, item):
     """Append ``str(item)`` and a newline to a minute-stamped dump file.

     The file lives in ``self.output_dir`` and is named
     ``json_<month>_<day>_<hour><minute>`` from the current local time, so
     calls within the same minute append to the same file.

     On an IOError the error is logged and the process exits with -1.
     """
     res = str(item)
     output_file = "json_" + datetime.datetime.now().strftime("%m_%d_%H%M")
     output = os.path.join(self.output_dir, output_file)
     try:
         # "with" closes the file even when one of the writes fails.
         with open(output, "a") as json_file:
             json_file.write(res)
             json_file.write("\n")
     except IOError as e:
         LOG.error(e)
         sys.exit(-1)
Beispiel #9
0
 def send_request(self,retry_time=3):
     """Fetch ``self.url``, trying up to ``retry_time`` times.

     Each attempt is logged with its round number. The first successful read
     ends the loop; a failed attempt is logged and followed by a one-second
     pause before the next round.

     NOTE(review): the visible body does not return ``res`` or close
     ``resp`` -- presumably the tail of the method is missing here; confirm
     against the full source.
     """
     count = 0
     res = None
     resp = None
     while count < retry_time:
         LOG.info("Send Request Round: [%s]" %(count))
         try:
             resp = urllib2.urlopen(self.url)
             res = resp.read()
             break
         except Exception as e:
             # "as" form: valid on Python 2.6+ and 3, unlike "except X, e".
             LOG.error(e)
         # Only reached after a failure: back off, then count the attempt.
         time.sleep(1)
         count += 1
Beispiel #10
0
    def dmg_to_proto_entry(self, entry):
        """Convert the values of ``entry`` key by key for protobuf filling.

        None values and blank strings are skipped. Other values are converted
        according to their key: int(), college/city id mapping, job/company
        type mapping, work-place handling, or datetime -> timestamp.

        NOTE(review): in this view the method ends after the last handler;
        only COLLEGE_ID is ever written to ``new_entry`` and nothing is
        returned here -- presumably the tail of the method is missing;
        confirm against the full source.
        """
        new_entry = {}

        convert_int_keys = set([INFO_ID, GROUP_ID, HAS_HUKOU, HAS_EXAM, HAS_RESUME, INFO_TYPE, CLICK_RATE])

        convert_time_keys = set(
            [MEETING_TIME, RELEASE_DATE, RESUME_START_DATE, RESUME_END_DATE, EXAM_TIME, INTERVIEW_TIME, LAST_MOD_TIME]
        )

        for key, value in entry.items():

            # Nothing to convert for missing or blank values.
            if value is None:
                continue
            if isinstance(value, basestring):
                if value.strip() == "":
                    continue

            # First chain: integer keys, college name, recruit cities.
            # convert to int (the set above lists 7 keys)
            if key in convert_int_keys:
                try:
                    value = int(value)
                except ValueError:
                    # value is left unconverted after logging
                    LOG.error("ValueError,key is [%s] value is [%s]" % (key, value))

            # convert to id count:2
            elif key == COLLEGE_NAME:
                # The mapped id is stored under COLLEGE_ID; value itself is unchanged.
                college_id = self.map_college(value)
                new_entry[COLLEGE_ID] = college_id
            elif key == RECRUIT_CITIES:
                try:
                    value = self.map_cities(value)
                except:
                    # bare except: any failure in map_cities is only logged
                    LOG.error("In Map cities[%s]:%s" % (key, value))
            # convert to type:
            # NOTE: this is a plain "if", so it is evaluated for every key,
            # independently of the chain above.
            if key == JOB_TYPE:
                value = self.map_job_type(value)
            # NOTE: another plain "if" -- it starts a new chain that the
            # elif branches below belong to.
            if key == COMPANY_TYPE:
                value = self.map_company_type(value)

            # convert to list count:1
            elif key == WORK_PLACE:
                value = self.deal_work_place(value)

            # convert time to timestamp (the set above lists 7 keys)
            elif key in convert_time_keys:
                try:
                    # values that are not already datetime objects are parsed with DATEFORMAT
                    if not isinstance(value, datetime.datetime):
                        value = datetime.datetime.strptime(value, DATEFORMAT)
                    value = get_timestamp(value)
                except Exception, e:
                    LOG.error("[%s]:[%s]" % (key, e))

            # last_mod_time has micro-second scale
            # NOTE(review): LAST_MOD_TIME is also a member of
            # convert_time_keys, so the branch above handles it first and
            # this branch looks unreachable -- confirm intent.
            elif key == LAST_MOD_TIME:
                try:
                    value = datetime.datetime.strptime(value, DATEFORMAT)
                    value = get_timestamp(value)
                except Exception, e:
                    LOG.error("[%s]:[%s]" % (key, e))
Beispiel #11
0
 def word_seg_list(self,protobufs):
     """Send ``protobufs`` to the word-segmentation RPC and record the result.

     When ``check_list(protobufs)`` is false the call is skipped and an empty
     list is returned. Otherwise the transport is opened, the RPC is made,
     the transport is closed and the result is passed to
     ``self.rec_protobuf``. Any exception is logged, not raised.

     NOTE(review): the success path does not return ``res`` in the visible
     body -- presumably the tail of the method is missing here; confirm
     against the full source.
     """
     res = []
     if not check_list(protobufs):
         LOG.info("Do Not Call Word Segging,For the input list is:%s"%(protobufs))
         return res
     try:
         self.transport.open()
         try:
             LOG.info("Begin  RPC Word Segging,[%s] To Be Segged!"
                      %(len(protobufs)))
             res = self.client.word_seg_list(protobufs)
         finally:
             # Close even when the RPC raises, so the opened transport is
             # not leaked.
             self.transport.close()
         LOG.info("Finish RPC Word Segging,[%s] Entrys Have Been Segged!"
                  %(len(res)))
         self.rec_protobuf(res)
     except Exception as e:
         LOG.error(e)
Beispiel #12
0
def load_table(filepath=None):
    """Parse an XML table description into ``{table name: [column, ...]}``.

    Every ``<table>`` child of the root contributes its ``<name>`` text as the
    key and the texts of its ``<columns>/<column>`` elements as the value.
    Any error while parsing is logged and the process exits with -1.

    NOTE(review): the visible body does not return ``table_info`` --
    presumably the tail of the function is missing here; confirm against the
    full source.
    """
    try:
        root = ElementTree.parse(filepath).getroot()
        table_info = {}
        for table in root.findall('table'):
            table_name = table.find('name').text
            table_info[table_name] = [
                column.text
                for column in table.find('columns').findall('column')
            ]
    except Exception as e:
        # "as" form: valid on Python 2.6+ and 3, unlike "except X, e".
        LOG.error(e)
        sys.exit(-1)
Beispiel #13
0
    def index_list_string(self, protobufs):
        """Send ``protobufs`` to the index server.

        Returns None without contacting the server when
        ``check_list(protobufs)`` is false, and 0 when ``self.start_index()``
        reports the server is not ready. Any exception is logged, not raised.

        NOTE(review): the success path does not return ``count`` in the
        visible body -- presumably the tail of the method is missing here;
        confirm against the full source.
        """

        if not check_list(protobufs):
            LOG.info("Do Not Call Indexing, For the input list is:%s" % (protobufs))
            return
        count = 0
        try:
            self.transport.open()
            try:
                is_ready = self.start_index()
                if not is_ready:
                    LOG.error("Index Server Is Not Ready!")
                    return 0
                count = self.client.put_list_string(protobufs)
                LOG.info("[%s] Entrys Has Been Successfully Indexed." % (count))
                self.stop_index()
            finally:
                # Close on every exit path (not-ready return, exception,
                # success) so the opened transport is never leaked.
                self.transport.close()
        except Exception as e:
            LOG.error(e)
Beispiel #14
0
 def serilize_entrys(self,entrys):
     """Serialize each entry into a ``MergedDocInfo`` protobuf string.

     Every entry is first converted with
     ``self.convertor.dmg_to_proto_entry``; list values are appended to the
     matching message field with ``extend`` and all other values are set
     with ``setattr``. A field that cannot be set is logged and skipped, so
     the remaining fields of that entry are still filled in.

     NOTE(review): the visible body does not return ``protobufs`` --
     presumably the tail of the method is missing here; confirm against the
     full source.
     """
     protobufs = []
     LOG.info("Begin Serilizing Entrys,[%s] Entrys To Be Serilized!"
              %(len(entrys)))
     for entry in entrys:
         mdoc = merged_doc_pb2.MergedDocInfo()
         new_entry = self.convertor.dmg_to_proto_entry(entry)
         for key,value in new_entry.items():
             try:
                 if isinstance(value,list):
                     getattr(mdoc,key).extend(value)
                 else:
                     setattr(mdoc,key,value)
             except Exception as e:
                 # "as" form: valid on Python 2.6+ and 3, unlike "except X, e".
                 LOG.error("[%s]:%s" % (key,value))
                 LOG.error(e)
         protobufs.append(mdoc.SerializeToString())
Beispiel #15
0
 def load_columns(self, filepath=None):
     """Return the text of every ``<field>`` element under the XML root.

     The process exits with -1 (after logging) when the file has no
     ``<field>`` elements or when parsing raises. On success the number of
     loaded columns is logged.
     """
     try:
         fields = ElementTree.parse(filepath).getroot().findall("field")
         if not fields:
             LOG.error("No columns found in xml conf [%s]" % (filepath))
             sys.exit(-1)
         columns_list = [field.text for field in fields]
         LOG.info("Total Load [%d] columns in conf [%s]" % (len(columns_list), filepath))
     except Exception as e:
         LOG.error(e)
         sys.exit(-1)
     return columns_list
Beispiel #16
0
 def write_ids_to_rec(self):
     """Write ``self.failed_ids`` to the record file, joined with ";".

     Nothing is written when there are no failed ids. The record file's
     directory is created first if it does not exist; errors while creating
     it propagate. Errors while opening or writing the file are logged, not
     raised.
     """
     LOG.info("Recording Failed IDs Into File [%s]" %(FAILED_INFOID_REC))
     if not (len(self.failed_ids)>0):
         return
     # Make sure the record file's directory exists. An empty dirname means
     # the file sits in the current directory, so there is nothing to
     # create; makedirs also handles missing parent directories.
     rec_dir = os.path.dirname(FAILED_INFOID_REC)
     if rec_dir and not os.path.isdir(rec_dir):
         os.makedirs(rec_dir)
     rec_file = None
     try:
         rec_file = open(FAILED_INFOID_REC,'w')
         rec_file.write(";".join([str(failed_id) for failed_id in self.failed_ids]))
         LOG.info("Record Successfully Totally [%s] entrys"
                   %(len(self.failed_ids)))
     except Exception as e:
         LOG.error(e)
     finally:
         if rec_file is not None:
             rec_file.close()
Beispiel #17
0
    def submit_to_RB(self,entrys,posturl=POSTURL):
        """Post every entry to ``posturl`` and remember the ones that raised.

        Each entry is converted with ``self.construct_json_data``, dumped to
        JSON and posted as the ``resjson`` form field. A response whose
        ``result`` is 1 counts as a success; 0 counts as a failure and is
        logged as a warning. If posting or parsing raises, the entry counts
        as a failure and its id and the entry itself are collected.

        Afterwards ``self.failed_ids`` and ``self.failed_entrys`` hold the
        collected ids/entries (only those from the exception path).
        """
        succeeded = 0
        failed = 0
        error_ids = []
        error_entrys = []
        # Not reset per entry: when post() itself raises, the handler logs
        # the response left over from the previous entry.
        state = ""

        for entry in entrys[:]:
            info_id = entry[INFO_ID]
            payload = {"resjson":json.dumps(self.construct_json_data(entry))}
            try:
                state = self.post(posturl,payload)
                res = json.loads(state)['result']
                if res == 0:
                    LOG.warning("[%s] Submitted Error!" %(info_id))
                    LOG.warning(state)
                    failed += 1
                elif res == 1:
                    succeeded += 1
            except Exception as e:
                LOG.error(e)
                failed += 1
                LOG.error(state)
                error_ids.append(info_id)
                error_entrys.append(entry)

            # Progress line whenever either counter is one short of its step.
            if (succeeded + 1) % 1000 == 0 or (failed + 1) % 100 == 0:
                LOG.info("Post Entrys To DMG,Suc:[%s] Failed:[%s]"
                        %(succeeded,failed))

        self.failed_ids = error_ids
        self.failed_entrys = error_entrys
        LOG.info("Successfully Submitted To DMG [%s],Failed:[%s]"
                %(succeeded,failed))
    def deal_data(self, raw_data):
        """Build one report entry per raw row.

        Rows are read positionally as ``(domain, rq, rp, uniq_url)``. Each
        entry is ``[college, rq, rp, rate, uniq_url]``: the college comes from
        ``self.domain_college_dict`` (the raw domain is used, and an error
        logged, when there is no mapping) and the rate comes from
        ``self.get_rate(rp, rq)``. The running totals use ``int()`` of the
        three counters.

        A row that raises anything (including a row that is too short) is
        logged and skipped.

        NOTE(review): the visible body never returns or stores ``entrys`` or
        the totals -- presumably the tail of the method is missing here;
        confirm against the full source.
        """
        entrys = []
        rq_sum = 0
        rp_sum = 0
        uniq_url_sum = 0
        # Result is unused in the visible body; the call is kept in case
        # prepare_data has side effects.
        self.prepare_data()
        for row in raw_data:
            try:
                dm, rq, rp, uniq_url = row[0], row[1], row[2], row[3]

                rq_sum += int(rq)
                rp_sum += int(rp)
                uniq_url_sum += int(uniq_url)

                college = self.domain_college_dict.get(dm)
                if college is None:
                    LOG.error("[%s] domain not found" % dm)
                    college = dm

                suc_rate = self.get_rate(rp, rq)
                entrys.append([college, rq, rp, suc_rate, uniq_url])
            except Exception as e:
                # "as" form: valid on Python 2.6+ and 3, unlike "except X, e".
                LOG.error(e)

            # NOTE(review): the average is recomputed on every iteration;
            # it may have been meant to run once after the loop -- confirm.
            suc_rate_aver = self.get_rate(rp_sum, rq_sum)
Beispiel #19
0
def serilize_entrys(entrys):
    """Serialize each entry dict into a ``MergedDocInfo`` protobuf string.

    Per key: None values are skipped; ``long`` values become ``int`` (except
    INFO_ID); datetimes go through ``get_timestamp``; recruit cities and the
    college name are mapped to ids via ``city_id`` / ``college_id``;
    ``work_place`` is split on "," into the repeated field. Everything else
    is set on the message with ``setattr`` (byte strings are re-encoded from
    UTF-8 to GBK first). A field that cannot be set is logged and skipped.

    NOTE(review): the visible body does not return ``res`` -- presumably the
    tail of the function is missing here; confirm against the full source.
    """
    res = []
    for entry in entrys:
        mdoc = merged_doc_pb2.MergedDocInfo()
        for key,value in entry.items():

            # Skip empty values.
            if value is None:
                continue

            # long values read from the database become int, except info_id.
            if isinstance(value,long):
                if(key != INFO_ID):
                    value = int(value)

            # Datetimes are converted to integer timestamps.
            if isinstance(value,datetime.datetime):
                value = get_timestamp(value)

            # Map each recruit city name to its id.
            if key == RECRUIT_CITIES:
                for city in value.strip().split(","):
                    if len(city) > 0:
                        # Look up the individual city; using the whole
                        # comma-separated string only matched when the
                        # field held exactly one city.
                        mapped_city = city_id.get(city)
                        if mapped_city is None:
                            LOG.error("City Name [%s] Not Found!"%city)
                        else:
                            mdoc.recruit_cities.append(mapped_city)
                continue

            # Map the college name to its id.
            if key == COLLEGE_NAME:
                value = value.strip()
                if len(value) > 0:
                    mapped_college = college_id.get(value)
                    if mapped_college is None:
                        LOG.error("College Name [%s] Not Found!"%value)
                        missed_colleges.add(value)

                        # Fallback id for unknown colleges.
                        # NOTE(review): the original comment said the
                        # default is 0 but the code assigns 1 -- confirm.
                        mdoc.college_ID = 1
                    else:
                        # Fill the college_ID field of the protobuf.
                        mdoc.college_ID = mapped_college

            # Split work_place into the repeated field.
            if key == 'work_place':
                for place in value.strip().split(","):
                    if len(place) > 0:
                        mdoc.work_place.append(place)
                LOG.info(";".join(mdoc.work_place))
                continue

            try:
                # Term-weight computation needs GBK-encoded values.
                if isinstance(value,str):
                    value = value.decode('utf-8').encode('gbk')
                setattr(mdoc, key, value)

                mdoc.origin_url_sign = 0
                mdoc.recruit_title_sign = 0
                mdoc.info_text_sign = 0
                mdoc.group_id = 1
            except Exception as e:
                LOG.error(e)
                LOG.error("key is [%s] and value is[%s]"%(key,value))
        res.append(mdoc.SerializeToString())
Beispiel #20
0
                except Exception, e:
                    LOG.error("[%s]:[%s]" % (key, e))

            # last_mod_time has micro-second scale
            elif key == LAST_MOD_TIME:
                try:
                    value = datetime.datetime.strptime(value, DATEFORMAT)
                    value = get_timestamp(value)
                except Exception, e:
                    LOG.error("[%s]:[%s]" % (key, e))
            # else convert to string
            else:
                try:
                    value = value
                except Exception as e:
                    LOG.error("[s%]:[%s]" % (key, e))

            new_entry[key] = value

            # set the default key and value
            # new_entry['normalized_url_sign'] = 0
            # new_entry['recruit_title_sign'] = 0
            # new_entry['info_text_sign'] = 0

        return new_entry

    # convert cities to cityid_list
    def map_cities(self, value):
        city_ids = []
        for city in value.strip().split(","):
            if len(city) > 0: