class FetchCsvSample(BaseTask, CsvWriteBase):
    """Task that dumps sampled CV pages and their 'measure' records to disk.

    For every job the page content is written to an HTML file and the
    matching record from the 'measure' store is passed to ``self.save``
    (not defined here; presumably supplied by CsvWriteBase -- confirm).
    """

    def __init__(self, channel, thread_cnt, need_cnt):
        """Set up the dispatcher, the output directory and the stores.

        Args:
            channel: channel identifier; used in output directory names and
                passed to the dispatcher and stores.
            thread_cnt: forwarded to BaseTask as ``thread_cnt``.
            need_cnt: forwarded to RandomDispatcher (presumably the number
                of samples wanted -- confirm against RandomDispatcher).
        """
        # BaseTask must be initialised first: self._queue and self._name are
        # read below and are not assigned anywhere in this class.
        BaseTask.__init__(self, "fetch_csv_sample", thread_cnt=thread_cnt)
        self.channel = channel
        self.rand = RandomDispatcher(channel, self._queue, need_cnt)
        # The queue argument is ignored; dispatching is delegated to self.rand.
        self.dispatcher = lambda q: self.rand.dispatcher()
        self.dir_path = self.get_save_dir_path()
        CsvWriteBase.__init__(self, self.dir_path, self.channel)
        self.cv_measure_store = CVRawStore(channel, 'measure')

    @staticmethod
    def _ensure_dir(path):
        """Create directory *path* if it does not exist yet.

        A concurrent creation between the existence check and ``os.mkdir``
        is tolerated; any other OSError is re-raised.
        """
        if os.path.exists(path):
            return
        try:
            os.mkdir(path)
        except OSError:
            # Someone else may have created it in the meantime; only fail
            # when the directory really is not there.
            if not os.path.isdir(path):
                raise

    def _load_data(self):
        """Delegate data loading to the random dispatcher."""
        self.rand.load_data()

    def get_save_dir_path(self):
        """Return the result directory next to this file, creating it if needed.

        The directory is named ``<channel>_<task name>_result``.
        """
        save_dir = os.path.join(
            os.path.dirname(__file__),
            '%s_%s_result' % (self.channel, self._name),
        )
        self._ensure_dir(save_dir)
        return save_dir

    def get_pagecontent(self, cvId):
        """Fetch the page content for *cvId*.

        Looks the document up in the dispatcher's page store and reads the
        file named by its ``pageContentPath`` field using the 'remote' mode
        of ``getPageContent``.
        """
        doc = self.rand.cv_page_store.get_one(cvId)
        filepath = doc['pageContentPath']
        return self.rand.getPageContent(filepath, 'remote')

    def save_html(self, cvId, pagecontent):
        """Write *pagecontent* to ``<dir_path>/<channel>_html_result/<id>.html``.

        ``<id>`` is the part of *cvId* after the first ``"://"``; an
        IndexError is raised if *cvId* contains no ``"://"``.
        """
        html_dir = os.path.join(self.dir_path, '%s_html_result' % self.channel)
        # Uses the race-tolerant helper because this runs once per job.
        self._ensure_dir(html_dir)
        file_name = cvId.split("://")[1]
        with open('%s/%s.html' % (html_dir, file_name), 'wb') as f:
            f.write(pagecontent)

    def run_job(self, job):
        """Process one job: save its HTML page, then its measure record."""
        cvId = job.get('cvId')
        pagecontent = self.get_pagecontent(cvId)
        self.save_html(cvId, pagecontent)
        measure_data = self.cv_measure_store.get_one(cvId)
        self.save(job, measure_data)
        # Single-argument print(...) emits the same output on Python 2 and 3.
        print("SUCCESS COPIED %s" % cvId)

    def end_operation(self, *args, **kwargs):
        """Print a separator banner; all arguments are ignored."""
        print("***********************************" * 2)
class ETLDispatcher(ETLDispatcherBase):
    """Dispatcher that loads page records and feeds page contents to a queue.

    Relies on attributes not assigned in this class (``process_item``,
    ``cv_page_store``, ``queue``, ``channel``, ``bin_file_location``,
    ``getPageContent``); presumably provided by ETLDispatcherBase -- confirm.
    """

    def __init__(self, channel, q):
        """Initialise the base dispatcher and open the 'raw' stage store."""
        ETLDispatcherBase.__init__(self, channel, q)
        self.cv_raw_store = CVRawStore(self.channel, stage='raw')

    def fill_data_with_flag(self, indexUrl, realUrl, contentSign, updateTime,
                            filePath, flag):
        """Record one item's metadata in ``self.process_item`` under *indexUrl*.

        An existing entry with the same *indexUrl* is overwritten. A progress
        line is printed whenever the number of stored items is a multiple of
        10000.
        """
        self.process_item[indexUrl] = {
            'updateTime': updateTime,
            'contentSign': contentSign,
            'realUrl': realUrl,
            'filePath': filePath,
            'flag': flag,
        }
        loaded = len(self.process_item)
        if loaded % 10000 == 0:
            print("load items: %d" % loaded)

    def check_and_put(self, item):
        """Extract the tracked fields from *item* and store them with flag 0.

        Fields missing from *item* are stored as None.
        """
        self.fill_data_with_flag(
            item.get('indexUrl'),
            item.get('realUrl'),
            item.get('contentSign'),
            item.get('updateTime'),
            item.get('pageContentPath'),
            0,
        )

    def real_dispatcher(self, from_which):
        """Load and dispatch items from the source named by *from_which*.

        Only ``'db'`` is supported.

        Raises:
            Exception: if *from_which* is anything other than ``'db'``.
        """
        if from_which != 'db':
            # NOTE: a 'file' source (dispatcher_from_file) was sketched in the
            # original code but left disabled.
            raise Exception("unknown from_which")
        self.load_data()
        self.dispatcher_from_db()

    def exist_in_raw(self, indexUrl):
        """Return True if the raw store yields a truthy record for *indexUrl*."""
        return bool(self.cv_raw_store.get_one(indexUrl))

    def load_data(self):
        """Load every record of the page store into ``self.process_item``."""
        for item in self.cv_page_store.get_all():
            # NOTE: skipping items already present in the raw store
            # (exist_in_raw) was disabled in the original code.
            self.check_and_put(item)
        print("============= totally load %d items ===============" % len(self.process_item))

    def dispatcher_from_db(self):
        """Queue the page content of every loaded item, then queue ``None``.

        Each queued job is a dict with ``indexUrl``, ``pagecontent``,
        ``updateTime`` and ``contentSign``. Progress is printed every 10000
        items. The trailing ``None`` is presumably an end-of-stream marker
        for consumers -- confirm.
        """
        total_cnt = len(self.process_item)
        done = 0
        for index_url in self.process_item:
            # Bind the entry once instead of repeating the dict lookup.
            entry = self.process_item[index_url]
            pagecontent = self.getPageContent(entry.get("filePath"),
                                              self.bin_file_location)
            self.queue.put({
                'indexUrl': index_url,
                'pagecontent': pagecontent,
                'updateTime': entry['updateTime'],
                'contentSign': entry['contentSign'],
            })
            done += 1
            if done % 10000 == 0:
                # Divide in floating point: under Python 2, i*100/total_cnt is
                # integer division and would truncate the percentage.
                print("processed %f%%" % (done * 100.0 / total_cnt))
        self.queue.put(None)