import torch

last_layer_optimizer_specs = [{'params': ppnet.last_layer.parameters(),
                               'lr': last_layer_optimizer_lr}]
last_layer_optimizer = torch.optim.Adam(last_layer_optimizer_specs)

# train the model
log('start training')
pusher = Pusher(
    train_push_loader,
    prototype_network_parallel=ppnet_multi,
    bank_size=bank_size,
    class_specific=class_specific,
    preprocess_input_function=preprocess_input_function,  # normalize if needed
    prototype_layer_stride=1,
    dir_for_saving_prototypes=img_dir,  # if not None, prototypes will be saved here
    prototype_img_filename_prefix=prototype_img_filename_prefix,
    prototype_self_act_filename_prefix=prototype_self_act_filename_prefix,
    proto_bound_boxes_filename_prefix=proto_bound_boxes_filename_prefix,
    save_prototype_class_identity=True,
    log=log)

if do_initial_push:
    perform_push(pusher, 0)

for epoch in range(num_train_epochs):
    log('epoch: \t{0}'.format(epoch))
    if epoch < num_warm_epochs:
        tnt.warm_only(model=ppnet_multi, log=log)
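The optimizer construction above uses PyTorch's per-parameter-group options: torch.optim.Adam accepts a list of dicts, each carrying its own 'params' and 'lr'. A minimal self-contained sketch of the same pattern, with a toy two-layer model standing in for ppnet (the learning rates here are illustrative, not the project's values):

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(8, 16), nn.Linear(16, 2))

# One parameter group per layer, each with its own learning rate,
# mirroring the last_layer_optimizer_specs construction above.
optimizer = torch.optim.Adam([
    {'params': model[0].parameters(), 'lr': 1e-2},
    {'params': model[1].parameters(), 'lr': 1e-4},
])

for group in optimizer.param_groups:
    print(group['lr'])  # 0.01, then 0.0001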
import json
import sched
import sys
import threading
import time

from py4j.java_gateway import JavaGateway
from simhash import Simhash

import utils                        # project-local: initlog, load_profiles, extract_text
from classifier import Classifier   # project-local; module path assumed
from pusher import Pusher           # project-local; module path assumed


class Controller(threading.Thread):
    def __init__(self, thread_name, event):
        super(Controller, self).__init__()
        self.name = thread_name
        self.threadEvent = event
        self.logger_info = utils.initlog('Console', 'Console.log')
        self.schedule = sched.scheduler(time.time, time.sleep)
        self.profiles_name, profiles = utils.load_profiles('profiles')
        self.related_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets_ids = set()
        self.related_tweets_hash = set()
        self.classifier = Classifier()
        self.ranker = self.load_ranker()
        self.pusher = Pusher()

    def load_ranker(self):
        # The ranker lives in a JVM process exposed through a py4j gateway.
        self.logger_info.info('loading ranker...')
        gateway = JavaGateway()
        ranker = gateway.entry_point
        self.logger_info.info('ranker loaded!')
        return ranker

    def run(self):
        self.logger_info.info('%s is starting...' % self.name)
        self.threadEvent.wait()
        self.logger_info.info('%s is running...' % self.name)
        self.schedule.enter(0, 0, self.dump_schedule, ())
        self.schedule.run()
        self.process()

    def process(self):
        start_day = time.gmtime(time.time()).tm_mday
        for line in sys.stdin:
            tweet_text, tid_origin, tid_retweet, timestamp, tweet_json = utils.extract_text(line)
            # Drop near-duplicates (simhash) and tweets already pushed.
            simhash_value = Simhash(tweet_text).value
            if (simhash_value in self.related_tweets_hash
                    or tid_origin in self.pushed_tweets_ids
                    or tid_retweet in self.pushed_tweets_ids):
                continue
            # classify() returns a profile index and a similarity; '' means no match.
            topic_id, similarity = self.classifier.classify(tweet_text)
            if topic_id == '':
                continue
            tweet_json['similarity'] = similarity
            evaluate_score = self.ranker.predict(json.dumps(tweet_json))
            total_score = similarity * evaluate_score
            if total_score < 0.15:
                continue
            is_pushed = self.pusher.push(evaluate_score, topic_id)
            if is_pushed:
                delivery_time = time.time()
                self.pushed_tweets[topic_id].append(
                    [tid_origin, str(delivery_time)[:10], similarity, total_score, tweet_text])
                self.pushed_tweets_ids.add(tid_retweet)
            struct_time = time.gmtime(float(timestamp[:-3]))  # timestamp is in ms
            utc_time = time.strftime('%Y%m%d', struct_time)
            self.related_tweets[topic_id].append([utc_time, tid_origin, total_score, tweet_text])
            self.related_tweets_hash.add(simhash_value)
            if struct_time.tm_mday != start_day:
                # dump_result expects a file name, so pass the day as a string
                # (the original passed the int, which fails on concatenation).
                self.dump_result(str(start_day))
                start_day = struct_time.tm_mday

    def dump_result(self, file_name):
        self.logger_info.info('saving result...')
        with open('submit/task-b/' + file_name, 'w') as fw:
            for index, records in enumerate(self.related_tweets):
                pid = str(index + 226)
                sorted_records = sorted(records, key=lambda item: -item[2])
                for rank, record in enumerate(sorted_records):
                    if rank >= 100:
                        break
                    fw.write('%s\tMB%s\tQ0\t%s\t%d\t%f\t%s\n'
                             % (record[0], pid, record[1], rank + 1, record[2], 'CSSNA'))
        with open('submit/task-a/' + file_name, 'w') as fw, \
                open('submit/task-a_extr/' + file_name, 'w') as fw_extr:
            for index, records in enumerate(self.pushed_tweets):
                pid = str(index + 226)
                for record in records:
                    fw.write('MB%s\t%s\t%s\tCSSNA\n' % (pid, record[0], record[1]))
                    fw_extr.write('MB%s\t%s\t%s\tCSSNA\t%s\t%s\t%s\n'
                                  % (pid, record[0], record[1], record[2], record[3], record[4]))
        self.related_tweets = [[] for _ in range(225)]
        self.pushed_tweets = [[] for _ in range(225)]
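The Controller above is a thread gated on an Event: run() blocks on threadEvent.wait() until the caller signals that the tweet stream is ready. A hypothetical driver sketch, not code from the project (the thread name is made up; it assumes the py4j ranker gateway, the profiles file, and the submit/ directories are in place):

import threading

if __name__ == '__main__':
    event = threading.Event()
    controller = Controller('controller-1', event)
    controller.start()   # run() now blocks on threadEvent.wait()
    event.set()          # signal that the stream is ready; processing begins
    controller.join()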
# File-based variant of the Controller: replays archived tweet files from a
# directory instead of reading the live stream from stdin.
import datetime
import json
import logging
import os
import sched
import sys
import threading
import time

from py4j.java_gateway import JavaGateway
from simhash import Simhash

import utils                        # project-local: initlog, load_profiles, extract_text
from classifier import Classifier   # project-local; module path assumed
from pusher import Pusher           # project-local; module path assumed


class Controller(threading.Thread):
    def __init__(self, thread_name, event):
        super(Controller, self).__init__()
        self.name = thread_name
        self.threadEvent = event
        self.logger_info = utils.initlog('Console', 'Console.log')
        self.schedule = sched.scheduler(time.time, time.sleep)
        self.profiles_name, profiles = utils.load_profiles('profiles')
        self.related_tweets = [[] for _ in range(len(profiles))]  # the day's related tweets, kept for offline analysis
        self.pushed_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets_ids = set()
        self.related_tweets_hash = set()
        self.classifier = Classifier()
        self.ranker = self.load_ranker()
        self.pusher = Pusher()

    def load_ranker(self):
        self.logger_info.info('loading ranker...')
        gateway = JavaGateway()
        ranker = gateway.entry_point
        self.logger_info.info('ranker loaded!')
        return ranker

    def run(self):
        self.logger_info.info('%s is starting...' % self.name)
        self.threadEvent.wait()
        self.logger_info.info('%s is running...' % self.name)
        # self.schedule.enter(0, 0, self.dump_schedule, ())
        # self.schedule.run()
        self.process()

    def process(self):
        data_file_path = sys.argv[1]
        files = os.listdir(data_file_path)
        files.sort()
        for f in files:
            filename = os.path.join(data_file_path, f)
            logging.info(filename)
            count = 0
            for line in open(filename, 'rb'):
                # Wall clock; the original used time.clock(), which is CPU time
                # on Unix and was removed in Python 3.8.
                start = time.time()
                tweet_text, tid_origin, tid_retweet, timestamp, tweet_json = utils.extract_text(line)
                simhash_value = Simhash(tweet_text).value
                if (simhash_value in self.related_tweets_hash
                        or tid_origin in self.pushed_tweets_ids
                        or tid_retweet in self.pushed_tweets_ids):
                    continue
                topic_id, similarity = self.classifier.classify(tweet_text)
                if topic_id == '':
                    continue
                count += 1
                if count % 10000 == 0:
                    logging.info('%d' % count)
                tweet_json['similarity'] = similarity
                evaluate_score = self.ranker.predict(json.dumps(tweet_json))
                total_score = (evaluate_score ** 0.5) * similarity
                # if total_score < 0.15:
                #     continue
                timestruct = time.gmtime(int(timestamp[:-3]))  # ms -> s
                is_pushed = self.pusher.push(total_score, topic_id, timestruct)
                if is_pushed:
                    # Simulated delivery time: tweet creation time plus processing latency.
                    delivery_time = float(timestamp) / 1000.0 + (time.time() - start)
                    self.pushed_tweets[topic_id].append(
                        [tid_origin, str(delivery_time)[:10], similarity, total_score, tweet_text])
                utc_time = time.strftime('%Y%m%d', timestruct)
                self.related_tweets[topic_id].append(
                    [utc_time, tid_origin, total_score, tid_retweet, timestamp[:-3], tweet_text])
                self.related_tweets_hash.add(simhash_value)
                self.pushed_tweets_ids.add(tid_retweet)
            self.dump_result(f)
            self.pusher = Pusher()  # start the next day's file with a fresh Pusher
        self.logger_info.info('\n=======finished!=======\n')

    def dump_result(self, file_name):
        self.logger_info.info('saving result...')
        with open('submit/task-b/b_submit', 'a') as fw, \
                open('submit/task-b/b_review/B_candidateday_' + file_name[-2:], 'w') as fw_review:
            for index, records in enumerate(self.related_tweets):
                pid = str(index + 226)
                sorted_records = sorted(records, key=lambda item: -item[2])
                for rank, record in enumerate(sorted_records):
                    if rank >= 100:
                        break
                    fw.write('%s\tMB%s\tQ0\t%s\t%d\t%f\t%s\n'
                             % (record[0], pid, record[1], rank + 1, record[2], 'CSSNA'))
                    fw_review.write('%s\tMB%s\tQ0\t%s\t%f\tSNACS\t%s\t%s\t%s\n'
                                    % (record[0], pid, record[1], record[2],
                                       record[3], record[4], record[5]))
        with open('submit/task-a/a_submit', 'a') as fw, \
                open('submit/task-a/a_review', 'a') as fw_review:
            for index, records in enumerate(self.pushed_tweets):
                pid = str(index + 226)
                for record in records:
                    fw.write('MB%s\t%s\t%s\tCSSNA\n' % (pid, record[0], record[1]))
                    fw_review.write('MB%s\t%s\t%s\tCSSNA\t%s\t%s\t%s\n'
                                    % (pid, record[0], record[1], record[2], record[3], record[4]))
        self.related_tweets = [[] for _ in range(225)]  # clear the previous day's related-tweet records
        self.pushed_tweets = [[] for _ in range(225)]

    def dump_schedule(self):
        self.logger_info.info('saving result...')
        utc_time = time.strftime('%Y%m%d', time.gmtime())
        for index, records in enumerate(self.related_tweets):  # the original was missing enumerate()
            pid = str(index + 226)
            with open('profile_MB' + pid, 'w') as fw:
                for record in records:
                    fw.write(utc_time + '\t' + pid + '\tQ0\t' + record + '\n')
        # Clear the previous day's records; 225 profiles (MB226-MB450), where
        # the original inconsistently used range(226).
        self.related_tweets = [[] for _ in range(225)]
        self.schedule.enter(24 * 60 * 60, 0, self.dump_schedule, ())  # fire again in 24 hours

    def detect_tweet_stream(self, year, month, d, h, m, s, ms):
        start = datetime.datetime(year, month, d, h, m, s, ms)
        # total_seconds() rather than .seconds: the latter ignores whole days.
        delta = (start - datetime.datetime.now()).total_seconds()
        self.logger_info.info('waiting seconds: ' + str(delta))
        time.sleep(delta)
        self.logger_info.info('tweet stream is ready')
        return True
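dump_schedule above re-enters itself on the sched queue with a 24-hour delay, which is the standard-library idiom for a recurring job on a sched.scheduler. A self-contained sketch of that re-entry pattern (three one-second ticks standing in for the daily dumps):

import sched
import time

schedule = sched.scheduler(time.time, time.sleep)
ticks = {'n': 0}

def tick():
    ticks['n'] += 1
    print('tick', ticks['n'])
    if ticks['n'] < 3:
        # Re-enter the same action, as dump_schedule does with 24*60*60.
        schedule.enter(1, 0, tick, ())

schedule.enter(0, 0, tick, ())
schedule.run()  # blocks until the event queue is empty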
def log(msg, elapsed=None):
    pusher = Pusher()
    pusher.add("<br />")
    pusher.addstyle("=" * 34)
    pusher.addstyle(msg)
    if elapsed:
        pusher.addstyle("Elapsed time: " + elapsed)
    pusher.addstyle("=" * 34)
    pusher.add("<br />")
    pusher.push()
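The Pusher consumed by log() is not defined anywhere in this dump; judging from the call sites it buffers plain fragments (add), buffers styled fragments (addstyle), and flushes on push(). A hypothetical stub with exactly that surface, only so the snippet runs standalone:

class Pusher:
    """Hypothetical stand-in inferred from log(); not the project's Pusher."""

    def __init__(self):
        self._parts = []

    def add(self, fragment):
        self._parts.append(fragment)

    def addstyle(self, fragment):
        # The real implementation presumably applies styling; here we only buffer.
        self._parts.append(fragment)

    def push(self):
        print(''.join(self._parts))
        self._parts = []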