Example #1
0
    def __init__(self, thread_name, event):
        """Set up the controller thread: logging, a scheduler, per-profile
        result buffers, and the classify/rank/push pipeline components."""
        super(Controller, self).__init__()
        self.name = thread_name
        self.threadEvent = event
        self.logger_info = utils.initlog('Console', 'Console.log')
        self.schedule = sched.scheduler(time.time, time.sleep)

        self.profiles_name, profile_list = utils.load_profiles('profiles')
        n_profiles = len(profile_list)
        # one result bucket per topic profile
        self.related_tweets = [[] for _ in range(n_profiles)]
        self.pushed_tweets = [[] for _ in range(n_profiles)]
        # de-duplication state: pushed tweet ids and seen simhash values
        self.pushed_tweets_ids = set()
        self.related_tweets_hash = set()

        self.classifier = Classifier()
        self.ranker = self.load_ranker()
        self.pusher = Pusher()
Example #2
0
    def process(self):
        """Replay archived tweet files from the directory given in sys.argv[1].

        Each line is de-duplicated (simhash + already-pushed ids), classified
        against the topic profiles, ranked, optionally pushed, and buffered;
        results are dumped once per input file.
        """
        data_file_path = sys.argv[1]
        files = os.listdir(data_file_path)
        files.sort()    # process archives in (name-)chronological order
        for f in files:
            filename = os.path.join(data_file_path, f)
            logging.info(filename)
            count = 0
            # BUG FIX: the original `for line in open(...)` never closed the
            # file; the with-statement guarantees the handle is released.
            with open(filename, 'rb') as tweet_file:
                for line in tweet_file:
                    # NOTE(review): time.clock() was removed in Python 3.8 —
                    # switch to time.perf_counter() if this moves to modern Python.
                    start = time.clock()
                    tweet_text, tid_origin, tid_retweet, timestamp, tweet_json = utils.extract_text(line)
                    simhash_value = Simhash(tweet_text).value
                    # skip near-duplicates and tweets whose ids were already pushed
                    if simhash_value in self.related_tweets_hash or tid_origin in self.pushed_tweets_ids or tid_retweet in self.pushed_tweets_ids:
                        continue

                    topic_id, similarity = self.classifier.classify(tweet_text)
                    if topic_id == '':    # empty id == no matching topic profile
                        continue

                    count += 1
                    if count % 10000 == 0:  logging.info('%d' % count)

                    tweet_json['similarity'] = similarity
                    evaluate_score = self.ranker.predict(json.dumps(tweet_json))
                    total_score = (evaluate_score ** 0.5) * similarity
                    # if total_score < 0.15:
                    #     continue

                    # timestamp is in milliseconds; strip the last 3 digits for seconds
                    timestruct = time.gmtime(int(timestamp[:-3]))
                    is_pushed = self.pusher.push(total_score, topic_id, timestruct)
                    if is_pushed:
                        delivery_time = float(timestamp) / 1000.0 + (time.clock() - start)
                        self.pushed_tweets[topic_id].append([tid_origin, str(delivery_time)[:10], similarity, total_score, tweet_text])

                    utc_time = time.strftime('%Y%m%d', timestruct)
                    self.related_tweets[topic_id].append([utc_time, tid_origin, total_score, tid_retweet, timestamp[:-3], tweet_text])

                    self.related_tweets_hash.add(simhash_value)
                    self.pushed_tweets_ids.add(tid_retweet)
            self.dump_result(f)
            self.pusher = Pusher()    # reset push state for the next file/day
        self.logger_info.info('\n=======finished!=======\n')
Example #3
0
    def __init__(self, thread_name, event):
        """Initialize thread identity, logging, scheduling, and the
        tweet-filtering pipeline state."""
        super(Controller, self).__init__()
        self.name = thread_name
        self.threadEvent = event
        self.logger_info = utils.initlog('Console', 'Console.log')
        self.schedule = sched.scheduler(time.time, time.sleep)

        self.profiles_name, profiles = utils.load_profiles('profiles')
        topic_count = len(profiles)
        # relevant tweets of the current day, stored for offline analysis
        self.related_tweets = [[] for _ in range(topic_count)]
        self.pushed_tweets = [[] for _ in range(topic_count)]
        self.pushed_tweets_ids = set()
        self.related_tweets_hash = set()

        self.classifier = Classifier()
        self.ranker = self.load_ranker()
        self.pusher = Pusher()
Example #4
0
# Optimizer over the last (classification) layer only.
last_layer_optimizer_specs = [
    {'params': ppnet.last_layer.parameters(), 'lr': last_layer_optimizer_lr},
]
last_layer_optimizer = torch.optim.Adam(last_layer_optimizer_specs)

# train the model
log('start training')
pusher = Pusher(
    train_push_loader,
    prototype_network_parallel=ppnet_multi,
    bank_size=bank_size,
    class_specific=class_specific,
    preprocess_input_function=preprocess_input_function,  # normalize if needed
    prototype_layer_stride=1,
    dir_for_saving_prototypes=img_dir,  # prototypes saved here unless None
    prototype_img_filename_prefix=prototype_img_filename_prefix,
    prototype_self_act_filename_prefix=prototype_self_act_filename_prefix,
    proto_bound_boxes_filename_prefix=proto_bound_boxes_filename_prefix,
    save_prototype_class_identity=True,
    log=log,
)

# optionally push prototypes once before any training epoch runs
if do_initial_push:
    perform_push(pusher, 0)

for epoch in range(num_train_epochs):
    log('epoch: \t{0}'.format(epoch))

    if epoch < num_warm_epochs:
        tnt.warm_only(model=ppnet_multi, log=log)
Example #5
0
class Controller(threading.Thread):
    """Stream tweets from stdin, classify them against topic profiles, rank
    them, and buffer pushed / related tweets for TREC-style submission files.
    """

    def __init__(self, thread_name, event):
        super(Controller, self).__init__()
        self.name = thread_name
        self.threadEvent = event
        self.logger_info = utils.initlog('Console', 'Console.log')
        self.schedule = sched.scheduler(time.time, time.sleep)

        self.profiles_name, profiles = utils.load_profiles('profiles')
        # per-topic buffers; index i corresponds to profile MB<i+226>
        self.related_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets_ids = set([])
        self.related_tweets_hash = set([])    # simhash values already seen

        self.classifier = Classifier()
        self.ranker = self.load_ranker()
        self.pusher = Pusher()

    def load_ranker(self):
        """Connect to the Java ranking service (py4j gateway) and return its
        entry point."""
        self.logger_info.info('loading ranker...')
        gateway = JavaGateway()
        ranker = gateway.entry_point
        self.logger_info.info('ranker loaded!')
        return ranker

    def run(self):
        self.logger_info.info('%s is starting...' % self.name)
        self.threadEvent.wait()
        self.logger_info.info('%s is running...' % self.name)
        # NOTE(review): dump_schedule is not defined on this class as shown —
        # confirm it exists elsewhere before running this scheduler.
        self.schedule.enter(0, 0, self.dump_schedule, ())
        self.schedule.run()
        self.process()

    def process(self):
        """Consume the stdin tweet stream: dedup -> classify -> rank -> push,
        flushing results whenever the (UTC) day rolls over."""
        start_day = time.gmtime(time.time()).tm_mday
        for line in sys.stdin:
            tweet_text, tid_origin, tid_retweet, timestamp, tweet_json = utils.extract_text(
                line)
            simhash_value = Simhash(tweet_text).value
            # skip near-duplicates and tweets whose ids were already pushed
            if simhash_value in self.related_tweets_hash or tid_origin in self.pushed_tweets_ids or tid_retweet in self.pushed_tweets_ids:
                continue

            topic_id, similarity = self.classifier.classify(tweet_text)
            if topic_id == '':    # empty id == no matching topic profile
                continue

            tweet_json['similarity'] = similarity
            evaluate_score = self.ranker.predict(json.dumps(tweet_json))
            total_score = similarity * evaluate_score
            if total_score < 0.15:    # relevance threshold
                continue

            is_pushed = self.pusher.push(evaluate_score, topic_id)
            if is_pushed:
                delivery_time = time.time()
                # NOTE(review): topic_id is used as a list index, so
                # classify() presumably returns an int (or '') — confirm.
                self.pushed_tweets[topic_id].append([
                    tid_origin,
                    str(delivery_time)[:10], similarity, total_score,
                    tweet_text
                ])
                self.pushed_tweets_ids.add(tid_retweet)

            # timestamp is in milliseconds; strip the last 3 digits for seconds
            struct_time = time.gmtime(float(timestamp[:-3]))
            utc_time = time.strftime('%Y%m%d', struct_time)
            self.related_tweets[topic_id].append(
                [utc_time, tid_origin, total_score, tweet_text])
            self.related_tweets_hash.add(simhash_value)

            # day rollover: flush the previous day's buffers to disk
            if struct_time.tm_mday != start_day:
                self.dump_result(start_day)
                start_day = struct_time.tm_mday

    def dump_result(self, file_name):
        """Write task-a / task-b submission files and reset per-topic buffers.

        ``file_name`` may be an int (process() passes a day-of-month); it is
        normalized to str here — BUG FIX: the original crashed with TypeError
        on ``'submit/task-b/' + file_name`` when an int was passed.
        """
        file_name = str(file_name)
        self.logger_info.info('saving result...')
        with open('submit/task-b/' + file_name, 'w') as fw:
            for index, records in enumerate(self.related_tweets):
                pid = str(index + 226)
                sorted_records = sorted(records, key=lambda item: -item[2])
                for rank, record in enumerate(sorted_records):
                    if rank >= 100:    # task-b caps at 100 tweets per topic
                        break
                    fw.write('%s\tMB%s\tQ0\t%s\t%d\t%f\t%s\n' %
                             (record[0], pid, record[1], rank + 1, record[2],
                              'CSSNA'))
        with open('submit/task-a/' + file_name, 'w') as fw:
            with open('submit/task-a_extr/' + file_name, 'w') as fw_extr:
                for index, records in enumerate(self.pushed_tweets):
                    pid = str(index + 226)
                    for record in records:
                        fw.write('MB%s\t%s\t%s\tCSSNA\n' %
                                 (pid, record[0], record[1]))
                        fw_extr.write('MB%s\t%s\t%s\tCSSNA\t%s\t%s\t%s\n' %
                                      (pid, record[0], record[1], record[2],
                                       record[3], record[4]))

        # NOTE(review): buffers are reset to 225 topics here but are sized by
        # len(profiles) in __init__ — confirm the profile count is 225.
        self.related_tweets = [[] for _ in range(225)]
        self.pushed_tweets = [[] for _ in range(225)]
Example #6
0
class Controller(threading.Thread):
    """Replay archived tweet files, classify/rank each tweet against topic
    profiles, and write TREC task-a / task-b submission and review files."""

    def __init__(self, thread_name, event):
        super(Controller, self).__init__()
        self.name = thread_name
        self.threadEvent = event
        self.logger_info = utils.initlog('Console', 'Console.log')
        self.schedule = sched.scheduler(time.time, time.sleep)

        self.profiles_name, profiles = utils.load_profiles('profiles')
        # relevant tweets of the current day, stored for offline analysis
        self.related_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets_ids = set([])
        self.related_tweets_hash = set([])

        self.classifier = Classifier()
        self.ranker = self.load_ranker()
        self.pusher = Pusher()

    def load_ranker(self):
        """Connect to the Java ranking service (py4j gateway) and return its
        entry point."""
        self.logger_info.info('loading ranker...')
        gateway = JavaGateway()
        ranker = gateway.entry_point
        self.logger_info.info('ranker loaded!')
        return ranker

    def run(self):
        self.logger_info.info('%s is starting...' % self.name)
        self.threadEvent.wait()
        self.logger_info.info('%s is running...' % self.name)
        # self.schedule.enter(0, 0, self.dump_schedule, ())
        # self.schedule.run()
        self.process()

    def process(self):
        """Walk the archive directory from sys.argv[1] and route every tweet
        through dedup -> classify -> rank -> push, dumping results per file."""
        data_file_path = sys.argv[1]
        files = os.listdir(data_file_path)
        files.sort()    # process archives in (name-)chronological order
        for f in files:
            filename = os.path.join(data_file_path, f)
            logging.info(filename)
            count = 0
            # BUG FIX: the original `for line in open(...)` leaked the file
            # handle; 'with' guarantees it is closed after each file.
            with open(filename, 'rb') as tweet_file:
                for line in tweet_file:
                    # NOTE(review): time.clock() was removed in Python 3.8 —
                    # use time.perf_counter() on modern interpreters.
                    start = time.clock()
                    tweet_text, tid_origin, tid_retweet, timestamp, tweet_json = utils.extract_text(line)
                    simhash_value = Simhash(tweet_text).value
                    # skip near-duplicates and already-pushed tweet ids
                    if simhash_value in self.related_tweets_hash or tid_origin in self.pushed_tweets_ids or tid_retweet in self.pushed_tweets_ids:
                        continue

                    topic_id, similarity = self.classifier.classify(tweet_text)
                    if topic_id == '':    # empty id == no matching topic
                        continue

                    count += 1
                    if count % 10000 == 0:  logging.info('%d' % count)

                    tweet_json['similarity'] = similarity
                    evaluate_score = self.ranker.predict(json.dumps(tweet_json))
                    total_score = (evaluate_score ** 0.5) * similarity
                    # if total_score < 0.15:
                    #     continue

                    # timestamp is in milliseconds; strip last 3 digits for seconds
                    timestruct = time.gmtime(int(timestamp[:-3]))
                    is_pushed = self.pusher.push(total_score, topic_id, timestruct)
                    if is_pushed:
                        delivery_time = float(timestamp) / 1000.0 + (time.clock() - start)
                        self.pushed_tweets[topic_id].append([tid_origin, str(delivery_time)[:10], similarity, total_score, tweet_text])

                    utc_time = time.strftime('%Y%m%d', timestruct)
                    self.related_tweets[topic_id].append([utc_time, tid_origin, total_score, tid_retweet, timestamp[:-3], tweet_text])

                    self.related_tweets_hash.add(simhash_value)
                    self.pushed_tweets_ids.add(tid_retweet)
            self.dump_result(f)
            self.pusher = Pusher()    # reset push state for the next file/day
        self.logger_info.info('\n=======finished!=======\n')

    def dump_result(self, file_name):
        """Append task-a / task-b submission lines, write per-day review
        files, then reset the per-topic buffers."""
        self.logger_info.info('saving result...')
        with open('submit/task-b/b_submit', 'a') as fw:
            with open('submit/task-b/b_review/B_candidateday_' + file_name[-2:], 'w') as fw_review:
                for index, records in enumerate(self.related_tweets):
                    pid = str(index+226)
                    sorted_records = sorted(records, key=lambda item: -item[2])
                    for rank, record in enumerate(sorted_records):
                        if rank >= 100:    # task-b caps at 100 tweets per topic
                            break
                        fw.write('%s\tMB%s\tQ0\t%s\t%d\t%f\t%s\n' % (record[0], pid, record[1], rank+1, record[2], 'CSSNA'))
                        fw_review.write('%s\tMB%s\tQ0\t%s\t%f\tSNACS\t%s\t%s\t%s\n' % (record[0], pid, record[1], record[2], record[3], record[4], record[5]))

        with open('submit/task-a/a_submit', 'a') as fw:
            with open('submit/task-a/a_review', 'a') as fw_review:
                for index, records in enumerate(self.pushed_tweets):
                    pid = str(index+226)
                    for record in records:
                        fw.write('MB%s\t%s\t%s\tCSSNA\n' % (pid, record[0], record[1]))
                        fw_review.write('MB%s\t%s\t%s\tCSSNA\t%s\t%s\t%s\n' % (pid, record[0], record[1], record[2], record[3], record[4]))

        self.related_tweets = [[] for _ in range(225)]    # clear the previous day's related-tweet records
        self.pushed_tweets = [[] for _ in range(225)]


    def dump_schedule(self):
        """Dump per-profile related tweets to 'profile_MB<pid>' files, then
        reschedule itself to run again in 24 hours."""
        self.logger_info.info('saving result...')
        utc_time = time.strftime('%Y%m%d', time.gmtime())
        # BUG FIX: the original iterated `self.related_tweets` directly, which
        # cannot unpack a record list into (index, records) and would lose the
        # topic index; enumerate supplies the index pid is derived from.
        for index, records in enumerate(self.related_tweets):
            pid = str(index+226)
            with open('profile_MB' + pid, 'w') as fw:
                for record in records:
                    # NOTE(review): this concatenation assumes each record is a
                    # string, while process() appends lists — confirm which
                    # producer feeds this path.
                    fw.write(utc_time + '\t' + pid + '\tQ0\t' + record + '\n')
        self.related_tweets = [[] for _ in range(226)]    # clear the previous day's related-tweet records
        self.schedule.enter(24*60*60, 0, self.dump_schedule, ())

    def detect_tweet_stream(self, year, month, d, h, m, s, ms):
        """Sleep until the given wall-clock start time, then report the
        stream as ready."""
        start = datetime.datetime(year, month, d, h, m, s, ms)
        # NOTE(review): timedelta.seconds is always 0..86399 and ignores the
        # .days component — a start time more than a day away (or in the past)
        # yields a wrong wait; consider total_seconds(). Behavior kept as-is.
        delta = (start - datetime.datetime.now()).seconds
        self.logger_info.info('waiting secondes: ' + str(delta))
        time.sleep(delta)
        self.logger_info.info('tweet stream is ready')
        is_ready = True
        return is_ready
Example #7
0
class Controller(threading.Thread):
    """Stream tweets from stdin, classify them against topic profiles, rank
    them, and buffer pushed / related tweets for submission files."""

    def __init__(self, thread_name, event):
        super(Controller, self).__init__()
        self.name = thread_name
        self.threadEvent = event
        self.logger_info = utils.initlog('Console', 'Console.log')
        self.schedule = sched.scheduler(time.time, time.sleep)

        self.profiles_name, profiles = utils.load_profiles('profiles')
        # per-topic buffers; index i corresponds to profile MB<i+226>
        self.related_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets = [[] for _ in range(len(profiles))]
        self.pushed_tweets_ids = set([])
        self.related_tweets_hash = set([])    # simhash values already seen

        self.classifier = Classifier()
        self.ranker = self.load_ranker()
        self.pusher = Pusher()

    def load_ranker(self):
        """Connect to the Java ranking service (py4j gateway) and return its
        entry point."""
        self.logger_info.info('loading ranker...')
        gateway = JavaGateway()
        ranker = gateway.entry_point
        self.logger_info.info('ranker loaded!')
        return ranker

    def run(self):
        self.logger_info.info('%s is starting...' % self.name)
        self.threadEvent.wait()
        self.logger_info.info('%s is running...' % self.name)
        # NOTE(review): dump_schedule is not defined on this class as shown —
        # confirm it exists elsewhere before running this scheduler.
        self.schedule.enter(0, 0, self.dump_schedule, ())
        self.schedule.run()
        self.process()

    def process(self):
        """Consume the stdin tweet stream: dedup -> classify -> rank -> push,
        flushing the buffers whenever the (UTC) day rolls over."""
        start_day = time.gmtime(time.time()).tm_mday
        for line in sys.stdin:
            tweet_text, tid_origin, tid_retweet, timestamp, tweet_json = utils.extract_text(line)
            simhash_value = Simhash(tweet_text).value
            # skip near-duplicates and tweets whose ids were already pushed
            if simhash_value in self.related_tweets_hash or tid_origin in self.pushed_tweets_ids or tid_retweet in self.pushed_tweets_ids:
                continue

            topic_id, similarity = self.classifier.classify(tweet_text)
            if topic_id == '':    # empty id == no matching topic profile
                continue

            tweet_json['similarity'] = similarity
            evaluate_score = self.ranker.predict(json.dumps(tweet_json))
            total_score = similarity * evaluate_score
            if total_score < 0.15:    # relevance threshold
                continue

            is_pushed = self.pusher.push(evaluate_score, topic_id)
            if is_pushed:
                delivery_time = time.time()
                self.pushed_tweets[topic_id].append([tid_origin, str(delivery_time)[:10], similarity, total_score, tweet_text])
                self.pushed_tweets_ids.add(tid_retweet)

            # timestamp is in milliseconds; strip the last 3 digits for seconds
            struct_time = time.gmtime(float(timestamp[:-3]))
            utc_time = time.strftime('%Y%m%d', struct_time)
            self.related_tweets[topic_id].append([utc_time, tid_origin, total_score, tweet_text])
            self.related_tweets_hash.add(simhash_value)

            # day rollover: flush the previous day's buffers to disk
            if struct_time.tm_mday != start_day:
                self.dump_result(start_day)
                start_day = struct_time.tm_mday

    def dump_result(self, file_name):
        """Write task-a / task-b submission files and reset per-topic buffers.

        ``file_name`` may be an int (process() passes a day-of-month); it is
        normalized to str here — BUG FIX: the original crashed with TypeError
        on ``'submit/task-b/' + file_name`` when an int was passed.
        """
        file_name = str(file_name)
        self.logger_info.info('saving result...')
        with open('submit/task-b/'+file_name, 'w') as fw:
            for index, records in enumerate(self.related_tweets):
                pid = str(index+226)
                sorted_records = sorted(records, key=lambda item: -item[2])
                for rank, record in enumerate(sorted_records):
                    if rank >= 100:    # task-b caps at 100 tweets per topic
                        break
                    fw.write('%s\tMB%s\tQ0\t%s\t%d\t%f\t%s\n' % (record[0], pid, record[1], rank+1, record[2], 'CSSNA'))
        with open('submit/task-a/'+file_name, 'w') as fw:
            with open('submit/task-a_extr/'+file_name, 'w') as fw_extr:
                for index, records in enumerate(self.pushed_tweets):
                    pid = str(index+226)
                    for record in records:
                        fw.write('MB%s\t%s\t%s\tCSSNA\n' % (pid, record[0], record[1]))
                        fw_extr.write('MB%s\t%s\t%s\tCSSNA\t%s\t%s\t%s\n' % (pid, record[0], record[1], record[2], record[3], record[4]))

        # NOTE(review): buffers are reset to 225 topics here but are sized by
        # len(profiles) in __init__ — confirm the profile count is 225.
        self.related_tweets = [[] for _ in range(225)]
        self.pushed_tweets = [[] for _ in range(225)]
Example #8
0
def log (msg, elapsed=None):
    """Push *msg* framed by separator rules, optionally with an elapsed time.

    ``elapsed`` is concatenated directly, so it is expected to be a
    pre-formatted string; a falsy value (None or '') suppresses the line.
    """
    rule = "=" * 34
    out = Pusher()
    out.add("<br />")
    out.addstyle(rule)
    out.addstyle(msg)
    if elapsed:
        out.addstyle("Elapsed time: " + elapsed)
    out.addstyle(rule)
    out.add("<br />")
    out.push()