Example #1
    def test_parse_line(self):
        i = util.parse_line('acc +36')
        self.assertEqual(i[0], 'acc')
        self.assertEqual(i[1], 36)
        i = util.parse_line('acc +36\n')
        self.assertEqual(i[0], 'acc')
        self.assertEqual(i[1], 36)
        i = util.parse_line('acc -36')
        self.assertEqual(i[0], 'acc')
        self.assertEqual(i[1], -36)
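A minimal sketch of the util.parse_line these assertions imply (an assumption, not the project's actual code): split an instruction such as 'acc +36' into an (opcode, signed integer) pair.

def parse_line(line):
    # 'acc +36\n' -> ('acc', 36); int() accepts the explicit +/- sign
    op, arg = line.split()
    return op, int(arg)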
Example #2
    def test_parse_line(self):
        self.assertEqual(
            ('posh chartreuse', (2, 'faded blue'), (2, 'pale plum'),
             (2, 'posh coral')),
            util.parse_line(
                'posh chartreuse bags contain 2 faded blue bags, 2 pale plum bags, 2 posh coral bags.'
            ))

        self.assertEqual(
            ('clear purple', (4, 'bright red')),
            util.parse_line('clear purple bags contain 4 bright red bags.'))
        self.assertEqual(
            ('shiny crimson', ),
            util.parse_line('shiny crimson bags contain no other bags.'))
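A hypothetical reconstruction of the parse_line this test exercises, matching the luggage-rule tuples above:

import re

def parse_line(line):
    # 'X bags contain 2 Y bags, ...' -> ('X', (2, 'Y'), ...);
    # a rule ending in 'no other bags.' yields just ('X',)
    outer, contents = line.split(' bags contain ')
    inner = [(int(n), color)
             for n, color in re.findall(r'(\d+) (\w+ \w+) bags?', contents)]
    return (outer,) + tuple(inner)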
Example #3
def _downtime(bot, message, channel):
  drinks = ['a beer', 'a scotch', 'a bloody mary', 'a nice glass of wine', 'F****N FOUR LOKO', 'a crisp cider']
  action_string = "\001ACTION "  # \001 marks an IRC CTCP ACTION ("/me")
  user = parse_line(message).user
  if user.lower() == "george" or "thorogood" in user.lower():
    bot.say(channel, action_string + 'gets ' + user + ' one bourbon, one scotch, one beer' + "\001\n")
  else:
    bot.say(channel, action_string + 'gets ' + user + ' ' + random.choice(drinks) + "\001\n")
Example #4
from collections import OrderedDict

def parse_reverse_replacements(data):
    # parse_line yields a (find, replacement) pair; [::-1] flips it so the
    # replacement becomes the lookup key
    reverse_replacements = dict(parse_line(line)[::-1] for line in data)

    # longest keys first, so greedy consumers try the most specific match
    ordered_reverse_replacements = OrderedDict()
    for key in sorted(reverse_replacements, key=len, reverse=True):
        ordered_reverse_replacements[key] = reverse_replacements[key]

    return ordered_reverse_replacements
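A usage sketch, assuming each input line has the 'find => replacement' shape that the [::-1] flip suggests (hypothetical parse_line and data):

def parse_line(line):
    return tuple(line.strip().split(' => '))

data = ['H => HO', 'H => OH', 'O => HH']
print(parse_reverse_replacements(data))
# OrderedDict([('HO', 'H'), ('OH', 'H'), ('HH', 'O')]) -- ties keep input order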
Example #5
def reflect_reload(bot, message, channel):
    logger = bot.logger
    parsed = parse_line(message)
    # for k, v in parsed.__dict__.items():
    #     print(k, ": ", v)
    body = message.split(":")[2:]
    recomposed = "".join(body)
    if recomposed.startswith(".reflect reload"):
        for m in recomposed.split()[2:]:
            if m in sys.modules:
                try:
                    imp.reload(sys.modules[m])
                    bot.brain.notice(channel, 'reloaded ' + m)
                    logger.write(
                        Logger.INFO,
                        bot.getName() + ": reloaded " + m,
                        bot.NICK)
                except ImportError as e:
                    bot.brain.notice(channel, 'failed to reload ' + m)
                    logger.write(
                        Logger.CRITICAL,
                        bot.getName() + ": failed to reload " + m,
                        bot.NICK)
                    logger.write(
                        Logger.CRITICAL,
                        bot.getName() + ": error was:",
                        bot.NICK)
                    logger.write(
                        Logger.CRITICAL,
                        bot.getName() + ": " + str(e),
                        bot.NICK)
                    return
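imp.reload works, but the imp module has been deprecated since Python 3.4; importlib provides the same call:

import importlib
import json  # stands in for any already-imported module

json = importlib.reload(json)  # re-executes the module's code and returns it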
Example #6
    def handle(self, event):
        self.bot.bare_send("WHOIS " + self.rets_current_nick)
        idle_time = 0
        for line in list(self.bot.recent_lines):
            try:
                # 317 is RPL_WHOISIDLE; token 4 is the idle time in seconds
                if int(util.parse_line(line).message_number) == 317:
                    idle_time = int(line.split()[4])
            except Exception:  # skip lines that are not well-formed replies
                pass

        since = datetime.datetime.now() - datetime.timedelta(seconds=idle_time)
        self.say(
            event.channel, self.rets_current_nick + " has been idle since " +
            since.strftime("%Y-%m-%d %H:%M:%S"))
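For reference, the RPL_WHOISIDLE (317) reply being scanned looks roughly like this, which is why token 4 is the idle time (server and nick names illustrative):

line = ":irc.example.net 317 mybot target 1042 1609459200 :seconds idle, signon time"
print(line.split()[4])  # -> '1042', the idle time in seconds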
Example #7
def _downtime(bot, message, channel):
    drinks = [
        'a beer', 'a scotch', 'a bloody mary', 'a nice glass of wine',
        'F****N FOUR LOKO', 'a crisp cider',
        'a margarita from BoneKin and Jennos\'s house'
    ]
    action_string = "\001ACTION "  # \001 marks an IRC CTCP ACTION ("/me")
    user = parse_line(message).user
    if user.lower() == "george" or "thorogood" in user.lower():
        bot.say(
            channel, action_string + 'gets ' + user +
            ' one bourbon, one scotch, one beer' + "\001\n")
    else:
        bot.say(
            channel, action_string + 'gets ' + user + ' ' +
            random.choice(drinks) + "\001\n")
Example #8
def make(path):
    width = 0
    height = 0

    # generate map: first line is "width height", then `height` row clues,
    # then `width` column clues
    count = 0
    rows = []
    columns = []

    with open(path, "r") as f:
        for line in f:
            l = util.parse_line(line)
            if count == 0:
                width = int(l[0])   # columns
                height = int(l[1])  # rows
                print(l)
            elif count <= height:
                row = [int(segment) for segment in l]
                rows.append(row)
                print(row)
            else:
                col = [int(segment) for segment in l]
                columns.append(col)
                print(col)
            count += 1

    bitmap = util.get_bitmap_vector(max(width, height))
    print()

    gac = GAC()
    # generate variables
    variables = gen_variables(rows, columns, width, height, bitmap)
    for var in variables:
        gac.add_variable(var)

    # generate constraints
    gac.constraints = gen_constraints(width, height)

    return gac, width, height
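The reader above implies an input file of this shape (a hypothetical 3-wide, 2-high puzzle: a 'width height' line, then height row-clue lines, then width column-clue lines):

3 2
1 1
3
1
2
1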
Example #10
def reflect_reload(bot, message, channel):
  logger = bot.logger
  parsed = parse_line(message)
  #for k,v in parsed.__dict__.iteritems():
  #  print k, ": ", v
  body = message.split(":")[2:]
  recomposed = "".join(body)
  if recomposed.startswith(".reflect reload"):
    for m in recomposed.split()[2:]:
      if m in sys.modules:
        try:
          reload(sys.modules[m])
          bot.brain.notice(channel, 'reloaded ' + m)
          logger.write(Logger.INFO, bot.getName() + ": reloaded " + m, bot.NICK)
        except ImportError as e:
          bot.brain.notice(channel, 'failed to reload ' + m)
          logger.write(Logger.CRITICAL, bot.getName() + ": failed to reload " + m, bot.NICK)
          logger.write(Logger.CRITICAL, bot.getName() + ": error was:", bot.NICK)
          logger.write(Logger.CRITICAL, bot.getName() + ": " + str(e), bot.NICK)
          return
Example #11
def reload(bot, message, channel):
    logger = bot.logger
    parsed = parse_line(message)
    if parsed.startswith(".snippets reload"):
        try:
            bot.load_snippets()
            bot.set_snippets()
            bot.brain.notice(channel, "snippets reloaded")
        except BaseException:
            e = sys.exc_info()[1]  # the exception instance itself
            logger.write(
                Logger.WARNING,
                bot.getName() + ": failed to reload snippets",
                bot.NICK)
            logger.write(
                Logger.WARNING,
                bot.getName() + ": error was:",
                bot.NICK)
            logger.write(Logger.WARNING, bot.getName() + ": " + str(e), bot.NICK)
Example #12
def main():
    # walk every file under the dataset folder
    count = 0
    size = 0
    save_file = open(save_file_path, 'w')
    for root, dirs, file_names in os.walk(dataset_folder_path):
        if len(dirs) != 0:
            continue
        print root, dirs, file_names
        for file_name in file_names:
            if file_name == '.DS_Store':  # skip macOS metadata files
                continue
            file_path = join(root, file_name)
            # open file
            size += getsize(file_path)
            if size < 5:
                continue

            save_file.write('###################### %s' % file_name + os.linesep)
            with open(file_path, 'r') as f:
                count += 1
                for line in f:
                    line = line.rstrip()

                    if line == '':
                        continue

                    # print line
                    new_line = parse_line(line)
                    # save line
                    save_file.write(new_line + os.linesep)
            # new-article separator

    save_file.close()

    print 'count: {}'.format(count)
    print 'size: {}'.format(size)
    print 'size (m): {}'.format(size / (1024 * 1024))

Example #13
    def get(self):
        tx = self.request.get('tx')
        response = confirm(tx)
        if not response.startswith('SUCCESS'):
            print "Suspicious: invalid autoreturn"
            self.response.write("Suspicious: invalid autoreturn")
            return
        params = util.parse_line(response)
        if not util.check_transaction(params):
            return

        name, email = params['custom'].split('|')
        name = name.replace('+', ' ')
        #assume email is already vetted
        if not my_db.get(email):
            user = my_db.new_user(name, email, params['txn_id'])
            user.put()
        passwordReset = my_db.new_reset(email)
        passwordReset.put()

        #add password reset key
        html2 = html.password_reset(key=str(passwordReset.key()), email=email)
        self.response.write(html2)
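For context, the PayPal PDT confirmation body handed to util.parse_line here starts with SUCCESS followed by key=value lines, roughly (illustrative values):

SUCCESS
txn_id=61E67681CH3238416
payment_status=Completed
custom=Jane+Doe|jane@example.com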
Example #14
def extract_items_features(train_file_path, begin_date, end_date):
    # sort the training file by item id
    generate_sortedfile(train_file_path, "sorted_by_item-" + train_file_path, 1)

    train_file = open("sorted_by_item-" + train_file_path)
    items_feat_file_path = "./feature/" + begin_date + "-" + end_date + "-itemfeat.csv"
    items_features_file = open(items_feat_file_path, 'w')

    userful_feat_file_path = "./feature/" + begin_date + "-" + end_date + "-itemusefulfeat.csv"
    useful_features_file = open(userful_feat_file_path, 'w')

    tmp_features = {}
    initial_tmp_features(tmp_features)

    # write the header row
    other_features = get_other_basic_item_features(tmp_features)  # derive the remaining item features
    item_features, userful_features = merge_features(tmp_features, other_features, item_feat_name, useful_feat_name)
    items_features_file.write("item_id" + "," + get_features_key(item_features) + "\n")
    useful_features_file.write("item_id" + "," + get_features_key(userful_features) + "\n")
    initial_tmp_features(tmp_features)

    global lastdate
    lastdate = datetime.strptime(end_date, "%Y-%m-%d").date() - timedelta(days=1)

    pre_item_id = train_file.readline().split(delimiter)[1]  # item_id of the first line
    train_file.seek(0)
    for line in train_file:
        user_id, item_id, behavior_type, user_geohash, item_category, date = parse_line(line)

        # when the item id changes, flush the accumulated features and reset
        if item_id != pre_item_id:
            other_features = get_other_basic_item_features(tmp_features)  # derive the remaining item features
            item_features, userful_features = merge_features(tmp_features, other_features, item_feat_name, useful_feat_name)
            items_features_file.write(pre_item_id + "," + get_features_key(item_features) + "\n")
            useful_features_file.write(pre_item_id + "," + get_features_key(userful_features) + "\n")
            initial_tmp_features(tmp_features)  # reset the accumulators

        # basic per-behaviour counters for the current item
        xxx_counts = basic_feature_behaviour[behavior_type][0]
        xxx_user = basic_feature_behaviour[behavior_type][1]
        user_xxx_first_time = basic_feature_behaviour[behavior_type][2]
        eachday_xxx_counts = basic_feature_behaviour[behavior_type][3]  # per-day counts (sales, collects, ...)

        tmp_features[xxx_counts] += 1
        tmp_features[xxx_user][user_id] = tmp_features[xxx_user].get(user_id, 0) + 1
        tmp_features[eachday_xxx_counts][date.date()] = tmp_features[eachday_xxx_counts].get(date.date(), 0) + 1

        if behavior_type == 4:  # 4 = purchase
            week = date.strftime("%w")  # weekday of the purchase, "0"-"6"
            tmp_features['buy_week_list'][week] += 1  # weekly purchase distribution

        tmp_features['multibuy_time_list'][user_id].append(date.date())  # for the mean repeat-purchase interval

        # mean collect-to-buy interval (2 = collect/favourite)
        if behavior_type == 2:
            tmp_features['save_to_buy_time'][user_id]['save_time'].append(date.date())

        if behavior_type == 4:
            tmp_features['save_to_buy_time'][user_id]['buy_time'].append(date.date())

        # mean cart-to-buy interval (3 = add-to-cart)
        if behavior_type == 3:
            tmp_features['cart_to_buy_time'][user_id]['cart_time'].append(date.date())

        if behavior_type == 4:
            tmp_features['cart_to_buy_time'][user_id]['buy_time'].append(date.date())

        
        if user_id not in tmp_features[user_xxx_first_time]:
            tmp_features[user_xxx_first_time][user_id] = date.date()

        pre_item_id = item_id

    # flush the final item's features
    other_features = get_other_basic_item_features(tmp_features)
    item_features, userful_features = merge_features(tmp_features, other_features, item_feat_name, useful_feat_name)
    items_features_file.write(pre_item_id + "," + get_features_key(item_features) + "\n")
    useful_features_file.write(pre_item_id + "," + get_features_key(userful_features) + "\n")

    train_file.close()
    items_features_file.close()

    return items_feat_file_path
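This extractor (and the variants in Examples #19 and #20) assumes parse_line splits one Tianchi mobile-recommendation record into six fields; a hypothetical reconstruction:

from datetime import datetime

delimiter = ','

def parse_line(line):
    # user_id,item_id,behavior_type,user_geohash,item_category,time
    user_id, item_id, behavior, geohash, category, time = line.rstrip('\n').split(delimiter)
    return user_id, item_id, int(behavior), geohash, category, datetime.strptime(time, '%Y-%m-%d %H')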
Example #16
import util

input_path = 'elb_log_file.txt'
output_path = 'anonymized_data.txt'

with open(input_path) as fp, open(output_path, "w") as o_f:
    record = ""
    for line in fp:
        stripped_line = line.strip()
        if line == "\n":
            # a blank line terminates the current multi-line record
            parsed_line = util.parse_line(record)
            if len(parsed_line) > 0:
                o_f.write(parsed_line)
                o_f.write('\n')
            record = ""
        else:
            record = record + " " + stripped_line
    # flush the trailing record if the file does not end with a blank line
    if record:
        parsed_line = util.parse_line(record)
        if len(parsed_line) > 0:
            o_f.write(parsed_line + '\n')
Example #17
def main():

    # walk every file in the dataset
    count = 0
    size = 0
    article_count = 0

    # total occurrences of each word across the corpus
    total_word_counter = Counter()

    # number of articles containing each word (document frequency)
    articles_counter = Counter()

    # per-article word counts
    article_word_counter_dict = {}

    for root, dirs, file_names in os.walk(dataset_folder_path):
        if len(dirs) != 0:
            continue
        print root, dirs, file_names

        for file_name in file_names:
            if file_name == '.DS_Store':  # skip macOS metadata files
                continue
            file_path = join(root, file_name)
            # open file
            size += getsize(file_path)

            if size < 5:
                continue

            article_word_counter = Counter()
            article_token_words = []
            with codecs.open(file_path, 'r', encoding='utf-8') as f:
                count += 1
                for line in f:

                    # (whether a word appears in this article is tracked via the set below)
                    line = line.rstrip()

                    if line == '':
                        continue

                    # print line
                    new_line = parse_line(line)

                    # token_words = new_line.split()
                    cleaned_words = remove_words(new_line, stop_words)
                    total_word_counter.update(cleaned_words)

                    article_token_words += cleaned_words

            article_word_counter.update(article_token_words)
            article_word_counter_dict[file_name] = article_word_counter

            # deduplicate: each article counts a word at most once
            article_token_words_set = set(article_token_words)
            articles_counter.update(article_token_words_set)

            article_count += 1

    words_idf = {}
    # compute every word's IDF (inverse document frequency)
    for word, doc_count in articles_counter.iteritems():
        words_idf[word] = math.log(float(article_count) / (doc_count + 1))

    pickle.dump(words_idf, open("./../data/people's_daily_words_idf.pkl", "wb"))

    pickle.dump(article_word_counter_dict, open("./../data/people's_daily_article_word_counter_dict.pkl", "wb"))

    pickle.dump(total_word_counter, open("./../data/people's_daily_word_counter.pkl", "wb"))

    print('words_idf: {}'.format(words_idf))
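The smoothed IDF above is ln(N / (df + 1)), with N articles overall and df articles containing the word; for example, a word appearing in 9 of 100 articles:

import math
print(math.log(100.0 / (9 + 1)))  # ~2.30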
Example #18
    users = set()
    computers = set()
    auth_types = set()
    logon_types = set()

    filenames = os.listdir(dir_path)
    for filename in tqdm(filenames):
        if not re.findall("txt$", filename):
            continue
        filepath = os.path.join(dir_path, filename)
        if not os.path.isfile(filepath):
            continue
        f = open(filepath)

        for line in f:
            try:
                values = parse_line(line)
            except AssertionError:
                os.remove(filepath)
                continue

            src_user, dest_user = values['src_user'], values['dest_user']
            src_comp, dest_comp = values['src_comp'], values['dest_comp']
            auth_type, logon_type = values['auth_type'], values['logon_type']

            for user in [src_user, dest_user]:
                if user and user not in users:
                    users.add(user)
            for computer in [src_comp, dest_comp]:
                if computer and computer not in computers:
                    computers.add(computer)
            if auth_type not in auth_types:
                auth_types.add(auth_type)
            if logon_type not in logon_types:
                logon_types.add(logon_type)
Example #19
def extract_items_features(train_file_path, begin_date, end_date):
    # sort the training file by item id
    print "\n" + begin_date + "---" + end_date + " extracting items features..."
    generate_sortedfile(train_file_path, "temp/sorted_by_item-" + train_file_path.split('/')[-1], 1)

    train_file = open("temp/sorted_by_item-" + train_file_path.split('/')[-1])
    items_feat_file_path = "./feature/" + begin_date + "-" + end_date + "-itemfeat.csv"
    items_features_file = open(items_feat_file_path, 'w')

    tmp_features = {}
    initial_tmp_features(tmp_features)

    # write the header row
    other_features = get_other_basic_item_features(tmp_features)  # derive the remaining item features
    all_features = merge_features(tmp_features, other_features)
    # print get_features_key(all_features)
    items_features_file.write("item_id" + "," + get_features_key(all_features) + "\n")
    initial_tmp_features(tmp_features)

    global lastday
    lastday = datetime.strptime(end_date, "%Y-%m-%d").date() - timedelta(days=1)
    
    pre_item_id = train_file.readline().split(delimiter)[1]  # item_id of the first line
    train_file.seek(0)
    for line in train_file:
        user_id, item_id, behavior_type, user_geohash, item_category, date = parse_line(line)

        # when the item id changes, flush the accumulated features and reset
        if item_id != pre_item_id:
            other_features = get_other_basic_item_features(tmp_features)  # derive the remaining item features
            all_features = merge_features(tmp_features, other_features)
            items_features_file.write(pre_item_id + "," + get_item_features_str(all_features) + "\n")  # flush the current item
            initial_tmp_features(tmp_features)  # reset the accumulators

        # basic per-behaviour counters for the current item
        xxx_counts = basic_feature_behaviour[behavior_type][0]
        xxx_user = basic_feature_behaviour[behavior_type][1]
        user_xxx_first_time = basic_feature_behaviour[behavior_type][2]
        eachday_xxx_counts = basic_feature_behaviour[behavior_type][3]

        tmp_features[xxx_counts] += 1
        tmp_features[xxx_user][user_id] = tmp_features[xxx_user].get(user_id, 0) + 1
        tmp_features[eachday_xxx_counts][date.date()] = tmp_features[eachday_xxx_counts].get(date.date(), 0) + 1
        
        if user_id not in tmp_features[user_xxx_first_time]:
            tmp_features[user_xxx_first_time][user_id] = date.date()

        pre_item_id = item_id

    # flush the final item's features
    other_features = get_other_basic_item_features(tmp_features)
    all_features = merge_features(tmp_features, other_features)
    items_features_file.write(pre_item_id + "," + get_item_features_str(all_features) + "\n")

    train_file.close()
    items_features_file.close()

    print "extract" + begin_date + "---" + begin_date + "items features completed"
    return items_feat_file_path
Example #20
def extract_categorys_features(train_file_path, begin_date, end_date):
    # sort the training file by category id
    print "\n" + begin_date + "---" + end_date + " extracting category features..."
    generate_sortedfile(
        train_file_path,
        "temp/sorted_by_category-" + train_file_path.split('/')[-1], 4)

    train_file = open("temp/sorted_by_category-" +
                      train_file_path.split('/')[-1])
    categorys_feat_file_path = "./feature/" + begin_date + "-" + end_date + "-categoryfeat.csv"
    categorys_features_file = open(categorys_feat_file_path, 'w')

    tmp_features = {}
    initial_tmp_features(tmp_features)

    # write the header row
    other_features = get_other_basic_category_features(tmp_features)  # derive the remaining category features
    all_features = merge_features(tmp_features, other_features)
    # print "category_featurename is :\n",get_features_key(all_features)
    categorys_features_file.write("category_id" + "," +
                                  get_features_key(all_features) + "\n")
    initial_tmp_features(tmp_features)

    global lastday
    lastday = datetime.strptime(end_date,
                                "%Y-%m-%d").date() - timedelta(days=1)

    pre_category_id = train_file.readline().split(delimiter)[
        4]  # category_id of the first line
    train_file.seek(0)
    for line in train_file:
        user_id, item_id, behavior_type, user_geohash, item_category, date = parse_line(
            line)

        # when the category id changes, flush the accumulated features and reset
        if item_category != pre_category_id:
            other_features = get_other_basic_category_features(
                tmp_features)  # derive the remaining category features
            all_features = merge_features(tmp_features, other_features)
            categorys_features_file.write(
                pre_category_id + "," +
                get_category_features_str(all_features) +
                "\n")  # flush the current category
            initial_tmp_features(tmp_features)  # reset the accumulators

        # basic per-behaviour counters for the current category
        xxx_counts = basic_feature_behaviour[behavior_type][0]
        xxx_user = basic_feature_behaviour[behavior_type][1]
        user_xxx_first_time = basic_feature_behaviour[behavior_type][2]
        eachday_xxx_counts = basic_feature_behaviour[behavior_type][3]

        tmp_features[xxx_counts] += 1
        tmp_features[xxx_user][user_id] = tmp_features[xxx_user].get(
            user_id, 0) + 1
        tmp_features[eachday_xxx_counts][date.date(
        )] = tmp_features[eachday_xxx_counts].get(date.date(), 0) + 1

        if user_id not in tmp_features[user_xxx_first_time]:
            tmp_features[user_xxx_first_time][user_id] = date.date()

        pre_category_id = item_category

    # flush the final category's features
    other_features = get_other_basic_category_features(tmp_features)
    all_features = merge_features(tmp_features, other_features)
    categorys_features_file.write(pre_category_id + "," +
                                  get_category_features_str(all_features) +
                                  "\n")

    train_file.close()
    categorys_features_file.close()

    print "extract" + begin_date + "---" + begin_date + "categorys features completed"
    return categorys_feat_file_path
Example #21
    def processline(self, line):
        """
        Grab newline-delineated lines sent to us, and determine what to do with them.
        This function handles our initial low-level IRC stuff, as well; if we haven't joined, it waits for the MOTD message (or message indicating there isn't one) and then issues our own JOIN calls.

        Also immediately passes off PING messages to PONG.

        Args:
        line: string.

        """

        self.recent_lines.appendleft(line)
        if self.DEBUG:
            if os.name == "posix":  # because windows doesn't like the color codes.
                self.debug_print(util.bcolors.OKBLUE + "<< " +
                                 util.bcolors.ENDC + line)
            else:
                self.debug_print("<< " + ": " + line)

        message_number = line.split()[1]

        try:
            first_word = line.split(":", 2)[2].split()[0]
            channel = line.split()[2]
        except IndexError:
            pass
        else:
            if first_word in self.command_function_map:
                self.command_function_map[first_word](self,
                                                      parse_line(line).message,
                                                      channel)

        try:
            for e in self.events_list:
                if e.matches(line):
                    e.notifySubscribers(line)
            # don't bother going any further if it's a PING/PONG request
            if line.startswith("PING"):
                ping_response_line = line.split(":", 1)
                self.pong(ping_response_line[1])
            # pings we respond to directly. everything else...
            else:
                # patch contributed by github.com/thekanbo
                if self.JOINED is False and (message_number == "376"
                                             or message_number == "422"):
                    # wait until we receive end of MOTD before joining, or
                    # until the server tells us the MOTD doesn't exist
                    if not self.chan_list:
                        self.chan_list = self.conf.getChannels(self.network)
                    for c in self.chan_list:
                        self.send(('JOIN ' + c + ' \n').encode())
                    self.JOINED = True

                line_array = line.split()
                user_and_mask = line_array[0][1:]
                usr = user_and_mask.split("!")[0]
                channel = line_array[2]
                try:
                    message = line.split(":", 2)[2]
                    self.brain.respond(usr, channel, message)
                except IndexError:
                    try:
                        message = line.split(":", 2)[1]
                        self.brain.respond(usr, channel, message)
                    except IndexError:
                        print(("index out of range.", line))

        except Exception:
            print("Unexpected error:", sys.exc_info()[0])
            traceback.print_exc(file=sys.stdout)
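Anatomy of a raw IRC line as processline slices it (nick and channel are illustrative):

line = ":alice!alice@host PRIVMSG #chan :.reflect reload util"
print(line.split()[1])                   # 'PRIVMSG' (or a numeric such as '376')
print(line.split()[2])                   # '#chan'
print(line.split(':', 2)[2])             # '.reflect reload util', the message body
print(line.split(':', 2)[2].split()[0])  # '.reflect', the first word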