def is_alive(df):
    # Last record date in the data and the first day of the month two months earlier.
    date2 = df[record_date].map(lambda x: str2time(x)).max()
    date1 = datetime.datetime(date2.year, date2.month, 1).date()
    from dateutil.relativedelta import relativedelta
    date1 -= relativedelta(months=+2)
    # Mean power consumption per user over that window.
    grouped = DataView(df).filter_by_record_date2(date1, date2)[[
        user_id, power_consumption
    ]].groupby([user_id], as_index=False).mean()
    # A user counts as "alive" when the mean consumption is at least 10.
    alive = grouped[power_consumption].map(lambda x: 0 if x < 10 else 1)
    alive.name = 'is_alive'
    return grouped.join(alive).drop(power_consumption, axis=1)
def rise_rate(df):
    # Mean consumption per user in the latest month of the data.
    date1_2 = df[record_date].map(lambda x: str2time(x)).max()
    date1_1 = datetime.datetime(date1_2.year, date1_2.month, 1).date()
    grouped1 = DataView(df).filter_by_record_date2(
        date1_1, date1_2)[[user_id, power_consumption]].groupby(
            [user_id], as_index=False).mean()
    # Mean consumption per user in the month before that.
    from dateutil.relativedelta import relativedelta
    date2_1 = date1_1 - relativedelta(months=+1)
    date2_2 = date1_2 - relativedelta(months=+1)
    grouped2 = DataView(df).filter_by_record_date2(
        date2_1, date2_2)[[user_id, power_consumption]].groupby(
            [user_id], as_index=False).mean()
    # Month-over-month growth rate of each user's mean consumption.
    user_rise_rate = pd.Series(
        map(lambda x, y: float(x - y) / y,
            grouped1[power_consumption], grouped2[power_consumption]))
    user_rise_rate.name = 'user_rise_rate'
    return grouped1[[user_id]].join(user_rise_rate)
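# Illustrative helper (not in the original source): assuming this module's
# existing imports (`pd` and the config names such as `user_id`), the two
# per-user feature frames above can be combined into one frame keyed by user.
def merge_alive_and_rise_rate(df):
    # Both frames carry one row per user_id, so an inner merge keeps the users
    # that appear in both time windows.
    return pd.merge(is_alive(df), rise_rate(df), on=user_id)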
def user_info_m_p(df):
    # Per-user, per-month statistics of power consumption over the latest month window.
    date2 = df[record_date].map(lambda x: str2time(x)).max()
    date1 = datetime.datetime(date2.year, date2.month, 1).date()
    grouped = DataView(df).filter_by_record_date2(
        date1, date2)[[user_id, 'month', power_consumption]].groupby(
            [user_id, 'month'], as_index=False)
    user_power_mean_m = grouped.mean()
    user_power_median_m = grouped.median()
    user_power_var_m = grouped.var()
    user_power_max_m = grouped.max()
    user_power_min_m = grouped.min()
    # Rename the aggregated column in each frame so the merges below do not collide.
    user_power_mean_m = user_power_mean_m.rename(
        columns={power_consumption: 'user_power_mean_m_p'})
    user_power_median_m = user_power_median_m.rename(
        columns={power_consumption: 'user_power_median_m_p'})
    user_power_var_m = user_power_var_m.rename(
        columns={power_consumption: 'user_power_var_m_p'})
    user_power_max_m = user_power_max_m.rename(
        columns={power_consumption: 'user_power_max_m_p'})
    user_power_min_m = user_power_min_m.rename(
        columns={power_consumption: 'user_power_min_m_p'})
    return pd.merge(user_power_mean_m, user_power_median_m).merge(user_power_var_m).\
        merge(user_power_max_m).merge(user_power_min_m).drop('month', axis=1)
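# Illustrative alternative (not in the original source): the five separate
# aggregations above can be computed in a single pass with groupby().agg().
# `DataView`, `user_id` and `power_consumption` are assumed to be the same
# names this module already uses.
def user_info_m_p_agg(df, date1, date2):
    window = DataView(df).filter_by_record_date2(date1, date2)
    stats = (window[[user_id, 'month', power_consumption]]
             .groupby([user_id, 'month'])[power_consumption]
             .agg(['mean', 'median', 'var', 'max', 'min'])
             .reset_index())
    stats.columns = [user_id, 'month', 'user_power_mean_m_p',
                     'user_power_median_m_p', 'user_power_var_m_p',
                     'user_power_max_m_p', 'user_power_min_m_p']
    return stats.drop('month', axis=1)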
from data_view import DataView
from config import *
import pandas as pd
import numpy as np


def generate_predict_data(n):
    # Build the empty prediction frame: 30 September-2016 dates for each of the 1454 users.
    dates = pd.date_range('2016/9/1', '2016/9/30')
    dates = list(map(lambda x: x.strftime('%Y/%-m/%-d'), dates)) * 1454
    ids = [i + 1 for i in range(1454)]
    ids = list(map(lambda x: [x] * 30, ids))
    ids = list(np.array(ids).flatten())
    pd.DataFrame({'record_date': dates, 'user_id': ids}).to_csv(
        data_paths.format(n), index=False)


if __name__ == '__main__':
    data_view = DataView(tianchi_power_csv)
    for i in range(len(date_durations)):
        print('generate num{} data'.format(str(i)))
        start_date, end_date = date_durations[i].split('-')
        data_view.filter_by_record_date(start_date, end_date).to_csv(
            data_paths.format(str(i)), index=False)
        start_date, end_date = feature_date_durations[i].split('-')
        data_view.filter_by_record_date(start_date, end_date).to_csv(
            feature_data_paths.format(str(i)), index=False)
    generate_predict_data(8)
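# --- Illustrative check (not part of the original scripts) -------------------
# generate_predict_data(n) above writes one row per (user, day): 1454 users x
# 30 September days. A quick sanity check, reusing the same `data_paths`
# format string from config:
#
#     frame = pd.read_csv(data_paths.format(8))
#     assert len(frame) == 1454 * 30
#     assert frame['user_id'].nunique() == 1454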
def gen_all_sentences():
    # Build one sentence (and one ip sentence) per relevant-IP csv and pickle both lists.
    relevant_ip_files = os.listdir(relevant_ip_path)
    sentences = list()
    ip_sentences = list()
    for each in relevant_ip_files:
        if each[-3:] == 'csv':
            df = pd.read_csv(relevant_ip_path + each)
            single_sentence, single_ip_sentence = gen_sentence(df)
            sentences.append(single_sentence)
            ip_sentences.append(single_ip_sentence)
    with open(sentence_path, 'wb') as f:
        pickle.dump(sentences, f)
    with open(ip_sentence_path, 'wb') as f:
        pickle.dump(ip_sentences, f)


if __name__ == '__main__':
    dv = DataView()
    # Map each theme to an integer id and back; ids start at 1.
    themes = list(dv.theme_set)
    theme_to_int = dict((c, i + 1) for i, c in enumerate(themes))
    int_to_theme = dict((i + 1, c) for i, c in enumerate(themes))
    with open(theme2int_path, 'wb') as f:
        pickle.dump(theme_to_int, f)
    with open(int2theme_path, 'wb') as f:
        pickle.dump(int_to_theme, f)
    print(theme_to_int)
    print(int_to_theme)
    gen_all_sentences()
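# --- Illustrative round trip (not in the original script) --------------------
# theme_to_int / int_to_theme form a reversible encoding (ids start at 1, so 0
# stays free, e.g. for padding). `some_theme_sequence` below is hypothetical:
#
#     encoded = [theme_to_int[t] for t in some_theme_sequence]
#     decoded = [int_to_theme[i] for i in encoded]
#     assert decoded == list(some_theme_sequence)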
    # frame.name = 'user_action_count'
    # action_user = df[[user_label]].join(frame)
    # grouped = action_user.groupby(user_label, as_index=False)
    # d1 = grouped[frame.name].sum()
    # normal_user = set(d1[d1[frame.name] < 10000][user_label])
    # df = df[df[user_label].isin(normal_user)].reset_index(drop=True)
    # print 'normal_user', len(df)
    df.to_csv(cleaned_path, index=False)


if __name__ == '__main__':
    print('preprocessing data...')
    preprocess(train_raw_path, train_file_path)
    print('loading training data...')
    train_data = DataView(train_file_path)
    item_data = ItemView(item_file_path)
    train_user_list, train_user_set = train_data.user_list(), train_data.user_set()
    train_item_list, train_item_set = train_data.item_list(), train_data.item_set()
    # train_category_list, train_category_set = train_data.category_list, train_data.category_set
    all_item_list, all_item_set = item_data.item_list(), item_data.item_set()
    # all_item_category_list, all_item_category_set = item_data.category_list, item_data.category_set
    print('splitting data...')
    # if not os.path.exists(train_raw_data_path):
    train_raw_data = train_data.filter_by_time(train_feature_start_time,
                                               train_feature_end_time)
    one_week = get_data(train_raw_data, train_feature_end_time)
#! /usr/bin/env python2.7
# -*- coding: utf-8 -*-
# File: data_split.py
# Date: 2016-10-14
# Author: Chaos <*****@*****.**>

from data_view import DataView
from config import *

if __name__ == '__main__':
    train_offline_data = DataView(offline_train_file_path)
    test_offline_data = DataView(offline_test_file_path)
    train_online_data = DataView(online_train_file_path)

    # split by user
    train_offline_user_list, train_offline_user_set = train_offline_data.user_list, train_offline_data.user_set
    test_offline_user_list, test_offline_user_set = test_offline_data.user_list, test_offline_data.user_set
    train_online_user_list, train_online_user_set = train_online_data.user_list, train_online_data.user_set

    active_users = train_offline_user_set & train_online_user_set
    active_user_offline_record = train_offline_data.data[
        train_offline_data.data[user_label].isin(active_users)]
    active_user_online_record = train_online_data.data[
        train_online_data.data[user_label].isin(active_users)]
    offline_user_record = train_offline_data.data[
        ~train_offline_data.data[user_label].isin(active_users)]
    online_user_record = train_online_data.data[
        ~train_online_data.data[user_label].isin(active_users)]

    active_user_offline_record.to_csv(active_user_offline_data_path, index=False)
    active_user_online_record.to_csv(active_user_online_data_path, index=False)
    offline_user_record.to_csv(offline_user_data_path, index=False)
    online_user_record.to_csv(online_user_data_path, index=False)
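# --- Illustrative toy split (not part of data_split.py) ----------------------
# The script above partitions records by whether a user appears in both the
# offline and the online training data. The same idea with plain pandas
# ('user_id' here stands in for the repo's `user_label`):
#
#     import pandas as pd
#     offline = pd.DataFrame({'user_id': [1, 2, 3], 'amount': [10, 20, 30]})
#     online = pd.DataFrame({'user_id': [2, 3, 4], 'amount': [5, 6, 7]})
#     active = set(offline['user_id']) & set(online['user_id'])   # {2, 3}
#     active_offline = offline[offline['user_id'].isin(active)]   # users 2, 3
#     pure_offline = offline[~offline['user_id'].isin(active)]    # user 1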
def gen_ips_csv():
    dv = DataView()
    df = dv.df
    ip_groups, ip_groups2 = ip_group(df)
    for k, v in ip_groups.items():
        save_csv(df, ip_groups, k)
class QueryWindow(object):
    LIMIT_SIZE = 20
    tunnel = None

    def __del__(self):
        # Close the tunnel, if one was opened for this connection.
        if self.tunnel:
            self.tunnel.stop()

    def __init__(self, conn, tunnel):
        self.cursor = conn.cursor()
        self.last_detail_table = ''
        self.last_query_table = ''
        self.current_tab = 0
        self.current_page = 0
        self.current_table = ''
        self.store = None
        self.tunnel = tunnel
        # Load the query window from the Glade file and wire up its signals.
        builder = Gtk.Builder()
        builder.add_objects_from_file('app.glade', ('winQuery', ))
        builder.connect_signals({
            'on_table_row_selected': self.on_table_row_selected,
            'on_tab_selected': self.on_tab_selected,
            'on_previous_page': self.on_previous_page,
            'on_next_page': self.on_next_page,
            'on_filter_activate': self.on_filter_activate,
            'on_run_query': self.on_run_query,
            'on_data_key_press': self.on_data_key_press
        })
        self.query_tabs = builder.get_object('queryTabs')
        self.ent_data_filter = builder.get_object('entDataFilter')
        self.txt_query = builder.get_object('txtQuery')
        self.list_tables = builder.get_object('listTables')
        self.fetch_tables()
        self.data_view = DataView(builder.get_object('dataTree'), conn)
        self.query_view = QueryView(builder.get_object('queryTree'),
                                    builder.get_object('lblQueryInfo'), conn)
        self.window = builder.get_object('winQuery')
        WindowManager.add_window(self.window)
        self.window.show_all()

    def fetch_tables(self):
        # List the user tables of the connected PostgreSQL database.
        self.cursor.execute(
            "SELECT relname FROM pg_class WHERE relkind='r' AND relname !~ '^(pg_|sql_)' ORDER BY relname"
        )
        for table in self.cursor.fetchall():
            self.list_tables.add(ListBoxRowWithData(table[0]))

    def on_table_row_selected(self, list, row):
        if not row:
            return
        self.ent_data_filter.set_text('')
        self.ent_data_filter.show_all()
        self.current_table = row.data
        self.refresh(row.data)

    def refresh_details(self, table):
        if table == self.last_detail_table:
            return
        self.last_detail_table = table
        print('REFRESHING DETAILS')

    def refresh_data(self, table):
        self.data_view.set_table(table)

    def refresh_query(self, table):
        pass

    def on_data_key_press(self, element, key):
        print(key, key.is_modifier, key.get_keyval())

    def on_run_query(self, el):
        buffer = self.txt_query.get_buffer()
        start, end = buffer.get_bounds()
        self.query_view.run_query(buffer.get_text(start, end, False))

    def on_previous_page(self, el):
        self.data_view.previous_page()

    def on_next_page(self, el):
        self.data_view.next_page()

    def refresh(self, table):
        # Refresh only the view behind the currently selected tab.
        if self.current_tab == 1:
            self.refresh_details(table)
        elif self.current_tab == 0:
            self.refresh_data(table)
        else:
            self.refresh_query(table)

    def on_tab_selected(self, notebook, pane, index):
        self.current_tab = index
        self.refresh(self.current_table)

    def on_filter_activate(self, element):
        self.data_view.set_where(element.get_text())
        self.data_view.refresh_data()
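# --- Usage sketch (illustrative, not in the original source) -----------------
# QueryWindow expects an open DB-API connection (the fetch_tables query is
# PostgreSQL-specific, so psycopg2 is assumed here) and a tunnel object that
# exposes stop(), or None. The connection parameters below are placeholders:
#
#     import psycopg2
#     from gi.repository import Gtk
#
#     conn = psycopg2.connect(dbname='example', user='postgres', host='localhost')
#     win = QueryWindow(conn, tunnel=None)
#     Gtk.main()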