Example #1
def is_alive(df):
    # Column-name constants (record_date, user_id, power_consumption) are
    # presumably imported via `from config import *` (see Example #5).
    date2 = df[record_date].map(str2time).max()
    date1 = datetime.datetime(date2.year, date2.month, 1).date()
    from dateutil.relativedelta import relativedelta
    # Widen the window back to the start of the month two months earlier.
    date1 -= relativedelta(months=2)
    grouped = DataView(df).filter_by_record_date2(date1, date2)[[
        user_id, power_consumption
    ]].groupby([user_id], as_index=False).mean()
    # A user counts as alive if mean consumption over the window is >= 10.
    alive = grouped[power_consumption].map(lambda x: 0 if x < 10 else 1)
    alive.name = 'is_alive'
    return grouped.join(alive).drop(power_consumption, axis=1)
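The threshold step can also be written as a vectorized comparison instead of a map; a minimal sketch, not from the original project:

    alive = (grouped[power_consumption] >= 10).astype(int)
    alive.name = 'is_alive'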
Example #2
def rise_rate(df):
    from dateutil.relativedelta import relativedelta
    # Mean consumption per user over the latest calendar-month window...
    date1_2 = df[record_date].map(str2time).max()
    date1_1 = datetime.datetime(date1_2.year, date1_2.month, 1).date()
    grouped1 = DataView(df).filter_by_record_date2(
        date1_1, date1_2)[[user_id, power_consumption]].groupby(
            [user_id], as_index=False).mean()
    # ...and over the same window shifted back one month.
    date2_1 = date1_1 - relativedelta(months=1)
    date2_2 = date1_2 - relativedelta(months=1)
    grouped2 = DataView(df).filter_by_record_date2(
        date2_1, date2_2)[[user_id, power_consumption]].groupby(
            [user_id], as_index=False).mean()
    # Month-over-month growth; note the positional pairing, which assumes
    # both windows yield the same users in the same order.
    user_rise_rate = pd.Series(
        list(map(lambda x, y: float(x - y) / y,
                 grouped1[power_consumption], grouped2[power_consumption])))
    user_rise_rate.name = 'user_rise_rate'
    return grouped1[[user_id]].join(user_rise_rate)
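The positional pairing can be avoided by merging the two frames on user_id first. A minimal sketch, assuming power_consumption is a plain column-name string from config:

    merged = pd.merge(grouped1, grouped2, on=user_id, suffixes=('_cur', '_prev'))
    cur, prev = merged[power_consumption + '_cur'], merged[power_consumption + '_prev']
    merged['user_rise_rate'] = (cur - prev) / prev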
Example #3
    def __init__(self, conn, tunnel):
        self.cursor = conn.cursor()

        self.last_detail_table = ''
        self.last_query_table = ''
        self.current_tab = 0
        self.current_page = 0
        self.current_table = ''
        self.store = None
        self.tunnel = tunnel

        # Load only the query window from the Glade UI file and wire its
        # signal handler names to methods on this window.
        builder = Gtk.Builder()
        builder.add_objects_from_file('app.glade', ('winQuery', ))

        builder.connect_signals({
            'on_table_row_selected': self.on_table_row_selected,
            'on_tab_selected': self.on_tab_selected,
            'on_previous_page': self.on_previous_page,
            'on_next_page': self.on_next_page,
            'on_filter_activate': self.on_filter_activate,
            'on_run_query': self.on_run_query,
            'on_data_key_press': self.on_data_key_press
        })

        self.query_tabs = builder.get_object('queryTabs')

        self.ent_data_filter = builder.get_object('entDataFilter')
        self.txt_query = builder.get_object('txtQuery')

        self.list_tables = builder.get_object('listTables')
        self.fetch_tables()

        self.data_view = DataView(builder.get_object('dataTree'), conn)
        self.query_view = QueryView(builder.get_object('queryTree'),
                                    builder.get_object('lblQueryInfo'), conn)

        self.window = builder.get_object('winQuery')
        WindowManager.add_window(self.window)
        self.window.show_all()
Example #4
def user_info_m_p(df):
    # Per-user, per-month statistics of power consumption over the latest
    # month window; column-name constants presumably come from config.
    date2 = df[record_date].map(str2time).max()
    date1 = datetime.datetime(date2.year, date2.month, 1).date()
    grouped = DataView(df).filter_by_record_date2(
        date1, date2)[[user_id, 'month',
                       power_consumption]].groupby([user_id, 'month'],
                                                   as_index=False)
    user_power_mean_m = grouped.mean().rename(
        columns={power_consumption: 'user_power_mean_m_p'})
    user_power_median_m = grouped.median().rename(
        columns={power_consumption: 'user_power_median_m_p'})
    user_power_var_m = grouped.var().rename(
        columns={power_consumption: 'user_power_var_m_p'})
    user_power_max_m = grouped.max().rename(
        columns={power_consumption: 'user_power_max_m_p'})
    user_power_min_m = grouped.min().rename(
        columns={power_consumption: 'user_power_min_m_p'})
    # Merge the five statistics on (user_id, month), then drop the month key.
    return pd.merge(user_power_mean_m, user_power_median_m) \
        .merge(user_power_var_m).merge(user_power_max_m) \
        .merge(user_power_min_m).drop('month', axis=1)
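The five passes over the same groupby can be collapsed into a single agg call. A minimal sketch, not from the original project, that reproduces the same column names:

    stats = (DataView(df).filter_by_record_date2(date1, date2)
             [[user_id, 'month', power_consumption]]
             .groupby([user_id, 'month'])[power_consumption]
             .agg(['mean', 'median', 'var', 'max', 'min'])
             .add_prefix('user_power_').add_suffix('_m_p')
             .reset_index()
             .drop('month', axis=1))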
Example #5
from data_view import DataView
from config import *
import pandas as pd
import numpy as np


def generate_predict_data(n):
    # Build a prediction skeleton for September 2016: one row per user per
    # day for the dataset's 1454 users.
    dates = pd.date_range('2016/9/1', '2016/9/30')
    # NOTE: %-m and %-d (no zero padding) are glibc extensions and fail on
    # Windows.
    dates = list(map(lambda x: x.strftime('%Y/%-m/%-d'), dates)) * 1454
    ids = [i + 1 for i in range(1454)]
    ids = list(map(lambda x: [x] * 30, ids))  # each id repeated for 30 days
    ids = list(np.array(ids).flatten())
    pd.DataFrame({
        'record_date': dates,
        'user_id': ids
    }).to_csv(data_paths.format(n), index=False)


if __name__ == '__main__':
    data_view = DataView(tianchi_power_csv)
    # Slice the full record into the training and feature windows listed in
    # config.
    for i in range(len(date_durations)):
        print('generating data slice {}'.format(i))
        start_date, end_date = date_durations[i].split('-')
        data_view.filter_by_record_date(start_date, end_date).to_csv(
            data_paths.format(str(i)), index=False)
        start_date, end_date = feature_date_durations[i].split('-')
        data_view.filter_by_record_date(start_date, end_date).to_csv(
            feature_data_paths.format(str(i)), index=False)
    generate_predict_data(8)
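The id construction in generate_predict_data can be written more directly with numpy; an equivalent one-liner:

    ids = list(np.repeat(np.arange(1, 1455), 30))  # each user id repeated 30 times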
Example #6

import os
import pickle

import pandas as pd


def gen_all_sentences():
    # Path constants (relevant_ip_path, sentence_path, ...) and gen_sentence
    # come from the surrounding module.
    relevant_ip_files = os.listdir(relevant_ip_path)
    sentences = list()
    ip_sentences = list()
    for each in relevant_ip_files:
        if each.endswith('.csv'):
            df = pd.read_csv(os.path.join(relevant_ip_path, each))
            single_sentence, single_ip_sentence = gen_sentence(df)
            sentences.append(single_sentence)
            ip_sentences.append(single_ip_sentence)
    with open(sentence_path, 'wb') as f:
        pickle.dump(sentences, f)
    with open(ip_sentence_path, 'wb') as f:
        pickle.dump(ip_sentences, f)


if __name__ == '__main__':
    dv = DataView()
    themes = list(dv.theme_set)
    # Index themes from 1 so that 0 stays free (e.g. as a padding value).
    theme_to_int = {c: i + 1 for i, c in enumerate(themes)}
    int_to_theme = {i + 1: c for i, c in enumerate(themes)}
    with open(theme2int_path, 'wb') as f:
        pickle.dump(theme_to_int, f)
    with open(int2theme_path, 'wb') as f:
        pickle.dump(int_to_theme, f)
    print(theme_to_int)
    print(int_to_theme)
    gen_all_sentences()
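A quick sanity check (not in the original script) that the two pickled mappings invert each other:

    for theme, idx in theme_to_int.items():
        assert int_to_theme[idx] == theme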
Example #7
    # frame.name ='user_action_count'
    # action_user = df[[user_label]].join(frame)
    # grouped=action_user.groupby(user_label,as_index=False)
    # d1=grouped[frame.name].sum()
    # normal_user=set(d1[d1[frame.name]<10000][user_label])
    # df=df[df[user_label].isin(normal_user)].reset_index(drop=True)
    # print 'normal_user',len(df)

    df.to_csv(cleaned_path, index=False)


if __name__ == '__main__':
    print('preprocessing data...')
    preprocess(train_raw_path, train_file_path)
    print('loading training data...')
    train_data = DataView(train_file_path)
    item_data = ItemView(item_file_path)

    train_user_list, train_user_set = train_data.user_list(), train_data.user_set()
    train_item_list, train_item_set = train_data.item_list(), train_data.item_set()
    # train_category_list, train_category_set = train_data.category_list, train_data.category_set
    all_item_list, all_item_set = item_data.item_list(), item_data.item_set()
    # all_item_category_list, all_item_category_set = item_data.category_list, item_data.category_set

    print('splitting data...')
    # if not os.path.exists(train_raw_data_path):
    train_raw_data = train_data.filter_by_time(train_feature_start_time,
                                               train_feature_end_time)
    one_week = get_data(train_raw_data, train_feature_end_time)
Example #8
#! /usr/bin/env python2.7
# -*- coding: utf-8 -*-
# File: data_split.py
# Date: 2016-10-14
# Author: Chaos <*****@*****.**>
from data_view import DataView
from config import *


if __name__ == '__main__':
    train_offline_data = DataView(offline_train_file_path)
    test_offline_data = DataView(offline_test_file_path)
    train_online_data = DataView(online_train_file_path)

    # split by user

    train_offline_user_list, train_offline_user_set = train_offline_data.user_list, train_offline_data.user_set
    test_offline_user_list, test_offline_user_set = test_offline_data.user_list, test_offline_data.user_set
    train_online_user_list, train_online_user_set = train_online_data.user_list, train_online_data.user_set
    # "Active" users appear in both the offline and the online training sets.
    active_users = train_offline_user_set & train_online_user_set

    active_user_offline_record = train_offline_data.data[train_offline_data.data[user_label].isin(active_users)]
    active_user_online_record = train_online_data.data[train_online_data.data[user_label].isin(active_users)]
    offline_user_record = train_offline_data.data[~train_offline_data.data[user_label].isin(active_users)]
    online_user_record = train_online_data.data[~train_online_data.data[user_label].isin(active_users)]

    active_user_offline_record.to_csv(active_user_offline_data_path, index=False)
    active_user_online_record.to_csv(active_user_online_data_path, index=False)
    offline_user_record.to_csv(offline_user_data_path, index=False)
    online_user_record.to_csv(online_user_data_path, index=False)
Example #9
def gen_ips_csv():
    dv = DataView()
    df = dv.df
    # ip_group() returns two groupings; only the first is written out here.
    ip_groups, ip_groups2 = ip_group(df)
    # Write one CSV per IP group (the group values themselves are unused).
    for k in ip_groups:
        save_csv(df, ip_groups, k)
Example #10
class QueryWindow(object):
    # GTK window for browsing PostgreSQL tables and running ad-hoc queries,
    # optionally over a tunnel (anything with a stop() method).
    LIMIT_SIZE = 20
    tunnel = None

    def __del__(self):
        if self.tunnel:
            self.tunnel.stop()

    def __init__(self, conn, tunnel):
        self.cursor = conn.cursor()

        self.last_detail_table = ''
        self.last_query_table = ''
        self.current_tab = 0
        self.current_page = 0
        self.current_table = ''
        self.store = None
        self.tunnel = tunnel

        # Load only the query window from the Glade UI file and wire its
        # signal handler names to methods on this window.
        builder = Gtk.Builder()
        builder.add_objects_from_file('app.glade', ('winQuery', ))

        builder.connect_signals({
            'on_table_row_selected': self.on_table_row_selected,
            'on_tab_selected': self.on_tab_selected,
            'on_previous_page': self.on_previous_page,
            'on_next_page': self.on_next_page,
            'on_filter_activate': self.on_filter_activate,
            'on_run_query': self.on_run_query,
            'on_data_key_press': self.on_data_key_press
        })

        self.query_tabs = builder.get_object('queryTabs')

        self.ent_data_filter = builder.get_object('entDataFilter')
        self.txt_query = builder.get_object('txtQuery')

        self.list_tables = builder.get_object('listTables')
        self.fetch_tables()

        self.data_view = DataView(builder.get_object('dataTree'), conn)
        self.query_view = QueryView(builder.get_object('queryTree'),
                                    builder.get_object('lblQueryInfo'), conn)

        self.window = builder.get_object('winQuery')
        WindowManager.add_window(self.window)
        self.window.show_all()

    def fetch_tables(self):
        # List ordinary tables ('r' = relation), skipping system catalogs.
        self.cursor.execute(
            "SELECT relname FROM pg_class WHERE relkind='r' AND relname !~ '^(pg_|sql_)' ORDER BY relname"
        )

        for table in self.cursor.fetchall():
            self.list_tables.add(ListBoxRowWithData(table[0]))

    def on_table_row_selected(self, listbox, row):
        if not row:
            return

        self.ent_data_filter.set_text('')
        self.ent_data_filter.show_all()
        self.current_table = row.data
        self.refresh(row.data)

    def refresh_details(self, table):
        if table == self.last_detail_table:
            return

        self.last_detail_table = table

        print('REFRESHING DETAILS')

    def refresh_data(self, table):
        self.data_view.set_table(table)

    def refresh_query(self, table):
        pass

    def on_data_key_press(self, element, key):
        print(key, key.is_modifier, key.get_keyval())

    def on_run_query(self, el):
        # Run whatever is currently in the query text buffer.
        buf = self.txt_query.get_buffer()
        start, end = buf.get_bounds()
        self.query_view.run_query(buf.get_text(start, end, False))

    def on_previous_page(self, el):
        self.data_view.previous_page()

    def on_next_page(self, el):
        self.data_view.next_page()

    def refresh(self, table):
        # Tab 1 shows table details, tab 0 the data grid, others the query tab.
        if self.current_tab == 1:
            self.refresh_details(table)
        elif self.current_tab == 0:
            self.refresh_data(table)
        else:
            self.refresh_query(table)

    def on_tab_selected(self, notebook, pane, index):
        self.current_tab = index
        self.refresh(self.current_table)

    def on_filter_activate(self, element):
        self.data_view.set_where(element.get_text())
        self.data_view.refresh_data()
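For context, a minimal construction sketch; the psycopg2 driver and the no-tunnel case are assumptions, since the class only requires conn.cursor() and an optional object with a stop() method:

    import psycopg2  # assumed driver

    conn = psycopg2.connect('dbname=mydb user=me')  # hypothetical DSN
    win = QueryWindow(conn, tunnel=None)
    Gtk.main()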