Example #1
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', help='config file', default=None)
    parser.add_argument('-o', '--output_file', dest='output_file', help='output data file', default='data.hdf5')
    parser.add_argument('-r', '--rebound_file', help='Rebound simulation file', default=None)
    parser.add_argument('-t', '--t_end', type=float, dest='t_end', help='Termination time')
    parser.add_argument('-d', '--dt', type=float, dest='dt', help='Integration time step (optional for certain integrators)', default=None)
    parser.add_argument('-s', '--store_dt', type=float, dest='store_dt', help='output time step', default=100)
    parser.add_argument('-i', '--integrator', dest='integrator', help='Name of the integrator [GaussRadau15|WisdomHolman|RungeKutta|AdamsBashForth|LeapFrog|Euler]', default='GaussRadau15')

    args = parser.parse_args()
    abie = ABIE()
    abie.integrator = args.integrator
    if args.output_file is not None:
        abie.output_file = args.output_file
    if args.t_end is not None:
        abie.t_end = args.t_end
    if args.config is not None:
        abie.initialize(DataIO.parse_config_file(args.config))
    elif args.rebound_file is not None:
        # populate the initial conditions from rebound simulation files
        abie.initialize()
        DataIO.ic_populate_from_rebound(args.rebound_file, abie)
    if args.dt is not None:
        abie.h = args.dt
    abie.store_dt = args.store_dt
    abie.integrate()
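
main() above is defined but never invoked; a minimal sketch of the imports and entry point the snippet assumes (the ABIE and DataIO import locations are assumptions inferred from the calls, not confirmed):

# Hypothetical glue code; module paths for ABIE and DataIO are assumptions.
import argparse

from ABIE import ABIE
from data_io import DataIO

if __name__ == '__main__':
    main()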
Example #2
def update_data():
    dataio = DataIO(autoload=False)
    df = dataio.update()
    if df.empty:
        st.sidebar.error("Failed To Update!")
    else:
        st.sidebar.balloons()
        st.sidebar.success("Updated Data, Wrote to disk and Loaded!")
    return df
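
update_data() above reports its status in the Streamlit sidebar; a hypothetical way to wire it to a sidebar button (the button label and the DataIO import path are assumptions):

# Sketch only: trigger update_data() from a sidebar button in a Streamlit app.
import streamlit as st
from data_io import DataIO

if st.sidebar.button("Update data"):
    df = update_data()
    st.write(df.head())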
Example #3
class analysis_dif:
    dataio = DataIO()
    def diff_analysis(self,date,distinct):
        difflist = []
        gaplist = []
        for slice in range(143):
            ts = date+'-'+str(slice+1)
            diffdata = self.dataio.select_orderDiff_by_ds_distinct(ts,distinct)
            gap = self.dataio.select_gap(ts,distinct)
            gaplist.append(float(gap))
            difflist.append(float(diffdata))
            #print(type(diffdata))
        #plt.plot(difflist,'ro-')
        fig = plt.figure()
        ax1 = fig.add_subplot(311)
        fig = sm.graphics.tsa.plot_acf(gaplist, lags=20, ax=ax1)
        ax2 = fig.add_subplot(312)
        fig = sm.graphics.tsa.plot_pacf(gaplist, lags=20, ax=ax2)
        ax3 = fig.add_subplot(313)
        ax3.plot(difflist,'ro-')
        title = date+" Distinct:"+str(distinct)
        plt.title(title)


        #arma_11 = sm.tsa.ARMA(difflist,(1,1)).fit()
        #arma_02 = sm.tsa.ARMA(difflist,(0,2)).fit()
        #arma_01 = sm.tsa.ARMA(gaplist,(1,0)).fit()

        arima = sm.tsa.ARIMA(gaplist,(1,1,0)).fit()

        fig1 = plt.figure(1)
        fig1 = arima.plot_predict()

        plt.show()
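
The positional order tuple passed to sm.tsa.ARIMA above follows the old statsmodels API; on current statsmodels the equivalent fit is keyword-based. A sketch, assuming statsmodels >= 0.12:

# Equivalent ARIMA(1, 1, 0) fit on the newer statsmodels API (version assumption: >= 0.12).
from statsmodels.tsa.arima.model import ARIMA

arima = ARIMA(gaplist, order=(1, 1, 0)).fit()
print(arima.summary())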
Example #4
    def initialize(self, config=None):
        # Initialize the integrator
        self.__integrators = Integrator.load_integrators()
        if self.__integrator is None:
            print('Use GaussRadau15 as the default integrator...')
            self.integrator = 'GaussRadau15'
            self.integrator.initialize()
            self.integrator.acceleration_method = 'ctypes'
        else:
            self.__integrator.CONST_G = self.CONST_G
            self.__integrator.t_end = self.__t_end
            self.__integrator.h = self.__h
            self.__integrator.t_start = self.__t_start
            self.__integrator.output_file = self.output_file
            self.__integrator.store_dt = self.__store_dt
            self.__integrator.buffer_len = self.__buffer_len


        if config is not None:
            # Gravitational parameter
            self.integrator.CONST_G = np.array(config['physical_params']['G'])

            # Integration parameters
            self.integrator = config['integration']['integrator']
            self.integrator.initialize()
            self.integrator.h = float(config['integration']['h'])
            if 'acc_method' in config['integration']:
                self.integrator.acceleration_method = config['integration']['acc_method']
            else:
                self.integrator.acceleration_method = 'ctypes'

            # Load sequence of object names
            if 'names' in config:
                names = config['names']
            else:
                names = None

            # Initial and final times
            if self.integrator.t_start == 0:
                self.integrator.t_start = float(config['integration']['t0'])
            if self.integrator.t_end == 0:
                self.integrator.t_end = float(config['integration']['tf'])
            self.integrator.active_integrator = config['integration']['integrator']
            DataIO.ic_populate(config['initial_conds'], self, names=names)
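
The keys that initialize() reads from its config argument imply a dictionary of roughly the following shape; only the key names are taken from the method above, every value is illustrative:

# Illustrative config; only the key names are grounded in initialize() above.
config = {
    'physical_params': {'G': 1.0},
    'integration': {
        'integrator': 'GaussRadau15',
        'h': 0.01,
        't0': 0.0,
        'tf': 100.0,
        'acc_method': 'ctypes',      # optional; falls back to 'ctypes'
    },
    'names': ['body_0', 'body_1'],   # optional
    'initial_conds': {},             # forwarded to DataIO.ic_populate()
}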
Example #5
 def initialize(self):
     if self.__buf is None:
         self.__buf = DataIO(
             buf_len=self.buffer_len,
             output_file_name=self.output_file,
             close_encounter_output_file_name=self.close_encounter_output_file,
             collision_output_file_name=self.collision_output_file,
             CONST_G=self.CONST_G)
     if self.particles.N > 0:
         # initialize the C library
         self.libabie.initialize_code(
             self.CONST_G,
             self.CONST_C,
             self.particles.N,
             MAX_CE_EVENTS=self.max_close_encounter_events,
             MAX_COLLISION_EVENTS=self.max_collision_events,
             close_encounter_distance=self.close_encounter_distance)
         self.buf.initialize_buffer(self.particles.N)
Example #6
 def __init__(self):
     self.data_io = DataIO()
     self.model = None
     self.x_train = None
     self.y_train = None
     self.x_test = None
     self.y_test = None
     self.num_classes = 0
     self.epochs = 50
     self.batch_size = 8
     self.use_noise = True
     self.distributed_training = False
     self.multi_gpu_training = False
     self._multi_gpu_model = None
     self._n_gpus = 1
     self.callbacks = []
     self.logger = None
     self.log_level = logging.DEBUG
     self.input_shape = (512, 512, 3)  # (256, 256, 3)
     self._t_start = 0
     self._t_end = 0
Example #7
from data_io import DataIO
from sklearn.ensemble import ExtraTreesRegressor
#from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

dio = DataIO("Settings.json")
submission = False
n_trees = 10
min_samples_split = 2
param = """Normal count vector with max 200. New submission which is repeatable.
 and nicer

count_vector_titles = TfidfVectorizer(
    read_column(train_filename, column_name),
    max_features=200, norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)
 """

if submission:
    type_n = "train_full"
    type_v = "valid_full"
else:
    type_n = "train"
    type_v = "valid"


vectorizer = CountVectorizer(
    max_features=200,
)
short_id = "count_200f"
tfidf_columns = ["Title", "FullDescription", "LocationRaw"]
Example #8
op.add_option("--n_features",
              action="store", type=int, default=2 ** 16,
              help="n_features when using the hashing vectorizer.")


(opts, args) = op.parse_args()
if len(args) > 0:
    op.error("this script takes no arguments.")
    sys.exit(1)

print __doc__
op.print_help()
print


dio = DataIO("Settings_loc5.json")
submission = False
n_trees = 10
min_samples_split = 2

if submission:
    type_n = "train_full"
    type_v = "valid_full"
else:
    type_n = "train"
    type_v = "valid"


vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    max_df=0.5,
Example #9
 def buf(self):
     if self.__buf is None:
         self.__buf = DataIO(buf_len=self.buffer_len,
                             output_file_name=self.output_file,
                             CONST_G=self.CONST_G)
     return self.__buf
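
The accessor above is presumably exposed as a read-only property so the DataIO buffer is only created on first use; a minimal sketch of that lazy-initialisation pattern (the class name and constructor defaults are assumptions):

from data_io import DataIO   # import path as used elsewhere on this page


class Simulation:
    def __init__(self, buffer_len=1024, output_file='data.hdf5', CONST_G=1.0):
        self.__buf = None                 # created lazily on first access
        self.buffer_len = buffer_len
        self.output_file = output_file
        self.CONST_G = CONST_G

    @property
    def buf(self):
        if self.__buf is None:
            self.__buf = DataIO(buf_len=self.buffer_len,
                                output_file_name=self.output_file,
                                CONST_G=self.CONST_G)
        return self.__buf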
Example #10
from data_io import DataIO
from sklearn.ensemble import ExtraTreesRegressor
#from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

dio = DataIO("Settings.json")
submission = False
n_trees = 10
min_samples_split = 2
param = """Normal count vector with max 200. New submission which is repeatable.
 and nicer

count_vector_titles = TfidfVectorizer(
    read_column(train_filename, column_name),
    max_features=200, norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)
 """

if submission:
    type_n = "train_full"
    type_v = "valid_full"
else:
    type_n = "train"
    type_v = "valid"

vectorizer = CountVectorizer(max_features=200, )
short_id = "count_200f"
tfidf_columns = ["Title", "FullDescription", "LocationRaw"]
#dio.make_counts(vectorizer, short_id, tfidf_columns, type_n, type_v)

columns = ["Category", "ContractTime", "ContractType"]
Example #11
def tfidf_cloud(n_trees):
    dio = DataIO("/data/Settings_cloud.json")
    submission = False
    min_samples_split = 2
    param = """Normal count vector with max 200. New submission which is repeatable.
    and nicer

    count_vector_titles = TfidfVectorizer(
        read_column(train_filename, column_name),
        max_features=200, norm='l1', smooth_idf=True, sublinear_tf=False, use_idf=True)
    """

    if submission:
        type_n = "train_full"
        type_v = "valid_full"
    else:
        type_n = "train"
        type_v = "valid"

#features = dio.join_features("%s_" + type_n + "_tfidf_matrix_max_f_200",
#["Title", "FullDescription", "LocationRaw"],
#extra_features)
#validation_features = dio.join_features("%s_" + type_v + "_tfidf_matrix_max_f_200",
#["Title", "FullDescription", "LocationRaw"],
#extra_valid_features)

#np.save("train_200f_noNorm_categoryTimeType_tfidfl2_features", features)
#np.save("train_200f_noNorm_categoryTimeType_tfidfl2_valid_features", validation_features)

    def load(filename):
        return joblib.load(path_join("/data", filename))

    features = load("train_200f_noNorm_categoryTimeType_tfidfl1_features_jl")
    validation_features = load(
        "train_200f_noNorm_categoryTimeType_tfidfl1_valid_features_jl")

    print "features", features.shape
    print "valid features", validation_features.shape

    #salaries = dio.get_salaries(type_n, log=True)
    #if not submission:
    #valid_salaries = dio.get_salaries(type_v, log=True)

    #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_salaries", salaries)
    #np.save("train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries", valid_salaries)

    #joblib.dump(salaries, "train_200f_noNorm_categoryTimeType_tfidfl2_salaries_jl", compress=5)
    #joblib.dump(valid_salaries, "train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries_jl", compress=5)

    # TODO: valid salaries were dumped incorrectly

    salaries = load("train_200f_noNorm_categoryTimeType_tfidfl2_salaries_jl")
    valid_salaries = load(
        "train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries_jl")
    dio.is_log = True

    print salaries.shape

    name = "ExtraTree_min_sample%d_%dtrees_200f_noNorm_categoryTimeType_tfidfl1_new_log" % (
        min_samples_split, n_trees)
    print name
    #dio.save_prediction("testni", np.array([1,2,3]), type_n="testno")
    classifier = ExtraTreesRegressor(
        n_estimators=n_trees,
        verbose=2,
        n_jobs=4,  # 2 jobs on submission / 4 on valid test
        oob_score=False,
        min_samples_split=min_samples_split,
        random_state=3465343)

    #dio.save_model(classifier, "testni_model", 99.)
    classifier.fit(features, salaries)
    predictions = classifier.predict(validation_features)
    if submission:
        dio.save_prediction(name, predictions, type_n=type_v)
        dio.write_submission(name + ".csv", predictions=predictions)
    else:
        dio.compare_valid_pred(valid_salaries, predictions)
        metric = dio.error_metric
        mae = metric(valid_salaries, predictions)
        print "MAE validation: ", mae
        dio.save_model(classifier, name, mae)
        dio.save_prediction(name, predictions, type_n=type_v)
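
tfidf_cloud() above loads pre-computed joblib feature dumps from /data; a hypothetical invocation together with the imports the function body relies on (the same imports appear in Example #18 below):

# Assumed imports for the function body above; the joblib dumps under /data must already exist.
import joblib
from os.path import join as path_join

from data_io import DataIO
from sklearn.ensemble import ExtraTreesRegressor

tfidf_cloud(n_trees=10)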
Example #12
from data_io import DataIO
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.cross_validation import cross_val_score
import numpy as np
from sklearn.metrics import mean_absolute_error
from itertools import combinations
from sklearn.linear_model import LinearRegression, RidgeCV, Ridge

dio = DataIO("Settings.json")
submission = False
n_trees = 10
min_samples_split = 2
param = """Normal count vector with max 200. New submission which is repeatable.
 and nicer

 """

if submission:
    type_n = "train_full"
    type_v = "valid_full"
else:
    type_n = "train"
    type_v = "valid"

#columns = ["Category", "ContractTime", "ContractType"]
#le_features = dio.get_le_features(columns, "train_full")
#extra_features = dio.get_features(columns, type_n, le_features)
#extra_valid_features = dio.get_features(columns, type_v, le_features)
#features = dio.join_features("%s_" + type_n + "_tfidf_matrix_max_f_200",
#["Title", "FullDescription", "LocationRaw"],
#extra_features)
Example #13
from data_io import DataIO

data = DataIO(autoload=False)
df = data.update()
print(df.shape)
Example #14
from data_io import DataIO
import numpy as np
import matplotlib.pyplot as plt
dio = DataIO("Settings.json")

model_names = [
    "ExtraTree_min_sample2_40trees_200f_noNorm_categoryTimeType_log",
    "vowpall",
    #"ExtraTree_min_sample2_10trees_200f_noNorm_categoryTimeType_count_fake_14split_new_log",
    #"ExtraTree_min_sample2_10trees_200f_noNorm_categoryTimeType_count_fake_split_new_log",
    #"ExtraTree_min_sample2_10trees_200f_noNorm_categoryTimeType_count_rf10_4split_new1_log"
    #"ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_rf10_4split_newOKsalPredictValid_log",
    "ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_exTre20_4split_new_faked_log",
    #"ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_exTre20_4split_newOKsalPredictValid_log",
    #"ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_exTre20_4split_newPredictsalPredictValid_log",
    "Ridge_tfidf_05d_log"
    #"ExtraTree_min_sample2_20trees_200f_noNorm_categoryTimeType_count_exTre20_4split_newPredictsalPredictValid1_log",
]

valid_salaries = dio.get_salaries("valid", log=False)

ylim = (0, 8000)
xlim = (-50000, 50000)
grid = True

def encode_salaries(salaries, bins):
    bin_edges = np.linspace(11500.0, 100000, bins + 1, endpoint=True)
    #hist, bin_edges = np.histogram(salaries, bins)
    #bin_edges = list(bin_edges)
    #bin_edges.insert(0, 0)
    #bin_edges.append(salaries.max() + 1)
Example #15
from data_io import DataIO
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.base import clone
from sklearn.cross_validation import cross_val_score
import numpy as np

dio = DataIO("Settings.json")

title_corpus = dio.read_gensim_corpus("train_title_nltk_filtered.corpus.mtx")
pca = RandomizedPCA(random_state=3465343)
salaries = dio.get_salaries("train", log=True)

columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, "train", le_features)
#extra_valid_features = dio.get_features(columns, "valid", le_features)

param = "RandomizedPCA title 200 Fulldescription 200 " + ",".join(columns)
print map(len, extra_features)
extra_features = map(lambda x: np.reshape(np.array(x),(len(x),1)),extra_features)



print type(title_corpus)
print title_corpus.shape


title_pca = clone(pca)
title_pca.set_params(n_components=200)
title_corpus_pca = title_pca.fit_transform(title_corpus)
Example #16
    def __init__(self):
        self.data_io = DataIO()
        self.input = None
        self.content_data = None
        self.execute_data, self.prior_1, self.prior_2, self.leave = None, None, None, None
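        # The UI labels below are Vietnamese: "Phân tích" = "Analyze",
        # "Tổng quát" = "Overview", "Nhóm Ưu tiên 1/2" = "Priority group 1/2",
        # "Dự đoán rời mạng" = "Churn prediction", "Toàn bộ các nhóm" = "All groups",
        # "Biểu đồ" = "Chart", "Ket qua" = "Result".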
        wx.Frame.__init__(self,
                          None,
                          wx.ID_ANY,
                          "Customer Segmentation",
                          size=(800, 640))

        self.panel = wx.Panel(self, wx.ID_ANY)
        status = self.CreateStatusBar()

        self.sizer = wx.BoxSizer(wx.VERTICAL)

        file_label = wx.StaticText(self.panel, -1, "Input:", (30, 20))
        self.InputPathTextBox = wx.TextCtrl(self.panel,
                                            -1,
                                            "",
                                            size=(200, -1),
                                            pos=(100, 21))
        self.browse_btn = wx.Button(self.panel, -1, "Browse", pos=(310, 20))
        self.Bind(wx.EVT_BUTTON, self.onOpenFile, self.browse_btn)

        # output_label = wx.StaticText(self.panel, -1, "Ket qua:", (30, 50))
        # self.OutputPathTextBox = wx.TextCtrl(self.panel, -1, "", size=(200, -1), pos=(100, 47))

        self.execute_btn = wx.Button(self.panel,
                                     -1,
                                     "Phân tích",
                                     pos=(400, 20))
        self.Bind(wx.EVT_BUTTON, self.onExecute, self.execute_btn)
        self.execute_btn.Disable()

        inform_label = wx.StaticText(self.panel, -1, "Tổng quát:", (30, 60))
        self.inform_txt = wx.TextCtrl(self.panel,
                                      -1,
                                      "",
                                      style=wx.TE_MULTILINE | wx.TE_READONLY,
                                      size=(463, 70),
                                      pos=(100, 60))

        self.prior1_btn = wx.Button(self.panel,
                                    -1,
                                    "Nhóm Ưu tiên 1",
                                    pos=(98, 140))
        self.Bind(wx.EVT_BUTTON, self.onOpenPrior1, self.prior1_btn)
        self.prior1_btn.Disable()

        self.prior2_btn = wx.Button(self.panel,
                                    -1,
                                    "Nhóm Ưu tiên 2",
                                    pos=(210, 140))
        self.Bind(wx.EVT_BUTTON, self.onOpenPrior2, self.prior2_btn)
        self.prior2_btn.Disable()

        self.leave_btn = wx.Button(self.panel,
                                   -1,
                                   "Dự đoán rời mạng",
                                   pos=(320, 140))
        self.Bind(wx.EVT_BUTTON, self.onOpenLeave, self.leave_btn)
        self.leave_btn.Disable()

        self.total_btn = wx.Button(self.panel,
                                   -1,
                                   "Toàn bộ các nhóm",
                                   pos=(443, 140))
        self.Bind(wx.EVT_BUTTON, self.onOpenTotal, self.total_btn)
        self.total_btn.Disable()

        self.chart_btn = wx.Button(self.panel, -1, "Biểu đồ", pos=(580, 140))
        self.Bind(wx.EVT_BUTTON, self.onOpenChart, self.chart_btn)
        self.chart_btn.Disable()

        self.save_btn = wx.Button(self.panel, -1, "Export", pos=(680, 140))
        self.Bind(wx.EVT_BUTTON, self.onSaveFile, self.save_btn)
        self.save_btn.Disable()

        self.data_grid = grid.Grid(self.panel, size=(780, 400), pos=(5, 180))
        self.data_grid.CreateGrid(20, 10)

        self.panel.SetSizer(self.sizer)
Example #17
def tfidf_cloud(n_trees):
    dio = DataIO("/data/Settings_cloud.json")
    submission = False
    min_samples_split = 2
    param = """Normal count vector with max 200. New submission which is repeatable.
    and nicer

    count_vector_titles = TfidfVectorizer(
        read_column(train_filename, column_name),
        max_features=200, norm='l1', smooth_idf=True, sublinear_tf=False, use_idf=True)
    """

    if submission:
        type_n = "train_full"
        type_v = "valid_full"
    else:
        type_n = "train"
        type_v = "valid"



#features = dio.join_features("%s_" + type_n + "_tfidf_matrix_max_f_200",
                                #["Title", "FullDescription", "LocationRaw"],
                                #extra_features)
#validation_features = dio.join_features("%s_" + type_v + "_tfidf_matrix_max_f_200",
                                            #["Title", "FullDescription", "LocationRaw"],
                                            #extra_valid_features)

#np.save("train_200f_noNorm_categoryTimeType_tfidfl2_features", features)
#np.save("train_200f_noNorm_categoryTimeType_tfidfl2_valid_features", validation_features)
    def load(filename):
        return joblib.load(path_join("/data", filename))

    features = load("train_200f_noNorm_categoryTimeType_tfidfl1_features_jl")
    validation_features = load("train_200f_noNorm_categoryTimeType_tfidfl1_valid_features_jl")

    print "features", features.shape
    print "valid features", validation_features.shape

#salaries = dio.get_salaries(type_n, log=True)
#if not submission:
        #valid_salaries = dio.get_salaries(type_v, log=True)

#np.save("train_200f_noNorm_categoryTimeType_tfidfl2_salaries", salaries)
#np.save("train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries", valid_salaries)

#joblib.dump(salaries, "train_200f_noNorm_categoryTimeType_tfidfl2_salaries_jl", compress=5)
#joblib.dump(valid_salaries, "train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries_jl", compress=5)

# TODO: valid salaries were dumped incorrectly

    salaries = load("train_200f_noNorm_categoryTimeType_tfidfl2_salaries_jl")
    valid_salaries = load("train_200f_noNorm_categoryTimeType_tfidfl2_valid_salaries_jl")
    dio.is_log = True

    print salaries.shape


    name = "ExtraTree_min_sample%d_%dtrees_200f_noNorm_categoryTimeType_tfidfl1_new_log" % (min_samples_split, n_trees)
    print name
    #dio.save_prediction("testni", np.array([1,2,3]), type_n="testno")
    classifier = ExtraTreesRegressor(n_estimators=n_trees,
                                    verbose=2,
                                    n_jobs=4, # 2 jobs on submission / 4 on valid test
                                    oob_score=False,
                                    min_samples_split=min_samples_split,
                                    random_state=3465343)

    #dio.save_model(classifier, "testni_model", 99.)
    classifier.fit(features, salaries)
    predictions = classifier.predict(validation_features)
    if submission:
        dio.save_prediction(name, predictions, type_n=type_v)
        dio.write_submission(name + ".csv", predictions=predictions)
    else:
        dio.compare_valid_pred(valid_salaries, predictions)
        metric = dio.error_metric
        mae = metric(valid_salaries, predictions)
        print "MAE validation: ", mae
        dio.save_model(classifier, name, mae)
        dio.save_prediction(name, predictions, type_n=type_v)
Example #18
from data_io import DataIO
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.cross_validation import cross_val_score
from os.path import join as path_join
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import joblib
import cloud
import os

tfidf_columns = ["Title", "FullDescription", "LocationRaw"]
dio = DataIO("Settings.json")

vectorizer = TfidfVectorizer(
    max_features=200,
    norm='l1',
    smooth_idf=True,
    sublinear_tf=False,
    use_idf=True
)
short_id = "tfidf_200f_l1"
type_n = "train"
type_v = "valid"
dio.make_counts(vectorizer, short_id, tfidf_columns, "train", "valid")
columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, type_n, le_features)
extra_valid_features = dio.get_features(columns, type_v, le_features)
features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix",
                             tfidf_columns,
                             extra_features)
Example #19
class analysis:
    dataio = DataIO()
    feature = cFeature()
    wa = wavelet_ana()
    verify_file_path = './predict_data_in_training.txt'

    weekend = [2, 3, 9, 17]
    weekday = [4, 5, 6, 12, 13, 14, 15, 18]
    Sat = [2, 9]
    Sun = [3, 17]

    def time2slice(self, i_time):
        t_array = datetime.datetime.strptime(i_time, "%Y-%m-%d %H:%M:%S")
        slice = t_array.hour * 6 + math.floor(t_array.minute / 10) + 1

        return slice

    def slice2time(self, slice):
        slice = int(slice)
        hour = math.floor((slice - 1) / 6)
        min = (slice - 1 - hour * 6) * 10
        timenow = "{:02}:{:02}".format(hour, min)
        return timenow

    def select_test_day(self, daylist):
        daytest = []
        for i in daylist:
            day = '{:02d}'.format(i)
            prefix = '2016-01-'
            date = prefix + day
            daytest.append(date)

        return daytest

    def weather_main_trend(self, date, hour_interval=1):
        #print(self.dataio.select_weatherdata_by_dateslice(date))
        weatherlist = []
        for i in range(1, 144, 6 * hour_interval):
            dateslice = date + '-' + str(i)
            weather = self.dataio.select_weatherdata_by_dateslice(dateslice)
            if date == '2016-01-16':
                print(weather)

            if type(weather) != type(None):
                weatherlist.append(weather)
        if len(weatherlist) == 0:
            print("len(weatherlist)==0")
            exit(1)

        weatherPD = pd.DataFrame(weatherlist)
        if date == '2016-01-16':
            print(weatherPD)

        #del weatherPD['temp']
        #del weatherPD['pm2.5']

        timelist = []
        for idx in weatherPD.index:
            slice = idx.split('-')[-1]

            timetext = self.slice2time(slice)
            timelist.append(timetext)
        weatherPD.index = timelist
        return weatherPD

    def write_weather_info(self):
        for day in range(21):

            prefix = '2016-01-'
            date = prefix + '{:02d}'.format(day + 1)
            print(date)
            pd_weather = self.weather_main_trend(date)
            filepath = './weather_info'
            filename = date + ".txt"
            fw = open(os.path.join(filepath, filename), 'w')
            pd_weather.to_csv(fw)
            fw.close()

    def do_analysis_drawGapTrend(self):
        weekend = [1, 2, 3, 9, 10, 16, 17]
        weekday1 = [4, 5, 6, 7, 8]
        weekday2 = [11, 12, 13, 14, 15]

        for type in range(3):

            if type == 0:
                daytest = self.select_test_day(weekend)
                ax = plt.subplot(311)
                ax.set_title("weekend")
            if type == 1:
                daytest = self.select_test_day(weekday1)
                ax = plt.subplot(312)
                ax.set_title("weekday1")
            if type == 2:
                daytest = self.select_test_day(weekday2)
                ax = plt.subplot(313)
                ax.set_title("weekday2")

            for day in daytest:
                data = self.dataio.select_orderdata_by_district(day, 8)
                gap = (data['demand'] - data['supply'])
                gaplen = gap.shape[0]
                idx = np.array(range(gaplen)) + 1
                x_label = []
                for i in range(144):
                    x_label.append(self.slice2time(i + 1))
                gap.index = x_label
                gap.plot(label=day)

            ax.legend(loc=2)
        plt.show()

    def train_kernel_ridge_regression_clf(self,
                                          train_daylist,
                                          distinct,
                                          gamma=1,
                                          alpha=1):
        daytest = self.select_test_day(train_daylist)
        y_train = []
        X_train = []

        for day in daytest:
            for slice in range(144):
                dateslice = day + '-' + str(slice + 1)
                #feature,gap = self.generateFeatureLabel(dateslice,distinct)
                feature, gap = self.feature.generate(dateslice, distinct)

                if feature != None:
                    if gap != 0:
                        gap = math.log10(float(gap))
                    else:
                        gap = -0.1
                    X_train.append(feature)
                    y_train.append(gap)
        clf = KernelRidge(kernel='polynomial', gamma=gamma, alpha=alpha)
        #clf = KernelRidge(kernel='polynomial', degree=3,alpha=0.01)
        clf.fit(X_train, y_train)

        return clf

    def train_optimzation_model(self, train_daylist, distinct):
        daytest = self.select_test_day(train_daylist)
        y_train = []
        X_train = []

        for day in daytest:
            for slice in range(144):
                dateslice = day + '-' + str(slice + 1)
                #feature, label = self.generateFeatureLabel(dateslice, distinct)
                feature, label = self.feature.generate(dateslice, distinct)
                #print(feature,label)
                #print(feature1,label1)
                #print("-----------")

                if feature != None:
                    X_train.append(feature)
                    y_train.append(label)

        opt = optimization()
        opt.fit(X_train, y_train)
        return opt

    def train_gap_diff_curve(self, day, distinct):
        if len(day.split('-')) != 3:
            print(
                "The input of train_gap_diff_curve_by_distinct_day should be a xx-xx-xx"
            )
            exit(1)

        difflist = []
        for slice in range(144):
            dateslice = day + '-' + str(slice + 1)
            diffval = self.dataio.select_orderDiff_by_ds_distinct(
                dateslice, distinct)
            if diffval != None:
                difflist.append(diffval)
        coeffs = self.wa.get_wavelet_coeffs(difflist)
        #coeffs = self.wa.coeffs_process(coeffs)
        curve = self.wa.reconstruction_from_coeffs(coeffs)
        return np.array(curve)

    def train_gap_diff_by_distinctlist(self, distinct_list, diffcurveList,
                                       count):
        for distinct in distinct_list:
            count[0] += 1
            print("Training model in " + "{:.1f}".format(count[0] / 66 * 100) +
                  "% completed...")
            curve_dict = {}

            weekday = self.select_test_day(self.weekday)
            curve_sum = np.zeros(144)

            for day in weekday:
                curve = self.train_gap_diff_curve(day, distinct + 1)
                curve_sum += curve
            curve_dict['weekday'] = curve_sum / len(weekday)

            sat = self.select_test_day(self.Sat)
            curve_sum = np.zeros(144)
            for day in sat:
                curve = self.train_gap_diff_curve(day, distinct + 1)
                curve_sum += curve
            curve_dict['sat'] = curve_sum / len(sat)

            sun = self.select_test_day(self.Sun)
            curve_sum = np.zeros(144)
            for day in sun:
                curve = self.train_gap_diff_curve(day, distinct + 1)
                curve_sum += curve
            curve_dict['sun'] = curve_sum / len(sun)

            diffcurveList[distinct] = curve_dict

    def drawing_perform_by_distinct_daylist(self, clf, daylist, distinct):
        daytest = self.select_test_day(daylist)
        for i, day in enumerate(daytest):
            gap_real = []
            gap_predict = []
            slice_x = []
            for slice in range(144):
                dateslice = day + '-' + str(slice + 1)
                #feature,gap = self.generateFeatureLabel(dateslice,distinct)
                feature, gap = self.feature.generate(dateslice, distinct)
                if feature == None:
                    continue
                label_predicted = clf.predict([feature])
                gap_real.append(gap)
                gap_predict.append(label_predicted)
                slice_x.append(slice)

            plt.plot(slice_x, gap_real, color=get_color(i), label=day)
            plt.plot(slice_x, gap_predict, color=get_color(i), ls='--', lw=2)
        plt.legend(loc=2)
        plt.grid()
        plt.show()

    def verifying_in_training_set(self, clf):
        fr = open(self.verify_file_path, 'r')
        timeslicelist = []
        for line in fr:
            timeslice = line.split(' ')[0]
            timeslicelist.append(timeslice)
        fr.close()
        #------clf------distinct(0,65)-------type(0:weekday, 1:weekend)-----
        count = 0
        err_rate_sum = 0
        for timeslice in timeslicelist:
            for dis_ind in range(66):
                #clf[distinct][]
                distinct = dis_ind + 1

                date = timeslice[0:10]

                isWeekend = isWeekends(date)
                #feature,gap = self.generateFeatureLabel(timeslice,distinct)
                feature, gap = self.feature.generate(timeslice, distinct)

                if feature == None or gap == 0:
                    continue
                gap_predicted = clf[dis_ind][isWeekend].predict([feature])[0]

                gap_predicted = int(math.pow(10, gap_predicted))
                if gap_predicted < 0:
                    gap_predicted = 0
                err_rate = abs((gap - gap_predicted) / gap)

                err_rate_sum += err_rate
                count += 1

        err_rate_sum /= count
        return err_rate_sum

    def verifying_in_training_set_bydiff(self, diffcurve):
        fr = open(self.verify_file_path, 'r')
        timeslicelist = []
        for line in fr:
            timeslice = line.split(' ')[0]
            timeslicelist.append(timeslice)
        fr.close()
        # ------clf------distinct(0,65)-------type(0:weekday, 1:weekend)-----
        count = 0
        err_rate_sum = 0
        for timeslice in timeslicelist:
            for dis_ind in range(66):

                distinct = dis_ind + 1
                slice = int(timeslice.split('-')[-1])
                date = timeslice[0:10]

                gap = self.dataio.select_gap(timeslice, distinct)
                if gap == 0:
                    continue

                ts_before1 = date + '-' + str(slice - 1)
                ts_before2 = date + '-' + str(slice - 2)
                ts_before3 = date + '-' + str(slice - 3)
                gap1 = self.dataio.select_gap(ts_before1, distinct)
                gap2 = self.dataio.select_gap(ts_before2, distinct)
                gap3 = self.dataio.select_gap(ts_before3, distinct)
                diff1 = gap1 - gap2
                diff2 = gap2 - gap3

                daytype = isWeekends(date)
                diffval1 = 0
                diffval0 = 0
                if daytype == 0:
                    curve = diffcurve[dis_ind]['weekday']
                    diffval0 = curve[slice - 1]
                    diffval1 = curve[slice - 2]
                if daytype == 1:
                    curve = diffcurve[dis_ind]['sat']
                    diffval0 = curve[slice - 1]
                    diffval1 = curve[slice - 2]
                if daytype == 2:
                    curve = diffcurve[dis_ind]['sun']
                    diffval0 = curve[slice - 1]
                    diffval1 = curve[slice - 2]

                gapdiff_predict = 2 * diffval1 - diff1 + diffval0
                gap_predicted = gap1 + gapdiff_predict
                if gap_predicted < 0:
                    gap_predicted = 0
                err_rate = abs((gap - gap_predicted) / gap)

                err_rate_sum += err_rate
                count += 1

        err_rate_sum /= count
        return err_rate_sum

    def calculate_norm2_error(self, clf, daylist, distinct):
        err_val = 0
        count = 0
        daylist = self.select_test_day(daylist)
        for date in daylist:
            for slice in range(144):
                timeslice = date + '-' + str(slice + 1)

                feature, gap = self.feature.generate(timeslice, distinct)

                if feature == None or gap == 0:
                    continue
                if gap != 0:
                    gap_log = math.log10(gap)
                else:
                    gap_log = 0

                gap_predicted = clf.predict([feature])[0]

                err_val += (gap_log - gap_predicted)**2
                count += 1
        err_val /= count
        return err_val

    def calculate_mape_by_DayDistinct(self, clf, daylist, distinct):
        err_rate_sum = 0
        count = 0
        daylist = self.select_test_day(daylist)
        for date in daylist:
            for slice in range(144):
                timeslice = date + '-' + str(slice + 1)
                #feature, gap = self.generateFeatureLabel(timeslice, distinct)
                feature, gap = self.feature.generate(timeslice, distinct)

                if feature == None or gap == 0:
                    continue
                gap_predicted = clf.predict([feature])[0]
                #print('Before log:',gap_predicted)
                gap_predicted = int(math.pow(10, gap_predicted))

                if gap_predicted < 0:
                    gap_predicted = 0
                isWeekend = isWeekendsText(date)
                gap_filtered = self.dataio.select_filter_gap(
                    timeslice, distinct, isWeekend)
                if gap_predicted > 2 * gap_filtered:
                    gap_predicted = 2 * gap_filtered
            # print('After log:', gap_predicted,gap)
                err_rate = abs((gap - gap_predicted) / gap)
                #print(timeslice+"\t{:.2f}\t{}\t{:.0f}".format(err_rate,gap,gap_predicted))
                err_rate_sum += err_rate
                count += 1
        err_rate_sum /= count
        return err_rate_sum

    #
    # def generateFeatureLabel(self,dateslice,distinct):
    #     date = dateslice[0:10]
    #     weather = self.dataio.select_weatherdata_by_dateslice(dateslice)
    #     if type(weather) == type(None):
    #         #print("Weather info. does not exist in "+dateslice)
    #         return None,None
    #
    #
    #
    #     weather_feature = [0] * 4
    #     cur_weather = int(weather['weather'])
    #     if cur_weather == 2 or cur_weather == 3 or cur_weather == 4:
    #         weather_feature[0] = 1
    #     elif cur_weather == 8:
    #         weather_feature[1] = 1
    #     elif cur_weather == 9:
    #         weather_feature[2] = 1
    #     else:
    #         weather_feature[3] = 1
    #     #print(weather_feature)
    #     #weather_feature[int(weather['weather']) - 1] = 1
    #
    #     orderdata = self.dataio.select_orderdata_by_district(dateslice,distinct)
    #     gap_real = (orderdata['demand']-orderdata['supply']).values
    #     gap_real = gap_real[0]
    #     timeslice = int(dateslice.split('-')[-1])
    #     if timeslice <4:
    #         return None,None
    #     traffic_info = self.dataio.select_trafficdata_by_district(dateslice,distinct)
    #     if traffic_info.empty and distinct !=54:
    #         return None,None
    #
    #     ts_feature = gene_timeslice_feature(timeslice,4)
    #
    #
    #     result = isWeekends(date)
    #     if result == 0:
    #         daytype = 'weekday'
    #     if result == 1:
    #         daytype = 'sat'
    #     if result == 2:
    #         daytype = 'sun'
    #
    #     gap_filtered = self.dataio.select_filter_gap(dateslice,distinct,daytype)
    #     gap_filtered_last = self.dataio.select_filter_gap(get_last_ts(dateslice),distinct,daytype)
    #     traffic_level =[1,1,1,1]
    #     if not traffic_info.empty:
    #         level1 = (traffic_info['level1'].values)[0]
    #         level2 = (traffic_info['level2'].values)[0]
    #         level3 = (traffic_info['level3'].values)[0]
    #         level4 = (traffic_info['level4'].values)[0]
    #         traffic_level[0] = level1
    #         traffic_level[1] = level2
    #         traffic_level[2] = level3
    #         traffic_level[3] = level4
    #
    #     #print(traffic_level)
    #
    #     trafficBeList = []
    #     GapBeList = []
    #     for delta in range(3):
    #         datesliceBe = dateslice[0:11]+str(timeslice-delta-1)
    #         orderdataBe = self.dataio.select_orderdata_by_district(datesliceBe, distinct)
    #         gap_real_Be = (orderdataBe['demand'] - orderdataBe['supply']).values
    #         gap_real_Be = gap_real_Be[0]
    #         GapBeList.append(gap_real_Be)
    #
    #         traffic_info = self.dataio.select_trafficdata_by_district(datesliceBe,distinct)
    #         if not traffic_info.empty:
    #             level1 = (traffic_info['level1'].values)[0]
    #             level2 = (traffic_info['level2'].values)[0]
    #             level3 = (traffic_info['level3'].values)[0]
    #             level4 = (traffic_info['level4'].values)[0]
    #             traffic_temp = level1 + level2 * 2 + level3 * 3 + level4 * 4
    #         else:
    #             traffic_temp = 1
    #         trafficBeList.append(traffic_temp)
    #
    #
    #     #GapBeListExp2 = [x*x for x in GapBeList]
    #     GapBeListExp2 = math.pow(GapBeList[0],2)
    #     #GapBeListExp2 = math.exp(GapBeList[0])
    #     feature = []
    #
    #     #feature.extend(weather_feature)
    #     feature.extend(GapBeList)
    #     #feature.extend(ts_feature)
    #     feature.append(gap_filtered)
    #     feature.append(gap_filtered_last)
    #
    #     # diff = abs(gap_filtered - GapBeList[0])
    #     # diff_exp05 = math.pow(diff,0.5)
    #     # if gap_filtered - GapBeList[0]>0:
    #     #     pass
    #     # else:
    #     #     diff_exp05 *= -1
    #
    #     #feature.append(math.log(gap_filtered-GapBeList[0]))
    #
    #     #feature.append(math.pow((GapBeList[0] - GapBeList[1]),2))
    #     #feature.extend(GapBeListExp2)
    #     #feature.append(diff_exp05)
    #
    #
    #     #feature.extend(traffic_level)
    #     #feature.extend(trafficBeList)
    #     feature.append(1)
    #
    #     return feature,gap_real

    def gene_KRR_clf_bydaylist(self,
                               distinct_list,
                               clflist,
                               count,
                               gamma=1,
                               alpha=1):

        for distinct in distinct_list:
            rand = random.random()
            time.sleep(rand / 10)
            count[0] += 1
            print("Training model in " + "{:.1f}".format(count[0] / 66 * 100) +
                  "% completed...")
            clf_weekday = self.train_kernel_ridge_regression_clf(
                self.weekday, distinct + 1, gamma, alpha)
            clf_weekend = self.train_kernel_ridge_regression_clf(
                self.weekend, distinct + 1, gamma, alpha)
            clflist[distinct] = [clf_weekday, clf_weekend]

    def train_OPT_clf_bydaylist(self, distinct_list, clflist, count):
        for distinct in distinct_list:
            rand = random.random()
            time.sleep(rand / 10)
            count[0] += 1
            print("Training model in " + "{:.1f}".format(count[0] / 66 * 100) +
                  "% completed...")
            clf_weekday = self.train_optimzation_model(self.weekday,
                                                       distinct + 1)
            clf_weekend = self.train_optimzation_model(self.weekend,
                                                       distinct + 1)
            clflist[distinct] = [clf_weekday, clf_weekend]
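
A quick sanity check of the slice helpers defined at the top of the analysis class; the expected values follow directly from the 10-minute-slice arithmetic in time2slice() and slice2time():

# Hypothetical usage; a day has 144 ten-minute slices and slice 1 covers 00:00-00:10.
ana = analysis()
print(ana.time2slice('2016-01-04 09:35:00'))   # 9*6 + floor(35/10) + 1 = 58
print(ana.slice2time(58))                      # '09:30'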
Example #20
from data_io import DataIO
from sklearn.ensemble import ExtraTreesRegressor
#from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

dio = DataIO("Settings.json")
submission = False
n_trees = 10
min_samples_split = 2

if submission:
    type_n = "train_full"
    type_v = "valid_full"
else:
    type_n = "train"
    type_v = "valid"


vectorizer = CountVectorizer(
    max_features=200,
)
short_id = "count_200f"
tfidf_columns = ["Title", "FullDescription", "LocationRaw"]
#dio.make_counts(vectorizer, short_id, tfidf_columns, type_n, type_v)


columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, type_n, le_features)
extra_valid_features = dio.get_features(columns, type_v, le_features)
Example #21
class cFeature:
    dataio = DataIO()
    datelice = ''
    date = ''
    distinct = 0
    daytype = ''
    back_len = 3

    def generate(self, ds, distinct):
        self.date = ds[0:10]
        self.datelice = ds
        self.distinct = distinct
        self.daytype = isWeekendsText(self.date)

        slice = int(ds.split('-')[-1])
        if slice <= self.back_len:

            return None, None

        #--------------------feature generate----------------------#
        f = []

        #wea_feature = self.weather_feature()
        # if wea_feature != None:
        #     f.extend(wea_feature)
        # else:
        #     return None, None

        gap_feature = self.gap_feature()
        if gap_feature == None:
            return None, None
        f.extend(gap_feature)
        # ts_feature = self.ts_feature()
        # f.extend(ts_feature)

        #f.append(1)
        gap = self.dataio.select_gap(self.datelice, self.distinct)
        return f, gap

    def weather_feature(self):
        weather = self.dataio.select_weatherdata_by_dateslice(self.datelice)
        if type(weather) == type(None):
            return None
        wea_feature = [0] * 4
        cur_weather = int(weather['weather'])
        if cur_weather == 2 or cur_weather == 3 or cur_weather == 4:
            wea_feature[0] = 1
        elif cur_weather == 8:
            wea_feature[1] = 1
        elif cur_weather == 9:
            wea_feature[2] = 1
        else:
            wea_feature[3] = 1
        return wea_feature

    def gap_feature(self):
        gapfeature = []

        ls = get_last_ts(self.datelice)
        gap_b1 = self.dataio.select_gap(ls, self.distinct)
        ls = get_last_ts(ls)
        gap_b2 = self.dataio.select_gap(ls, self.distinct)
        ls = get_last_ts(ls)
        gap_b3 = self.dataio.select_gap(ls, self.distinct)
        gap_std = np.std(np.array([gap_b1, gap_b2, gap_b3]))
        gapfeature.append(gap_std)

        gap_diff_b1 = gap_b1 - gap_b2
        gap_diff_b2 = gap_b2 - gap_b3

        # if gap_b1 != 0:
        #     gapfeature.append(gap_diff_b1/gap_b1)
        # else:
        #     gapfeature.append(gap_diff_b1)

        gapfeature.append(gap_b1)
        gapfeature.append(gap_diff_b1)
        #gapfeature.append(gap_diff_b1**2)
        gapfeature.append(gap_diff_b2)

        #ls = self.datelice
        # for i in range(self.back_len):
        #     gap_filtered = self.dataio.select_filter_gap(ls,self.distinct,self.daytype)
        #     #print(ls,self.daytype)
        #     gapfeature.append(gap_filtered)
        #     ls = get_last_ts(ls)

        gap_filtered_b2 = self.dataio.select_filter_gap(
            get_last_ts(get_last_ts(self.datelice)), self.distinct,
            self.daytype)
        gap_filtered_b1 = self.dataio.select_filter_gap(
            get_last_ts(self.datelice), self.distinct, self.daytype)
        gap_filtered_cur = self.dataio.select_filter_gap(
            self.datelice, self.distinct, self.daytype)
        gap_filtered_a1 = self.dataio.select_filter_gap(
            get_next_ts(self.datelice), self.distinct, self.daytype)
        if gap_filtered_a1 == None or gap_filtered_b1 == None or gap_filtered_b2 == None:
            return None

        gap_filter_diff_b2 = gap_filtered_b1 - gap_filtered_b2
        gap_filter_diff_b1 = gap_filtered_cur - gap_filtered_b1
        gap_filter_diff_a1 = gap_filtered_a1 - gap_filtered_cur
        #gapfeature.append(gap_filter_diff_b2)
        gapfeature.append(gap_filter_diff_b1)
        gapfeature.append(gap_filter_diff_a1)
        gapfeature.append(gap_filtered_cur)

        #gapfeature.append(math.pow(gap_filter_diff_b1,3))
        #gapfeature.append(math.pow(gap_filter_diff_b1,2))

        return gapfeature

    def ts_feature(self):
        slice = int(self.datelice.split('-')[-1])
        ts_feature = gene_timeslice_feature(slice, 8)
        return ts_feature

    def traffic_feature(self):
        traffic_info = self.dataio.select_trafficdata_by_district(
            get_last_ts(self.datelice), self.distinct)
        traffic_level = []
        if not traffic_info.empty:
            level1 = (traffic_info['level1'].values)[0]
            level2 = (traffic_info['level2'].values)[0]
            level3 = (traffic_info['level3'].values)[0]
            level4 = (traffic_info['level4'].values)[0]
            traffic_level = [level1, level2, level3, level4]
        else:
            traffic_level = [0, 0, 0, 0]
        return traffic_level
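
A hypothetical call into the feature generator above; the 'YYYY-MM-DD-slice' date-slice format and the district id follow the calls made elsewhere on this page, and generate() returns (None, None) for the first back_len slices of a day or when the filtered-gap lookups fail:

# Sketch only: build one feature vector and its gap label for district 8.
feature = cFeature()
f, gap = feature.generate('2016-01-04-100', 8)
if f is not None:
    print('feature vector:', f)
    print('gap label:', gap)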
Example #22
from data_io import DataIO
from sklearn.ensemble import ExtraTreesRegressor
#from sklearn.cross_validation import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np

dio = DataIO("Settings.json")
submission = False
n_trees = 10
min_samples_split = 2

if submission:
    type_n = "train_full"
    type_v = "valid_full"
else:
    type_n = "train"
    type_v = "valid"

vectorizer = CountVectorizer(max_features=200, )
short_id = "count_200f"
tfidf_columns = ["Title", "FullDescription", "LocationRaw"]
#dio.make_counts(vectorizer, short_id, tfidf_columns, type_n, type_v)

columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, type_n, le_features)
extra_valid_features = dio.get_features(columns, type_v, le_features)

split_name = "Category"
#split_name = "ContractTime"
#split_name = "ContractType"
Example #23
class neural_network:
    fnn = FeedForwardNetwork()
    inputlen = 0
    outputlen = 7
    dataio = DataIO()
    feature = cFeature()
    dataset = {}

    def network_init(self):
        # length of the input feature vector
        tempfeature, gap = self.feature.generate('2016-01-03-100', 1)
        self.inputlen = len(tempfeature)

        # Set up the layers: a linear input layer (named inLayer), two sigmoid hidden layers, and a linear output layer
        inLayer = LinearLayer(self.inputlen, name='inLayer')
        hiddenLayer1 = SigmoidLayer(7, name='hiddenLayer1')
        hiddenLayer2 = SigmoidLayer(7, name='hiddenLayer2')
        outLayer = LinearLayer(7, name='outLayer')

        # Register the layers with the network as input, hidden, and output modules
        self.fnn.addInputModule(inLayer)
        self.fnn.addModule(hiddenLayer1)
        self.fnn.addModule(hiddenLayer2)
        self.fnn.addOutputModule(outLayer)

        # Create full connections between consecutive layers
        in_to_hidden1 = FullConnection(inLayer, hiddenLayer1)
        hidden1_to_hidden2 = FullConnection(hiddenLayer1, hiddenLayer2)
        hidden_to_out = FullConnection(hiddenLayer2, outLayer)

        # Add the connections to the network
        self.fnn.addConnection(in_to_hidden1)
        self.fnn.addConnection(hidden1_to_hidden2)
        self.fnn.addConnection(hidden_to_out)

        # Sort the modules so the network is ready to use
        self.fnn.sortModules()

    def gene_training_sample(self):

        self.DS = SupervisedDataSet(self.inputlen, self.outputlen)

        if os.path.exists('nn_dataset.pkl'):
            with open('nn_dataset.pkl', 'rb') as f:
                self.dataset = pickle.load(f)
                for i in range(len(self.dataset['feature'])):
                    #print(self.dataset['feature'][i])
                    self.DS.addSample(self.dataset['feature'][i],
                                      self.dataset['label'][i])

        else:
            backdaylen = 3
            prefix = '2016-01-'
            loop = 0
            featurelist = []
            targetlist = []
            for day in range(2, 22, 1):
                date = prefix + "{:02}".format(day)
                for distinct in range(1, 67):
                    for slice in range(1, 145):
                        if slice < backdaylen:
                            continue
                        ts_cur = date + '-' + str(slice)
                        gap_cur = self.dataio.select_gap(ts_cur, distinct)
                        if gap_cur > 10:
                            continue

                        f_cur, gap = self.feature.generate(ts_cur, distinct)
                        if f_cur == None:
                            continue
                        output = self.gene_output(gap_cur)

                        featurelist.append(f_cur)
                        targetlist.append(output)

                        loop += 1
                        if loop % 1000 == 0:
                            print(loop)

            self.dataset['feature'] = featurelist
            self.dataset['label'] = targetlist
            for i in range(len(featurelist)):
                self.DS.addSample(featurelist[i], targetlist[i])
            print(
                "Building training set is finished. Total amount is {}".format(
                    loop))
            with open('nn_dataset.pkl', 'wb') as f:
                pickle.dump(self.dataset, f)

    def training_nerual_network(self):
        dataTrain, dataTest = self.DS.splitWithProportion(0.7)
        xTrain, yTrain = dataTrain['input'], dataTrain['target']

        xTest, yTest = dataTest['input'], dataTest['target']

        trainer = BackpropTrainer(self.fnn,
                                  dataTrain,
                                  verbose=True,
                                  learningrate=0.03,
                                  momentum=0.1)
        trainer.trainUntilConvergence(maxEpochs=20)

        output = self.fnn.activateOnDataset(dataTest)
        count = 0
        countRight = 0
        error = 0
        for i in range(len(output)):
            posReal = yTest[i].argmax()
            posPredict = output[i].argmax()
            #print('o',output[i],posPredict)
            #print('r',yTest[i],posReal)
            error += abs(posReal - posPredict)

            if posReal == posPredict:
                countRight += 1
            count += 1
        error /= count
        print('Correct rate:{:.2f}   Average error:{:.2f}'.format(
            countRight / count, error))

    def gene_output(self, val):
        output = np.zeros(self.outputlen)
        if val == 0 or val == 1:
            output[0] = 1
        if val == 2:
            output[1] = 1
        if val == 3:
            output[2] = 1
        if val == 4 or val == 5:
            output[3] = 1
        if val == 6 or val == 7:
            output[4] = 1
        if val == 8 or val == 9:
            output[5] = 1
        if val > 9:
            output[6] = 1
        return output
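
A hypothetical driver for the class above, in the order its methods appear to expect; the PyBrain and pickle imports the class body relies on are assumed to be in scope, and the method name training_nerual_network is spelled as defined in the class:

# Sketch only: build the network, assemble the dataset, then train and evaluate.
nn = neural_network()
nn.network_init()             # derives inputlen from a sample feature vector
nn.gene_training_sample()     # fills self.DS (cached in nn_dataset.pkl)
nn.training_nerual_network()  # 70/30 split, backprop, prints the accuracy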
Example #24
from data_io import DataIO
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.cross_validation import cross_val_score
import numpy as np
from sklearn.metrics import mean_absolute_error
from itertools import combinations
from sklearn.linear_model import LinearRegression, RidgeCV, Ridge

dio = DataIO("Settings.json")
submission = False
n_trees = 10
min_samples_split = 2
param = """Normal count vector with max 200. New submission which is repeatable.
 and nicer

 """

if submission:
    type_n = "train_full"
    type_v = "valid_full"
else:
    type_n = "train"
    type_v = "valid"

#columns = ["Category", "ContractTime", "ContractType"]
#le_features = dio.get_le_features(columns, "train_full")
#extra_features = dio.get_features(columns, type_n, le_features)
#extra_valid_features = dio.get_features(columns, type_v, le_features)
#features = dio.join_features("%s_" + type_n + "_tfidf_matrix_max_f_200",
                             #["Title", "FullDescription", "LocationRaw"],
                             #extra_features)
Example #25
class MainForm(wx.Frame):
    # ----------------------------------------------------------------------
    def __init__(self):
        self.data_io = DataIO()
        self.input = None
        self.content_data = None
        self.execute_data, self.prior_1, self.prior_2, self.leave = None, None, None, None
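        # The UI labels below are Vietnamese: "Phân tích" = "Analyze",
        # "Tổng quát" = "Overview", "Nhóm Ưu tiên 1/2" = "Priority group 1/2",
        # "Dự đoán rời mạng" = "Churn prediction", "Toàn bộ các nhóm" = "All groups",
        # "Biểu đồ" = "Chart", "Ket qua" = "Result".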
        wx.Frame.__init__(self,
                          None,
                          wx.ID_ANY,
                          "Customer Segmentation",
                          size=(800, 640))

        self.panel = wx.Panel(self, wx.ID_ANY)
        status = self.CreateStatusBar()

        self.sizer = wx.BoxSizer(wx.VERTICAL)

        file_label = wx.StaticText(self.panel, -1, "Input:", (30, 20))
        self.InputPathTextBox = wx.TextCtrl(self.panel,
                                            -1,
                                            "",
                                            size=(200, -1),
                                            pos=(100, 21))
        self.browse_btn = wx.Button(self.panel, -1, "Browse", pos=(310, 20))
        self.Bind(wx.EVT_BUTTON, self.onOpenFile, self.browse_btn)

        # output_label = wx.StaticText(self.panel, -1, "Ket qua:", (30, 50))
        # self.OutputPathTextBox = wx.TextCtrl(self.panel, -1, "", size=(200, -1), pos=(100, 47))

        self.execute_btn = wx.Button(self.panel,
                                     -1,
                                     "Phân tích",
                                     pos=(400, 20))
        self.Bind(wx.EVT_BUTTON, self.onExecute, self.execute_btn)
        self.execute_btn.Disable()

        inform_label = wx.StaticText(self.panel, -1, "Tổng quát:", (30, 60))
        self.inform_txt = wx.TextCtrl(self.panel,
                                      -1,
                                      "",
                                      style=wx.TE_MULTILINE | wx.TE_READONLY,
                                      size=(463, 70),
                                      pos=(100, 60))

        self.prior1_btn = wx.Button(self.panel,
                                    -1,
                                    "Nhóm Ưu tiên 1",
                                    pos=(98, 140))
        self.Bind(wx.EVT_BUTTON, self.onOpenPrior1, self.prior1_btn)
        self.prior1_btn.Disable()

        self.prior2_btn = wx.Button(self.panel,
                                    -1,
                                    "Nhóm Ưu tiên 2",
                                    pos=(210, 140))
        self.Bind(wx.EVT_BUTTON, self.onOpenPrior2, self.prior2_btn)
        self.prior2_btn.Disable()

        self.leave_btn = wx.Button(self.panel,
                                   -1,
                                   "Dự đoán rời mạng",
                                   pos=(320, 140))
        self.Bind(wx.EVT_BUTTON, self.onOpenLeave, self.leave_btn)
        self.leave_btn.Disable()

        self.total_btn = wx.Button(self.panel,
                                   -1,
                                   "Toàn bộ các nhóm",
                                   pos=(443, 140))
        self.Bind(wx.EVT_BUTTON, self.onOpenTotal, self.total_btn)
        self.total_btn.Disable()

        self.chart_btn = wx.Button(self.panel, -1, "Biểu đồ", pos=(580, 140))
        self.Bind(wx.EVT_BUTTON, self.onOpenChart, self.chart_btn)
        self.chart_btn.Disable()

        self.save_btn = wx.Button(self.panel, -1, "Export", pos=(680, 140))
        self.Bind(wx.EVT_BUTTON, self.onSaveFile, self.save_btn)
        self.save_btn.Disable()

        self.data_grid = grid.Grid(self.panel, size=(780, 400), pos=(5, 180))
        self.data_grid.CreateGrid(20, 10)

        self.panel.SetSizer(self.sizer)

    def onOpenFile(self, event):
        dlg = wx.FileDialog(
            self,
            message="Choose a file",
            defaultDir='../data/',
            defaultFile="",
            wildcard="Excel files (*.xlsx, *.xls)|*.xlsx; *.xls",
            style=wx.FD_OPEN | wx.FD_CHANGE_DIR | wx.FD_FILE_MUST_EXIST)
        if dlg.ShowModal() == wx.ID_OK:
            path = dlg.GetPaths()
            # print("You chose the following file:", path)
            self.InputPathTextBox.ChangeValue(path[0])
            self.input = path[0]
            data = self.data_io.import_data(path[0])
            self.content_data = self.fill_grid_data(data)
            self.execute_btn.Enable()
            self.prior1_btn.Disable()
            self.prior2_btn.Disable()
            self.leave_btn.Disable()
            self.total_btn.Disable()
            self.chart_btn.Disable()
            self.save_btn.Disable()
            self.inform_txt.ChangeValue('')
        dlg.Destroy()

    def onOpenPrior1(self, event):
        self.fill_grid_data(self.prior_1)

    def onOpenPrior2(self, event):
        self.fill_grid_data(self.prior_2)

    def onOpenLeave(self, event):
        self.fill_grid_data(self.leave)

    def onOpenTotal(self, event):
        self.fill_grid_data(self.execute_data)

    def onSaveFile(self, event):
        dlg = wx.FileDialog(
            self,
            message="Save as",
            defaultDir='../data/',
            defaultFile="",
            wildcard="Excel files (*.xlsx, *.xls)|*.xlsx; *.xls",
            style=wx.FD_SAVE)
        if dlg.ShowModal() == wx.ID_OK:
            path = dlg.GetPaths()[0]
            self.data_io.export_data(self.execute_data, path)
            # print("You chose the following file:", self.path)
        dlg.Destroy()

    def onExecute(self, event):
        if not self.content_data:
            msg = "You have an error.\nPlease reopen data file"
            self.show_error_dialog(msg, "ERROR", wx.OK | wx.ICON_EXCLAMATION)
        else:

            executed_data, prior_1, prior_2, leave = classify(
                self.content_data)
            self.execute_data = executed_data
            self.fill_grid_data(self.execute_data)
            self.prior_1 = prior_1
            self.prior_2 = prior_2
            self.leave = leave

            # Summary labels (Vietnamese): total subscribers, priority-1 and
            # priority-2 subscribers, and subscribers predicted to churn.
            nums = len(self.content_data) - 1
            summary = 'Tổng số thuê bao: {}\n'.format(nums)
            rate = 100 * (len(self.prior_1) - 1) / nums
            summary += 'Số thuê bao ưu tiên 1: {0} ({1:.2f}%)\n'.format(
                len(self.prior_1) - 1, rate)
            rate = 100 * (len(self.prior_2) - 1) / nums
            summary += 'Số thuê bao ưu tiên 2: {0} ({1:.2f}%)\n'.format(
                len(self.prior_2) - 1, rate)
            rate = 100 * (len(self.leave) - 1) / nums
            summary += 'Số thuê bao dự đoán rời mạng: {0} ({1:.2f}%)'.format(
                len(self.leave) - 1, rate)
            self.inform_txt.ChangeValue(summary)
            self.prior1_btn.Enable()
            self.prior2_btn.Enable()
            self.leave_btn.Enable()
            self.total_btn.Enable()
            self.save_btn.Enable()
            self.chart_btn.Enable()
            # else:
            #     msg = "Something error. Please try again."
            #     self.show_error_dialog(msg, "ERROR", wx.OK | wx.ICON_EXCLAMATION)

    def onOpenChart(self, event):
        self.draw()

    def show_error_dialog(self, msg, title, style):
        dlg = wx.MessageDialog(parent=None,
                               message=msg,
                               caption=title,
                               style=style)
        dlg.ShowModal()
        dlg.Destroy()

    def fill_grid_data(self, data):
        # print(data[11])
        current_nrows, new_nrows = (self.data_grid.GetNumberRows(), len(data))
        if current_nrows < new_nrows:
            self.data_grid.AppendRows(new_nrows - current_nrows)
        elif current_nrows > new_nrows:
            self.data_grid.DeleteRows(new_nrows, current_nrows - new_nrows)
        current_ncols, new_ncols = (self.data_grid.GetNumberCols(),
                                    len(data[0]))
        if current_ncols < new_ncols:
            self.data_grid.AppendCols(new_ncols - current_ncols)
        elif current_ncols > new_ncols:
            self.data_grid.DeleteCols(new_ncols, current_ncols - new_ncols)

        for i in range(new_ncols):
            self.data_grid.SetCellValue(0, i, str(data[0][i]))

        for row in range(1, new_nrows):
            for col in range(new_ncols):
                self.data_grid.SetCellValue(row, col, str(data[row][col]))
        return data

    def draw(self):
        # Pie-chart labels (Vietnamese): priority group 1, priority group 2,
        # predicted churn.
        labels = ['Nhóm Ưu tiên 1', 'Nhóm Ưu tiên 2', 'Dự đoán rời mạng']
        sizes = [
            len(self.prior_1) - 1,
            len(self.prior_2) - 1,
            len(self.leave) - 1
        ]
        explode = (0, 0, 0.1)

        fig, ax = plt.subplots()
        fig.canvas.set_window_title('Biểu đồ')
        plt.title('Tỉ lệ các nhóm thuê bao')
        ax.pie(sizes,
               explode=explode,
               labels=labels,
               autopct='%1.1f%%',
               shadow=True,
               startangle=90)
        ax.axis('equal')

        plt.tight_layout()
        plt.show()
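
# Hypothetical entry point (not part of the original snippet) showing the
# standard wxPython boilerplate needed to launch the form above:
if __name__ == '__main__':
    app = wx.App(False)
    frame = MainForm()
    frame.Show()
    app.MainLoop()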
Beispiel #26
0
def get_data():
    dataio = DataIO()
    df = dataio.get_data()
    return df
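# Hypothetical usage of the helper above (assumes DataIO.get_data returns a
# pandas DataFrame, as its use elsewhere in this listing suggests):
# df = get_data()
# print(df.head())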
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import f1_score
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix
from pprint import pprint
from time import time
import numpy as np
import joblib

dio = DataIO("Settings.json")
submission = False
n_trees = 10
min_samples_split = 2
param = """Normal count vector with max 200. New submission which is repeatable.
 and nicer

count_vector_titles = TfidfVectorizer(
    read_column(train_filename, column_name),
    max_features=200, norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)
 """

if submission:
    type_n = "train_full"
    type_v = "valid_full"
else:
    type_n = "train"
    type_v = "valid"
Beispiel #28
0
op.add_option("--n_features",
              action="store",
              type=int,
              default=2**16,
              help="n_features when using the hashing vectorizer.")

(opts, args) = op.parse_args()
if len(args) > 0:
    op.error("this script takes no arguments.")
    sys.exit(1)

print(__doc__)
op.print_help()
print()

dio = DataIO("Settings_loc5.json")
submission = False
n_trees = 10
min_samples_split = 2

if submission:
    type_n = "train_full"
    type_v = "valid_full"
else:
    type_n = "train"
    type_v = "valid"

vectorizer = TfidfVectorizer(sublinear_tf=True,
                             max_df=0.5,
                             stop_words='english')
#short_id = "tfidf_200f_l1"
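
# Hedged sketch (not in the original excerpt) of how a TfidfVectorizer like the
# one above is applied; `corpus` stands for any iterable of raw text documents:
# X = vectorizer.fit_transform(corpus)
# print(X.shape)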
Beispiel #29
0
from data_io import DataIO
#from os.path import join as path_join
#import joblib
import numpy as np
dio = DataIO("Settings_submission.json")
submission = True
if submission:
    type_n = "train_full"
    type_v = "test_full"
else:
    type_n = "train"
    type_v = "valid"


model_names = [
    "ExtraTree_min_sample2_30trees_200f_noNorm_categoryTimeType_new_log",
    "ExtraTree_min_sample2_40trees_200f_noNorm_categoryTimeType_new_log",
    "ExtraTree_min_sample2_40trees_200f_noNorm_categoryTimeType_new",
    "ExtraTree_min_sample2_40trees_200f_noNorm_categoryTimeType_tfidfl2_new_log",
    "vowpall_submission",
    "vowpall_loc5"
]
#model_names = [model2, model4]
#model_names = [model1, model6, model4]


#fit_predict(model2)
#fit_predict(model1)
#fit_predict(model3)
#fit_predict(model5)
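
# Hedged sketch of the loop implied by the commented-out calls above;
# fit_predict is referenced in this excerpt but not defined here, so it is an
# assumed helper:
# for name in model_names:
#     fit_predict(name)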
Beispiel #30
0
class DeepGalaxyTraining(object):
    def __init__(self):
        self.data_io = DataIO()
        self.model = None
        self.x_train = None
        self.y_train = None
        self.x_test = None
        self.y_test = None
        self.num_classes = 0
        self.epochs = 50
        self.batch_size = 8
        self.use_noise = True
        self.distributed_training = False
        self.multi_gpu_training = False
        self._multi_gpu_model = None
        self._n_gpus = 1
        self.callbacks = []
        self.logger = None
        self.log_level = logging.DEBUG
        self.input_shape = (512, 512, 3)  # (256, 256, 3)
        self._t_start = 0
        self._t_end = 0

    def get_flops(self, model):
        # run_meta = tf.RunMetadata()  # commented out since it doesn't work in TF2
        run_meta = tf.compat.v1.RunMetadata()
        # opts = tf.profiler.ProfileOptionBuilder.float_operation()
        opts = tf.compat.v1.profiler.ProfileOptionBuilder.float_operation()

        # We use the Keras session graph in the call to the profiler.
        flops = tf.compat.v1.profiler.profile(
            graph=tf.compat.v1.keras.backend.get_session().graph,
            run_meta=run_meta,
            cmd='op',
            options=opts)

        return flops.total_float_ops  # Prints the "flops" of the model.

    def initialize(self):
        # init_op = tf.initialize_all_variables()
        # init_op = tf.global_variables_initializer()
        # sess = tf.Session()
        # sess.run(init_op)

        # Check if GPUs are available
        # if tf.test.is_gpu_available():  # commented out since this test will cause a new session be created
        # allow growth
        # config = tf.compat.v1.ConfigProto()
        # config.gpu_options.per_process_gpu_memory_fraction = 1
        # config.gpu_options.allow_growth = True  # dynamically grow the memory used on the GPU
        # # config.log_device_placement = True  # to log device placement (on which device the operation ran)
        # sess = tf.compat.v1.Session(config=config)
        # tf.compat.v1.keras.backend.set_session(sess)  # set this TensorFlow session as the default session for Keras

        # Create logger
        self.logger = logging.getLogger('DeepGalaxyTrain')
        self.logger.setLevel(self.log_level)
        self.logger.addHandler(logging.FileHandler('train_log.txt'))
        if self.distributed_training is True:
            try:
                import horovod.tensorflow.keras as hvd
                # initialize horovod
                hvd.init()
                self.callbacks.append(
                    hvd.callbacks.BroadcastGlobalVariablesCallback(0))
                self.callbacks.append(hvd.callbacks.MetricAverageCallback())
                # self.callbacks = [hvd.BroadcastGlobalVariablesHook(0)]
                if hvd.rank() == 0:
                    self.logger.info('Parallel training enabled.')
                    self.logger.info(
                        'batch_size = %d, global_batch_size = %d, num_workers = %d\n'
                        % (self.batch_size, self.batch_size * hvd.size(),
                           hvd.size()))

                # Map an MPI process to a GPU (Important!)
                print('hvd_rank = %d, hvd_local_rank = %d' %
                      (hvd.rank(), hvd.local_rank()))
                self.logger.info('hvd_rank = %d, hvd_local_rank = %d' %
                                 (hvd.rank(), hvd.local_rank()))

                # Bind a CUDA device to one MPI process (has no effect if GPUs are not used)
                os.environ["CUDA_VISIBLE_DEVICES"] = str(hvd.local_rank())

                # # Horovod: pin GPU to be used to process local rank (one GPU per process)
                gpus = tf.config.experimental.list_physical_devices('GPU')
                for gpu in gpus:
                    tf.config.experimental.set_memory_growth(gpu, True)
                # if gpus:
                # tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'GPU')
            except ImportError as identifier:
                print(
                    'Error importing horovod. Disabling distributed training.')
                self.distributed_training = False
        else:
            self.logger.info('Parallel training disabled.')
            self.logger.info('Batch_size = %d' % (self.batch_size))

    def load_data(self, data_fn, test_size=0.3, random=True):
        if not self.distributed_training:
            self.logger.info(
                'Loading the full dataset since distributed training is disabled ...'
            )
            # X, Y = self.data_io.load_all(data_fn, dset_name_pattern=dset_name_pattern, camera_pos=camera_pos)
            X, Y = self.data_io.load_all(data_fn)
        else:
            self.logger.info(
                'Loading part of the dataset since distributed training is enabled ...'
            )
            X, Y = self.data_io.load_partial(data_fn, hvd.size(), hvd.rank())
        self.logger.debug('Shape of X: %s' % str(X.shape))
        self.logger.debug('Shape of Y: %s' % str(Y.shape))

        # update the input_shape setting according to the loaded data
        self.input_shape = X.shape[1:]

        if test_size > 0:
            x_train, x_test, y_train, y_test = train_test_split(
                X, Y, test_size=test_size, random_state=42)
            self.x_train = x_train
            self.x_test = x_test
            self.y_train = y_train
            self.y_test = y_test
        else:
            self.x_train = X
            self.y_train = Y
        self.num_classes = np.unique(Y).shape[0]
        if self.x_test is not None:
            print("shapes:", self.x_train.shape, self.x_test.shape,
                  self.y_train.shape, self.y_test.shape)
        else:
            print("shapes:", self.x_train.shape, self.y_train.shape)
        self.logger.debug('Number of classes: %d' % self.num_classes)

    def load_model(self):
        if not os.path.isfile('efn_b4.h5'):
            # base_model = efn.EfficientNetB4(weights='imagenet', include_top=False, input_shape=(self.input_shape[0], self.input_shape[1], 3), classes=self.num_classes)
            base_model = efn.EfficientNetB4(weights=None,
                                            include_top=True,
                                            input_shape=(self.input_shape[0],
                                                         self.input_shape[1],
                                                         3),
                                            classes=self.num_classes)
            if self.distributed_training is True and hvd.rank() == 0:
                base_model.save('efn_b4.h5')
        else:
            base_model = tf.keras.models.load_model('efn_b4.h5', compile=False)
        print(base_model.summary())
        if not self.use_noise:
            # x = base_model.output
            # x = tf.keras.layers.GlobalAveragePooling2D()(x)
            # x = tf.keras.layers.Dropout(0.3)(x)
            # predictions = tf.keras.layers.Dense(self.num_classes, activation='softmax')(x)
            # model = tf.keras.models.Model(inputs = base_model.input, outputs = predictions)
            # model = tf.keras.models.Model(inputs = base_model.input, outputs = base_model.outputs)
            model = tf.keras.models.Sequential()
            # model.add(tf.keras.layers.Lambda(lambda x: tf.repeat(x, 3, axis=-1), input_shape=self.input_shape))  # commented out since tf.repeat does not exist before 1.15
            model.add(
                tf.keras.layers.Lambda(
                    lambda x: tf.keras.backend.repeat_elements(x, 3, axis=-1),
                    input_shape=self.input_shape))
            model.add(base_model)
            # model.add(tf.keras.layers.GlobalAveragePooling2D())
            # model.add(tf.keras.layers.Dropout(0.3))
            # model.add(tf.keras.layers.Dense(self.num_classes, activation='softmax'))
        else:
            model = tf.keras.models.Sequential()
            # model.add(tf.keras.layers.Lambda(lambda x: tf.repeat(x, 3, axis=-1), input_shape=self.input_shape))  # commented out since tf.repeat does not exist before 1.15
            model.add(
                tf.keras.layers.Lambda(
                    lambda x: tf.keras.backend.repeat_elements(x, 3, axis=-1),
                    input_shape=self.input_shape))
            model.add(
                tf.keras.layers.GaussianNoise(0.5,
                                              input_shape=self.input_shape))
            model.add(base_model)
            # model.add(tf.keras.layers.GlobalAveragePooling2D(name="gap"))
            # model.add(tf.keras.layers.Dropout(0.3))
            # model.add(tf.keras.layers.Dense(self.num_classes, activation="softmax", name="fc_out"))

        if self.distributed_training is True:
            # opt = K.optimizers.SGD(0.001 * hvd.size())
            # opt = tf.keras.optimizers.Adam(hvd.size())
            opt = tf.keras.optimizers.Adadelta(1.0 * hvd.size())
            # Horovod: add Horovod Distributed Optimizer.
            opt = hvd.DistributedOptimizer(opt)
        else:
            opt = tf.keras.optimizers.Adam()

        if self.multi_gpu_training is True:
            # probe the number of GPUs
            from tensorflow.python.client import device_lib
            local_device_protos = device_lib.list_local_devices()
            gpu_list = [
                x.name for x in local_device_protos if x.device_type == 'GPU'
            ]
            self._n_gpus = len(gpu_list)
            print('Parallelizing the model on %d GPUs...' % self._n_gpus)
            parallel_model = tf.keras.utils.multi_gpu_model(model,
                                                            gpus=self._n_gpus)
            parallel_model.compile(
                loss=tf.keras.losses.sparse_categorical_crossentropy,
                optimizer=opt,
                metrics=['sparse_categorical_accuracy'])
            self._multi_gpu_model = parallel_model
            self.model = model
            print(parallel_model.summary())
        else:
            model.compile(loss=tf.keras.losses.sparse_categorical_crossentropy,
                          optimizer=opt,
                          metrics=['sparse_categorical_accuracy'])
            self.model = model
            if self.distributed_training is True:
                if hvd.rank() == 0:
                    print(model.summary())
            else:
                print(model.summary())

    def fit(self):
        if self.distributed_training is True:
            try:
                # print('len(train_iter)', len(train_iter))
                # if hvd.rank() == 0:
                # self.f_usage.write('len(train_iter) = %d, x_train.shape=%s\n' % (len(train_iter), x_train.shape))
                self._t_start = datetime.now()
                self.model.fit(self.x_train,
                               self.y_train,
                               batch_size=self.batch_size,
                               epochs=self.epochs,
                               callbacks=self.callbacks,
                               verbose=1 if hvd.rank() == 0 else 0,
                               validation_data=(self.x_test, self.y_test))
                self._t_end = datetime.now()
                # train_gen = ImageDataGenerator()
                # train_iter = train_gen.flow(self.x_train, self.y_train, batch_size=self.batch_size)
                # test_gen = ImageDataGenerator()
                # test_iter = test_gen.flow(self.x_test, self.y_test, batch_size=self.batch_size)
                # self.model.fit_generator(train_iter,
                #     # batch_size=batch_size,
                #     steps_per_epoch=len(train_iter) // hvd.size(),
                #     epochs=self.epochs,
                #     callbacks=self.callbacks,
                #     verbose=1 if hvd.rank() == 0 else 0,
                #     validation_data=test_gen.flow(self.x_test, self.y_test, self.batch_size),
                #     validation_steps=len(test_iter) // hvd.size())

            except KeyboardInterrupt:
                print('Terminating due to Ctrl+C...')
            finally:
                print(
                    "On hostname {0} - After training using {1} GB of memory".
                    format(
                        socket.gethostname(),
                        psutil.Process(os.getpid()).memory_info()[0] / 1024 /
                        1024 / 1024))
                self._t_end = datetime.now()
                if hvd.rank() == 0:
                    self.logger.info(
                        "On hostname {0} - After training using {1} GB of memory\n"
                        .format(
                            socket.gethostname(),
                            psutil.Process(os.getpid()).memory_info()[0] /
                            1024 / 1024 / 1024))
                    self.logger.info('Time is now %s\n' % datetime.now())
                    # self.f_usage.write('Elapsed time %s\n' % (t_end-t_start))
                # print('Elapsed time:', t_end-t_start)
        else:
            try:
                if self.multi_gpu_training is True:
                    self._t_start = datetime.now()
                    self._multi_gpu_model.fit(
                        self.x_train,
                        self.y_train,
                        batch_size=self.batch_size * self._n_gpus,
                        epochs=self.epochs,
                        #   callbacks=self.callbacks,
                        verbose=1,
                        validation_data=(self.x_test, self.y_test))
                    self._t_end = datetime.now()
                else:
                    self._t_start = datetime.now()
                    self.model.fit(
                        self.x_train,
                        self.y_train,
                        batch_size=self.batch_size,
                        epochs=self.epochs,
                        #    callbacks=self.callbacks,
                        verbose=1,
                        validation_data=(self.x_test, self.y_test))
                    self._t_end = datetime.now()
            except KeyboardInterrupt:
                pass
            finally:
                self._t_end = datetime.now()
                print('Elapsed time:', self._t_end - self._t_start)
                print('Saving model...')
        print(self.get_flops(self.model))

    def save_model(self):
        if self.distributed_training is True:
            if hvd.rank() == 0:
                if self.use_noise is True:
                    self.model.save('model_hvd_bw_%d_B0_with_noise_n_p_%d.h5' %
                                    (self.input_shape[0], hvd.size()))
                else:
                    self.model.save('model_hvd_bw_%d_B0_no_noise_%d_nodes.h5' %
                                    (self.input_shape[0], hvd.size()))
        else:
            if self.use_noise is True:
                self.model.save('model_bw_%d_B0_with_noise.h5' %
                                (self.input_shape[0]))
            else:
                self.model.save('model_bw_%d_B0_no_noise.h5' %
                                (self.input_shape[0]))

    def validate(self):
        y_pred = self.model.predict(self.x_test)
        print(
            precision_recall_fscore_support(self.y_test,
                                            np.argmax(y_pred, axis=1)))
        print(confusion_matrix(self.y_test, np.argmax(y_pred, axis=1)))

    def finalize(self):
        pass
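
# Minimal, hypothetical driver for the DeepGalaxyTraining class above; the data
# file name and the settings used are illustrative assumptions only:
#
# if __name__ == '__main__':
#     trainer = DeepGalaxyTraining()
#     trainer.distributed_training = False
#     trainer.initialize()
#     trainer.load_data('galaxy_dataset.hdf5', test_size=0.3)
#     trainer.load_model()
#     trainer.fit()
#     trainer.save_model()
#     trainer.validate()
#     trainer.finalize()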
from data_io import DataIO
from sklearn.decomposition import RandomizedPCA
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.base import clone
from sklearn.cross_validation import cross_val_score
import numpy as np

dio = DataIO("Settings.json")

title_corpus = dio.read_gensim_corpus("train_title_nltk_filtered.corpus.mtx")
pca = RandomizedPCA(random_state=3465343)
salaries = dio.get_salaries("train", log=True)

columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, "train", le_features)
#extra_valid_features = dio.get_features(columns, "valid", le_features)

param = "RandomizedPCA title 200 Fulldescription 200 " + ",".join(columns)
print(list(map(len, extra_features)))
extra_features = [np.reshape(np.array(x), (len(x), 1)) for x in extra_features]

print(type(title_corpus))
print(title_corpus.shape)

title_pca = clone(pca)
title_pca.set_params(n_components=200)
title_corpus_pca = title_pca.fit_transform(title_corpus)

print(type(title_corpus_pca))
Beispiel #32
0
from data_io import DataIO
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.cross_validation import cross_val_score
from os.path import join as path_join
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import joblib
import cloud
import os

tfidf_columns = ["Title", "FullDescription", "LocationRaw"]
dio = DataIO("Settings.json")

vectorizer = TfidfVectorizer(max_features=200,
                             norm='l1',
                             smooth_idf=True,
                             sublinear_tf=False,
                             use_idf=True)
short_id = "tfidf_200f_l1"
type_n = "train"
type_v = "valid"
dio.make_counts(vectorizer, short_id, tfidf_columns, "train", "valid")
columns = ["Category", "ContractTime", "ContractType"]
le_features = dio.get_le_features(columns, "train_full")
extra_features = dio.get_features(columns, type_n, le_features)
extra_valid_features = dio.get_features(columns, type_v, le_features)
features = dio.join_features("%s_" + type_n + "_" + short_id + "_matrix",
                             tfidf_columns, extra_features)
validation_features = dio.join_features(
    "%s_" + type_v + "_" + short_id + "_matrix", tfidf_columns,
    extra_valid_features)
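
# Hedged sketch (not in the original excerpt): the joined matrices above are
# typically used to fit the ExtraTreesRegressor imported at the top;
# dio.get_salaries follows its usage elsewhere in this listing.
# salaries = dio.get_salaries(type_n, log=True)
# model = ExtraTreesRegressor(n_estimators=10, min_samples_split=2, n_jobs=-1)
# model.fit(features, salaries)
# valid_predictions = model.predict(validation_features)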
Beispiel #33
0
class Integrator(object):
    def __init__(self):
        """
        The constructor of an abstract integrator
        """
        # =================== CONSTANTS ==================
        # by default, using the square of the Gaussian gravitational constant
        self.CONST_G = 0.000295912208232213  # units: (AU^3/day^2)
        self.CONST_C = 0.0  # speed of light; PN terms will be calculated if CONST_C > 0

        # =================== VARIABLES ==================
        self._t = 0.0
        self.t_start = 0.0
        self.t_end = 0.0
        self.h = 0.01  # time step size
        self.store_dt = 100  # storage time step
        self._particles = None
        self.acceleration_method = 'ctypes'
        self.output_file = 'data.hdf5'
        self.collision_output_file = 'collisions.txt'
        self.close_encounter_output_file = 'close_encounters.txt'
        self.max_close_encounter_events = 1
        self.max_collision_events = 1
        self.close_encounter_distance = 0.0
        self.__energy_init = 0.0
        self.__energy = 0.0
        self.__buf = None
        self.buffer_len = 1024
        self.__initialized = False

        # =============== C Library =============
        self.libabie = CLibABIE()

    @property
    def t(self):
        return self._t

    @property
    def particles(self):
        if self._particles is None:
            self._particles = Particles(self.CONST_G)
        return self._particles

    @property
    def buf(self):
        if self.__buf is None:
            self.__buf = DataIO(buf_len=self.buffer_len,
                                output_file_name=self.output_file,
                                CONST_G=self.CONST_G)
        return self.__buf

    @staticmethod
    def load_integrators():
        """
        Load integrator modules
        :return: a dict of integrator class objects, mapping the name of the integrator to the class object
        """
        mod_dict = dict()
        module_candidates = glob.glob(
            os.path.join(__mpa_dir__, 'integrator_*.py'))
        sys.path.append(__mpa_dir__)  # append the python path
        if __mpa_dir__ != __user_shell_dir__:
            # load the integrator module (if any) also from the current user shell directory
            module_cwd = glob.glob(
                os.path.join(__user_shell_dir__, 'integrator_*.py'))
            for m_cwd in module_cwd:
                module_candidates.append(m_cwd)
            sys.path.append(__user_shell_dir__)  # append the python path
        for mod_name in module_candidates:
            mod_name = os.path.basename(mod_name)
            mod = __import__(mod_name.split('.')[0])
            if hasattr(mod, '__integrator__'):
                # it is a valid ABI module, register it as a module
                mod_dict[mod.__integrator__] = mod
        return mod_dict

    def initialize(self):
        if self.__buf is None:
            self.__buf = DataIO(
                buf_len=self.buffer_len,
                output_file_name=self.output_file,
                close_encounter_output_file_name=self.
                close_encounter_output_file,
                collision_output_file_name=self.collision_output_file,
                CONST_G=self.CONST_G)
        if self.particles.N > 0:
            # initialize the C library
            self.libabie.initialize_code(
                self.CONST_G,
                self.CONST_C,
                self.particles.N,
                MAX_CE_EVENTS=self.max_close_encounter_events,
                MAX_COLLISION_EVENTS=self.max_collision_events,
                close_encounter_distance=self.close_encounter_distance)
            self.buf.initialize_buffer(self.particles.N)

    def stop(self):
        if self.__buf is not None:
            self.__buf.close()

    def calculate_orbital_elements(self, primary=None):
        return self._particles.calculate_orbital_elements(primary)

    def calculate_energy(self):
        # return self._particles.energy
        if self.acceleration_method == 'ctypes':
            return self.libabie.get_total_energy()
        else:
            return self._particles.energy

    def set_additional_forces(self, ext_acc):
        self.libabie.set_additional_forces(ext_acc)

    def integrator_warmup(self):
        pos = self.particles.positions.copy()
        vel = self.particles.velocities.copy()
        self.libabie.set_state(pos, vel, self.particles.masses,
                               self.particles.radii, self.particles.N,
                               self.CONST_G, self.CONST_C)

    def integrate(self, to_time=None):
        """
        Integrate the system to a given time.
        :param to_time: The termination time. If None, self.t_end is used and the code stops
                        when self.t_end is reached (i.e. without an argument this method can only
                        be called once; with an explicit to_time it can be called iteratively).
        :return:
        """
        if to_time is not None:
            self.t_end = to_time

        if self.__initialized is False:
            self.initialize()
            self.integrator_warmup()
            self.__initialized = True
        if self.t >= self.t_end:
            return

        # Note: this dt is not the integrator time step h
        dt = min(self.store_dt, self.t_end - self.t)

        ret = 0
        # launch the integration
        while self.t < self.t_end:
            if self.__energy_init == 0:
                self.__energy_init = self.calculate_energy()
            next_t = self.t + dt - ((self.t + dt) % dt)
            if self.acceleration_method == 'numpy':
                ret = self.integrate_numpy(next_t)
            elif self.acceleration_method == 'ctypes':
                ret = self.integrate_ctypes(next_t)
            # the self.t is updated by the subclass
            # energy check
            self.__energy = self.calculate_energy()
            print(('t = %f, N = %d, dE/E0 = %g' %
                   (self.t, self.particles.N,
                    np.abs(self.__energy - self.__energy_init) /
                    self.__energy_init)))
            if os.path.isfile('STOP'):
                break

        if to_time is None:
            # triggering the termination of the code, save the buffer to the file and close it
            self.stop()
            # if ret > 0:
            #     break
        # if self.t == self.t_end:
        #     self.__energy = self.calculate_energy()
        #     print('t = %f, E/E0 = %g' % (self.t, np.abs(self.__energy - self.__energy_init) / self.__energy_init))
        return ret

    def integrate_numpy(self, to_time):
        """
        Integrate the system to a given time using python/numpy.
        This method must be implemented in the subclasses.
        :param to_time: The termination time. If None, it will use the self.t_end value
        :return:
        """
        raise NotImplementedError('integrate_numpy() method not implemented!')

    def integrate_ctypes(self, to_time):
        """
        Integrate the system to a given time using the ctypes (libabie.so)
        This method must be implemented in the subclasses.
        :param to_time: The termination time. If None, it will use the self.t_end value
        :return:
        """
        raise NotImplementedError('integrate_ctypes() method not implemented!')

    def store_state(self):
        if self.buf is None:
            self.initialize()
        self.buf.initialize_buffer(self.particles.N)
        elem = self.particles.calculate_aei()
        self.buf.store_state(self.t,
                             self.particles.positions,
                             self.particles.velocities,
                             self.particles.masses,
                             radii=self.particles.radii,
                             names=self.particles.hashes,
                             ptypes=self.particles.ptypes,
                             a=elem[:, 0],
                             e=elem[:, 1],
                             i=elem[:, 2])

    def store_collisions(self, collision_buffer):
        self.buf.store_collisions(collision_buffer)

    def store_close_encounters(self, ce_buffer):
        self.buf.store_close_encounters(ce_buffer)

    def handle_collisions(self, collision_buffer, actions=None):
        if actions is None:
            actions = ['merge', 'store']
        if 'store' in actions:
            self.store_state()
            self.store_collisions(collision_buffer)
        if 'merge' in actions:
            collision_buffer = collision_buffer.reshape(
                len(collision_buffer), 4)
            for coll_pair in range(collision_buffer.shape[0]):
                pid1 = int(collision_buffer[coll_pair, 1])
                pid2 = int(collision_buffer[coll_pair, 2])
                self.particles.merge_particles_inelastically(pid1, pid2)
                self.libabie.reset_collision_buffer()
                self.integrator_warmup()
                self.buf.flush()
                self.buf.reset_buffer()
                self.buf.initialize_buffer(self.particles.N)
        if 'halt' in actions:
            print('Simulation terminated due to a collision event.')
            sys.exit(0)

    def handle_close_encounters(self, ce_buffer, actions=None):
        if actions is None:
            actions = ['merge', 'store']
        if 'store' in actions:
            self.store_state()
            self.store_close_encounters(ce_buffer)
        if 'halt' in actions:
            print('Simulation terminated due to a close encounter event.')
            sys.exit(0)
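
# Hypothetical sketch of the iterative-use pattern described in integrate()'s
# docstring above; the module and class names are assumptions based on the
# 'integrator_*.py' naming convention and the integrator names used elsewhere
# in this listing:
# from integrator_gauss_radau15 import GaussRadau15
# integrator = GaussRadau15()
# integrator.store_dt = 100
# for target_t in (100.0, 200.0, 300.0):
#     integrator.integrate(to_time=target_t)  # advance the system step by step
# integrator.stop()                           # flush and close the output buffer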
Beispiel #35
0
from data_io import DataIO

import logging
from sklearn.decomposition import RandomizedPCA
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from pprint import pprint
from time import time
from sklearn.ensemble import ExtraTreesRegressor

dio = DataIO("Settings_loc5.json")

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.INFO)

salaries = dio.get_salaries("train", log=True)

#title_corpus_csc = dio.read_gensim_corpus("train_title_nltk_filtered.corpus.mtx")
#desc_corpus_csc = dio.read_gensim_corpus("train_desc_nltk_filtered.corpus.mtx")
locraw_corpus_csc = dio.read_gensim_corpus(
    "train_locraw_nltk_filtered.corpus.mtx")

#print title_corpus_csc.shape
print(locraw_corpus_csc.shape)

pipeline = Pipeline([
    ('pca', RandomizedPCA(random_state=3465343)),
    ('trees',
     ExtraTreesRegressor(min_samples_split=2, n_estimators=10, n_jobs=4)),
])
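
# Hedged sketch (not in the original excerpt) of tuning this pipeline with the
# GridSearchCV imported above; the parameter-grid values are illustrative
# assumptions:
param_grid = {
    'pca__n_components': [100, 200],
    'trees__n_estimators': [10, 20],
}
grid_search = GridSearchCV(pipeline, param_grid, n_jobs=1, verbose=1)
t0 = time()
grid_search.fit(locraw_corpus_csc, salaries)
print("done in %0.3fs" % (time() - t0))
pprint(grid_search.best_params_)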