Example #1
import pandas as pd
import paratext

def save_pred_as_submit_format(pred_path, output_file, col_name=('ID', 'TARGET')):
    # Read the raw prediction file and rewrite it in the sample-submission layout.
    print('writing prediction as submission format')
    print('read prediction <{}>'.format(pred_path))
    pred = paratext.load_csv_to_pandas(pred_path, allow_quoted_newlines=True).values
    #(((test.mean(1) - test.mean(1).mean())/test.mean(1).std()/100. + 0.5).values + pred)/2.0
    # INPUT_PATH and SUBMIT_FORMAT are module-level constants in the source project.
    submission = pd.read_csv(INPUT_PATH + SUBMIT_FORMAT)
    submission[col_name[1]] = pred
    submission.to_csv(output_file, columns=col_name, index=False)
    print('done writing')
    return
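A minimal usage sketch for the function above; the concrete paths and the INPUT_PATH and SUBMIT_FORMAT values are assumptions, since the original snippet relies on module-level constants that are not shown here.

INPUT_PATH = 'input/'                    # assumed; module-level constant in the source project
SUBMIT_FORMAT = 'sample_submission.csv'  # assumed sample-submission file name

save_pred_as_submit_format(
    pred_path='temp/stage1_pred.csv',    # hypothetical prediction file
    output_file='submission.csv',
)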
Example #2
def save_pred_as_submit_format(pred_path, output_file, col_name=('ID', "TARGET")):
    print('writing prediction as submission format')
    print('read prediction <{}>'.format(pred_path))
    pred = paratext.load_csv_to_pandas(pred_path, allow_quoted_newlines=True).values
    #(((test.mean(1) - test.mean(1).mean())/test.mean(1).std()/100. + 0.5).values + pred)/2.0
    submission = pd.read_csv(INPUT_PATH+SUBMIT_FORMAT)
    submission[col_name[1]] = pred
    submission.to_csv(output_file, columns=col_name, index=False)
    print('done writing')
    return
Example #3
import os
import timeit

import paratext

def run(parallel=False, pool_size=-1, base_parser=BaseParser.pandas):
    # BaseParser, CSVParser, callback, and READ_CHUNK_SIZE are defined elsewhere
    # in the source project.
    file_name = '../../../ATTIC/data/lineitem.csv'
    expected_row_count = 6001216
    # file_name = '../../../ATTIC/data/customer.csv'
    # expected_row_count = 150001

    file_size = os.path.getsize(file_name)

    if base_parser is not BaseParser.paratext:
        parser = CSVParser(callback, parallel, pool_size, base_parser)

        start_time = timeit.default_timer()

        # Stream the file through the parser in fixed-size chunks.
        with open(file_name) as f:
            while True:
                chunk = f.read(READ_CHUNK_SIZE)
                if not chunk:
                    break
                parser.pump(chunk)
        parser.close()

        row_count = parser.line_count

        stop_time = timeit.default_timer()
    else:
        start_time = timeit.default_timer()
        df = paratext.load_csv_to_pandas(file_name, num_threads=pool_size)
        stop_time = timeit.default_timer()
        row_count = len(df) + 1  # add one for the header row

    time = stop_time - start_time
    print("Time: {}, Rows: {}, Size: {}, MB/Sec {}".format(
        time, row_count, file_size, (file_size / time) / 1000 / 1000))

    assert expected_row_count == row_count
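A self-contained sketch of timing the paratext branch on its own; the file path and thread count are placeholder assumptions, and only paratext.load_csv_to_pandas from the snippet above is relied on.

import timeit

import paratext

# Time a single paratext load; path and num_threads are illustrative values.
file_name = 'data/lineitem.csv'
start = timeit.default_timer()
df = paratext.load_csv_to_pandas(file_name, num_threads=4)
elapsed = timeit.default_timer() - start
print('Loaded {} data rows in {:.2f}s'.format(len(df), elapsed))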
Example #4
def load_data(flist, drop_duplicates=False):
    '''
    Usage: set train, target, and test key and feature files.

    FEATURE_LIST_stage2 = {
                'train':(
                         TEMP_PATH + 'v1_stage1_all_fold.csv',
                         TEMP_PATH + 'v2_stage1_all_fold.csv',
                         TEMP_PATH + 'v3_stage1_all_fold.csv',
                        ),#target is not in 'train'

                'target':(
                         INPUT_PATH + 'target.csv',
                        ),#target is in 'target'

                'test':(
                         TEMP_PATH + 'v1_stage1_test.csv',
                         TEMP_PATH + 'v2_stage1_test.csv',
                         TEMP_PATH + 'v3_stage1_test.csv',
                        ),
                }
    '''
    if len(flist['train']) == 0 or len(flist['target']) == 0 or len(flist['test']) == 0:
        raise Exception('train, target, and test must each be given at least one file.')

    X_train = pd.DataFrame()
    test = pd.DataFrame()

    print ('Reading train dataset')
    for i in flist['train']:  # each listed train file contributes feature columns, concatenated side by side
        X_train = pd.concat([X_train, paratext.load_csv_to_pandas(PATH+i, allow_quoted_newlines=True)],axis=1)

    print ('train dataset is created')

    print ('Reading target data')
    y_train = paratext.load_csv_to_pandas(PATH+flist['target'][0], allow_quoted_newlines=True)['target']

    print ('Reading test dataset')
    for i in flist['test']:
        test = pd.concat([test, paratext.load_csv_to_pandas(PATH+i, allow_quoted_newlines=True)],axis=1)

    #del test['t_id']
    #print X_train.columns
    #print test.columns
    assert all(X_train.columns == test.columns)
    print ('train shape :{}'.format(X_train.shape))
    if drop_duplicates:
        #delete identical columns
        unique_col = X_train.T.drop_duplicates().T.columns
        X_train = X_train[unique_col]
        test = test[unique_col]
        assert( all(X_train.columns == test.columns))
        print ('train shape after concat and drop_duplicates :{}'.format(X_train.shape))

    # drop constant features
    #X_train = X_train.loc[:, (X_train != X_train.ix[0]).any()] 
    #test = test.loc[:, (test != test.ix[0]).any()] 

    #common_col = list(set(X_train.columns.tolist()) and set(test.columns.tolist()))
    #X_train = X_train[common_col]
    #test = test[common_col]
    #print 'shape after dropping constant features: {}'.format(X_train.shape)
    
    return X_train, y_train, test 
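Following the docstring, a hedged sketch of a call to load_data; the path constants are placeholders standing in for the project's module-level settings, and the file names are taken from the docstring above.

PATH = ''              # assumed; prefix applied to every file in the lists
TEMP_PATH = 'temp/'    # assumed
INPUT_PATH = 'input/'  # assumed

FEATURE_LIST_stage2 = {
    'train': (TEMP_PATH + 'v1_stage1_all_fold.csv',),
    'target': (INPUT_PATH + 'target.csv',),
    'test': (TEMP_PATH + 'v1_stage1_test.csv',),
}

X_train, y_train, test = load_data(FEATURE_LIST_stage2, drop_duplicates=True)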
Example #5
# Scikit-learn
import sklearn as sk
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score

# IO
import paratext

# ## Read Data

# In[125]:

print('Reading data')
#df_all = pd.read_csv('./casted_data_norm.csv', encoding='utf-8')
df_all = paratext.load_csv_to_pandas('data/casted_data_norm.csv',
                                     in_encoding='utf-8')
print('(Read data) End')

# In[3]:

df_all = df_all.assign(RSP_FLG_N=1 - df_all.RSP_FLG)
print(df_all.shape)
df_all.head()

# In[4]:

print(len(df_all.columns.values) - 4)
# 600 = 12 * 50

# In[5]:
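The commented-out pd.read_csv line above suggests the two readers are interchangeable here; a minimal sketch of falling back to pandas when paratext is unavailable (the fallback itself is an assumption, not part of the original notebook):

try:
    import paratext
    df_all = paratext.load_csv_to_pandas('data/casted_data_norm.csv',
                                         in_encoding='utf-8')
except ImportError:
    # Assumed fallback for environments without paratext installed.
    import pandas as pd
    df_all = pd.read_csv('data/casted_data_norm.csv', encoding='utf-8')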
Example #6
def load_data(flist, drop_duplicates=False):
    '''
    Usage: set train, target, and test key and feature files.

    FEATURE_LIST_stage2 = {
                'train':(
                         TEMP_PATH + 'v1_stage1_all_fold.csv',
                         TEMP_PATH + 'v2_stage1_all_fold.csv',
                         TEMP_PATH + 'v3_stage1_all_fold.csv',
                        ),#target is not in 'train'

                'target':(
                         INPUT_PATH + 'target.csv',
                        ),#target is in 'target'

                'test':(
                         TEMP_PATH + 'v1_stage1_test.csv',
                         TEMP_PATH + 'v2_stage1_test.csv',
                         TEMP_PATH + 'v3_stage1_test.csv',
                        ),
                }
    '''
    if len(flist['train']) == 0 or len(flist['target']) == 0 or len(flist['test']) == 0:
        raise Exception('train, target, and test must each be given at least one file.')

    X_train = pd.DataFrame()
    test = pd.DataFrame()

    print('Reading train dataset')
    for i in flist['train']:
        X_train = pd.concat([X_train, paratext.load_csv_to_pandas(PATH+i, allow_quoted_newlines=True)],axis=1)

    print('train dataset is created')


    print('Reading target data')
    y_train = paratext.load_csv_to_pandas(PATH+flist['target'][0], allow_quoted_newlines=True)['target']

    print('Reading test dataset')
    for i in flist['test']:
        test = pd.concat([test, paratext.load_csv_to_pandas(PATH+i, allow_quoted_newlines=True)],axis=1)

    #del test['t_id']
    #print X_train.columns
    #print test.columns
    assert all(X_train.columns == test.columns)
    print('train shape :{}'.format(X_train.shape))
    if drop_duplicates:
        #delete identical columns
        unique_col = X_train.T.drop_duplicates().T.columns
        X_train = X_train[unique_col]
        test = test[unique_col]
        assert( all(X_train.columns == test.columns))
        print('train shape after concat and drop_duplicates :{}'.format(X_train.shape))

    # drop constant features
    #X_train = X_train.loc[:, (X_train != X_train.ix[0]).any()] 
    #test = test.loc[:, (test != test.ix[0]).any()] 

    #common_col = list(set(X_train.columns.tolist()) and set(test.columns.tolist()))
    #X_train = X_train[common_col]
    #test = test[common_col]
    #print 'shape after dropping constant features: {}'.format(X_train.shape)
    
    return X_train, y_train, test