Example #1
def preprocess():
    dataset = read_dataset('segment')
    data = dataset['data']

    df = pd.DataFrame(data)

    # Separate the class labels from the feature columns
    y = df['class'].copy()

    df = df.drop(columns=['class', 'region-pixel-count'])

    # Scale all features to the [0, 1] range
    scaler = MinMaxScaler()

    X_num = scaler.fit_transform(df)
    df_X_num = pd.DataFrame(X_num, columns=df.columns)

    # Categorical version: equal-frequency discretization of each feature into 5 bins
    df_X_cat = df_X_num.copy()

    for col in list(df.columns):
        df_X_cat[col] = pd.qcut(x=df_X_num[col], q=5, duplicates='drop')

    df_X_num.to_csv(os.path.join('datasets', 'segment_clean_num.csv'),
                    index=False)
    df_X_cat.to_csv(os.path.join('datasets', 'segment_clean_cat.csv'),
                    index=False)

    y.to_csv(os.path.join('datasets', 'segment_clean_y.csv'),
             index=False,
             header=False)

    return 'segment_clean_num.csv', 'segment_clean_cat.csv', 'segment_clean_y.csv'
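The loop above discretizes each scaled column with equal-frequency binning. As a minimal sketch of what pd.qcut(..., q=5, duplicates='drop') produces on a toy column (not the actual segment data):

import pandas as pd

# Toy stand-in for one scaled feature column
col = pd.Series([0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.95])

# Equal-frequency binning into 5 intervals; duplicates='drop' merges bins whose
# edges coincide, which happens when a feature has many repeated values
binned = pd.qcut(col, q=5, duplicates='drop')
print(binned.value_counts())  # each interval holds roughly len(col) / 5 values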
Example #2
def read_data(name: str) -> List[dict]:
    folds = []
    preprocess = preprocess_hypothyroid if name == 'hypothyroid' else preprocess_penn

    for i in tqdm(range(10), desc=f'Reading {name} dataset', ncols=150):
        train_data = read_dataset(name=f'{name}.fold.00000{i}.train',
                                  dataset_path=os.path.join('datasets', name))
        validation_data = read_dataset(name=f'{name}.fold.00000{i}.test',
                                       dataset_path=os.path.join(
                                           'datasets', name))
        (X_train, y_train), (X_val,
                             y_val) = preprocess(train_data, validation_data)
        folds.append({
            'X_train': X_train,
            'y_train': y_train,
            'X_val': X_val,
            'y_val': y_val
        })

    return folds
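read_data returns one dictionary per pre-split cross-validation fold. A hedged sketch of how that list might be consumed (the classifier is an assumption, not part of the original code, and presumes the preprocessed features are numeric):

from sklearn.tree import DecisionTreeClassifier  # hypothetical model choice

scores = []
for fold in read_data('hypothyroid'):
    clf = DecisionTreeClassifier().fit(fold['X_train'], fold['y_train'])
    scores.append(clf.score(fold['X_val'], fold['y_val']))
print('mean 10-fold accuracy: %.3f' % (sum(scores) / len(scores)))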
Example #3
def preprocess():
    DATASET = 'connect-4'

    data = read_dataset(DATASET)
    df = pd.DataFrame(data['data'])
    # A subsample is needed to be able to run some of the algorithms on our machines
    df = df.sample(n=5000, replace=False,
                   random_state=1).reset_index(drop=True)

    # Since we are doing unsupervised methods (clustering), we will ignore the labels y... except for supervised evaluation
    # Decode byte values into unicode strings
    df = df.applymap(lambda x: x.decode('utf-8'))
    X = df.loc[:, df.columns != 'class']
    y = df['class'].copy()  # for supervised evaluation of clustering

    # For all vars in X, the domain is ['b', 'o', 'x'].
    # However, we will check it programmatically.
    # Also, even though the dataset is supposed to have no missing values, we will check that as well, just in case.
    X_categories = set([])
    for index, row in X.iterrows():
        for col_val in row:
            X_categories.add(col_val)

    # {'b', 'o', 'x'}, so the domain is confirmed. Also, there are no missing values,
    # because otherwise we would see None or other placeholders.
    # Recall that: 'x' means that we have a cell with a disk belonging to player 'x',
    # 'o' means that we have a cell with a disk belonging to player 'o', and 'b' means that
    # the cell is empty (blank).

    # Instead of one-hot encoding, we will apply a label encoding with values [0, 0.5, 1]. The reason for doing it
    # this way is that 'x' and 'o' are antagonists and 'b' is the neutral value, so there is some kind of natural
    # order. This also avoids one-hot encoding, which would increase the number of columns.
    # Since all the variables share the same domain, we should be consistent with the encoding: for us, 'x'
    # will always be encoded as 0 and 'o' will always be encoded as 1.
    # X_encoded = X.apply(LabelEncoder().fit_transform)
    # LabelEncoder assigns labels alphabetically in the range [0, n_classes-1],
    # so 'b' would be encoded as 0, 'o' as 1, and 'x' as 2, which is not the intended outcome for us.
    # It has no parameters to control the ordering, so we apply our own encoder:
    def recode(x):
        recode_map = {'x': 0, 'b': 0.5, 'o': 1}
        return recode_map[x]

    X_encoded = X.applymap(recode)

    # save the cleaned/encoded X as a CSV for later. y is needed for supervised evaluation.
    X.to_csv(os.path.join('datasets', 'connect_4_clean.csv'), index=False)
    X_encoded.to_csv(os.path.join('datasets', 'connect_4_clean_num.csv'),
                     index=False)
    y.to_csv(os.path.join('datasets', 'connect_4_clean_y.csv'),
             index=False,
             header=False)
    return 'connect_4_clean.csv', 'connect_4_clean_num.csv', 'connect_4_clean_y.csv'
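The hand-rolled recode above could also be written with scikit-learn's OrdinalEncoder by fixing the category order explicitly; this is only an equivalent sketch (assuming scikit-learn >= 0.20), not what the original code does:

from sklearn.preprocessing import OrdinalEncoder

# One fixed category list per column: 'x' -> 0, 'b' -> 1, 'o' -> 2, then rescale to [0, 0.5, 1]
enc = OrdinalEncoder(categories=[['x', 'b', 'o']] * X.shape[1])
X_encoded_alt = pd.DataFrame(enc.fit_transform(X) / 2, columns=X.columns)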
Example #4
logger.info("Use GPU with index %s as target slu device" % (opt.deviceIds[0])
            if opt.deviceIds[0] >= 0 else "Use CPU as target slu torch device")
logger.info("Use GPU with index %s as target nlg device" % (opt.deviceIds[1])
            if opt.deviceIds[1] >= 0 else "Use CPU as target nlg torch device")

##### Vocab and Dataset Reader #####
slu_vocab, nlg_vocab = Vocab(dataset=opt.dataset,
                             task='slu'), Vocab(dataset=opt.dataset,
                                                task='nlg')
lm_vocab = Vocab(dataset=opt.dataset, task='lm')
slu_evaluator, nlg_evaluator = Evaluator.get_evaluator_from_task(
    task='slu',
    vocab=slu_vocab), Evaluator.get_evaluator_from_task(task='nlg',
                                                        vocab=nlg_vocab)

if not opt.testing:
    train_dataset, dev_dataset = read_dataset(
        opt.dataset, choice='train'), read_dataset(opt.dataset, choice='valid')
    labeled_dataset, unlabeled_dataset = split_dataset(train_dataset,
                                                       opt.labeled)
    logger.info(
        "Labeled/Unlabeled train and dev dataset size is: %s/%s and %s" %
        (len(labeled_dataset), len(unlabeled_dataset), len(dev_dataset)))
    unlabeled_dataset = labeled_dataset + unlabeled_dataset
test_dataset = read_dataset(opt.dataset, choice='test')
logger.info("Test dataset size is: %s" % (len(test_dataset)))

##### Model Construction and Init #####
if not opt.testing:
    params = vars(opt)
    json.dump(params,
              open(os.path.join(exp_path, 'params.json'), 'w'),
              indent=4)
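Dumping the options to params.json lets a later test-only run recover the training configuration; a minimal sketch of reading it back (only the file name comes from the code above):

with open(os.path.join(exp_path, 'params.json')) as f:
    saved_params = json.load(f)  # the same hyperparameters that were dumped before training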
Example #5
from classifier import train_classifier
from features import extract_features, flatten_features
from utils.dataset import read_dataset
from utils.preprocessing import preprocess_dataframe, extract_labels, decipher_labels, oversample_minority_classes
from utils.scoring import print_cv_score, evaluate_submission
from utils.splits import generate_hold_out_split

if __name__ == "__main__":
    print('Reading data...')
    raw_data = read_dataset('data')
    data = preprocess_dataframe(raw_data, 'raw_data')
    labels = extract_labels(data)

    print('Extracting features...')
    features = extract_features(data, raw_data)

    print('Flattening features...')
    flattened_features = flatten_features(features)

    print('Generating hold-out split...')
    training_data, testing_data, unused_data = generate_hold_out_split(raw_data)
    training_features, testing_features = flattened_features.iloc[training_data.index], flattened_features.iloc[testing_data.index]
    training_labels, testing_labels = labels.iloc[training_data.index], labels.iloc[testing_data.index]

    print('Oversampling minority classes...')
    oversampled_training_features, oversampled_training_labels = oversample_minority_classes(training_features, training_labels)

    print('Training classifier...')
    classifier = train_classifier(oversampled_training_features, oversampled_training_labels)

    print('Cross-validating...')
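oversample_minority_classes is project code whose implementation is not shown here; one common way to realize the idea, given purely as a hedged sketch, is to resample every class with replacement up to the size of the largest class:

import pandas as pd

def naive_oversample(features, labels, seed=0):
    # Resample every class (with replacement) up to the size of the largest class
    target = labels.value_counts().max()
    X_parts, y_parts = [], []
    for cls in labels.unique():
        cls_rows = features[labels == cls].sample(n=target, replace=True, random_state=seed)
        X_parts.append(cls_rows)
        y_parts.append(pd.Series(cls, index=cls_rows.index))
    return (pd.concat(X_parts).reset_index(drop=True),
            pd.concat(y_parts).reset_index(drop=True))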
Example #6
def preprocess():
    dataset = read_dataset('adult')
    data = dataset['data']

    df = pd.DataFrame(data)
    df = df.sample(n=5000, replace=False, random_state=1).reset_index(drop=True)

    df = df.applymap(lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

    # Real Y labels
    y = df['class'].copy()
    df = df.drop(columns=['class'])

    categorical_features = ['workclass',
                            'education',
                            'marital-status',
                            'occupation',
                            'relationship',
                            'race',
                            'sex',
                            'native-country']

    numerical_features = ['age',
                          'fnlwgt',
                          'education-num',
                          'capital-gain',
                          'capital-loss',
                          'hours-per-week']

    # Encode categorical values into numerical with OHE
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
    X_categorical = ohe.fit_transform(df[categorical_features])

    columns = ohe.get_feature_names(input_features=categorical_features)
    X_categorical = pd.DataFrame(data=X_categorical, columns=columns)

    # Scale numerical values
    sc = MinMaxScaler()
    X_numerical = sc.fit_transform(df[numerical_features])
    X_numerical = pd.DataFrame(data=X_numerical, columns=numerical_features)

    # All to categorical
    X_numerical_as_categorical = X_numerical.copy()
    for feat in numerical_features:
        X_numerical_as_categorical[feat] = pd.qcut(x=X_numerical[feat], q=5, duplicates='drop')

    # Mix data
    X_df = pd.concat((df[categorical_features], X_numerical), axis=1)

    # Numerical only data
    X_df_num = pd.concat((X_categorical, X_numerical), axis=1)

    # Categorical only data
    X_df_cat = pd.concat((df[categorical_features], X_numerical_as_categorical), axis=1)

    X_df.to_csv(os.path.join('datasets', 'adult_clean.csv'), index=False)
    X_df_num.to_csv(os.path.join('datasets', 'adult_clean_num.csv'), index=False)
    X_df_cat.to_csv(os.path.join('datasets', 'adult_clean_cat.csv'), index=False)

    y.to_csv(os.path.join('datasets', 'adult_clean_y.csv'), index=False, header=False)

    return 'adult_clean_num.csv', 'adult_clean_cat.csv', 'adult_clean.csv', 'adult_clean_y.csv'
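Note that sparse=False and get_feature_names reflect an older scikit-learn API; on newer versions (roughly 1.2 onwards) the equivalent calls would look like this sketch:

ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_categorical = ohe.fit_transform(df[categorical_features])
columns = ohe.get_feature_names_out(categorical_features)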
Example #7
    for name in namelist.strip().split(' '):
        index = 0
        if len(name) >= 3:
            while index < len(sen):
                if index < len(sen) - 2 and sen[index] == name[0] and sen[
                        index + 1] == name[1] and sen[index + 2] == name[2]:
                    tmp = name[1] + name[2]
                    sen = sen[:index + 1] + [tmp] + sen[index + 3:]
                    index += 1
                index += 1
    return sen


if __name__ == '__main__':
    print(time.strftime('%Y-%m-%d %H:%M:%S'))
    train_dataset = read_dataset()
    test_dataset = read_dataset(TEST_FILE2)
    hmm = HMM()
    hmm.fit(train_dataset)
    Pner = PlaceRec()
    Numner = NumRec()
    cname = CNNAME()
    cname.fit()
    separator = ' '
    f = open(OUT_PUT, 'wb')
    re_han = re.compile(ur"([\u4E00-\u9FA5\u25cb]+)")
    re_skip = re.compile(
        ur"^[\uff0d\-{0,1}a-zA-Z0-9\uff10-\uff19\u2014\uff21-\uff3a\uff41-\uff5a\u2026\u25cb\\.]$"
    )
    print(time.strftime('%Y-%m-%d %H:%M:%S'))
    print 'Start seg...'
Example #8

def create_transforms(additional):
    res = list(additional)
    # add necessary transformations
    res.extend([
        A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
        ChannelTranspose()
    ])
    res = A.Compose(res)
    return res


if __name__ == '__main__':
    df = read_dataset(
        '/mnt/HDD/home/druzhinin/kaggle/kaggle_severstal/dataset/train.csv',
        '/mnt/HDD/home/druzhinin/kaggle/kaggle_severstal/dataset/train_images')

    # Different transforms for TTA wrapper
    transforms = [[], [A.HorizontalFlip(p=1)], [A.VerticalFlip(p=1)],
                  [A.HorizontalFlip(p=1),
                   A.VerticalFlip(p=1)]]

    transforms = [create_transforms(t) for t in transforms]

    device = 'cuda'

    print('resnet34-class01')
    model = torch.jit.load(
        '/mnt/HDD/home/druzhinin/kaggle/kaggle_severstal/download/resnet34-class01/torchscript.pth',
        map_location=device)
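A hedged usage sketch for the composed transforms above (ChannelTranspose is project code; the assumption is that it moves the channel axis first for PyTorch, and the dummy shape merely mimics a Severstal image):

import numpy as np

aug = create_transforms([])                        # normalization + channel transpose only
dummy = np.zeros((256, 1600, 3), dtype=np.uint8)   # placeholder image
out = aug(image=dummy)['image']                    # normalized array ready for the model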
Example #9
    progress = 0
    with open(EMBEDDING(dataset), 'w') as out_file:
        for word in vocab:
            progress += 1
            vector = word_embed.emb(word) + char_embed.emb(word)
            string = ' '.join([str(v) for v in vector])
            out_file.write(word + ' ' + string + '\n')
            if progress % 1000 == 0:
                print("Retrieve 400-dim GK Embedding for the", progress,
                      "-th word ...")
    print('In total, process %d words in %s' % (len(vocab), dataset))


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--dataset', required=True, nargs='+')
    parser.add_argument('--mwf',
                        type=int,
                        default=1,
                        help='minimum word frequency')
    args = parser.parse_args(sys.argv[1:])

    for d in args.dataset:
        print('\nStart processing domain %s ...' % (d))
        ex_list = read_dataset(d, 'train') + read_dataset(
            d, 'valid') + read_dataset(d, 'test')
        words, bios, slots, intents = construct_vocab(d, ex_list, args.mwf)
        construct_database_and_com(d, ex_list)
        get_pretrained_embeddings(d, words, slots, intents)
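The embedding file written above holds one word per line followed by its vector components, separated by spaces; a minimal sketch for loading it back (the names here are assumptions):

def load_embeddings(path):
    vectors = {}
    with open(path) as f:
        for line in f:
            parts = line.rstrip('\n').split(' ')
            vectors[parts[0]] = [float(v) for v in parts[1:]]
    return vectors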
Example #10
    for name in namelist.strip().split(' '):
        index = 0
        if len(name) >= 3:
            while index < len(sen):
                if index < len(sen) - 2 and sen[index] == name[0] and sen[index + 1] == name[1] and sen[index + 2] == \
                        name[2]:
                    tmp = name[1] + name[2]
                    sen = sen[:index + 1] + [tmp] + sen[index + 3:]
                    index += 1
                index += 1
    return sen


if __name__ == '__main__':
    print(time.strftime('%Y-%m-%d %H:%M:%S'))
    train_dataset = read_dataset()
    hmm = HMM()
    hmm.fit(train_dataset)
    Pner = PlaceRec()
    Numner = NumRec()
    cname = CNNAME()
    cname.fit()
    separator = ' '
    test_sentence = open(TEST_FILE, 'rb')
    re_han = re.compile(ur"([\u4E00-\u9FA5\u25cb]+)")
    re_skip = re.compile(
        ur"^[\uff0d\-{0,1}a-zA-Z0-9\uff10-\uff19\u2014\uff21-\uff3a\uff41-\uff5a\u2026\u25cb\\.]$"
    )
    print(time.strftime('%Y-%m-%d %H:%M:%S'))
    print 'Start seg...'
    for line in test_sentence:
Example #11
    #                             catalyst=False,
    #                             pin_memory=False,
    #                             binary=True,
    #                             multi=False)
    #
    # model = mobilenetv3(1).cuda().eval()
    # # state = torch.load('/home/druzhinin/HDD/kaggle/kaggle_severstal/logdir/1.6.mobilenet_multi/binary/checkpoints/best.pth')
    # # model.load_state_dict(state['model_state_dict'])
    # # del state
    # model = model.eval()
    # find_best_threshold_binary(np.arange(0.05, 1, 0.05), model, dataloader)

    # ------------------------------------------------------------------------------------------------------------------------------------------

    df = read_dataset(
        '../dataset/train.csv',
        '../dataset/train_images',
    )
    # df = df.dropna(subset=[1, 2, 3, 4], how='all')
    dataloader = get_dataloader(df,
                                transforms,
                                batch_size=2,
                                shuffle=False,
                                num_workers=6,
                                phase='valid',
                                catalyst=False,
                                pin_memory=False,
                                binary=False,
                                multi=False)

    from stage_experiments.transforms_1_7.model import Model
    # Load model
Example #12
   
def Name_Replace(namelist, sen):
    for name in namelist.strip().split(' '):
        index = 0
        if len(name) >= 3:
            while index < len(sen):
                if index < len(sen) - 2 and sen[index] == name[0] and sen[index + 1] == name[1] and sen[index + 2] == name[2]:
                    tmp = name[1] + name[2]
                    sen = sen[:index + 1] + [tmp] + sen[index + 3:]
                    index += 1
                index += 1
    return sen
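# A hypothetical worked example (not in the original script): with namelist '张三丰' and the
# tokenized sentence ['张', '三', '丰', '来', '了'], the two given-name characters are merged
# into one token, giving ['张', '三丰', '来', '了'], while the surname token stays separate.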
        
if __name__ == '__main__':
    print (time.strftime('%Y-%m-%d %H:%M:%S'))
    train_dataset = read_dataset()
    test_dataset = read_dataset(TEST_FILE2)
    hmm = HMM()
    hmm.fit(train_dataset)
    Pner = PlaceRec()
    Numner = NumRec()
    cname = CNNAME()
    cname.fit()
    separator = ' '
    f = open(OUT_PUT, 'wb')
    re_han = re.compile(ur"([\u4E00-\u9FA5\u25cb]+)")
    re_skip = re.compile(ur"^[\uff0d\-{0,1}a-zA-Z0-9\uff10-\uff19\u2014\uff21-\uff3a\uff41-\uff5a\u2026\u25cb\\.]$")
    print (time.strftime('%Y-%m-%d %H:%M:%S'))
    print 'Start seg...'
    for line in test_dataset:
        res = ''