Esempio n. 1
0
def makeSplits(base, trainX, trainY, trainRatio, testX, testY):
    """Split a combined train+dev file by trainRatio and load a separate test set.

    Args:
        base: directory prefix prepended to every file name.
        trainX, trainY: file names of the combined train+dev data and labels.
        trainRatio: fraction of the combined set assigned to train; the rest is dev.
        testX, testY: file names of the held-out test data and labels.

    Returns:
        (train_X, train_Y, dev_X, dev_Y, test_X, test_Y)
    """
    # Load the combined train+dev split from disk.
    full_data = read_data(base + trainX)
    full_labels = load_labels_trec(base + trainY)

    print("shape of train and dev data ", full_data.shape)
    print("shape of train and dev labels ", full_labels.shape)
    print()

    # First trainRatio fraction goes to train, the remainder to dev.
    split_at = int(trainRatio * full_data.shape[0])
    train_X, dev_X = full_data[:split_at], full_data[split_at:]
    train_Y, dev_Y = full_labels[:split_at], full_labels[split_at:]

    # The test split lives in its own files; no splitting needed.
    test_X = read_data(base + testX)
    test_Y = load_labels_trec(base + testY)

    print("shape of train data ", train_X.shape)
    print("shape of train labels ", train_Y.shape)
    print()

    print("shape of test data ", test_X.shape)
    print("shape of test labels ", test_Y.shape)
    print()

    return train_X, train_Y, dev_X, dev_Y, test_X, test_Y
Esempio n. 2
0
def setup_data(train_path, val_path, img_folder_path, batch_size):
    """Build the training and validation dataloaders.

    Args:
        train_path: annotation file for the training split.
        val_path: annotation file for the validation split.
        img_folder_path: folder the image files are resolved against.
        batch_size: batch size used for both dataloaders.

    Returns:
        (train_dataset, val_dataset)
    """
    loaders = []
    # Train split gets training-mode transforms and shuffling; val gets neither.
    for path, training in ((train_path, True), (val_path, False)):
        records = read_data(path, img_folder_path)
        loaders.append(create_dataloader(records,
                                         batch_size=batch_size,
                                         is_train=training,
                                         shuffle=training))
    train_dataset, val_dataset = loaders
    return train_dataset, val_dataset
Esempio n. 3
0
def readSubj(data0, data1, trainRatio, devRatio):
    """Load subjective/objective data, shuffle, and split into train/dev/test.

    Args:
        data0: path to the subjective examples (labelled [1, 0]).
        data1: path to the objective examples (labelled [0, 1]).
        trainRatio: cumulative fraction ending the train split (e.g. 0.7).
        devRatio: cumulative fraction ending the dev split (e.g. 0.8);
            everything after it is the test split.

    Returns:
        (train_X, train_Y, dev_X, dev_Y, test_X, test_Y)
    """
    # Subjective data -> one-hot label [1, 0]
    subj_data = read_data(data0)
    subj_labels = np.repeat([[1, 0]], subj_data.shape[0], axis=0)
    # Objective data -> one-hot label [0, 1]
    obj_data = read_data(data1)
    obj_labels = np.repeat([[0, 1]], obj_data.shape[0], axis=0)

    print("DATA READ")
    sys.stdout.flush()

    # Shapes
    print("shape of positive data ", subj_data.shape)
    print("shape of positive labels ", subj_labels.shape)
    print()
    print("shape of negative data ", obj_data.shape)
    print("shape of negative labels ", obj_labels.shape)
    print()

    # unite data
    data = merge(subj_data, obj_data)
    labels = merge(subj_labels, obj_labels)

    # randomly shuffle data and labels
    np.random.seed(7)  # always the same split
    shuffle_indices = np.random.permutation(np.arange(len(data)))
    data_shuffled = data[shuffle_indices]
    labels_shuffled = labels[shuffle_indices]

    data_len = data.shape[0]
    train_index = int(trainRatio * data_len)
    dev_index = int(devRatio * data_len)
    test_index = data_len

    # BUG FIX: slice the *shuffled* arrays — the original sliced the
    # unshuffled `data`/`labels`, leaving the seeded shuffle above unused —
    # and start the test split at dev_index so dev and test do not overlap
    # (the original took test from train_index, which fully contained dev).
    train_X = data_shuffled[:train_index]
    dev_X = data_shuffled[train_index:dev_index]
    test_X = data_shuffled[dev_index:test_index]

    train_Y = labels_shuffled[:train_index]
    dev_Y = labels_shuffled[train_index:dev_index]
    test_Y = labels_shuffled[dev_index:test_index]

    print("shape of train data ", train_X.shape)
    print("shape of train labels ", train_Y.shape)
    print()

    print("shape of test data ", test_X.shape)
    print("shape of test labels ", test_Y.shape)

    return train_X, train_Y, dev_X, dev_Y, test_X, test_Y
Esempio n. 4
0
def loadTrainDevTest(base, trainX, trainY, devX, devY, testX, testY):
    """Load pre-split train/dev/test data and labels from files under `base`.

    Each *X argument names a data file (read with `read_data`) and each *Y
    argument names a label file (read with `load_labels_pe`).

    Returns:
        (train_X, train_Y, dev_X, dev_Y, test_X, test_Y)
    """
    arrays = []
    # Load each split as a (data, labels) pair, in train/dev/test order.
    for data_file, label_file in ((trainX, trainY), (devX, devY), (testX, testY)):
        arrays.append(read_data(base + data_file))
        arrays.append(load_labels_pe(base + label_file))
    train_X, train_Y, dev_X, dev_Y, test_X, test_Y = arrays

    # train
    print("train data shape ", train_X.shape)
    print("train labels shape ", train_Y.shape)
    print()
    # dev
    print("dev data shape ", dev_X.shape)
    print("dev labels shape ", dev_Y.shape)
    print()
    # test
    print("test data shape ", test_X.shape)
    print("test labels shape ", test_Y.shape)

    return train_X, train_Y, dev_X, dev_Y, test_X, test_Y
Esempio n. 5
0
    # CLI overrides: replace the earlier defaults only when a flag was given.
    # NOTE(review): truthiness check means an explicit 0 / 0.0 / "" is
    # silently ignored — confirm that is intended.
    if args.batch_size:
        batch_size = args.batch_size
    if args.nb_epochs:
        nb_epochs = args.nb_epochs
    if args.lr:
        lr = args.lr
    if args.save_path:
        save_path = args.save_path

    # Make save_path (exist_ok avoids failing when the directory is already there)
    if save_path is not None:
        os.makedirs(os.path.join(save_path, 'sequence_models'), exist_ok=True)

    # Read data — assumes the file at data_path has 'smiles' and 'HIV_active'
    # columns (TODO: confirm against read_data).
    smiles, y = read_data(data_path,
                          col_smiles='smiles',
                          col_target='HIV_active')
    # len_percentile=100 presumably keeps all sequence lengths (no truncation
    # of outliers) — verify in generate_tokens.
    tokens, num_words, max_phrase_len = generate_tokens(smiles,
                                                        len_percentile=100)

    # Get train and test set.
    # Stratified on y to preserve class balance; fixed random_state keeps
    # the split reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        tokens,
        y,
        test_size=config.TEST_RATIO,
        shuffle=True,
        stratify=y,
        random_state=config.SEED)

    # Build and evaluate sequence models; scores are accumulated below.
    model_scores = []
Esempio n. 6
0
from dash.dependencies import Input, Output

from utils.data import read_data
from utils.summary_table import summary_table
from utils.views import index, business, credit, household, employment, playground
from utils.playground import make_charts_for_questions


app = dash.Dash(
    __name__,
    external_stylesheets=[dbc.themes.BOOTSTRAP],
    # Callbacks may reference components that are created dynamically after
    # startup, so skip Dash's initial layout-id validation.
    suppress_callback_exceptions=True,
)


# Survey responses plus a mapping between questions and their labels.
# NOTE(review): question_labels appears to map question -> label (it is
# inverted just below) — confirm against utils.data.read_data.
responses, question_labels = read_data()
# Reverse lookup: label -> question.
label_questions = {v: k for k, v in question_labels.items()}
raw_data = pd.read_csv("./static/data/raw.csv")

# Distinct column values, presumably used to populate filter controls in the
# view modules imported above — verify in utils.views.
unique_type_of_industry = raw_data["TypeofIndustry"].unique()
unique_genders = raw_data[label_questions["Gender"]].unique()
unique_states = raw_data["State"].unique()


# Single-page-app shell: dcc.Location tracks the URL; the "body" div is
# presumably populated by a routing callback defined elsewhere in this file.
app.layout = html.Div(
    className="container-fluid",
    style={"padding-right": "0px", "padding-left": "0px"},
    children=[dcc.Location(id="url", refresh=False), html.Div(id="body")],
)