def MLP_test():
    from preprocess import process_data, partition_data
    print('processing data...')
    X, y = process_data(collapse=False,
                        encode=True,
                        normalize=True,
                        predict_missing=True,
                        k_predict=3)
    [test, validate, train] = partition_data(X, y)

    print('fitting model... ')
    from sklearn.neural_network import MLPClassifier
    model = MLPClassifier(hidden_layer_sizes=(1000, 2000, 1000, 100, 50),
                          verbose=False)
    model.fit(train[0], train[1])

    valid_prob = model.predict_proba(validate[0])

    print(valid_prob[0:5])
    print(validate[1][0:5])

    from cross_entropy import cross_entropy
    print(valid_prob.shape, validate[1].shape)
    print('cross entropy:', cross_entropy(validate[1], valid_prob))

    from risk import empirical_risk
    print('mse:', empirical_risk('mse', valid_prob, validate[1]))

    from sklearn.metrics import accuracy_score
    print('accuracy', accuracy_score(validate[1], model.predict(validate[0])))
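
# The `cross_entropy` helper imported above lives in a local module that is not
# shown here. A minimal sketch of what it plausibly computes, assuming y_true is
# one-hot encoded with the same shape as the predicted probability matrix
# (the signature is inferred from the call site and is an assumption):
import numpy as np

def cross_entropy_sketch(y_true, y_prob, eps=1e-12):
    """Mean categorical cross-entropy over all samples."""
    y_prob = np.clip(y_prob, eps, 1.0)  # guard against log(0)
    return -np.mean(np.sum(y_true * np.log(y_prob), axis=1))
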
Example #2
def data_corr():
    # Get data according to analysis above
    train_path = pr.process_data('train.csv')
    train_data = pd.read_csv(train_path, index_col=0)  # DataFrame.from_csv was removed from pandas

    sn.set(style="white")
    sn.set(font_scale=0.7)
    # Compute the correlation matrix
    corr = train_data.corr()
    # Generate a mask for the upper triangle
    mask = np.zeros_like(corr, dtype=bool)  # np.bool was removed from NumPy
    mask[np.triu_indices_from(mask)] = True

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(11, 9))

    # Generate a custom diverging colormap
    cmap = sn.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    sn.heatmap(corr,
               mask=mask,
               cmap=cmap,
               vmax=.3,
               square=True,
               linewidths=.5,
               cbar_kws={"shrink": .5},
               ax=ax)
    plt.show()
Example #3
def save_data(filename):
    """
    Save kaggle json data file called 'filename' to 'processed_<filename>'
    after pre-processing it.
    """
    print "Creating processed datafile for '{}'".format(filename)

    data_path = os.path.join(PROJECT_ROOT, 'data', filename)
    proc_path = os.path.join(PROJECT_ROOT, 'data', "processed_" + filename)
    try:
        pre = os.path.basename(data_path)
        post = os.path.basename(proc_path)

        print "\nOpening '{}'...".format(pre)
        with open(data_path, 'r') as datafile:
            data = pd.read_json(datafile)
        data = process_data(data)

        print "Writing processed data to '{}'...".format(post)

        with open(proc_path, "w") as processed_file:
            data.to_json(processed_file)

        print "Finished processing '{}' into '{}'.".format(pre, post)
    except IOError as ioe:
        print "Failed to process {} to file.".format(data_path)
        print ioe
Example #4
def learn(basepath, features_file, labels_file):
    # Load the data
    print('Loading data...')
    features_data = pd.read_msgpack(load_data(basepath, features_file)['data'])
    labels_data = pd.read_msgpack(load_data(basepath, labels_file)['data'])
    df = pd.concat([features_data,labels_data], axis=1)

    # Process features
    samples, labels = preprocess.process_data(df)

    # How many samples are we going to leave out for the test set?
    nb_test = int(len(labels) * 0.2)
    split = len(labels) - nb_test

    # Prepare training and test sets
    X_train = np.array(samples[:split])
    y_train = labels[:split]
    X_test = np.array(samples[split:])  # start at `split` so the boundary sample is not dropped
    y_test = labels[split:]
    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')

    # How many classes?
    num_classes = np.max(labels)+1
    print(num_classes, 'classes')

    # Train Model
    train_and_save(X_train, X_test, y_train, y_test, num_classes, basepath)
Example #5
def learn(basepath, features_file, labels_file):
    # Load the data
    print('Loading data...')
    features_data = pd.read_msgpack(load_data(basepath, features_file)['data'])
    labels_data = pd.read_msgpack(load_data(basepath, labels_file)['data'])
    df = pd.concat([features_data, labels_data], axis=1)

    # Process features
    samples, labels = preprocess.process_data(df)

    # How many samples are we going to leave out for the test set?
    nb_test = int(len(labels) * 0.2)
    split = len(labels) - nb_test

    # Prepare training and test sets
    X_train = np.array(samples[:split])
    y_train = labels[:split]
    X_test = np.array(samples[split:])  # start at `split` so the boundary sample is not dropped
    y_test = labels[split:]
    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')

    # How many classes?
    num_classes = np.max(labels) + 1
    print(num_classes, 'classes')

    # Train Model
    train_and_save(X_train, X_test, y_train, y_test, num_classes, basepath)
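
# The two `learn` examples above carve the test set off the tail of the data by
# hand. A shuffled hold-out split with scikit-learn achieves the same goal in
# one call; this is only a sketch of an alternative, not part of the original
# pipeline (it reuses the `samples` and `labels` produced by process_data):
import numpy as np
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    np.array(samples), np.array(labels), test_size=0.2, random_state=42)
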
 def test_preprocess_ud(self):
     'Test loading in a ud data file.'
     infile = "testdata/test_wo_ud_input.txt"
     input_text: List[str] = preprocess.process_data(infile, "ud")
     expected = ("Ndax ku jëkk a jël dakkantal boobu, di Decce Fu Njogu "
                 "FAAL, dañu naan damm na li Kajoor seqante woon ak Jolof, "
                 "ndax Jolof moo nangu woon Kajoor.")
     self.assertEqual(input_text[0], expected)
 def test_load_missing_file(self):
     'Test loading in a file that does not exist.'
     infile = ""
     try:
         input_text: List[str] = preprocess.process_data(infile, "ud")
     except Exception:
         input_text = None
     expected = None
     self.assertEqual(input_text, expected)
 def test_end_to_end_with_file(self):
     'Test loading a file and normalizing it.'
     infile = "testdata/test_zu_lcc_input.tsv"
     input_text: List[str] = preprocess.process_data(infile, "lcc")
     normalized_text = norm.token_normalizer(input_text[0])
     expected = ("iningizimu afrika iyizwe elisezansi ezwenikazi "
                 "lase-afrika yaziwa ngokusemthethweni ngokuthi "
                 "iriphabhuliki yaseningizimu afrika")
     self.assertEqual(normalized_text, expected)
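
# The indented test methods above (and the similar ones later in this listing)
# appear to come from a unittest.TestCase subclass. A minimal scaffold they
# could be pasted into, assuming `preprocess` is importable and a `norm`
# normalizer fixture exists for the end-to-end test (both names are taken from
# the method bodies, so this wrapper is an assumption):
import unittest
from typing import List

import preprocess  # module exercised by the tests above

class ProcessDataTest(unittest.TestCase):
    # paste the test_* methods shown above here, unchanged
    pass

if __name__ == "__main__":
    unittest.main()
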
Example #9
def initialize_data():
    global gh_scraper, generator, logger

    # scraping COVID-19 data
    gh_scraper.scrape()
    reports, countries = gh_scraper.cache, gh_scraper.valid_countries
    dates = process_dates(reports)
    data = process_data(reports, countries)

    generator = DataGenerator(dates, data, countries)
Example #10
def get_inputs_labels_embedding_matrix(dataPath, emb_flag):
    reviews, labels = pp.process_data(dataPath)
    tokens = []
    indexes = []
    segments = []

    review_input = np.zeros(
        (len(reviews), config.MAX_SENTS, config.MAX_SENT_LENGTH),
        dtype='int32')

    for i, sentences in enumerate(reviews):
        for j, sent in enumerate(sentences):
            new_indexed_tokens = []
            if j < config.MAX_SENTS:
                tokenized_text, indexed_tokens, segments_ids = bp.get_tokenized_text(
                    sent)
                tokens += tokenized_text
                indexes += indexed_tokens
                segments += segments_ids
                if len(indexed_tokens) < config.MAX_SENT_LENGTH:
                    new_indexed_tokens += [0] * (config.MAX_SENT_LENGTH -
                                                 len(indexed_tokens))
                    review_input[i][j] = indexed_tokens + new_indexed_tokens
                else:
                    review_input[i][j] = indexed_tokens[:config.
                                                        MAX_SENT_LENGTH]

    if emb_flag == 1:
        del reviews
        indexes = torch.tensor(indexes)
        segments = torch.tensor(segments)

        bert_dataset = TensorDataset(indexes, segments)
        batch_size = 512
        train_dataloader = DataLoader(bert_dataset,
                                      batch_size=batch_size,
                                      drop_last=True)
        emb_matrix = torch.tensor(
            np.random.random(
                (len(tokenizer.vocab) + 1, config.embedding_dim))).to(device)
        for batch in tqdm(train_dataloader):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_input_ids = b_input_ids.view((1, len(b_input_ids))).to(device)
            b_input_mask = b_input_mask.view((1, len(b_input_mask))).to(device)
            emb_matrix = bp.get_embeddings(
                bp.get_encoded_layers(b_input_ids, b_input_mask), emb_matrix,
                b_input_ids).to(device)
        emb_matrix = emb_matrix.cpu()
        emb_matrix = emb_matrix.numpy()[:, :]
        np.save('embeddings.npy', emb_matrix)
    else:
        emb_matrix = np.load("./embeddings.npy")

    return review_input, labels, emb_matrix
 def test_preprocess_oscar(self):
     'Test loading in an oscar data file.'
     infile = "testdata/test_af_oscar_input.txt"
     input_text: List[str] = preprocess.process_data(infile, "oscar")
     expected = ("Nadat dit duidelik geword het dat die Regering die "
                 "aangeleentheid nie verder sou voer nie, het die "
                 "Volksraad die ANC-regering se miskenning van "
                 "internasionaal-aanvaarde regte en verpligtinge en die "
                 "vergrype teen ons volk, onder die aandag van die "
                 "internasionale gemeenskap gebring.")
     self.assertEqual(input_text[0], expected)
 def test_preprocess_lcc(self):
     'Test loading in an lcc data file.'
     infile = "testdata/test_so_lcc_input.txt"
     input_text: List[str] = preprocess.process_data(infile, "lcc")
     expected = (
         "Dhamaha iyo madhamaha التام والناقص Falalka "
         "madhamaha waxay ku jirtaa weerta magaca ah iyadoo ka "
         "dhigaysa mid waqti cayiman dhacay ama qaab cayiman u "
         "dhacay, wa xayna u dhexeysaa falalka dhamaha iyo qodobada "
         """"xarfaha macnayaalka" ((أحرف المعاني)).""")
     self.assertEqual(input_text[0], expected)
Example #13
def analyze(basepath, filename):
    global LOOKBACK, MAX_MIDPOINT_DELTA

    # Load the data & model
    df = load_data(basepath, filename)
    model = load_model(basepath)

    # Process data
    samples, labels = preprocess.process_data(df)

    # Test model and print results
    print "Running analysis..."
    predictions = model.predict_classes(samples, batch_size=32)
    print_results(labels, predictions)
Example #14
def analyze(basepath, filename):
    global LOOKBACK, MAX_MIDPOINT_DELTA

    # Load the data & model
    df = load_data(basepath, filename)
    model = load_model(basepath)

    # Process data
    samples, labels = preprocess.process_data(df)

    # Test model and print results
    print "Running analysis..."
    predictions = model.predict_classes(samples, batch_size=32)
    print_results(labels, predictions)
def main():
    # read data file
    raw_df = pd.read_csv("filtered_sentences.tsv", sep="\t")
    # preprocess
    data = preprocess.process_data(raw_df)
    raw_sentences = [x["SENTENCE"] for x in data.values()]
    processed_sentences = [x["PROCESSED"] for x in data.values()]
    labels = [x["LABEL"] for x in data.values()]
    # split train and test datasets
    test_split = 0.3
    num_test_items = int(math.ceil(test_split * (len(processed_sentences))))

    train_sentences = processed_sentences[:-num_test_items]
    train_labels = labels[:-num_test_items]
    test_sentences = processed_sentences[-num_test_items:]
    test_labels = labels[-num_test_items:]
    # get tfidf features
    Xtrn, Xtst, Ytrn, Ytst = get_tfidf_features(train_sentences, test_sentences, train_labels, test_labels)
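
# get_tfidf_features is referenced but not shown in this example. A plausible
# sketch based on its call signature, using scikit-learn's TfidfVectorizer
# (an assumption about the helper, not the original implementation):
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tfidf_features_sketch(train_sentences, test_sentences, train_labels, test_labels):
    vectorizer = TfidfVectorizer()
    Xtrn = vectorizer.fit_transform(train_sentences)  # fit the vocabulary on training text only
    Xtst = vectorizer.transform(test_sentences)
    return Xtrn, Xtst, train_labels, test_labels
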
Example #16
def main():
	
	all_res_genes = []
	
	genes, data = process_data(in_file)
	for i in range(len(data)):
		res_idx = get_res_genes(data[i])
		res_genes = []
		for j in range(len(res_idx)):
			res_genes.append(genes[res_idx[j]])  # map the resistant index back to its gene name
		print("resistant genes found: {}".format(len(res_genes)))
		all_res_genes.append(res_genes)
	
	res_gene_list = all_res_genes[0]
	for i in range(len(all_res_genes)):
		res_gene_list = [x for x in res_gene_list if x in all_res_genes[i]]
		
	print(res_gene_list)
	print(len(res_gene_list))
def cross_validate():
    net_hidden_layers = [
        (100,),
        (1000,),
        (100, 100),
        (1000, 1000),
        (1000, 100),
        (1000, 100, 100),
        (1000, 1000, 100),
        (1000, 1000, 100, 100),
        (1000, 2000, 100, 500, 100),
        (2000, 1000, 500, 100, 50),
    ]
    models = [FFNN(h) for h in net_hidden_layers]

    from preprocess import process_data, partition_data
    print('processing data...')
    X, y = process_data(collapse=False,
                        encode=True,
                        normalize=True,
                        predict_missing=True,
                        k_predict=3)

    from cross_validation import cross_validation
    r = cross_validation(X, y, models)

    print(r)
    i = np.argmin(r)

    print('best model...', net_hidden_layers[i])
    model = FFNN(net_hidden_layers[i])

    partitioned_data = partition_data(X, y, partitions=[0.2, 0.8])

    train = partitioned_data[1]
    valid = partitioned_data[0]

    model.fit(train[0], train[1])
    p = model.predict(valid[0])

    from evaluate import evaluate
    print(valid[1].shape, p.shape)
    evaluate(valid[1], p)
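
# The local cross_validation helper used above returns one score per model and
# np.argmin picks the winner, so lower must mean better. A minimal k-fold
# sketch of that idea, assuming each FFNN exposes fit/predict_proba like an
# sklearn estimator (an assumption about the local interface):
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss

def cross_validation_sketch(X, y, models, n_splits=5):
    scores = []
    for model in models:
        fold_losses = []
        for train_idx, val_idx in KFold(n_splits=n_splits, shuffle=True).split(X):
            # ideally re-initialize the model per fold to avoid warm-starting
            model.fit(X[train_idx], y[train_idx])
            fold_losses.append(log_loss(y[val_idx], model.predict_proba(X[val_idx])))
        scores.append(np.mean(fold_losses))
    return scores
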
Example #18
def run(subject, steps):
    # ===========================PROCESSING START===============================
    processing_start = time.time()

    X_Data, Y_Data = process_data(subject, steps)

    enc = preprocessing.OneHotEncoder(categories='auto')
    enc.fit(Y_Data)
    Y_Data = enc.transform(Y_Data).toarray()

    X_train, X_test, Y_train, Y_test = train_test_split(X_Data,
                                                        Y_Data,
                                                        test_size=0.25)

    processing_end = time.time()
    print("Preprocessed data in: %.2f sec\n" %
          (processing_end - processing_start))
    # ===========================TRAINING START===============================

    # train_start = time.time()

    model = train(X_train, Y_train)
    # save_model(model, "model_roi1_6steps.json")
    # model = load_model("model_roi1_5steps.json")
    f1, acc = evaluate(model, X_test, Y_test)

    # train_end = time.time()
    # print("Train and test in: %.2f sec" % (train_end - train_start))

    # ===========================TRAINING END===============================

    # result = {
    #     "processing_start": processing_start,
    #     "processing_end": processing_end,
    #     "train_start": train_start,
    #     "train_end": train_end
    # }
    #
    # with open("timestamps.json", "w") as outfile:
    #     json.dump(result, outfile)

    return subject, steps, f1, acc
Example #19
def main():
    # read data file
    raw_df = pd.read_csv("filtered_sentences.tsv", sep="\t")
    # preprocess
    data = preprocess.process_data(raw_df)
    raw_sentences = [x['SENTENCE'] for x in data.values()]
    processed_sentences = [x['PROCESSED'] for x in data.values()]
    labels = [x['LABEL'] for x in data.values()]
    # split train and test datasets
    test_split = 0.3
    num_test_items = int(math.ceil(test_split * (len(processed_sentences))))

    train_sentences = processed_sentences[:-num_test_items]
    train_labels = labels[:-num_test_items]
    test_sentences = processed_sentences[-num_test_items:]
    test_labels = labels[-num_test_items:]
    # get tfidf features
    Xtrn, Xtst, Ytrn, Ytst = get_tfidf_features(train_sentences,
                                                test_sentences, train_labels,
                                                test_labels)
Example #20
pat_reprs = []
pat_reprs_dict = {}
pat_dict = {}
count = 0

####################################################################

for file_l in range(len(file_lis)):
    data1, data2, data3, label_train, train_len, test_len,\
             elmo_model, categ, key = ld.load_mimic(file_lis[file_l], elmo_fname)

    data_l, data_r, data_z, data_c, embedding_matrix, idf_sent_1 = prs.process_data(
        data1,
        data2,
        data3,
        categ,
        elmo_model,
        dimx=dimx,
        dimy=dimy,
        vocab_size=vocab_size,
        embedding_dim=embedding_dim)

    X_train_l, X_test_l, X_dev_l, X_train_r, X_test_r, X_dev_r, X_train_z, X_test_z, X_dev_z, X_train_c, X_test_c, X_dev_c, idf_train_l, idf_test_l, idf_dev_l = ld.prepare_train_tests(
        data_l, data_r, data_z, data_c, train_len, test_len, idf_sent_1)

    np_dist = np.load(ssm_np[file_l])
    np_dist = np_dist[0]
    np_dist = np_dist[0:63, 0:63]
    ssm_np_train = X_train_r.shape[0] * [np_dist]
    ssm_np_train = np.asarray(ssm_np_train)
    np_shape = np_dist.shape[0]
Example #21
def test():

	genes, data = process_data(in_file)
	print(genes)
	print(data)
def main(argv):
    """Normalizes text by all steps in the text normalizer.

    If given an input string and a language flag, will normalize that string
    using the language's config. If given the language flag and data source
    flag, will normalize the file listed for that data source in the language's
    config file, saving the output to a new tsv file.
    """
    try:
        language = importlib.import_module("config." + FLAGS.language)
    except:
        raise app.UsageError("Needs a value for the language flag.")

    norm = normalizer_lib.NormalizerLib(FLAGS.language)

    if len(argv) > 1:
        raise app.UsageError("Too many command-line arguments.")

    if FLAGS.string_to_normalize is not None:
        input_text: str = FLAGS.string_to_normalize
        print("TOKEN_BASED:\t" +
              norm.token_normalizer(FLAGS.string_to_normalize))
        print("SENTENCE_BASED:\t" +
              norm.sentence_normalizer(FLAGS.string_to_normalize))
    else:
        data_source: str = FLAGS.data_source
        if data_source == "ud":
            infile = language.UD
        elif data_source == "um":
            infile = language.UM
        elif data_source == "ac":
            infile = language.AC
        elif data_source == "oscar":
            infile = language.OSCAR
        elif data_source == "lcc":
            infile = language.LCC
        try:
            input_text: List[str] = preprocess.process_data(
                infile, FLAGS.data_source)
        except Exception:
            print(f"No data file from '{data_source}' for '{FLAGS.language}'.")
            return

        experiment_dir: str = "./output/" + FLAGS.experiment
        if not os.path.exists(experiment_dir):
            os.makedirs(experiment_dir)
        condition: str = ("language=" + FLAGS.language + "_" + "datasource=" +
                          data_source + "_" + "passvalid=" + FLAGS.pass_valid)
        outfile_human_readable: str = ("./output/" + FLAGS.experiment + "/" +
                                       condition + "_" + "humanreadable.tsv")
        outfile_unnormalized: str = ("./output/" + FLAGS.experiment + "/" +
                                     condition + "_" + "unnormalized.p")
        outfile_normalized: str = ("./output/" + FLAGS.experiment + "/" +
                                   condition + "_" + "normalized.p")

        unnormalized_data_for_lm = []
        normalized_data_for_lm = []

        human_readable_output = open(outfile_human_readable, "w")
        human_readable_output.write(
            "SENTENCE_ID\tUNNORMALIZED_TEXT\tNORMALIZED_TEXT\n")

        i = 0
        for line in tqdm(input_text):
            sentence_id: str = str(i)
            sentence_text: str = line.strip()
            if FLAGS.pass_valid == "token":
                normalized_text: str = norm.token_normalizer(sentence_text)
            elif FLAGS.pass_valid == "sentence":
                normalized_text: str = norm.sentence_normalizer(sentence_text)
            newline = (sentence_id + "\t" + sentence_text + "\t" +
                       normalized_text + "\n")
            human_readable_output.write(newline)
            unnormalized_data_for_lm.append(sentence_text.split(" "))
            normalized_data_for_lm.append(normalized_text.split(" "))
            i += 1
            # files pickled here after each line so there's data in case the
            # process ends before normalizing the entire data file
            pickle.dump(unnormalized_data_for_lm,
                        open(outfile_unnormalized, "wb"))
            pickle.dump(normalized_data_for_lm, open(outfile_normalized, "wb"))
Example #23
import numpy as np
import preprocess as pp
from keras.models import model_from_json

question1, question2 = pp.extract_data("quora-question-pairs/test.csv", 'test')
question1_word_sequences, question2_word_sequences, word_index = pp.tokenize(
    question1, question2)
embeddings_index = pp.get_embeddings("glove.840B.300d/glove.840B.300d.txt")
nb_words, word_embedding_matrix = pp.get_embedding_matrix(
    word_index, embeddings_index)
q1_data, q2_data, word_embedding_matrix, nb_words = pp.process_data(
    question1_word_sequences, question2_word_sequences, word_embedding_matrix,
    nb_words, 'test')

X_train = np.stack((q1_data, q2_data), axis=1)
Q1_train = X_train[:, 0]
Q2_train = X_train[:, 1]

json_file = open('best_weights/model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
model = model_from_json(loaded_model_json)
model.load_weights("best_weights/weights.h5")
print("Loaded model from disk")

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
score = model.predict([Q1_train, Q2_train])
print(score)
Example #24
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout

from model import modelfit
from preprocess import process_data
from model_report import model_report
from save_results import save_results
import load_data
gamma1_filename = load_data.gamma1_filename
neutron1_filename = load_data.neutron1_filename

(X_train, X_test, Y_train, Y_test) = process_data(gamma1_filename,
                                                  neutron1_filename)

# Create model here given constraints in the problem
model = Sequential()

model.add(Dense(1024))
model.add(Activation('relu'))
model.add(Dropout(0.30))
model.add(Dense(600))
model.add(Activation('relu'))
model.add(Dropout(0.20))
model.add(Dense(50))
model.add(Activation('relu'))
model.add(Dropout(0.20))
model.add(Dense(1))
#model.add(Activation('softmax'))
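
# The example above is cut off before the output activation and the compile
# step. A typical completion for a single-unit binary classifier (a sketch of
# the missing tail, not the original code, which wraps fitting in `modelfit`):
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# model.fit(X_train, Y_train, validation_data=(X_test, Y_test))  # or via modelfit(...)
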
Example #25
def train(args):
    KB_file = 'data/2H-kb.txt'
    data_file = 'data/2H.txt'
    start = time.time()
    Q, A, P, S, Triples, args.query_size, word2id, ent2id, rel2id = process_data(
        KB_file, data_file)
    args.path_size = len(P[0])
    args.nhop = args.path_size // 2  # integer number of hops

    print("read data cost %f seconds" % (time.time() - start))
    args.nwords = len(word2id)
    args.nrels = len(rel2id)
    args.nents = len(ent2id)

    trainQ, testQ, trainA, testA, trainP, testP, trainS, testS = train_test_split(
        Q, A, P, S, test_size=.1, random_state=123)
    trainQ, validQ, trainA, validA, trainP, validP, trainS, validS = train_test_split(
        trainQ, trainA, trainP, trainS, test_size=.11, random_state=0)

    n_train = trainQ.shape[0]
    n_test = testQ.shape[0]
    n_val = validQ.shape[0]
    print(trainQ.shape, trainA.shape, trainP.shape, trainS.shape)

    # locate the position (index) of each answer
    train_labels = np.argmax(trainA, axis=1)
    test_labels = np.argmax(testA, axis=1)
    valid_labels = np.argmax(validA, axis=1)
    batches = list(
        zip(range(0, n_train - args.batch_size, args.batch_size),
            range(args.batch_size, n_train, args.batch_size)))
    pre_batches = list(
        zip(range(0, Triples.shape[0] - args.batch_size, args.batch_size),
            range(args.batch_size, Triples.shape[0], args.batch_size)))

    model = IRN(args)
    optimizer = optim.Adam(model.parameters(), args.init_lr, weight_decay=1e-5)
    pre_val_preds = model.predict(Triples, validQ, validP)
    pre_test_preds = model.predict(Triples, testQ, testP)
    for t in range(args.nepoch):
        np.random.shuffle(batches)
        for i in range(args.inner_nepoch):
            np.random.shuffle(pre_batches)
            pre_total_cost = 0.0
            for s, e in pre_batches:
                pretrain_loss = model.batch_pretrain(
                    Triples[s:e], trainQ[0:args.batch_size],
                    trainA[0:args.batch_size],
                    np.argmax(trainA[0:args.batch_size],
                              axis=1), trainP[0:args.batch_size])
                optimizer.zero_grad()
                pretrain_loss.backward()
                optimizer.step()
        total_cost = 0.0

        for s, e in batches:
            total_cost = model(Triples[s:e], trainQ[s:e], trainA[s:e],
                               np.argmax(trainA[s:e], axis=1), trainP[s:e])
            optimizer.zero_grad()
            total_cost.backward()
            optimizer.step()
        if t % 1 == 0:

            train_preds = model.predict(Triples, trainQ, trainP)
            train_acc = MultiAcc(trainP, train_preds, model._path_size)
            train_true_acc = InSet(trainP, trainS, train_preds)

            val_preds = model.predict(Triples, validQ, validP)
            val_acc = MultiAcc(validP, val_preds, model._path_size)
            val_true_acc = InSet(validP, validS, val_preds)

            print('-----------------------')
            print('Epoch', t)
            print('Train Accuracy:', train_true_acc)
            print('Validation Accuracy:', val_true_acc)
            print('-----------------------')
Example #26
from train_test import train, evaluate, save_model, load_model

args = sys.argv
if len(args) >= 3:
    roi = int(args[1])
    steps = int(args[2])
else:
    roi = 1
    steps = 6

print("\nRun pipeline with ROI #%d and %d time steps.\n" % (roi, steps))

# ===========================PROCESSING START===============================
processing_start = time.time()

X_Data, Y_Data = process_data(roi, steps)

enc = preprocessing.OneHotEncoder(categories='auto')
enc.fit(Y_Data)
Y_Data = enc.transform(Y_Data).toarray()

X_train, X_test, Y_train, Y_test = train_test_split(X_Data,
                                                    Y_Data,
                                                    test_size=0.2)

processing_end = time.time()
print("Preprocessed data in: %.2f sec\n" % (processing_end - processing_start))
# ===========================TRAINING START===============================

train_start = time.time()
Example #27
def main(args):
    """Main function."""
    # Basic settings
    best_ci = 0
    best_epoch = 0
    best_train_loss = 10000
    rounds = args.rounds

    # Set CUDA device
    cuda_name = "cuda:" + str(args.cudanum)
    device = torch.device(cuda_name if torch.cuda.is_available() else "cpu")

    # Modeling...
    modeling = [GINConvNet, GATNet, GAT_GCN, GCNNet][args.model]
    model_st = modeling.__name__
    print(model_st)
    model = modeling().to(device)

    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)    # Adam

    # Load data
    train_data = pd.read_csv("../../Data/BindingDB/BindingDB_values_mixed_train_ki_filter.csv")
    val_data = pd.read_csv("../../Data/BindingDB/BindingDB_values_mixed_val_ki_filter.csv")
    test_data = pd.read_csv("../../Data/BindingDB/BindingDB_values_mixed_test_ki_filter.csv")

    train_set = process_data(train_data)
    val_set = process_data(val_data)
    test_set = process_data(test_data)

    train_generator = TestbedDataset(root = 'dataset', dataset = 'BindingDB_train', groups=train_set[0], xd = train_set[1],
                                xt = train_set[2], y = train_set[3], smile_graph = train_set[4])
    val_generator = TestbedDataset(root = 'dataset', dataset = 'BindingDB_val', groups=val_set[0], xd = val_set[1],
                                xt = val_set[2], y = val_set[3], smile_graph = val_set[4])
    test_generator = TestbedDataset(root = 'dataset', dataset = 'BindingDB_test', groups=test_set[0], xd = test_set[1],
                                xt = test_set[2], y = test_set[3], smile_graph = test_set[4])

    # Make mini-batch processing
    train_loader = DataLoader(train_generator, batch_size = args.batchsize, shuffle = True)
    val_loader = DataLoader(val_generator, batch_size = args.batchsize, shuffle = False)
    test_loader = DataLoader(test_generator, batch_size = args.batchsize, shuffle = False)

    # Training...
    print("Training.....")
    for epoch in range(args.epochs):
        print("===============Go for Training===============")
        train_loss = train(model, device, train_loader, optimizer, epoch+1)

        # Validation...
        G, P, group_li = predicting(model, device, val_loader)
        val_ci = ci(G, P)

        # Get length of validation set
        result = {}
        for gl in group_li:
            if result.get(gl) is None:
                result[gl] = 1
            else:
                result[gl] += 1

        lens = []
        lens.extend(result.values())

        # Skip len=1 data
        k = 0
        new_G, new_P, new_lens = [], [], []
        for ll in lens:
            if ll == 1:
                k += 1
            else:
                new_G.extend(G[k:k+ll])
                new_P.extend(P[k:k+ll])
                new_lens.append(ll)
                k += ll
        new_G, new_P = np.array(new_G), np.array(new_P)

        # Calculate Weighted CI, Average CI of validation set
        s = 0
        w_ci,a_ci = [],[]
        for l in new_lens:
            try:
                w_ci.append(l*ci(new_G[s:s+l],new_P[s:s+l]))
                a_ci.append(ci(new_G[s:s+l],new_P[s:s+l]))
            except:
                pass
            s += l
        weight_ci, average_ci = np.sum(w_ci)/np.sum(new_lens), np.mean(a_ci)
        print("===============Go for Validation===============")
        print("Weighted CI:",weight_ci)
        print("Average CI:",average_ci)
        print("Overall CI:",val_ci)

        files = open("bestResult/GraphDTA_"+model_st+"_BindingDB_ki_result"+str(args.rounds)+".txt",'a')
        files.write("val_averageCI: "+str(average_ci)+", val_weightedCI: "+str(weight_ci)+", val_overallCI: "+str(val_ci)+", train_loss: "+str(train_loss)+'\n')
        model_name = "bestModel/GraphDTA_"+model_st+"_BindingDB_ki_"+str(rounds)+".model"

        # Save the best result
        if average_ci > best_ci:
            best_ci = average_ci
            best_epoch = epoch
            best_train_loss = train_loss
            # Save best model
            print("Saving the best model...")
            torch.save(model.state_dict(), model_name)

    print("===============Go for Testing===============")
    # Load the model
    model.load_state_dict(torch.load(model_name))

    # Testing...
    test_G, test_P, test_group_li = predicting(model, device, test_loader)
    test_CI, test_MSE = ci(test_G,test_P), mse(test_G,test_P)

    # Get length of testing set
    t_result = {}
    for t_gl in test_group_li:
        if t_result.get(t_gl) is None:
            t_result[t_gl] = 1
        else:
            t_result[t_gl] += 1

    t_lens = []
    t_lens.extend(t_result.values())    
    
    # Skip len=1 data
    t_k = 0
    t_new_G,t_new_P,t_new_lens = [],[],[]
    for t_ll in t_lens:
        if t_ll == 1:
            t_k += 1
        else:
            t_new_G.extend(test_G[t_k:t_k+t_ll])
            t_new_P.extend(test_P[t_k:t_k+t_ll])
            t_new_lens.append(t_ll)
            t_k += t_ll
    t_new_G, t_new_P = np.array(t_new_G), np.array(t_new_P)

    # Calculate Weighted CI, Average CI of testing set
    t_s = 0
    t_w_ci,t_a_ci = [],[]
    for t_l in t_new_lens:
        try:
            t_w_ci.append(t_l*ci(t_new_G[t_s:t_s+t_l],t_new_P[t_s:t_s+t_l]))
            t_a_ci.append(ci(t_new_G[t_s:t_s+t_l],t_new_P[t_s:t_s+t_l]))
        except:
            pass
        t_s += t_l
    test_weight_ci, test_average_ci = np.sum(t_w_ci)/np.sum(t_new_lens), np.mean(t_a_ci)

    # Save the testing result
    files.write("test_MSE:" + str(test_MSE) + ", test_averageCI:" + 
                str(test_average_ci) + ", test_weightedCI:" + str(test_weight_ci) + ", test_overallCI:" + str(test_CI) + "\n")
    files.write("best_epoch:" + str(best_epoch + 1) + ", best_train_loss:" + str(best_train_loss) + "\n")
Example #28
        driver.quit()

    def _rename(self, country):
        """ Rename countries to lower case, and perform some common renamings. """
        if country == "United States":
            country = "US"
        if country == "The Mainland of China":
            country = "China"
        if country == "Korea (Rep.)":
            country = "Korea, South"

        return country.lower()


if __name__ == "__main__":
    # TESTING ONLY
    scraper = GithubScraper(None)
    scraper.download_reports()
    reports = scraper.cache
    valid_countries = scraper.valid_countries

    data = process_data(reports, valid_countries)
    dates = process_dates(reports)
    confirmed = data[CONFIRMED]
    deaths = data[DEATHS]

    generator = DataGenerator(dates, data, valid_countries)
    top_movers = generator.top_movers()

    countries, top10 = generator.top_contributors()
 def test_preprocess_ac(self):
     'Test loading in an ac data file.'
     infile = "testdata/test_mg_ac_input.txt"
     input_text: List[str] = preprocess.process_data(infile, "ac")
     expected = "amin'ny habakabaka"
     self.assertEqual(input_text[0], expected)
 def test_preprocess_um(self):
     'Test loading in a um data file.'
     infile = "testdata/test_zu_um_input.txt"
     input_text: List[str] = preprocess.process_data(infile, "um")
     expected = "ubuntu"
     self.assertEqual(input_text[0], expected)
Example #31
# get rid of first line
file_train_data = file_train_data[1:]
file_test_data = file_test_data[1:]

# must do
preprocess.clean_data_isalpha(file_train_data)
preprocess.clean_data_isalpha(file_test_data)

######### Balance the data set #########
#tag, info_dic_train = preprocess.clean_data_helper_get_info(file_train_data)
#file_train_data = preprocess.clean_data_balance(file_train_data, tag)
######### Balance the data set #########

# store the data into separate arrays
article_number, text_data, article_topic = preprocess.process_data(
    file_train_data)
test_number, test_data, test_topic = preprocess.process_data(file_test_data)

(unique, counts) = np.unique(article_topic, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The frequency of different articles types (training set)")
print(frequencies)

(unique, counts) = np.unique(test_topic, return_counts=True)
frequencies = np.asarray((unique, counts)).T
print("The frequency of different articles types (testing set)")
print(frequencies)
print()

# start to preprocess
train_data = np.array(text_data)
Example #32
def get_hero_race(history):
    clean_text = process_data([history])
    vector = vectorizer.transform(clean_text)
    result = model.predict(vector)
    print(result)
    return result[0].strip()


if __name__ == '__main__':

    from preprocess import process_data, partition_data
    print('processing data...')
    X, y = process_data(collapse=False,
                        encode=True,
                        normalize=True,
                        predict_missing=True,
                        k_predict=3)

    partitioned_data = partition_data(X, y, partitions=[0.2, 0.8])

    train = partitioned_data[1]
    valid = partitioned_data[0]

    #model = FFNN((1000, 100, 100), num_iterations=500)
    model = FFNN((1000, 10000, 1000, 100, 50), num_iterations=500)

    model.fit(train[0], train[1])

    from evaluate import evaluate
    evaluate(train[1], model.predict(train[0]))
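
# partition_data is imported from the local preprocess module throughout this
# listing but never shown. The call sites imply it returns one (X_part, y_part)
# tuple per requested fraction, in the order the fractions are given. A minimal
# sketch consistent with that usage (the default fractions and the shuffling
# are assumptions):
import numpy as np

def partition_data_sketch(X, y, partitions=(0.2, 0.2, 0.6), seed=0):
    idx = np.random.default_rng(seed).permutation(len(X))
    bounds = np.cumsum([int(round(f * len(X))) for f in partitions[:-1]])
    return [(X[s], y[s]) for s in np.split(idx, bounds)]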