Example #1
def get_drop_down_menu_val(preferred_dataset):
    global SELECTED_DATASET, INPUT_FILE, DATA_FILE, DATA

    SELECTED_DATASET = preferred_dataset
    INPUT_FILE = data_reading.INPUT_DIR + 'Question_Answer_Dataset_v1.2/' + SELECTED_DATASET + '/question_answer_pairs.txt'
    DATA_FILE = data_reading.INPUT_DIR + 'Question_Answer_Dataset_v1.2/' + SELECTED_DATASET + '/'
    DATA = data_reading.read_data(
        INPUT_FILE)  # Load data: ArticleTitle, Question, Answer & Article.

    create_wiki_titles()
    main()
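
The comment above says read_data loads ArticleTitle, Question, Answer and the article file. For context, here is a minimal sketch of what such a data_reading.read_data helper could look like; the tab-separated layout, the pandas usage and the INPUT_DIR value are assumptions, not the project's actual implementation.

# Hypothetical sketch of data_reading.read_data -- not the project's real code.
import pandas as pd

INPUT_DIR = 'input/'  # placeholder for the module-level constant used above


def read_data(input_file):
    """Load ArticleTitle, Question, Answer and ArticleFile into a DataFrame."""
    # Assumes question_answer_pairs.txt is tab-separated with a header row.
    data = pd.read_csv(input_file, sep='\t', encoding='ISO-8859-1')
    return data.dropna(subset=['Question', 'Answer'])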
Example #2
from PyAstronomy import pyasl as PAP
from scipy import signal
import scipy.optimize as SO
import consts as C
import lib
from matplotlib.colors import BoundaryNorm
import galpy as gp
import data_reading as dr

path = "/Users/htian/Documents/GitHub/rothalo/"
dpath = path + "data/"
ppath = "/Users/htian/Documents/GitHub/rothalo/plot/"

# *************** this part includes the code for reading data ***************
# this part can be modified
data = dr.read_data(-5, -0.8)
data.read_kgiant()
#*************** this part for constants *************************************
Rl, Rr, Rstp = 0, 40, 5
Zl, Zr, Zstp = -40, 40, 5
l_o = data.l_o
b_o = data.b_o
rv_o = data.rv_o
feh_o = data.feh_o
dist_o = data.dist_o
Z_o = data.Z_o
R_o = data.R_o
obj = data.name
# print(np.mean(rv_o),np.min(rv_o),np.max(rv_o))
# #*************** this part for mocking data **********************************
Example #3
import copy
import itertools

from data_reading import read_data
from data_reading import write_to_file
from tsp_functions import select_rand_k
from tsp_functions import nearest_dist
from tsp_functions import incertion_rand_k
from tsp_functions import tour_length

# !!!!! DATA READING && PREPARE !!!!!
print("!!!!! DATA READING && PREPARE !!!!!")

inputFile = 'input.txt'
outputFile = 'output.txt'

init_txt_lists, distances, cities = read_data(inputFile)

max_dist_in_txt = max(distances)

# !!!!! MAIN LOOP !!!!!

tour = []
total_length = 0
unvisit_cities = copy.deepcopy(cities)

for index, city in enumerate(cities):

    if index == 1:

        nearest_city = nearest_dist(max_dist_in_txt, init_txt_lists,
                                    rand_k_city, tour)
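
The loop above is cut off and references rand_k_city before the excerpt defines it. As a rough, self-contained illustration of the nearest-neighbour tour construction it appears to follow, here is a sketch with purely hypothetical names; it is not the tsp_functions API.

# Hypothetical nearest-neighbour sketch -- not the tsp_functions implementation.
import math


def nearest_neighbour_tour(coords):
    """Greedy tour: start at city 0, always visit the closest unvisited city next."""
    unvisited = set(range(1, len(coords)))
    tour = [0]
    while unvisited:
        last = coords[tour[-1]]
        nxt = min(unvisited, key=lambda c: math.dist(last, coords[c]))
        tour.append(nxt)
        unvisited.remove(nxt)
    return tour


# Four cities on a unit square give a greedy tour such as [0, 1, 2, 3].
print(nearest_neighbour_tour([(0, 0), (0, 1), (1, 1), (1, 0)]))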
Example #4
import tkinter as tk

import server_stuff
import data_reading
import data_parsing
import question_answering

from textblob import TextBlob
from nltk import word_tokenize

# Global variables for file management.
DATASET_OPTIONS = ['S08', 'S09', 'S10']
SELECTED_DATASET = DATASET_OPTIONS[2]
INPUT_FILE = data_reading.INPUT_DIR + 'Question_Answer_Dataset_v1.2/' + SELECTED_DATASET + '/question_answer_pairs.txt'
DATA_FILE = data_reading.INPUT_DIR + 'Question_Answer_Dataset_v1.2/' + SELECTED_DATASET + '/'

# Global variables for data read and management.
DATA = data_reading.read_data(
    INPUT_FILE)  # Load data: ArticleTitle, Question, Answer & Article.
WIKI_ARTICLES_TITLES_SET = []

# Global variables for GUI.
WINDOW = tk.Tk()


def main():
    """ Create a set with all available articles & start Graphical User Interface(GUI).

    :return: None
    :rtype: None
    """
    create_wiki_titles()
    gui_start()
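
create_wiki_titles is not shown in this excerpt. Assuming DATA is a DataFrame with an ArticleTitle column, as the load comment above suggests, it might look roughly like the following sketch; this is a guess, not the project's code.

# Hypothetical sketch -- assumes DATA exposes an 'ArticleTitle' column.
def create_wiki_titles():
    """Fill WIKI_ARTICLES_TITLES_SET with every distinct article title."""
    global WIKI_ARTICLES_TITLES_SET
    WIKI_ARTICLES_TITLES_SET = set(DATA['ArticleTitle'].dropna().unique())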
Example #5
import numpy as np
from PyAstronomy import pyasl as PAP
from scipy import signal
import scipy.optimize as SO
import consts as C
import lib
from matplotlib.colors import BoundaryNorm
import galpy as gp
import data_reading as dr

path = "/Users/htian/Documents/GitHub/rothalo/"
dpath = path + "data/"
ppath = "/Users/htian/Documents/GitHub/rothalo/plot/"

# *************** this part includes the code for reading data ***************
# this part can be modified
data = dr.read_data(-1.1, -0.9)
data.read_kgiant()
#*************** this part for constants *************************************
Rl, Rr, Rstp = 0, 40, 5
Zl, Zr, Zstp = -40, 40, 5
l_o = data.l_o
b_o = data.b_o
rv_o = data.rv_o
feh_o = data.feh_o
dist_o = data.dist_o
Z_o = data.Z_o
R_o = data.R_o
obj = data.name
print('mean radial velocity ', np.mean(rv_o), np.min(rv_o), np.max(rv_o))
# #*************** this part for mocking data **********************************
Example #6
def main(params):

    # Seed & Logging
    seed_everything(SEED)
    log_dir_path, log_path = init_logging(params)

    with open(log_path, 'a') as f:
        f.write(f'\n[base_model] {params["base_model"]}')

    # Read data
    print("Read data...")
    train_df, test_df, submission_df = read_data(params)

    # Splitter
    skf = StratifiedKFold(n_splits=N_SPLIT, shuffle=True, random_state=777)
    splits = list(skf.split(train_df.index.values, train_df.sentiment.values))
    tr_idx, val_idx = splits[params["n_fold"] - 1]
    test_idx = np.arange(N_TEST)

    if "debug" in params["mode"]:
        n_debug = int(params["mode"].split(":")[1])
        tr_idx, val_idx = tr_idx[:n_debug], val_idx[:n_debug]
        test_idx = test_idx[:n_debug]

    # Build & Compile model
    print("Build & Compile model...")
    tokenizer, base_model = get_base_model(params)
    combined_model = get_combined_model(base_model, params)

    opt = tf.keras.optimizers.Adam(learning_rate=params["lr"])
    loss = get_loss(params)
    combined_model.compile(loss=loss, optimizer=opt)

    # Prepare data
    print("Prepare data...")
    known_idx = np.array(list(set(tr_idx) | set(val_idx)))
    input_ids, attention_mask, token_type_ids, start_tokens, end_tokens, train_sample_ind2new_ind2old_ind = get_train_data(
        train_df, tokenizer, idx=known_idx)
    test_word_ids, test_mask, test_segm_ids, test_sample_ind2new_ind2old_ind = get_test_data(
        test_df, tokenizer, idx=test_idx)

    # # Model hash
    # print(f'base_model hash: {np.array(base_model(test_word_ids[:16], test_mask[:16], test_segm_ids[:16])[0]).sum():.3}')
    # print(f'head_model hash: {combined_model.layers[-6].weights[0].numpy().sum():.3}')

    # Splitting data
    print("Splitting data...")
    tr_df = train_df.loc[tr_idx].reset_index(drop=True).set_index(tr_idx)
    val_df = train_df.loc[val_idx].reset_index(drop=True).set_index(val_idx)

    tr_word_ids, tr_mask, tr_segm_ids, tr_starts, tr_ends = input_ids[
        tr_idx, ], attention_mask[tr_idx, ], token_type_ids[
            tr_idx, ], start_tokens[tr_idx, ], end_tokens[tr_idx, ]
    tr_targets = np.concatenate([tr_starts, tr_ends], axis=1)
    val_word_ids, val_mask, val_segm_ids, val_starts, val_ends = input_ids[
        val_idx, ], attention_mask[val_idx, ], token_type_ids[
            val_idx, ], start_tokens[val_idx, ], end_tokens[val_idx, ]

    # Check Correctness
    print("Check Correctness...")
    tr_df["is_correct"] = tr_df.apply(
        lambda row:
        (" " + row.text + " ").find(" " + row.selected_text + " ") >= 0,
        axis=1)
    print(f'correct samples: {tr_df["is_correct"].mean():.3f}')

    tr_df["recover_selected_text"] = get_st_prediction(
        tr_starts, tr_ends, tr_df, train_sample_ind2new_ind2old_ind)
    tr_df["recover_jaccard"] = tr_df.apply(lambda row: jaccard(
        row["recover_selected_text"], row["selected_text"]),
                                           axis=1)
    assert np.all(tr_df[tr_df["is_correct"]]["recover_jaccard"] == 1)
    print('preprocessing OK!')

    print(f'##### FOLD {params["n_fold"]} #####')
    gc.collect()

    # Model Paths & Pretraining (optional)
    best_weights_path = f'{log_dir_path}/{params["n_fold"]}/best_model.h5'
    pre_trained_weights_path = f'../attempt_logs/{params["weights_att_num"] or params["att_num"]}/{params["n_fold"]}/best_model.h5'

    pretrained_score = 0
    # if os.path.exists(pre_trained_weights_path):
    #     combined_model.load_weights(pre_trained_weights_path)
    #     start_proba, end_proba = get_proba_prediction(combined_model, val_word_ids, val_mask, val_segm_ids)
    #     pretrained_score = get_score(start_proba, end_proba, val_df, train_sample_ind2new_ind2old_ind)
    #     with open(log_path, 'a') as f:
    #         f.write(f'\nWeights PreTrained from {pre_trained_weights_path}, pretrained_score: {pretrained_score:.5f}')

    # Training (optional)
    if not params["wo_fitting"]:
        lr_scheduler = LearningRateScheduler(lambda epoch: 3e-5 * 0.2**epoch)
        custom_callback = CustomCallback(combined_model, val_word_ids,
                                         val_mask, val_segm_ids, val_df,
                                         train_sample_ind2new_ind2old_ind,
                                         params["n_fold"],
                                         params["start_epoch"], log_path,
                                         pretrained_score, best_weights_path)

        n_epoch = N_EPOCH - params["start_epoch"] + 1
        combined_model.fit(
            [tr_word_ids, tr_mask, tr_segm_ids],
            [tr_starts, tr_ends],  #tr_targets,
            batch_size=BATCH_SIZE,
            epochs=n_epoch,
            callbacks=[custom_callback, lr_scheduler],
            verbose=1,
        )

    combined_model.load_weights(best_weights_path)
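
A hedged usage sketch: the dictionary keys below are exactly the ones main() reads in the snippet above, but every concrete value is an illustrative assumption.

# Illustrative call -- key names taken from the snippet, values are assumptions.
params = {
    "base_model": "roberta-base",  # logged at the top of main()
    "n_fold": 1,                   # which StratifiedKFold split to train on
    "mode": "debug:256",           # "debug:<n>" truncates the fold indices
    "lr": 3e-5,                    # Adam learning rate
    "wo_fitting": False,           # skip training when True
    "start_epoch": 1,              # offsets the number of epochs to run
    "weights_att_num": None,       # falls back to "att_num" for pretrained weights
    "att_num": 1,
}
main(params)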