Example #1
    def __init__(self,
                 path_sim_word,
                 path_score,
                 path_id2word,
                 path_kneighbor,
                 threshold_ns=0.005,
                 threshold_ps=0.2):

        self.path_sim_word = path_sim_word
        self.path_score = path_score
        self.path_id2word = path_id2word
        self.path_kneighbor = path_kneighbor
        self.threshold_ps = threshold_ps
        self.threshold_ns = threshold_ns
        # reference set of similar words used for evaluation
        synset = self.get_synset(self.path_sim_word)
        # precomputed pair scores, k-nearest neighbours and id-to-word mapping
        score = load_from_pkl(self.path_score)
        kneighbor = load_from_pkl(self.path_kneighbor)
        id2word = self.get_id2word(self.path_id2word)
        # select positive (ps) and negative (ns) samples by score threshold
        ps, md, ns = self.get_psns(score, id2word, kneighbor,
                                   self.threshold_ps, self.threshold_ns)
        dump_to_pkl(ps, 'data/test/ps_0.01_test.pkl')
        dump_to_pkl(ns, 'data/test/ns_0.01_test.pkl')
        dump_to_pkl(md, 'data/sample/md_3_0.2_0.01.pkl')
        # evaluate the positive samples against the reference set
        f1, precision, recall = self.ps_stat(synset, ps)
        print('precision: %f' % precision)
        print('recall: %f' % recall)
        print('f1: %f' % f1)
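The load_from_pkl and dump_to_pkl helpers used throughout these examples are not shown. A minimal sketch, assuming they are thin wrappers around the standard pickle module:

import pickle

def load_from_pkl(path):
    """Load a pickled object from path."""
    with open(path, 'rb') as f:
        return pickle.load(f)

def dump_to_pkl(obj, path):
    """Serialize obj to path with pickle."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)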
Example #2
    def __init__(self,
                 trn_file,
                 wav_file,
                 mfcc_file,
                 args,
                 vocab_create_mode='BUILD',
                 mfcc_create='Y'):
        '''
        Args:
            trn_file: transcript (label) file path(s)
            wav_file: list of directories containing wav files
            mfcc_file: list of directories where MFCC features are saved
            args: configuration/arguments object
            vocab_create_mode:
                BUILD: create the vocab dict from the raw label data
                LOAD:  load the vocab dict from vocab.pkl
            mfcc_create: 'Y' to extract and save MFCC features for every wav file
        '''
        self.args = args

        #trn file path
        self.trn_file = trn_file
        #wav file path
        self.wav_file = wav_file
        #mfcc file path
        self.mfcc_file = mfcc_file

        # data file path
        #self.data_file = data_file
        # <EOS>: end-of-sentence tag
        # <SOS>: start-of-sentence tag
        # <PAD>: padding tag
        self.special_signs = ['<EOS>', '<SOS>', '<PAD>', '<BIAS>']
        # label to index dict
        self.vocab = {}
        # index to label dict
        self.inverse_vocab = {}

        if vocab_create_mode == 'BUILD':
            self.label_process()
        elif vocab_create_mode == 'LOAD':
            self.vocab = utils.load_from_pkl('vocab.pkl')
            self.inverse_vocab = utils.invert_dict(self.vocab)

        if mfcc_create == 'Y':
            for i in range(len(self.wav_file)):
                wavlist = os.listdir(self.wav_file[i])
                for j in range(len(wavlist)):
                    wav_path = os.path.join(self.wav_file[i], wavlist[j])
                    # convert the audio into MFCC features
                    mfcc = self.read_wav_file(wav_path, 26, 9)
                    mfcc = np.transpose(mfcc)
                    np.save(os.path.join(self.mfcc_file[i],
                                         os.path.splitext(wavlist[j])[0]), mfcc)
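The read_wav_file helper is not part of this excerpt. A hypothetical sketch, assuming scipy for wav I/O and python_speech_features for the MFCC computation; interpreting the (26, 9) arguments as 26 cepstral coefficients plus a 9-frame context window is an assumption, and the context stacking itself is omitted here:

import scipy.io.wavfile as wav
from python_speech_features import mfcc

def read_wav_file(wav_path, numcep=26, context=9):
    """Hypothetical helper: read a wav file and return its MFCC features.

    `context` is assumed to describe a frame-stacking window and is not
    applied in this sketch.
    """
    rate, signal = wav.read(wav_path)
    # (num_frames, numcep) feature matrix; the caller transposes it before saving
    return mfcc(signal, samplerate=rate, numcep=numcep, nfilt=numcep)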
Example #3
    def __init__(self,
                 input_wvectors,
                 input_word2id,
                 input_id2word,
                 input_vocabulary,
                 pair_file_path,
                 kn_file_name,
                 output_file_name,
                 topn = 20):
        # word -> id mapping
        word2id = dict()
        with codecs.open(input_word2id, 'r', encoding='utf-8') as f:
            for lines in f:
                word2id[lines.strip().split()[0]] = int(lines.strip().split()[1])
        # id -> word mapping
        id2word = dict()
        with codecs.open(input_id2word, 'r', encoding='utf-8') as f:
            for lines in f:
                id2word[int(lines.strip().split()[0])] = lines.strip().split()[1]
        # vocabulary of word ids
        vocabulary = []
        with codecs.open(input_vocabulary, 'r', encoding='utf-8') as f:
            for lines in f:
                vocabulary.append(int(lines.strip()))

        self.topn = topn
        kneighbor = KNeighbor(input_wvectors, vocabulary, word2id, id2word)
        dump_to_pkl(kneighbor, kn_file_name)

        logging_set('NSselect.log')
        files = os.listdir(pair_file_path)
        pairs = dict()
        for file in tqdm(files):
            # build the full path so the directory check inspects the right entry
            path = os.path.join(pair_file_path, file)
            if not os.path.isdir(path):
                pair = load_from_pkl(path)
                logging.info("pair size: %d" % (len(pair)))
                if len(pairs) == 0:
                    pairs = pair
                else:
                    for key in pair.keys():
                        if key in pairs:
                            pairs[key] += pair[key]
                        else:
                            pairs[key] = pair[key]
                logging.info("current total pair size: %d" % (len(pairs)))
        logging.info("start calculate score")
        score = self.select_new(pairs, kneighbor, self.topn)
        #score1 = self.select(pairs, kneighbor)
        logging.info("start saving")
        dump_to_pkl(score, output_file_name)
                    pd.read_feather(f'{FEATURE_DIR}/{feat}_train.feather')
                ],
                                    axis=1)
                X_test = pd.concat([
                    X_test,
                    pd.read_feather(f'{FEATURE_DIR}/{feat}_test.feather')
                ],
                                   axis=1)
                cat_features_index.append(fidx)
                fidx += 1
            X_train = reduce_mem_usage(X_train)
            X_test = reduce_mem_usage(X_test)
            save_as_pkl(X_train, f'X_train_{EXP_NAME}_.pkl')
            save_as_pkl(X_test, f'X_test_{EXP_NAME}_.pkl')
    elif skip_fr is True:
        X_train = load_from_pkl(f'X_train_task1_lgbm_fs100.pkl')
        X_test = load_from_pkl(f'X_test_task1_lgbm_fs100.pkl')
        cat_features_index = []

    mlflow.set_experiment(EXP_NAME)
    mlflow.start_run()
    run_id = mlflow.active_run().info.run_id

    with t.timer(f'load folds: {FOLD_NAME}-{FOLD_NUM}'):
        folds = pd.read_csv(
            f'{FOLD_DIR}/train_folds_{FOLD_NAME}{FOLD_NUM}_RS{RANDOM_STATE}.csv'
        )

    with t.timer(f'train XGB'):

        logging.info(f'Num. of Samples: {len(X_train)}')
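This snippet and Example #10 both call reduce_mem_usage, which is not defined in these excerpts. It is a common Kaggle-style utility that downcasts numeric columns; a minimal sketch, assuming pandas and numpy:

import numpy as np
import pandas as pd

def reduce_mem_usage(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast numeric columns to the smallest dtype that holds their values."""
    for col in df.columns:
        col_type = df[col].dtype
        if np.issubdtype(col_type, np.integer):
            df[col] = pd.to_numeric(df[col], downcast='integer')
        elif np.issubdtype(col_type, np.floating):
            df[col] = pd.to_numeric(df[col], downcast='float')
    return df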
Example #5
                for i in [i[0]
                          for i in train_feat] + [i[0] for i in test_feat]:
                    tmp.extend(i)
                unique_num = len(set(tmp)) + 1
                unique_num_dic[feat] = unique_num

            print('Unique Num', unique_num_dic)
            print('Feature index', feature_index)

            save_as_pkl(X_train, f'X_train_{EXP_NAME}.pkl')
            save_as_pkl(X_test, f'X_test_{EXP_NAME}.pkl')
            save_as_pkl(unique_num_dic, f'unique_num_dic_{EXP_NAME}.pkl')
            save_as_pkl(feature_index, f'feature_index_{EXP_NAME}.pkl')

    elif skip_fr is True:
        X_train = load_from_pkl(f'X_train_task1_mlp_fs100.pkl')
        X_test = load_from_pkl(f'X_test_task1_mlp_fs100.pkl')
        unique_num_dic = load_from_pkl(f'unique_num_dic_task1_mlp_fs100.pkl')
        feature_index = load_from_pkl(f'feature_index_task1_mlp_fs100.pkl')

    X_train = X_train.fillna(0.0)
    X_test = X_test.fillna(0.0)

    with t.timer(f'load folds: {FOLD_NAME}-{FOLD_NUM}'):
        folds = pd.read_csv(
            f'{FOLD_DIR}/train_folds_{FOLD_NAME}{FOLD_NUM}_RS{RANDOM_STATE}.csv'
        )

    mlflow.set_experiment(EXP_NAME)
    mlflow.start_run()
    run_id = mlflow.active_run().info.run_id
    def __init__(self,
                 input_wvectors,
                 input_word2id,
                 input_id2word,
                 input_vocabulary,
                 pair_file_path,
                 kn_file_name,
                 output_file_name,
                 topn=20):
        word2id = dict()
        with codecs.open(input_word2id, 'r', encoding='utf-8') as f:
            for lines in f:
                word2id[lines.strip().split()[0]] = int(
                    lines.strip().split()[1])
        id2word = dict()
        with codecs.open(input_id2word, 'r', encoding='utf-8') as f:
            for lines in f:
                id2word[int(
                    lines.strip().split()[0])] = lines.strip().split()[1]
        vocabulary = []
        with codecs.open(input_vocabulary, 'r', encoding='utf-8') as f:
            for lines in f:
                vocabulary.append(int(lines.strip()))

        self.topn = topn
        logging.info("get kneighbors...")
        #kneighbor = load_from_pkl(kn_file_name)
        kneighbor = KNeighbor(input_wvectors, vocabulary, word2id, id2word)
        dump_to_pkl(kneighbor, kn_file_name)
        logging.info("kneightbors got.")

        logging.info("get pairs...")
        files = os.listdir(pair_file_path)
        pairs = dict()
        for file in tqdm(files):
            if file == '.DS_Store':
                continue
            path = os.path.join(pair_file_path, file)
            if not os.path.isdir(path):
                pair = load_from_pkl(path)
                logging.info("pair size: %d" % (len(pair)))
                if len(pairs) == 0:
                    pairs = pair
                else:
                    for key in pair.keys():
                        if key in pairs:
                            pairs[key] += pair[key]
                        else:
                            pairs[key] = pair[key]
                logging.info("current total pair size: %d" % (len(pairs)))
        logging.info("pairs got")

        resplit_pairs(pairs, './data/pairs_large_resplit', 1000)

        logging.info("len(word2id): %d" % len(word2id))
        keys_before_sort_set = set([key[0] for key in pairs.keys()])
        logging.info("length of pair.keys[0]: %d" % len(keys_before_sort_set))
        id_missing_in_pairs = set(word2id.values()) - keys_before_sort_set
        logging.info("len(id_missing_in_pairs): %d" %
                     (len(id_missing_in_pairs)))
        if len(id_missing_in_pairs) > 0:
            logging.info("missing word in pairs: %s" %
                         str(id_missing_in_pairs))
            #dump_to_pkl(id_missing_in_pairs, './data/id_missing_in_pairs.pkl')

        logging.info("start calculate score")

        score = self.select_new(pairs, kneighbor, self.topn)
        logging.info("len(score): %d" % len(score))

        #score1 = self.select(pairs, kneighbor)
        logging.info("start saving")
        dump_to_pkl(score, output_file_name)
import os
from utils import load_from_pkl, dump_to_pkl

path = 'data/pair'
files = os.listdir(path)
pairs = dict()
for file in files:
    pair_file_path = os.path.join(path, file)
    if not os.path.isdir(pair_file_path):
        pair = load_from_pkl(pair_file_path)
        if len(pairs) == 0:
            pairs = pair
        else:
            for key in pair.keys():
                if key in pairs:
                    pairs[key] += pair[key]
                else:
                    pairs[key] = pair[key]

output_file_name = 'data/pairs'
dump_to_pkl(pairs, output_file_name)
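The merge loop above (and the identical loops in the NSselect examples) can be written more compactly with collections.Counter, assuming the pair values are numeric counts:

import os
from collections import Counter
from utils import load_from_pkl, dump_to_pkl

path = 'data/pair'
merged = Counter()
for file in os.listdir(path):
    full_path = os.path.join(path, file)
    if os.path.isfile(full_path):                # skips sub-directories by full path
        merged.update(load_from_pkl(full_path))  # adds counts key by key

dump_to_pkl(dict(merged), 'data/pairs')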
Example #8
    def __init__(
        self,
        input_file_name,
        input_wvectors,
        input_cvectors,
        input_ps,
        input_ns,
        output_file_name,
        emb_dimension=100,
        batch_size=50,
        window_size=5,
        kn=20,
        iteration=1,
        initial_lr=0.001,
        clip=1.0,
        min_count=30,
        batch_num_to_valid=100000,
    ):
        """Initilize class parameters.

        Args:
            input_file_name: Name of a text data from file. Each line is a sentence splited with space.
            input_vectors: Pretrained vector
            input_psns: Pretrained positive sample & negative sample
            output_file_name: Name of the final embedding file.
            emb_dimention: Embedding dimention, typically from 50 to 500.
            batch_size: The count of word pairs for one forward.
            window_size: Max skip length between words.
            kn: k neighbors.
            iteration: Control the multiple training iterations.
            initial_lr: Initial learning rate.
            min_count: The minimal word frequency, words with lower frequency will be filtered.

        Returns:
            None.
        """
        self.data = InputData(input_file_name, min_count)
        self.pre_wvectors = InputVector(input_wvectors)
        self.pre_cvectors = InputVector(input_cvectors)
        self.ps_w = load_from_pkl(input_ps)
        self.ns_w = load_from_pkl(input_ns)
        self.ps = convert_word_to_id(self.ps_w, self.data.word2id)
        self.ns = convert_word_to_id(self.ns_w, self.data.word2id)
        self.output_file_name = output_file_name
        self.emb_size = len(self.data.word2id)
        self.emb_dimension = emb_dimension
        self.batch_size = batch_size
        self.window_size = window_size
        self.kn = kn
        self.iteration = iteration
        self.initial_lr = initial_lr
        self.clip = clip
        self.skip_gram_model = SkipGramModel(self.emb_size, self.emb_dimension,
                                             self.pre_wvectors,
                                             self.pre_cvectors)
        self.use_cuda = torch.cuda.is_available()
        if self.use_cuda:
            self.skip_gram_model.cuda()
        self.optimizer = optim.SGD(self.skip_gram_model.parameters(),
                                   lr=self.initial_lr)
        self.batch_num_to_valid = batch_num_to_valid
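A hypothetical usage sketch; the class name, all file paths, and the train() entry point are assumptions, since the excerpt above only shows __init__:

trainer = Word2Vec(                      # hypothetical class name
    input_file_name='data/corpus.txt',   # placeholder paths
    input_wvectors='data/pretrained_w.vec',
    input_cvectors='data/pretrained_c.vec',
    input_ps='data/ps.pkl',
    input_ns='data/ns.pkl',
    output_file_name='data/refined_embeddings.txt',
    emb_dimension=100,
    batch_size=50,
    iteration=1,
)
trainer.train()  # assumed entry point, not shown in the excerpt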
Example #9
def output_plots(
    df_list_path: str,
    var_to_viz: str = "temp_f",
    save_path: str = "./data/images/",
    figsize: Tuple[float, float] = (5., 7.5),
    color_range: Tuple[int, int] = (0, 100),
    cmap: str = "RdYlGn_r",
) -> None:
    """Makes plots for each timestep in input data
    df_list_path: Path to saved data from pull_data.py
    var_to_viz: Which variable to visualize
    save_path: Directory to save plots
    figsize: Size of output plots
    color_range: Min and Max value for making the colorbar
    cmap: Color mapping for plots and colorbar. "RdYlGn_r" and "plasma" work best
    """
    save_path += f"{var_to_viz}/"
    df_list = load_from_pkl(df_list_path)
    # print(df_list)
    print(f"Saving images for {len(df_list)} dfs")
    for now in df_list:
        df = df_list[now]
        if df.empty:
            print(f'No data for {now}')
            continue
        fl_name = f"{now}_map.png"

        # skip this timestep if an image for it already exists under save_path
        cont = 1
        for (_, _, fls) in walk(save_path):
            if fl_name in fls:
                cont = 0
        if not cont:
            continue

        # Create the figure and the axes
        fig = plt.figure(figsize=figsize)
        ax = fig.add_subplot(1, 1, 1, projection=ccrs.PlateCarree())

        # lat and lon coords to plot
        # extent = [-128, -65, 23, 50.5]  # whole USA
        extent = [-123, -121.75, 37, 38.75]  # Bay Area
        # extent = [-122.65, -122.15, 37.6, 38.]  # SF city Area

        df = df[df["lon"].between(extent[0], extent[1])]
        df = df[df["lat"].between(extent[2], extent[3])]
        lat = df["lat"].values
        lon = df["lon"].values

        # Variable from which to generate the color gradient
        if var_to_viz in ["pm_2.5", "PM2.5_CF_ATM_ug/m3"]:
            colors = df[var_to_viz].fillna(-1).apply(aqi_from_pm).values

        elif var_to_viz == "temp_f":
            colors = df[var_to_viz].values - 8

        else:
            colors = df[var_to_viz]

        # Display some map info
        ax.set_extent(extent)
        land10m = cfeature.NaturalEarthFeature(
            "physical",
            "land",
            "10m",
            edgecolor="black",
            facecolor="lightgray",
            linewidth=0.5,
        )
        ax.add_feature(land10m)

        # Add scatter points for each coordinate pair
        c_min, c_max = color_range[0], color_range[1]
        scatter = ax.scatter(
            lon,
            lat,
            marker="o",
            c=colors,
            cmap=cmap,
            zorder=5,
            s=5,
            vmin=c_min,
            vmax=c_max,
        )

        # Add a labelled colorbar for the scatter points
        plt.colorbar(scatter, fraction=0.06542, pad=0).set_label(
            f"{title_dict[var_to_viz]}", rotation=90
        )
        ax.set_facecolor("lightblue")
        plt.title(f"{title_dict[var_to_viz]} at {now}")

        if not os.path.exists(save_path):
            os.makedirs(save_path)
        plt.savefig(save_path + fl_name, dpi=300, bbox_inches="tight")
        plt.close()
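The aqi_from_pm helper used for the PM2.5 branch is not defined in this excerpt. A sketch of how it might look, using the standard EPA piecewise-linear breakpoint formula (the exact breakpoints used by the original helper are an assumption):

def aqi_from_pm(pm):
    """Hypothetical sketch: convert a PM2.5 concentration (ug/m3) to an AQI value.

    Negative inputs (the caller maps NaN to -1) are passed through unchanged.
    """
    if pm < 0:
        return pm
    pm = round(pm, 1)
    # (C_low, C_high, AQI_low, AQI_high) 24-hour PM2.5 breakpoints
    breakpoints = [
        (0.0, 12.0, 0, 50),
        (12.1, 35.4, 51, 100),
        (35.5, 55.4, 101, 150),
        (55.5, 150.4, 151, 200),
        (150.5, 250.4, 201, 300),
        (250.5, 350.4, 301, 400),
        (350.5, 500.4, 401, 500),
    ]
    for c_low, c_high, aqi_low, aqi_high in breakpoints:
        if c_low <= pm <= c_high:
            # linear interpolation within the matching breakpoint band
            return (aqi_high - aqi_low) / (c_high - c_low) * (pm - c_low) + aqi_low
    return 500  # cap readings above the highest breakpoint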
Example #10
                X_train = pd.concat([
                    X_train,
                    pd.read_feather(f'{FEATURE_DIR}/{feat}_train.feather')
                ],
                                    axis=1)
                X_test = pd.concat([
                    X_test,
                    pd.read_feather(f'{FEATURE_DIR}/{feat}_test.feather')
                ],
                                   axis=1)
            X_train = reduce_mem_usage(X_train)
            X_test = reduce_mem_usage(X_test)
            save_as_pkl(X_train, f'X_train_{EXP_NAME}_.pkl')
            save_as_pkl(X_test, f'X_test_{EXP_NAME}_.pkl')
    elif skip_fr is True:
        X_train = load_from_pkl(f'X_train_task1_xgb_fs100_meta_.pkl')
        X_test = load_from_pkl(f'X_test_task1_xgb_fs100_meta_.pkl')
        cat_features_index = []

    mlflow.set_experiment(EXP_NAME)
    mlflow.start_run()
    run_id = mlflow.active_run().info.run_id

    with t.timer(f'load folds: {FOLD_NAME}-{FOLD_NUM}'):
        folds = pd.read_csv(
            f'{FOLD_DIR}/train_folds_{FOLD_NAME}{FOLD_NUM}_RS{RANDOM_STATE}.csv'
        )

    with t.timer(f'train XGB'):

        logging.info(f'Num. of Samples: {len(X_train)}')
                for i in [i[0]
                          for i in train_feat] + [i[0] for i in test_feat]:
                    tmp.extend(i)
                unique_num = len(set(tmp)) + 1
                unique_num_dic[feat] = unique_num

            print('Unique Num', unique_num_dic)
            print('Feature index', feature_index)

            save_as_pkl(X_train, f'X_train_{EXP_NAME}.pkl')
            save_as_pkl(X_test, f'X_test_{EXP_NAME}.pkl')
            save_as_pkl(unique_num_dic, f'unique_num_dic_{EXP_NAME}.pkl')
            save_as_pkl(feature_index, f'feature_index_{EXP_NAME}.pkl')

    elif skip_fr is True:
        X_train = load_from_pkl(f'X_train_{EXP_NAME}.pkl')
        X_test = load_from_pkl(f'X_test_{EXP_NAME}.pkl')
        unique_num_dic = load_from_pkl(f'unique_num_dic_{EXP_NAME}.pkl')
        feature_index = load_from_pkl(f'feature_index_{EXP_NAME}.pkl')

    X_train = X_train.fillna(0.0)
    X_test = X_test.fillna(0.0)

    with t.timer(f'load folds: {FOLD_NAME}-{FOLD_NUM}'):
        folds = pd.read_csv(
            f'{FOLD_DIR}/train_folds_{FOLD_NAME}{FOLD_NUM}_RS{RANDOM_STATE}.csv'
        )

    mlflow.set_experiment(EXP_NAME)
    mlflow.start_run()
    run_id = mlflow.active_run().info.run_id
Example #12
   If all procedures fall within the scope of the same pregnancy, treat them as a single row and find matches from the control group once.

Data Validation:
1. Rows that appear in the experiment group should be removed from the control group.
2. Experiment rows without parity should be removed.
   (A hypothetical sketch of these two steps follows the file-loading code below.)

Other:
* Rows colored purple are manually created couplings; they can be used to test the final flow.
  The comparison should be a 'contains' match rather than an exact match.
  The 'experiment' sheet in experiment.xlsx contains a list of the selected rows.
"""

# load files
pkl_pth = 'temp_12_2018.pkl'
if path.exists(pkl_pth):
    data = load_from_pkl(pkl_pth)
    control = data['control']
    experiment = data['experiment']
else:
    control = pd.read_excel('control_2.xlsx').dropna(how='all')
    # parse_dates=['maternal_birth_date', 'neonatal_birth_date']).dropna(how='all')
    # experiment = pd.read_excel('experiment_2.xlsx', sheet_name='experiment', parse_dates=['BirthDate']).dropna(how='all')
    experiment = pd.read_excel('experiment_2.xlsx').dropna(how='all')
    save_to_pkl(pkl_pth, control=control, experiment=experiment)
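A hypothetical sketch of the two validation steps listed in the notes above; the column names ('new_index', 'parity') and the exact-match comparison are assumptions, and per the notes the real comparison should be a 'contains' match rather than an exact one:

# hypothetical column names -- adjust to the real schema
# 1. remove rows that also appear in the experiment group from control
control = control[~control['new_index'].isin(experiment['new_index'])]
# 2. drop experiment rows without parity
experiment = experiment.dropna(subset=['parity'])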


def create_new_index(df):
    """
    Removes rows without parity and creates a full name_id_parity index as the 'new_index' column.
    :param df:
    :return: