Example no. 1
def save_feather():
    train = utils.load_csv()
    for c in tqdm(train.columns):
        train[[c]].to_feather(f'features/train/{c}.ftr')
    test = utils.load_csv(test=True)
    for c in tqdm(test.columns):
        test[[c]].to_feather(f'features/test/{c}.ftr')
Example no. 2
def main(clf):
    x_train, y_train = utils.load_csv(POSTURES_TRAINING_DATA,
                                      feature_idx=range(2, 39),
                                      label_idx=1)
    x_test, y_test = utils.load_csv(POSTURES_TEST_DATA,
                                    feature_idx=range(2, 39),
                                    label_idx=1)
    model = models.PosturesEstimator(clf,
                                     aggregation=True,
                                     knn_n_neighbors=9,
                                     svm_c=10 ** 0.6,
                                     svm_gamma=10 ** -2.0,
                                     nn_validation=True)

    assert len(x_train) == len(y_train) and len(x_test) == len(
        y_test), 'Data sizes do not match'

    model.fit(x_train, y_train)
    acc_train = model.evaluate(x_train, y_train, cm='training.png')
    LOGGER.info('Training Accuracy: %.6f' % acc_train)

    acc_cv = model.cross_validate(x_train, y_train)
    if acc_cv is not None:
        LOGGER.info('Cross Validation Accuracy: %.6f ± %.6f' %
                    (np.mean(acc_cv), np.std(acc_cv)))

    acc_test = model.evaluate(x_test, y_test, cm='test.png')
    LOGGER.info('Test Accuracy: %.6f' % acc_test)
Example no. 3
 def __init__(self, config=cfg, cache=True):
     if not cache or not os.path.isfile(config.data_cache):
         self.train, self.val = self.train_val_split(
             utils.load_csv(config.train_csv), 0.9)
         self.test = utils.load_csv(config.test_csv, shuffle=False)
         utils.save_cache([self.train, self.val, self.test], config.data_cache)
     else:
         self.train, self.val, self.test = utils.load_cache(config.data_cache)
Example no. 4
def main():

    exp_name = f'baseline_{now()}'
    device, log, result_dir = setup(exp_name, conf)

    train_df = load_csv(conf.train_csv)
    if conf.npy:
        train_images = np.load(conf.train_images)
    else:
        train_images = pd.read_parquet(conf.train_images)

    test_df = load_csv(conf.test_csv)
    if conf.npy:
        test_images = np.load(conf.test_images)
    else:
        test_images = pd.read_parquet(conf.test_images)

    log.info('done')
    for i in range(5):
        if i != conf.fold:
            continue

        if "resnet" in conf.arch or "resnext" in conf.arch:
            model_ft = ResNet(conf,
                              arch_name=conf.arch,
                              input_size=conf.image_size)
            model_ft.load_state_dict(
                torch.load("result/baseline_2020_03_21_13_01_08/model_0.pkl"))
        elif "densenet" in conf.arch:
            model_ft = DenseNet(conf,
                                arch_name=conf.arch,
                                input_size=conf.image_size)
        elif "efficientnet" in conf.arch:
            model_ft = EfficientNet(conf, arch_name=conf.arch)

        criterion = [
            nn.CrossEntropyLoss(reduction="none"),
            nn.CrossEntropyLoss(reduction="none"),
            nn.CrossEntropyLoss(reduction="none")
        ]
        criterion = [c.to(device) for c in criterion]

        model_ft, val_preds = train_model(train_df,
                                          train_images,
                                          test_df,
                                          test_images,
                                          model_ft,
                                          criterion,
                                          log,
                                          device,
                                          result_dir,
                                          fold=i,
                                          num_epoch=conf.num_epoch)

        torch.save(model_ft.state_dict(), result_dir / f'model_{i}.pkl')
        np.save(result_dir / f'val_preds_{i}.npy', val_preds)
Example no. 5
 def load_subset(self, dataset, name):
     filename = os.path.join("..", "datasets", dataset, name,
                             "features.csv")
     data = np.asarray(utils.load_csv(filename, skiprows=1))
     filename = os.path.join("..", "datasets", dataset, name,
                             "y_{}.txt".format(name))
     activities = np.asarray(utils.load_csv(filename)).ravel()
     filename = os.path.join("..", "datasets", dataset, name,
                             "subject_{}.txt".format(name))
     subjects = np.asarray(utils.load_csv(filename)).ravel()
     return data, activities, subjects
Example no. 6
def train(train_file, test_file=None):
    data = utils.load_csv(train_file)
    feature_set = [(utils.feature_extract(i[0]), i[1]) for i in data]
    print 'Training'
    classifier = nltk.NaiveBayesClassifier.train(feature_set)
    utils.save_model(classifier)
    print 'Done Training'
    if test_file:
        data = utils.load_csv(test_file)
        test_feature_set = [(utils.feature_extract(i[0]), i[1]) for i in data]
        print 'Accuracy of model is {0}'.format(
            nltk.classify.accuracy(classifier, test_feature_set))
Example no. 7
 def load_artist_data(self, artist_name):
     artist_information = self._musicbrainz_searcher.get_musicbrainz_artist_info(
         artist_name)
     print('Load artist data:', artist_information)
     events_df = utils.load_csv(artist_information.name, 'events')
     setlists_df = utils.load_csv(artist_information.name, 'setlists')
     recordings_df = utils.load_csv(artist_information.name, 'recordings')
     if events_df is not None:
         events_df['eventdate'] = pd.to_datetime(events_df['eventdate'],
                                                 format='%Y-%m-%d')
     if recordings_df is not None:
         recordings_df['date'] = pd.to_datetime(recordings_df['date'],
                                                format='%Y-%m-%d')
     return ArtistData(artist_information, events_df, setlists_df,
                       recordings_df)
Example no. 8
def get_test_error(threshold):
    TP = 0
    TN = 0
    FP = 0
    FN = 0

    for i in range(test_data_size):
        data = utils.load_csv(test_set_dir + str(i) + ".csv")
        label = utils.get_label_from_model_with_threshold(
            data, model, threshold)

        if label == 1:
            if data[19][0] == 1:
                TP += 1
            elif data[19][0] == -1:
                FP += 1
            else:
                print('1 error')
        elif label == -1:
            if data[19][0] == 1:
                FN += 1
            elif data[19][0] == -1:
                TN += 1
            else:
                print('-1 error')
        else:
            print(label)

    TPR = TP / (TP + FN)
    FPR = FP / (FP + TN)

    return TPR, FPR
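
Since get_test_error returns a single (TPR, FPR) pair for one threshold, a caller would normally sweep a grid of thresholds to trace out a ROC curve. A minimal sketch, assuming get_test_error and the globals it uses (model, test_set_dir, test_data_size) are defined as above; the threshold grid itself is illustrative, not taken from the original code:

# Hedged usage sketch: sweep thresholds and collect ROC points.
thresholds = [i / 10 for i in range(11)]  # assumed grid
roc_points = [get_test_error(t) for t in thresholds]
for t, (tpr, fpr) in zip(thresholds, roc_points):
    print('threshold=%.1f  TPR=%.3f  FPR=%.3f' % (t, tpr, fpr))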
Example no. 9
def get_cleaned_tweets(query_dict):
    """
    Get cleaned tweets
    :param query_dict:
        query_string: 'datacamp lang:en'
        time_since: '2019-03-01'
        time_until: '2019-05-01'
        max_tweets: 0 for unlimited
    :return: dataframe
    """

    file_name = _convert_query_dict_to_str_as_filename(query_dict)
    save_cleaned_file_name = paths.cleaned_tweets / 'cleaned_{}.csv'.format(
        file_name)

    if save_cleaned_file_name.is_file():
        print('Cleaned file {} already exists, reload'.format(
            save_cleaned_file_name))
        tweet_df = load_csv(save_cleaned_file_name)
    else:
        tweet_df = get_raw_tweets(query_dict)

        print('Cleaning tweets')
        cleaned_tweet_df = _clean_tweets_text(tweet_df)

        # print(f'Select only {USE_TWEETS_COLS} and save tweets to: {save_cleaned_file_name!r}')
        cleaned_tweet_df[USE_TWEETS_COLS].to_csv(save_cleaned_file_name,
                                                 index=False)
        tweet_df = cleaned_tweet_df[USE_TWEETS_COLS]

    print('Done getting tweets.')
    return tweet_df
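
For reference, a hedged usage sketch of get_cleaned_tweets (which also exercises get_raw_tweets below when no cleaned file is cached). The query values come straight from the docstring above; the key names are assumed to match what _convert_query_dict_to_str_as_filename and _create_search_criteria expect:

# Hypothetical call; values are the docstring's samples, key names are assumed.
query_dict = {
    'query_string': 'datacamp lang:en',
    'time_since': '2019-03-01',
    'time_until': '2019-05-01',
    'max_tweets': 0,  # 0 means unlimited, per the docstring
}
tweet_df = get_cleaned_tweets(query_dict)
print(tweet_df.head())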
Example no. 10
def get_raw_tweets(query_dict):
    """
    Get raw tweets
    :param query_dict:
        query_string: 'datacamp lang:en'
        time_since: '2019-03-01'
        time_until: '2019-05-01'
        max_tweets: 0 for unlimited
    :return: dataframe
    """
    file_name = _convert_query_dict_to_str_as_filename(query_dict)
    save_raw_file_name = paths.raw_tweets / 'raw_{}.csv'.format(file_name)

    if save_raw_file_name.is_file():
        print('Raw file {} already exists, reload'.format(
            repr(save_raw_file_name)))
        tweet_df = load_csv(save_raw_file_name)
    else:
        _validate_query(query_dict)

        print(f'Getting raw tweets with query:\n{query_dict!r}')
        tweet_criteria = _create_search_criteria(**query_dict)
        tweet_objects = _get_tweet_object(tweet_criteria)
        tweet_df = _convert_tweets_to_dataframe(tweet_objects)

        print(f'Saving raw tweets to: {repr(save_raw_file_name)}')
        tweet_df.to_csv(save_raw_file_name, index=False)

    print('Done getting raw tweets.')
    return tweet_df
Example no. 11
	def __init__(self, generator, discriminator, data_set_file, y_dim):
		self.generator = generator
		self.discriminator = discriminator
		self.data_set_file = data_set_file
		self.y_dim = y_dim # useless     condition

		indexs, latitude, longitude = ut.load_csv(self.data_set_file, 2)

		self.borders = range(B, len(latitude), B)

		self.Generator_input = 100

		self.Generator_output = len(self.borders)

		self.G_in = tf.placeholder(tf.float32, [None, self.Generator_input])
		self.real_partition = tf.placeholder(tf.float32, [None, self.Generator_output], name='real_in')
		self.condition = tf.placeholder(tf.float32, shape=[None, self.Generator_output])

		self.G_out = self.generator(concat(self.G_in, self.condition), self.Generator_output)

		self.D_real, _ = self.discriminator(self.real_partition, self.Generator_output)

		self.D_fake, self.Q_fake = self.discriminator(self.G_out, self.Generator_output, reuse = True)

		self.D_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.D_real, labels=tf.ones_like(self.D_real)))

		self.G_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.D_fake, labels=tf.ones_like(self.D_fake)))

		self.Q_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.Q_fake, labels=self.real_partition))

		self.train_D = tf.train.AdamOptimizer(LR_D).minimize(self.D_loss,var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.discriminator.name))

		self.train_G = tf.train.AdamOptimizer(LR_G).minimize(self.G_loss,var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.generator.name))

		self.train_Q = tf.train.AdamOptimizer(LR_G).minimize(self.G_loss,var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.generator.name) + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.discriminator.name))
Example no. 12
    def __init__(self):

        loaded_csv = utils.load_csv('../../Data/Train/NHID2013.csv')
        # print("loaded_csv",loaded_csv)

        # print("loaded_csv",loaded_csv.shape)
        # (3081208, 6)
        min_max_scaler = preprocessing.MinMaxScaler()
        loaded_csv_minmax = pd.DataFrame(
            min_max_scaler.fit_transform(loaded_csv.iloc[:, :4]))
        # print(loaded_csv_minmax)
        # print("loaded_csv_minmax",loaded_csv_minmax.shape)
        # loaded_csv_minmax (234426, 6)

        loaded_csv_minmax = pd.concat(
            [loaded_csv_minmax, loaded_csv.iloc[:, 4:]], axis=1)
        # print("loaded_csv_minmax",loaded_csv_minmax)

        self.train_X = np.array(loaded_csv_minmax.iloc[:3000000, :4])
        self.train_y = np.array(loaded_csv_minmax.iloc[:3000000, 4:])
        # self.train_X=np.array(loaded_csv_minmax.iloc[:3000,:4])
        # self.train_y=np.array(loaded_csv_minmax.iloc[:3000,4:])
        # print("train_X",train_X.shape)
        # print("train_y",train_y.shape)

        self.number_of_data = self.train_X.shape[0]
Example no. 13
    def run(self):
        global total_removed
        if self.path is None:
            return

        print("%s starts..." % self.name)

        content = load_csv(self.path, select_list=[3,4,5,6])
        # str2float
        content = list(map(map_str2float, content))
        # gcj2wgs
        content = list(map(map_list, content))
        n = remove_baddata(content)
        lock.acquire()
        total_removed += n
        lock.release()
        content = np.array(content)
        content[:,[0,2]] -= 103.5
        content[:,[0,2]] /= 0.1
        content[:,[1,3]] -= 30.3
        content[:,[1,3]] /= 0.05
        content = list(map(map_float2int, content))
        tem_dis = np.zeros(self.distribution.shape)
        for row in content:
            tem_dis[row[0], row[1]] += 1
            tem_dis[row[2], row[3]] += 1
        # update
        lock.acquire()
        self.distribution += tem_dis
        lock.release()

        print("%s finished! There are %d removed." %(self.name, n))
Example no. 14
def get_angles_data(input_folder,
                    output_folder,
                    files_keep,
                    type_data="angles",
                    align=True):
    files_keep_clean = [file_name.split(".")[0] for file_name in files_keep]

    files_angles = get_files(join_path(input_folder, type_data))
    if align:
        files_events = get_files(join_path(input_folder, "events"))

    os.makedirs(join_path(output_folder, type_data), exist_ok=True)
    for file_ in files_angles:

        if file_.split(".")[0] in files_keep_clean:
            data = np.load(join_path(input_folder, type_data, file_),
                           allow_pickle=True)
            if len(data.shape) == 3:
                if np.count_nonzero(np.isnan(data)):
                    continue
            else:
                continue
            np.save(join_path(output_folder, type_data, file_), data)
            if align:
                events = load_csv(
                    join_path(input_folder, "events",
                              "{}.csv".format(file_.split(".")[0])),
                    dtype=str,
                )
                align_and_save_data(data,
                                    events,
                                    output_folder,
                                    file_,
                                    type_data=type_data)
Example no. 15
def get_feats_from_csv_in_partitions():
    """
    Extract the original features that are distributed with the dataset. Features
    are split according to the config.yaml file.
    """
    conf = utils.get_config()
    rows = [
        row for row in utils.load_csv()
        if utils.check_filter(row, conf['filters'])
    ]
    train_rows, valid_rows, test_rows = utils.split_dataset(
        rows,
        conf['valid_percent'],
        conf['test_percent'],
        rng=conf['rng_seed'])
    X_train, y_train, X_valid, y_valid, X_test, y_test = [], [], [], [], [], []
    prefixes = ['t_', 'i_', 's_']  # Feature names' prefixes
    datasets = [(X_train, y_train, train_rows), (X_test, y_test, test_rows),
                (X_valid, y_valid, valid_rows)]
    out = []
    for X, y, rows in datasets:
        for row in rows:
            X.append([
                float(v) for k, v in row.iteritems()
                if len(filter(k.startswith, prefixes)) > 0
            ])
            y.append(int(row['classification'] == 'Malign'))
        out.extend((np.asarray(X), np.asarray(y)))
    return out
Example no. 16
def get_feats_in_partitions():
    """
    Extracts features from the whole dataset and splits them into train,
    validation and test sets.
    """
    conf = utils.get_config()
    paths = utils.get_paths()
    rows = utils.load_csv()
    filters = conf['filters']
    region_size = conf['region_size']
    region_stride = conf['region_stride']

    filtered_rows = [
        row for row in rows if utils.check_filter(row, conf['filters'])]
    train_rows, valid_rows, test_rows = utils.split_dataset(
        filtered_rows, conf['valid_percent'], conf['test_percent'], rng=conf['rng_seed'])

    conv = get_fprop_fn(False)
    print 'Getting features from train...'
    X_train = get_feats_from_rows(
        train_rows, conv, conf['stride'])
    print 'Getting features from valid...'
    X_valid = get_feats_from_rows(
        valid_rows, conv, conf['stride'])
    print 'Getting features from test...'
    X_test = get_feats_from_rows(
        test_rows, conv, conf['stride'])
    y_train = [row['classification'] == 'Malign' for row in train_rows]
    y_valid = [row['classification'] == 'Malign' for row in valid_rows]
    y_test = [row['classification'] == 'Malign' for row in test_rows]
    return X_train, y_train, X_valid, y_valid, X_test, y_test
Example no. 17
def main():
    args = parser.parse_args()
    header, dataset = utils.load_csv(args.input)
    if len(dataset) == 0:
        parser.error("Invalid input: file does not exist or is empty.")

    normalized = standardize(dataset)
    dendrogram_info = clusterize(normalized, args.linkage)

    fig = plot(dendrogram_info)
    fig.savefig(args.output + "_full.png", format="png")
    plt.show()

    weights = [args.average_weight, args.sd_weight]
    trees = cut(dendrogram_info, weights, args.class_range)
    fig = plot(trees)
    fig.savefig(args.output + ".png", format="png")
    plt.show()

    print("%d clusters were generated." % len(trees))
    classified = [header + ["Classification"]]
    clusters = get_clusters(trees)
    for i in range(len(dataset)):
        classified.append(dataset[i] + [clusters[i]])
    utils.save_csv(args.output + ".csv", classified)
Example no. 18
def sort_orders(path_dir="data/process/order_201611.csv"):
    content = load_csv(path_dir)
    content = list(map(map_str2int, content))
    for i in range(len(content)):
        content[i][1] = max(content[i][1] - content[i][0], 1)
    content.sort(key=lambda x: x[0])
    write_csv(content, path_dir)
Example no. 19
    def __init__(self, feature_type, data_type):
        self.feature_type = feature_type
        self.data_type = data_type
        data_dir = f"data/features/{feature_type}/"
        ids_dir = f"data/features/ids/"

        tracks = utils.load_csv("tracks")["track"]
        self.genres = np.unique(tracks["genre_top"].to_numpy()).tolist()

        if data_type in ["test", "validate"]:
            filename = f"{data_type}_{feature_type}.npy"
            self.npy = np.load(f"{data_dir}{filename}", mmap_mode="r")
            self.ids = np.load(f"{ids_dir}{data_type}_ids.npy")
            self.genre_data = tracks["genre_top"].loc[self.ids].tolist()

        elif data_type == "train":
            self.npys = []
            self.ids = []
            self.genre_datas = []
            for i in range(8):
                filename = f"{data_type}_{feature_type}_{i}.npy"
                ids = np.load(f"{ids_dir}{data_type}_ids_{i}.npy")
                npy = np.load(f"{data_dir}{filename}", mmap_mode="r")
                genre_data = tracks["genre_top"].loc[ids].tolist()

                self.npys.append(npy)
                self.ids.append(ids)
                self.genre_datas.append(genre_data)

        del tracks
Example no. 20
    def run(self):
        global total_removed
        if self.path is None:
            return

        print("%s starts..." % self.name)

        content = load_csv(self.path, select_list=[3,4,5,6])
        content = list(map(map_str2float, content))
        n = remove_baddata(content)
        lock.acquire()
        total_removed += n
        lock.release()
        content = np.array(list(map(map_list, content)))
        min_x = content[:,[0,2]].min()
        max_x = content[:,[0,2]].max()
        min_y = content[:,[1,3]].min()
        max_y = content[:,[1,3]].max()

        # update
        lock.acquire()
        self.min_x_list.append(min_x)
        self.max_x_list.append(max_x)
        self.min_y_list.append(min_y)
        self.max_y_list.append(max_y)
        lock.release()

        print("%s finished! There are %d removed." %(self.name, n))
def find_pos_range(path_dir="data/extracted"):
    min_x_list = []
    max_x_list = []
    min_y_list = []
    max_y_list = []
    n_total = 0     # total moved
    for _, _, files in os.walk(path_dir):
        for file_name in files:
            if file_name.startswith("order"):
                temp_path = os.path.join(path_dir, file_name)
                content = load_csv(temp_path, select_list=[3,4,5,6])
                content = list(map(map_str2float, content))
                n_total += remove_baddata(content)
                content = np.array(list(map(map_list, content)))
                min_x = content[:,[0,2]].min()
                max_x = content[:,[0,2]].max()
                min_y = content[:,[1,3]].min()
                max_y = content[:,[1,3]].max()
                min_x_list.append(min_x)
                max_x_list.append(max_x)
                min_y_list.append(min_y)
                max_y_list.append(max_y)
    print(min(min_x_list))  # 103.0002196712431
    print(max(max_x_list))  # 120.35693932767293
    print(min(min_y_list))  # 22.86432541244561
    print(max(max_y_list))  # 40.144055627798586
    print(n_total)
Example no. 22
def main():
    parser = fix_csv_parser.get_parser()
    args = parser.parse_args()

    extension = "tsv" if args.tsv else "csv" if args.csv else None
    delimiter = "\t" if args.tsv else "," if args.csv else None
    quotechar = '"'

    for csv_path in args.csv_paths:
        csv_path = Path(csv_path)

        destination_folder = (csv_path.parent
                              if args.destination_folder is None else
                              args.destination_folder)
        destination_folder = Path(destination_folder)

        os.makedirs(destination_folder, exist_ok=True)

        new_name = f"{utils.get_filename_without_extension(csv_path)}.{extension}"
        destination_path = destination_folder / new_name

        rows = utils.load_csv(csv_path=csv_path)
        utils.save_rows(
            rows=rows,
            destination_path=destination_path,
            delimiter=delimiter,
            quotechar=quotechar,
        )
        utils.save_rows(
            rows=rows,
            destination_path=str(destination_path)[:-4] + "-fixed.csv",
            delimiter=",",
            quotechar=quotechar,
        )
Example no. 23
def get_feats_in_partitions():
    """
    Extracts features from the whole dataset and splits them into train,
    validation and test sets.
    """
    conf = utils.get_config()
    paths = utils.get_paths()
    rows = utils.load_csv()
    filters = conf['filters']
    region_size = conf['region_size']
    region_stride = conf['region_stride']

    filtered_rows = [
        row for row in rows if utils.check_filter(row, conf['filters'])
    ]
    train_rows, valid_rows, test_rows = utils.split_dataset(
        filtered_rows,
        conf['valid_percent'],
        conf['test_percent'],
        rng=conf['rng_seed'])

    conv = get_fprop_fn(False)
    print 'Getting features from train...'
    X_train = get_feats_from_rows(train_rows, conv, conf['stride'])
    print 'Getting features from valid...'
    X_valid = get_feats_from_rows(valid_rows, conv, conf['stride'])
    print 'Getting features from test...'
    X_test = get_feats_from_rows(test_rows, conv, conf['stride'])
    y_train = [row['classification'] == 'Malign' for row in train_rows]
    y_valid = [row['classification'] == 'Malign' for row in valid_rows]
    y_test = [row['classification'] == 'Malign' for row in test_rows]
    return X_train, y_train, X_valid, y_valid, X_test, y_test
Example no. 24
def main():
    """Goes through all the correspondance files, and foreach of our sitc codes, fetches the descriptions used by the
    harmonized systems. These extra descriptions will later be used to better match based on text similarity"""
    sitc_codes = load_csv(SITC2_FILE_PATH)

    # load all harmonized system categories
    hs_codes = {hs: {} for hs in HARMONIZED_SYSTEM_NAMES}

    for hs_system in HARMONIZED_SYSTEM_NAMES:
        hs = load_csv(HS_FILE_PATH_RAW.format(hs_system=hs_system))
        hs_codes[hs_system] = hs

    # load all correspondence tables
    hs_correspondence_tables = {hs: {} for hs in HARMONIZED_SYSTEM_NAMES}

    for hs_system in HARMONIZED_SYSTEM_NAMES:
        hs = load_correspondence_tables(
            CORRESPONDENCE_FILE_PATH_PREPROCESSED.format(hs_system=hs_system),
            system=hs_system)
        hs_correspondence_tables[hs_system] = hs

    sitc_codes_enriched = {code: set() for code in sitc_codes.keys()}
    # for each sitc_code, find its correspondents in hs_codes and store them as a set
    for sitc_code in sitc_codes.keys():
        # go through all mappings and fetch its description
        for hs_system, mappings in hs_correspondence_tables.items():
            mapping = mappings.get(sitc_code)

            if mapping:
                # might need to change to get
                sitc_codes_enriched[sitc_code].add(
                    hs_codes[hs_system][mapping])
    print(
        f'in total {len(sitc_codes_enriched)} and only {len([c for c, v in sitc_codes_enriched.items() if v])} '
        f'extended')

    # store the mapped stuff
    with open(ENRICHED_SITC_CODES_FILE_PATH, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['ID', 'Mapping'])
        for sitc_code, desc in sitc_codes_enriched.items():
            if desc:
                writer.writerow([sitc_code, '~'.join(desc)])

    print(f'Extended mapping stored under {ENRICHED_SITC_CODES_FILE_PATH}')
Example no. 25
def main():
    csv_path = 'data/all_test_clean.csv'
    tweets, targets, labels = load_csv(csv_path)
    print('--- LOADED CSV ---')
    model = load_bert()
    print('--- LOADED MODEL ---')
    preds = predict(model, tweets, targets)
    save_npy(preds, 'ada_bert', 'preds/')
    print('--- SAVED PREDS ---')
    print_metrics(preds, labels, 'ada_bert')
Example no. 26
def total_orders(path_dir="data/extracted"):
    n = 0
    for _, _, files in os.walk(path_dir):
        for file_name in files:
            if not file_name.startswith("order"):
                continue
            temp_path = os.path.join(path_dir, file_name)
            content = load_csv(temp_path, select_list=[3,4,5,6])
            n += len(content)
    print(n)    # 7065937
Example no. 27
def main():
	parser = argparse.ArgumentParser()
	parser.add_argument('-t', '--test', help="A test directory containing map.pgm, measure.csv, control.csv, and ground.csv files", required=True)
	parser.add_argument('-s', '--states', help='The file containing the starting states to use')
	parser.add_argument('-v', '--visualizer', action='store_const', const=True, help='Add this flag to turn on the visualizer', default=False)
	parser.add_argument('-n', '--numstart', type=int, default=200)

	args = parser.parse_args()
	lmap = utils.load_map('tests/' + args.test + '/map.pgm')
	controls = utils.load_csv('tests/' + args.test + '/control.csv') #a Tx2 array of T (delta phi, velocity)'s
	measurements = utils.load_measurements('tests/' + args.test + '/measure.csv') #a TxMx2 array of T sets of M measurements containing a degree and a measured distance at that degree
	true_start = utils.load_csv('tests/' + args.test + '/ground.csv')
	if args.states:
		start_posns = utils.load_csv(args.states) #a Nx3 array of N (x,y,phi)'s
	else:
		start_posns = generate_init_states(lmap, args.numstart)

	print("Using particle_filter function...")
	particle_filter(start_posns, controls, measurements, lmap, true_start, args.visualizer)
Example no. 28
def read_csv(input_file):
    """Reads a tab separated value file."""
    df = load_csv(input_file, header=0).fillna('|')
    print(df.head())
    jobcontent = df['content'].tolist()
    print("__________________________________________")
    jlabel = df.loc[:, hp.label_vocabulary].values
    print('Read csv finished!(1)')
    print(jlabel[:5])
    return shuffle_one([[jlabel[i], jobcontent[i]] for i in range(len(jlabel))
                        if type(jobcontent[i]) == str])
Example no. 29
 def _read_csv(cls, input_file):
     """Reads a tab separated value file."""
     df = load_csv(input_file, header=0).fillna('|')
     jobcontent = df['content'].tolist()
     jlabel = df.loc[:, hp.label_vocabulary].values
     lines = [[jlabel[i], jobcontent[i]] for i in range(len(jlabel))
              if type(jobcontent[i]) == str]
     lines2 = shuffle_one(lines)
     print('Read csv finished!(1)')
     print('Head data:', lines2[0:5])
     print('Length of data:', len(lines2))
     return lines2
Example no. 30
def main():

    exp_name = f'baseline_{now()}'
    device, log, result_dir = setup(exp_name, conf)

    train_df = load_csv(conf.train_csv)
    if conf.npy:
        train_images = np.load(conf.train_images)
    else:
        train_images = pd.read_parquet(conf.train_images)

    train_df["gr"] = 0
    train_df["cd"] = 0
    train_df["vd"] = 0
    train_df["image_mean"] = 0

    models = [f"se_resnext50_f{i}.pkl" for i in range(5)]

    preds = np.zeros((len(train_df), conf.gr_size + conf.vd_size + conf.cd_size))
    image_stats = np.zeros((len(train_df), 2))

    log.info('done')
    for i in range(5):

        model = ResNet(conf,
                       arch_name=conf.arch,
                       input_size=conf.image_size)
        model.load_state_dict(torch.load(models[i]))
        model.to(device)

        ds = val_split(train_df, train_images, fold=i)
        _, val_ds, _, val_images = ds['train'], ds['val'], ds['train_images'], ds['val_images']

        test_preds = predict(model, val_ds, val_images, valid_transform,
                             device)

        print(test_preds.shape)
        te_ind = ds['te_ind']
        preds[te_ind] += test_preds
        image_stats[te_ind, 0] = val_images.mean((1, 2))
        image_stats[te_ind, 1] = val_images.std((1, 2))

    preds = np.concatenate([preds, image_stats], axis=1)

    for t in ["grapheme_root", "vowel_diacritic", "consonant_diacritic"]:
        rf = RandomForestClassifier(n_jobs=16)
        # train = xgb.DMatrix(preds, label=train_df[t])
        # params = {"max_depth": 4, "nthread": 16, "objective": "multi:softmax",
        #           "eval_metric": ["merror", "mlogloss"], "num_class": conf.gr_size}
        # xgb.cv(params, train, num_boost_round=1000, nfold=5, seed=conf.seed,
        #        early_stopping_rounds=40, verbose_eval=10)
        rf.fit(preds, train_df[t])
        with open(f"{t}_rf2.pkl", "wb") as f:
            joblib.dump(rf, f)
Example no. 31
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_file', help="Input csv file with data.")
    args = parser.parse_args()

    dataset = utils.load_csv(args.input_file)
    train_set, test_set = corpus.split_dataset(dataset, 0.67)
    separated = corpus.separate_by_class(train_set)
    summaries = corpus.summarize_by_class(train_set)
    predictions = predict_set(summaries, test_set)
    accuracy = utils.get_accuracy(test_set, predictions)
    print('Accuracy: {0}%'.format(accuracy))
Example no. 33
def main():
    db = "{}/data/db".format(environ["WD"])
    if not exists(db):
        con = connect(db)
        data = \
            pipe( load_csv()
                , rename_columns
                , clean_date
                )
        data.to_sql(name="data", con=con)
    else:
        print("data already compiled to {}".format(db))
def main():
  import sys
  from utils import load_csv, get_stencil_num

  raw_data = load_csv(sys.argv[1])

  k_l = set()
  for k in raw_data:
    k_l.add((get_stencil_num(k), k['Global NX']))
  k_l = list(k_l)

  bsz_l = set()
  for k in raw_data:
    if k['Multi-wavefront updates']=='0': continue
    bsz_l.add(k['Multi-wavefront updates'])
  bsz_l = sorted(list(bsz_l))

  for k, N in k_l:
    for bsz in bsz_l:
      gen_res(raw_data, int(k), int(bsz), int(N))
Example no. 35
def main():
  import sys
  from ics_utils import get_stencil_num
  from utils import load_csv

  raw_data = load_csv(sys.argv[1])

  k_l = set()
  for k in raw_data:
    k_l.add(get_stencil_num(k))
  k_l = list(k_l)

  n_l = set()
  for k in raw_data:
    n_l.add(k['Global NX'])
  n_l = list(n_l)


  for k in k_l:
    for N in n_l:
      gen_res(raw_data, int(k), int(N))
Example no. 36
def get_feats_from_csv_in_partitions():
    """
    Extract the original features that are distributed with the dataset. Features
    are split according to the config.yaml file.
    """
    conf = utils.get_config()
    rows = [row for row in utils.load_csv() if utils.check_filter(row, conf['filters'])]
    train_rows, valid_rows, test_rows = utils.split_dataset(
        rows, conf['valid_percent'], conf['test_percent'], rng=conf['rng_seed'])
    X_train, y_train, X_valid, y_valid, X_test, y_test = [], [], [], [], [], []
    prefixes = ['t_', 'i_', 's_']  # Feature names' prefixes
    datasets = [(X_train, y_train, train_rows),
                (X_test, y_test, test_rows), (X_valid, y_valid, valid_rows)]
    out = []
    for X, y, rows in datasets:
        for row in rows:
            X.append(
                [float(v) for k, v in row.iteritems() if len(filter(k.startswith, prefixes)) > 0])
            y.append(int(row['classification'] == 'Malign'))
        out.extend((np.asarray(X), np.asarray(y)))
    return out
Example no. 37
def main():
  import sys
  from utils import select_fields, load_csv
  raw_data = load_csv(sys.argv[1])

  stencil='7_pt_const'
  rows = [
           {'Thread group size':'0' , 'Stencil Kernel coefficients':'constant', 'Stencil Kernel semi-bandwidth':'1', 'OpenMP Threads':'6' },
           {'Thread group size':'1' , 'Stencil Kernel coefficients':'constant', 'Stencil Kernel semi-bandwidth':'1', 'OpenMP Threads':'10'},
           {'Thread group size':'2' , 'Stencil Kernel coefficients':'constant', 'Stencil Kernel semi-bandwidth':'1', 'OpenMP Threads':'10'},
           {'Thread group size':'5' , 'Stencil Kernel coefficients':'constant', 'Stencil Kernel semi-bandwidth':'1', 'OpenMP Threads':'10'},
           {'Thread group size':'10', 'Stencil Kernel coefficients':'constant', 'Stencil Kernel semi-bandwidth':'1', 'OpenMP Threads':'10'}
         ]
  create_table(raw_data, rows, stencil)


  stencil='7_pt_var'
  rows = [
           {'Thread group size':'0' , 'Stencil Kernel coefficients':'variable no-symmetry', 'Stencil Kernel semi-bandwidth':'1', 'OpenMP Threads':'6'},
           {'Thread group size':'1' , 'Stencil Kernel coefficients':'variable no-symmetry', 'Stencil Kernel semi-bandwidth':'1', 'OpenMP Threads':'8'},
           {'Thread group size':'2' , 'Stencil Kernel coefficients':'variable no-symmetry', 'Stencil Kernel semi-bandwidth':'1', 'OpenMP Threads':'10'},
           {'Thread group size':'5' , 'Stencil Kernel coefficients':'variable no-symmetry', 'Stencil Kernel semi-bandwidth':'1', 'OpenMP Threads':'10'},
           {'Thread group size':'10', 'Stencil Kernel coefficients':'variable no-symmetry', 'Stencil Kernel semi-bandwidth':'1', 'OpenMP Threads':'10'}
         ]
  create_table(raw_data, rows, stencil)


  stencil='25_pt_var'
  rows = [
           {'Thread group size':'0' , 'Stencil Kernel coefficients':'variable axis-symmetric', 'Stencil Kernel semi-bandwidth':'4', 'OpenMP Threads':'8' },
           {'Thread group size':'1' , 'Stencil Kernel coefficients':'variable axis-symmetric', 'Stencil Kernel semi-bandwidth':'4', 'OpenMP Threads':'7'},
           {'Thread group size':'2' , 'Stencil Kernel coefficients':'variable axis-symmetric', 'Stencil Kernel semi-bandwidth':'4', 'OpenMP Threads':'8'},
           {'Thread group size':'5' , 'Stencil Kernel coefficients':'variable axis-symmetric', 'Stencil Kernel semi-bandwidth':'4', 'OpenMP Threads':'10'},
           {'Thread group size':'10', 'Stencil Kernel coefficients':'variable axis-symmetric', 'Stencil Kernel semi-bandwidth':'4', 'OpenMP Threads':'10'}
         ]
  create_table(raw_data, rows, stencil)
Example no. 38
        X[i] = features[np.nonzero(int(row['segmentation_id']) == segm_ids)][0]
        y[i] = utils.is_positive(row)
    return X, y


rng = [2014, 12, 5]
rng = make_np_rng(None, rng, which_method='uniform')
scale_feats = True
n_runs = 20
C_range = 10.0 ** np.arange(-8, 8)
train_scores = np.zeros((n_runs, len(C_range)))
valid_scores = np.zeros((n_runs, len(C_range)))
fit_threshold = True
conf_file = sys.argv[1] if len(sys.argv) > 1 else None
conf = utils.get_config(conf_file)
features = np.empty([len(utils.load_csv()), 0])
#f_list = ['hcfeats', 'imnet', 'cnn']
f_list = ['cnn']

if 'imnet' in f_list:
    rows = utils.load_csv()
    feats, y = fe_extraction.get_feats_from_imagenet(rows)
    features = np.hstack((features, feats))
    segm_ids = np.asarray([int(row['segmentation_id']) for row in rows])
if 'hcfeats' in f_list:
    rows = utils.load_csv(conf['csv_features_file'])
    feats, y = fe_extraction.get_feats_from_csv(
        rows, prefixes=['s_', 't_', 'i_'])
    feats = np.asarray(feats)
    features = np.hstack((features, feats))
    segm_ids = np.asarray([int(row['segmentation_id']) for row in rows])