Code Example #1
File: preprocess.py Project: ireko8/Malware
def save_feather():
    train = utils.load_csv()
    for c in tqdm(train.columns):
        train[[c]].to_feather(f'features/train/{c}.ftr')
    test = utils.load_csv(test=True)
    for c in tqdm(test.columns):
        test[[c]].to_feather(f'features/test/{c}.ftr')
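For context, a minimal sketch of a `utils.load_csv` helper that would satisfy this call pattern (the CSV paths and the `test` flag semantics are assumptions, not the actual implementation in ireko8/Malware):

import pandas as pd

def load_csv(test=False):
    # Hypothetical loader: read the train or test CSV into a DataFrame.
    path = 'input/test.csv' if test else 'input/train.csv'  # assumed paths
    return pd.read_csv(path)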
Code Example #2
def main(clf):
    x_train, y_train = utils.load_csv(POSTURES_TRAINING_DATA,
                                      feature_idx=range(2, 39),
                                      label_idx=1)
    x_test, y_test = utils.load_csv(POSTURES_TEST_DATA,
                                    feature_idx=range(2, 39),
                                    label_idx=1)
    model = models.PosturesEstimator(clf, aggregation=True, knn_n_neighbors=9,
                                     svm_c=10 ** 0.6, svm_gamma=10 ** -2.0,
                                     nn_validation=True)

    assert len(x_train) == len(y_train) and len(x_test) == len(y_test), \
        'Data sizes do not match'

    model.fit(x_train, y_train)
    acc_train = model.evaluate(x_train, y_train, cm='training.png')
    LOGGER.info('Training Accuracy: %.6f' % acc_train)

    acc_cv = model.cross_validate(x_train, y_train)
    if acc_cv is not None:
        LOGGER.info('Cross Validation Accuracy: %.6f ± %.6f' %
                    (np.mean(acc_cv), np.std(acc_cv)))

    acc_test = model.evaluate(x_test, y_test, cm='test.png')
    LOGGER.info('Test Accuracy: %.6f' % acc_test)
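The call signature here implies a loader that slices feature and label columns out of each row; a minimal sketch under that assumption (the skipped header row and the float conversion are guesses, not POSTURES-specific behaviour):

import csv

def load_csv(path, feature_idx, label_idx):
    # Hypothetical loader returning (features, labels) as parallel lists.
    x, y = [], []
    with open(path, newline='') as f:
        reader = csv.reader(f)
        next(reader)  # assume one header row
        for row in reader:
            x.append([float(row[i]) for i in feature_idx])
            y.append(row[label_idx])
    return x, y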
Code Example #3
 def __init__(self, config=cfg, cache=True):
     if not cache or not os.path.isfile(cfg.data_cache):
         self.train, self.val = self.train_val_split(
             utils.load_csv(cfg.train_csv), 0.9)
         self.test = utils.load_csv(cfg.test_csv, shuffle=False)
         utils.save_cache([self.train, self.val, self.test], cfg.data_cache)
     else:
         self.train, self.val, self.test = utils.load_cache(cfg.data_cache)
Code Example #4
def main():

    exp_name = f'baseline_{now()}'
    device, log, result_dir = setup(exp_name, conf)

    train_df = load_csv(conf.train_csv)
    if conf.npy:
        train_images = np.load(conf.train_images)
    else:
        train_images = pd.read_parquet(conf.train_images)

    test_df = load_csv(conf.test_csv)
    if conf.npy:
        test_images = np.load(conf.test_images)
    else:
        test_images = pd.read_parquet(conf.test_images)

    log.info('done')
    for i in range(5):
        if i != conf.fold:
            continue

        if "resnet" in conf.arch or "resnext" in conf.arch:
            model_ft = ResNet(conf,
                              arch_name=conf.arch,
                              input_size=conf.image_size)
            model_ft.load_state_dict(
                torch.load("result/baseline_2020_03_21_13_01_08/model_0.pkl"))
        elif "densenet" in conf.arch:
            model_ft = DenseNet(conf,
                                arch_name=conf.arch,
                                input_size=conf.image_size)
        elif "efficientnet" in conf.arch:
            model_ft = EfficientNet(conf, arch_name=conf.arch)

        criterion = [
            nn.CrossEntropyLoss(reduction="none"),
            nn.CrossEntropyLoss(reduction="none"),
            nn.CrossEntropyLoss(reduction="none")
        ]
        criterion = [c.to(device) for c in criterion]

        model_ft, val_preds = train_model(train_df,
                                          train_images,
                                          test_df,
                                          test_images,
                                          model_ft,
                                          criterion,
                                          log,
                                          device,
                                          result_dir,
                                          fold=i,
                                          num_epoch=conf.num_epoch)

        torch.save(model_ft.state_dict(), result_dir / f'model_{i}.pkl')
        np.save(result_dir / f'val_preds_{i}.npy', val_preds)
Code Example #5
 def load_subset(self, dataset, name):
     filename = os.path.join("..", "datasets", dataset, name,
                             "features.csv")
     data = np.asarray(utils.load_csv(filename, skiprows=1))
     filename = os.path.join("..", "datasets", dataset, name,
                             "y_{}.txt".format(name))
     activities = np.asarray(utils.load_csv(filename)).ravel()
     filename = os.path.join("..", "datasets", dataset, name,
                             "subject_{}.txt".format(name))
     subjects = np.asarray(utils.load_csv(filename)).ravel()
     return data, activities, subjects
Code Example #6
def train(train_file, test_file=None):
    data = utils.load_csv(train_file)
    feature_set = [(utils.feature_extract(i[0]), i[1]) for i in data]
    print 'Training'
    classifier = nltk.NaiveBayesClassifier.train(feature_set)
    utils.save_model(classifier)
    print 'Done Training'
    if test_file:
        data = utils.load_csv(test_file)
    test_feature_set = [(utils.feature_extract(i[0]), i[1]) for i in data]
    print 'Accuracy of model is {0}'.format(
        nltk.classify.accuracy(classifier, test_feature_set))
Code Example #7
 def load_artist_data(self, artist_name):
     artist_information = self._musicbrainz_searcher.get_musicbrainz_artist_info(
         artist_name)
     print('Load artist data:', artist_information)
     events_df = utils.load_csv(artist_information.name, 'events')
     setlists_df = utils.load_csv(artist_information.name, 'setlists')
     recordings_df = utils.load_csv(artist_information.name, 'recordings')
     if events_df is not None:
         events_df['eventdate'] = pd.to_datetime(events_df['eventdate'],
                                                 format='%Y-%m-%d')
     if recordings_df is not None:
         recordings_df['date'] = pd.to_datetime(recordings_df['date'],
                                                format='%Y-%m-%d')
     return ArtistData(artist_information, events_df, setlists_df,
                       recordings_df)
Code Example #8
File: draw_ROC.py Project: FlutteryEmbers/adaboost
def get_test_error(threshold):
    TP = 0
    TN = 0
    FP = 0
    FN = 0

    for i in range(test_data_size):
        data = utils.load_csv(test_set_dir + str(i) + ".csv")
        label = utils.get_label_from_model_with_threshold(
            data, model, threshold)

        if label == 1:
            if data[19][0] == 1:
                TP += 1
            elif data[19][0] == -1:
                FP += 1
            else:
                print('1 error')
        elif label == -1:
            if data[19][0] == 1:
                FN += 1
            elif data[19][0] == -1:
                TN += 1
            else:
                print('-1 error')
        else:
            print(label)

    TPR = TP / (TP + FN)
    FPR = FP / (FP + TN)

    return TPR, FPR
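get_test_error returns a single (TPR, FPR) point for one threshold; the ROC curve this script is named after would be traced by sweeping the threshold, roughly as below (the sweep range is an assumption):

import numpy as np
import matplotlib.pyplot as plt

tprs, fprs = [], []
for threshold in np.linspace(-1.0, 1.0, 21):  # assumed threshold range
    tpr, fpr = get_test_error(threshold)
    tprs.append(tpr)
    fprs.append(fpr)

plt.plot(fprs, tprs, marker='o')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.show()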
Code Example #9
def get_cleaned_tweets(query_dict):
    """
    Get cleaned tweets
    :param query_dict:
        query_string: 'datacamp lang:en'
        time_since: '2019-03-01'
        time_until: '2019-05-01'
        max_tweets: 0 for unlimited
    :return: dataframe
    """

    file_name = _convert_query_dict_to_str_as_filename(query_dict)
    save_cleaned_file_name = paths.cleaned_tweets / 'cleaned_{}.csv'.format(
        file_name)

    if save_cleaned_file_name.is_file():
        print('Cleaned file {} already exists, reload'.format(
            save_cleaned_file_name))
        tweet_df = load_csv(save_cleaned_file_name)
    else:
        tweet_df = get_raw_tweets(query_dict)

        print('Cleaning tweets')
        cleaned_tweet_df = _clean_tweets_text(tweet_df)

        #print('Select only {USE_TWEETS_COLS} and save tweets to: {repr(save_cleaned_file_name)}'.format())
        cleaned_tweet_df[USE_TWEETS_COLS].to_csv(save_cleaned_file_name,
                                                 index=False)
        # return the cleaned columns, matching what the cached branch reloads
        tweet_df = cleaned_tweet_df[USE_TWEETS_COLS]

    print('Done getting tweets.')
    return tweet_df
Code Example #10
def get_raw_tweets(query_dict):
    """
    Get raw tweets
    :param query_dict:
        query_string: 'datacamp lang:en'
        time_since: '2019-03-01'
        time_until: '2019-05-01'
        max_tweets: 0 for unlimited
    :return: dataframe
    """
    file_name = _convert_query_dict_to_str_as_filename(query_dict)
    save_raw_file_name = paths.raw_tweets / 'raw_{}.csv'.format(file_name)

    if save_raw_file_name.is_file():
        print('Raw file {} already exists, reload'.format(
            repr(save_raw_file_name)))
        tweet_df = load_csv(save_raw_file_name)
    else:
        _validate_query(query_dict)

        print(f'Getting raw tweets with query:\n{query_dict!r}')
        tweet_criteria = _create_search_criteria(**query_dict)
        tweet_objects = _get_tweet_object(tweet_criteria)
        tweet_df = _convert_tweets_to_dataframe(tweet_objects)

        print(f'Saving raw tweets to: {repr(save_raw_file_name)}')
        tweet_df.to_csv(save_raw_file_name, index=False)

    print('Done getting raw tweets.')
    return tweet_df
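_convert_query_dict_to_str_as_filename is not shown; it presumably flattens the query dict into a filesystem-safe string. A minimal sketch under that assumption (the separator and sanitisation rules are hypothetical):

import re

def _convert_query_dict_to_str_as_filename(query_dict):
    # Hypothetical helper: join key-value pairs, then strip characters unsafe for file names.
    raw = '_'.join('{}-{}'.format(k, v) for k, v in sorted(query_dict.items()))
    return re.sub(r'[^A-Za-z0-9_\-]', '_', raw)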
Code Example #11
File: info_gan.py Project: Liuguanli/GAN
	def __init__(self, generator, discriminator, data_set_file, y_dim):
		self.generator = generator
		self.discriminator = discriminator
		self.data_set_file = data_set_file
		self.y_dim = y_dim # useless     condition

		indexs, latitude, longitude = ut.load_csv(self.data_set_file, 2)

		self.borders = range(B, len(latitude), B)

		self.Generator_input = 100

		self.Generator_output = len(self.borders)

		self.G_in = tf.placeholder(tf.float32, [None, self.Generator_input])
		self.real_partition = tf.placeholder(tf.float32, [None, self.Generator_output], name='real_in')
		self.condition = tf.placeholder(tf.float32, shape=[None, self.Generator_output])

		self.G_out = self.generator(concat(self.G_in, self.condition), self.Generator_output)

		self.D_real, _ = self.discriminator(self.real_partition, self.Generator_output)

		self.D_fake, self.Q_fake = self.discriminator(self.G_out, self.Generator_output, reuse = True)

		self.D_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.D_real, labels=tf.ones_like(self.D_real)))

		self.G_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.D_fake, labels=tf.ones_like(self.D_fake)))

		self.Q_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=self.Q_fake, labels=self.real_partition))

		self.train_D = tf.train.AdamOptimizer(LR_D).minimize(self.D_loss,var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.discriminator.name))

		self.train_G = tf.train.AdamOptimizer(LR_G).minimize(self.G_loss,var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.generator.name))

		self.train_Q = tf.train.AdamOptimizer(LR_G).minimize(self.G_loss,var_list=tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.generator.name) + tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.discriminator.name))
Code Example #12
    def __init__(self):

        loaded_csv = utils.load_csv('../../Data/Train/NHID2013.csv')
        # print("loaded_csv",loaded_csv)

        # print("loaded_csv",loaded_csv.shape)
        # (3081208, 6)
        min_max_scaler = preprocessing.MinMaxScaler()
        loaded_csv_minmax = pd.DataFrame(
            min_max_scaler.fit_transform(loaded_csv.iloc[:, :4]))
        # print(loaded_csv_minmax)
        # print("loaded_csv_minmax",loaded_csv_minmax.shape)
        # loaded_csv_minmax (234426, 6)

        loaded_csv_minmax = pd.concat(
            [loaded_csv_minmax, loaded_csv.iloc[:, 4:]], axis=1)
        # print("loaded_csv_minmax",loaded_csv_minmax)

        self.train_X = np.array(loaded_csv_minmax.iloc[:3000000, :4])
        self.train_y = np.array(loaded_csv_minmax.iloc[:3000000, 4:])
        # self.train_X=np.array(loaded_csv_minmax.iloc[:3000,:4])
        # self.train_y=np.array(loaded_csv_minmax.iloc[:3000,4:])
        # print("train_X",train_X.shape)
        # print("train_y",train_y.shape)

        self.number_of_data = self.train_X.shape[0]
Code Example #13
    def run(self):
        global total_removed
        if self.path is None:
            return

        print("%s starts..." % self.name)

        content = load_csv(self.path, select_list=[3,4,5,6])
        # str2float
        content = list(map(map_str2float, content))
        # gcj2wgs
        content = list(map(map_list, content))
        n = remove_baddata(content)
        lock.acquire()
        total_removed += n
        lock.release()
        content = np.array(content)
        content[:,[0,2]] -= 103.5
        content[:,[0,2]] /= 0.1
        content[:,[1,3]] -= 30.3
        content[:,[1,3]] /= 0.05
        content = list(map(map_float2int, content))
        tem_dis = np.zeros(self.distribution.shape)
        for row in content:
            tem_dis[row[0], row[1]] += 1
            tem_dis[row[2], row[3]] += 1
        # update
        lock.acquire()
        self.distribution += tem_dis
        lock.release()

        print("%s finished! There are %d removed." %(self.name, n))
Code Example #14
def get_angles_data(input_folder,
                    output_folder,
                    files_keep,
                    type_data="angles",
                    align=True):
    files_keep_clean = [file_name.split(".")[0] for file_name in files_keep]

    files_angles = get_files(join_path(input_folder, type_data))
    if align:
        files_events = get_files(join_path(input_folder, "events"))

    os.makedirs(join_path(output_folder, type_data), exist_ok=True)
    for file_ in files_angles:

        if file_.split(".")[0] in files_keep_clean:
            data = np.load(join_path(input_folder, type_data, file_),
                           allow_pickle=True)
            if len(data.shape) == 3:
                if np.count_nonzero(np.isnan(data)):
                    continue
            else:
                continue
            np.save(join_path(output_folder, type_data, file_), data)
            if align:
                events = load_csv(
                    join_path(input_folder, "events",
                              "{}.csv".format(file_.split(".")[0])),
                    dtype=str,
                )
                align_and_save_data(data,
                                    events,
                                    output_folder,
                                    file_,
                                    type_data=type_data)
Code Example #15
def get_feats_from_csv_in_partitions():
    """
    Extract the original features distributed with the dataset. The features
    are split according to the config.yaml file.
    """
    conf = utils.get_config()
    rows = [
        row for row in utils.load_csv()
        if utils.check_filter(row, conf['filters'])
    ]
    train_rows, valid_rows, test_rows = utils.split_dataset(
        rows,
        conf['valid_percent'],
        conf['test_percent'],
        rng=conf['rng_seed'])
    X_train, y_train, X_valid, y_valid, X_test, y_test = [], [], [], [], [], []
    prefixes = ['t_', 'i_', 's_']  # Feature names' prefixes
    datasets = [(X_train, y_train, train_rows), (X_test, y_test, test_rows),
                (X_valid, y_valid, valid_rows)]
    out = []
    for X, y, rows in datasets:
        for row in rows:
            X.append([
                float(v) for k, v in row.iteritems()
                if len(filter(k.startswith, prefixes)) > 0
            ])
            y.append(int(row['classification'] == 'Malign'))
        out.extend((np.asarray(X), np.asarray(y)))
    return out
Code Example #16
File: fe_extraction.py Project: johnarevalo/cnn-bcdr
def get_feats_in_partitions():
    """
    Extracts features from the whole dataset and splits them into train,
    validation and test sets.
    """
    conf = utils.get_config()
    paths = utils.get_paths()
    rows = utils.load_csv()
    filters = conf['filters']
    region_size = conf['region_size']
    region_stride = conf['region_stride']

    filtered_rows = [
        row for row in rows if utils.check_filter(row, conf['filters'])]
    train_rows, valid_rows, test_rows = utils.split_dataset(
        filtered_rows, conf['valid_percent'], conf['test_percent'], rng=conf['rng_seed'])

    conv = get_fprop_fn(False)
    print 'Getting features from train...'
    X_train = get_feats_from_rows(
        train_rows, conv, conf['stride'])
    print 'Getting features from valid...'
    X_valid = get_feats_from_rows(
        valid_rows, conv, conf['stride'])
    print 'Getting features from test...'
    X_test = get_feats_from_rows(
        test_rows, conv, conf['stride'])
    y_train = [row['classification'] == 'Malign' for row in train_rows]
    y_valid = [row['classification'] == 'Malign' for row in valid_rows]
    y_test = [row['classification'] == 'Malign' for row in test_rows]
    return X_train, y_train, X_valid, y_valid, X_test, y_test
Code Example #17
def main():
    args = parser.parse_args()
    header, dataset = utils.load_csv(args.input)
    if len(dataset) == 0:
        parser.error("Invalid input: file does not exist or is empty.")

    normalized = standardize(dataset)
    dendrogram_info = clusterize(normalized, args.linkage)

    fig = plot(dendrogram_info)
    fig.savefig(args.output + "_full.png", format="png")
    plt.show()

    weights = [args.average_weight, args.sd_weight]
    trees = cut(dendrogram_info, weights, args.class_range)
    fig = plot(trees)
    fig.savefig(args.output + ".png", format="png")
    plt.show()

    print("%d clusters were generated." % len(trees))
    classified = [header + ["Classification"]]
    clusters = get_clusters(trees)
    for i in range(len(dataset)):
        classified.append(dataset[i] + [clusters[i]])
    utils.save_csv(args.output + ".csv", classified)
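standardize is not shown; it most likely z-scores each numeric column before clustering. A minimal sketch under that assumption:

import numpy as np

def standardize(dataset):
    # Hypothetical column-wise z-score normalisation of a list of numeric rows.
    data = np.asarray(dataset, dtype=float)
    return (data - data.mean(axis=0)) / data.std(axis=0)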
Code Example #18
def sort_orders(path_dir="data/process/order_201611.csv"):
    content = load_csv(path_dir)
    content = list(map(map_str2int, content))
    for i in range(len(content)):
        content[i][1] = max(content[i][1] - content[i][0], 1)
    content.sort(key=lambda x: x[0])
    write_csv(content, path_dir)
Code Example #19
    def __init__(self, feature_type, data_type):
        self.feature_type = feature_type
        self.data_type = data_type
        data_dir = f"data/features/{feature_type}/"
        ids_dir = f"data/features/ids/"

        tracks = utils.load_csv("tracks")["track"]
        self.genres = np.unique(tracks["genre_top"].to_numpy()).tolist()

        if data_type in ["test", "validate"]:
            filename = f"{data_type}_{feature_type}.npy"
            self.npy = np.load(f"{data_dir}{filename}", mmap_mode="r")
            self.ids = np.load(f"{ids_dir}{data_type}_ids.npy")
            self.genre_data = tracks["genre_top"].loc[self.ids].tolist()

        elif data_type == "train":
            self.npys = []
            self.ids = []
            self.genre_datas = []
            for i in range(8):
                filename = f"{data_type}_{feature_type}_{i}.npy"
                ids = np.load(f"{ids_dir}{data_type}_ids_{i}.npy")
                npy = np.load(f"{data_dir}{filename}", mmap_mode="r")
                genre_data = tracks["genre_top"].loc[ids].tolist()

                self.npys.append(npy)
                self.ids.append(ids)
                self.genre_datas.append(genre_data)

        del tracks
Code Example #20
    def run(self):
        global total_removed
        if self.path is None:
            return

        print("%s starts..." % self.name)

        content = load_csv(self.path, select_list=[3,4,5,6])
        content = list(map(map_str2float, content))
        n = remove_baddata(content)
        lock.acquire()
        total_removed += n
        lock.release()
        content = np.array(list(map(map_list, content)))
        min_x = content[:,[0,2]].min()
        max_x = content[:,[0,2]].max()
        min_y = content[:,[1,3]].min()
        max_y = content[:,[1,3]].max()

        # update
        lock.acquire()
        self.min_x_list.append(min_x)
        self.max_x_list.append(max_x)
        self.min_y_list.append(min_y)
        self.max_y_list.append(max_y)
        lock.release()

        print("%s finished! There are %d removed." %(self.name, n))
Code Example #21
def find_pos_range(path_dir="data/extracted"):
    min_x_list = []
    max_x_list = []
    min_y_list = []
    max_y_list = []
    n_total = 0     # total moved
    for _, _, files in os.walk(path_dir):
        for file_name in files:
            if file_name.startswith("order"):
                temp_path = os.path.join(path_dir, file_name)
                content = load_csv(temp_path, select_list=[3,4,5,6])
                content = list(map(map_str2float, content))
                n_total += remove_baddata(content)
                content = np.array(list(map(map_list, content)))
                min_x = content[:,[0,2]].min()
                max_x = content[:,[0,2]].max()
                min_y = content[:,[1,3]].min()
                max_y = content[:,[1,3]].max()
                min_x_list.append(min_x)
                max_x_list.append(max_x)
                min_y_list.append(min_y)
                max_y_list.append(max_y)
    print(min(min_x_list))  # 103.0002196712431
    print(max(max_x_list))  # 120.35693932767293
    print(min(min_y_list))  # 22.86432541244561
    print(max(max_y_list))  # 40.144055627798586
    print(n_total)
Code Example #22
def main():
    parser = fix_csv_parser.get_parser()
    args = parser.parse_args()

    extension = "tsv" if args.tsv else "csv" if args.csv else None
    delimiter = "\t" if args.tsv else "," if args.csv else None
    quotechar = '"'

    for csv_path in args.csv_paths:
        csv_path = Path(csv_path)

        destination_folder = (csv_path.parent
                              if args.destination_folder is None else
                              args.destination_folder)
        destination_folder = Path(destination_folder)

        os.makedirs(destination_folder, exist_ok=True)

        new_name = f"{utils.get_filename_without_extension(csv_path)}.{extension}"
        destination_path = destination_folder / new_name

        rows = utils.load_csv(csv_path=csv_path)
        utils.save_rows(
            rows=rows,
            destination_path=destination_path,
            delimiter=delimiter,
            quotechar=quotechar,
        )
        utils.save_rows(
            rows=rows,
            destination_path=str(destination_path)[:-4] + "-fixed.csv",
            delimiter=",",
            quotechar=quotechar,
        )
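utils.get_filename_without_extension presumably maps a path such as data/input.csv to input; a one-line sketch of such a helper:

from pathlib import Path

def get_filename_without_extension(path):
    # Hypothetical helper: file name with its final suffix removed.
    return Path(path).stem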
Code Example #23
def get_feats_in_partitions():
    """
    Extracts features from the whole dataset and splits them into train,
    validation and test sets.
    """
    conf = utils.get_config()
    paths = utils.get_paths()
    rows = utils.load_csv()
    filters = conf['filters']
    region_size = conf['region_size']
    region_stride = conf['region_stride']

    filtered_rows = [
        row for row in rows if utils.check_filter(row, conf['filters'])
    ]
    train_rows, valid_rows, test_rows = utils.split_dataset(
        filtered_rows,
        conf['valid_percent'],
        conf['test_percent'],
        rng=conf['rng_seed'])

    conv = get_fprop_fn(False)
    print 'Getting features from train...'
    X_train = get_feats_from_rows(train_rows, conv, conf['stride'])
    print 'Getting features from valid...'
    X_valid = get_feats_from_rows(valid_rows, conv, conf['stride'])
    print 'Getting features from test...'
    X_test = get_feats_from_rows(test_rows, conv, conf['stride'])
    y_train = [row['classification'] == 'Malign' for row in train_rows]
    y_valid = [row['classification'] == 'Malign' for row in valid_rows]
    y_test = [row['classification'] == 'Malign' for row in test_rows]
    return X_train, y_train, X_valid, y_valid, X_test, y_test
Code Example #24
def main():
    """Goes through all the correspondance files, and foreach of our sitc codes, fetches the descriptions used by the
    harmonized systems. These extra descriptions will later be used to better match based on text similarity"""
    sitc_codes = load_csv(SITC2_FILE_PATH)

    # load all harmonized system categories
    hs_codes = {hs: {} for hs in HARMONIZED_SYSTEM_NAMES}

    for hs_system in HARMONIZED_SYSTEM_NAMES:
        hs = load_csv(HS_FILE_PATH_RAW.format(hs_system=hs_system))
        hs_codes[hs_system] = hs

    # load all correspondence tables
    hs_correspondence_tables = {hs: {} for hs in HARMONIZED_SYSTEM_NAMES}

    for hs_system in HARMONIZED_SYSTEM_NAMES:
        hs = load_correspondence_tables(
            CORRESPONDENCE_FILE_PATH_PREPROCESSED.format(hs_system=hs_system),
            system=hs_system)
        hs_correspondence_tables[hs_system] = hs

    sitc_codes_enriched = {code: set() for code in sitc_codes.keys()}
    # foreach sitc_code, find its correspondent from hs_codes and store them as set
    for sitc_code in sitc_codes.keys():
        # go through all mappings and fetch its description
        for hs_system, mappings in hs_correspondence_tables.items():
            mapping = mappings.get(sitc_code)

            if mapping:
                # might need to change to get
                sitc_codes_enriched[sitc_code].add(
                    hs_codes[hs_system][mapping])
    print(
        f'in total {len(sitc_codes_enriched)} and only {len([c for c, v in sitc_codes_enriched.items() if v])} '
        f'extended')

    # store the mapped stuff
    with open(ENRICHED_SITC_CODES_FILE_PATH, 'w') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['ID', 'Mapping'])
        for sitc_code, desc in sitc_codes_enriched.items():
            if desc:
                writer.writerow([sitc_code, '~'.join(desc)])

    print(f'Extended mapping stored under {ENRICHED_SITC_CODES_FILE_PATH}')
Code Example #25
def main():
    csv_path = 'data/all_test_clean.csv'
    tweets, targets, labels = load_csv(csv_path)
    print('--- LOADED CSV ---')
    model = load_bert()
    print('--- LOADED MODEL ---')
    preds = predict(model, tweets, targets)
    save_npy(preds, 'ada_bert', 'preds/')
    print('--- SAVED PREDS ---')
    print_metrics(preds, labels, 'ada_bert')
Code Example #26
def total_orders(path_dir="data/extracted"):
    n = 0
    for _, _, files in os.walk(path_dir):
        for file_name in files:
            if not file_name.startswith("order"):
                continue
            temp_path = os.path.join(path_dir, file_name)
            content = load_csv(temp_path, select_list=[3,4,5,6])
            n += len(content)
    print(n)    # 7065937
Code Example #27
File: localization.py Project: mninmr/localization
def main():
	parser = argparse.ArgumentParser()
	parser.add_argument('-t', '--test', help="A test directory containing map.pgm, measure.csv, control.csv, and ground.csv files", required=True)
	parser.add_argument('-s', '--states', help='The file containing the starting states to use')
	parser.add_argument('-v', '--visualizer', action='store_const', const=True, help='Add this flag to turn on the visualizer', default=False)
	parser.add_argument('-n', '--numstart', type=int, default=200)

	args = parser.parse_args()
	lmap = utils.load_map('tests/' + args.test + '/map.pgm')
	controls = utils.load_csv('tests/' + args.test + '/control.csv') #a Tx2 array of T (delta phi, velocity)'s
	measurements = utils.load_measurements('tests/' + args.test + '/measure.csv') #a TxMx2 array of T sets of M measurements containing a degree and a measured distance at that degree
	true_start = utils.load_csv('tests/' + args.test + '/ground.csv')
	if args.states:
		start_posns = utils.load_csv(args.states) #a Nx3 array of N (x,y,phi)'s
	else:
		start_posns = generate_init_states(lmap, args.numstart)

	print("Using particle_filter function...")
	particle_filter(start_posns, controls, measurements, lmap, true_start, args.visualizer)
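generate_init_states is not shown; it presumably scatters N random (x, y, phi) poses over the free cells of the occupancy map. A minimal sketch under that assumption (treating nonzero map cells as free space, which is a guess about the .pgm convention used here):

import numpy as np

def generate_init_states(lmap, n):
    # Hypothetical initialiser: uniform random poses on free map cells.
    free_y, free_x = np.nonzero(lmap)
    idx = np.random.choice(len(free_x), size=n)
    phi = np.random.uniform(0.0, 2.0 * np.pi, size=n)
    return np.stack([free_x[idx], free_y[idx], phi], axis=1)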
Code Example #28
File: classifier_utils.py Project: cui-z/ml
def read_csv(input_file):
    """Reads a tab separated value file."""
    df = load_csv(input_file, header=0).fillna('|')
    print(df.head())
    jobcontent = df['content'].tolist()
    print("__________________________________________")
    jlabel = df.loc[:, hp.label_vocabulary].values
    print('Read csv finished!(1)')
    print(jlabel[:5])  # jlabel is a NumPy array, so slice instead of .head()
    return shuffle_one([[jlabel[i], jobcontent[i]] for i in range(len(jlabel))
                        if type(jobcontent[i]) == str])
Code Example #29
File: classifier_utils.py Project: cui-z/ml
 def _read_csv(cls, input_file):
     """Reads a tab separated value file."""
     df = load_csv(input_file, header=0).fillna('|')
     jobcontent = df['content'].tolist()
     jlabel = df.loc[:, hp.label_vocabulary].values
     lines = [[jlabel[i], jobcontent[i]] for i in range(len(jlabel))
              if type(jobcontent[i]) == str]
     lines2 = shuffle_one(lines)
     print('Read csv finished!(1)')
     print('Head data:', lines2[0:5])
     print('Length of data:', len(lines2))
     return lines2
Code Example #30
File: stacking.py Project: ireko8/Bengali
def main():

    exp_name = f'baseline_{now()}'
    device, log, result_dir = setup(exp_name, conf)

    train_df = load_csv(conf.train_csv)
    if conf.npy:
        train_images = np.load(conf.train_images)
    else:
        train_images = pd.read_parquet(conf.train_images)

    train_df["gr"] = 0
    train_df["cd"] = 0
    train_df["vd"] = 0
    train_df["image_mean"] = 0

    models = [f"se_resnext50_f{i}.pkl" for i in range(5)]

    preds = np.zeros((len(train_df), conf.gr_size + conf.vd_size + conf.cd_size))
    image_stats = np.zeros((len(train_df), 2))

    log.info('done')
    for i in range(5):

        model = ResNet(conf, arch_name=conf.arch,
                       input_size=conf.image_size)
        model.load_state_dict(torch.load(models[i]))
        model.to(device)

        ds = val_split(train_df, train_images, fold=i)
        _, val_ds, _, val_images = ds['train'], ds['val'], ds['train_images'], ds['val_images']

        test_preds = predict(model, val_ds, val_images, valid_transform,
                             device)

        print(test_preds.shape)
        te_ind = ds['te_ind']
        preds[te_ind] += test_preds
        image_stats[te_ind, 0] = val_images.mean((1, 2))
        image_stats[te_ind, 1] = val_images.std((1, 2))  # std goes in the second column

    preds = np.concatenate([preds, image_stats], axis=1)

    for t in ["grapheme_root", "vowel_diacritic", "consonant_diacritic"]:
        rf = RandomForestClassifier(n_jobs=16)
        # train = xgb.DMatrix(preds, label=train_df[t])
        # params = {"max_depth": 4, "nthread": 16, "objective": "multi:softmax",
        #           "eval_metric": ["merror", "mlogloss"], "num_class": conf.gr_size}
        # xgb.cv(params, train, num_boost_round=1000, nfold=5, seed=conf.seed,
        #        early_stopping_rounds=40, verbose_eval=10)
        rf.fit(preds, train_df[t])
        with open(f"{t}_rf2.pkl", "wb") as f:
            joblib.dump(rf, f)
Code Example #31
File: naive_bayes.py Project: heolin123/naive_bayes
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--input_file', help="Input csv file with data.")
    args = parser.parse_args()

    dataset = utils.load_csv(args.input_file)
    train_set, test_set = corpus.split_dataset(dataset, 0.67)
    separated = corpus.separate_by_class(train_set)
    summaries = corpus.summarize_by_class(train_set)
    predictions = predict_set(summaries, test_set)
    accuracy = utils.get_accuracy(test_set, predictions)
    print('Accuracy: {0}%'.format(accuracy))
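utils.get_accuracy presumably compares the predicted labels against the class stored in each test row; a minimal sketch under the assumption that the label sits in the last column:

def get_accuracy(test_set, predictions):
    # Hypothetical accuracy helper: percentage of rows whose last column matches the prediction.
    correct = sum(1 for row, pred in zip(test_set, predictions) if row[-1] == pred)
    return 100.0 * correct / len(test_set)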
Code Example #33
def main():
    db = "{}/data/db".format(environ["WD"])
    if not exists(db):
        con = connect(db)
        data = \
            pipe( load_csv()
                , rename_columns
                , clean_date
                )
        data.to_sql(name="data", con=con)
    else:
        print("data already compiled to {}".format(db))
Code Example #34
def main():
  import sys
  from utils import load_csv, get_stencil_num

  raw_data = load_csv(sys.argv[1])

  k_l = set()
  for k in raw_data:
    k_l.add((get_stencil_num(k), k['Global NX']))
  k_l = list(k_l)

  bsz_l = set()
  for k in raw_data:
    if k['Multi-wavefront updates']=='0': continue
    bsz_l.add(k['Multi-wavefront updates'])
  bsz_l = sorted(list(bsz_l))

  for k, N in k_l:
    for bsz in bsz_l:
      gen_res(raw_data, int(k), int(bsz), int(N))
Code Example #35
def main():
  import sys
  from ics_utils import get_stencil_num
  from utils import load_csv

  raw_data = load_csv(sys.argv[1])

  k_l = set()
  for k in raw_data:
    k_l.add(get_stencil_num(k))
  k_l = list(k_l)

  n_l = set()
  for k in raw_data:
    n_l.add(k['Global NX'])
  n_l = list(n_l)


  for k in k_l:
    for N in n_l:
      gen_res(raw_data, int(k), int(N))
Code Example #36
File: fe_extraction.py Project: johnarevalo/cnn-bcdr
def get_feats_from_csv_in_partitions():
    """
    Extract the original features distributed with the dataset. The features
    are split according to the config.yaml file.
    """
    conf = utils.get_config()
    rows = [row for row in utils.load_csv() if utils.check_filter(row, conf['filters'])]
    train_rows, valid_rows, test_rows = utils.split_dataset(
        rows, conf['valid_percent'], conf['test_percent'], rng=conf['rng_seed'])
    X_train, y_train, X_valid, y_valid, X_test, y_test = [], [], [], [], [], []
    prefixes = ['t_', 'i_', 's_']  # Feature names' prefixes
    datasets = [(X_train, y_train, train_rows),
                (X_test, y_test, test_rows), (X_valid, y_valid, valid_rows)]
    out = []
    for X, y, rows in datasets:
        for row in rows:
            X.append(
                [float(v) for k, v in row.iteritems() if len(filter(k.startswith, prefixes)) > 0])
            y.append(int(row['classification'] == 'Malign'))
        out.extend((np.asarray(X), np.asarray(y)))
    return out
Code Example #37
def main():
  import sys
  from utils import select_fields, load_csv
  raw_data = load_csv(sys.argv[1])

  stencil='7_pt_const'
  rows = [
           {'Thread group size':'0' , 'Stencil Kernel coefficients':'constant', 'Stencil Kernel semi-bandwidth':'1', 'OpenMP Threads':'6' },
           {'Thread group size':'1' , 'Stencil Kernel coefficients':'constant', 'Stencil Kernel semi-bandwidth':'1', 'OpenMP Threads':'10'},
           {'Thread group size':'2' , 'Stencil Kernel coefficients':'constant', 'Stencil Kernel semi-bandwidth':'1', 'OpenMP Threads':'10'},
           {'Thread group size':'5' , 'Stencil Kernel coefficients':'constant', 'Stencil Kernel semi-bandwidth':'1', 'OpenMP Threads':'10'},
           {'Thread group size':'10', 'Stencil Kernel coefficients':'constant', 'Stencil Kernel semi-bandwidth':'1', 'OpenMP Threads':'10'}
         ]
  create_table(raw_data, rows, stencil)


  stencil='7_pt_var'
  rows = [
           {'Thread group size':'0' , 'Stencil Kernel coefficients':'variable no-symmetry', 'Stencil Kernel semi-bandwidth':'1', 'OpenMP Threads':'6'},
           {'Thread group size':'1' , 'Stencil Kernel coefficients':'variable no-symmetry', 'Stencil Kernel semi-bandwidth':'1', 'OpenMP Threads':'8'},
           {'Thread group size':'2' , 'Stencil Kernel coefficients':'variable no-symmetry', 'Stencil Kernel semi-bandwidth':'1', 'OpenMP Threads':'10'},
           {'Thread group size':'5' , 'Stencil Kernel coefficients':'variable no-symmetry', 'Stencil Kernel semi-bandwidth':'1', 'OpenMP Threads':'10'},
           {'Thread group size':'10', 'Stencil Kernel coefficients':'variable no-symmetry', 'Stencil Kernel semi-bandwidth':'1', 'OpenMP Threads':'10'}
         ]
  create_table(raw_data, rows, stencil)


  stencil='25_pt_var'
  rows = [
           {'Thread group size':'0' , 'Stencil Kernel coefficients':'variable axis-symmetric', 'Stencil Kernel semi-bandwidth':'4', 'OpenMP Threads':'8' },
           {'Thread group size':'1' , 'Stencil Kernel coefficients':'variable axis-symmetric', 'Stencil Kernel semi-bandwidth':'4', 'OpenMP Threads':'7'},
           {'Thread group size':'2' , 'Stencil Kernel coefficients':'variable axis-symmetric', 'Stencil Kernel semi-bandwidth':'4', 'OpenMP Threads':'8'},
           {'Thread group size':'5' , 'Stencil Kernel coefficients':'variable axis-symmetric', 'Stencil Kernel semi-bandwidth':'4', 'OpenMP Threads':'10'},
           {'Thread group size':'10', 'Stencil Kernel coefficients':'variable axis-symmetric', 'Stencil Kernel semi-bandwidth':'4', 'OpenMP Threads':'10'}
         ]
  create_table(raw_data, rows, stencil)
Code Example #38
File: eval.py Project: johnarevalo/cnn-bcdr
        X[i] = features[np.nonzero(int(row['segmentation_id']) == segm_ids)][0]
        y[i] = utils.is_positive(row)
    return X, y


rng = [2014, 12, 5]
rng = make_np_rng(None, rng, which_method='uniform')
scale_feats = True
n_runs = 20
C_range = 10.0 ** np.arange(-8, 8)
train_scores = np.zeros((n_runs, len(C_range)))
valid_scores = np.zeros((n_runs, len(C_range)))
fit_threshold = True
conf_file = sys.argv[1] if len(sys.argv) > 1 else None
conf = utils.get_config(conf_file)
features = np.empty([len(utils.load_csv()), 0])
#f_list = ['hcfeats', 'imnet', 'cnn']
f_list = ['cnn']

if 'imnet' in f_list:
    rows = utils.load_csv()
    feats, y = fe_extraction.get_feats_from_imagenet(rows)
    features = np.hstack((features, feats))
    segm_ids = np.asarray([int(row['segmentation_id']) for row in rows])
if 'hcfeats' in f_list:
    rows = utils.load_csv(conf['csv_features_file'])
    feats, y = fe_extraction.get_feats_from_csv(
        rows, prefixes=['s_', 't_', 'i_'])
    feats = np.asarray(feats)
    features = np.hstack((features, feats))
    segm_ids = np.asarray([int(row['segmentation_id']) for row in rows])