Example no. 1
def concat_english_spanish_vocab():
    english_vocab = read_data("../input/english_word_vocabs.txt")
    spanish_vocab = read_data("../input/words.txt")

    vocabs = spanish_vocab + english_vocab

    save_data(path='../input/processing/multi_task_learn/all_vocab.txt',
              data=vocabs)
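The read_data and save_data helpers called throughout Examples 1-7 are not part of this listing. Below is a minimal sketch consistent with the call sites above, assuming read_data returns the file's lines and save_data writes an iterable of already newline-terminated lines; the bodies are assumptions, not the original implementation.

def read_data(path):
    # Assumed helper: return the file's lines with newlines preserved.
    with open(path, encoding='utf-8') as f:
        return f.readlines()


def save_data(path, data):
    # Assumed helper: write an iterable of newline-terminated lines to path.
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(data)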
Example no. 2
def split_train_valid(split_rate, is_use_real=False):
    train_file = config.spanish_train_path
    train_data_file = '../input/processing/train_data.txt'
    valid_data_file = '../input/processing/valid_data.txt'

    train = read_data(train_file)
    # random.shuffle(train)
    if is_use_real:
        valid_size = 1400

        train_data = train[:-valid_size]
        valid_data = train[-valid_size:]

        random.shuffle(train_data)

        save_data(train_data_file, train_data)
        save_data(valid_data_file, valid_data)

    else:
        valid_size = int(split_rate * len(train))
        train_data = train[:-valid_size]
        valid_data = train[-valid_size:]

        # random.shuffle(train_data)

        save_data(train_data_file, train_data)
        save_data(valid_data_file, valid_data)
Example no. 3
def split_with_overlap(ratio, ecfp_tr, ic50_tr, root_dir="", overlap=1000):
    print("Split with %d overlap." % overlap)
    split_path = root_dir + "data_2_split/"

    training_sample_num = ecfp_tr.shape[0]
    one_part = int(training_sample_num / (ratio + 1))
    samples_user_1 = ratio * one_part
    samples_user_2 = one_part
    print("Number of training samples user-1: %d" % (samples_user_1))
    print("Number of training samples user-2: %d" % (samples_user_2))

    shuffled_idx = np.array(range(training_sample_num))
    np.random.shuffle(shuffled_idx)
    user_1 = shuffled_idx[:samples_user_1]
    user_2 = shuffled_idx[-samples_user_2:]

    user_1_train_size = int(0.8 * samples_user_1)
    user_2_train_size = int(0.8 * samples_user_2)

    user_1_train = user_1[:user_1_train_size]
    user_1_test = user_1[user_1_train_size:]
    user_2_train = user_2[:user_2_train_size]
    user_2_test = user_2[user_2_train_size:]

    # T_total - total number of targets (label columns)
    T_total = ic50_tr.shape[1]
    # number of disjoint targets assigned exclusively to each user
    num_disjunct = (T_total - overlap) // 2

    print("%d disjoint and %d overlapping labels (total: %d)" %
          (num_disjunct, overlap, T_total))

    shuffled_labels = np.array(range(T_total))
    np.random.shuffle(shuffled_labels)
    common_labels = shuffled_labels[:overlap]
    if num_disjunct == 0:
        user_1_labels, user_2_labels = common_labels, common_labels
    else:
        user_1_labels = np.concatenate(
            (common_labels, shuffled_labels[overlap:overlap + num_disjunct]),
            axis=None)
        user_2_labels = np.concatenate(
            (common_labels, shuffled_labels[-num_disjunct:]), axis=None)

    u1_ic50_tr = ic50_tr.tocsc()[:, user_1_labels].tocsr()
    u2_ic50_tr = ic50_tr.tocsc()[:, user_2_labels].tocsr()

    ## save user-1 data
    du.save_data(split_path + "0_train/", ecfp_tr[user_1_train],
                 u1_ic50_tr[user_1_train])
    du.save_data(split_path + "0_test/", ecfp_tr[user_1_test],
                 u1_ic50_tr[user_1_test])

    ## save user-2 data
    du.save_data(split_path + "1_train/", ecfp_tr[user_2_train],
                 u2_ic50_tr[user_2_train])
    du.save_data(split_path + "1_test/", ecfp_tr[user_2_test],
                 u2_ic50_tr[user_2_test])
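The du.save_data call in Example 3 is likewise not shown; it receives a directory, a slice of the ECFP feature matrix, and a slice of the (sparse) IC50 label matrix. A plausible minimal sketch, assuming SciPy sparse inputs and .npz output files; the directory layout and file names are guesses.

import os

import scipy.sparse as sp


def save_data(dir_path, x, y):
    # Hypothetical sketch: persist one user's feature and label matrices under dir_path.
    os.makedirs(dir_path, exist_ok=True)
    sp.save_npz(os.path.join(dir_path, 'x.npz'), sp.csr_matrix(x))
    sp.save_npz(os.path.join(dir_path, 'y.npz'), sp.csr_matrix(y))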
Example no. 4
def de_duplicate():
    filename = '../input/processing/spanish_train.txt'
    data = read_data(filename)
    duplicates = []
    new_data = []
    help_set = set()
    for line in data:
        # Drop the leading label character so duplicates are matched on content only.
        new_line = line[1:]
        if new_line in help_set:
            duplicates.append(line)
        else:
            new_data.append(line)
            help_set.add(new_line)

    save_data(
        '../input/processing/dumplicate_spanish_data.txt',
        duplicates,
    )
    save_data('../input/processing/spanish_train_dedup.txt', new_data)
Example no. 5
def tenfold():
    """split train data for 10 fold"""
    base_path = '../input/processing/'

    train_file = base_path + 'train.txt'

    save_path = base_path + '10fold'
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # split data into ten folds
    train = read_data(train_file)
    size = len(train)
    one_part_size = int(0.1 * size)
    random.shuffle(train)

    for i in range(10):
        save_file_path = "{}/train_{}.txt".format(save_path, i)
        if i < 9:
            save_data(save_file_path,
                      train[i * one_part_size:(i + 1) * one_part_size])
        else:
            save_data(save_file_path, train[i * one_part_size:])
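tenfold only writes the ten parts to disk; a sketch of how they could be recombined into one cross-validation split, reusing the read_data helper assumed above (load_fold is a hypothetical name, not from the source):

def load_fold(save_path, valid_fold, n_folds=10):
    # Hypothetical helper: use one saved part as validation and the rest as training.
    valid = read_data('{}/train_{}.txt'.format(save_path, valid_fold))
    train = []
    for i in range(n_folds):
        if i != valid_fold:
            train.extend(read_data('{}/train_{}.txt'.format(save_path, i)))
    return train, valid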
Example no. 6
def five_fold():
    """split train data for 10 fold"""
    base_path = '../input/processing/'

    train_file = config.spanish_train_path

    save_path = base_path + '5fold'
    if not os.path.exists(save_path):
        os.makedirs(save_path)

    # split data into five folds
    train = read_data(train_file)

    size = len(train)
    one_part_size = int(0.2 * size)
    # random.shuffle(train)

    for i in range(5):
        save_file_path = "{}/train_{}.txt".format(save_path, i)
        if i < 4:
            save_data(save_file_path,
                      train[i * one_part_size:(i + 1) * one_part_size])
        else:
            save_data(save_file_path, train[i * one_part_size:])
Example no. 7
def processing_data_1_step():
    """对数据集进行切分,先分成英文部分,西班牙文部分,不shuffle"""
    process_base_path = '../input/processing/'
    base_path = '../input/'

    if not os.path.exists(process_base_path):
        os.makedirs(process_base_path)

    train_en = base_path + 'cikm_english_train_20180516.txt'
    train_sp = base_path + 'cikm_spanish_train_20180516.txt'
    unlabel_data = base_path + 'cikm_unlabel_spanish_train_20180516.txt'

    english_file = process_base_path + 'english.txt'
    spanish_file = process_base_path + 'spanish.txt'
    unlabel_file = process_base_path + 'unlabel_spanish.txt'
    test_file = process_base_path + 'test_b_no_process.txt'

    # Split english_train into its English and Spanish sentences
    en_train = read_data(train_en)
    ens = []
    sps = []
    for line in en_train:
        line = line.strip()
        line_arr = line.split('\t')
        ens.append('{}\t{}\t{}\n'.format(line_arr[4], line_arr[0],
                                         line_arr[2]))
        sps.append('{}\t{}\t{}\n'.format(line_arr[4], line_arr[1],
                                         line_arr[3]))

    sp_train = read_data(train_sp)

    for line in sp_train:
        # line = punctiation(line)
        line = line.strip()
        line_arr = re.split('\t', line)
        sps.append('{}\t{}\t{}\n'.format(line_arr[4], line_arr[0],
                                         line_arr[2]))
        ens.append('{}\t{}\t{}\n'.format(line_arr[4], line_arr[1],
                                         line_arr[3]))

    # Save the separated english and spanish files
    save_data(english_file, data=ens)
    save_data(spanish_file, data=sps)

    print('Preprocessing test data; all labels are set to 0')
    test_path = base_path + 'cikm_test_b_20180730.txt'
    test = read_data(test_path)
    sps = []
    for line in test:
        line = line.strip()
        line_arr = re.split('\t', line)
        sps.append('{}\t{}\t{}\n'.format(0, line_arr[0], line_arr[1]))
    save_data(test_file, sps)

    print('Done')
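The indexing in processing_data_1_step implies that each raw training line carries five tab-separated fields. The layout below is inferred from the indices used above (an assumption, not verified against the data):

# Assumed layout of one line in cikm_english_train_20180516.txt:
#   line_arr[0]  English sentence 1
#   line_arr[1]  Spanish translation of sentence 1
#   line_arr[2]  English sentence 2
#   line_arr[3]  Spanish translation of sentence 2
#   line_arr[4]  similarity label
# In cikm_spanish_train_20180516.txt the Spanish sentences occupy columns 0 and 2
# and the English sentences columns 1 and 3, which is why the loops swap them.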
Example no. 8
# encoding:utf-8
from utils.log_utils import init_logger
from utils.data_utils import save_data, json2text, get_data, write_text
from config.basic_config import configs
import codecs
import numpy as np

if __name__ == "__main__":
    logger = init_logger(log_name='preprocess', log_dir=configs['log_dir'])
    # Save the data splits
    save_data(configs['all_data_path'], configs['train_data_path'],
              configs['test_data_path'], configs['val_data_path'])
    with codecs.open(configs['all_data_path']) as f:
        content = f.readlines()
    length_list = []
    for line in content:
        line = line.strip()
        if len(line) == 0:
            continue
        length_list.append(len(line))
    # print(length_list)
    max_length = int(np.percentile(length_list, 95))
    logger.info(f"max length: {max_length}")
    json2text(configs['test_data_path'], configs['ptest_x_path'], key='text')
    train_data = get_data(configs['train_data_path'])
    val_data = get_data(configs['val_data_path'])
    train_collections = []
    val_collections = []
    for item in train_data:
        tags = ['O' for _ in range(len(item['text']))]
        label = item['label']
Example no. 9
    with open(file_path) as f:
        df = pd.read_csv(f)

    return df


if __name__ == '__main__':
    feature_path = "../data/features"
    label_path = "../data/labels"

    # files_dir = r"E:\机器学习\数据集\音频\whale_data\Audio\train"
    # files = os.listdir(files_dir)
    # files.sort(key=lambda x: int(re.match(r"train(\d+).wav", x).group(1)))
    # features = []
    # for file in tqdm(files):
    #     path = os.path.join(files_dir, file)
    #     feature = wavfile_to_examples(path)
    #     feature = np.reshape(
    #         feature, (vggish_params.NUM_FRAMES, vggish_params.NUM_BANDS))
    #     features.append(feature)
    #
    # data_utils.save_data(np.array(features), feature_path)

    df = read_csv(r'E:\机器学习\数据集\音频\whale_data\labels.csv')

    labels = []
    for label in df.label:
        labels.append(label)

    data_utils.save_data(np.array(labels), label_path)
Example no. 10
def save_stats(path, stats_df):
    utils.ensure_path_exist(path)
    data_utils.save_data(stats_df, file_path=path)
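Examples 9-16 call a different save_data, data_utils.save_data(obj, file_path=...), paired with data_utils.load_data in Example 14. A minimal pickle-based sketch of that pair and of utils.ensure_path_exist, consistent with the call sites above but with an assumed serialization format:

import os
import pickle


def ensure_path_exist(path):
    # Assumed helper: create the parent directory of path if it is missing.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def save_data(obj, file_path):
    # Assumed helper: pickle obj to file_path.
    ensure_path_exist(file_path)
    with open(file_path, 'wb') as f:
        pickle.dump(obj, f)


def load_data(file_path):
    # Assumed helper: load a previously pickled object.
    with open(file_path, 'rb') as f:
        return pickle.load(f)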
Example no. 11
def processing_data_2_step():
    process_base_path = '../input/processing/'
    english_file = process_base_path + 'english.txt'
    spanish_file = process_base_path + 'spanish.txt'
    test_file = process_base_path + 'test_b_no_process.txt'

    english_core_nlp = StanfordCoreNLP(core_nlp_path, lang='en')

    with StanfordCoreNLP(core_nlp_path, lang='es') as client:

        english_processing_file = process_base_path + 'english_train.txt'
        spanish_processing_file = process_base_path + 'spanish_train.txt'
        test_processing_file = process_base_path + 'test_b.txt'
        #
        englishs = read_data(english_file)
        spanishs = read_data(spanish_file)

        english_processing = []
        spanish_processing = []

        # for english

        for line in tqdm(englishs):
            lines = line.strip().split('\t')
            assert len(lines) == 3
            lines[1] = text_processing_english(lines[1], english_core_nlp)
            lines[2] = text_processing_english(lines[2], english_core_nlp)

            english_processing.append("{}\t{}\t{}\n".format(
                lines[0], lines[1], lines[2]))

        save_data(english_processing_file, english_processing)

        # for spanish
        for line in tqdm(spanishs):
            line = line.strip().split('\t')
            assert len(line) == 3, line
            line[1] = text_processing_spanish(line[1], client)
            line[2] = text_processing_spanish(line[2], client)

            spanish_processing.append("{}\t{}\t{}\n".format(
                line[0], line[1], line[2]))

        save_data(spanish_processing_file, spanish_processing)
        #
        # for test data
        test = read_data(test_file)
        test_processing = []
        for line in tqdm(test):
            # ipdb.set_trace()
            line = line.strip().split('\t')
            assert len(line) == 3
            line[2] = text_processing_spanish(line[2], client)
            line[1] = text_processing_spanish(line[1], client)

            test_processing.append("{}\t{}\t{}\n".format(
                line[0], line[1], line[2]))

        save_data(test_processing_file, test_processing)

    english_core_nlp.close()

    print('Done')
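text_processing_english and text_processing_spanish are also not shown. A minimal sketch of what such a helper could look like, assuming the stanfordcorenlp client API and a simple tokenize-and-lowercase scheme; the real preprocessing may do more (punctuation handling, lemmatization, etc.):

def text_processing_spanish(text, nlp_client):
    # Hypothetical sketch: tokenize with the CoreNLP client and lowercase.
    tokens = nlp_client.word_tokenize(text)
    return ' '.join(tokens).lower()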
Example no. 12
        random_states = np.zeros([n_random] + list(states.shape[1:]),
                                 dtype=np.float32)
        random_actions = np.zeros(n_random, dtype=np.int8) + 9

        random.shuffle(images)

        for idx in range(len(images[:n_random])):
            try:
                image = Image.open(images[idx])
            except Exception:
                print('Failed on %s' % images[idx])
                os.remove(images[idx])
                continue
            image = image.convert('RGB')

            # The same frame is processed three times; only the last processed
            # observation is stored as the random state.
            processor.process_observation(np.array(image))
            processor.process_observation(np.array(image))
            random_states[idx] = processor.process_observation(np.array(image))

        states = np.concatenate((states, random_states))
        robot_actions = np.concatenate((robot_actions, random_actions))
        human_actions = np.concatenate((human_actions, random_actions))

        permutation = np.random.permutation(states.shape[0])
        states = states[permutation]
        robot_actions = robot_actions[permutation]
        human_actions = human_actions[permutation]

        save_data(mixed_data_file, states, robot_actions, human_actions)
        print('Saved mixed states to %s\n' % mixed_data_file)
Example no. 13
    def to_file(self, path):
        utils.ensure_path_exist(path)
        data_utils.save_data(self, file_path=path)
Example no. 14
        new_labeled_idices = np.squeeze(new_labeled_idices)
        self._data_l = np.concatenate(
            [self._data_l, self._data_u[new_labeled_idices]])
        np.random.shuffle(self._data_l)  # shuffle new labeled data.
        self._data_u = np.delete(self._data_u, new_labeled_idices, axis=0)

        if self._embeddings is not None:
            # Update embedded data cache.
            self._embedded_data_l = np.concatenate([
                self._embedded_data_l,
                self._embedded_data_u[new_labeled_idices]
            ])
            np.random.shuffle(self._embedded_data_l)  # shuffle new labeled data.
            self._embedded_data_u = np.delete(self._embedded_data_u,
                                              new_labeled_idices,
                                              axis=0)

    def to_file(self, path):
        utils.ensure_path_exist(path)
        data_utils.save_data(self, file_path=path)


if __name__ == '__main__':
    _features = data_utils.load_data('../data/features')
    _labels = data_utils.load_data('../data/labels')

    pairs = [(f, l) for f, l in zip(_features, _labels)]

    data_utils.save_data(
        np.array(pairs, dtype=[('feature', np.ndarray), ('label', int)]),
        '../data/pairs')
Example no. 15
    for i in range(len(features)):
        features[i, :, :, 1] = librosa.feature.delta(features[i, :, :, 0])

    return np.array(features), np.array(labels, dtype=int)


if __name__ == '__main__':
    _features, _labels = extract_features(
        parent_dir=r'E:\机器学习\数据集\音频\speech_commands_v0.01',
        dir_label_dic={
            'zero': 0,
            'one': 1,
            'two': 2,
            'three': 3,
            'four': 4,
            'five': 5,
            'six': 6,
            'seven': 7,
            'eight': 8,
            'nine': 9
        },
        file_ext='*.wav',
        bands=60,
        frames=41)

    _pairs = [(f, l) for f, l in zip(_features, _labels)]
    data_utils.save_data(obj=np.array(_pairs,
                                      dtype=[('feature', np.ndarray),
                                             ('label', int)]),
                         file_path='../data/speech_command/pairs')
Example no. 16
            label = file_name.split('-')[1]
            for (start, end) in windows(sound_clip, window_size):
                if len(sound_clip[start:end]) == window_size:
                    signal = sound_clip[start:end]
                    melspec = librosa.feature.melspectrogram(y=signal, n_mels=bands)
                    logspec = librosa.amplitude_to_db(melspec)
                    logspec = logspec.T.flatten()[:, np.newaxis].T
                    log_specgrams.append(logspec)
                    labels.append(label)

    log_specgrams = np.asarray(log_specgrams).reshape(len(log_specgrams), bands, frames, 1)
    features = np.concatenate((log_specgrams, np.zeros(np.shape(log_specgrams))), axis=3)
    for i in range(len(features)):
        features[i, :, :, 1] = librosa.feature.delta(features[i, :, :, 0])

    return np.array(features), np.array(labels, dtype=int)


if __name__ == '__main__':
    _features, _labels = extract_features(
        parent_dir=r'E:\机器学习\数据集\音频\UrbanSound8K\audio',
        sub_dirs=['fold%d' % i for i in range(1, 11)],
        file_ext='*.wav',
        bands=60,
        frames=41)

    _pairs = [(f, l) for f, l in zip(_features, _labels)]
    data_utils.save_data(
        obj=np.array(_pairs, dtype=[('feature', np.ndarray), ('label', int)]),
        file_path='../data/urbansound8k/pairs')
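The windows generator used in Example 16 is not included; in this kind of audio pipeline it typically yields half-overlapping (start, end) index pairs over the signal, and the sketch below assumes exactly that behaviour:

def windows(data, window_size):
    # Hypothetical sketch: yield (start, end) index pairs with 50% overlap.
    start = 0
    while start < len(data):
        yield start, start + window_size
        start += window_size // 2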