# Example #1
# 0
def associate(real_world: bool, audio: bool) -> None:
    """Associate biometric feature clusters with WiFi-derived identities.

    Loads configuration, the per-meeting biometric feature vectors produced
    by ``feature_extraction`` and the WiFi sniffing attendance data, builds
    the POI/meeting and MAC/meeting attendance matrices, and hands everything
    to ``scan`` for the final cluster-to-person assignment.

    Args:
        real_world: use the real-world dataset when True, the simulated one
            otherwise.
        audio: use audio ('voice') biometrics when True, video otherwise.
    """
    project_dir = os.getcwd()
    cfg = utils.get_config(project_dir)
    bio_type = 'audio' if audio else 'video'
    # Resolve dataset layout and tuning parameters from the project config.
    data_path = os.path.join(
        project_dir, 'data',
        cfg['base_conf']['dataset'][utils.get_dataset(real_world, bio_type)])
    bio_data_path = os.path.join(data_path, cfg['base_conf']['biometric_data'])
    wifi_data_path = os.path.join(data_path, cfg['base_conf']['wifi_data'])
    wifi_thres = cfg['parameters']['wifi_threshold']
    meeting_thres = cfg['parameters']['meeting_threshold']
    cdist_metric = cfg['parameters']['cdist_metric']
    victims_thres = cfg['parameters']['estimated_victims']
    omega = cfg['parameters']['omega']

    # meeting_npy_paths, the npy path that contains feature vectors, after running feature_extraction
    # {'meeting_15': 'CrossLeak/data/audio_50vs20_100/bio_data/meeting_15/vecmeeting_15.npy'}}
    meeting_npy_paths = utils.get_meeting_and_path(bio_data_path, r'.+\.npy$')
    # {'meeting_51': 'CrossLeak/data/audio_50vs20_100/bio_data/meeting_51/segsmeeting_51.pk'}
    bio_path = utils.get_meeting_and_path(bio_data_path, r'.+segs.+\.pk$')
    # Every meeting with feature vectors must also have a segment-path pickle.
    assert len(meeting_npy_paths.keys()) == len(bio_path.keys())

    # get participants' names, given the corresponding WiFi sniffing files (wifi_thres is the cutoff rss threshold)
    meeting_people_name = utils.get_meeting_people_name(
        wifi_data_path, real_world, r'.+\.pk$', wifi_thres)

    # load and map poi name and mac address, respectively
    poi_name_mac = {}
    meeting_poi_name = collections.defaultdict(list)
    if real_world:
        # Real-world: the config supplies explicit (mac, name) pairs.
        for mac, poi in cfg['mac_name']:
            if poi not in poi_name_mac:
                poi_name_mac[poi] = mac
            else:
                logging.warning(
                    'duplicate mac address of {} with {} and {}'.format(
                        poi, poi_name_mac[poi], mac))
    else:
        # Simulation: POI/non-POI membership comes from pickled name lists and
        # a person's "MAC" is simply their name.
        poi = pickle.load(open(os.path.join(bio_data_path, 'POIs.pk'), 'rb'))
        non_poi = pickle.load(
            open(os.path.join(bio_data_path, 'nonPOIs.pk'), 'rb'))
        for peo in poi:
            poi_name_mac[peo] = peo
        for meeting, peos in meeting_people_name.items():
            for peo in peos:
                if peo in poi:
                    meeting_poi_name[meeting].append(peo)
                elif peo not in non_poi:
                    logging.warning('{} not in poi nor non_poi'.format(peo))

    # NOTE(review): meeting_poi_name is only populated in the simulated
    # (not real_world) branch above; when real_world is True it stays empty,
    # leaving poi_people empty below — confirm the real-world flow is handled.
    # remove filter people
    excluded_peo = cfg['filter_people'][bio_type]
    for meeting, people_names in meeting_poi_name.items():
        meeting_poi_name[meeting] = [
            poi for poi in people_names if poi not in excluded_peo
        ]

    # flatten to get unique poi name
    poi_people = sorted({
        name
        for people_names in meeting_poi_name.values() for name in people_names
    })
    logging.info('poi people: {}'.format(poi_people))

    meeting_name = sorted(meeting_poi_name.keys())
    meeting_num = len(meeting_name)
    # meeting_thres == -1 means "use every meeting".
    if meeting_thres == -1:
        meeting_thres = meeting_num

    # get meeting index by meeting name
    meeting_index = dict(zip(meeting_name, list(range(meeting_num))))
    poi_num = len(poi_people)

    # context_infor[m, p] == 1 iff POI p attended meeting m (per WiFi data).
    context_infor = np.zeros([meeting_num, poi_num]).astype(np.float64)
    for meeting, people_names in meeting_poi_name.items():
        for poi in people_names:
            context_infor[meeting_index[meeting], poi_people.index(poi)] = 1

    # Stack every meeting's feature matrix into one (N, dim) array; bio_paths
    # keeps the matching per-row file paths in the same order.
    bio_info = np.empty([0, cfg['pre_process'][bio_type]['dimension']])
    bio_paths = []
    for meeting in bio_path.keys():
        with open(bio_path[meeting], 'rb') as f:
            relative_bio_paths = pickle.load(f)
            absolute_bio_paths = [
                os.path.join(bio_data_path, p) for p in relative_bio_paths
            ]
            bio_paths.extend(absolute_bio_paths)
            # iou vec is the face / voice features via feature extractor
            iou_vec = np.load(meeting_npy_paths[meeting])
            # NOTE(review): bare except — a shape mismatch is only logged and
            # the meeting's features are silently dropped, which would break
            # the bio_paths/bio_info alignment asserted below. Confirm.
            try:
                bio_info = np.vstack((bio_info, iou_vec))
            except:
                logging.error('error in numpy vstack: {}'.format(meeting))

    # construct mac attendance vector
    real_mac_attendance = defaultdict(
        lambda: [0] * min(meeting_num, meeting_thres))
    if not real_world:
        # simply use people name to represent MAC address
        meeting_people_mac = utils.get_meeting_people_name(
            wifi_data_path, real_world, r'.+\.pk$', wifi_thres)

    # NOTE(review): meeting_people_mac is only assigned in the
    # `if not real_world` branch above; when real_world is True this loop
    # raises NameError — verify the real-world path defines it elsewhere.
    for meeting, macs in meeting_people_mac.items():
        for mac in macs:
            real_mac_attendance[mac][meeting_index[meeting]] = 1

    # use mac_index to get the index of mac address in mac_attendance
    mac_index = {}
    poi_mac_attendance = []
    cnt = 0
    for poi in poi_people:
        if poi in poi_name_mac and poi_name_mac[poi] in real_mac_attendance:
            # divide real_mac_attendance into two sequences instead of random: first with poi and following non_poi.
            poi_mac_attendance.append(real_mac_attendance[poi_name_mac[poi]])
            mac_index[cnt] = poi_name_mac[poi]
            cnt += 1
            # Pop so that whatever remains afterwards is non-POI only.
            real_mac_attendance.pop(poi_name_mac[poi])
        else:
            # check if all poi has the name-mac mapping and poi attend at least one meeting
            logging.error(
                'poi {} do not have mac information or have not attended at least one meeting'
                .format(poi))
    assert len(real_mac_attendance) == 0

    # add the non poi part of mac_attendance
    non_poi_mac_attendance = []
    # NOTE(review): the assert above guarantees real_mac_attendance is empty
    # here, so the real_world loop below can never add anything — verify.
    if real_world:
        for mac, attendance in real_mac_attendance.items():
            mac_index[cnt] = mac
            cnt += 1
            non_poi_mac_attendance.append(attendance)
    else:
        # Simulated non-POIs: synthesize attendance from random RSSI draws
        # (normal with mean -60, sd 80), attending when above wifi_thres.
        non_poi = pickle.load(
            open(os.path.join(bio_data_path, 'nonPOIs.pk'), 'rb'))
        non_poi_mac_attendance = [[
            0 for _ in range(min(meeting_num, meeting_thres))
        ] for _ in range(len(non_poi))]
        for oos_idx in range(len(non_poi)):
            random_rssi = np.random.normal(-60, 80,
                                           min(meeting_num, meeting_thres))
            for meeting in range(min(meeting_num, meeting_thres)):
                if random_rssi[meeting] >= wifi_thres:
                    non_poi_mac_attendance[oos_idx][meeting] = 1

    # Rows: POIs first, then non-POIs (order recorded in mac_index).
    mac_attendance = np.concatenate(
        (poi_mac_attendance, non_poi_mac_attendance))

    if real_world:
        # remove always-on mac address, detected over 90% of all meetings, which may be router or something.
        always_on_index = np.where(
            mac_attendance.sum(axis=1) > 0.9 * mac_attendance.shape[1])[0]
        mac_attendance = np.delete(mac_attendance, always_on_index, axis=0)
        logging.info(
            'always on mac address no: {} of all meeting no: {}'.format(
                len(always_on_index), mac_attendance.shape[1]))
    logging.info('mac attendance len: {} with poi no: {}'.format(
        len(mac_attendance), len(poi_mac_attendance)))

    # concatenate features and attendance vector (ctx information)
    assert len(bio_paths) == len(bio_info)
    logging.info('bio_paths and bio_info len: {}'.format(len(bio_paths)))

    # get bio_features in valid meeting
    # The meeting name is encoded in the grand-parent folder of each path as
    # '<part0>_<part1>...' — only meetings below the threshold are kept.
    bio_features = []
    for bio_feature, path in zip(bio_info, bio_paths):
        parent = utils.get_parent_folder_name(path, 3)
        c = parent.split('_')
        if meeting_index['%s_%s' % (c[0], c[1])] < meeting_thres:
            bio_features.append(bio_feature)

    # event vector of MAC addresses
    # Transposed, threshold-trimmed copy of context_infor for scan().
    wifi_people_in_meetings = np.zeros(
        [poi_num, min(meeting_num, meeting_thres)])
    for i in range(poi_num):
        for name in meeting_name:
            if meeting_index[name] < meeting_thres:
                wifi_people_in_meetings[i,
                                        meeting_index[name]] = context_infor[
                                            meeting_index[name], i]
    # for i in range(poi_num):
    #     for name in meeting_name:
    #         if poi_people[i] in meeting_poi_name[name]:
    #             if meeting_index[name] < meeting_thres:
    #                 wifi_people_in_meetings[i, meeting_index[name]] = 1

    scan(bio_features, bio_paths, bio_info, real_world, cdist_metric,
         victims_thres, omega, meeting_num, meeting_thres, meeting_index,
         wifi_people_in_meetings, poi_people, project_dir, bio_data_path,
         audio)
def feature_extraction(real_world: bool) -> None:
    """Extract FaceNet embeddings per meeting and pre-cluster them.

    For every meeting folder under the data path: embeds each MTCNN-cropped
    face image with a pre-trained FaceNet model, averages embeddings that
    share the same IoU track id, saves the per-track vectors (``vec*.npy``)
    and their image paths (``pics*.pk``), then agglomeratively clusters the
    tracks into per-person ``classifier/<i>`` folders.

    Args:
        real_world: dataset flavor; forwarded to the utils helpers that parse
            file names and WiFi attendance.
    """
    # NOTE(review): unlike associate(), this uses the PARENT of the cwd as
    # project_dir — confirm the intended working directory for each entry.
    project_dir = os.path.dirname(os.getcwd())
    cfg = utils.get_config(project_dir)
    data_path = os.path.join(project_dir, cfg['base_conf']['data_path'])
    wifi_thres = cfg['pre_process']['wifi_threshold']

    if cfg['specs']['set_gpu']:
        os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
        os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
        os.environ["CUDA_VISIBLE_DEVICES"] = str(cfg['specs']['gpu_num'])

    meeting_poi_name = utils.get_meeting_poi_name(data_path, real_world,
                                                  wifi_thres)

    # remove meeting that no one attended
    for meeting, people_names in meeting_poi_name.items():
        if len(people_names) == 0:
            shutil.rmtree(os.path.join(data_path, meeting))

    # multiple runs
    # remove classifier and npy file generated last time
    for meeting in os.listdir(data_path):
        # NOTE(review): data_path is already absolute (joined with
        # project_dir above), so the extra project_dir here is redundant —
        # os.path.join discards everything before an absolute component.
        classifier_path = os.path.join(project_dir, data_path, meeting,
                                       'classifier')
        if os.path.exists(classifier_path):
            shutil.rmtree(classifier_path)
    meeting_npy_paths = utils.get_meeting_and_path(data_path, r'.+\.npy')
    for npy_path in meeting_npy_paths:
        if os.path.exists(meeting_npy_paths[npy_path]):
            os.remove(meeting_npy_paths[npy_path])

    # Absolute paths of every meeting directory.
    meeting_paths = [
        item for item in os.listdir(data_path)
        if os.path.isdir(os.path.join(data_path, item))
    ]
    meeting_paths = list(
        map(lambda x: os.path.join(data_path, x), meeting_paths))
    meeting_people_num = utils.get_meeting_poi_num(data_path, real_world,
                                                   wifi_thres)

    with tf.Graph().as_default():
        # NOTE(review): `config` below is built but never used — the Session
        # is created with a fresh ConfigProto carrying only gpu_options.
        config = tf.compat.v1.ConfigProto()
        config.gpu_options.visible_device_list = str(cfg['specs']['gpu_num'])
        gpu_options = tf.compat.v1.GPUOptions(
            per_process_gpu_memory_fraction=0.8)
        with tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(
                gpu_options=gpu_options)) as sess:
            # Load the frozen FaceNet graph and grab its I/O tensors.
            facenet.load_model(
                os.path.join(project_dir, 'models',
                             cfg['pre_process']['model_name']))
            images_placeholder = tf.compat.v1.get_default_graph(
            ).get_tensor_by_name("input:0")
            embeddings = tf.compat.v1.get_default_graph().get_tensor_by_name(
                "embeddings:0")
            phase_train_placeholder = tf.compat.v1.get_default_graph(
            ).get_tensor_by_name("phase_train:0")

            for meeting_path in meeting_paths:
                mtcnn_path = os.path.join(meeting_path,
                                          cfg['base_conf']['mtcnn_path'])
                pic_paths = os.listdir(mtcnn_path)
                # Sort frames by IoU track number so same-track images are
                # consecutive — the grouping pass below relies on this.
                pic_paths.sort(key=lambda x: utils.get_iou_num(x, real_world))
                meeting = utils.get_parent_folder_name(meeting_path, 1)
                image_list = []
                result = np.empty([0, cfg['pre_process']['dimension']])
                piece_num = cfg['pre_process']['piece_num']
                # Drop meetings with no cropped faces at all.
                if len(pic_paths) == 0:
                    shutil.rmtree(meeting_path)
                    continue
                for pic in pic_paths:
                    # verify() catches truncated/corrupt images up front;
                    # the error is logged but the image is still processed.
                    imm = Image.open(os.path.join(mtcnn_path, pic))
                    try:
                        imm.verify()
                    except Exception:
                        logging.error('invalid image: {}'.format(
                            os.path.join(mtcnn_path, pic)))
                    im = cv2.imread(os.path.join(mtcnn_path, pic))
                    im = cv2.resize(im, (160, 160))
                    prewhitened = facenet.prewhiten(im)
                    image_list.append(prewhitened)
                    # Run the network in batches of piece_num images.
                    if len(image_list) == piece_num:
                        images = np.stack(image_list)
                        feed_dict = {
                            images_placeholder: images,
                            phase_train_placeholder: False
                        }
                        emb = sess.run(embeddings, feed_dict=feed_dict)
                        result = np.vstack((result, emb))
                        image_list.clear()

                # Flush the final partial batch.
                if len(image_list) != 0:
                    # NOTE(review): bare except — if np.stack fails here,
                    # `images` keeps the value from the previous batch and
                    # stale embeddings get appended. Confirm this is intended
                    # as best-effort behavior.
                    try:
                        images = np.stack(image_list)
                    except:
                        logging.error('may not resize the image')
                    feed_dict = {
                        images_placeholder: images,
                        phase_train_placeholder: False
                    }
                    emb = sess.run(embeddings, feed_dict=feed_dict)
                    result = np.vstack((result, emb))

                features = result.tolist()

                # Average all embeddings that share one IoU track id into a
                # single vector; iou_pic_path keeps the member image paths.
                pre_iou_num = utils.get_iou_num(pic_paths[0], real_world)
                same_iou_vec = np.empty([0, cfg['pre_process']['dimension']])
                iou_vec = []
                iou_pic_path = []
                single_iou_path = []
                for name, vec in zip(pic_paths, features):
                    iou_num = utils.get_iou_num(name, real_world)
                    if iou_num == pre_iou_num:
                        same_iou_vec = np.vstack((same_iou_vec, vec))
                        single_iou_path.append(
                            os.path.join(meeting_path, 'mtcnn', name))
                    else:
                        # Track boundary: close out the previous track and
                        # start accumulating the new one.
                        iou_vec.append(np.mean(same_iou_vec, axis=0))
                        iou_pic_path.append(single_iou_path)
                        single_iou_path = []
                        same_iou_vec = np.empty(
                            [0, cfg['pre_process']['dimension']])
                        same_iou_vec = np.vstack((same_iou_vec, vec))
                        single_iou_path.append(
                            os.path.join(meeting_path, 'mtcnn', name))
                        pre_iou_num = iou_num
                # Close out the trailing track.
                if len(single_iou_path) != 0:
                    iou_vec.append(np.mean(same_iou_vec, axis=0))
                    iou_pic_path.append(single_iou_path)

                # Persist per-track paths and vectors for associate().
                with open(os.path.join(meeting_path, 'pics%s.pk' % meeting),
                          'wb') as f:
                    pickle.dump(iou_pic_path, f)
                np.save(os.path.join(meeting_path, 'vec%s.npy' % meeting),
                        iou_vec)

                # One classifier subfolder per expected attendee.
                classifier_path = os.path.join(meeting_path, 'classifier')
                if os.path.exists(classifier_path):
                    shutil.rmtree(classifier_path)
                os.mkdir(classifier_path)
                # NOTE(review): keyed with str(meeting) here but with the raw
                # `meeting` value below at the cluster-count lookup — confirm
                # both key forms exist in meeting_people_num.
                for i in range(meeting_people_num[str(meeting)]):
                    os.makedirs(os.path.join(classifier_path, str(i)))

                # Clustering needs at least 2 tracks.
                if len(iou_vec) == 0:
                    continue
                if len(iou_vec) < 2:
                    continue
                # Cap cluster count at the number of attendees.
                if len(iou_vec) > meeting_people_num[meeting]:
                    cluster_number = meeting_people_num[meeting]
                else:
                    cluster_number = len(iou_vec)
                kmeans = AgglomerativeClustering(
                    n_clusters=cluster_number, linkage='average').fit(iou_vec)

                # Copy one representative image per track into its assigned
                # cluster folder.
                index = 0
                for d in kmeans.labels_:
                    name = utils.get_parent_folder_name(
                        iou_pic_path[index][0], 1)
                    shutil.copyfile(
                        iou_pic_path[index][0],
                        os.path.join(meeting_path, 'classifier', str(d), name))
                    index += 1
# Example #3
# 0
def scan(bio_features, bio_paths, bio_info, real_world, cdist_metric,
         victims_thres, omega, meeting_num, meeting_thres, meeting_index,
         wifi_people_in_meetings, poi_people, project_dir, bio_data_path,
         audio):
    """Assign hierarchical biometric clusters to WiFi identities via an ILP.

    Builds an average-linkage dendrogram over ``bio_features``, keeps only
    the top fraction of nodes as candidate clusters, computes a
    cluster-vs-identity cost matrix (attendance-vector distance plus
    ``omega``-weighted cluster radius), and solves a PuLP binary program that
    picks exactly ``victims_thres`` cluster/identity pairs subject to
    one-per-row, one-per-column and dendrogram-hierarchy constraints.
    Finishes by evaluating the assignment against true labels and printing
    precision/purity statistics.

    Returns:
        (final_res, save_matrix, labeled): assigned-name -> per-meeting bio
        paths; assigned-name -> mean feature vector of its cluster; set of
        leaf indices covered by the chosen clusters.
    """
    logging.info(
        'begin SCAN with parameters: omega-{}, thres_item-{}, meeting_thres-{}'
        .format(omega, victims_thres, meeting_thres))

    linkage_mat = linkage(bio_features, method='average')
    root_node, nodelist = to_tree(linkage_mat, rd=True)

    # decide the starting position of useful IDs to avoid tiny clusters
    if real_world:
        frac_start = 0.995
    else:
        frac_start = 0.99
    start_node_idx = int(len(nodelist) * frac_start)

    # assign bio to useful nodes (clusters), avoiding meaningless association
    # Only dendrogram nodes with id >= start_node_idx are considered; each
    # gets a compact useful_id, the set of meetings its leaves came from
    # (node.bios / stat), and its member feature vectors (cluster_feature).
    stat = {}
    cluster_feature = {}
    useful_nodelist, useful_id = [], 0
    for node in nodelist:
        if node.id < start_node_idx:
            continue
        node.useful_id = useful_id
        node.bios = set()
        stat[useful_id] = set()
        cluster_feature[useful_id] = []
        for leaf in node.pre_order():
            bio_path = bio_paths[leaf]
            cluster_feature[useful_id].append(bio_info[leaf])
            # Meeting name is encoded in the grand-parent folder of the path.
            parent = utils.get_parent_folder_name(bio_path, 3)
            tem = parent.split('_')
            # node bios: {'04-03-11-00-00_04-03-14-00-00', '03-29-14-00-00_03-29-17-00-00'}
            node.bios.add('%s_%s' % (tem[0], tem[1]))
            stat[useful_id].add('%s_%s' % (tem[0], tem[1]))
        useful_nodelist.append(node)
        useful_id += 1

    # dict to save
    # res_stat[useful_id][meeting] collects the bio file paths of that
    # cluster's leaves, grouped by meeting.
    res_stat = {}
    for num in stat:
        res_stat[num] = {}
        for n in stat[num]:
            res_stat[num][n] = []

    # bio_people_in_meetings[cluster, meeting] == 1 iff the cluster has at
    # least one sample from that meeting (meetings past the threshold are
    # ignored).
    bio_people_in_meetings = np.zeros(
        [len(useful_nodelist),
         min(meeting_num, meeting_thres)])
    for node in useful_nodelist:
        for bio in node.bios:
            if meeting_index[bio] < meeting_thres:
                bio_people_in_meetings[node.useful_id, meeting_index[bio]] = 1
        for bio_idx in node.pre_order():
            bio_path = bio_paths[bio_idx]
            res_stat[node.useful_id][bio].append(bio_path)

    # log matrix shapes before building the cost matrix
    logging.info('bio_people_in_meetings shape: {}'.format(
        bio_people_in_meetings.shape))
    logging.info('wifi_people_in_meetings shape: {}'.format(
        wifi_people_in_meetings.shape))

    # get the association cost matrix
    dist_mat = cdist(bio_people_in_meetings,
                     wifi_people_in_meetings,
                     metric=cdist_metric)
    logging.info('dist_mat shape: {} with min: {} and max: {}'.format(
        dist_mat.shape, dist_mat.min(), dist_mat.max()))

    # PuLP Part
    rows = [str(i) for i in range(dist_mat.shape[0])]  # stands for bio_feature
    cols = [str(i) for i in range(dist_mat.shape[1])]  # stands for poi_people
    all_paths_to_leaves = utils.parse(to_tree(linkage_mat), [], [])
    # init
    prob = pulp.LpProblem("scan_opt", pulp.LpMinimize)
    assignment = pulp.LpVariable.dicts('assignment', (rows, cols),
                                       cat="Binary")
    # obj function
    # Cost of pairing cluster r with identity c: attendance distance plus the
    # cluster's merge height (dist) weighted by omega, penalizing loose
    # clusters.
    prob += pulp.lpSum([
        assignment[r][c] *
        (dist_mat[int(r)][int(c)] + omega * useful_nodelist[int(r)].dist)
        for r in rows for c in cols
    ])

    # enforce constraints for useful nodes (clusters) and IDs respectively
    for r in rows:
        prob += pulp.lpSum([assignment[r][c] for c in cols]) <= 1
    for c in cols:
        prob += pulp.lpSum([assignment[r][c] for r in rows]) <= 1

    # Exactly victims_thres pairs must be chosen in total.
    logging.info('using victims_thres: {}'.format(victims_thres))
    prob += pulp.lpSum([assignment[r][c] for r in rows
                        for c in cols]) == victims_thres

    # enforce the hierarchy relation constraints between nodes
    # At most one node may be selected along any root-to-leaf path, so chosen
    # clusters never overlap.
    for leaf_path in all_paths_to_leaves:
        prob += pulp.lpSum([
            assignment[str(int(r) - start_node_idx)][c] for r in leaf_path
            for c in cols if int(r) >= start_node_idx
        ]) <= 1

    # pulp solve
    prob.solve()
    logging.info(pulp.LpStatus[prob.status])

    # NOTE(review): .varValue is None if the solver produced no solution;
    # the comparisons below assume an optimal (or at least feasible) status.
    res_peo = []
    res_pic = []
    for r in rows:
        for c in cols:
            if assignment[r][c].varValue > 0:
                logging.debug('chose r: {}, c: {}, value: {}'.format(
                    r, c, assignment[r][c].varValue))
                res_peo.append(int(c))
                res_pic.append(int(r))

    # assign results
    nb_labelled_images = 0
    save_matrix = {}
    labeled = set()
    final_res = collections.OrderedDict()
    for r in rows:
        for c in cols:
            if assignment[r][c].varValue > 0:
                if int(c) < len(poi_people):
                    name = poi_people[int(c)]
                else:
                    name = 'non_poi_' + c
                final_res[name] = res_stat[int(r)]
                # Mean feature vector of the chosen cluster, keyed by name.
                save_matrix[name] = np.mean(np.array(cluster_feature[int(r)]),
                                            axis=0)
                nb_labelled_images += len(useful_nodelist[int(r)].pre_order())
                labeled.update(set(useful_nodelist[int(r)].pre_order()))

    # load true label
    true_label = utils.load_true_label(audio, real_world, bio_data_path)

    # peo_in_cluster[assigned_name][true_label] = sample count, used for the
    # precision / purity evaluation below.
    peo_in_cluster = defaultdict(dict)
    bio_in_cluster = np.zeros([len(final_res), meeting_num])
    bio_name = []
    if audio and real_world:
        # Real-world audio: true speaker is looked up by bisecting the
        # segment's start/end timestamps (taken from the file name) into the
        # meeting's labelled timestamp list.
        for peo, item in final_res.items():
            for full_pic in functools.reduce(operator.concat, item.values()):
                if full_pic.split('/')[-3] in meeting_index:
                    if peo not in bio_name:
                        bio_name.append(peo)
                    bio_in_cluster[bio_name.index(peo)][meeting_index[
                        full_pic.split('/')[-3]]] = 1
                if full_pic.split('/')[-3] in true_label:
                    index1 = bisect.bisect_left(
                        true_label[full_pic.split('/')[-3]]['timestamp'],
                        full_pic.split('/')[-1].split('_')[0])
                    index2 = bisect.bisect_left(
                        true_label[full_pic.split('/')[-3]]['timestamp'],
                        full_pic.split('/')[-1].split('_')[1][:-4])
                    # Only count segments that fall wholly inside one
                    # labelled interval.
                    if index1 == index2:
                        if index1 < len(true_label[full_pic.split('/')[-3]]
                                        ['speaker']):
                            peo_in_cluster[peo][true_label[full_pic.split('/')[-3]]['speaker'][index1]] = \
                                peo_in_cluster[peo].get(true_label[full_pic.split('/')[-3]]['speaker'][index1], 0) + 1
    else:
        # Otherwise the true identity is the leading token of the file name.
        for peo, item in final_res.items():
            for meeting, bios in item.items():
                for full_bio_path in bios:
                    if full_bio_path.split('/')[-3] in meeting_index:
                        if peo not in bio_name:
                            bio_name.append(peo)
                        bio_in_cluster[bio_name.index(peo)][meeting_index[
                            full_bio_path.split('/')[-3]]] = 1
                    id = full_bio_path.split('/')[-1].split('_')[0]
                    peo_in_cluster[peo][id] = peo_in_cluster[peo].get(id,
                                                                      0) + 1
                    # if full_bio_path.split('/')[-1] in true_label:
                    #     peo_in_cluster[peo][true_label[full_bio_path.split('/')[-1]]] = peo_in_cluster[peo].get(
                    #         true_label[full_bio_path.split('/')[-1]], 0) + 1
    # else:
    #     for peo, item in final_res.items():
    #         for full_pic in functools.reduce(operator.concat, functools.reduce(operator.concat, item.values())):
    #             print('full pic: {}'.format(full_pic))
    #             if full_pic.split('/')[-3] in meeting_index:
    #                 if peo not in bio_name:
    #                     bio_name.append(peo)
    #                 bio_in_cluster[bio_name.index(peo)][meeting_index[full_pic.split('/')[-3]]] = 1
    #             if full_pic.split('/')[-1] in true_label:
    #                 peo_in_cluster[peo][true_label[full_pic.split('/')[-1]]] = peo_in_cluster[peo].get(
    #                     true_label[full_pic.split('/')[-1]], 0) + 1
    print('peo_in_cluster: {}'.format(peo_in_cluster))

    # evaluation
    predict_peo = defaultdict(dict)
    precisions = []
    cnt2 = 0
    # NOTE(review): `all` shadows the builtin of the same name within this
    # scope; here it accumulates the total sample count.
    all = 0
    print('peo_in_cluster = ')
    with open(os.path.join(project_dir, 'peo_in_cluster.csv'), 'w') as f:
        print_color = {1: 'green', 0: 'red'}
        for peo, item in peo_in_cluster.items():
            for p, no in item.items():
                all += no
                if p == peo:
                    cnt2 += no
            # Majority true label inside this assigned cluster.
            major_element = max(item.items(), key=operator.itemgetter(1))[0]
            print(' \'' + colored(peo, print_color[peo == major_element]) +
                  '\':')
            print('  {},'.format(item))
            if peo == major_element:
                precisions.append(item[major_element] / sum(item.values()))
            for part_peo, cnt in item.items():
                predict_peo[part_peo][peo] = cnt
                f.write('{}, {}, {}\n'.format(peo, part_peo, cnt))
    print('precisions: {}'.format(precisions))
    correct_cnt = 0
    correct_peo = []
    with open(os.path.join(project_dir, 'predict_peo.csv'), 'w') as f:
        for peo, item in predict_peo.items():
            major_element = max(item.items(), key=operator.itemgetter(1))[0]
            if major_element == peo:
                correct_peo.append(peo)
                correct_cnt += 1
            for part_peo, cnt in item.items():
                f.write('{}, {}, {}\n'.format(peo, part_peo, cnt))
    correct_peo.sort()
    # NOTE(review): raises ZeroDivisionError when no cluster's majority label
    # matched its assigned name (precisions empty) — confirm acceptable.
    print('precision = {}'.format(sum(precisions) / len(precisions)))
    print(colored('purity: {}'.format(cnt2 / all), 'yellow'))
    print('k = {}'.format(victims_thres))
    print('correct = {}'.format(correct_cnt))
    print('correct_peo = {}'.format(correct_peo))
    print('unpredicted_peo = {}'.format(set(poi_people) - set(correct_peo)))
    print(
        colored(
            'choose {} correct: {} with all: {}'.format(
                victims_thres, correct_cnt, len(cols)), 'magenta'))

    return final_res, save_matrix, labeled
# Example #4
# 0
    def prepare_data_index(self):
        """Populate ``self.data_index`` with labelled samples per partition.

        Walks ``self.audio_files``, assigns each clip to a
        validation/testing/training partition (just 'testing' for
        inference), keeps wanted words in ``self.data_index`` and the rest in
        a separate unknown pool, then — when training — pads each partition
        with silence entries and a sampled share of unknowns. Also fills
        ``self.word_to_index`` for every word seen.
        """
        init_dict = {'validation': [], 'testing': [], 'training': []}
        if not (self.train):
            init_dict = {'testing': []}
        elif self.mode == 'train':
            init_dict = {'validation': [], 'training': []}

        self.data_index = init_dict
        # BUG FIX: the original code did `unknown_index = init_dict`, which
        # aliased the very dict stored in self.data_index. Every "unknown"
        # sample was appended straight into the main index, and the extend()
        # at the bottom then duplicated the first unknown_size of them. Give
        # unknown_index its own independent lists instead.
        unknown_index = {partition: [] for partition in init_dict}
        all_words = {}
        for wav_file in self.audio_files:
            if self.train:
                word = get_parent_folder_name(wav_file).lower()
            else:
                word = self.words_list[1]  # Unknown label in case of inference
            # used to remove previous augmentation folder
            aug_dir = os.path.join(os.path.dirname(wav_file), 'augmentation')
            if os.path.isdir(aug_dir):
                shutil.rmtree(aug_dir)
            # Background-noise clips are handled elsewhere, never indexed.
            if word == self.background_noise:
                continue
            all_words[word] = True
            # Prefer the explicit validation/testing file lists when both are
            # provided; otherwise fall back to the hash-based split.
            if len(self.validation_list) > 0 and len(self.testing_list) > 0:
                wav_index = get_file_index(wav_file)
                if wav_index in self.validation_list:
                    set_index = 'validation'
                elif wav_index in self.testing_list:
                    set_index = 'testing'
                else:
                    set_index = 'training'
            else:
                set_index = which_set(
                    wav_file, self.validation_percentage,
                    self.testing_percentage,
                    self.model_settings['max_num_wavs_per_class'])
            if not (self.train):
                set_index = 'testing'  # in case of inference set index will be always testing
            elif self.mode == 'train' and set_index == 'testing':
                # using test set in the training set for production system
                set_index = 'training'
            # words_list[0]/[1] are reserved for silence/unknown; only words
            # from index 2 on are "wanted" labels.
            if word in self.words_list[2:]:
                self.data_index[set_index].append({
                    'label': word,
                    'file': wav_file
                })
            else:
                unknown_index[set_index].append({
                    'label': self.words_list[1],
                    'file': wav_file
                })
        if self.train:
            # Every wanted word must have shown up in the data.
            for index, wanted_word in enumerate(self.words_list[2:]):
                if wanted_word not in all_words:
                    raise Exception('Expected to find ' + wanted_word +
                                    ' in labels but only found ' +
                                    ', '.join(all_words.keys()))
            # Silence entries reuse an arbitrary wav file; it is expected to
            # be zeroed/ignored downstream.
            silence_wav_path = self.data_index['training'][0]['file']
            for set_index in init_dict:
                set_size = len(self.data_index[set_index])
                silence_size = int(
                    math.ceil(set_size * self.silence_percentage / 100))
                for _ in range(silence_size):
                    self.data_index[set_index].append({
                        'label':
                        self.words_list[0],  # silence label
                        'file':
                        silence_wav_path
                    })
                # Pick some unknowns to add to each partition of the data set.
                random.shuffle(
                    unknown_index[set_index]
                )  # TODO might need to get examples from all words and all speakers
                # unknown will be sampled within each mini batch
                unknown_size = int(
                    math.ceil(set_size * self.unknown_percentage / 100))
                self.data_index[set_index].extend(
                    unknown_index[set_index][:unknown_size])

        if self.train:
            self.augment_data('training')
            for set_index in init_dict:
                random.shuffle(self.data_index[set_index])
        # Map every observed word to its index; unseen-as-wanted words map to
        # the unknown slot.
        for word in all_words:
            if word in self.words_list[2:]:
                self.word_to_index[word] = self.words_list.index(word)
            else:
                self.word_to_index[word] = self.words_list.index(
                    self.words_list[1])
        self.word_to_index[self.words_list[1]] = 1  # unknown label
        self.word_to_index[self.words_list[0]] = 0  # silence label