Example #1
import atexit
from multiprocessing import Manager, Process, active_children

def spin_crawl_threads(state, classifiers, MAX_BIT_SIZE, MAX_DL_THREADS, image_path):
    print("Running threads...")  # note: these are separate processes, despite the name
    manager = Manager()

    location_q = manager.Queue(maxsize=16)
    image_q = manager.Queue(maxsize=64)
    state_lock = manager.Lock()

    generate_location = Process(target=generate_location_thread,
                                args=(location_q, MAX_BIT_SIZE),
                                name="generate_location")
    classification = Process(target=classification_thread,
                             args=(image_q, classifiers, image_path,
                                   state, state_lock), name="classification")
    download_image_t = Process(target=download_image_thread,
                               args=(location_q, image_q, MAX_DL_THREADS),
                               name="download_image")

    download_image_t.start()
    classification.start()
    generate_location.start()

    def kill_threads():
        for thread in active_children():
            thread.terminate()

    atexit.register(kill_threads)

    download_image_t.join()
    classification.join()
    generate_location.join()
Example #2
    def create_csv(self):
        # requires: from time import time; from functools import partial;
        # and: from multiprocessing import Pool, Manager
        if __name__ == '__main__':
            t1 = time()
            with open(self.out_csv1, "w") as file1:
                file1.write("id" + ',' + "level" + '\n')
            with open(self.out_csv2, "w") as file2:
                file2.write("id" + ',' + "object_name" + '\n')

            i = range(len(self.list_of_zips))
            p = Pool()
            m = Manager()
            lock = m.Lock()
            func = partial(self.parse_Zip, lock)
            p.map(func, i)
            p.close()
            p.join()
            # print inside the guard so t1 is always defined when it runs
            print('Create .csv files time = ' + str(time() - t1) + 's')
Example #3
def run_post_process():
    es = ES(FLAGS.configfile_name)
    manager = Manager()
    lock = manager.Lock()
    shared_dict = manager.dict({'time': 0, "id": ""})
    process_num = int(cpu_count() - 2)

    generator_list = []
    for i in range(process_num):
        generator_list.append(_generator(lock, shared_dict, es))

    p = []
    for i in range(process_num):
        p.append(Process(target=_process_unknown_record,
                         args=(generator_list[i],)))
        p[i].start()

    for q in p:
        q.join()
Example #4
def folderbase_cut_silence(input_folder, cut_interval):

    output_no_silence = os.path.join(input_folder, "remove_silence")
    # if not os.path.exists(output_folder):
    # 	os.mkdir(output_folder)
    if not os.path.exists(output_no_silence):
        os.mkdir(output_no_silence)
    wav_files = []
    for root, dirs, files in os.walk(input_folder):
        for filename in files:
            wav_files.append(filename)

    def process_files(lock, file):
        try:
            #exclude log.txt file
            if re.search(r".+\.wav", file):
                wave_file = os.path.join(input_folder, file)
                wo_num = cut_wav_without_silence(wave_file, output_no_silence,
                                                 cut_interval)
                with cut_silence_file_num.get_lock():
                    cut_silence_file_num.value += 1
                with cut_silence_out_file_num.get_lock():
                    cut_silence_out_file_num.value += wo_num
                os.remove(wave_file)

        except Exception as e:
            logging.info(e)
            with cut_silence_fail_file.get_lock():
                cut_silence_fail_file.value += 1

    pool = Pool(process_num)
    m = Manager()
    lock = m.Lock()
    locks = [lock] * len(wav_files)
    # Pool.map takes one iterable; pair each lock with a file and use starmap
    pool.starmap(process_files, zip(locks, wav_files))
    loginfo = ('Total number of audio files processed is {}, generated {} '
               'files and {} files failed'.format(cut_silence_file_num.value,
                                                  cut_silence_out_file_num.value,
                                                  cut_silence_fail_file.value))
    logging.info(loginfo)
Example #5
def folderbase_convert_to_wave(webmfolder, wavefolder):
    def process_convert(lock, filename):
        my_logger.debug("filename is {}".format(filename))
        with total_num.get_lock():
            total_num.value += 1
        try:
            success = convert_to_wav(filename, wavefolder)
            with success_num.get_lock():
                success_num.value += success
            os.remove(filename)
        except Exception as e:
            line = "\t".join([str(datetime.datetime.now()), filename, str(e)])
            my_logger.info(line)
            fail_folder = "data/convert_failed"
            if not os.path.exists(fail_folder):
                os.mkdir(fail_folder)
            filebase = os.path.basename(filename)
            failed_file = os.path.join(fail_folder, filebase)
            os.rename(filename, failed_file)
            with fail_num.get_lock():
                fail_num.value += 1
        return 1

    filenames = []
    for file in mp3gen(webmfolder):
        if re.search("wav", file): continue
        filenames.append(file)
    pool = Pool(process_num)
    m = Manager()
    lock = m.Lock()
    locks = [lock] * len(filenames)
    # Pool.map takes one iterable; pair each lock with a filename and use starmap
    pool.starmap(process_convert, zip(locks, filenames))

    my_logger.info(
        "{}/{} files successfully converted to wave and {} files failed".
        format(success_num.value, total_num.value, fail_num.value))
Example #6
def folderbase_cut_interval(input_folder, output_folder, cut_period):
    wav_files = []
    if not os.path.exists(output_folder):
        os.mkdir(output_folder)
    for root, dirs, files in os.walk(input_folder):
        for filename in files:
            wav_files.append(os.path.join(root, filename))

    def process_files(lock, file):
        try:
            if re.search(r".+\.wav", file):
                with file_num.get_lock():
                    file_num.value += 1
                filebasename = os.path.basename(file)
                filebasename, _ = os.path.splitext(filebasename)
                #get audio properties
                audio_prop = {}
                with wave.open(file, mode='rb') as newAudio:
                    audio_prop["nchannels"] = newAudio.getnchannels()
                    audio_prop["nframes"] = newAudio.getnframes()
                    audio_prop["sampwidth"] = newAudio.getsampwidth()
                    audio_prop["framerate"] = newAudio.getframerate()
                    audio_prop["comptype"] = newAudio.getcomptype()
                    audio_prop["compname"] = newAudio.getcompname()
                audio_duration = audio_prop["nframes"] / audio_prop["framerate"]

                precut_duration = cut_period
                cut_start = 0
                cut_return = 0
                cut_num = 0
                index = 0
                while cut_start < audio_duration:
                    cut_end = cut_start + precut_duration
                    cut_audio, cutaudio_prop = cut_wave(file,
                                                        cut_start,
                                                        cut_end,
                                                        start_bias=0,
                                                        end_bias=0)
                    newfile = os.path.join(
                        output_folder,
                        filebasename + "_" + str(index) + ".wav")
                    index += 1
                    with wave.open(newfile, "wb") as newAudio:
                        newAudio.setparams((cutaudio_prop["nchannels"],
                                            cutaudio_prop["sampwidth"],
                                            cutaudio_prop["framerate"],
                                            cutaudio_prop["nframes"],
                                            cutaudio_prop["comptype"],
                                            cutaudio_prop["compname"]))
                        newAudio.writeframes(cut_audio)
                    cut_start = cut_start + precut_duration
                    with out_file_num.get_lock():
                        out_file_num.value += 1
                os.remove(file)
        except Exception as e:
            logging.info(e)
            with fail_file.get_lock():
                fail_file.value += 1

    pool = Pool(process_num)
    m = Manager()
    lock = m.Lock()
    locks = [lock] * len(wav_files)
    # Pool.map takes one iterable; pair each lock with a file and use starmap
    pool.starmap(process_files, zip(locks, wav_files))
    loginfo = ('Total number of audio files processed is {}, generated {} '
               'files and {} files failed'.format(file_num.value,
                                                  out_file_num.value,
                                                  fail_file.value))
    logging.info(loginfo)
Example #7
    def perm_test(self, nperm, npr=1):
        """
        Performs permutation testing on residual matrix SVD.

        The rows of the residual matrix are first permuted.  Then  get_tks is called to calculate explained variance ratios and these tks are compared to the values from the actual residual matrix.  A running total is kept for the number of times the explained variance from the permuted matrix exceeds that from the original matrix. And significance is estimated by dividing these totals by the number of permutations.  This permutation testing is multiprocessed to decrease calculation times.
        
        Parameters
        ----------
        nperm : int
            Number of permutations to be tested.
        npr : int
            Number of processors to be used.

        Attributes
        ----------
        sigs : array
            Estimated significances for each batch effect.

        """
        def single_it(rseed):
            """
            Single iteration of permutation testing.

            Permutes the residual matrix, calculates new tks for the permuted
            matrix, and compares them to the original tks.

            Parameters
            ----------
            rseed : int
                Random seed.

            Returns
            -------
            out : arr
                Counts of the number of times the permuted explained-variance
                ratio exceeded that from the actual residual matrix.
            """

            rstate = np.random.RandomState(rseed * 100)
            rstar = np.copy(self.res)
            out = np.zeros(len(self.tks))
            for i in range(rstar.shape[0]):
                rstate.shuffle(rstar[i, :])
            resstar = self.get_res(rstar)
            tkstar = self.get_tks(resstar)
            for m in range(len(self.tks)):
                if tkstar[m] > self.tks[m]:
                    out[m] += 1
            return out

        if int(npr) > 1:
            mgr = Manager()
            output = mgr.list()
            lock = mgr.Lock()
            with Pool(int(npr)) as pool:
                pbar = tqdm(total=int(nperm),
                            desc='permuting',
                            position=0,
                            smoothing=0)
                imap_it = pool.imap_unordered(single_it, range(int(nperm)))
                for x in imap_it:
                    pbar.update(1)
                    with lock:
                        output.append(x)
            pbar.close()
            pool.close()
            pool.join()
            self.sigs = np.sum(np.asarray(output), axis=0) / float(nperm)
            time.sleep(40)
        else:
            output = []
            with tqdm(total=int(nperm),
                      desc='permuting',
                      position=0,
                      smoothing=0) as pbar:
                for x in range(int(nperm)):
                    output.append(single_it(x))
                    pbar.update(1)
            self.sigs = np.sum(np.asarray(output), axis=0) / float(nperm)
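A minimal, self-contained sketch of the counting scheme the docstring above describes: shuffle each row, recompute the statistic, count how often the permuted value exceeds the observed one, and divide by the number of permutations. The names permutation_sigs, statistic, and res are illustrative, not from the source.

import numpy as np

def permutation_sigs(observed, statistic, res, nperm, seed=0):
    """Estimate, per component, how often a permuted statistic beats the observed one."""
    rng = np.random.RandomState(seed)
    exceed = np.zeros_like(observed, dtype=float)
    for _ in range(nperm):
        permuted = res.copy()
        for row in permuted:  # shuffle each row in place, independently
            rng.shuffle(row)
        exceed += statistic(permuted) > observed
    return exceed / nperm  # small values -> significant components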
Example #8
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        test = 20
        train_paths = self.training_filepaths(data_dir,
                                              self.num_shards,
                                              shuffled=False)
        dev_paths = self.dev_filepaths(data_dir,
                                       self.num_dev_shards,
                                       shuffled=False)
        test_paths = self.test_filepaths(data_dir,
                                         self.num_test_shards,
                                         shuffled=True)
        try_num = 0
        if test:
            try_num = test
        manager = Manager()
        lock = manager.Lock()

        def process_files(train_paths, datasets, num_run, shared_dict):

            total_file_num = len(train_paths)
            num_per_partition = int(math.floor(total_file_num / num_run))
            train_paths_list = []
            for i in range(num_run):
                if i == num_run - 1:
                    train_paths_list.append(train_paths[i *
                                                        num_per_partition:])
                else:
                    train_paths_list.append(
                        train_paths[i * num_per_partition:(i + 1) *
                                    num_per_partition])
            generator_list = []
            for i in range(num_run):
                generator_list.append(
                    self.generator(data_dir,
                                   tmp_dir,
                                   datasets,
                                   lock,
                                   shared_dict,
                                   how_many=try_num))

            p = []
            for i in range(num_run):
                p.append(
                    Process(target=generator_utils.generate_files,
                            args=(generator_list[i], train_paths_list[i],
                                  try_num)))
                p[i].start()
            my_logger.error("Time: {} All processes started".format(
                str(datetime.datetime.now())))
            for q in p:
                q.join()
            my_logger.error("Time: {} All processes ended".format(
                str(datetime.datetime.now())))

        shared_dict = manager.dict({
            'current_id': id_init,
            "current_last_updated": 0,
            "record_num": 0,
            "source_index": 0
        })
        num_run = min(self.process_num, self.num_shards)
        process_files(train_paths, self.train_sources, num_run, shared_dict)
        if len(self.eval_sources) == 0:
            generator_utils.shuffle_dataset(train_paths)

        else:
            shared_dict["current_id"] = id_init
            shared_dict["current_last_updated"] = 0
            shared_dict["record_num"] = 0
            shared_dict["source_index"] = 0
            num_run = min(self.process_num, self.num_dev_shards)
            my_logger.error("Time: {} process dev dataset".format(
                str(datetime.datetime.now())))
            process_files(dev_paths, self.eval_sources, num_run, shared_dict)
            my_logger.error("Time: {} shuffle dataset".format(
                str(datetime.datetime.now())))
            generator_utils.shuffle_dataset(train_paths + dev_paths)
        shared_dict["current_id"] = id_init
        shared_dict["current_last_updated"] = 0
        shared_dict["record_num"] = 0
        shared_dict["source_index"] = 0
        num_run = min(self.process_num, self.num_test_shards)
        process_files(test_paths, self.test_sources, num_run, shared_dict)
Example #9
def folderbase_cut(input_folder, output_folder, cut_interval):
    # input_folder should have a "wav" folder containing wav files and a
    # "vtt" folder containing possible subtitle files.
    # output_folder will contain output_subtitle and output_without_subtitle
    # folders after processing.
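    # Illustrative layout this function assumes (file names are hypothetical):
    #   input_folder/
    #       wav/  clip_01.wav, clip_02.wav, ...
    #       vtt/  clip_01.vtt, ...
    #   output_folder/  <- cut segments are written here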

    wav_dir = os.path.join(input_folder, "wav")
    output_wo_sub = output_folder

    if not os.path.exists(output_wo_sub):
        os.mkdir(output_wo_sub)
    wav_files = []
    for root, dirs, files in os.walk(wav_dir):
        for filename in files:
            wav_files.append(filename)

    def process_cut(lock, file):
        try:
            #exclude log.txt file
            if re.search(r".+\.wav", file):
                filebasename, _ = os.path.splitext(file)
                my_logger.info("process worker {} Processing file {}".format(
                    multiprocessing.current_process(), filebasename))
                wave_file = os.path.join(wav_dir, file)

                wo_num = cut_wav_without_subtitle(lock, wave_file,
                                                  output_wo_sub, cut_interval)
                my_logger.info(
                    "process worker {} file {} has no subtitle".format(
                        multiprocessing.current_process(), filebasename))
                with file_num.get_lock():
                    file_num.value += 1
                with out_file_num.get_lock():
                    out_file_num.value += wo_num
                os.remove(wave_file)

        except Exception as e:
            error_folder = "data/cut_failed"
            if not os.path.exists(error_folder):
                os.mkdir(error_folder)
            fail_file_path = os.path.join(error_folder, file)
            os.rename(wave_file, fail_file_path)
            line = [
                str(datetime.datetime.now()), filebasename, "error",
                str(e)
            ]
            line = "\t".join(line)
            my_logger.info(line)

            logging.info(e)
            with fail_file.get_lock():
                fail_file.value += 1
            return 1

    pool = Pool(process_num)
    m = Manager()
    lock = m.Lock()

    locks = [lock] * len(wav_files)
    # Pool.map takes one iterable; pair each lock with a file and use starmap
    pool.starmap(process_cut, zip(locks, wav_files))

    loginfo = ('Total number of audio files processed is {}, {} files failed; '
               'total number of audio files generated is {}'.format(
                   file_num.value, fail_file.value, out_file_num.value))
    my_logger.info(loginfo)
Example #10
    p = Pool()
    p.map(func, range(count_zips))
    p.close()
    p.join()
    print('Create .zip files time = ' + str(time() - t1) + 's')

    # Second task: grep id, level, options from .zip to .csv files
    t1 = time()

    # create .csv files
    out_csv1 = os.path.join(path, 'csv1.csv')
    out_csv2 = os.path.join(path, 'csv2.csv')
    with open(out_csv1, "w") as file1:
        file1.write("id" + ',' + "level" + '\n')
    with open(out_csv2, "w") as file2:
        file2.write("id" + ',' + "object_name" + '\n')

    # get the list of zips in the working directory
    list_of_zips = get_list_of_zips(path)
    i = range(len(list_of_zips))
    p = Pool()
    m = Manager()  # Manager is needed to distribute the Lock to all processes
    lock = m.Lock()
    func = partial(parse_Zip, lock, list_of_zips, path, out_csv1, out_csv2)
    p.map(func, i)
    p.close()
    p.join()
    print('Create .csv files time = ' + str(time() - t1) + 's')
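
The comment beside Manager() above is the point that runs through all of these examples: a plain multiprocessing.Lock cannot be passed as a Pool task argument (it only reaches children by inheritance), while a Manager's Lock proxy pickles cleanly into workers. A minimal runnable sketch of the pattern, with hypothetical names (append_line, out.txt):

from functools import partial
from multiprocessing import Manager, Pool

def append_line(lock, out_path, text):
    # the proxy lock serializes writes across all pool workers
    with lock:
        with open(out_path, "a") as f:
            f.write(text + "\n")

if __name__ == "__main__":
    manager = Manager()
    lock = manager.Lock()  # a proxy object, safe to send to workers
    worker = partial(append_line, lock, "out.txt")
    with Pool() as pool:
        pool.map(worker, ["first", "second", "third"])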