def prep_r2_with_barcode(fq1, fq2, out_file):
    """Annotate R2 reads with a quality-masked barcode derived from R1.

    Walks the paired FASTQ files in lockstep (4 lines per record), verifies
    the pairs stay in sync, builds a barcode from the masked R1 sequence and
    appends it to the R2 read name.  Writes the annotated R2 records to
    ``out_file`` inside a file transaction.

    Returns ``out_file``; skips all work when it already exists.

    Fixes vs. original: the output handle is now managed with ``with`` so it
    is closed even when a malformed record raises mid-loop (it was opened and
    closed manually before); the unused ``read_count``/``buf`` locals are gone.
    """
    safe_makedir(os.path.dirname(out_file))
    if file_exists(out_file):
        print("%s and %s have already been barcode-prepped, skipping." % (fq1, fq2))
        return out_file
    with open_fastq(fq1) as r1_file, open_fastq(fq2) as r2_file:
        with file_transaction(out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                r1_r2 = itertools.izip(r1_file, r2_file)
                for header1, header2 in r1_r2:
                    # FASTQ records are 4 lines: header/seq/plus/qual.
                    # Pull the remaining three pairs off the same iterator.
                    seq1, seq2 = r1_r2.next()
                    plus1, plus2 = r1_r2.next()
                    qual1, qual2 = r1_r2.next()
                    read_name1 = header1.split()[0][1:]
                    read_name2 = header2.split()[0][1:]
                    assert read_name1 == read_name2, "FASTQ files may be out of order."
                    seq2, qual2 = seq2.rstrip(), qual2.rstrip()
                    # First 6 bases get a stricter quality cutoff than the rest.
                    barcode = (mask(seq1[0:6], qual1[0:6], min_qual=10) +
                               mask(seq1[6:], qual1[6:]))
                    barcoded_name = ":".join([read_name2, barcode])
                    print(format_fastq([barcoded_name, seq2, qual2]), file=out_handle)
    return out_file
def preprocess_audio(self):
    """
    Copy the Merged Arabic Corpus of Isolated Words into their associated
    directory. The whole audio data will be in 'data' directory, the
    enrolled data only will be in 'enroll', and the test data will be
    in 'test'.
    """
    # remove the data directory if it exists, then recreate it so the
    # converted waves below have a destination.  FIX: the original only
    # removed it and relied on convert_wav to (maybe) create it, unlike
    # the enroll/test dirs which get an explicit safe_makedir.
    if os.path.exists(self.data_dir):
        shutil.rmtree(self.data_dir)
    safe_makedir(self.data_dir)
    # iterate over speakers, converting every wave to the configured format
    speakers = sorted(os.listdir(self.conf['inpath']))
    for sp in tqdm(speakers, desc="Converting Audio"):
        speaker_path = os.path.join(self.conf['inpath'], sp)
        wav_filenames = os.listdir(speaker_path)
        for wav in wav_filenames:
            # NOTE(review): waves from different speakers with the same
            # filename would overwrite each other in data_dir -- confirm
            # filenames are globally unique.
            inwav = os.path.join(speaker_path, wav)
            outwav = os.path.join(self.data_dir, wav)
            convert_wav(inwav, outwav,
                        no_channels=self.conf['no_channels'],
                        sampling_rate=self.conf['sampling_rate'],
                        bit_precision=self.conf['bit_precision'])
    # recreate the enroll and test directories from scratch
    if os.path.exists(self.enroll_dir):
        shutil.rmtree(self.enroll_dir)
    if os.path.exists(self.test_dir):
        shutil.rmtree(self.test_dir)
    safe_makedir(self.enroll_dir)
    safe_makedir(self.test_dir)
    # parse num of sessions from configuration
    enroll_sessions = self.conf['enroll_sessions']
    test_sessions = self.conf['test_sessions']
    assert enroll_sessions + test_sessions <= 10, \
        "The summation of all sessions must be less than or equal 10!!"
    # route each preprocessed wave by the session id embedded in its name:
    # sessions 1..enroll_sessions -> enroll, next test_sessions -> test,
    # any remaining sessions are deliberately dropped.
    wav_filenames = os.listdir(self.data_dir)
    for wav in tqdm(wav_filenames, desc="Copying enroll/test waves"):
        _, sess, _, _ = wav.split(".")
        inwav = os.path.join(self.data_dir, wav)
        if int(sess) <= enroll_sessions:
            outwav = os.path.join(self.enroll_dir, wav)
            shutil.copyfile(inwav, outwav)
        elif int(sess) <= enroll_sessions + test_sessions:
            outwav = os.path.join(self.test_dir, wav)
            shutil.copyfile(inwav, outwav)
def create_rmd(summary_fn):
    """Rewrite a summary CSV with project-relative paths and build a report.

    Produces ``<name>_re.csv`` next to the input file, converting every cell
    that names an existing path into a path relative to the summary's
    directory, then hands the result to ``modify_report``.

    Returns the tuple ``(rewritten_csv, report_file)``.
    """
    root_path, fn = os.path.split(os.path.abspath(summary_fn))
    # Report directory is created up front even though only modify_report
    # uses it, matching the original behavior.
    safe_makedir(os.path.join(root_path, "report"))
    out_file = os.path.join(root_path, fn.replace(".csv", "_re.csv"))
    with open(summary_fn) as in_handle, open(out_file, "w") as out_handle:
        for line in in_handle:
            fixed = [os.path.relpath(cell, root_path) if os.path.exists(cell) else cell
                     for cell in line.strip().split(",")]
            out_handle.write(",".join(fixed) + "\n")
    report_file = modify_report(root_path, out_file)
    return out_file, report_file
def _flatten_plus_safe(rollback_files):
    """Flatten names of files and create temporary file names.

    Accepts an iterable whose items are either single path strings or
    sequences of paths; returns parallel lists of transactional (temporary)
    names and the original names.
    """
    tx_files, orig_files = [], []
    for fnames in rollback_files:
        # Normalize a bare string into a one-element group.
        group = [fnames] if isinstance(fnames, basestring) else fnames
        for orig in group:
            # Each original file gets its own fresh mkdtemp under a
            # sibling "tx" directory.
            tx_root = utils.safe_makedir(os.path.join(os.path.dirname(orig), "tx"))
            work_dir = utils.safe_makedir(tempfile.mkdtemp(dir=tx_root))
            tx_files.append(os.path.join(work_dir, os.path.basename(orig)))
            orig_files.append(orig)
    return tx_files, orig_files
def preprocess_audio(self):
    """Convert the raw corpus and split it into enroll/test sessions.

    Converts every speaker's waves into ``self.data_dir`` with the
    configured channel count / sampling rate / bit precision, then copies
    each wave into ``self.enroll_dir`` or ``self.test_dir`` according to
    the session number embedded in its filename.
    """
    # remove the data directory if it exists, then recreate it so the
    # converted waves have a destination (mirrors the enroll/test handling).
    if os.path.exists(self.data_dir):
        shutil.rmtree(self.data_dir)
    safe_makedir(self.data_dir)
    # iterate over speakers
    speakers = sorted(os.listdir(self.conf['inpath']))
    for sp in tqdm(speakers, desc="Converting Audio"):
        speaker_path = os.path.join(self.conf['inpath'], sp)
        wav_filenames = os.listdir(speaker_path)
        for wav in wav_filenames:
            inwav = os.path.join(speaker_path, wav)
            outwav = os.path.join(self.data_dir, wav)
            convert_wav(inwav, outwav,
                        no_channels=self.conf['no_channels'],
                        sampling_rate=self.conf['sampling_rate'],
                        bit_precision=self.conf['bit_precision'])
    # recreate the enroll and test directories from scratch
    if os.path.exists(self.enroll_dir):
        shutil.rmtree(self.enroll_dir)
    if os.path.exists(self.test_dir):
        shutil.rmtree(self.test_dir)
    safe_makedir(self.enroll_dir)
    safe_makedir(self.test_dir)
    # parse num of sessions from configuration
    enroll_sessions = self.conf['enroll_sessions']
    test_sessions = self.conf['test_sessions']
    # FIX: the original assert's message was lost to a dangling line
    # continuation (`\` followed by a comment), which made this function a
    # syntax error; the message is restored from the working twin of this
    # method elsewhere in the project.
    assert enroll_sessions + test_sessions <= 10, \
        "The summation of all sessions must be less than or equal 10!!"
    # copy each preprocessed wave into enroll or test based on its session id
    wav_filenames = os.listdir(self.data_dir)
    for wav in tqdm(wav_filenames, desc="Copying enroll/test waves"):
        _, sess, _, _ = wav.split(".")
        inwav = os.path.join(self.data_dir, wav)
        if int(sess) <= enroll_sessions:
            outwav = os.path.join(self.enroll_dir, wav)
            shutil.copyfile(inwav, outwav)
        elif int(sess) <= enroll_sessions + test_sessions:
            outwav = os.path.join(self.test_dir, wav)
            shutil.copyfile(inwav, outwav)
def create_rmd(summary_fn):
    """Relativize path-valued cells of a summary CSV and generate its report.

    Writes ``<name>_re.csv`` beside the input, where every cell pointing at
    an existing filesystem path is rewritten relative to the summary's
    directory, then runs ``modify_report`` on the result.

    Returns ``(rewritten_csv, report_file)``.
    """
    abs_summary = os.path.abspath(summary_fn)
    root_path, fn = os.path.split(abs_summary)
    # The report directory is created eagerly for modify_report to use.
    safe_makedir(os.path.join(root_path, "report"))
    out_file = os.path.join(root_path, fn.replace(".csv", "_re.csv"))
    with open(summary_fn) as in_handle:
        with open(out_file, "w") as out_handle:
            for line in in_handle:
                rewritten = []
                for cell in line.strip().split(","):
                    if os.path.exists(cell):
                        rewritten.append(os.path.relpath(cell, root_path))
                    else:
                        rewritten.append(cell)
                out_handle.write(",".join(rewritten) + "\n")
    report_file = modify_report(root_path, out_file)
    return out_file, report_file
def tx_tmpdir(data=None, base_dir=None, remove=True):
    """Context manager to create and remove a transactional temporary directory.

    Handles creating a transactional directory for running commands in. Will
    use either the current directory or a configured temporary directory.

    Creates an intermediary location and time specific directory for global
    temporary directories to prevent collisions.

    data can be the full world information object being process or a
    configuration dictionary.
    """
    # Resolve a configured tmp dir from either a world `data` object (config
    # nested under "config") or a bare configuration dictionary.
    if data and "config" in data:
        config_tmpdir = tz.get_in(("config", "resources", "tmp", "dir"), data)
    elif data:
        config_tmpdir = tz.get_in(("resources", "tmp", "dir"), data)
    else:
        config_tmpdir = None
    if config_tmpdir:
        # Expand environment variables, then anchor relative paths to the
        # current working directory.
        config_tmpdir = utils.safe_makedir(os.path.expandvars(config_tmpdir))
        config_tmpdir = os.path.normpath(os.path.join(os.getcwd(), config_tmpdir))
        tmp_dir_base = os.path.join(config_tmpdir, "bcbiotx", str(uuid.uuid4()))
        # On the (extremely unlikely) chance the UUID path already exists,
        # retry with new UUIDs; after 5+ attempts fall through and reuse
        # the colliding path rather than loop forever.
        unique_attempts = 0
        while os.path.exists(tmp_dir_base):
            if unique_attempts > 5:
                break
            tmp_dir_base = os.path.join(config_tmpdir, "bcbiotx", str(uuid.uuid4()))
            time.sleep(1)
            unique_attempts += 1
    elif base_dir is not None:
        tmp_dir_base = os.path.join(base_dir, "tx")
    else:
        tmp_dir_base = os.path.join(os.getcwd(), "tx")
    utils.safe_makedir(tmp_dir_base)
    tmp_dir = tempfile.mkdtemp(dir=tmp_dir_base)
    # mkdtemp already created tmp_dir; this extra call is a no-op guard.
    utils.safe_makedir(tmp_dir)
    try:
        yield tmp_dir
    finally:
        # Clean up the per-call directory; also remove the UUID base dir,
        # but only when it lives under a configured global tmp location.
        if remove:
            for dname in [tmp_dir, tmp_dir_base if config_tmpdir else None]:
                if dname and os.path.exists(dname):
                    # Best-effort cleanup: never let teardown mask the
                    # caller's result or exception.
                    try:
                        shutil.rmtree(dname, ignore_errors=True)
                    except:
                        pass
def launch_training_job(model_dir, data_dir, job_name, params):
    """Launch training of the model with a set of hyperparameters in parent_dir/job_name

    Args:
        model_dir: (string) directory containing config, weights and log
        data_dir: (string) directory containing the dataset
        job_name: (string) name of the experiment to search hyperparameters
        params: (dict) containing hyperparameters
    """
    # Every experiment gets its own subdirectory named after the job.
    model_dir = os.path.join(model_dir, job_name)
    utils.safe_makedir(model_dir)

    # Persist the hyperparameters so train.py can load them from the job dir.
    params.save(os.path.join(model_dir, 'params.json'))

    # NOTE(review): shell=True with interpolated paths is injection-prone if
    # any directory name contains shell metacharacters -- kept as-is for
    # behavior parity; consider a list argv with shell=False.
    cmd = "{python} train.py --model_dir={model_dir} --data_dir {data_dir}".format(
        python=PYTHON, model_dir=model_dir, data_dir=data_dir)
    print(cmd)
    check_call(cmd, shell=True)
def _create_base_ipython_dirs():
    """Create default user directories to prevent potential race conditions
    downstream.
    """
    # Resolve the IPython directory once and reuse it for each setup step.
    ipython_dir = get_ipython_dir()
    utils.safe_makedir(ipython_dir)
    ProfileDir.create_profile_dir_by_name(ipython_dir)
    utils.safe_makedir(os.path.join(ipython_dir, "db"))
    utils.safe_makedir(os.path.join(locate_profile(), "db"))
def file_transaction(*data_and_files):
    """Wrap file generation in a transaction, moving to output if finishes.

    The initial argument can be the world descriptive `data` dictionary, or
    a `config` dictionary. This is used to identify global settings for
    temporary directories to create transactional files in.
    """
    # Sidecar index files that must travel with their parent output when the
    # transaction commits (e.g. .bai with .bam, .tbi with .vcf.gz).
    exts = {".vcf": ".idx", ".bam": ".bai", ".vcf.gz": ".tbi", ".bed.gz": ".tbi"}
    with _flatten_plus_safe(data_and_files) as (safe_names, orig_names):
        # remove any half-finished transactions
        _remove_files(safe_names)
        try:
            # Yield a single name bare, multiple names as a tuple, so callers
            # can unpack naturally in the `with ... as` clause.
            if len(safe_names) == 1:
                yield safe_names[0]
            else:
                yield tuple(safe_names)
        # failure -- delete any temporary files
        except:
            _remove_files(safe_names)
            _remove_tmpdirs(safe_names)
            raise
        # worked -- move the temporary files to permanent location
        else:
            for safe, orig in zip(safe_names, orig_names):
                if os.path.exists(safe):
                    utils.safe_makedir(os.path.dirname(orig))
                    # If we are rolling back a directory and it already exists
                    # this will avoid making a nested set of directories
                    if os.path.isdir(orig) and os.path.isdir(safe):
                        shutil.rmtree(orig)
                    _move_file_with_sizecheck(safe, orig)
                    # Move additional, associated files in the same manner
                    for check_ext, check_idx in exts.iteritems():
                        if safe.endswith(check_ext):
                            safe_idx = safe + check_idx
                            if os.path.exists(safe_idx):
                                _move_file_with_sizecheck(safe_idx, orig + check_idx)
            _remove_tmpdirs(safe_names)
] def get_tsv_path(base_dir, recitation_id): return "%s/RecitationData%d.tsv" % (base_dir, recitation_id) if __name__ == "__main__": args = parse_arguments() conn = create_db() import_data(conn, args.input_path) group_data(conn) page_ids = get_unique_pages(conn) out_dir = safe_makedir(args.output_path) # tsv previous_tsv = None if args.update_previous_recitation: previous_tsv = get_tsv_path(args.input_path, args.recitation_id) prepare_results(conn, previous_tsv, page_ids) insert_encoded_data(conn, encode_data(conn, 800, args.reference_width)) tsv_filename = get_tsv_path(out_dir, args.recitation_id) export_data_tsv(conn, args.recitation_id, tsv_filename) if previous_tsv and filecmp.cmp(previous_tsv, tsv_filename): print("No change was detected since the previous recitation, exiting") os.remove(tsv_filename) else: # sql
# ***** Align lines starts and ends ***************** index = 0 l1 = None drawMe = ImageDraw.Draw(image, "RGBA") for line in lines: lines[index] = ((0, line[0][1]), (background.size[0], line[1][1])) if l1 is not None and line[0][1] > (l1[1][1] + 1): lines[index] = ((lines[index][0][0], l1[1][1] + 1), (lines[index][1][0], lines[index][1][1])) l1 = lines[index] drawMe.rectangle(lines[index], fill=(r(), r(), r(), 100)) index += 1 del drawMe output_file = "%s%s.png" % (output_path, page_str.zfill(3)) image.save(output_file, "PNG") all_pages_lines[page] = lines return all_pages_lines if __name__ == "__main__": args = parse_arguments() output_path = safe_makedir(args.output_path + '/lines/') print("Splitting pages to lines into " + output_path + "...") lines = main_find_lines(input_path=args.input_path, output_path=output_path) save_lines(args.output_path, lines)