Example #1
def prep_r2_with_barcode(fq1, fq2, out_file):

    safe_makedir(os.path.dirname(out_file))
    if file_exists(out_file):
        print("%s and %s have already been barcode-prepped, skipping."
              % (fq1, fq2))
        return out_file

    with open_fastq(fq1) as r1_file, open_fastq(fq2) as r2_file:
        with file_transaction(out_file) as tx_out_file:
            out_handle = open(tx_out_file, "w")
            read_count = 0
            buf = list()
            # each FASTQ record spans four lines (header, sequence, "+",
            # quality), so pull three more paired lines after each header pair
            r1_r2 = itertools.izip(r1_file, r2_file)
            for header1, header2 in r1_r2:
                seq1, seq2 = r1_r2.next()
                plus1, plus2 = r1_r2.next()
                qual1, qual2 = r1_r2.next()

                read_name1, read_name2 = header1.split()[0][1:], header2.split()[0][1:]
                assert read_name1 == read_name2, "FASTQ files may be out of order."
                seq2, qual2 = seq2.rstrip(), qual2.rstrip()
                # the barcode is read 1 with its low-quality bases masked
                # (first 6 bp at min_qual=10); sequence and quality come from read 2
                barcode, seq, qual = mask(seq1[0:6], qual1[0:6], min_qual=10) + \
                                     mask(seq1[6:], qual1[6:]), seq2, qual2
                barcoded_name = ":".join([read_name2, barcode])

                print(format_fastq([barcoded_name, seq, qual]), file=out_handle)
            out_handle.close()
    return out_file
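
All of these snippets exercise a safe_makedir helper from their project's utils module. A minimal sketch of what such a helper typically does, assuming it only needs to create the directory when it is missing and return the path; the real implementations may add retry or permission handling:

import os

def safe_makedir(dname):
    """Create a directory if it does not exist, returning the path.

    Minimal sketch only; tolerates the directory being created by another
    process between the existence check and the makedirs call.
    """
    if dname and not os.path.isdir(dname):
        try:
            os.makedirs(dname)
        except OSError:
            if not os.path.isdir(dname):
                raise
    return dname
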
Example #2
    def preprocess_audio(self):
        """
        Copy the Merged Arabic Corpus of Isolated Words into their
        associated directory. The whole audio data will be in 'data'
        directory, the enrolled data only will be in 'enroll', and the
        test data will be in 'test'.
        """
        # remove the data directory if it exists
        if os.path.exists(self.data_dir):
            shutil.rmtree(self.data_dir)
        # iterate over speakers
        speakers = sorted(os.listdir(self.conf['inpath']))
        for sp in tqdm(speakers, desc="Converting Audio"):
            speaker_path = os.path.join(self.conf['inpath'], sp)
            wav_filenames = os.listdir(speaker_path)
            for wav in wav_filenames:
                inwav = os.path.join(speaker_path, wav)
                outwav = os.path.join(self.data_dir, wav)

                convert_wav(inwav,
                            outwav,
                            no_channels=self.conf['no_channels'],
                            sampling_rate=self.conf['sampling_rate'],
                            bit_precision=self.conf['bit_precision'])

        # remove the enroll directory if it exists
        if os.path.exists(self.enroll_dir):
            shutil.rmtree(self.enroll_dir)
        # remove the test directory if it exists
        if os.path.exists(self.test_dir):
            shutil.rmtree(self.test_dir)

        # create the audio/enroll directory
        safe_makedir(self.enroll_dir)
        # create the audio/test directory
        safe_makedir(self.test_dir)

        # parse the number of sessions from the configuration
        enroll_sessions = self.conf['enroll_sessions']
        test_sessions = self.conf['test_sessions']
        assert enroll_sessions + test_sessions <= 10,\
            "The summation of all sessions must be less than or equal to 10!!"
        # iterate over all preprocessed waves
        wav_filenames = os.listdir(self.data_dir)
        for wav in tqdm(wav_filenames, desc="Copying enroll/test waves"):
            _, sess, _, _ = wav.split(".")
            inwav = os.path.join(self.data_dir, wav)
            if int(sess) <= enroll_sessions:
                outwav = os.path.join(self.enroll_dir, wav)
                shutil.copyfile(inwav, outwav)
            elif int(sess) <= enroll_sessions + test_sessions:
                outwav = os.path.join(self.test_dir, wav)
                shutil.copyfile(inwav, outwav)
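
The enroll/test split above relies on each wave filename carrying the session number as the second of four dot-separated fields; a small illustration with a made-up filename (the corpus' real naming scheme may differ):

# Hypothetical filename following a <speaker>.<session>.<word>.wav pattern.
wav = "S01.03.07.wav"
speaker, sess, word, ext = wav.split(".")
session_number = int(sess)  # -> 3
# With enroll_sessions = 8 and test_sessions = 2, session 3 falls in the
# enroll range (3 <= 8), so this file would be copied into enroll_dir.
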
Example #3
def create_rmd(summary_fn):
    root_path, fn = os.path.split(os.path.abspath(summary_fn))
    basedir = os.path.join(root_path, "report")
    safe_makedir(basedir)
    out_file = os.path.join(root_path, fn.replace(".csv", "_re.csv"))
    with open(summary_fn) as in_handle:
        with open(out_file, 'w') as out_handle:
            for line in in_handle:
                cols = line.strip().split(",")
                # rewrite any column that is an existing path as a path
                # relative to the summary file's directory
                fix_line = ",".join([os.path.relpath(c, root_path) if os.path.exists(c) else c for c in cols])
                print >>out_handle, fix_line
    report_file = modify_report(root_path, out_file)

    return out_file, report_file
Example #4
def _flatten_plus_safe(rollback_files):
    """Flatten names of files and create temporary file names.
    """
    tx_files, orig_files = [], []
    for fnames in rollback_files:
        if isinstance(fnames, basestring):
            fnames = [fnames]
        for fname in fnames:
            basedir = utils.safe_makedir(os.path.join(os.path.dirname(fname), "tx"))
            tmpdir = utils.safe_makedir(tempfile.mkdtemp(dir=basedir))
            tx_file = os.path.join(tmpdir, os.path.basename(fname))
            tx_files.append(tx_file)
            orig_files.append(fname)
    return tx_files, orig_files
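
A hedged illustration of what the flattening produces, assuming the function above (with its utils and tempfile imports) is importable; each original file gets a counterpart inside its own freshly created tx/<random> directory next to it:

import os
import tempfile

# Made-up inputs: a mix of a single name and a nested list, both of which
# the function accepts.
scratch = tempfile.mkdtemp()
rollback_files = [os.path.join(scratch, "result.vcf"),
                  [os.path.join(scratch, "a.bam"), os.path.join(scratch, "b.bam")]]
tx_files, orig_files = _flatten_plus_safe(rollback_files)
# orig_files -> the three paths above, flattened into one list
# tx_files   -> the same basenames, each inside its own "<scratch>/tx/<random>/"
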
Example #5
    def preprocess_audio(self):

        # remove the data directory if it exists
        if os.path.exists(self.data_dir):
            shutil.rmtree(self.data_dir)
        # iterate over speakers
        speakers = sorted(os.listdir(self.conf['inpath']))
        for sp in tqdm(speakers, desc="Converting Audio"):
            speaker_path = os.path.join(self.conf['inpath'], sp)
            wav_filenames = os.listdir(speaker_path)
            for wav in wav_filenames:
                inwav = os.path.join(speaker_path, wav)
                outwav = os.path.join(self.data_dir, wav)
                convert_wav(inwav,
                            outwav,
                            no_channels=self.conf['no_channels'],
                            sampling_rate=self.conf['sampling_rate'],
                            bit_precision=self.conf['bit_precision'])
        
        # remove the enroll directory if it exists
        if os.path.exists(self.enroll_dir):
            shutil.rmtree(self.enroll_dir)
        # remove the test directory if it exists
        if os.path.exists(self.test_dir):
            shutil.rmtree(self.test_dir)

        # create the audio/enroll directory
        safe_makedir(self.enroll_dir)
        # create the audio/test directory
        safe_makedir(self.test_dir)

        # parse the number of sessions from the configuration
        enroll_sessions = self.conf['enroll_sessions']
        test_sessions = self.conf['test_sessions']
        assert enroll_sessions + test_sessions <= 10,\
            "The summation of all sessions must be less than or equal to 10!!"

        # iterate over all preprocessed waves
        wav_filenames = os.listdir(self.data_dir)
        for wav in tqdm(wav_filenames, desc="Copying enroll/test waves"):
            _, sess, _, _ = wav.split(".")
            inwav = os.path.join(self.data_dir, wav)
            if int(sess) <= enroll_sessions:
                outwav = os.path.join(self.enroll_dir, wav)
                shutil.copyfile(inwav, outwav)
            elif int(sess) <= enroll_sessions+test_sessions:
                outwav = os.path.join(self.test_dir, wav)
                shutil.copyfile(inwav, outwav)
Example #6
def create_rmd(summary_fn):
    root_path, fn = os.path.split(os.path.abspath(summary_fn))
    basedir = os.path.join(root_path, "report")
    safe_makedir(basedir)
    out_file = os.path.join(root_path, fn.replace(".csv", "_re.csv"))
    with open(summary_fn) as in_handle:
        with open(out_file, 'w') as out_handle:
            for line in in_handle:
                cols = line.strip().split(",")
                # rewrite any column that is an existing path as a path
                # relative to the summary file's directory
                fix_line = ",".join([
                    os.path.relpath(c, root_path) if os.path.exists(c) else c
                    for c in cols
                ])
                print >> out_handle, fix_line
    report_file = modify_report(root_path, out_file)

    return out_file, report_file
Example #7
def tx_tmpdir(data=None, base_dir=None, remove=True):
    """Context manager to create and remove a transactional temporary directory.

    Handles creating a transactional directory for running commands in. Will
    use either the current directory or a configured temporary directory.

    Creates an intermediary location and time specific directory for global
    temporary directories to prevent collisions.

    data can be the full world information object being processed or a
    configuration dictionary.
    """
    if data and "config" in data:
        config_tmpdir = tz.get_in(("config", "resources", "tmp", "dir"), data)
    elif data:
        config_tmpdir = tz.get_in(("resources", "tmp", "dir"), data)
    else:
        config_tmpdir = None
    if config_tmpdir:
        config_tmpdir = utils.safe_makedir(os.path.expandvars(config_tmpdir))
        config_tmpdir = os.path.normpath(os.path.join(os.getcwd(), config_tmpdir))
        tmp_dir_base = os.path.join(config_tmpdir, "bcbiotx", str(uuid.uuid4()))
        unique_attempts = 0
        while os.path.exists(tmp_dir_base):
            if unique_attempts > 5:
                break
            tmp_dir_base = os.path.join(config_tmpdir, "bcbiotx", str(uuid.uuid4()))
            time.sleep(1)
            unique_attempts += 1
    elif base_dir is not None:
        tmp_dir_base = os.path.join(base_dir, "tx")
    else:
        tmp_dir_base = os.path.join(os.getcwd(), "tx")
    utils.safe_makedir(tmp_dir_base)
    tmp_dir = tempfile.mkdtemp(dir=tmp_dir_base)
    utils.safe_makedir(tmp_dir)
    try:
        yield tmp_dir
    finally:
        if remove:
            for dname in [tmp_dir, tmp_dir_base if config_tmpdir else None]:
                if dname and os.path.exists(dname):
                    try:
                        shutil.rmtree(dname, ignore_errors=True)
                    except:
                        pass
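
In bcbio this generator is wrapped as a context manager (the @contextlib.contextmanager decorator is not shown in the snippet). A hedged usage sketch under that assumption, writing a throwaway scratch file:

import os

with tx_tmpdir(base_dir=".") as tmp_dir:
    # scratch output lives under ./tx/<random>/ while the block runs
    scratch_file = os.path.join(tmp_dir, "intermediate.txt")
    with open(scratch_file, "w") as handle:
        handle.write("temporary work\n")
# on exit the temporary directory is removed because remove=True by default
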
Example #8
def launch_training_job(model_dir, data_dir, job_name, params):
    """Launch training of the model with a set of hyperparameters in parent_dir/job_name
    Args:
        model_dir: (string) directory containing config, weights and log
        data_dir: (string) directory containing the dataset
        job_name: (string) name of the experiment to search hyperparameters
        params: (dict) containing hyperparameters
    """
    # Create a new folder under model_dir named after job_name
    model_dir = os.path.join(model_dir, job_name)
    utils.safe_makedir(model_dir)

    # Write parameters in json file
    json_path = os.path.join(model_dir, 'params.json')
    params.save(json_path)

    # Launch training with this config
    cmd = "{python} train.py --model_dir={model_dir} --data_dir {data_dir}".format(
        python=PYTHON, model_dir=model_dir, data_dir=data_dir)
    print(cmd)
    check_call(cmd, shell=True)
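
A typical caller sweeps one hyperparameter and gives each run its own job name. A hedged sketch in which the directories, the candidate values, and the params object (assumed to behave like the one above: settable attributes plus a save() method) are all placeholders:

# Hypothetical sweep over learning rates; params is assumed to have been
# loaded elsewhere, e.g. params = Params("experiments/base/params.json").
for lr in [1e-4, 1e-3, 1e-2]:
    params.learning_rate = lr
    job_name = "learning_rate_{}".format(lr)
    launch_training_job("experiments", "data/dataset", job_name, params)
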
Example #9
def _create_base_ipython_dirs():
    """Create default user directories to prevent potential race conditions downstream.
    """
    utils.safe_makedir(get_ipython_dir())
    ProfileDir.create_profile_dir_by_name(get_ipython_dir())
    utils.safe_makedir(os.path.join(get_ipython_dir(), "db"))
    utils.safe_makedir(os.path.join(locate_profile(), "db"))
Example #10
def file_transaction(*data_and_files):
    """Wrap file generation in a transaction, moving to output if finishes.

    The initial argument can be the world descriptive `data` dictionary, or
    a `config` dictionary. This is used to identify global settings for
    temporary directories to create transactional files in.
    """
    exts = {".vcf": ".idx", ".bam": ".bai", ".vcf.gz": ".tbi", ".bed.gz": ".tbi"}
    with _flatten_plus_safe(data_and_files) as (safe_names, orig_names):
        _remove_files(safe_names)  # remove any half-finished transactions
        try:
            if len(safe_names) == 1:
                yield safe_names[0]
            else:
                yield tuple(safe_names)
        except:  # failure -- delete any temporary files
            _remove_files(safe_names)
            _remove_tmpdirs(safe_names)
            raise
        else:  # worked -- move the temporary files to permanent location
            for safe, orig in zip(safe_names, orig_names):
                if os.path.exists(safe):
                    utils.safe_makedir(os.path.dirname(orig))
                    # If we are rolling back a directory and it already exists
                    # this will avoid making a nested set of directories
                    if os.path.isdir(orig) and os.path.isdir(safe):
                        shutil.rmtree(orig)

                    _move_file_with_sizecheck(safe, orig)
                    # Move additional, associated files in the same manner
                    for check_ext, check_idx in exts.iteritems():
                        if safe.endswith(check_ext):
                            safe_idx = safe + check_idx
                            if os.path.exists(safe_idx):
                                _move_file_with_sizecheck(safe_idx, orig + check_idx)
            _remove_tmpdirs(safe_names)
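
Typical usage mirrors Example #1 above: wrap the writing of an output file so a failure leaves no partial result behind (the generator here is presumably wrapped with contextlib.contextmanager, which the snippet does not show). A hedged sketch with a made-up output path:

# Made-up output path for illustration.
out_file = "results/variants.vcf"
with file_transaction(out_file) as tx_out_file:
    # write to the transactional path; it is moved to out_file (along with
    # any matching ".idx"/".tbi" companion) only if the block completes
    # without raising
    with open(tx_out_file, "w") as out_handle:
        out_handle.write("##fileformat=VCFv4.2\n")
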
Example #11
    ]


def get_tsv_path(base_dir, recitation_id):
    return "%s/RecitationData%d.tsv" % (base_dir, recitation_id)


if __name__ == "__main__":
    args = parse_arguments()

    conn = create_db()

    import_data(conn, args.input_path)
    group_data(conn)
    page_ids = get_unique_pages(conn)
    out_dir = safe_makedir(args.output_path)

    # tsv
    previous_tsv = None
    if args.update_previous_recitation:
        previous_tsv = get_tsv_path(args.input_path, args.recitation_id)
    prepare_results(conn, previous_tsv, page_ids)
    insert_encoded_data(conn, encode_data(conn, 800, args.reference_width))
    tsv_filename = get_tsv_path(out_dir, args.recitation_id)
    export_data_tsv(conn, args.recitation_id, tsv_filename)

    if previous_tsv and filecmp.cmp(previous_tsv, tsv_filename):
        print("No change was detected since the previous recitation, exiting")
        os.remove(tsv_filename)
    else:
        # sql
Example #12
        # ***** Align line starts and ends ******************
        index = 0
        l1 = None
        drawMe = ImageDraw.Draw(image, "RGBA")
        for line in lines:
            lines[index] = ((0, line[0][1]), (background.size[0], line[1][1]))
            if l1 is not None and line[0][1] > (l1[1][1] + 1):
                lines[index] = ((lines[index][0][0], l1[1][1] + 1),
                                (lines[index][1][0], lines[index][1][1]))
            l1 = lines[index]
            drawMe.rectangle(lines[index], fill=(r(), r(), r(), 100))
            index += 1
        del drawMe
        output_file = "%s%s.png" % (output_path, page_str.zfill(3))
        image.save(output_file, "PNG")
        all_pages_lines[page] = lines
    return all_pages_lines


if __name__ == "__main__":
    args = parse_arguments()

    output_path = safe_makedir(args.output_path + '/lines/')

    print("Splitting pages to lines into " + output_path + "...")
    lines = main_find_lines(input_path=args.input_path,
                            output_path=output_path)

    save_lines(args.output_path, lines)