Ejemplo n.º 1
0
def _make_output_folders():
    """
    Create the output folder tree for the generated images.

    Tries 'generated_images1', 'generated_images2', ... until an unused
    root name is found, then creates one Color / Color_no_clutter / BW
    sub-folder per experimental condition.

    :return: the integer suffix of the root folder actually created
    """
    failure = True
    try_idx = 1
    while failure:
        try:
            mkdir('generated_images' + str(try_idx))
            for i in range(len(TARGETS_EXPECTATION_PERCENTAGE)):
                cond_str = '_expectancy_' + str(
                    TARGETS_EXPECTATION_PERCENTAGE[i]) + "_clutter_" + str(
                        DISTANCE_BETWEEN_IMAGES[i])
                mkdir(
                    osp_join('generated_images' + str(try_idx),
                             cond_str + '_Color'))
                mkdir(
                    osp_join('generated_images' + str(try_idx),
                             cond_str + '_Color_no_clutter'))
                mkdir(
                    osp_join('generated_images' + str(try_idx),
                             cond_str + '_BW'))
            # Only mark success once *all* sub-folders exist.  The original
            # cleared the flag before the inner loop, so a FileExistsError
            # raised while creating a sub-folder (e.g. duplicate condition
            # names) ended the retry loop and returned an index whose
            # folder tree was incomplete.
            failure = False
        except FileExistsError:
            try_idx += 1
    return try_idx
Ejemplo n.º 2
0
def _save_images(final_image, final_image_bw, final_image_no_clutter,
                 targets_im, targets_im_bw, image_num, try_idx):
    """Save one generated stimulus set to the per-condition output folders.

    Writes the color image, the clutter-free color image, the black/white
    image, and the target crops under generated_images<try_idx>.
    """
    cond_index = image_num // NUM_OF_IMAGES_TO_CREATE_PER_CONDITION
    cond_str = (
        '_expectancy_' + str(TARGETS_EXPECTATION_PERCENTAGE[cond_index])
        + "_clutter_" + str(DISTANCE_BETWEEN_IMAGES[cond_index]))
    root = "generated_images" + str(try_idx)
    color_dir_path = osp_join(root, cond_str + "_Color")
    color_no_clutter_dir_path = osp_join(root, cond_str + "_Color_no_clutter")
    bw_dir_path = osp_join(root, cond_str + "_BW")
    num = str(image_num + 1)
    final_image.convert('RGB').save(
        osp_join(curdir, color_dir_path, "image_" + num + ".png"), "PNG")
    final_image_no_clutter.convert('RGB').save(
        osp_join(curdir, color_no_clutter_dir_path, "image_" + num + ".png"),
        "PNG")
    # BW variants are saved in luminance+alpha mode.
    final_image_bw.convert('LA').save(
        osp_join(curdir, bw_dir_path, "image_bw_" + num + ".png"))
    targets_im_bw.convert('LA').save(
        osp_join(curdir, bw_dir_path, "targets_bw_" + num + ".png"))
    targets_im.save(
        osp_join(curdir, color_dir_path, "targets_" + num + ".png"))
    targets_im.save(
        osp_join(curdir, color_no_clutter_dir_path, "targets_" + num + ".png"))
Ejemplo n.º 3
0
 def search(self, db, query, outputDir):
     """Run hmmsearch and store its output under *outputDir*.

     :param db: passed as hmmsearch's first positional argument
                (presumably the HMM profile file -- TODO confirm)
     :param query: passed as the second positional argument
                   (presumably the sequence database -- TODO confirm)
     :param outputDir: created if missing; receives self.txtOut (the
                       --tblout/--domtblout table) and self.hmmOut
                       (redirected stdout)
     :raises HMMMERModeError: if self.mode is neither 'domtblout'
                              nor 'tblout'
     """
     # make the output dir and files
     if self.mode != 'domtblout' and self.mode != 'tblout':
         raise HMMMERModeError("Mode %s not compatible with search" % self.mode)
     makeSurePathExists(outputDir)
     txt_file = osp_join(outputDir, self.txtOut)
     hmm_file = osp_join(outputDir, self.hmmOut)

     # run hmmer!
     # NOTE(review): the command is built by string interpolation and run
     # through a shell -- paths containing spaces or shell metacharacters
     # will break or be interpreted; consider subprocess.run with an
     # argument list instead of os.system.
     system('hmmsearch --%s %s %s %s > %s' % (self.mode, txt_file, db, query, hmm_file))
def configuration(parent_package='', top_path=None):
    """Build the numpy.distutils configuration for the siesta extension."""
    from numpy.distutils.misc_util import Configuration
    from numpy.distutils.system_info import get_info
    from os.path import join as osp_join

    config = Configuration('siesta', parent_package, top_path)
    all_info = get_info('ALL')

    # Fortran sources compiled into the _siesta extension module.
    sources = ['free_unit.f90', 'write_hsx.f90', 'read_hsx.f90',
               'read_dm.f90', 'write_dm.f90', 'read_hs.f90',
               'read_tshs.f90', 'read_tsde.f90', 'write_tshs.f90',
               'read_grid.f90', 'write_grid.f90', 'write_gf.f90']
    src_paths = [osp_join('src', name) for name in sources]
    config.add_extension('_siesta', sources=src_paths, extra_info=all_info)

    config.add_data_dir('tests')
    config.make_config_py()
    return config
Ejemplo n.º 5
0
File: setup.py  Project: freude/sisl
def configuration(parent_package='', top_path=None):
    """numpy.distutils configuration for the siesta I/O extension.

    Skips building the Fortran extension on Read the Docs builds.
    """
    from numpy.distutils.misc_util import Configuration
    from numpy.distutils.system_info import get_info
    import os
    from os.path import join as osp_join

    config = Configuration('siesta', parent_package, top_path)
    all_info = get_info('ALL')

    # Fortran sources for the _siesta extension module.
    sources = ['free_unit.f90', 'siesta_sc_off.f90', 'write_hsx.f90',
               'read_hsx.f90', 'read_dm.f90', 'write_dm.f90',
               'read_hs.f90', 'read_tshs.f90', 'read_tsde.f90',
               'write_tshs.f90', 'read_grid.f90', 'write_grid.f90',
               'write_gf.f90']

    # Read the Docs cannot compile Fortran, so skip the extension there.
    on_rtd = os.environ.get('READTHEDOCS', 'false').lower() == 'true'
    if not on_rtd:
        config.add_extension('_siesta',
                             sources=[osp_join('src', f) for f in sources],
                             extra_info=all_info)
    config.add_data_dir('tests')
    config.make_config_py()
    return config
Ejemplo n.º 6
0
def configuration(parent_package='', top_path=None):
    """numpy.distutils configuration for the siesta reader extension."""
    from numpy.distutils.misc_util import Configuration
    from numpy.distutils.system_info import get_info
    from os.path import join as osp_join

    config = Configuration('siesta', parent_package, top_path)
    all_info = get_info('ALL')

    # Fortran sources for the _siesta extension module.
    sources = ['free_unit.f90', 'write_hsx.f90', 'read_hsx_header.f90',
               'read_hsx.f90', 'read_hs_header.f90', 'read_hs.f90',
               'read_tshs_sizes.f90', 'read_tshs_geom.f90',
               'read_tshs_cell.f90', 'read_tshs_es.f90',
               'read_tshs_version.f90']
    config.add_extension('_siesta',
                         sources=[osp_join('src', name) for name in sources],
                         extra_info=all_info)
    config.make_config_py()  # installs __config__.py
    return config
Ejemplo n.º 7
0
def configuration(parent_package='', top_path=None):
    """numpy.distutils configuration for the siesta _src extension."""
    from numpy.distutils.misc_util import Configuration
    from numpy.distutils.system_info import get_info
    import os
    from os.path import join as osp_join

    config = Configuration('siesta', parent_package, top_path)
    all_info = get_info('ALL')

    # Base modules plus one read/write pair per file type, then the
    # read-only formats.
    sources = ['io_m.f90', 'siesta_sc_off.f90']
    sources += [base + suffix
                for base in ('hsx', 'dm', 'tshs', 'grid', 'gf', 'tsde')
                for suffix in ('_read.f90', '_write.f90')]
    sources += [base + '_read.f90' for base in ('hs', 'wfsx')]

    # Only install the extension if not on READTHEDOCS
    if os.environ.get('READTHEDOCS', 'false').lower() != 'true':
        config.add_extension('_siesta',
                             sources=[osp_join('_src', s) for s in sources],
                             extra_info=all_info)
    config.add_data_dir('tests')
    config.make_config_py()
    return config
Ejemplo n.º 8
0
def _save_images_with_roi(final_images_with_roi,
                          final_images_no_clutter_with_roi,
                          final_images_bw_with_roi, image_num, try_idx):
    """Save the ROI-annotated variants of one generated image set.

    Each list holds one image per region of interest; files are numbered
    by image and ROI index under the per-condition output folders.
    """
    cond_index = image_num // NUM_OF_IMAGES_TO_CREATE_PER_CONDITION
    cond_str = (
        '_expectancy_' + str(TARGETS_EXPECTATION_PERCENTAGE[cond_index])
        + "_clutter_" + str(DISTANCE_BETWEEN_IMAGES[cond_index]))
    root = "generated_images" + str(try_idx)
    color_dir_path = osp_join(root, cond_str + "_Color")
    color_no_clutter_dir_path = osp_join(root, cond_str + "_Color_no_clutter")
    bw_dir_path = osp_join(root, cond_str + "_BW")
    num = str(image_num + 1)
    for i, image in enumerate(final_images_with_roi):
        fname = "image_" + num + "_with_roi_" + str(i + 1) + ".png"
        image.convert('RGB').save(osp_join(curdir, color_dir_path, fname),
                                  "PNG")
    for i, image in enumerate(final_images_no_clutter_with_roi):
        fname = "image_" + num + "_no_clutter_with_roi_" + str(i + 1) + ".png"
        image.convert('RGB').save(
            osp_join(curdir, color_no_clutter_dir_path, fname), "PNG")
    for i, image in enumerate(final_images_bw_with_roi):
        fname = "image_bw_" + num + "_with_roi_" + str(i + 1) + ".png"
        image.convert('RGB').save(osp_join(curdir, bw_dir_path, fname), "PNG")
Ejemplo n.º 9
0
def _create_locations_xl(locations_list, images, try_idx, image_num,
                         categories_dict):
    """Write an xlsx sheet listing every pasted image's name, target flag,
    category, and (x, y) location for one generated image."""
    xlsx_path = osp_join(curdir, "generated_images" + str(try_idx),
                         'target_locations' + str(image_num + 1) + '.xlsx')
    doc = xl.Workbook(xlsx_path)
    sheet = doc.add_worksheet('target locations')
    # Header row.
    for col, header in zip('ABCDE', ("Name", "Target", "Category", "X", "Y")):
        sheet.write(col + '1', header)
    # One row per pasted image, starting at spreadsheet row 2.
    for i, location in enumerate(locations_list):
        info = images[i].info
        if RELEVANT_CATEGORY_INFO_NAME1 in info:
            category = (info[RELEVANT_CATEGORY_INFO_NAME2] + '_'
                        + info[RELEVANT_CATEGORY_INFO_NAME1])
        else:
            category = info[RELEVANT_CATEGORY_INFO_NAME2]
        row = str(i + 2)
        sheet.write('A' + row, info['filename'])
        sheet.write('B' + row, 1 if info['target'] else 0)
        sheet.write('C' + row, category)
        sheet.write('D' + row, location[0])
        sheet.write('E' + row, location[1])
    doc.close()
Ejemplo n.º 10
0
def _load_images(images_path):
    """
    Load the PNG images in *images_path* and split them into targets and
    distractors, assigning each image a numeric category index.

    :param images_path: path to the image directory
    :return: tuple of (distractor dict keyed by category name,
             resized target list, original-size target list,
             category-name -> index dict)
    """
    distractor_images_dict = {}
    not_resized_targets = []
    targets = []
    categories_dict = {}
    categories_index = 1
    png_names = [
        str(name) for name in listdir(images_path) if name[-3:] == "png"
    ]
    for image_name in png_names:
        im = Image.open(osp_join(images_path, image_name)).convert('RGBA')
        _add_info_to_image(im, image_name)
        # Category key combines both info fields when the first is present.
        if RELEVANT_CATEGORY_INFO_NAME1 in im.info:
            key = (im.info[RELEVANT_CATEGORY_INFO_NAME2] + '_'
                   + im.info[RELEVANT_CATEGORY_INFO_NAME1])
        else:
            key = im.info[RELEVANT_CATEGORY_INFO_NAME2]
        if key not in categories_dict:
            categories_dict[key] = categories_index
            categories_index += 1
        im.info["cat_num"] = categories_dict[key]
        if image_name in TARGET_CATEGORIES:
            # Targets are kept twice: one copy gets resized in place below.
            targets.append(im)
            not_resized_targets.append(im)
        else:
            distractor_images_dict.setdefault(key, []).append(im)
    print("resizing images...")
    _resize_images(distractor_images_dict, targets)
    print("Categoriy mapping: ", categories_dict)
    return distractor_images_dict, targets, not_resized_targets, categories_dict
Ejemplo n.º 11
0
def configuration(parent_package="", top_path=None):
    """numpy.distutils configuration for the siesta extension.

    Also generates __config__.py via make_config_py().
    """
    from numpy.distutils.misc_util import Configuration
    from numpy.distutils.system_info import get_info
    from os.path import join as osp_join

    config = Configuration("siesta", parent_package, top_path)
    all_info = get_info("ALL")

    # Fortran sources for the _siesta extension module.
    fortran_files = ["free_unit.f90", "write_hsx.f90", "read_hsx_header.f90",
                     "read_hsx.f90", "read_hs_header.f90", "read_hs.f90",
                     "read_tshs_sizes.f90", "read_tshs_geom.f90",
                     "read_tshs_cell.f90", "read_tshs_es.f90",
                     "read_tshs_version.f90"]
    config.add_extension(
        "_siesta",
        sources=[osp_join("src", name) for name in fortran_files],
        extra_info=all_info,
    )
    config.make_config_py()  # installs __config__.py
    return config
Ejemplo n.º 12
0
    def __init__(self, c, title='Z-editor'):
        """Create a standalone Z-editor window for the selected node.

        Builds a QTextEdit editor plus a rendering pane (QWebView when
        available, otherwise a read-only QTextEdit), loads or synthesizes
        stylesheets, hooks up the Leo colorizer and key bindings, sets the
        window geometry/title, and registers idle-time update handlers.

        :param c: the Leo commander owning the node being edited
        :param title: default title hint; the real window title is rebuilt
                      below from the parent headline, headline, and gnx
        """
        # pylint: disable=too-many-locals
        global TAB2SPACES
        super().__init__()
        QWidget().__init__()

        # Remember the host commander, position, vnode, and body widget so
        # idle-time updates can sync edits back to the outline.
        self.c = c
        self.p = c.p
        self.v = c.p.v
        self.host_id = c.p.gnx
        w = c.frame.body.wrapper
        self.host_editor = w.widget
        self.switching = False
        self.closing = False

        self.reloadSettings()

        # The rendering pane can be either a QWebView or a QTextEdit
        # depending on the features desired
        if not QWebView:  # Until Qt6 has a QWebEngineView, force QTextEdit
            self.render_pane_type = NAV_VIEW
        if self.render_pane_type == NAV_VIEW:
            self.render_widget = QTextEdit
        else:
            self.render_widget = QWebView
            self.render_pane_type = BROWSER_VIEW

        self.editor = QTextEdit()
        browser = self.browser = self.render_widget()

        # Wrap the editor so Leo's key bindings work inside it.
        wrapper = qt_text.QTextEditWrapper(self.editor, name='zwin', c=c)
        c.k.completeAllBindingsForWidget(wrapper)

        #@+<<set stylesheet paths>>
        #@+node:tom.20210604170628.1: *4* <<set stylesheet paths>>
        self.editor_csspath = ''
        self.rst_csspath = ''

        home = g.app.loadManager.computeHomeDir()
        cssdir = osp_join(home, '.leo', 'css')
        #dict_ = g.app.loadManager.globalSettingsDict

        #is_dark = dict_.get_setting('color-theme-is-dark')
        is_dark = is_body_dark(self.c)
        if is_dark:
            self.editor_csspath = osp_join(cssdir, EDITOR_STYLESHEET_DARK_FILE)
            self.rst_csspath = osp_join(cssdir,
                                        RST_CUSTOM_STYLESHEET_DARK_FILE)
        else:
            self.editor_csspath = osp_join(cssdir,
                                           EDITOR_STYLESHEET_LIGHT_FILE)
            self.rst_csspath = osp_join(cssdir,
                                        RST_CUSTOM_STYLESHEET_LIGHT_FILE)

        # Normalize path separators for the current platform.
        if g.isWindows:
            self.editor_csspath = self.editor_csspath.replace('/', '\\')
            self.rst_csspath = self.rst_csspath.replace('/', '\\')
        else:
            self.editor_csspath = self.editor_csspath.replace('\\', '/')
            self.rst_csspath = self.rst_csspath.replace('\\', '/')

        #@-<<set stylesheet paths>>
        #@+<<set stylesheets>>
        #@+node:tom.20210615101103.1: *4* <<set stylesheets>>
        # Check if editor stylesheet file exists. If so,
        # we cache its contents.
        if exists(self.editor_csspath):
            with open(self.editor_csspath, encoding=ENCODING) as f:
                self.editor_style = f.read()
        else:
            self.editor_style = EDITOR_STYLESHEET_DARK if is_dark \
                                else EDITOR_STYLESHEET_LIGHT

        # If a stylesheet exists for RsT, we cache its contents.
        self.rst_stylesheet = None
        if exists(self.rst_csspath):
            with open(self.rst_csspath, encoding=ENCODING) as f:
                self.rst_stylesheet = f.read()
        else:
            self.rst_stylesheet = RST_STYLESHEET_DARK if is_dark \
                                  else RST_STYLESHEET_LIGHT
        #@-<<set stylesheets>>
        #@+<<set up editor>>
        #@+node:tom.20210602172856.1: *4* <<set up editor>>
        self.doc = self.editor.document()
        self.editor.setWordWrapMode(WrapMode.WrapAtWordBoundaryOrAnywhere)  # pylint: disable=no-member

        # Adjust editor stylesheet color to match body fg, bg
        fg, bg = get_body_colors(self.c)
        css = change_css_prop(self.editor_style, 'color', fg)
        css = change_css_prop(css, 'background', bg)
        self.editor_style = css
        self.editor.setStyleSheet(css)

        colorizer = leoColorizer.make_colorizer(c, self.editor)
        colorizer.highlighter.setDocument(self.doc)

        # Try to get tab width from the host's body
        # Used when writing edits back to host
        # "tabwidth" directive ought to be in first six lines
        lines = self.p.v.b.split('\n', 6)
        for line in lines:
            if line.startswith('@tabwidth') and line.find(' ') > 0:
                tabfield = line.split()[1]
                TAB2SPACES = abs(int(tabfield))
                break

        # Make tabs line up with 4 spaces (at least approximately)
        self.editor.setTabStopDistance(TABWIDTH)

        if self.render_pane_type == NAV_VIEW:
            # Different stylesheet mechanism if we are a QTextEdit
            stylesheet = RST_STYLESHEET_DARK if is_dark else RST_STYLESHEET_LIGHT
            browser.setReadOnly(True)
            browser_doc = browser.document()
            browser_doc.setDefaultStyleSheet(stylesheet)
        #@-<<set up editor>>
        #@+<<set up render button>>
        #@+node:tom.20210602173354.1: *4* <<set up render button>>
        self.render_button = QPushButton("Rendered <--> Plain")
        self.render_button.clicked.connect(self.switch_and_render)

        b_style = RENDER_BTN_STYLESHEET_DARK if is_dark \
            else RENDER_BTN_STYLESHEET_LIGHT
        self.render_button.setStyleSheet(b_style)
        #@-<<set up render button>>

        #@+<<build central widget>>
        #@+node:tom.20210528235126.1: *4* <<build central widget>>
        self.stacked_widget = QStackedWidget()
        self.stacked_widget.insertWidget(EDITOR, self.editor)
        self.stacked_widget.insertWidget(BROWSER, self.browser)

        layout = QVBoxLayout()
        layout.addWidget(self.render_button)
        layout.addWidget(self.stacked_widget)
        layout.setContentsMargins(0, 0, 0, 0)

        self.central_widget = central_widget = QWidget()
        central_widget.setLayout(layout)
        self.setCentralWidget(central_widget)

        #@-<<build central widget>>
        #@+<<set geometry>>
        #@+node:tom.20210528235451.1: *4* <<set geometry>>
        # Cascade successive windows downward so they don't stack exactly.
        Y_ = Y + (len(instances) % 10) * DELTA_Y
        self.setGeometry(QtCore.QRect(X, Y_, W, H))
        #@-<<set geometry>>
        #@+<<set window title>>
        #@+node:tom.20210531235412.1: *4* <<set window title>>
        # Show parent's title-->our title, our gnx
        ph = ''
        parents_ = list(c.p.parents())
        if parents_:
            ph = parents_[0].h + '-->'
        self.setWindowTitle(f'{ph}{c.p.h}   {c.p.gnx}')
        #@-<<set window title>>

        self.render_kind = EDITOR

        # Sync with the host outline on Leo's idle event.
        self.handlers = [('idle', self.update)]
        self._register_handlers()

        self.current_text = c.p.b
        self.editor.setPlainText(self.current_text)

        # Load docutils without rendering anything real
        # Avoids initial delay when switching to RsT the first time.
        if got_docutils:
            dummy = publish_string('dummy',
                                   writer_name='html').decode(ENCODING)
            self.browser.setHtml(dummy)
            central_widget.keyPressEvent = self.keyPressEvent

        self.show()
Ejemplo n.º 13
0
    def extractReads(self, bamFiles, 
                     prefix, 
                     targets, 
                     combineBams=False, 
                     pairsOnly=False, 
                     combine=False, 
                     shuffle=False, 
                     largeFiles=False, 
                     headersOnly = False,
                     dontTrustSamFlags=True,
                     folder='', 
                     verbose=True):
        """Extract reads from BAM files
        
        targets is a hash of type:
        { reference_name : bid }
        
        If bid == -1 then there is only one bin. This option is used by the command line
        caller so you can extract using a normal list 

        bamFiles: list of BAM file paths to parse
        prefix: string inserted into every output file name
        combineBams: write all BAMs' reads under one shared 'extracted' basename
        pairsOnly: drop unpaired reads instead of writing *_unpaired files
        combine: with shuffle, append unpaired reads to the paired file
        shuffle: interleave read1/read2 into a single file
        largeFiles: write plain files instead of gzip-compressed ones
        headersOnly: write only read names (forces shuffle and pairsOnly)
        dontTrustSamFlags: passed through to splitReadheader when pairing
        folder: output directory ('' means next to each BAM file)
        verbose: print progress information
        """
        # work out which files have been opened before so we don't nuke stuff!
        opened_files = {}

        if prefix != '':
            prefix = "_"+prefix

        # get a collection of storage points
        read_storage = {}
        bam_count = 0
        num_bams = len(bamFiles)
        for bf in bamFiles:
            seen_reads = {}
            read_storage[bam_count] = {}
            for bid in targets.values():
                read_storage[bam_count][bid] = [[],[],[]] # paired1, paired2, singles
            try:
                bam_file = pysam.Samfile(bf, 'rb')
                if verbose:
                    print "    Parsing BAM '%s' (%d of %d)" % (getBamDescriptor(bf), (bam_count+1), num_bams)
                
                # use these to work out if we need quality scores or not
                w_qual = 0
                wo_qual = 0
                
                # now get the reads associated with each target
                for reference, length in zip(bam_file.references, bam_file.lengths):
                    rl = ReadLoader()
                    bam_file.fetch(reference, 0, length, callback = rl )
                    for alignedRead in rl.alignedReads:
                        (query, end) = self.splitReadheader(alignedRead, dontTrustSamFlags=dontTrustSamFlags)
                        # we have basically thrown out all pairing information here
                        # so we make some assumptions:
                        #
                        # The first read we see in a pair is the first read
                        # We trust the reversed flag
                        # We don't mess with the relative orientation that follows
                        #
                        if query in seen_reads:
                            # get the raw reads
                            if headersOnly:
                                # no need to store all the information as we'll just be nuking it anyway
                                read1 = alignedRead.qname
                                read2 = seen_reads[query][0].qname
                            else:
                                # Reverse-complement reads flagged as reversed so
                                # output sequences are in original orientation.
                                if seen_reads[query][0].is_reverse:
                                    seen_read = (seen_reads[query][0].qname, self.revComp(seen_reads[query][0].seq), seen_reads[query][0].qual)
                                else:
                                    seen_read = (seen_reads[query][0].qname, seen_reads[query][0].seq, seen_reads[query][0].qual)
                                if alignedRead.is_reverse:
                                    align_read = (alignedRead.qname, self.revComp(alignedRead.seq), alignedRead.qual)
                                else:
                                    align_read = (alignedRead.qname, alignedRead.seq, alignedRead.qual)
    
                                # work out which is the first in the pair and which is the second
                                if end == 1:
                                    read1 = align_read
                                    read2 = seen_read
                                else:
                                    read1 = seen_read
                                    read2 = align_read

                                # check for quality info
                                if read1[2] is None:
                                    wo_qual += 1
                                else:
                                    w_qual += 1
                            
                            # put them in the storage
                            try:
                                bid1 = targets[bam_file.getrname(alignedRead.tid)]
                                read_storage[bam_count][bid1][0].append(read1)
                                read_storage[bam_count][bid1][1].append(read2)
                            except KeyError:
                                pass
                            # NOTE(review): if the lookup above raised KeyError,
                            # bid1 is unbound and the comparison below raises
                            # NameError (not caught here) -- verify both mates
                            # always map to known references, or guard bid1.
                            try:
                                bid2 = targets[bam_file.getrname(seen_reads[query][0].tid)]
                                if bid2 != bid1:
                                    read_storage[bam_count][bid2][0].append(read1)
                                    read_storage[bam_count][bid2][1].append(read2)
                            except KeyError:
                                pass
                            
                            # delete this guy so we can determine who the pairs are at the end
                            del seen_reads[query]
                        else:
                            seen_reads[query] = (alignedRead, end)
                
                # anything left in seen_reads never met its mate -> singles
                for ar in seen_reads:
                    alignedRead = seen_reads[ar][0]
                    if headersOnly:
                        read = alignedRead.qname
                    else:
                        if alignedRead.is_reverse:
                            read = (alignedRead.qname, self.revComp(alignedRead.seq), alignedRead.qual)
                        else:
                            read = (alignedRead.qname, alignedRead.seq, alignedRead.qual)
                    try:
                        bid = targets[bam_file.getrname(alignedRead.tid)]
                        read_storage[bam_count][bid][2].append(read)
                    except KeyError:
                        pass

                bam_file.close()
                
                # now we can write to file
                # work out file extension
                has_qual = w_qual > wo_qual
                if headersOnly:
                    extension = 'headers'
                    shuffle = True # force everything into one file
                    pairsOnly = True
                else:
                    if has_qual:
                        extension = "fq"
                    else:
                        extension = "fa"
                
                # work out compression
                if largeFiles:
                    fopen = open
                else:
                    fopen = gzip.open
                    extension += '.gz'

                # get a basename
                if combineBams:
                    # always use the same file
                    base_name = "extracted"
                else:
                    # need a different base name for each bam
                    base_name = getBamDescriptor(bf)

                if folder == '':
                    # use the bam file's folder
                    base_name = getBamStem(bf)
                else:
                    makeSurePathExists(folder)
                    base_name = osp_join(folder, base_name)

                unique_bids = {}
                for bid in targets.values():
                    try:
                        unique_bids[bid] += 1
                    except KeyError:
                         unique_bids[bid] = 1

                # progress dots, wrapped at 80 columns
                pretty_counter = 0
                print "    ",
                for bid in unique_bids:
                    print ".",
                    sys.stdout.flush()
                    pretty_counter += 1
                    if pretty_counter > 79:
                        pretty_counter = 0
                        print "\n    ",
                    if bid == -1:
                        # single bin, no need to be fancy
                        if shuffle:
                            # if we are going to shuffle reads then we don't need
                            # to open a second file handle
                            read1_fh = self.openFileCorrectly("%s%s.%s" % (base_name, prefix, extension), fopen, opened_files)
                            read2_fh = read1_fh
                        else:
                            read1_fh = self.openFileCorrectly("%s%s_1.%s" % (base_name, prefix, extension), fopen, opened_files)
                            read2_fh = self.openFileCorrectly("%s%s_2.%s" % (base_name, prefix, extension), fopen, opened_files)
                    else:
                        if shuffle:
                            read1_fh = self.openFileCorrectly("%s%s_BIN_%d.%s" % (base_name, prefix, bid, extension), fopen, opened_files)
                            read2_fh = read1_fh
                        else:
                            read1_fh = self.openFileCorrectly("%s%s_BIN_%d_1.%s" % (base_name, prefix, bid, extension), fopen, opened_files)
                            read2_fh = self.openFileCorrectly("%s%s_BIN_%d_2.%s" % (base_name, prefix, bid, extension), fopen, opened_files)
                            
                    close_unpaired_file = False
                    if not pairsOnly:
                        # we need to write the unpaired guys somewhere
                        if combine and shuffle:
                            # we can just add them to the end of read1_fh
                            unpaired_fh = read1_fh
                        else:
                            # either the user wanted things separate or they chose
                            # not to shuffle, either way a new fh is needed
                            close_unpaired_file = True
                            if bid == -1:
                                unpaired_fh = self.openFileCorrectly("%s%s_unpaired.%s" % (base_name, prefix, extension), fopen, opened_files)
                            else:
                                unpaired_fh = self.openFileCorrectly("%s%s_BIN_%d_unpaired.%s" % (base_name, prefix, bid, extension), fopen, opened_files)
                    
                    # now we print
                    if headersOnly:
                        headers = {} # pipe into a hash first so we don't get dupes
                        for i in range(len(read_storage[bam_count][bid][0])):
                            headers[read_storage[bam_count][bid][0][i]] = True
                            headers[read_storage[bam_count][bid][1][i]] = True
                        for i in range(len(read_storage[bam_count][bid][2])):
                            headers[read_storage[bam_count][bid][2][i]] = True
                        for header in headers:
                            read1_fh.write(header+"\n")
                    else:
                        for i in range(len(read_storage[bam_count][bid][0])):
                            read1_fh.write(self.formatRead(read_storage[bam_count][bid][0][i], has_qual))
                            read2_fh.write(self.formatRead(read_storage[bam_count][bid][1][i], has_qual))
                        if not pairsOnly:
                            for i in range(len(read_storage[bam_count][bid][2])):
                                unpaired_fh.write(self.formatRead(read_storage[bam_count][bid][2][i], has_qual))
                
                    read1_fh.close()
                    if not shuffle:
                        read2_fh.close()
                    if close_unpaired_file:
                        unpaired_fh.close()
                print "\n",
                
            except:
                print "Unable to open BAM file",bf,"-- did you supply a SAM file instead?"
                raise
            bam_count += 1
0
    def extractReads(self,
                     bamFiles,
                     prefix,
                     targets,
                     combineBams=False,
                     pairsOnly=False,
                     combine=False,
                     shuffle=False,
                     largeFiles=False,
                     headersOnly=False,
                     dontTrustSamFlags=True,
                     folder='',
                     verbose=True):
        """Extract reads from BAM files and write them to per-bin output files.

        targets is a hash of type:
        { reference_name : bid }

        If bid == -1 then there is only one bin. This option is used by the command line
        caller so you can extract using a normal list

        Parameters:
            bamFiles: iterable of BAM file paths to parse.
            prefix: string spliced into every output file name ('' for none).
            targets: mapping of reference (contig) name -> bin id, as above.
            combineBams: use one shared basename ("extracted") for all BAMs
                instead of one basename per BAM.
            pairsOnly: do not write unpaired (singleton) reads.
            combine: together with shuffle, append singletons to the same file
                as the pairs instead of a separate *_unpaired file.
            shuffle: interleave read1/read2 into one file rather than _1/_2.
            largeFiles: write uncompressed files; otherwise gzip + '.gz'.
            headersOnly: write only unique read names ('.headers' files);
                forces shuffle=True and pairsOnly=True below.
            dontTrustSamFlags: forwarded to splitReadheader when determining
                which end of a pair a read is.
            folder: output directory (created if missing); '' means derive the
                path from each BAM file's own location.
            verbose: print progress to stdout.

        Returns nothing; all output goes to disk. Re-raises any error hit
        while parsing a BAM (after printing a hint that it may be a SAM file).
        """
        # work out which files have been opened before so we don't nuke stuff!
        # (openFileCorrectly consults this so a file is truncated only on
        # first open, then appended to on later opens)
        opened_files = {}

        if prefix != '':
            prefix = "_" + prefix

        # get a collection of storage points
        # read_storage[bam_index][bid] accumulates reads before writing
        read_storage = {}
        bam_count = 0
        num_bams = len(bamFiles)
        for bf in bamFiles:
            # seen_reads holds the first-seen mate of each pair, keyed by the
            # pair's base query name; whatever is left over at the end of the
            # BAM is treated as unpaired
            seen_reads = {}
            read_storage[bam_count] = {}
            for bid in targets.values():
                read_storage[bam_count][bid] = [[], [], []
                                                ]  # paired1, paired2, singles
            try:
                bam_file = pysam.Samfile(bf, 'rb')
                if verbose:
                    print "    Parsing BAM '%s' (%d of %d)" % (
                        getBamDescriptor(bf), (bam_count + 1), num_bams)

                # use these to work out if we need quality scores or not
                w_qual = 0
                wo_qual = 0

                # now get the reads associated with each target
                for reference, length in zip(bam_file.references,
                                             bam_file.lengths):
                    # ReadLoader collects every alignment on this reference
                    rl = ReadLoader()
                    bam_file.fetch(reference, 0, length, callback=rl)
                    for alignedRead in rl.alignedReads:
                        # query is the pair's shared name; end is 1 or 2
                        (query, end) = self.splitReadheader(
                            alignedRead, dontTrustSamFlags=dontTrustSamFlags)
                        # we have basically thrown out all pairing information here
                        # so we make some assumptions:
                        #
                        # The first read we see in a pair is the first read
                        # We trust the reversed flag
                        # We don't mess with the relative orientation that follows
                        #
                        if query in seen_reads:
                            # second time we meet this name -> we have a pair
                            # get the raw reads
                            if headersOnly:
                                # no need to store all the information as we'll just be nuking it anyway
                                read1 = alignedRead.qname
                                read2 = seen_reads[query][0].qname
                            else:
                                # store each mate as (name, seq, qual),
                                # reverse-complementing if mapped on the
                                # reverse strand
                                if seen_reads[query][0].is_reverse:
                                    seen_read = (seen_reads[query][0].qname,
                                                 self.revComp(
                                                     seen_reads[query][0].seq),
                                                 seen_reads[query][0].qual)
                                else:
                                    seen_read = (seen_reads[query][0].qname,
                                                 seen_reads[query][0].seq,
                                                 seen_reads[query][0].qual)
                                if alignedRead.is_reverse:
                                    align_read = (alignedRead.qname,
                                                  self.revComp(
                                                      alignedRead.seq),
                                                  alignedRead.qual)
                                else:
                                    align_read = (alignedRead.qname,
                                                  alignedRead.seq,
                                                  alignedRead.qual)

                                # work out which is the first in the pair and which is the second
                                if end == 1:
                                    read1 = align_read
                                    read2 = seen_read
                                else:
                                    read1 = seen_read
                                    read2 = align_read

                                # check for quality info
                                # (tallied per pair; majority vote below
                                # decides fq vs fa output)
                                if read1[2] is None:
                                    wo_qual += 1
                                else:
                                    w_qual += 1

                            # put them in the storage
                            # file the pair under the bin of each mate's
                            # contig (possibly two different bins)
                            try:
                                bid1 = targets[bam_file.getrname(
                                    alignedRead.tid)]
                                read_storage[bam_count][bid1][0].append(read1)
                                read_storage[bam_count][bid1][1].append(read2)
                            except KeyError:
                                pass
                            # NOTE(review): if the lookup above raised
                            # KeyError, bid1 is stale from a previous pair
                            # (or unbound on the very first pair, raising
                            # NameError). A pair whose second mate's bin
                            # happens to equal that stale bid1 would be
                            # silently dropped -- looks like a latent bug;
                            # confirm intent.
                            try:
                                bid2 = targets[bam_file.getrname(
                                    seen_reads[query][0].tid)]
                                if bid2 != bid1:
                                    read_storage[bam_count][bid2][0].append(
                                        read1)
                                    read_storage[bam_count][bid2][1].append(
                                        read2)
                            except KeyError:
                                pass

                            # delete this guy so we can determine who the pairs are at the end
                            del seen_reads[query]
                        else:
                            seen_reads[query] = (alignedRead, end)

                # anything still in seen_reads never met its mate: store it
                # as a singleton in slot [2] of its bin
                for ar in seen_reads:
                    alignedRead = seen_reads[ar][0]
                    if headersOnly:
                        read = alignedRead.qname
                    else:
                        if alignedRead.is_reverse:
                            read = (alignedRead.qname,
                                    self.revComp(alignedRead.seq),
                                    alignedRead.qual)
                        else:
                            read = (alignedRead.qname, alignedRead.seq,
                                    alignedRead.qual)
                    try:
                        bid = targets[bam_file.getrname(alignedRead.tid)]
                        read_storage[bam_count][bid][2].append(read)
                    except KeyError:
                        # contig not in targets -> read is not wanted
                        pass

                bam_file.close()

                # now we can write to file
                # work out file extension
                # majority vote: emit fastq only if more pairs had quality
                # strings than not
                has_qual = w_qual > wo_qual
                if headersOnly:
                    extension = 'headers'
                    shuffle = True  # force everything into one file
                    pairsOnly = True
                else:
                    if has_qual:
                        extension = "fq"
                    else:
                        extension = "fa"

                # work out compression
                if largeFiles:
                    fopen = open
                else:
                    fopen = gzip.open
                    extension += '.gz'

                # get a basename
                if combineBams:
                    # always use the same file
                    base_name = "extracted"
                else:
                    # need a different base name for each bam
                    base_name = getBamDescriptor(bf)

                if folder == '':
                    # use the bam file's folder
                    # NOTE(review): this overwrites base_name regardless of
                    # the combineBams choice above, so combineBams has no
                    # effect when folder == '' -- confirm this is intended.
                    base_name = getBamStem(bf)
                else:
                    makeSurePathExists(folder)
                    base_name = osp_join(folder, base_name)

                # collapse targets.values() down to the distinct bin ids
                unique_bids = {}
                for bid in targets.values():
                    try:
                        unique_bids[bid] += 1
                    except KeyError:
                        unique_bids[bid] = 1

                # progress dots, wrapped at 80 per line
                pretty_counter = 0
                print "    ",
                for bid in unique_bids:
                    print ".",
                    sys.stdout.flush()
                    pretty_counter += 1
                    if pretty_counter > 79:
                        pretty_counter = 0
                        print "\n    ",
                    # open the output handle(s) for this bin; file names
                    # differ by bin id, shuffle mode and pairedness
                    if bid == -1:
                        # single bin, no need to be fancy
                        if shuffle:
                            # if we are going to shuffle reads then we don't need
                            # to open a second file handle
                            read1_fh = self.openFileCorrectly(
                                "%s%s.%s" % (base_name, prefix, extension),
                                fopen, opened_files)
                            read2_fh = read1_fh
                        else:
                            read1_fh = self.openFileCorrectly(
                                "%s%s_1.%s" % (base_name, prefix, extension),
                                fopen, opened_files)
                            read2_fh = self.openFileCorrectly(
                                "%s%s_2.%s" % (base_name, prefix, extension),
                                fopen, opened_files)
                    else:
                        if shuffle:
                            read1_fh = self.openFileCorrectly(
                                "%s%s_BIN_%d.%s" %
                                (base_name, prefix, bid, extension), fopen,
                                opened_files)
                            read2_fh = read1_fh
                        else:
                            read1_fh = self.openFileCorrectly(
                                "%s%s_BIN_%d_1.%s" %
                                (base_name, prefix, bid, extension), fopen,
                                opened_files)
                            read2_fh = self.openFileCorrectly(
                                "%s%s_BIN_%d_2.%s" %
                                (base_name, prefix, bid, extension), fopen,
                                opened_files)

                    close_unpaired_file = False
                    if not pairsOnly:
                        # we need to write the unpaired guys somewhere
                        if combine and shuffle:
                            # we can just add them to the end of read1_fh
                            unpaired_fh = read1_fh
                        else:
                            # either the user wanted things separate or they chose
                            # not to shuffle, either way a new fh is needed
                            close_unpaired_file = True
                            if bid == -1:
                                unpaired_fh = self.openFileCorrectly(
                                    "%s%s_unpaired.%s" %
                                    (base_name, prefix, extension), fopen,
                                    opened_files)
                            else:
                                unpaired_fh = self.openFileCorrectly(
                                    "%s%s_BIN_%d_unpaired.%s" %
                                    (base_name, prefix, bid, extension), fopen,
                                    opened_files)

                    # now we print
                    if headersOnly:
                        headers = {
                        }  # pipe into a hash first so we don't get dupes
                        for i in range(len(read_storage[bam_count][bid][0])):
                            headers[read_storage[bam_count][bid][0][i]] = True
                            headers[read_storage[bam_count][bid][1][i]] = True
                        for i in range(len(read_storage[bam_count][bid][2])):
                            headers[read_storage[bam_count][bid][2][i]] = True
                        for header in headers:
                            read1_fh.write(header + "\n")
                    else:
                        # paired slots [0] and [1] are parallel lists, so one
                        # index drives both writes
                        for i in range(len(read_storage[bam_count][bid][0])):
                            read1_fh.write(
                                self.formatRead(
                                    read_storage[bam_count][bid][0][i],
                                    has_qual))
                            read2_fh.write(
                                self.formatRead(
                                    read_storage[bam_count][bid][1][i],
                                    has_qual))
                        if not pairsOnly:
                            for i in range(len(
                                    read_storage[bam_count][bid][2])):
                                unpaired_fh.write(
                                    self.formatRead(
                                        read_storage[bam_count][bid][2][i],
                                        has_qual))

                    read1_fh.close()
                    if not shuffle:
                        # read2_fh is the same handle as read1_fh when
                        # shuffling, so only close it separately otherwise
                        read2_fh.close()
                    if close_unpaired_file:
                        unpaired_fh.close()
                print "\n",

            except:
                # NOTE(review): bare except -- catches everything (including
                # KeyboardInterrupt) but does re-raise after the hint below,
                # so nothing is swallowed.
                print "Unable to open BAM file", bf, "-- did you supply a SAM file instead?"
                raise
            bam_count += 1