def _make_output_folders():
    """ create the output folders """
    failure = True
    try_idx = 1
    while failure:
        try:
            mkdir('generated_images' + str(try_idx))
            failure = False
            for i in range(len(TARGETS_EXPECTATION_PERCENTAGE)):
                cond_str = '_expectancy_' + str(
                    TARGETS_EXPECTATION_PERCENTAGE[i]) + "_clutter_" + str(
                        DISTANCE_BETWEEN_IMAGES[i])
                mkdir(
                    osp_join('generated_images' + str(try_idx),
                             cond_str + '_Color'))
                mkdir(
                    osp_join('generated_images' + str(try_idx),
                             cond_str + '_Color_no_clutter'))
                mkdir(
                    osp_join('generated_images' + str(try_idx),
                             cond_str + '_BW'))
        except FileExistsError:
            try_idx += 1
    return try_idx
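# Usage sketch for _make_output_folders(). The imports and constant values
# below are assumptions inferred from the function body, not taken from the
# source module:
#
#     from os import mkdir, curdir, listdir
#     from os.path import join as osp_join
#     TARGETS_EXPECTATION_PERCENTAGE = [20, 50, 80]  # one entry per condition
#     DISTANCE_BETWEEN_IMAGES = [1, 2, 3]            # paired with the above
#
#     try_idx = _make_output_folders()   # e.g. 1 on a clean working directory
#
# The returned suffix is then threaded through _save_images() and
# _save_images_with_roi() below, so every writer targets the same
# 'generated_images<try_idx>' tree.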
def _save_images(final_image, final_image_bw, final_image_no_clutter,
                 targets_im, targets_im_bw, image_num, try_idx):
    cond_index = image_num // NUM_OF_IMAGES_TO_CREATE_PER_CONDITION
    cond_str = '_expectancy_' + str(
        TARGETS_EXPECTATION_PERCENTAGE[cond_index]) + "_clutter_" + str(
            DISTANCE_BETWEEN_IMAGES[cond_index])
    color_dir_path = osp_join("generated_images" + str(try_idx),
                              cond_str + "_Color")
    color_no_clutter_dir_path = osp_join("generated_images" + str(try_idx),
                                         cond_str + "_Color_no_clutter")
    bw_dir_path = osp_join("generated_images" + str(try_idx),
                           cond_str + "_BW")
    final_image.convert('RGB').save(
        osp_join(curdir, color_dir_path,
                 "image_" + str(image_num + 1) + ".png"), "PNG")
    final_image_no_clutter.convert('RGB').save(
        osp_join(curdir, color_no_clutter_dir_path,
                 "image_" + str(image_num + 1) + ".png"), "PNG")
    final_image_bw.convert('LA').save(
        osp_join(curdir, bw_dir_path,
                 "image_bw_" + str(image_num + 1) + ".png"))
    targets_im_bw.convert('LA').save(
        osp_join(curdir, bw_dir_path,
                 "targets_bw_" + str(image_num + 1) + ".png"))
    targets_im.save(
        osp_join(curdir, color_dir_path,
                 "targets_" + str(image_num + 1) + ".png"))
    targets_im.save(
        osp_join(curdir, color_no_clutter_dir_path,
                 "targets_" + str(image_num + 1) + ".png"))
def search(self, db, query, outputDir):
    """Run hmmsearch"""
    # make the output dir and files
    if self.mode != 'domtblout' and self.mode != 'tblout':
        raise HMMMERModeError("Mode %s not compatible with search" %
                              self.mode)
    makeSurePathExists(outputDir)
    txt_file = osp_join(outputDir, self.txtOut)
    hmm_file = osp_join(outputDir, self.hmmOut)

    # run hmmer!
    system('hmmsearch --%s %s %s %s > %s' %
           (self.mode, txt_file, db, query, hmm_file))
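# Usage sketch for search() above; the wrapper class name and attribute
# values are hypothetical, inferred from the method body:
#
#     runner = HmmerRunner(mode='domtblout',          # or 'tblout'
#                          txtOut='hmmer_hits.txt',
#                          hmmOut='hmmer_out.txt')
#     runner.search('markers.hmm', 'proteins.faa', 'hmmer_results')
#
# Note that system() interpolates the four paths unquoted, so paths
# containing spaces or shell metacharacters would need quoting (or a
# subprocess.run() argument list) in a hardened version.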
def configuration(parent_package='', top_path=None):
    from numpy.distutils.misc_util import Configuration
    from numpy.distutils.system_info import get_info
    from os.path import join as osp_join
    config = Configuration('siesta', parent_package, top_path)

    all_info = get_info('ALL')
    sources = [
        'free_unit.f90',
        'write_hsx.f90',
        'read_hsx.f90',
        'read_dm.f90',
        'write_dm.f90',
        'read_hs.f90',
        'read_tshs.f90',
        'read_tsde.f90',
        'write_tshs.f90',
        'read_grid.f90',
        'write_grid.f90',
        'write_gf.f90',
    ]
    config.add_extension('_siesta',
                         sources=[osp_join('src', s) for s in sources],
                         extra_info=all_info)
    config.add_data_dir('tests')
    config.make_config_py()
    return config
def configuration(parent_package='', top_path=None):
    from numpy.distutils.misc_util import Configuration
    from numpy.distutils.system_info import get_info
    import os
    from os.path import join as osp_join
    config = Configuration('siesta', parent_package, top_path)

    all_info = get_info('ALL')
    sources = [
        'free_unit.f90',
        'siesta_sc_off.f90',
        'write_hsx.f90',
        'read_hsx.f90',
        'read_dm.f90',
        'write_dm.f90',
        'read_hs.f90',
        'read_tshs.f90',
        'read_tsde.f90',
        'write_tshs.f90',
        'read_grid.f90',
        'write_grid.f90',
        'write_gf.f90',
    ]
    # Only install the extension if not on READTHEDOCS
    if os.environ.get('READTHEDOCS', 'false').lower() != 'true':
        config.add_extension('_siesta',
                             sources=[osp_join('src', s) for s in sources],
                             extra_info=all_info)
    config.add_data_dir('tests')
    config.make_config_py()
    return config
def configuration(parent_package='', top_path=None):
    from numpy.distutils.misc_util import Configuration
    from numpy.distutils.system_info import get_info
    from os.path import join as osp_join
    config = Configuration('siesta', parent_package, top_path)

    all_info = get_info('ALL')
    sources = [
        'free_unit.f90',
        'write_hsx.f90',
        'read_hsx_header.f90',
        'read_hsx.f90',
        'read_hs_header.f90',
        'read_hs.f90',
        'read_tshs_sizes.f90',
        'read_tshs_geom.f90',
        'read_tshs_cell.f90',
        'read_tshs_es.f90',
        'read_tshs_version.f90',
    ]
    config.add_extension('_siesta',
                         sources=[osp_join('src', s) for s in sources],
                         extra_info=all_info)
    config.make_config_py()  # installs __config__.py
    return config
def configuration(parent_package='', top_path=None):
    from numpy.distutils.misc_util import Configuration
    from numpy.distutils.system_info import get_info
    import os
    from os.path import join as osp_join
    config = Configuration('siesta', parent_package, top_path)

    all_info = get_info('ALL')
    sources = ['io_m.f90', 'siesta_sc_off.f90']
    for f in ['hsx', 'dm', 'tshs', 'grid', 'gf', 'tsde']:
        sources.extend([f + '_read.f90', f + '_write.f90'])
    for f in ['hs', 'wfsx']:
        sources.append(f + '_read.f90')

    # Only install the extension if not on READTHEDOCS
    if os.environ.get('READTHEDOCS', 'false').lower() != 'true':
        config.add_extension('_siesta',
                             sources=[osp_join('_src', s) for s in sources],
                             extra_info=all_info)
    config.add_data_dir('tests')
    config.make_config_py()
    return config
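# For context: a numpy.distutils setup script typically consumes a
# configuration() function like the variants above through this standard
# entry point (a minimal sketch; note that numpy.distutils itself is
# deprecated in recent NumPy releases):
if __name__ == '__main__':
    from numpy.distutils.core import setup
    # build the Configuration object and hand its fields to setup()
    setup(**configuration(top_path='').todict())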
def _save_images_with_roi(final_images_with_roi,
                          final_images_no_clutter_with_roi,
                          final_images_bw_with_roi, image_num, try_idx):
    cond_index = image_num // NUM_OF_IMAGES_TO_CREATE_PER_CONDITION
    cond_str = '_expectancy_' + str(
        TARGETS_EXPECTATION_PERCENTAGE[cond_index]) + "_clutter_" + str(
            DISTANCE_BETWEEN_IMAGES[cond_index])
    color_dir_path = osp_join("generated_images" + str(try_idx),
                              cond_str + "_Color")
    color_no_clutter_dir_path = osp_join("generated_images" + str(try_idx),
                                         cond_str + "_Color_no_clutter")
    bw_dir_path = osp_join("generated_images" + str(try_idx),
                           cond_str + "_BW")
    for i, image in enumerate(final_images_with_roi):
        image.convert('RGB').save(
            osp_join(
                curdir, color_dir_path, "image_" + str(image_num + 1) +
                "_with_roi_" + str(i + 1) + ".png"), "PNG")
    for i, image in enumerate(final_images_no_clutter_with_roi):
        image.convert('RGB').save(
            osp_join(
                curdir, color_no_clutter_dir_path,
                "image_" + str(image_num + 1) + "_no_clutter_with_roi_" +
                str(i + 1) + ".png"), "PNG")
    for i, image in enumerate(final_images_bw_with_roi):
        image.convert('RGB').save(
            osp_join(
                curdir, bw_dir_path, "image_bw_" + str(image_num + 1) +
                "_with_roi_" + str(i + 1) + ".png"), "PNG")
def _create_locations_xl(locations_list, images, try_idx, image_num,
                         categories_dict):
    doc = xl.Workbook(
        osp_join(curdir, "generated_images" + str(try_idx),
                 'target_locations' + str(image_num + 1) + '.xlsx'))
    sheet = doc.add_worksheet('target locations')
    sheet.write('A1', "Name")
    sheet.write('B1', "Target")
    sheet.write('C1', "Category")
    sheet.write('D1', "X")
    sheet.write('E1', "Y")
    for i, location in enumerate(locations_list):
        category = images[i].info[RELEVANT_CATEGORY_INFO_NAME2] + '_' + images[i].info[RELEVANT_CATEGORY_INFO_NAME1] \
            if RELEVANT_CATEGORY_INFO_NAME1 in images[i].info else images[i].info[RELEVANT_CATEGORY_INFO_NAME2]
        sheet.write('A' + str(i + 2), images[i].info['filename'])
        sheet.write('B' + str(i + 2), 1 if images[i].info['target'] else 0)
        sheet.write('C' + str(i + 2), category)
        sheet.write('D' + str(i + 2), location[0])
        sheet.write('E' + str(i + 2), location[1])
    doc.close()
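# The `xl` name above is presumably the xlsxwriter package (Workbook,
# add_worksheet, write and close all match its API). A minimal
# self-contained sketch with an illustrative file name:
import xlsxwriter  # hypothetical resolution of the `xl` alias above

def _demo_locations_sheet():
    doc = xlsxwriter.Workbook('target_locations_demo.xlsx')
    sheet = doc.add_worksheet('target locations')
    sheet.write('A1', 'Name')         # header row, as in the function above
    sheet.write('A2', 'example.png')  # one data row
    doc.close()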
def _load_images(images_path):
    """
    loads images from chosen directory, separates them into targets and
    distractors
    :param images_path: path to image directory
    :return: list of: resized distractors list, target images list,
             original size targets lists
    """
    distractor_images_dict = dict()
    not_resized_targets = []
    targets = []
    categories_index = 1
    categories_dict = dict()
    image_names_list = [
        str(x) for x in listdir(images_path) if x[-3:] == "png"
    ]
    for image_name in image_names_list:
        im = Image.open(osp_join(images_path, image_name)).convert('RGBA')
        _add_info_to_image(im, image_name)
        key = im.info[RELEVANT_CATEGORY_INFO_NAME2] + '_' + im.info[
            RELEVANT_CATEGORY_INFO_NAME1] \
            if RELEVANT_CATEGORY_INFO_NAME1 in im.info else im.info[
            RELEVANT_CATEGORY_INFO_NAME2]
        if key not in categories_dict:
            categories_dict[key] = categories_index
            categories_index += 1
        im.info["cat_num"] = categories_dict[key]
        if image_name in TARGET_CATEGORIES:
            targets.append(im)
            not_resized_targets.append(im)
        else:
            if key in distractor_images_dict:
                distractor_images_dict[key].append(im)
            else:
                distractor_images_dict[key] = [im]
    # print("equalizing images...")
    # distractor_images_dict, targets = _equalize_images(distractor_images_dict, targets)
    print("resizing images...")
    _resize_images(distractor_images_dict, targets)
    print("Category mapping: ", categories_dict)
    return distractor_images_dict, targets, not_resized_targets, categories_dict
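# Usage sketch for _load_images(); the directory name is illustrative, and
# TARGET_CATEGORIES plus the RELEVANT_CATEGORY_INFO_NAME* constants are
# assumed to be defined at module level:
#
#     distractors, targets, full_size_targets, categories = \
#         _load_images('stimuli_png')
#     print(len(targets), "targets across", len(categories), "categories")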
def configuration(parent_package="", top_path=None): from numpy.distutils.misc_util import Configuration from numpy.distutils.system_info import get_info from os.path import join as osp_join config = Configuration("siesta", parent_package, top_path) all_info = get_info("ALL") sources = [ "free_unit.f90", "write_hsx.f90", "read_hsx_header.f90", "read_hsx.f90", "read_hs_header.f90", "read_hs.f90", "read_tshs_sizes.f90", "read_tshs_geom.f90", "read_tshs_cell.f90", "read_tshs_es.f90", "read_tshs_version.f90", ] config.add_extension("_siesta", sources=[osp_join("src", s) for s in sources], extra_info=all_info) config.make_config_py() # installs __config__.py return config
def __init__(self, c, title='Z-editor'):
    # pylint: disable=too-many-locals
    global TAB2SPACES
    super().__init__()
    QWidget().__init__()
    self.c = c
    self.p = c.p
    self.v = c.p.v
    self.host_id = c.p.gnx
    w = c.frame.body.wrapper
    self.host_editor = w.widget
    self.switching = False
    self.closing = False

    self.reloadSettings()

    # The rendering pane can be either a QWebView or a QTextEdit
    # depending on the features desired
    if not QWebView:  # Until Qt6 has a QWebEngineView, force QTextEdit
        self.render_pane_type = NAV_VIEW
    if self.render_pane_type == NAV_VIEW:
        self.render_widget = QTextEdit
    else:
        self.render_widget = QWebView
        self.render_pane_type = BROWSER_VIEW

    self.editor = QTextEdit()
    browser = self.browser = self.render_widget()
    wrapper = qt_text.QTextEditWrapper(self.editor, name='zwin', c=c)
    c.k.completeAllBindingsForWidget(wrapper)

    #@+<<set stylesheet paths>>
    #@+node:tom.20210604170628.1: *4* <<set stylesheet paths>>
    self.editor_csspath = ''
    self.rst_csspath = ''

    home = g.app.loadManager.computeHomeDir()
    cssdir = osp_join(home, '.leo', 'css')
    #dict_ = g.app.loadManager.globalSettingsDict
    #is_dark = dict_.get_setting('color-theme-is-dark')
    is_dark = is_body_dark(self.c)

    if is_dark:
        self.editor_csspath = osp_join(cssdir, EDITOR_STYLESHEET_DARK_FILE)
        self.rst_csspath = osp_join(cssdir, RST_CUSTOM_STYLESHEET_DARK_FILE)
    else:
        self.editor_csspath = osp_join(cssdir, EDITOR_STYLESHEET_LIGHT_FILE)
        self.rst_csspath = osp_join(cssdir, RST_CUSTOM_STYLESHEET_LIGHT_FILE)

    if g.isWindows:
        self.editor_csspath = self.editor_csspath.replace('/', '\\')
        self.rst_csspath = self.rst_csspath.replace('/', '\\')
    else:
        self.editor_csspath = self.editor_csspath.replace('\\', '/')
        self.rst_csspath = self.rst_csspath.replace('\\', '/')
    #@-<<set stylesheet paths>>
    #@+<<set stylesheets>>
    #@+node:tom.20210615101103.1: *4* <<set stylesheets>>
    # Check if editor stylesheet file exists. If so,
    # we cache its contents.
    if exists(self.editor_csspath):
        with open(self.editor_csspath, encoding=ENCODING) as f:
            self.editor_style = f.read()
    else:
        self.editor_style = EDITOR_STYLESHEET_DARK if is_dark \
            else EDITOR_STYLESHEET_LIGHT

    # If a stylesheet exists for RsT, we cache its contents.
    self.rst_stylesheet = None
    if exists(self.rst_csspath):
        with open(self.rst_csspath, encoding=ENCODING) as f:
            self.rst_stylesheet = f.read()
    else:
        self.rst_stylesheet = RST_STYLESHEET_DARK if is_dark \
            else RST_STYLESHEET_LIGHT
    #@-<<set stylesheets>>
    #@+<<set up editor>>
    #@+node:tom.20210602172856.1: *4* <<set up editor>>
    self.doc = self.editor.document()
    self.editor.setWordWrapMode(WrapMode.WrapAtWordBoundaryOrAnywhere)  # pylint: disable=no-member
    # Adjust editor stylesheet color to match body fg, bg
    fg, bg = get_body_colors(self.c)
    css = change_css_prop(self.editor_style, 'color', fg)
    css = change_css_prop(css, 'background', bg)
    self.editor_style = css
    self.editor.setStyleSheet(css)

    colorizer = leoColorizer.make_colorizer(c, self.editor)
    colorizer.highlighter.setDocument(self.doc)

    # Try to get tab width from the host's body
    # Used when writing edits back to host
    # "tabwidth" directive ought to be in first six lines
    lines = self.p.v.b.split('\n', 6)
    for line in lines:
        if line.startswith('@tabwidth') and line.find(' ') > 0:
            tabfield = line.split()[1]
            TAB2SPACES = abs(int(tabfield))
            break

    # Make tabs line up with 4 spaces (at least approximately)
    self.editor.setTabStopDistance(TABWIDTH)

    if self.render_pane_type == NAV_VIEW:
        # Different stylesheet mechanism if we are a QTextEdit
        stylesheet = RST_STYLESHEET_DARK if is_dark else RST_STYLESHEET_LIGHT
        browser.setReadOnly(True)
        browser_doc = browser.document()
        browser_doc.setDefaultStyleSheet(stylesheet)
    #@-<<set up editor>>
    #@+<<set up render button>>
    #@+node:tom.20210602173354.1: *4* <<set up render button>>
    self.render_button = QPushButton("Rendered <--> Plain")
    self.render_button.clicked.connect(self.switch_and_render)

    b_style = RENDER_BTN_STYLESHEET_DARK if is_dark \
        else RENDER_BTN_STYLESHEET_LIGHT
    self.render_button.setStyleSheet(b_style)
    #@-<<set up render button>>
    #@+<<build central widget>>
    #@+node:tom.20210528235126.1: *4* <<build central widget>>
    self.stacked_widget = QStackedWidget()
    self.stacked_widget.insertWidget(EDITOR, self.editor)
    self.stacked_widget.insertWidget(BROWSER, self.browser)

    layout = QVBoxLayout()
    layout.addWidget(self.render_button)
    layout.addWidget(self.stacked_widget)
    layout.setContentsMargins(0, 0, 0, 0)
    self.central_widget = central_widget = QWidget()
    central_widget.setLayout(layout)
    self.setCentralWidget(central_widget)
    #@-<<build central widget>>
    #@+<<set geometry>>
    #@+node:tom.20210528235451.1: *4* <<set geometry>>
    Y_ = Y + (len(instances) % 10) * DELTA_Y
    self.setGeometry(QtCore.QRect(X, Y_, W, H))
    #@-<<set geometry>>
    #@+<<set window title>>
    #@+node:tom.20210531235412.1: *4* <<set window title>>
    # Show parent's title-->our title, our gnx
    ph = ''
    parents_ = list(c.p.parents())
    if parents_:
        ph = parents_[0].h + '-->'
    self.setWindowTitle(f'{ph}{c.p.h} {c.p.gnx}')
    #@-<<set window title>>

    self.render_kind = EDITOR

    self.handlers = [('idle', self.update)]
    self._register_handlers()

    self.current_text = c.p.b
    self.editor.setPlainText(self.current_text)

    # Load docutils without rendering anything real
    # Avoids initial delay when switching to RsT the first time.
    if got_docutils:
        dummy = publish_string('dummy', writer_name='html').decode(ENCODING)
        self.browser.setHtml(dummy)

    central_widget.keyPressEvent = self.keyPressEvent
    self.show()
def extractReads(self, bamFiles, prefix, targets, combineBams=False,
                 pairsOnly=False, combine=False, shuffle=False,
                 largeFiles=False, headersOnly=False, dontTrustSamFlags=True,
                 folder='', verbose=True):
    """Extract reads from BAM files

    targets is a hash of type: { reference_name : bid }
    If bid == -1 then there is only one bin. This option is used by
    the command line caller so you can extract using a normal list
    """
    # work out which files have been opened before so we don't nuke stuff!
    opened_files = {}
    if prefix != '':
        prefix = "_" + prefix
    # get a collection of storage points
    read_storage = {}
    bam_count = 0
    num_bams = len(bamFiles)
    for bf in bamFiles:
        seen_reads = {}
        read_storage[bam_count] = {}
        for bid in targets.values():
            read_storage[bam_count][bid] = [[], [], []]  # paired1, paired2, singles
        try:
            bam_file = pysam.Samfile(bf, 'rb')
            if verbose:
                print " Parsing BAM '%s' (%d of %d)" % (getBamDescriptor(bf), (bam_count + 1), num_bams)
            # use these to work out if we need quality scores or not
            w_qual = 0
            wo_qual = 0
            # now get the reads associated with each target
            for reference, length in zip(bam_file.references, bam_file.lengths):
                rl = ReadLoader()
                bam_file.fetch(reference, 0, length, callback=rl)
                for alignedRead in rl.alignedReads:
                    (query, end) = self.splitReadheader(alignedRead, dontTrustSamFlags=dontTrustSamFlags)
                    # we have basically thrown out all pairing information here
                    # so we make some assumptions:
                    #
                    # The first read we see in a pair is the first read
                    # We trust the reversed flag
                    # We don't mess with the relative orientation that follows
                    #
                    if query in seen_reads:
                        # get the raw reads
                        if headersOnly:
                            # no need to store all the information as we'll just be nuking it anyway
                            read1 = alignedRead.qname
                            read2 = seen_reads[query][0].qname
                        else:
                            if seen_reads[query][0].is_reverse:
                                seen_read = (seen_reads[query][0].qname, self.revComp(seen_reads[query][0].seq), seen_reads[query][0].qual)
                            else:
                                seen_read = (seen_reads[query][0].qname, seen_reads[query][0].seq, seen_reads[query][0].qual)
                            if alignedRead.is_reverse:
                                align_read = (alignedRead.qname, self.revComp(alignedRead.seq), alignedRead.qual)
                            else:
                                align_read = (alignedRead.qname, alignedRead.seq, alignedRead.qual)
                            # work out which is the first in the pair and which is the second
                            if end == 1:
                                read1 = align_read
                                read2 = seen_read
                            else:
                                read1 = seen_read
                                read2 = align_read
                            # check for quality info
                            if read1[2] is None:
                                wo_qual += 1
                            else:
                                w_qual += 1
                        # put them in the storage
                        try:
                            bid1 = targets[bam_file.getrname(alignedRead.tid)]
                            read_storage[bam_count][bid1][0].append(read1)
                            read_storage[bam_count][bid1][1].append(read2)
                        except KeyError:
                            pass
                        try:
                            bid2 = targets[bam_file.getrname(seen_reads[query][0].tid)]
                            if bid2 != bid1:
                                read_storage[bam_count][bid2][0].append(read1)
                                read_storage[bam_count][bid2][1].append(read2)
                        except KeyError:
                            pass
                        # delete this guy so we can determine who the pairs are at the end
                        del seen_reads[query]
                    else:
                        seen_reads[query] = (alignedRead, end)
            for ar in seen_reads:
                alignedRead = seen_reads[ar][0]
                if headersOnly:
                    read = alignedRead.qname
                else:
                    if alignedRead.is_reverse:
                        read = (alignedRead.qname, self.revComp(alignedRead.seq), alignedRead.qual)
                    else:
                        read = (alignedRead.qname, alignedRead.seq, alignedRead.qual)
                try:
                    bid = targets[bam_file.getrname(alignedRead.tid)]
                    read_storage[bam_count][bid][2].append(read)
                except KeyError:
                    pass
            bam_file.close()

            # now we can write to file
            # work out file extension
            has_qual = w_qual > wo_qual
            if headersOnly:
                extension = 'headers'
                shuffle = True  # force everything into one file
                pairsOnly = True
            else:
                if has_qual:
                    extension = "fq"
                else:
                    extension = "fa"
            # work out compression
            if largeFiles:
                fopen = open
            else:
                fopen = gzip.open
                extension += '.gz'
            # get a basename
            if combineBams:
                # always use the same file
                base_name = "extracted"
            else:
                # need a different base name for each bam
                base_name = getBamDescriptor(bf)
            if folder == '':
                # use the bam file's folder
                base_name = getBamStem(bf)
            else:
                makeSurePathExists(folder)
                base_name = osp_join(folder, base_name)
            unique_bids = {}
            for bid in targets.values():
                try:
                    unique_bids[bid] += 1
                except KeyError:
                    unique_bids[bid] = 1
            pretty_counter = 0
            print " ",
            for bid in unique_bids:
                print ".",
                sys.stdout.flush()
                pretty_counter += 1
                if pretty_counter > 79:
                    pretty_counter = 0
                    print "\n ",
                if bid == -1:
                    # single bin, no need to be fancy
                    if shuffle:
                        # if we are going to shuffle reads then we don't need
                        # to open a second file handle
                        read1_fh = self.openFileCorrectly("%s%s.%s" % (base_name, prefix, extension), fopen, opened_files)
                        read2_fh = read1_fh
                    else:
                        read1_fh = self.openFileCorrectly("%s%s_1.%s" % (base_name, prefix, extension), fopen, opened_files)
                        read2_fh = self.openFileCorrectly("%s%s_2.%s" % (base_name, prefix, extension), fopen, opened_files)
                else:
                    if shuffle:
                        read1_fh = self.openFileCorrectly("%s%s_BIN_%d.%s" % (base_name, prefix, bid, extension), fopen, opened_files)
                        read2_fh = read1_fh
                    else:
                        read1_fh = self.openFileCorrectly("%s%s_BIN_%d_1.%s" % (base_name, prefix, bid, extension), fopen, opened_files)
                        read2_fh = self.openFileCorrectly("%s%s_BIN_%d_2.%s" % (base_name, prefix, bid, extension), fopen, opened_files)
                close_unpaired_file = False
                if not pairsOnly:
                    # we need to write the unpaired guys somewhere
                    if combine and shuffle:
                        # we can just add them to the end of read1_fh
                        unpaired_fh = read1_fh
                    else:
                        # either the user wanted things separate or they chose
                        # not to shuffle, either way a new fh is needed
                        close_unpaired_file = True
                        if bid == -1:
                            unpaired_fh = self.openFileCorrectly("%s%s_unpaired.%s" % (base_name, prefix, extension), fopen, opened_files)
                        else:
                            unpaired_fh = self.openFileCorrectly("%s%s_BIN_%d_unpaired.%s" % (base_name, prefix, bid, extension), fopen, opened_files)
                # now we print
                if headersOnly:
                    headers = {}  # pipe into a hash first so we don't get dupes
                    for i in range(len(read_storage[bam_count][bid][0])):
                        headers[read_storage[bam_count][bid][0][i]] = True
                        headers[read_storage[bam_count][bid][1][i]] = True
                    for i in range(len(read_storage[bam_count][bid][2])):
                        headers[read_storage[bam_count][bid][2][i]] = True
                    for header in headers:
                        read1_fh.write(header + "\n")
                else:
                    for i in range(len(read_storage[bam_count][bid][0])):
                        read1_fh.write(self.formatRead(read_storage[bam_count][bid][0][i], has_qual))
                        read2_fh.write(self.formatRead(read_storage[bam_count][bid][1][i], has_qual))
                    if not pairsOnly:
                        for i in range(len(read_storage[bam_count][bid][2])):
                            unpaired_fh.write(self.formatRead(read_storage[bam_count][bid][2][i], has_qual))
                read1_fh.close()
                if not shuffle:
                    read2_fh.close()
                if close_unpaired_file:
                    unpaired_fh.close()
            print "\n",
        except:
            print "Unable to open BAM file", bf, "-- did you supply a SAM file instead?"
            raise
        bam_count += 1
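# Usage sketch for extractReads() above (Python 2, as the print statements
# indicate). The instance name, BAM file name and single-bin target mapping
# are illustrative:
#
#     targets = {'contig_1': -1, 'contig_2': -1}   # bid == -1 -> one bin
#     extractor.extractReads(['sample.bam'], 'run1', targets,
#                            shuffle=True, folder='extracted_reads')
#
# With bid == -1 and shuffle=True this interleaves each pair into a single
# 'extracted_reads/sample_run1.fq.gz' (or .fa.gz when the reads carry no
# quality scores).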