def calculate_cs219(threads, a3m_db_base, cs_db_base):
    """Build a cs219 column-state database from an A3M ffindex database.

    Runs HH-suite's ``cstranslate`` with the standard cs219/context libraries
    found under $HHLIB.

    Parameters:
    - threads: value for OMP_NUM_THREADS passed to the subprocess environment
    - a3m_db_base: base path of the input A3M ffindex database
    - cs_db_base: base path of the output cs219 ffindex database
    """
    # Fixed: os.environ['HHLIB'] raised KeyError before the check could run,
    # making the else-branch unreachable; use .get() so the error path works.
    hhlib_environment = os.environ.get('HHLIB')
    if hhlib_environment:
        #TODO: check
        check_call(" ".join(["cstranslate",
                             "-A ", os.path.join(hhlib_environment, "data/cs219.lib"),
                             "-D", os.path.join(hhlib_environment, "data/context_data.lib"),
                             "-x 0.3 -c 4 --ffindex",
                             "-i", a3m_db_base,
                             "-o", cs_db_base,
                             "-I a3m -b"]),
                   env=dict(os.environ, OMP_NUM_THREADS=threads),
                   shell=True)
    else:
        # Fixed: sys.err does not exist; write the message to stderr.
        sys.stderr.write("ERROR: HHLIB environment variable not set! See manual!\n")
        exit(1)
def MatchLabels(column_labels, row_labels):
    """Verify that one matrix's column labels match another's row labels.

    If the lengths differ, an error is written to stderr and the function
    returns. If the lengths match but any pairwise labels differ, the process
    exits with status -11.
    """
    if len(column_labels) != len(row_labels):
        # Fixed: sys.err does not exist; write to stderr instead.
        sys.stderr.write("ERROR 1st matrix column count " + str(len(column_labels)) +
                         " not equal 2nd Matrix number row count " +
                         str(len(row_labels)) + "\n")
    else:
        cnt = 0
        # Count mismatched label pairs (capped at 20, matching original logic).
        for k in range(0, len(column_labels)):
            if column_labels[k] != row_labels[k] and cnt < 20:
                cnt += 1
                #sys.err("ERROR At column & row position "+str(k)+" Matrix 1 column value "+str(column_labels)+" not equal 2nd Matrix row value "+str(row_labels)+"\n" )
        if cnt > 0:
            sys.exit(-11)
def make_data_for_clustering(self, option, filter=False, sub_topics=False):
    """Collect documents and cluster-target labels from topics 1-45.

    Parameters:
    - option: 'doc_template' or 'text' -- which element type to extract
    - filter: passed through to get_text() for the 'text' option
    - sub_topics: if True, targets are "major_minor" topic pairs from
      get_topic_num(); otherwise targets are the topic number as a string

    Returns a (documents, targets) tuple of parallel lists.
    Topics 15 and 17 are always skipped.
    """
    documents = []
    targets = []
    if not sub_topics:
        topic_nums = range(1, 46)
        # get all files in all topics, labeled by topic number
        for i in topic_nums:
            if i not in [15, 17]:
                # get all files in topic i
                for f in self.get_topic(topic=i):
                    if option == 'doc_template':
                        # this returns a list, so converting to string
                        documents.append(' '.join(
                            self.get_text(f, element_type='doc_template')))
                        targets.append(str(i))
                    elif option == 'text':
                        documents.append(' '.join(
                            self.get_text(f, element_type='text', filter=filter)))
                        targets.append(str(i))
                    else:
                        print('unknown option')
                        # Fixed: sys.err(-1) is not a function; exit as intended.
                        sys.exit(-1)
    else:
        topic_nums = range(1, 46)
        # get all files in all topics, partitioned by suptopic
        for i in topic_nums:
            if i not in [15, 17]:
                # get all files in topic i
                for f in self.get_topic(topic=i):
                    if option == 'doc_template':
                        # this returns a list, so converting to string
                        top = self.get_topic_num(f)
                        documents.append(' '.join(
                            self.get_text(f, element_type='doc_template')))
                        targets.append(top[0] + "_" + top[1])
                    elif option == 'text':
                        top = self.get_topic_num(f)
                        documents.append(' '.join(
                            self.get_text(f, element_type='text', filter=filter)))
                        targets.append(top[0] + "_" + top[1])
                    else:
                        print('unknown option')
                        sys.exit(-1)
    return (documents, targets)
def processTif(infile, outpre, polyC, rows, lines, whiteLevel, debug):
    """
    Read a tif file containing uint16 data of "I" image data and apply the
    correction given by the polynomial(s) in polyC. White level "whiteLevel"
    must be provided (0 means "use the image maximum"). Converts to float
    (log domain) for the correction and back to uint16 afterwards.
    Results are written to outpre + infile in the same tif format.
    """
    try:
        import tifffile as tf
    except ImportError:
        print("Import tifffile failed")
        # Fixed: sys.err(1) is not a function; exit with an error status.
        sys.exit(1)
    imArr = tf.imread(infile)
    # Sanity-check image dimensions against the caller-supplied geometry.
    if len(imArr[0, :]) != lines or len(imArr[:, 0]) != rows:
        print("* Error: tiff image size is ", len(imArr[0, :]), " by ",
              len(imArr[:, 0]))
        print("Must set lines and rows the same")
        sys.exit(1)
    if whiteLevel == 0:
        whiteLevel = imArr.max()
    if debug:
        print("Using whiteLevel=", whiteLevel)
    # Work in the log-attenuation domain: ln(white / I).
    imLn = np.log(float(whiteLevel) / imArr)
    for l in range(lines):
        # Pick the polynomial row: one shared polynomial, or one per line.
        ln = 0
        if len(polyC[:, 0]) == 1:
            ln = 0
        elif len(polyC[:, 0]) == lines:
            ln = l
        imLn[l, :] = correct(imLn[l, :], polyC[ln, ::-1])
    # Back to intensity and uint16.
    imCor = whiteLevel / np.exp(imLn)
    imCor16 = np.array(imCor, dtype='uint16')
    if debug:
        print("imCor16: ", imCor16[0:2, 0:2])
    outfile = outpre + infile
    tf.imsave(outfile, imCor16)
def calculate_cs219(threads, a3m_db_base, cs_db_base):
    """Build a cs219 column-state database from an A3M ffindex database.

    Runs HH-suite's ``cstranslate`` with the standard cs219/context libraries
    found under $HHLIB. OMP_NUM_THREADS is set to ``threads`` for the child.
    """
    # Fixed: os.environ['HHLIB'] raised KeyError before the check could run,
    # making the else-branch unreachable; use .get() so the error path works.
    hhlib_environment = os.environ.get('HHLIB')
    if hhlib_environment:
        #TODO: check
        check_call(" ".join([
            "cstranslate", "-A ",
            os.path.join(hhlib_environment, "data/cs219.lib"), "-D",
            os.path.join(hhlib_environment, "data/context_data.lib"),
            "-x 0.3 -c 4 --ffindex", "-i", a3m_db_base, "-o", cs_db_base,
            "-I a3m -b"
        ]),
                   env=dict(os.environ, OMP_NUM_THREADS=threads),
                   shell=True)
    else:
        # Fixed: sys.err does not exist; write the message to stderr.
        sys.stderr.write("ERROR: HHLIB environment variable not set! See manual!\n")
        exit(1)
def __init__(self, installDir, toolDir):
    """Interactively install a Python tool from a git repository.

    Prompts for the repository URL, name, Python version, entry file and an
    optional custom launch command, clones the repository into toolDir, then
    registers the tool with dictmgr. Registration only happens if the clone
    succeeded.
    """
    self.installDir, self.toolDir = installDir, toolDir
    link = input("Git repository of the tool (full link): ")
    name = input("Tool name: ")
    ver = input("Python version: ")
    exe = input("Name of the file to launch (w/o extension): ")
    cmds = input("Custom command (leave blank if unsure): ")
    issudo = input(
        "Does the package needs root permissions? [y/N]: ").lower()
    #Add question if script has a different name
    #g.e: main.py insttead of <projectname>.py
    temp = 0
    if not cmds:
        # Build the default launch command when none was supplied.
        if issudo != "y":
            cmds = "python{0} {1}{2}/{3}.py".format(
                ver, self.toolDir, name, exe)
        else:
            cmds = "sudo python{0} {1}{2}/{3}.py".format(
                ver, self.toolDir, name, exe)
    try:
        shell("git clone %s %s%s" % (link, self.toolDir, name))
        temp = 1
    except:
        temp = -1
        ErrorHandler(err(), False)
    # Fixed: `if temp:` was also true for the failure value -1, so a failed
    # clone still got registered; only register on success.
    if temp == 1:
        dictmgr.addWords(self.installDir, [name])
        dictmgr.addCustomWords(self.installDir, name)
        dictmgr.updateConfig(self.installDir, name, cmds)
        print(
            "[*] - You may need to restart onifw in order to use the custom tool."
        )
def getHomVarInfo(conn, affected, parents) : myquery = "" if len(parents) == 2 : print "Performing Two Parent Query" myquery = TWO_PARENT_SHARED_HOM_QUERY myquery = myquery.replace("$$A$$", str(affected)) myquery = myquery.replace("$$P1$$", str(parents[0])) myquery = myquery.replace("$$P2$$", str(parents[1])) #print myquery elif len(parents) == 1 : print "Performing Single Parent Query" myquery = SINGLE_PARENT_SHARED_HOM_QUERY myquery = myquery.replace("$$A$$", str(affected)) myquery = myquery.replace("$$P1$$", str(parents[0])) #elif len(parents) == 0 : # print "Performing Affected Het Query" # myquery = AFFECTED_HET_QUERY # myquery = myquery.replace("$$A$$", str(affected)) else : sys.err("What kind of family is this?" ) sys.exit(1) # Time the Query start = time() variantdata = [] index =0 for row in conn.query( myquery ): #print index,":",row variantdata.append( "%s:%s" % (row[1], row[2]) ) index += 1 # Print Run Time elapsed = (time() - start) print "Elapsed:",elapsed return variantdata
def addCustomWords(installDir, name):
    """Add words to the custom dictionnary

    Arguments:
        - name : name of the custom tool
        - installDir : Directory of current install
    """
    print("[*] - Adding custom words...")
    try:
        # `with` closes the file; the explicit f.close() was redundant.
        with open("{}data/ctools.txt".format(installDir), "a") as f:
            f.write(name + "\n")
        print("[*] - Done.")
    except:
        ErrorHandler(err(), False)
def addWords(installDir, wordList):
    """Add words to dictionnary file

    Arguments:
        - installDir : Directory of current install
        - wordList : Array of strings containing words to add
    """
    print("[*] - Adding dictionnary words...")
    try:
        # `with` closes the file; iterate the list directly instead of
        # indexing (the explicit f.close() was redundant).
        with open("{}data/dict.txt".format(installDir), "a") as f:
            for word in wordList:
                f.write(word + "\n")
        print("[*] - Done.")
    except:
        ErrorHandler(err(), False)
def __init__(self,
             tool_name,
             lang="",
             need_args=False,
             change_dir=False,
             exe_name="",
             pre_cmd="",
             post_cmd="",
             sudo=False):
    """Build and run the shell command that launches an installed tool.

    Parameters:
    - tool_name: directory name of the tool under toolDir
    - lang: interpreter prefix (e.g. "python3"); also used to look up a
      file extension in the module-level `extensions` mapping
    - need_args: prompt the user for a target argument
    - change_dir: cd into the tool directory before launching
    - exe_name: main file name if it differs from tool_name
    - pre_cmd / post_cmd: extra text inserted before/after the argument
    - sudo: prefix the command with sudo
    """
    self.tool_name = tool_name
    self.lang = lang
    self.installDir = toolDir + self.tool_name
    self.prefix = self.lang
    self.arg = ""
    # Fixed: self.extension was never initialized, so a lang with no match
    # in `extensions` raised AttributeError further down.
    self.extension = ""
    if not isdir(self.installDir):
        print(
            color.IMPORTANT +
            "[!] - Tool not installed or not located in the tool/ directory"
            + color.END)
        return
    for i in extensions:
        if i in self.lang:
            self.extension = extensions[i]
    if need_args:
        print(color.LOGGING + "[?] - Please specify a target" + color.END)
        self.arg = str(
            input(color.HEADER + "onifw[{}]: ".format(self.tool_name) +
                  color.END))
    if len(pre_cmd) > 1:
        self.extension += " " + pre_cmd
    if len(post_cmd) > 1:
        self.arg += " " + post_cmd
    if len(exe_name) > 1:
        self.exe_name = exe_name
    else:
        self.exe_name = self.tool_name
    if sudo:
        self.prefix = "sudo " + self.prefix
    self.cmd = "{0} {1}/{2}{3}".format(self.prefix, self.installDir,
                                       self.exe_name, self.extension)
    if change_dir:
        self.cmd = "cd {0} && {1}".format(self.installDir, self.cmd)
    if not len(self.arg) < 1:
        self.cmd += " " + self.arg
    try:
        system(self.cmd)
    except:
        ErrorHandler(err(), False)
def updateConfig(installDir, name, command):
    """Add launch command to the onirc file

    Arguments:
        - installDir : directory of current install
        - name : name of the tool
        - command : command used to launch the tool
    """
    print("[*] - Updating configuration...")
    try:
        # `with` closes the file; the explicit f.close() was redundant.
        with open("{}onirc".format(installDir), "a") as f:
            f.write("{0} = {1}\n".format(name, command))
        print("[*] - Done.")
    except:
        # Fixed: close the color escape so the terminal isn't left colored.
        print(color.LOGGING + "[!] - Unexpected error: " + color.END)
        ErrorHandler(err(), False)
def __init__(self, installDir, toolDir):
    """Interactively install a tool that needs custom build commands.

    Clones the repository into toolDir, runs a user-supplied number of build
    commands, then registers the tool's launch command with dictmgr.
    """
    self.installDir, self.toolDir = installDir, toolDir
    link = input("Git repository of the tool (full link): ")
    name = input("Tool name: ")
    nb_cmd = int(input("How many commands to build the tool?: "))
    try:
        # Fixed: there was a stray space between toolDir and name
        # ("%s %s %s"), so git cloned into toolDir with `name` as an extra
        # argument; match the "%s%s" pattern used by the other installers.
        shell("git clone %s %s%s" % (link, self.toolDir, name))
        for i in range(nb_cmd):
            print("[*] - Current directory: %s" % shell("pwd"))
            cmd = input("Custom command: ")
            shell(cmd)
        cmds = input("Launch command: ")
        # Fixed: addWords expects a list of words; passing the bare string
        # added the name one character per line.
        dictmgr.addWords(self.installDir, [name])
        dictmgr.addCustomWords(self.installDir, name)
        dictmgr.updateConfig(self.installDir, name, cmds)
    except:
        ErrorHandler(err(), False)
def __init__(self, installDir, toolDir):
    """Interactively install a non-Python tool (perl/ruby/go/java).

    Clones the repository, runs the user's build commands, then writes the
    launch command to onirc and the tool name to the dictionary files.
    """
    lang_dict = {
        "perl": "perl",
        "ruby": "ruby",
        "go": "go",
        "java-jar": "jar",
        "java": "java",
    }
    self.installDir, self.toolDir = installDir, toolDir
    print(color.OKBLUE + "Available languages:")
    for i in lang_dict.keys():
        print(i)
    print(color.END)
    lang = input("Select lang: ")
    link = input("Git repository of the tool (full link): ")
    name = input("Tool name: ")
    name_exe = input("Name of the main file (w/ entension): ")
    nb_cmd = int(input("How many commands to build the tool?: "))
    try:
        shell("git clone %s %s%s" % (link, self.toolDir, name))
        for i in range(nb_cmd):
            print("[*] - Current directory: %s" % shell("pwd"))
            cmd = input("Custom command: ")
            shell(cmd)
        # Fixed: cmds previously started with "name = " itself and was then
        # written through "{0} = {1}", producing "name = name = ...", and the
        # interpreter/file parts were concatenated without separators.
        if lang == "java":
            cmds = "cd {0}{1} && {2} {3}".format(toolDir, name,
                                                 lang_dict[lang], name_exe)
        else:
            cmds = "{0} {1}{2}/{3}".format(lang_dict[lang], toolDir, name,
                                           name_exe)
        with open("{}onirc".format(self.installDir), "a") as f:
            f.write("{0} = {1}\n".format(name, cmds))
        with open("{}data/dict.txt".format(self.installDir), "a") as f:
            f.write(name + '\n')
        with open("{}data/ctools.txt".format(self.installDir), "a") as f:
            f.write(name + '\n')
    except:
        ErrorHandler(err(), False)
def __init__(self, installDir):
    # Check for and optionally apply an update of onifw itself.
    # Compares the local version file against the version published on the
    # project's GitHub master branch; presumably self.check_branch() returning
    # True means "skip the update check" -- confirm against the class.
    self.installDir = installDir
    if not self.check_branch():
        try:
            # Local version: first line of data/version.txt.
            with open("{}data/version.txt".format(installDir)) as f:
                local_version = version.parse(
                    f.readlines()[0].rstrip("\n\r"))
                f.close()  # redundant inside `with`, kept as-is
            # Remote version: fetched via curl through the shell.
            latest_version = check_output(
                "curl -s https://raw.githubusercontent.com/w0bos/onifw/master/src/data/version.txt",
                shell=True).decode("utf-8").strip('\r\n')
            late = version.parse(latest_version)
            if late > local_version:
                # Newer version available: ask before pulling.
                ans = input(
                    color.NOTICE +
                    "[*] - A new version is available\nDo you wish to install the new update? [y/N] :"
                    + color.END)
                if ans.lower() in ["yes", "y"]:
                    # Won't wipe old install
                    shell("cd {} && git pull".format(installDir))
                else:
                    print("[*] - Update aborted")
            elif late == local_version:
                print(
                    color.OKGREEN +
                    "[*] - You're already running the latest version of onifw"
                    + color.END)
            elif late < local_version:
                # Local build is ahead of the published release.
                print(color.BOLD + color.IMPORTANT +
                      "[+] - You are running an alpha version of onifw" +
                      color.END)
            else:
                print(color.WARNING + "[!] - Unknown error" + color.END)
            # Clean up any temporary update directory.
            shell("rm -rf {}/temp".format(installDir))
        except:
            ErrorHandler(err(), False, True)
def restoreDict(installDir):
    """Restore default dictionnary file

    Keeps every line of data/dict.txt up to and including the "update"
    entry and discards everything after it (the custom additions).
    """
    print("[*] - Restoring dictionnary to default...")
    try:
        out = []
        default = False
        # Fixed: files were opened without `with`, so they leaked when the
        # (caught) exception path was taken.
        with open("{}data/dict.txt".format(installDir)) as f:
            for line in f:
                temp = str(line).rstrip('\n\r')
                if temp == "update":
                    default = True
                    out.append(temp)
                if not default:
                    out.append(line)
        out.append("\n")
        with open("{}data/dict.txt".format(installDir), 'w') as f:
            f.writelines(out)
    except:
        ErrorHandler(err(), False)
def write_circos_table(data,
                       name=None,
                       rowheader=None,
                       colheader=None,
                       prefix="label",
                       corner=None,
                       delimiter='\t'):
    '''
    Write a matrix of data in tab-delimited Circos table format.

    input:
        data: a 2 dimensional array of data
        name: path and name of the file to save
        rowheader: row labels (empty list to auto-number)
        colheader: column labels (empty list to auto-number)
        prefix: label prefix used when rowheader is empty
        corner: unused, kept for interface compatibility
        delimiter: field separator
    output: a tab-delimited file
    Exits via sys.exit() if a header length does not match the data.
    '''
    with open(name, 'w') as f:
        # Order header row: "Data Data 1 2 ... n".
        f.write("Data")
        f.write(delimiter)
        f.write("Data")
        f.write(delimiter)
        for i in range(len(data[0])):
            f.write(str(i + 1))
            if i < len(data[0]) - 1:
                f.write(delimiter)
        f.write('\n')
        # Column label header row.
        f.write("Data")
        f.write(delimiter)
        if len(colheader) == 0:
            f.write("Data")
            f.write(delimiter)
            for i in range(len(data[0])):
                f.write(str(i))
                if i < len(data[0]) - 1:
                    f.write(delimiter)
            f.write('\n')
        elif len(colheader) == len(data[0]):
            f.write("Data")
            f.write(delimiter)
            for i in range(len(data[0][:])):
                f.write(colheader[i])
                if i < len(data[0]) - 1:
                    f.write(delimiter)
            f.write('\n')
        else:
            # Fixed: sys.err does not exist; write to stderr instead.
            sys.stderr.write("The label list does not match the data size\n")
            sys.exit()
        # Data rows.
        for i in range(len(data)):
            if len(rowheader) == 0:
                f.write(str(i + len(data[0])))
                # Fixed: delimiter between the row index and generated label
                # was missing, fusing the two fields.
                f.write(delimiter)
                f.write(prefix + str(i))
                f.write(delimiter)
            # Fixed: this branch previously tested colheader against
            # len(data[0]) (copy-paste); the row header must match len(data).
            elif len(rowheader) == len(data):
                f.write(str(i + len(data[0]) + 1))
                f.write(delimiter)
                f.write(rowheader[i])
                f.write(delimiter)
            else:
                sys.stderr.write("The label list does not match the data size\n")
                sys.exit()
            for j in range(len(data[i])):
                f.write(str(data[i][j]))
                if j < len(data[i]) - 1:
                    f.write(delimiter)
            f.write('\n')
def process(self, refDB):
    """
    Process one barcode's worth of SAM alignment lines against a reference.

    All lines in self.orig_bc_lines are expected to share one barcode
    (the read-name prefix before ':'). Properly mapped pairs with
    sufficient mapping quality are appended to self.ok_bc_lines; pairs
    with a low-quality or unmapped end are converted back to FASTQ and
    queued for re-mapping via self.addRead(). Secondary alignments and
    single-end reads are skipped. Returns 1 on error, None otherwise.
    `refDB` is currently unused here -- presumably consumed by the
    re-mapping step; confirm against the caller.
    """
    try:
        bc = self.orig_bc_lines[0].split(":")[0]
        mapped_pairs_count = 0
        remapped_pairs_count = 0
        mapped_singles_count = 0
        remapped_singles_count = 0
        secondary_alignment = 0
        count = 0
        PE1 = {}
        PE2 = {}
        for line in self.orig_bc_lines:
            # SAM flag bits:
            # 0x1 template having multiple segments in sequencing
            # 0x2 each segment properly aligned according to the aligner
            # 0x4 segment unmapped
            # 0x8 next segment in the template unmapped
            # 0x10 SEQ being reverse complemented
            # 0x20 SEQ of the next segment in the template being reversed
            # 0x40 the first segment in the template
            # 0x80 the last segment in the template
            # 0x100 secondary alignment
            # 0x200 not passing quality controls
            # 0x400 PCR or optical duplicate
            lbc = line.split(":")[0]
            if lbc != bc:
                # Fixed: sys.err does not exist; report on stderr.
                sys.stderr.write(
                    "Something went wrong, more than one barcode in process barcodes\n"
                )
            count += 1
            line2 = line.strip().split()
            flag = int(line2[1])
            # Secondary alignment: not sure what to do with these yet; ignore.
            if (flag & 0x100):
                secondary_alignment += 1
                continue
            mq = int(line2[4])
            # Handle SE: mapped SE reads have 0x1 unset. Singles aren't
            # expected here; SE remapping support is deliberately disabled.
            if not (flag & 0x1):
                continue
            # Handle PE:
            # 0x1 = multiple segments, 0x4 = segment unmapped, 0x8 = mate unmapped
            if (flag & 0x1):  # PE READ
                if (not (flag & 0x4) and not (flag & 0x8)):
                    # both ends mapped
                    if (flag & 0x40):  # PE1 (first segment in template)
                        # PE1 read, check that PE2 is in dict
                        ID = line2[0]
                        if ID in PE2:
                            if mq >= self.minMQ and int(
                                    PE2[ID].strip().split()[4]
                            ) >= self.minMQ:  # check MQ of both reads
                                self.ok_bc_lines.append(line)
                                self.ok_bc_lines.append(PE2[ID])
                                del PE2[ID]
                                # TODO: NEED to determine read cloud for read
                                mapped_pairs_count += 1
                            else:
                                # Low MQ on either end: rebuild FASTQ records
                                # and queue the pair for re-mapping.
                                if (flag & 0x10):  # reverse complement
                                    line2[9] = reverseComplement(line2[9])
                                    line2[10] = reverse(line2[10])
                                r1 = '\n'.join([
                                    '@' + line2[0] + ' 1:N:O', line2[9],
                                    '+', line2[10]
                                ])  # sequence + qual
                                rl2 = PE2[ID].strip().split()
                                if (int(rl2[1]) & 0x10):  # reverse complement
                                    rl2[9] = reverseComplement(rl2[9])
                                    rl2[10] = reverse(rl2[10])
                                r2 = '\n'.join([
                                    '@' + rl2[0] + ' 2:N:O', rl2[9], '+',
                                    rl2[10]
                                ])  # sequence + qual
                                self.addRead('\n'.join([r1, r2]))
                                del PE2[ID]
                                remapped_pairs_count += 1
                        else:
                            PE1[ID] = line
                    elif (flag & 0x80):  # PE2 (last segment in template)
                        # PE2 read, check that PE1 is in dict and write out
                        ID = line2[0]
                        if ID in PE1:
                            if mq >= self.minMQ and int(
                                    PE1[ID].strip().split()[4]
                            ) >= self.minMQ:  # check MQ of both reads
                                self.ok_bc_lines.append(line)
                                self.ok_bc_lines.append(PE1[ID])
                                del PE1[ID]
                                # TODO: NEED to determine read cloud for read
                                mapped_pairs_count += 1
                            else:
                                if (flag & 0x10):  # reverse complement
                                    line2[9] = reverseComplement(line2[9])
                                    line2[10] = reverse(line2[10])
                                r2 = '\n'.join([
                                    '@' + line2[0] + ' 2:N:O', line2[9],
                                    '+', line2[10]
                                ])  # sequence + qual
                                rl1 = PE1[ID].strip().split()
                                if (int(rl1[1]) & 0x10):  # reverse complement
                                    rl1[9] = reverseComplement(rl1[9])
                                    rl1[10] = reverse(rl1[10])
                                r1 = '\n'.join([
                                    '@' + rl1[0] + ' 1:N:O', rl1[9], '+',
                                    rl1[10]
                                ])  # sequence + qual
                                self.addRead('\n'.join([r1, r2]))
                                del PE1[ID]
                                remapped_pairs_count += 1
                        else:
                            PE2[ID] = line
                else:
                    # an 'unmapped' pair, at least 1 end unmapped
                    if (flag & 0x40):  # PE1 (first segment in template)
                        ID = line2[0]
                        if ID in PE2:
                            if (flag & 0x10):  # reverse complement
                                line2[9] = reverseComplement(line2[9])
                                line2[10] = reverse(line2[10])
                            r1 = '\n'.join([
                                '@' + line2[0] + ' 1:N:O', line2[9], '+',
                                line2[10]
                            ])  # sequence + qual
                            rl2 = PE2[ID].strip().split()
                            if (int(rl2[1]) & 0x10):  # reverse complement
                                rl2[9] = reverseComplement(rl2[9])
                                rl2[10] = reverse(rl2[10])
                            r2 = '\n'.join([
                                '@' + rl2[0] + ' 2:N:O', rl2[9], '+',
                                rl2[10]
                            ])  # sequence + qual
                            self.addRead('\n'.join([r1, r2]))
                            del PE2[ID]
                            remapped_pairs_count += 1
                        else:
                            PE1[ID] = line
                    elif (flag & 0x80):  # PE2 (last segment in template)
                        ID = line2[0]
                        if ID in PE1:
                            if (flag & 0x10):  # reverse complement
                                line2[9] = reverseComplement(line2[9])
                                line2[10] = reverse(line2[10])
                            # Fixed: this branch was copy-pasted from the PE1
                            # case -- it labeled the current read ' 1:N:O' and
                            # read/deleted the mate from PE2 even though the
                            # membership test was on PE1 (KeyError). The
                            # current read is the second segment; its mate
                            # lives in PE1.
                            r2 = '\n'.join([
                                '@' + line2[0] + ' 2:N:O', line2[9], '+',
                                line2[10]
                            ])  # sequence + qual
                            rl1 = PE1[ID].strip().split()
                            if (int(rl1[1]) & 0x10):  # reverse complement
                                rl1[9] = reverseComplement(rl1[9])
                                rl1[10] = reverse(rl1[10])
                            r1 = '\n'.join([
                                '@' + rl1[0] + ' 1:N:O', rl1[9], '+',
                                rl1[10]
                            ])  # sequence + qual
                            self.addRead('\n'.join([r1, r2]))
                            del PE1[ID]
                            remapped_pairs_count += 1
                        else:
                            PE2[ID] = line
    except (KeyboardInterrupt, SystemExit):
        sys.stderr.write("MAPPING\tERROR\t%s unexpectedly terminated\n" %
                         (__name__))
        return 1
    except:
        sys.stderr.write("".join(
            traceback.format_exception(*sys.exc_info())))
        return 1
def extract_folds(depth_file, min_fold_size=50, tiny_depth=0.001,
                  save_file=False):
    """
    Use depth to extract folds from a triangular surface mesh.

    Steps ::
        1. Compute histogram of depth measures.
        2. Define a depth threshold and find the deepest vertices.
        3. Segment deep vertices as an initial set of folds.
        4. Remove small folds.
        5. Find and fill holes in the folds.
        6. Renumber folds.

    Step 2 ::
        To extract an initial set of deep vertices from the surface mesh,
        we anticipate a rapidly decreasing distribution of low depth values
        (on the outer surface) with a long tail of higher depth values (in
        the folds), so we smooth the histogram's bin values, convolve to
        compute slopes, and find the depth value for the first bin with
        slope = 0. This is our threshold.

    Step 5 ::
        The folds could have holes in areas shallower than the depth
        threshold. Calling fill_holes() could accidentally include very
        shallow areas (in an annulus-shaped fold, for example), so we call
        fill_holes() with exclude_range set close to zero to retain these
        areas.

    Parameters
    ----------
    depth_file : string
        surface mesh file in VTK format with faces and depth scalar values
    min_fold_size : integer
        minimum fold size (number of vertices)
    tiny_depth : float
        largest non-zero depth value that will stop a hole from being filled
    save_file : Boolean
        save output VTK file?

    Returns
    -------
    folds : list of integers
        fold numbers for all vertices (-1 for non-fold vertices)
    n_folds : int
        number of folds
    depth_threshold : float
        threshold defining the minimum depth for vertices to be in a fold
    bins : list of integers
        histogram bins: each is the number of vertices within a range of
        depth values
    bin_edges : list of floats
        histogram bin edge values defining the bin ranges of depth values
    folds_file : string (if save_file)
        name of output VTK file with fold IDs (-1 for non-fold vertices)

    Raises
    ------
    ValueError
        if the mesh has too few vertices to build a depth histogram

    Examples
    --------
    >>> from mindboggle.features.folds import extract_folds
    >>> folds, n_folds, thr, bins, bin_edges, folds_file = extract_folds(
    ...     'lh.pial.travel_depth.vtk', 50, 0.001, True)
    """
    import os
    import numpy as np
    from time import time
    from scipy.ndimage.filters import gaussian_filter1d
    from mindboggle.utils.io_vtk import rewrite_scalars, read_vtk
    from mindboggle.utils.mesh import find_neighbors
    from mindboggle.utils.morph import fill_holes
    from mindboggle.utils.segment import segment

    do_fill_holes = True

    print("Extract folds in surface mesh")
    t0 = time()

    #-------------------------------------------------------------------------
    # Load depth values for all vertices
    #-------------------------------------------------------------------------
    faces, lines, indices, points, npoints, depths, name, input_vtk = \
        read_vtk(depth_file, return_first=True, return_array=True)

    #-------------------------------------------------------------------------
    # Find neighbors for each vertex
    #-------------------------------------------------------------------------
    neighbor_lists = find_neighbors(faces, npoints)

    #-------------------------------------------------------------------------
    # Compute histogram of depth measures
    #-------------------------------------------------------------------------
    min_vertices = 10000
    if npoints > min_vertices:
        # Fixed: np.histogram requires an integer bin count.
        nbins = int(np.round(npoints / 100.0))
    else:
        # Fixed: sys.err does not exist, and continuing would hit an
        # undefined `nbins`; fail loudly instead.
        raise ValueError(
            " Expecting at least {0} vertices to create depth histogram".
            format(min_vertices))
    bins, bin_edges = np.histogram(depths, bins=nbins)

    #-------------------------------------------------------------------------
    # Anticipating that there will be a rapidly decreasing distribution
    # of low depth values (on the outer surface) with a long tail of higher
    # depth values (in the folds), smooth the bin values (Gaussian), convolve
    # to compute slopes, and find the depth for the first bin with slope = 0.
    #-------------------------------------------------------------------------
    bins_smooth = gaussian_filter1d(bins.tolist(), 5)
    window = [-1, 0, 1]
    bin_slopes = np.convolve(bins_smooth, window,
                             mode='same') / (len(window) - 1)
    ibins0 = np.where(bin_slopes == 0)[0]
    # Fixed: `ibins0.shape` is a tuple like (0,), which is always truthy,
    # so the fallback branch was unreachable and an empty result raised
    # IndexError; test the element count instead.
    if ibins0.size:
        depth_threshold = bin_edges[ibins0[0]]
    else:
        depth_threshold = np.median(depths)

    #-------------------------------------------------------------------------
    # Find the deepest vertices
    #-------------------------------------------------------------------------
    indices_deep = [i for i, x in enumerate(depths) if x >= depth_threshold]
    if indices_deep:
        #---------------------------------------------------------------------
        # Segment deep vertices as an initial set of folds
        #---------------------------------------------------------------------
        print("  Segment vertices deeper than {0:.2f} as folds".format(
            depth_threshold))
        t1 = time()
        folds = segment(indices_deep, neighbor_lists)
        # Slightly slower alternative -- fill boundaries:
        #regions = -1 * np.ones(len(points))
        #regions[indices_deep] = 1
        #folds = segment_by_filling_borders(regions, neighbor_lists)
        print('  ...Segmented folds ({0:.2f} seconds)'.format(time() - t1))

        #---------------------------------------------------------------------
        # Remove small folds
        #---------------------------------------------------------------------
        if min_fold_size > 1:
            print('  Remove folds smaller than {0}'.format(min_fold_size))
            unique_folds = [x for x in np.unique(folds) if x > -1]
            for nfold in unique_folds:
                indices_fold = [i for i, x in enumerate(folds) if x == nfold]
                if len(indices_fold) < min_fold_size:
                    folds[indices_fold] = -1

        #---------------------------------------------------------------------
        # Find and fill holes in the folds
        # Note: Surfaces surrounded by folds can be mistaken for holes,
        # so exclude_range includes outer surface values close to zero.
        #---------------------------------------------------------------------
        if do_fill_holes:
            print("  Find and fill holes in the folds")
            folds = fill_holes(folds, neighbor_lists, values=depths,
                               exclude_range=[0, tiny_depth])

        #---------------------------------------------------------------------
        # Renumber folds so they are sequential
        #---------------------------------------------------------------------
        renumber_folds = -1 * np.ones(len(folds))
        fold_numbers = [int(x) for x in np.unique(folds) if x > -1]
        for i_fold, n_fold in enumerate(fold_numbers):
            fold = [i for i, x in enumerate(folds) if x == n_fold]
            renumber_folds[fold] = i_fold
        folds = renumber_folds
        # Fixed: using the loop variable failed when fold_numbers was empty.
        n_folds = len(fold_numbers)

        print('  ...Extracted {0} folds ({1:.2f} seconds)'.
              format(n_folds, time() - t0))
    else:
        print('  No deep vertices')
        # Fixed: folds/n_folds were undefined on this path, so the return
        # statement raised UnboundLocalError.
        folds = -1 * np.ones(npoints)
        n_folds = 0

    #-------------------------------------------------------------------------
    # Return folds, number of folds, file name
    #-------------------------------------------------------------------------
    if save_file:
        folds_file = os.path.join(os.getcwd(), 'folds.vtk')
        rewrite_scalars(depth_file, folds_file, folds, 'folds', folds)
        if not os.path.exists(folds_file):
            raise(IOError(folds_file + " not found"))
    else:
        folds_file = None

    return folds.tolist(), n_folds, depth_threshold, bins, bin_edges, folds_file
def extract_folds(depth_file, min_vertices=10000, min_fold_size=50,
                  do_fill_holes=False, min_hole_depth=0.001,
                  save_file=False):
    """
    Use depth to extract folds from a triangular surface mesh.

    Steps ::
        1. Compute histogram of depth measures.
        2. Define a depth threshold and find the deepest vertices.
        3. Segment deep vertices as an initial set of folds.
        4. Remove small folds.
        5. Find and fill holes in the folds (optional).
        6. Renumber folds.

    Step 2 ::
        To extract an initial set of deep vertices from the surface mesh,
        we anticipate a rapidly decreasing distribution of low depth values
        (on the outer surface) with a long tail of higher depth values (in
        the folds), so we smooth the histogram's bin values, convolve to
        compute slopes, and find the depth value for the first bin with
        slope = 0. This is our threshold.

    Step 5 ::
        The folds could have holes in areas shallower than the depth
        threshold. Calling fill_holes() could accidentally include very
        shallow areas (in an annulus-shaped fold, for example), so we
        include the argument exclude_range to check for any values from
        zero to min_hole_depth; holes are not filled if they contain
        values within this range.

    Parameters
    ----------
    depth_file : string
        surface mesh file in VTK format with faces and depth scalar values
    min_vertices : integer
        minimum number of vertices needed to build a depth histogram
    min_fold_size : integer
        minimum fold size (number of vertices)
    do_fill_holes : Boolean
        fill holes in the folds?
    min_hole_depth : float
        largest non-zero depth value that will stop a hole from being filled
    save_file : Boolean
        save output VTK file?

    Returns
    -------
    folds : list of integers
        fold numbers for all vertices (-1 for non-fold vertices)
    n_folds : int
        number of folds
    depth_threshold : float
        threshold defining the minimum depth for vertices to be in a fold
    bins : list of integers
        histogram bins: each is the number of vertices within a range of
        depth values
    bin_edges : list of floats
        histogram bin edge values defining the bin ranges of depth values
    folds_file : string (if save_file)
        name of output VTK file with fold IDs (-1 for non-fold vertices)

    Raises
    ------
    ValueError
        if the mesh has fewer than min_vertices vertices

    Examples
    --------
    >>> from mindboggle.features.folds import extract_folds
    >>> folds, n_folds, thr, bins, bin_edges, folds_file = extract_folds(
    ...     'travel_depth.vtk', 10000, 50, False, 0.001, True)
    """
    import os
    import numpy as np
    from time import time
    from scipy.ndimage.filters import gaussian_filter1d
    from mindboggle.mio.vtks import rewrite_scalars, read_vtk
    from mindboggle.guts.mesh import find_neighbors
    from mindboggle.guts.morph import fill_holes
    from mindboggle.guts.segment import segment

    print("Extract folds in surface mesh")
    t0 = time()

    #-------------------------------------------------------------------------
    # Load depth values for all vertices
    #-------------------------------------------------------------------------
    points, indices, lines, faces, depths, scalar_names, npoints, \
        input_vtk = read_vtk(depth_file, return_first=True, return_array=True)

    #-------------------------------------------------------------------------
    # Find neighbors for each vertex
    #-------------------------------------------------------------------------
    neighbor_lists = find_neighbors(faces, npoints)

    #-------------------------------------------------------------------------
    # Compute histogram of depth measures
    #-------------------------------------------------------------------------
    if npoints > min_vertices:
        # Fixed: np.histogram requires an integer bin count.
        nbins = int(np.round(npoints / 100.0))
    else:
        # Fixed: sys.err does not exist, and continuing would hit an
        # undefined `nbins`; fail loudly instead.
        raise ValueError(
            " Expecting at least {0} vertices to create depth histogram".
            format(min_vertices))
    bins, bin_edges = np.histogram(depths, bins=nbins)

    #-------------------------------------------------------------------------
    # Anticipating that there will be a rapidly decreasing distribution
    # of low depth values (on the outer surface) with a long tail of higher
    # depth values (in the folds), smooth the bin values (Gaussian), convolve
    # to compute slopes, and find the depth for the first bin with slope = 0.
    #-------------------------------------------------------------------------
    bins_smooth = gaussian_filter1d(bins.tolist(), 5)
    window = [-1, 0, 1]
    bin_slopes = np.convolve(bins_smooth, window,
                             mode='same') / (len(window) - 1)
    ibins0 = np.where(bin_slopes == 0)[0]
    # Fixed: `ibins0.shape` is a tuple like (0,), which is always truthy,
    # so the fallback branch was unreachable and an empty result raised
    # IndexError; test the element count instead.
    if ibins0.size:
        depth_threshold = bin_edges[ibins0[0]]
    else:
        depth_threshold = np.median(depths)

    #-------------------------------------------------------------------------
    # Find the deepest vertices
    #-------------------------------------------------------------------------
    indices_deep = [i for i, x in enumerate(depths) if x >= depth_threshold]
    if indices_deep:
        #---------------------------------------------------------------------
        # Segment deep vertices as an initial set of folds
        #---------------------------------------------------------------------
        print("  Segment vertices deeper than {0:.2f} as folds".format(
            depth_threshold))
        t1 = time()
        folds = segment(indices_deep, neighbor_lists)
        # Slightly slower alternative -- fill boundaries:
        #regions = -1 * np.ones(len(points))
        #regions[indices_deep] = 1
        #folds = segment_by_filling_borders(regions, neighbor_lists)
        print('  ...Segmented folds ({0:.2f} seconds)'.format(time() - t1))

        #---------------------------------------------------------------------
        # Remove small folds
        #---------------------------------------------------------------------
        if min_fold_size > 1:
            print('  Remove folds smaller than {0}'.format(min_fold_size))
            unique_folds = [x for x in np.unique(folds) if x != -1]
            for nfold in unique_folds:
                indices_fold = [i for i, x in enumerate(folds) if x == nfold]
                if len(indices_fold) < min_fold_size:
                    folds[indices_fold] = -1

        #---------------------------------------------------------------------
        # Find and fill holes in the folds
        # Note: Surfaces surrounded by folds can be mistaken for holes,
        # so exclude_range includes outer surface values close to zero.
        #---------------------------------------------------------------------
        if do_fill_holes:
            print("  Find and fill holes in the folds")
            folds = fill_holes(folds, neighbor_lists, values=depths,
                               exclude_range=[0, min_hole_depth])

        #---------------------------------------------------------------------
        # Renumber folds so they are sequential
        #---------------------------------------------------------------------
        renumber_folds = -1 * np.ones(len(folds))
        fold_numbers = [int(x) for x in np.unique(folds) if x != -1]
        for i_fold, n_fold in enumerate(fold_numbers):
            fold = [i for i, x in enumerate(folds) if x == n_fold]
            renumber_folds[fold] = i_fold
        folds = renumber_folds
        # Fixed: using the loop variable failed when fold_numbers was empty.
        n_folds = len(fold_numbers)

        print('  ...Extracted {0} folds ({1:.2f} seconds)'.format(
            n_folds, time() - t0))
    else:
        print('  No deep vertices')
        # Fixed: folds/n_folds were undefined on this path, so the list
        # conversion below raised NameError.
        folds = -1 * np.ones(npoints)
        n_folds = 0

    folds = [int(x) for x in folds]

    #-------------------------------------------------------------------------
    # Return folds, number of folds, file name
    #-------------------------------------------------------------------------
    if save_file:
        folds_file = os.path.join(os.getcwd(), 'folds.vtk')
        rewrite_scalars(depth_file, folds_file, folds, 'folds', folds)
        if not os.path.exists(folds_file):
            raise (IOError(folds_file + " not found"))
    else:
        folds_file = None

    return folds, n_folds, depth_threshold, bins, bin_edges, folds_file
def create_trials(dict, data_h5, options):
    """Populate *dict* with per-trial epoch entries read from *data_h5*.

    Parameters
    ----------
    dict : dict
        Output mapping filled with "epochs.trial_NNN.<field>" keys.
        (The parameter shadows the builtin ``dict``; the name is kept for
        backward compatibility with existing callers.)
    data_h5 : h5py file/group
        Source containing trialIds/trialStartTimes and, depending on the
        data type, trial properties ("ephys") or imaging metadata ("ophys").
    options : namespace
        Must provide ``verbose`` and ``data_type`` ("ephys" or "ophys").

    Returns
    -------
    dict
        The same mapping, updated in place.
    """
    trial_id = data_h5["trialIds/trialIds"].value
    if options.verbose:
        print("\nCreating trials with ids: " + str(trial_id))
    trial_t = data_h5["trialStartTimes/trialStartTimes"].value
    # Trial stop isn't stored.  Assume the last trial lasts twice the mean
    # inter-trial interval -- padding on the high side shouldn't matter.
    ival = (trial_t[-1] - trial_t[0]) / (len(trial_t) - 1)
    trial_t = np.append(trial_t, trial_t[-1] + 2 * ival)
    if options.data_type == "ephys":
        good_trials = get_value_by_key(data_h5["/trialPropertiesHash"],
                                       "GoodTrials")
        # Start/stop times of every trial flagged as bad (good_trial == 0);
        # the stop list is the start list shifted by one (see padding above).
        ignore_ivals_start = [
            time for (time, good_trial) in zip(trial_t, good_trials)
            if good_trial == 0
        ]
        ignore_ivals_stop = [
            time for (time, good_trial) in zip(trial_t[1:], good_trials)
            if good_trial == 0
        ]
        ignore_intervals = [ignore_ivals_start, ignore_ivals_stop]
        keyName3 = "PhotostimulationType"
        hash_group_pointer2 = data_h5["/trialPropertiesHash"]
        stimulus_types = np.array(
            get_value_by_key(hash_group_pointer2, keyName3)).tolist()
        count_1 = stimulus_types.count(1)
        count_2 = stimulus_types.count(2)
    elif options.data_type == "ophys":
        plane_map = create_plane_map(data_h5, options)
        epoch_roi_list, epoch_roi_planes = create_trial_roi_map(
            data_h5, plane_map, options)
    for i in range(len(trial_id)):
        tid = trial_id[i]
        # Zero-padded three-digit trial name, e.g. trial_007.
        trial = "trial_%d%d%d" % (int(tid / 100), int(tid / 10) % 10,
                                  tid % 10)
        dict["epochs." + trial + ".description"] = \
            "Data that belong to " + trial
        if options.data_type == "ephys":
            start = trial_t[i]
            stop = trial_t[i + 1]
            dict["epochs." + trial + ".start_time"] = start
            dict["epochs." + trial + ".stop_time"] = stop
            tags = []
            if good_trials[i] == 1:
                tags.append("Good trial")
            else:
                tags.append("Non-performing")
            for j in range(len(epoch_tags[trial])):
                tags.append(epoch_tags[trial][j])
            try:
                dict["epochs." + trial + ".tags"] = tags
            except Exception:
                # Fixed: was sys.err(...), which does not exist and would
                # itself raise; report on stderr instead (best-effort).
                sys.stderr.write(" Unable to create dataset 'tag' containing "
                                 + str(tags) + "\n")
            # Keep with tradition and create a units field, even if empty.
            if trial not in epoch_units:
                units = ["NA"]
            else:
                units = epoch_units[trial]
            try:
                dict["epochs." + trial + ".units_present"] = units
            except Exception:
                # Fixed: was a Python-2 print statement in otherwise
                # print()-style code.
                print(" Unable to create dataset 'units_present' containing "
                      + str(units))
            raw_path = "descrHash/value/%d" % (trial_id[i])
            raw_file = parse_h5_obj(data_h5[raw_path])[0]
            if len(raw_file) == 1:
                raw_file = 'na'
            else:
                raw_file = str(raw_file)
        elif options.data_type == "ophys":
            start = trial_t[i]
            stop = trial_t[i + 1]
            dict["epochs." + trial + ".start_time"] = start
            dict["epochs." + trial + ".stop_time"] = stop
            if trial in epoch_roi_list.keys():
                dict["epochs." + trial + ".ROIs"] = epoch_roi_list[trial]
                dict["epochs." + trial + ".ROI_planes"] = \
                    epoch_roi_planes[trial]
            tags = []
            if trial in epoch_trial_types:
                for j in range(len(epoch_trial_types[trial])):
                    tags.append(epoch_trial_types[trial][j])
            dict["epochs." + trial + ".tags"] = tags
    return dict
def main(args):
    """Parse command-line options, read a sample/barcode sheet, and run the
    barcode presence check (optionally trimming with cutadapt).

    Exits with a message when no arguments are given, when the amplicon is
    not one of 16S/nifH/ITS, when the sheet's column labels cannot be
    detected, or when a duplicate sample/barcode entry is found.
    """
    parser = argparse.ArgumentParser(
        description='Check if the barcodes are present in the sample')
    # file containing the sample name and the barcode string
    parser.add_argument('-i', '--infile')
    parser.add_argument('-o', '--outfile', default="out.txt")
    parser.add_argument('-l', '--logfile', default="ITS-log-file.txt")
    parser.add_argument('-a', '--amplicon', default="")  # 16S, nifH, ITS
    # files in fasta format, default fastq
    parser.add_argument('-f', '--fasta', action='store_true', default=False)
    # Phred quality threshold of seq.
    parser.add_argument('-q', '--quality', default=30, type=int)
    # prefix for trimmed files
    parser.add_argument('-z', '--trimfastq',
                        default='./Trimmed_fastq_files/')
    # run cutadapt
    parser.add_argument('-c', '--cutadapt', action='store_true',
                        default=False)
    parser.add_argument('-d', '--delimiter', default='\t')
    # draw histogram for each adapter using infile
    parser.add_argument('-m', '--histogram', action='store_true',
                        default=False)
    args = parser.parse_args()
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit('\natleast one argument required\n')

    infile = args.infile
    outpfile = args.outfile
    filetype = '.fasta' if args.fasta else '.fastq'
    logfile = args.logfile
    run_cutadapt = args.cutadapt
    delim = args.delimiter
    if args.histogram:
        draw_histogram_adpaters_trimed_seqs(infile)
        sys.exit(0)

    global FWDPRIMER, FADAPTER, REVPRIMER, RADAPTER, QUALITY, R_1, R_2, \
        INTERMFASTQ, INTERMINFO, TRIMFASTQF, TRIMFASTQD, \
        REV_COMPLEM_RAD_RP, REV_COMPLEM_FAD_FP, FAD_FP, RAD_RP, \
        REV_COMPLEM_FWDPRIMER, REV_COMPLEM_REVPRIMER
    if args.amplicon == "16S":
        # primers and adapters for the 16S data
        FWDPRIMER = 'CCTACGGGNGGCWGCAG'
        FADAPTER = 'TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG'
        REVPRIMER = 'GACTACHVGGGTATCTAATCC'
        RADAPTER = 'GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG'
    elif args.amplicon == "nifH":
        # primers and adapters for the nifH data
        FWDPRIMER = 'TGCGAYCCSAARGCBGACTC'
        FADAPTER = 'TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG'
        REVPRIMER = 'ATSGCCATCATYTCRCCGGA'
        RADAPTER = 'GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG'
    elif args.amplicon == "ITS":
        # primers and adapters for the ITS data
        FWDPRIMER = 'CTTGGTCATTTAGAGGAAGTAA'
        FADAPTER = 'TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG'
        REVPRIMER = 'GCTGCGTTCTTCATCGATGC'
        RADAPTER = 'GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG'
    else:
        # Fixed: was sys.err(...), which does not exist; even if it printed,
        # execution would have continued with the primers unset.  sys.exit
        # prints the message to stderr and stops.
        sys.exit(
            "The code was modified so that you now have to explicitly mention the amplicon (16S, nifH, ITS). Previous code allowed 16S with a boolean (True) -a and without the argument it defaulted to ITS"
        )
    REV_COMPLEM_FWDPRIMER = BioSeq.reverse_complement(
        FWDPRIMER)  # search at end in read 2
    REV_COMPLEM_REVPRIMER = BioSeq.reverse_complement(
        REVPRIMER)  # search at end in read 1
    FAD_FP = FADAPTER + FWDPRIMER  # check at start in read 1
    RAD_RP = RADAPTER + REVPRIMER  # check at start in read 2
    REV_COMPLEM_RAD_RP = BioSeq.reverse_complement(
        RAD_RP)  # search at end in read 1
    REV_COMPLEM_FAD_FP = BioSeq.reverse_complement(
        FAD_FP)  # search at end in read 2
    QUALITY = args.quality
    R_1 = '_L001_R1_001'
    R_2 = '_L001_R2_001'
    #INTERMFASTQ = args.intermedfastq
    #INTERMINFO = args.intermedinfo
    TRIMFASTQD = args.trimfastq  # dir
    TRIMFASTQF = TRIMFASTQD + 'trim_'  # dir + prefix for files
    # Renamed loop variable from `dir` (shadowed the builtin).
    for out_dir in [TRIMFASTQD]:
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
    lines = read_file(infile)
    # Sample names look like GRL4463_S49_L001_R1_001 or 1_S72_L001_R1_001.
    SAMPLE_BARCODE_DICT = dict()  # this is not a global variable
    sampleid_col = fbarcode_col = rbarcode_col = 0
    for l in lines:
        cont = l.strip().split(delim)
        if '#' in l:
            # Identify the column numbers of SampleID and the barcode
            # sequences; test membership first to avoid a ValueError from
            # .index() when a label is absent.
            if "#SampleID" in cont:
                sampleid_col = cont.index("#SampleID")
            if "FBarcodeSequence" in cont:
                fbarcode_col = cont.index("FBarcodeSequence")
            if "RBarcodeSequence" in cont:
                rbarcode_col = cont.index("RBarcodeSequence")
            continue
        if sampleid_col == 0 and fbarcode_col == 0 and rbarcode_col == 0:
            sys.exit(
                'Could not detect the column labels SampleID, FBarcodeSequence, or RBarcodeSequence'
            )
        sample = cont[sampleid_col] + '_S'
        barcode = cont[fbarcode_col] + '_' + cont[rbarcode_col]
        if sample in SAMPLE_BARCODE_DICT:
            sys.exit(
                '\nduplicate samples/barcodes detected, check sample sheet\n')
        else:
            SAMPLE_BARCODE_DICT[sample] = barcode
    # The Nextera 16S metagenomic kit has the following layout:
    # P5 - Index2 - overhang_adapter - fwd_primer - DNA - rev_primer -
    # overhang_adapter - Index1 - P7
    barcode_stats(outpfile, SAMPLE_BARCODE_DICT, delim, filetype,
                  run_cutadapt, logfile)
#!/usr/bin/python3 import sys peaks = dict() table = open(sys.argv[1]) table.readline() for i in table: f = i.rstrip().split("\t") chrx = f[0] + ":" + f[1] start = int(f[2]) end = int(f[3]) if start > end: sys.err("Bro, something wrong with the coordinates!" + i) exit(0) if chrx not in peaks: peaks[chrx] = list() peaks[chrx].append((start, end)) for chrx in peaks: for i in range(len(peaks[chrx]) - 1): for j in range(i+1, len(peaks[chrx])): distance = abs(peaks[chrx][j][0] - peaks[chrx][i][1]) print(distance)
nvalues = len(mvs[0]) print nvalues for nv in range(0,nvalues): if mvs[0][nv]+count == 0 or mvs[0][nv]+count == ntimes-1: #can't do anything for missing values at the very beginning or very end of the timeseries #print "can't deal with missing values sequentially in time, setting to zero" precipin[mvs[0][nv],mvs[1][nv],mvs[2][nv]] = 0.0 elif mvs[0][nv] < step+1: if (precipin[mvs[0][nv]-1,mvs[1][nv],mvs[2][nv]] > -1 and precipin[mvs[0][nv]+1,mvs[1][nv],mvs[2][nv]] > -1): precipin[mvs[0][nv],mvs[1][nv],mvs[2][nv]] = 0.5 * precipin[mvs[0][nv]-1,mvs[1][nv],mvs[2][nv]] + \ 0.5 * precipin[mvs[0][nv]+1,mvs[1][nv],mvs[2][nv]] else: precipin[mvs[0][nv],mvs[1][nv],mvs[2][nv]] = 0.0 else: #don't do anything for the last timestamp in each segment - it will be included in the next unless it's the #last one in which case we've already dealt with it! print "not doing anything for time " + str(count + mvs[0][nv]) count = count + step mvs2 = np.where(precipin[0:step+1,:,:] < -1) nvalues2 = len(mvs2[0]) if nvalues2 != 0: sys.err("error with filling missing data; missing data remains") precipdata.variables['pcp'][nt:upto,:,:] = precipin print 'finished filling in missing data'
import sys import json import yaml try: jsonfile = sys.argv[1] except IndexError: sys.err(f"Usage: {sys.argv[0]} JSONFILE") with open(jsonfile) as f: data = json.load(f) tickets = data["tickets"] output = {"labels": {}} tag_types = [ ["component", "c:"], ["priority", "p:"], ["type", "t:"], ["owner", "o:"], ["version", "v:"], ["resolution", "r:"], ] # Convert space to hyphen and drop punctuation fixer = str.maketrans(" ", "-", ",;:") for tag_type, prefix in tag_types: tag_list = set(d.get("attributes").get(tag_type) for d in tickets.values()) output["labels"][tag_type] = { _: f"{prefix}{_.translate(fixer)}" for _ in tag_list } print(f"{tag_type}s:", tag_list)