def runInstances(credClient, ami, key, iType, groups, availZone, bidPrice, minInstances, maxInstances, userData):
    """
    Start between minInstances and maxInstances instances through credClient,
    retrying up to RUN_INSTANCE_TRIES times with a 30 second pause between
    attempts.

    If bidPrice is set, spot instances are requested; otherwise on-demand
    instances.  Each retry only asks for the instances still missing; the
    successfully started ones accumulate in `instances`.

    On failure, every instance that did start is terminated (serially, in
    chunks of 5) and the error is re-raised so callers never leak instances.

    NOTE: this is a generator that yields Deferreds; the inlineCallbacks-style
    decorator is presumably applied at the definition site outside this view.
    """
    def _runInstances(num):
        # Spot vs. on-demand is decided solely by the presence of a bid price.
        if bidPrice:
            return credClient.runSpotInstances(bidPrice=bidPrice,
                                               ami=ami,
                                               key=key,
                                               instanceType=iType,
                                               groups=groups,
                                               availabilityZone=availZone,
                                               numInstances=num,
                                               userData=userData)
        else:
            return credClient.runInstances(ami=ami,
                                           key=key,
                                           instanceType=iType,
                                           groups=groups,
                                           availabilityZone=availZone,
                                           numInstances=num,
                                           userData=userData)

    instances = []

    @defer.inlineCallbacks
    def _startInstances():
        # Ask only for what is still missing on this attempt.
        startedInstances = yield _runInstances(maxInstances - len(instances))
        instances.extend(startedInstances)
        if len(instances) < minInstances:
            # NOTE(review): the first %d is the count still missing *after*
            # extending, not the originally requested count -- the message
            # may understate what was wanted; confirm intent before changing.
            raise InstanceStartError('Wanted %d instances got %d' %
                                     (maxInstances - len(instances),
                                      len(startedInstances)))

    try:
        yield defer_utils.tryUntil(RUN_INSTANCE_TRIES,
                                   _startInstances,
                                   onFailure=defer_utils.sleep(30))
    except Exception as err:
        ## If we got an exception then terminate any instances
        ## that were started and reraise the exception.
        ## The last thing we want is to leak instances
        ##
        ## This is not completely safe!  We should probably
        ## raise an exception with the started instances in it
        ## and let the caller decide what to do with them
        log.err('Error starting instances')
        log.err(err)
        # BUGFIX: the termination Deferred was previously not yielded and the
        # exception was never re-raised, so callers saw a clean (None) result
        # after a failed start.  Wait for termination to finish, then re-raise.
        yield defer_utils.mapSerial(lambda iChunk : credClient.terminateInstances(iChunk),
                                    func.chunk(5, instances))
        # `raise err` rather than bare `raise`: the exception state may have
        # been clobbered by work done during the yield above.
        raise err
def terminateCluster(credClient, persistManager, clusterName, userName):
    """
    Terminate every instance belonging to the named cluster.

    Worker (exec + data) nodes are shut down first, serially in batches of
    five, followed by the master node if one exists.  Returns, via
    defer.returnValue, the cluster value with its state set to TERMINATED.
    """
    cluster = yield persistManager.loadCluster(clusterName, userName)

    # Batches of five keep each terminateInstances call small; mapSerial
    # runs one batch after another rather than in parallel.
    workerBatches = func.chunk(5, cluster.execNodes + cluster.dataNodes)
    yield defer_utils.mapSerial(lambda nodes: credClient.terminateInstances(nodes),
                                workerBatches)

    # The master, if any, goes last.
    if cluster.master:
        yield credClient.terminateInstances([cluster.master])

    defer.returnValue(cluster.setState(cluster.TERMINATED))
def terminateCluster(credClient, persistManager, clusterName, userName):
    """
    Terminate every instance in the named cluster.

    Worker (exec + data) nodes are terminated serially in chunks of 5,
    then the master node (if present).  This is a generator yielding
    Deferreds; the inlineCallbacks-style decorator is presumably applied
    at the definition site, outside this view.

    Returns (via defer.returnValue) the cluster value with its state set
    to TERMINATED.  NOTE(review): the new state is not persisted here --
    presumably the caller saves it; confirm.
    """
    cluster = yield persistManager.loadCluster(clusterName, userName)
    # Chunks of 5 keep each terminateInstances call small; mapSerial runs
    # the chunks one after another rather than in parallel.
    yield defer_utils.mapSerial(
        lambda instances: credClient.terminateInstances(instances),
        func.chunk(5, cluster.execNodes + cluster.dataNodes)
    )
    if cluster.master:
        yield credClient.terminateInstances([cluster.master])
    defer.returnValue(cluster.setState(cluster.TERMINATED))
def runInstances(credClient, ami, key, iType, groups, availZone, bidPrice,
                 minInstances, maxInstances, userData):
    """
    Start between minInstances and maxInstances instances through credClient,
    retrying up to RUN_INSTANCE_TRIES times with a 30 second pause between
    attempts.

    If bidPrice is set, spot instances are requested; otherwise on-demand
    instances.  Each retry only asks for the instances still missing; the
    successfully started ones accumulate in `instances`.

    On failure, every instance that did start is terminated (serially, in
    chunks of 5) and the error is re-raised so callers never leak instances.

    NOTE: this is a generator that yields Deferreds; the inlineCallbacks-style
    decorator is presumably applied at the definition site outside this view.
    """
    def _runInstances(num):
        # Spot vs. on-demand is decided solely by the presence of a bid price.
        if bidPrice:
            return credClient.runSpotInstances(bidPrice=bidPrice,
                                               ami=ami,
                                               key=key,
                                               instanceType=iType,
                                               groups=groups,
                                               availabilityZone=availZone,
                                               numInstances=num,
                                               userData=userData)
        else:
            return credClient.runInstances(ami=ami,
                                           key=key,
                                           instanceType=iType,
                                           groups=groups,
                                           availabilityZone=availZone,
                                           numInstances=num,
                                           userData=userData)

    instances = []

    @defer.inlineCallbacks
    def _startInstances():
        # Ask only for what is still missing on this attempt.
        startedInstances = yield _runInstances(maxInstances - len(instances))
        instances.extend(startedInstances)
        if len(instances) < minInstances:
            # NOTE(review): the first %d is the count still missing *after*
            # extending, not the originally requested count -- the message
            # may understate what was wanted; confirm intent before changing.
            raise InstanceStartError(
                'Wanted %d instances got %d' %
                (maxInstances - len(instances), len(startedInstances)))

    try:
        yield defer_utils.tryUntil(RUN_INSTANCE_TRIES,
                                   _startInstances,
                                   onFailure=defer_utils.sleep(30))
    except Exception as err:
        ## If we got an exception then terminate any instances
        ## that were started and reraise the exception.
        ## The last thing we want is to leak instances
        ##
        ## This is not completely safe!  We should probably
        ## raise an exception with the started instances in it
        ## and let the caller decide what to do with them
        log.err('Error starting instances')
        log.err(err)
        # BUGFIX: the termination Deferred was previously not yielded and the
        # exception was never re-raised, so callers saw a clean (None) result
        # after a failed start.  Wait for termination to finish, then re-raise.
        yield defer_utils.mapSerial(
            lambda iChunk: credClient.terminateInstances(iChunk),
            func.chunk(5, instances))
        # `raise err` rather than bare `raise`: the exception state may have
        # been clobbered by work done during the yield above.
        raise err
def terminateInstancesByAttribute(persistManager, credClient, clusterName, userName, byAttribute, attributeValues):
    """
    Terminate the cluster's worker instances whose `byAttribute` value is one
    of attributeValues, and return (via defer.returnValue) the cluster with
    those instances removed from both the exec and data node lists.
    """
    cluster = yield persistManager.loadCluster(clusterName, userName)

    # Collect every exec/data node whose attribute matches.  Instances
    # support dict-style lookup for byAttribute.
    matching = []
    for node in cluster.execNodes + cluster.dataNodes:
        if node[byAttribute] in attributeValues:
            matching.append(node)

    # Terminate serially, five instances per call.
    yield defer_utils.mapSerial(credClient.terminateInstances,
                                func.chunk(5, matching))

    # Remove from both node lists; removing a non-member from either list is
    # presumably a no-op in the cluster value.
    updated = cluster.removeExecNodes(matching)
    defer.returnValue(updated.removeDataNodes(matching))
def terminateInstancesByAttribute(persistManager, credClient, clusterName, userName, byAttribute, attributeValues):
    """
    Terminate the cluster's worker instances whose `byAttribute` value is
    one of attributeValues.

    Returns (via defer.returnValue) the cluster with the terminated
    instances removed from both the exec and data node lists.  This is a
    generator yielding Deferreds; the inlineCallbacks-style decorator is
    presumably applied outside this view.
    """
    cluster = yield persistManager.loadCluster(clusterName, userName)
    # Instances support dict-style lookup for byAttribute.
    instances = [i for i in cluster.execNodes + cluster.dataNodes
                 if i[byAttribute] in attributeValues]
    # Serial termination in chunks of 5 keeps each API call small.
    yield defer_utils.mapSerial(credClient.terminateInstances,
                                func.chunk(5, instances))
    # Removing from both lists is safe: removal of a non-member is
    # presumably a no-op in the cluster value.
    defer.returnValue(cluster.removeExecNodes(instances).removeDataNodes(instances))
def removeTerminatedCluster(persistManager, credClient, clusterName, userName):
    """
    After waiting REMOVE_CLUSTER_TIMEOUT, delete a TERMINATED cluster from
    the persistence layer, first terminating any of its instances that are
    somehow still alive.
    """
    # defer_utils.sleep returns a callable that produces the delay Deferred.
    yield defer_utils.sleep(REMOVE_CLUSTER_TIMEOUT)()

    cluster = yield persistManager.loadCluster(clusterName, userName)
    if cluster.state != cluster.TERMINATED:
        return

    # Another check to make sure the instances have really been terminated:
    # refresh their state and kill any that are not yet 'terminated'.
    allNodes = [cluster.master] + cluster.execNodes + cluster.dataNodes
    refreshed = yield credClient.updateInstances(allNodes)
    undead = [inst for inst in refreshed if inst['state'] != 'terminated']
    if undead:
        yield defer_utils.mapSerial(
            lambda batch: credClient.terminateInstances(batch),
            func.chunk(5, undead))

    yield persistManager.removeCluster(clusterName, userName)
def removeTerminatedCluster(persistManager, credClient, clusterName, userName):
    """
    After waiting REMOVE_CLUSTER_TIMEOUT, remove a TERMINATED cluster from
    the persistence layer, first terminating any of its instances that
    still report a state other than "terminated".

    Generator yielding Deferreds; the inlineCallbacks-style decorator is
    presumably applied outside this view.
    """
    # defer_utils.sleep returns a callable producing the delay Deferred.
    yield defer_utils.sleep(REMOVE_CLUSTER_TIMEOUT)()
    cluster = yield persistManager.loadCluster(clusterName, userName)
    if cluster.state == cluster.TERMINATED:
        # Another check to make sure the instances have
        # really been terminated
        instances = [cluster.master] + cluster.execNodes + cluster.dataNodes
        instances = yield credClient.updateInstances(instances)
        undeadInstances = [i for i in instances if i["state"] != "terminated"]
        if undeadInstances:
            # Serial termination in chunks of 5.
            yield defer_utils.mapSerial(
                lambda instances: credClient.terminateInstances(instances),
                func.chunk(5, undeadInstances)
            )
        yield persistManager.removeCluster(clusterName, userName)
def tree_loop(fasta_dict, combined, tree, parallel_workers, run_r, num_refs):
    """
    Run the per-sequence phylogenetic workflow over every entry of
    fasta_dict in parallel, then merge the per-thread result files.

    fasta_dict       - mapping of id -> sequence (iterated with iteritems;
                       Python 2 code)
    combined         - reference passed to blast_against_reference
    tree             - reference tree handed to the dendropy comparisons
    parallel_workers - worker count for p_func.pmap
    run_r            - the string "T" enables the optional R plotting step
    num_refs         - reference-sequence count used by check_and_align_seqs

    Side effects: writes distance.txt, name.txt, polys.txt, length.txt,
    euc_dist.txt and all_distances.txt in the current working directory,
    plus files under ./R_output when run_r == "T".  Requires external
    tools on PATH: sort, sed, mothur, FastTree, R, cat, rm, mv.
    """
    def _temp_name(t, f):
        # Per-sequence scratch-file name: "<id>_<suffix>"
        return t + '_' + f

    def _perform_workflow(data):
        # data is an (id, sequence) pair built below.
        tn, f = data
        # Write the single sequence out as a FASTA file for the tools below.
        outfile = open("%s.fasta" % tn, "w")
        outfile.write(">%s\n%s" % (tn,f))
        outfile.close()
        logging.debugPrint(lambda : "Processing sequence: %s" % tn)
        blast_against_reference("%s.fasta" % tn, combined, _temp_name(tn, "blast_parsed.txt"))
        # Deduplicate on column 2 so each subject appears once.
        subprocess.check_call("sort -u -k 2,2 %s > %s" % (_temp_name(tn, "blast_parsed.txt"), _temp_name(tn, "blast_unique.parsed.txt")), shell=True)
        parsed_blast_to_seqs(_temp_name(tn, "blast_unique.parsed.txt"), _temp_name(tn, "seqs_in.fas"))
        check_and_align_seqs(_temp_name(tn, "seqs_in.fas"), num_refs, _temp_name(tn, "seqs_aligned.fas"))
        # The alignment file only exists if check_and_align_seqs succeeded.
        if os.path.isfile(_temp_name(tn, "seqs_aligned.fas")):
            """What if there are NO SNPs in a given region"""
            #try:
            # mothur's filter.seqs writes the <base>.filter mask used below.
            subprocess.call(['mothur', '#filter.seqs(fasta=%s, soft=100, vertical=F)' % _temp_name(tn, "seqs_aligned.fas")], stdout=subprocess.PIPE)
            # Invert the 0/1 mask: every non-1 becomes 0, then 0 and 1 are
            # swapped through a temporary value of 2.
            subprocess.check_call('sed "s/[^1]/0/g" %s | sed "s/0/2/g" | sed "s/1/0/g" | sed "s/2/1/g" > %s' % (_temp_name(tn, "seqs_aligned.filter"), _temp_name(tn, "mask.txt")), shell=True)
            split_read(_temp_name(tn, "mask.txt"),_temp_name(tn, "padded.txt"))
            sum_qual_reads(_temp_name(tn, "padded.txt"), _temp_name(tn,"polys.txt"))
            #except:
            #    """This function was never created"""
            #    write_poly_zeros(_temp_name(tn, "padded.txt"), _temp_name(tn,"polys.txt"))
            if "T" == run_r:
                # Optional R step: per-region table plus PDF plots, moved
                # into ./R_output.
                name = get_seq_name("%s.fasta" % tn)
                subprocess.check_call("cat snps.r | R --slave --args %s %s.table %s.pdf 2> /dev/null" % (_temp_name(tn, "seqs_aligned.fas"), name, name), shell=True)
                os.system("mv %s.table ./R_output/%s.table.txt" % (name, name))
                os.system("mv %s.pdf ./R_output/%s.plots.pdf" % (name, name))
            else:
                pass
            subprocess.check_call("FastTree -nt -noboot %s > %s 2> /dev/null" % (_temp_name(tn, "seqs_aligned.fas"), _temp_name(tn, "tmp.tree")), shell=True)
            # Robinson-Foulds and euclidian distances vs. the reference tree.
            run_dendropy("%s" % (_temp_name(tn, "tmp.tree")), tree, "%s" % (_temp_name(tn, "tmp.RF")))
            run_dendropy_euclidian("%s" % (_temp_name(tn, "tmp.tree")), tree, "%s" % (_temp_name(tn, "tmp.EU")))
            get_contig_length("%s.fasta" % tn, _temp_name(tn, "length.txt"))
            # Results accumulate in per-thread files keyed by thread id and
            # are merged after the pmap completes.
            thread_id = id(threading.current_thread())
            thread_distance_file = str(thread_id) + '_distance.txt'
            parse_rf_file(_temp_name(tn, "tmp.RF"), thread_distance_file)
            thread_euclidian_file = str(thread_id) + "_euc_dist.txt"
            parse_rf_file(_temp_name(tn, "tmp.EU"), thread_euclidian_file)
            thread_name_file = str(thread_id) + '_name.txt'
            write_strip_name("%s.fasta" % tn, thread_name_file)
            polys_name_file = str(thread_id) + '_polys.txt'
            parse_poly_file(_temp_name(tn, "polys.txt"), polys_name_file)
            length_name_file = str(thread_id) + '_length.txt'
            parse_poly_file(_temp_name(tn, "length.txt"), length_name_file)
            try:
                # mothur drops log files in the cwd; best-effort cleanup.
                subprocess.check_call("rm mothur*", shell=True, stderr=open(os.devnull, 'w'))
            except:
                pass
            # Remove all per-sequence scratch files for this workflow run.
            subprocess.check_call(["rm", _temp_name(tn, "blast_parsed.txt"), "%s.fasta" % tn, _temp_name(tn, "blast_unique.parsed.txt"), _temp_name(tn, "seqs_in.fas"), _temp_name(tn, "seqs_aligned.fas"), _temp_name(tn, "tmp.tree"), _temp_name(tn, "tmp.RF"), _temp_name(tn, "tmp.EU"), _temp_name(tn, "mask.txt"), _temp_name(tn, "padded.txt"), _temp_name(tn, "polys.txt"), _temp_name(tn, "seqs_aligned.filter"), _temp_name(tn, "length.txt"), _temp_name(tn, "seqs_aligned.filter.fasta")])
            return (thread_distance_file, thread_name_file, polys_name_file, length_name_file, thread_euclidian_file)
        else:
            # No alignment produced: clean up the intermediates; the implicit
            # None return is filtered out with `if value` below.
            subprocess.check_call(["rm", _temp_name(tn, "blast_parsed.txt"), "%s.fasta" % tn, _temp_name(tn, "blast_unique.parsed.txt"), _temp_name(tn, "seqs_in.fas")])
    files_and_temp_names = [(str(idx), f) for idx, f in fasta_dict.iteritems()]
    # set() drops duplicate result tuples (several sequences on the same
    # thread produce the same per-thread filenames).
    results = set(p_func.pmap(_perform_workflow, files_and_temp_names, num_workers=parallel_workers))
    #I do this to make sure and remove any old files that are sitting around
    subprocess.call("rm distance.txt name.txt polys.txt length.txt", shell=True, stderr=open(os.devnull, 'w'))
    # Merge per-thread files into the combined outputs, five result tuples
    # at a time to keep the shell command lines short.
    for files in func.chunk(5, results):
        distances = []
        names = []
        polys = []
        lengths = []
        euc_dist = []
        for value in files:
            if value:
                distances.append(value[0])
                names.append(value[1])
                polys.append(value[2])
                lengths.append(value[3])
                euc_dist.append(value[4])
        if distances:
            subprocess.check_call("cat %s >> distance.txt" % " ".join(distances), shell=True)
            subprocess.check_call("cat %s >> name.txt" % " ".join(names), shell=True)
            subprocess.check_call("cat %s >> polys.txt" % " ".join(polys), shell=True)
            subprocess.check_call("cat %s >> length.txt" % " ".join(lengths), shell=True)
            subprocess.check_call("cat %s >> euc_dist.txt" % " ".join(euc_dist), shell=True)
            # NOTE(review): the euc_dist per-thread files are never removed
            # here, unlike the other four -- looks like a leak; confirm.
            subprocess.check_call("rm %s" % " ".join(distances), shell=True)
            subprocess.check_call("rm %s" % " ".join(names), shell=True)
            subprocess.check_call("rm %s" % " ".join(polys), shell=True)
            subprocess.check_call("rm %s" % " ".join(lengths), shell=True)
    paste_files("name.txt","distance.txt","euc_dist.txt","polys.txt","length.txt","all_distances.txt")
def tagData(tagsDir, tagName, tagBaseDir, files, recursive, expand, compress, append, overwrite, metadata=None, filterF=None):
    """
    Tag a list of files with the name.  The files can contain directories,
    and if recursive is set the contents of the directories will become part
    of the tag rather than just the name

    tagBaseDir is the name of the directory that is not part of the actual
    tag hierarchy

    expand will cause any archives listed to be expanded and the contents of
    the archive to be added

    compress will compress the files that have been put in the tag.  compress
    should be the path to the directory the compressed file should be put.

    append will add to a tagName if it already exists, only unique names will
    be kept though

    filterF - if you want to filter any of the files as they are added to the
    file list provide a filter function that will be called on each
    individual file name.  The file will be added if filter returns True

    This returns the tag that was created
    """
    if metadata is None:
        metadata = {}
    if not os.path.exists(tagsDir):
        runSystemEx('mkdir -p ' + tagsDir)
    outName = os.path.join(tagsDir, tagName)
    if os.path.exists(outName) and not append and not overwrite:
        raise Exception('Tag already exists')
    ##
    # Keep a set of all old entries in the file; when we walk the generator
    # we check whether each file already exists in here
    if append and os.path.exists(outName):
        oldFiles = set([l.strip() for l in open(outName)])
    else:
        oldFiles = set()
    # NOTE(review): `filterF and filterF(f)` is redundant inside the `or` --
    # equivalent to just `filterF(f)` at that point.
    files = [ f for f in generateFileList(files, recursive, expand) if f not in oldFiles and (not filterF or filterF and filterF(f)) ]
    if overwrite:
        ##
        # If we are just overwriting the file, no need to hold the list of
        # oldFiles.  Technically it shouldn't matter but if the old file list
        # is really large the lookup could be expensive
        outFile = open(outName, 'w')
        oldFiles = set()
    else:
        outFile = open(outName, 'a')
    outFile.write('\n'.join(files))
    outFile.write('\n')
    outFile.close()
    #
    # If we are compressing the files then load the tag back up
    # so we have all of the files there
    if compress:
        outTar = str(os.path.join(compress, tagName + '.tar'))
        outGzip = outTar + '.gz'
        # Remove any stale gzip so the fresh tar/gzip below starts clean.
        if os.path.exists(outGzip):
            os.remove(outGzip)
        runSystemEx('mkdir -p ' + compress)
        files = loadTagFile(outName)('files')
        baseDirFiles, nonBaseDirFiles = partitionFiles(files, tagBaseDir)
        # tar appends ('-rf') in chunks of 20 files so the command line
        # stays a manageable length.
        if baseDirFiles:
            for fs in func.chunk(20, baseDirFiles):
                cmd = [ 'tar', '-C', tagBaseDir, '-rf', outTar, ]
                cmd.extend([removeBase('/', f) for f in fs])
                runSystemEx(' '.join(cmd), log=True)
        if nonBaseDirFiles:
            for fs in func.chunk(20, nonBaseDirFiles):
                cmd = [ 'tar', '-C', '/', '-rf', outTar, ]
                cmd.extend([removeBase('/', f) for f in fs])
                runSystemEx(' '.join(cmd), log=True)
        #
        # It's possible we have no values here; if so, the tar was not
        # created and should be ignored
        if os.path.exists(outTar):
            runSystemEx('gzip ' + outTar, log=True)
            metadata = func.updateDict(metadata, { 'compressed': True, 'compressed_file': outGzip })
    #
    # If tagBaseDir is set it means we have some metadata to write
    if tagBaseDir:
        metadata['tag_base_dir'] = tagBaseDir
    # Merge with the existing metadata when appending to an existing tag.
    if append and os.path.exists(outName + '.metadata'):
        tmd = json.loads(open(outName + '.metadata').read())
        metadata = func.updateDict(tmd, metadata)
    outFile = open(outName + '.metadata', 'w')
    outFile.write(json.dumps(metadata, indent=1) + '\n')
    outFile.close()
    return loadTagFile(outName)
def tagData(tagsDir, tagName, tagBaseDir, files, recursive, expand, compress, append, overwrite, metadata=None, filterF=None):
    """
    Tag a list of files with the name.  The files can contain directories,
    and if recursive is set the contents of the directories become part of
    the tag rather than just the name.

    tagBaseDir - directory prefix that is not part of the actual tag
                 hierarchy
    expand     - any archives listed are expanded and their contents added
    compress   - if set, the path to the directory the compressed tag file
                 should be put in; the tagged files are tar'd and gzip'd
                 there
    append     - add to tagName if it already exists; only unique names are
                 kept
    overwrite  - replace an existing tag of the same name
    metadata   - optional dict merged into the tag's .metadata file
    filterF    - optional predicate called on each individual file name; the
                 file is added only if it returns True

    Returns the tag that was created (via loadTagFile).
    Raises Exception if the tag already exists and neither append nor
    overwrite is set.
    """
    if metadata is None:
        metadata = {}

    if not os.path.exists(tagsDir):
        runSystemEx('mkdir -p ' + tagsDir)
    outName = os.path.join(tagsDir, tagName)

    if os.path.exists(outName) and not append and not overwrite:
        raise Exception('Tag already exists')

    ##
    # Keep a set of all old entries in the file; while walking the generator
    # we check whether each file already exists in here
    if append and os.path.exists(outName):
        # `with` so the handle is closed deterministically (was left to GC)
        with open(outName) as oldFp:
            oldFiles = set([l.strip() for l in oldFp])
    else:
        oldFiles = set()

    # FIX: `not filterF or filterF and filterF(f)` simplified to the exactly
    # equivalent but clearer `not filterF or filterF(f)`
    files = [f for f in generateFileList(files, recursive, expand)
             if f not in oldFiles and (not filterF or filterF(f))]

    if overwrite:
        ##
        # If we are just overwriting the file there is no need to hold the
        # list of oldFiles.  Technically it shouldn't matter, but if the old
        # file list is really large the lookup could be expensive
        mode = 'w'
        oldFiles = set()
    else:
        mode = 'a'
    with open(outName, mode) as outFile:
        outFile.write('\n'.join(files))
        outFile.write('\n')

    #
    # If we are compressing the files then load the tag back up
    # so we have all of the files there
    if compress:
        outTar = str(os.path.join(compress, tagName + '.tar'))
        outGzip = outTar + '.gz'
        # Remove any stale gzip so the fresh tar/gzip below starts clean.
        if os.path.exists(outGzip):
            os.remove(outGzip)
        runSystemEx('mkdir -p ' + compress)
        files = loadTagFile(outName)('files')
        baseDirFiles, nonBaseDirFiles = partitionFiles(files, tagBaseDir)
        # tar appends ('-rf') in chunks of 20 files so the command line
        # stays a manageable length.
        if baseDirFiles:
            for fs in func.chunk(20, baseDirFiles):
                cmd = ['tar', '-C', tagBaseDir, '-rf', outTar]
                cmd.extend([removeBase('/', f) for f in fs])
                runSystemEx(' '.join(cmd), log=True)
        if nonBaseDirFiles:
            for fs in func.chunk(20, nonBaseDirFiles):
                cmd = ['tar', '-C', '/', '-rf', outTar]
                cmd.extend([removeBase('/', f) for f in fs])
                runSystemEx(' '.join(cmd), log=True)
        #
        # It's possible we have no values here; if so, the tar was not
        # created and should be ignored
        if os.path.exists(outTar):
            runSystemEx('gzip ' + outTar, log=True)
            metadata = func.updateDict(metadata,
                                       {'compressed': True,
                                        'compressed_file': outGzip})

    #
    # If tagBaseDir is set it means we have some metadata to write
    if tagBaseDir:
        metadata['tag_base_dir'] = tagBaseDir

    # Merge with the existing metadata when appending to an existing tag.
    if append and os.path.exists(outName + '.metadata'):
        with open(outName + '.metadata') as mdFp:
            tmd = json.loads(mdFp.read())
        metadata = func.updateDict(tmd, metadata)

    with open(outName + '.metadata', 'w') as outFile:
        outFile.write(json.dumps(metadata, indent=1) + '\n')

    return loadTagFile(outName)