Example #1
0
def runInstances(credClient,
                 ami,
                 key,
                 iType,
                 groups,
                 availZone,
                 bidPrice,
                 minInstances,
                 maxInstances,
                 userData):
    """
    Start between minInstances and maxInstances instances through
    credClient, retrying up to RUN_INSTANCE_TRIES times with a 30 second
    sleep between attempts.

    Spot instances are requested when bidPrice is truthy, otherwise
    on-demand instances.  On overall failure every instance that was
    started is terminated (in chunks of 5) so none are leaked, and the
    error is re-raised to the caller.

    This is a Twisted inlineCallbacks-style generator; callers receive a
    Deferred.
    """
    def _runInstances(num):
        # Dispatch to the spot or on-demand API depending on bidPrice.
        if bidPrice:
            return credClient.runSpotInstances(bidPrice=bidPrice,
                                               ami=ami,
                                               key=key,
                                               instanceType=iType,
                                               groups=groups,
                                               availabilityZone=availZone,
                                               numInstances=num,
                                               userData=userData)
        else:
            return credClient.runInstances(ami=ami,
                                           key=key,
                                           instanceType=iType,
                                           groups=groups,
                                           availabilityZone=availZone,
                                           numInstances=num,
                                           userData=userData)

    instances = []

    @defer.inlineCallbacks
    def _startInstances():
        # Request only the instances still missing; successes accumulate
        # in the enclosing `instances` list across retries.
        startedInstances = yield _runInstances(maxInstances - len(instances))
        instances.extend(startedInstances)
        if len(instances) < minInstances:
            # Fixed message arguments: report the minimum required vs the
            # total started so far (the original compared the remaining
            # shortfall against only the last batch).
            raise InstanceStartError('Wanted %d instances got %d' %
                                     (minInstances, len(instances)))

    try:
        yield defer_utils.tryUntil(RUN_INSTANCE_TRIES,
                                   _startInstances,
                                   onFailure=defer_utils.sleep(30))
    except Exception as err:
        ## If we got an exception then terminate any instances
        ## that were started and reraise exception.
        ## The last thing we want is to leak instances
        ##
        ## This is not completely safe!  We should probably
        ## raise an exception with the started instances in it
        ## and let the caller decide what to do with them
        log.err('Error starting instances')
        log.err(err)
        # Wait for the terminations to complete so instances are not
        # leaked if the process exits immediately afterwards.
        yield defer_utils.mapSerial(lambda iChunk :
                                        credClient.terminateInstances(iChunk),
                                    func.chunk(5, instances))
        # Re-raise so the caller sees the failure -- the comment above
        # promised this but the original code swallowed the error.
        raise
Example #2
0
def terminateCluster(credClient, persistManager, clusterName, userName):
    """
    Terminate every instance belonging to the named cluster and mark it
    TERMINATED.  Exec and data nodes are terminated first, in batches of
    five, then the master (if any).  The Deferred fires with the updated
    cluster object.
    """
    cluster = yield persistManager.loadCluster(clusterName, userName)

    # Batch worker-node terminations; the identity lambda around
    # terminateInstances was redundant, so pass the method directly.
    workerNodes = cluster.execNodes + cluster.dataNodes
    yield defer_utils.mapSerial(credClient.terminateInstances,
                                func.chunk(5, workerNodes))

    if cluster.master:
        yield credClient.terminateInstances([cluster.master])

    defer.returnValue(cluster.setState(cluster.TERMINATED))
Example #3
0
def terminateCluster(credClient, persistManager, clusterName, userName):
    """
    Shut down all nodes of a user's cluster and return it (via Deferred)
    in the TERMINATED state.  Worker nodes go down five at a time before
    the master node is terminated.
    """
    cluster = yield persistManager.loadCluster(clusterName, userName)

    nodes = cluster.execNodes + cluster.dataNodes
    terminate = lambda batch: credClient.terminateInstances(batch)
    yield defer_utils.mapSerial(terminate, func.chunk(5, nodes))

    if cluster.master:
        yield credClient.terminateInstances([cluster.master])

    defer.returnValue(cluster.setState(cluster.TERMINATED))
Example #4
0
def runInstances(credClient, ami, key, iType, groups, availZone, bidPrice,
                 minInstances, maxInstances, userData):
    """
    Start between minInstances and maxInstances instances, retrying up
    to RUN_INSTANCE_TRIES times (sleeping 30s between attempts).  Spot
    instances are used when bidPrice is set, otherwise on-demand.  On
    failure all started instances are terminated in chunks of 5 and the
    error is re-raised.

    Twisted inlineCallbacks-style generator; callers receive a Deferred.
    """
    def _runInstances(num):
        # Choose the spot or on-demand API based on bidPrice.
        if bidPrice:
            return credClient.runSpotInstances(bidPrice=bidPrice,
                                               ami=ami,
                                               key=key,
                                               instanceType=iType,
                                               groups=groups,
                                               availabilityZone=availZone,
                                               numInstances=num,
                                               userData=userData)
        else:
            return credClient.runInstances(ami=ami,
                                           key=key,
                                           instanceType=iType,
                                           groups=groups,
                                           availabilityZone=availZone,
                                           numInstances=num,
                                           userData=userData)

    instances = []

    @defer.inlineCallbacks
    def _startInstances():
        # Ask only for the instances we are still missing; successes
        # accumulate in `instances` across retries.
        startedInstances = yield _runInstances(maxInstances - len(instances))
        instances.extend(startedInstances)
        if len(instances) < minInstances:
            # Fixed message arguments: minimum required vs total started
            # (previously compared unrelated quantities).
            raise InstanceStartError(
                'Wanted %d instances got %d' %
                (minInstances, len(instances)))

    try:
        yield defer_utils.tryUntil(RUN_INSTANCE_TRIES,
                                   _startInstances,
                                   onFailure=defer_utils.sleep(30))
    except Exception as err:
        ## If we got an exception then terminate any instances
        ## that were started and reraise exception.
        ## The last thing we want is to leak instances
        ##
        ## This is not completely safe!  We should probably
        ## raise an exception with the started instances in it
        ## and let the caller decide what to do with them
        log.err('Error starting instances')
        log.err(err)
        # Wait for the terminations so instances are not leaked, then
        # re-raise as the comment above promises (the original swallowed
        # the error).
        yield defer_utils.mapSerial(
            lambda iChunk: credClient.terminateInstances(iChunk),
            func.chunk(5, instances))
        raise
def terminateInstancesByAttribute(persistManager, credClient, clusterName,
                                  userName, byAttribute, attributeValues):
    """
    Terminate every exec/data node of the cluster whose `byAttribute`
    entry is one of `attributeValues`, then return (via Deferred) the
    cluster with those nodes removed.
    """
    cluster = yield persistManager.loadCluster(clusterName, userName)

    # Collect the nodes that match the attribute filter.
    matched = []
    for node in cluster.execNodes + cluster.dataNodes:
        if node[byAttribute] in attributeValues:
            matched.append(node)

    # Terminate in batches of five.
    yield defer_utils.mapSerial(credClient.terminateInstances,
                                func.chunk(5, matched))

    defer.returnValue(
        cluster.removeExecNodes(matched).removeDataNodes(matched))
def terminateInstancesByAttribute(persistManager,
                                  credClient,
                                  clusterName,
                                  userName,
                                  byAttribute,
                                  attributeValues):
    """
    Kill all exec/data nodes whose byAttribute value appears in
    attributeValues; the Deferred fires with the cluster minus those
    nodes.
    """
    cluster = yield persistManager.loadCluster(clusterName, userName)

    allNodes = cluster.execNodes + cluster.dataNodes
    targets = [node for node in allNodes
               if node[byAttribute] in attributeValues]

    # Five instances per termination call.
    yield defer_utils.mapSerial(credClient.terminateInstances,
                                func.chunk(5, targets))

    trimmed = cluster.removeExecNodes(targets).removeDataNodes(targets)
    defer.returnValue(trimmed)
Example #7
0
def removeTerminatedCluster(persistManager, credClient, clusterName, userName):
    """
    After REMOVE_CLUSTER_TIMEOUT, remove a TERMINATED cluster from the
    persistence layer -- but first force-terminate any of its instances
    the cloud still reports as alive.  Clusters in any other state are
    left untouched.
    """
    yield defer_utils.sleep(REMOVE_CLUSTER_TIMEOUT)()
    cluster = yield persistManager.loadCluster(clusterName, userName)

    if cluster.state == cluster.TERMINATED:
        # Re-query the cloud: a state of TERMINATED in our records does
        # not guarantee the instances are really gone.
        allNodes = [cluster.master] + cluster.execNodes + cluster.dataNodes
        allNodes = yield credClient.updateInstances(allNodes)

        stillAlive = [n for n in allNodes if n['state'] != 'terminated']

        if stillAlive:
            yield defer_utils.mapSerial(credClient.terminateInstances,
                                        func.chunk(5, stillAlive))

        yield persistManager.removeCluster(clusterName, userName)
Example #8
0
def removeTerminatedCluster(persistManager, credClient, clusterName, userName):
    """
    Wait REMOVE_CLUSTER_TIMEOUT, then delete the cluster record -- only
    when its state is TERMINATED, and only after any instances the cloud
    still reports as running have been terminated.
    """
    yield defer_utils.sleep(REMOVE_CLUSTER_TIMEOUT)()
    cluster = yield persistManager.loadCluster(clusterName, userName)

    if cluster.state == cluster.TERMINATED:
        # Another check to make sure the instances have really died.
        nodes = [cluster.master] + cluster.execNodes + cluster.dataNodes
        nodes = yield credClient.updateInstances(nodes)

        leftovers = [n for n in nodes if n["state"] != "terminated"]

        if leftovers:
            yield defer_utils.mapSerial(credClient.terminateInstances,
                                        func.chunk(5, leftovers))

        yield persistManager.removeCluster(clusterName, userName)
Example #9
0
def tree_loop(fasta_dict, combined, tree, parallel_workers, run_r, num_refs):
    """
    For each sequence in fasta_dict: BLAST it against `combined`, align
    the unique hits, build a FastTree tree and compare it to the
    reference `tree` (Robinson-Foulds and Euclidean distances via
    dendropy), optionally generating R plots when run_r == "T".  Work is
    fanned out over `parallel_workers` workers; per-thread result files
    are concatenated into distance.txt / name.txt / polys.txt /
    length.txt / euc_dist.txt and finally pasted into all_distances.txt.

    NOTE(review): relies on external tools (blast helpers, sort, mothur,
    sed, R, FastTree) being on PATH -- confirm before running.  Python 2
    only (`dict.iteritems`).
    """
    def _temp_name(t, f):
        # Prefix scratch-file names with the sequence id so parallel
        # workers do not clobber each other's files.
        return t + '_' + f

    def _perform_workflow(data):
        # data is an (id, sequence) pair produced from fasta_dict below.
        tn, f = data
        # Write the single query sequence out as its own FASTA file.
        outfile = open("%s.fasta" % tn, "w")
        outfile.write(">%s\n%s" % (tn,f))
        outfile.close()
        logging.debugPrint(lambda : "Processing sequence: %s" % tn)
        blast_against_reference("%s.fasta" % tn, combined, _temp_name(tn, "blast_parsed.txt"))
        # Deduplicate BLAST hits on the subject column (field 2).
        subprocess.check_call("sort -u -k 2,2 %s > %s" % (_temp_name(tn, "blast_parsed.txt"),
                                                          _temp_name(tn, "blast_unique.parsed.txt")),
                              shell=True)
        parsed_blast_to_seqs(_temp_name(tn, "blast_unique.parsed.txt"), _temp_name(tn, "seqs_in.fas"))
        check_and_align_seqs(_temp_name(tn, "seqs_in.fas"), num_refs, _temp_name(tn, "seqs_aligned.fas"))
        # The alignment file only exists if enough sequences aligned.
        if os.path.isfile(_temp_name(tn, "seqs_aligned.fas")):
            """What if there are NO SNPs in a given region"""
            #try:
            # mothur soft-filters the alignment; its 0/1 output is then
            # bit-flipped by the sed pipeline to build a mask file.
            subprocess.call(['mothur',
                                   '#filter.seqs(fasta=%s, soft=100, vertical=F)' % _temp_name(tn, "seqs_aligned.fas")], stdout=subprocess.PIPE)
            subprocess.check_call('sed "s/[^1]/0/g" %s | sed "s/0/2/g" | sed "s/1/0/g" | sed "s/2/1/g" > %s' % (_temp_name(tn, "seqs_aligned.filter"),
                                                                                                                _temp_name(tn, "mask.txt")), shell=True)
            split_read(_temp_name(tn, "mask.txt"),_temp_name(tn, "padded.txt"))
            sum_qual_reads(_temp_name(tn, "padded.txt"), _temp_name(tn,"polys.txt"))
            #except:
            #    """This function was never created"""
            #    write_poly_zeros(_temp_name(tn, "padded.txt"), _temp_name(tn,"polys.txt"))
            # Optional R step: per-region table + PDF plot moved into
            # ./R_output (run_r is the string "T", not a boolean).
            if "T" == run_r:
                name = get_seq_name("%s.fasta" % tn)
                subprocess.check_call("cat snps.r | R --slave --args %s %s.table %s.pdf 2> /dev/null" % (_temp_name(tn, "seqs_aligned.fas"), name, name),
                                      shell=True)
                os.system("mv %s.table ./R_output/%s.table.txt" % (name, name))
                os.system("mv %s.pdf ./R_output/%s.plots.pdf" % (name, name))
            else:
                pass
            # Build the region tree and compute both distance metrics
            # against the reference tree.
            subprocess.check_call("FastTree -nt -noboot %s > %s 2> /dev/null" % (_temp_name(tn, "seqs_aligned.fas"),
                                                                                 _temp_name(tn, "tmp.tree")),
                                  shell=True)
            run_dendropy("%s" % (_temp_name(tn, "tmp.tree")), tree, "%s" % (_temp_name(tn, "tmp.RF")))
            run_dendropy_euclidian("%s" % (_temp_name(tn, "tmp.tree")), tree, "%s" % (_temp_name(tn, "tmp.EU")))
            get_contig_length("%s.fasta" % tn, _temp_name(tn, "length.txt"))
            # Per-thread output files keyed on the thread id; all regions
            # handled by the same thread append to the same files.
            thread_id = id(threading.current_thread())
            thread_distance_file = str(thread_id) + '_distance.txt'
            parse_rf_file(_temp_name(tn, "tmp.RF"), thread_distance_file)
            thread_euclidian_file = str(thread_id) + "_euc_dist.txt"
            parse_rf_file(_temp_name(tn, "tmp.EU"), thread_euclidian_file)
            thread_name_file = str(thread_id) + '_name.txt'
            write_strip_name("%s.fasta" % tn, thread_name_file)
            polys_name_file = str(thread_id) + '_polys.txt'
            parse_poly_file(_temp_name(tn, "polys.txt"), polys_name_file)
            length_name_file = str(thread_id) + '_length.txt'
            parse_poly_file(_temp_name(tn, "length.txt"), length_name_file)
            # mothur litters the cwd with log files; best-effort cleanup.
            try:
                subprocess.check_call("rm mothur*", shell=True, stderr=open(os.devnull, 'w'))
            except:
                pass
            # Remove all per-region scratch files.
            subprocess.check_call(["rm",
                                   _temp_name(tn, "blast_parsed.txt"),
                                   "%s.fasta" % tn,
                                   _temp_name(tn, "blast_unique.parsed.txt"),
                                   _temp_name(tn, "seqs_in.fas"),
                                   _temp_name(tn, "seqs_aligned.fas"),
                                   _temp_name(tn, "tmp.tree"),
                                   _temp_name(tn, "tmp.RF"),
                                   _temp_name(tn, "tmp.EU"),
                                   _temp_name(tn, "mask.txt"),
                                   _temp_name(tn, "padded.txt"),
                                   _temp_name(tn, "polys.txt"),
                                   _temp_name(tn, "seqs_aligned.filter"),
                                   _temp_name(tn, "length.txt"),
                                   _temp_name(tn, "seqs_aligned.filter.fasta")])
            return (thread_distance_file, thread_name_file, polys_name_file, length_name_file,
                    thread_euclidian_file)
        else:
            # No alignment produced: clean up and return None (the caller
            # filters falsy results below).
            subprocess.check_call(["rm",
                                   _temp_name(tn, "blast_parsed.txt"),
                                   "%s.fasta" % tn,
                                   _temp_name(tn, "blast_unique.parsed.txt"),
                                   _temp_name(tn, "seqs_in.fas")])

    files_and_temp_names = [(str(idx), f)
                             for idx, f in fasta_dict.iteritems()]
    # set() deduplicates: all regions processed by one thread share the
    # same per-thread file tuple.
    results = set(p_func.pmap(_perform_workflow,
                              files_and_temp_names,
                              num_workers=parallel_workers))

    #I do this to make sure and remove any old files that are setting around
    subprocess.call("rm distance.txt name.txt polys.txt length.txt", shell=True, stderr=open(os.devnull, 'w'))

    # Merge the per-thread files (five at a time) into the aggregate
    # outputs, then delete the per-thread inputs.
    for files in func.chunk(5, results):
        distances = []
        names = []
        polys = []
        lengths = []
        euc_dist = []
        for value in files:
            if value:
                distances.append(value[0])
                names.append(value[1])
                polys.append(value[2])
                lengths.append(value[3])
                euc_dist.append(value[4])
        if distances:
            subprocess.check_call("cat %s >> distance.txt" % " ".join(distances), shell=True)
            subprocess.check_call("cat %s >> name.txt" % " ".join(names), shell=True)
            subprocess.check_call("cat %s >> polys.txt" % " ".join(polys), shell=True)
            subprocess.check_call("cat %s >> length.txt" % " ".join(lengths), shell=True)
            subprocess.check_call("cat %s >> euc_dist.txt" % " ".join(euc_dist), shell=True)
            subprocess.check_call("rm %s" % " ".join(distances), shell=True)
            subprocess.check_call("rm %s" % " ".join(names), shell=True)
            subprocess.check_call("rm %s" % " ".join(polys), shell=True)
            subprocess.check_call("rm %s" % " ".join(lengths), shell=True)
    # NOTE(review): the _euc_dist.txt per-thread files are never removed
    # above -- looks like an oversight, confirm before cleaning up.
    paste_files("name.txt","distance.txt","euc_dist.txt","polys.txt","length.txt","all_distances.txt")
Example #10
0
def tagData(tagsDir,
            tagName,
            tagBaseDir,
            files,
            recursive,
            expand,
            compress,
            append,
            overwrite,
            metadata=None,
            filterF=None):
    """
    Tag a list of files with the name.  The files can contain directories, and if recursive
    is set the contents of the directories will become part of the tag rather than just the name

    tagBaseDir is the name of the directory that is not part of the actual tag hierarchy

    expand will cause any archives listed to be expanded and the contents of the archive to be added

    compress will compress the files that have been put in the tag.  compress should be the path to the
    directory the compressed file should be put.

    append will add to a tagName if it already exists, only unique names will be kept though

    filterF - if you want to filter any of the files as they are added to the file list provide a filter
    function that will be called on each individual file name.  The file will be added if filter returns True

    This returns the tag that was created
    """

    if metadata is None:
        metadata = {}

    if not os.path.exists(tagsDir):
        runSystemEx('mkdir -p ' + tagsDir)

    outName = os.path.join(tagsDir, tagName)
    if os.path.exists(outName) and not append and not overwrite:
        raise Exception('Tag already exists')

    ##
    # Keep a set of all old entries in the file; when we walk the generator
    # we'll check whether the file already exists in here.  `with` closes
    # the handle even if reading raises (the original leaked it).
    if append and os.path.exists(outName):
        with open(outName) as tagFile:
            oldFiles = set(line.strip() for line in tagFile)
    else:
        oldFiles = set()

    # `filterF and filterF(f)` was redundant given the preceding
    # `not filterF` -- simplified to a plain call.
    files = [
        f for f in generateFileList(files, recursive, expand)
        if f not in oldFiles and (not filterF or filterF(f))
    ]

    if overwrite:
        ##
        # If we are just overwriting the file, no need to hold the list of
        # oldFiles.  Technically it shouldn't matter but if the old file
        # list is really large the lookup could be expensive.
        oldFiles = set()

    # 'w' truncates when overwriting, otherwise append; the context
    # manager guarantees the handle is closed even if a write fails.
    with open(outName, 'w' if overwrite else 'a') as outFile:
        outFile.write('\n'.join(files))
        outFile.write('\n')

    #
    # If we are compressing the files then, load the tag back up
    # so we have all of the files there
    if compress:
        outTar = str(os.path.join(compress, tagName + '.tar'))
        outGzip = outTar + '.gz'
        if os.path.exists(outGzip):
            os.remove(outGzip)
        runSystemEx('mkdir -p ' + compress)
        files = loadTagFile(outName)('files')
        baseDirFiles, nonBaseDirFiles = partitionFiles(files, tagBaseDir)
        if baseDirFiles:
            # Append base-dir-relative paths to the tar, 20 files per
            # invocation to keep the command line short.
            for fs in func.chunk(20, baseDirFiles):
                cmd = [
                    'tar',
                    '-C',
                    tagBaseDir,
                    '-rf',
                    outTar,
                ]
                cmd.extend([removeBase('/', f) for f in fs])
                runSystemEx(' '.join(cmd), log=True)

        if nonBaseDirFiles:
            # Remaining files are added relative to the filesystem root.
            for fs in func.chunk(20, nonBaseDirFiles):
                cmd = [
                    'tar',
                    '-C',
                    '/',
                    '-rf',
                    outTar,
                ]
                cmd.extend([removeBase('/', f) for f in fs])
                runSystemEx(' '.join(cmd), log=True)

        #
        # It's possible we have no values here, if so, the tar was not created
        # and should be ignored
        if os.path.exists(outTar):
            runSystemEx('gzip ' + outTar, log=True)
            metadata = func.updateDict(metadata, {
                'compressed': True,
                'compressed_file': outGzip
            })

    #
    # If tagBaseDir is set it means we have some metadata to write
    if tagBaseDir:
        metadata['tag_base_dir'] = tagBaseDir

    if append and os.path.exists(outName + '.metadata'):
        # Merge pre-existing metadata; context manager closes the handle
        # the original left dangling.
        with open(outName + '.metadata') as mdFile:
            metadata = func.updateDict(json.loads(mdFile.read()), metadata)

    with open(outName + '.metadata', 'w') as mdOut:
        mdOut.write(json.dumps(metadata, indent=1) + '\n')

    return loadTagFile(outName)
Example #11
0
def tagData(tagsDir, tagName, tagBaseDir, files, recursive, expand, compress, append, overwrite, metadata=None, filterF=None):
    """
    Tag a list of files with the name.  The files can contain directories, and if recursive
    is set the contents of the directories will become part of the tag rather than just the name

    tagBaseDir is the name of the directory that is not part of the actual tag hierarchy

    expand will cause any archives listed to be expanded and the contents of the archive to be added

    compress will compress the files that have been put in the tag.  compress should be the path to the
    directory the compressed file should be put.

    append will add to a tagName if it already exists, only unique names will be kept though

    filterF - if you want to filter any of the files as they are added to the file list provide a filter
    function that will be called on each individual file name.  The file will be added if filter returns True

    This returns the tag that was created
    """

    if metadata is None:
        metadata = {}

    if not os.path.exists(tagsDir):
        runSystemEx('mkdir -p ' + tagsDir)

    outName = os.path.join(tagsDir, tagName)
    if os.path.exists(outName) and not append and not overwrite:
        raise Exception('Tag already exists')

    ##
    # Keep a set of all old entries in the file; when we walk the generator
    # we'll check whether the file already exists in here.  The handle is
    # closed via `with` (the original leaked it in the comprehension).
    if append and os.path.exists(outName):
        with open(outName) as tagFile:
            oldFiles = set(line.strip() for line in tagFile)
    else:
        oldFiles = set()

    # `filterF and filterF(f)` was redundant after `not filterF`.
    files = [f
             for f in generateFileList(files, recursive, expand)
             if f not in oldFiles and (not filterF or filterF(f))]

    if overwrite:
        ##
        # If we are just overwriting the file there is no need to keep the
        # oldFiles set around; drop it so a very large old list is freed.
        oldFiles = set()

    # Truncate on overwrite, otherwise append; `with` guarantees close.
    with open(outName, 'w' if overwrite else 'a') as outFile:
        outFile.write('\n'.join(files))
        outFile.write('\n')

    #
    # If we are compressing the files then, load the tag back up
    # so we have all of the files there
    if compress:
        outTar = str(os.path.join(compress, tagName + '.tar'))
        outGzip = outTar + '.gz'
        if os.path.exists(outGzip):
            os.remove(outGzip)
        runSystemEx('mkdir -p ' + compress)
        files = loadTagFile(outName)('files')
        baseDirFiles, nonBaseDirFiles = partitionFiles(files, tagBaseDir)
        if baseDirFiles:
            # Append in batches of 20 to keep command lines short.
            for fs in func.chunk(20, baseDirFiles):
                cmd = ['tar',
                       '-C', tagBaseDir,
                       '-rf', outTar,
                       ]
                cmd.extend([removeBase('/', f) for f in fs])
                runSystemEx(' '.join(cmd), log=True)

        if nonBaseDirFiles:
            # Files outside tagBaseDir are archived relative to /.
            for fs in func.chunk(20, nonBaseDirFiles):
                cmd = ['tar',
                       '-C', '/',
                       '-rf', outTar,
                       ]
                cmd.extend([removeBase('/', f) for f in fs])
                runSystemEx(' '.join(cmd), log=True)

        #
        # It's possible we have no values here, if so, the tar was not created
        # and should be ignored
        if os.path.exists(outTar):
            runSystemEx('gzip ' + outTar, log=True)
            metadata = func.updateDict(metadata, {'compressed': True,
                                                  'compressed_file': outGzip})

    #
    # If tagBaseDir is set it means we have some metadata to write
    if tagBaseDir:
        metadata['tag_base_dir'] = tagBaseDir

    if append and os.path.exists(outName + '.metadata'):
        # Merge with any pre-existing metadata; handle closed via `with`.
        with open(outName + '.metadata') as mdFile:
            metadata = func.updateDict(json.loads(mdFile.read()), metadata)

    with open(outName + '.metadata', 'w') as mdOut:
        mdOut.write(json.dumps(metadata, indent=1) + '\n')

    return loadTagFile(outName)