Example #1
0
    def __init__(self, cmd, cmd_def=None, runner=None, runner_conf=None,
                 stdout=None, stderr=None, stdin=None, splits=None):
        '''It prepares the instance, splits the job and launches the subjobs.

        The interface mimics subprocess.Popen: stdin, stdout and stderr are
        accepted, but here all of them should be files, PIPE will not work.

        The cmd_def list tells this Popen how to locate the input and output
        files in the cmd and how to split and join them. Look for the
        cmd_format in the streams.py file.

        keyword arguments:
        cmd -- a list with the cmd to parallelize
        cmd_def -- the cmd definition list (default []). A cmd_def found in
                   the cmd itself takes precedence over this argument
        runner -- which runner to use  (default subprocess.Popen)
        runner_conf -- extra parameters for the runner, handed as they are to
                       the job launcher
        stdout -- a fhand to store the stdout (default None)
        stderr -- a fhand to store the stderr (default None)
        stdin -- a fhand with the stdin (default None)
        splits -- number of subjobs to generate, when not given it is asked
                  to default_splits

        It raises a ValueError if there is an stdin but no cmd_def at all.
        '''
        #the long argument list is on purpose, it mirrors subprocess.Popen
        #pylint: disable-msg=R0913
        self._retcode = None
        self._outputs_collected = False

        #without an explicit runner we fall back to the standard Popen
        if runner is None:
            runner = StdPopen

        #the cmd can carry its own cmd_def, if it does that one wins over the
        #one given as argument, with neither of them we use an empty list
        cmd, embedded_cmd_def = get_cmd_def_from_cmd(cmd)
        cmd_def = embedded_cmd_def or cmd_def or []

        if not cmd_def and stdin is not None:
            raise ValueError('No cmd_def given but stdin present')

        #no number of splits requested, so we calculate it for this runner
        if splits is None:
            splits = self.default_splits(runner)

        #the temporary split files will be created inside this work dir
        work_dir = NamedTemporaryDir()
        self._work_dir = work_dir
        copy_file_mode('.', work_dir.name)

        #the main job
        self._job = {'cmd': cmd, 'work_dir': work_dir}

        #the subjobs, first we create them and after that we launch them
        self._jobs = self._split_jobs(cmd, cmd_def, splits, work_dir,
                                      stdout=stdout, stderr=stderr,
                                      stdin=stdin)
        self._launch_jobs(self._jobs, runner=runner, runner_conf=runner_conf)
Example #2
0
    def _split_streams(streams, splits, work_dir):
        '''Given a list of streams it splits every stream in the given number of
        splits.

        One temporary dir per split is created (work_dir is handed to
        NamedTemporaryDir as its dir argument). The input streams are done
        first, and among them the ones that really have a file to split go
        first, because only the first input stream processed is allowed to
        change the number of splits (when its splitter returns a different
        number of files than requested). For the output streams only the new
        file names are created, no file is split.

        arguments:
        streams -- a list of stream dicts. The keys read here are 'io' ('in'
                   or 'out'), 'fhand', 'fname', 'splitter' and 'special'
        splits -- the number of splits requested
        work_dir -- where the per split work dirs are created

        It returns a tuple: a list with one list of streams for every split
        (copies of the original streams with the 'fhand' or 'fname' replaced
        by the split one) and the list of work dirs finally used.

        It raises a ValueError if an input stream without the no_split special
        has no splitter, and a RuntimeError if an input stream other than the
        first one processed is divided in a different number of splits.
        '''
        #which are the input and output streams?
        input_stream_indexes = []
        output_stream_indexes = []
        for index, stream in enumerate(streams):
            if stream['io'] == 'in':
                input_stream_indexes.append(index)
            elif stream['io'] == 'out':
                output_stream_indexes.append(index)

        #we create one work dir for every split
        work_dirs = []
        for _ in range(splits):
            dir_ = NamedTemporaryDir(dir=work_dir)
            work_dirs.append(dir_)
            copy_file_mode('.', dir_.name)

        #we have to do first the input files because the number of splits could
        #be changed by them
        #we split the input stream files into several splits
        #we have to sort the input_stream_indexes, first we should take the ones
        #that have an input file to be split
        def do_we_have_to_split(stream_index):
            'If the stream has to split a file it will return True'
            stream = streams[stream_index]
            #maybe they shouldn't be split, this check has to win over the
            #file checks done below
            if 'special' in stream and 'no_split' in stream['special']:
                return False
            #maybe there is no file to split
            has_fhand = 'fhand' in stream
            has_fname = 'fname' in stream
            if not has_fhand and not has_fname:
                return False
            if has_fhand and stream['fhand'] is None:
                return False
            if has_fname and stream['fname'] is None:
                return False
            return True

        def split_goes_first(stream_index):
            '''Sort key, the streams to be split get the lowest value.

            False sorts before True, so the answer is negated.
            '''
            return not do_we_have_to_split(stream_index)
        #sorted is stable, so inside each group the streams keep their
        #original relative order
        input_stream_indexes = sorted(input_stream_indexes,
                                      key=split_goes_first)

        first = True
        split_files = {}
        for index in input_stream_indexes:
            stream = streams[index]
            #splitter
            if 'special' in stream and 'no_split' in stream['special']:
                splitter = create_non_splitter_splitter(copy_files=True)
            elif 'splitter' not in stream:
                msg = 'An splitter should be provided for every input stream'
                msg += ' missing for: ' + str(stream)
                raise ValueError(msg)
            else:
                splitter = stream['splitter']
            #if the splitter is a function we assume that it will know how to
            #split the given file, otherwise should be a registered type of
            #splitter or a regular expression
            if not callable(splitter):
                splitter = get_splitter(splitter)
            #we split the input files in the splits, every file will be in one
            #of the given work_dirs
            #the stream can have fname or fhands
            if 'fhand' in stream:
                file_ = stream['fhand']
            elif 'fname' in stream:
                file_ = stream['fname']
            else:
                file_ = None
            if file_ is None:
                #the stream might have no file associated
                files = [None] * len(work_dirs)
            else:
                files = splitter(file_, work_dirs)
            #the files len can be different than splits, in that case we modify
            #the splits or we raise an error
            if len(files) != splits:
                if first:
                    splits = len(files)
                    #we discard the empty temporary dirs
                    work_dirs = work_dirs[0:splits]
                else:
                    msg = 'Not all input files were divided in the same number'
                    msg += ' of splits'
                    raise RuntimeError(msg)
            first = False
            split_files[index] = files   #a list of files for every in stream

        #we split the output stream files into several splits
        output_splitter = create_non_splitter_splitter(copy_files=False)
        for index in output_stream_indexes:
            stream = streams[index]
            #for the output we just create the new names, but we don't split
            #any file
            if 'fhand' in stream:
                fname = stream['fhand']
            else:
                fname = stream['fname']
            files = output_splitter(fname, work_dirs)
            split_files[index] = files   #a list of files for every out stream

        new_streamss = []
        #we need one new stream for every split
        for split_index in range(splits):
            #the streams for one job
            new_streams = []
            for stream_index, stream in enumerate(streams):
                #we duplicate the original stream
                new_stream = stream.copy()
                #we set the new files
                if 'fhand' in stream:
                    new_stream['fhand'] = split_files[stream_index][split_index]
                else:
                    new_stream['fname'] = split_files[stream_index][split_index]
                new_streams.append(new_stream)
            new_streamss.append(new_streams)
        return new_streamss, work_dirs
Example #3
0
    def splitter(file_, work_dirs):
        '''It splits the given file into several splits.

        Every split will be located in one of the work_dirs, although it is not
        guaranteed to create as many splits as work dirs. If in the file there
        are less items than work_dirs some work_dirs will be left empty.
        file_ can be an fhand or an fname.

        The behaviour is tuned by the names taken from the enclosing scope:
        header_extractor and footer_extractor (when not None they are called
        with the open input file and a temporary file, and whatever ends up in
        that temporary file is written at the start or at the end of every
        split), preprocesor (when not None its output replaces the input file
        for the counting and the splitting), postprocesor (when not None every
        split is passed through it into a new file), and item_counter and
        item_splitter, both called with the open file and expression.

        It returns a list with one entry per split created: the file paths if
        file_ was a str, otherwise the file objects, that are returned already
        closed.
        '''
        #the file_ can be an fname or an fhand. which one is it?
        file_is_str = isinstance(file_, str)
        if file_is_str:
            fname = file_
        else:
            fname = file_.name

        # do we have header?
        header_fhand = None
        if header_extractor is not None:
            header_fhand = NamedTemporaryFile()
            with open(fname) as fhand:
                header_extractor(fhand, header_fhand)

        # do we have footer?
        footer_fhand = None
        if footer_extractor is not None:
            footer_fhand = NamedTemporaryFile()
            with open(fname) as fhand:
                #the footer goes to its own temporary file, not to the header
                #one, otherwise it would never be written at the end
                footer_extractor(fhand, footer_fhand)

        # File preprocess
        preprocessed_fhand = None
        if preprocesor is not None:
            suffix = os.path.splitext(fname)[-1]
            preprocessed_fhand = NamedTemporaryFile(suffix=suffix)
            with open(fname) as fhand:
                preprocesor(fhand, preprocessed_fhand)
            #the preprocessed file is reopened by name below, so its content
            #should be in the disk by then
            preprocessed_fhand.flush()
            fname = preprocessed_fhand.name

        #how many splits do we want?
        nsplits = len(work_dirs)
        #how many items are in the file? We assume that all files have the same
        #number of items
        with open(fname, 'r') as fhand:
            nitems = item_counter(fhand, expression)

        #how many splits are we going to create? and how many items will be in
        #every split
        #if there are more splits than items we create as many splits as items
        if nsplits > nitems:
            nsplits = nitems
        (nsplits1, nitems1), (nsplits2, nitems2) = _calculate_divisions(nitems,
                                                                       nsplits)
        #we have to create nsplits1 files with nitems1 in it and nsplits2 files
        #with nitems2 items in it
        suffix = os.path.splitext(fname)[-1]
        new_files = []
        splits_made = 0
        fhand = open(fname, 'r')
        try:
            items = item_splitter(fhand, expression)
            for group_splits, group_items in ((nsplits1, nitems1),
                                              (nsplits2, nitems2)):
                #we have to create group_splits files with group_items in it
                for _ in range(group_splits):
                    work_dir = work_dirs[splits_made]
                    ofh = NamedTemporaryFile(dir=work_dir.name, delete=False,
                                             suffix=suffix)
                    copy_file_mode(fname, ofh.name)

                    # header
                    if header_fhand is not None:
                        header_fhand.seek(0)
                        ofh.write(header_fhand.read())

                    for _ in range(group_items):
                        ofh.write(next(items))

                    # footer
                    if footer_fhand is not None:
                        footer_fhand.seek(0)
                        ofh.write(footer_fhand.read())

                    #everything, footer included, should be in the disk before
                    #the postprocesor or anybody else looks at the file
                    ofh.flush()

                    #postprocess
                    if postprocesor is not None:
                        newofh = NamedTemporaryFile(dir=work_dir.name,
                                                    delete=False,
                                                    suffix=suffix)
                        postprocesor(ofh, newofh)
                        ofh_path = ofh.name
                        ofh.close()
                        os.remove(ofh_path)
                        ofh = newofh

                    #we have to close the files otherwise we can run out of
                    #files in the os filesystem
                    if file_is_str:
                        new_files.append(ofh.name)
                    else:
                        new_files.append(ofh)
                    ofh.close()
                    splits_made += 1
        finally:
            fhand.close()
            #the temporary helper files are not needed anymore, closing them
            #removes them
            for tmp_fhand in (header_fhand, footer_fhand, preprocessed_fhand):
                if tmp_fhand is not None:
                    tmp_fhand.close()

        return new_files