Beispiel #1
0
def fragmentInputBySize(infile,
                        tmpdir,
                        chunk,
                        fileType,
                        fragmentBase,
                        splitOnSize=True,
                        **kwargs):
    """
    Break up input into files of size chunk in tmpdir.

    The input is opened with openInputFile() and the resulting handle is
    passed to fragmentInputStreamBySize(), which does the actual splitting.

    Args:
        infile: input to open. When None, the handle returned by
            openInputFile() is left open (presumably it is STDIN --
            confirm against openInputFile).
        tmpdir: directory the fragments are written to.
        chunk: target size of each fragment.
        fileType: record type description, forwarded unchanged.
        fragmentBase: base name for the fragments, forwarded unchanged.
        splitOnSize: forwarded unchanged to fragmentInputStreamBySize().
        **kwargs: extra options forwarded to fragmentInputStreamBySize().

    Returns:
        The number of fragments, as reported by
        fragmentInputStreamBySize().
    """
    logging.debug("Fragmenting input: %r" % ({
        'infile': infile,
        'tmpDir': tmpdir,
        'chunk': chunk,
        'base': fragmentBase,
        'kwargs': kwargs
    }))
    inhandle = openInputFile(infile)
    try:
        return fragmentInputStreamBySize(inhandle,
                                         tmpdir,
                                         chunk,
                                         fileType,
                                         fragmentBase,
                                         splitOnSize=splitOnSize,
                                         **kwargs)
    finally:
        # Release the handle even when fragmenting fails, but only close
        # handles that were opened for a named input.
        if infile is not None:
            inhandle.close()
Beispiel #2
0
def fragmentInputBySize(infile, tmpdir, chunk, fileType,
                        fragmentBase, splitOnSize=True, **kwargs):
    """
    Break up input into files of size chunk in tmpdir.

    Opens the input via openInputFile() and delegates the splitting to
    fragmentInputStreamBySize(); every argument except infile is forwarded
    unchanged.

    If infile is None the handle is not closed here (presumably it is
    STDIN -- confirm against openInputFile). Otherwise the handle is
    closed before returning, including when the splitting raises.

    Return number of fragments.
    """
    logging.debug(
        "Fragmenting input: %r" % ({
            'infile': infile,
            'tmpDir': tmpdir,
            'chunk': chunk,
            'base': fragmentBase,
            'kwargs': kwargs}))
    inhandle = openInputFile(infile)
    try:
        num = fragmentInputStreamBySize(
            inhandle,
            tmpdir,
            chunk,
            fileType,
            fragmentBase,
            splitOnSize=splitOnSize,
            **kwargs)
    finally:
        # Do not leak the file handle if fragmenting fails part-way.
        if infile is not None:
            inhandle.close()
    return num
Beispiel #3
0
def getSizePerChunk(infile, splits, fileType, splitOnSize=False):
    """
    Get total size of all records and return the target size for each chunk
    so that the input ends up in the number of chunks given by 'splits'.

    Args:
        infile: input to read; must not be None because the records have
            to be read once here just to be measured.
        splits: desired number of chunks.
        fileType: object providing recordStreamer(handle) and, when
            splitOnSize is true, sizer(record).
        splitOnSize: if true, weigh each record with fileType.sizer;
            otherwise use recordCounter for every record.

    Returns:
        Whatever calculateChunkSize(totalSize, splits) returns.

    Raises:
        Exception: if infile is None.
    """
    if infile is None:
        raise Exception("We cannot determine chunk size from STDIN!")

    if splitOnSize:
        # get a custom function that returns the size of this type of record
        recordSizer = fileType.sizer
    else:
        # just return 1 for each record
        recordSizer = recordCounter

    # Add up the size of every record; close the handle even if the
    # streamer or the sizer raises part-way through.
    inhandle = openInputFile(infile)
    try:
        totalSize = sum(recordSizer(record)
                        for record in fileType.recordStreamer(inhandle))
    finally:
        inhandle.close()

    return calculateChunkSize(totalSize, splits)
 def __init__(self, fileName, *args):
     """
     Open fileName (extra positional args go to openInputFile) and
     initialise the LineCounter part of this object with the handle.

     fileName may be a path or an already-open file-like object; in the
     latter case the object's ``name`` attribute is recorded so that
     self.fileName is a name rather than a handle.
     """
     LineCounter.__init__(self, openInputFile(fileName, *args))
     # Fall back to the argument itself when it has no ``name`` attribute
     # (e.g. a plain path string).
     self.fileName = getattr(fileName, 'name', fileName)
Beispiel #5
0
 def __init__(self, fileName, *args):
     """
     Open fileName (extra positional args go to openInputFile) and
     initialise the LineCounter part of this object with the handle.

     self.fileName is set to fileName.name when that attribute exists,
     otherwise to fileName itself.
     """
     handle = openInputFile(fileName, *args)
     LineCounter.__init__(self, handle)
     # Use the argument's ``name`` if it has one, else the argument itself.
     self.fileName = getattr(fileName, 'name', fileName)
Beispiel #6
0
 def __init__(self, fileName, *args):
     """
     Initialise the LineCounter part of this object with the handle
     returned by openInputFile(fileName, *args), then record the input's
     name.

     The recorded name is fileName.name if the argument has such an
     attribute; otherwise the argument is stored unchanged.
     """
     stream = openInputFile(fileName, *args)
     LineCounter.__init__(self, stream)
     # Objects without a ``name`` attribute are stored as given.
     self.fileName = getattr(fileName, 'name', fileName)