Esempio n. 1
0
def rrmerge_line_mmap(inputFiles, outputFile, bufferSize, writePosition):
    """
    Round-robin merge of several files into outputFile, reading each
    input line by line and writing through a memory-mapped region.

    One line is taken from every still-open input per pass until all
    inputs are exhausted.

    Parameters:
        inputFiles: iterable of paths of the files to merge.
        outputFile: path of the file to create (truncated if it exists).
        bufferSize: size handed to getNewMapRegion for the mapped window.
        writePosition: initial write position handed to getNewMapRegion
            and then threaded through writeln_mmap.

    The output file is pre-filled with as many zero bytes as the inputs
    contain in total, so that it can be mapped before any line is written.
    All input files, the mapping and the output file are released even if
    an error interrupts the merge.
    """
    files_to_read = []
    totalSize = 0
    wFile = None
    mapping = None

    try:
        for inputFile in inputFiles:
            rFile = open(inputFile, 'r+b')
            # Register the handle first so the cleanup below closes it
            # even if a later step fails.
            files_to_read.append(FileObject(rFile, 0, False, None))
            totalSize += os.fstat(rFile.fileno()).st_size

        wFile = open(outputFile, 'w+b')
        wFile.write(totalSize * b'\0')
        # Push the zero fill out of Python's write buffer so the file has
        # its full size on disk before a region of it is mapped.
        wFile.flush()

        mapping, actualFilePosition, actualBufferSize = getNewMapRegion(
            writePosition, bufferSize, totalSize, wFile, 1)

        while not all(x.isClosed for x in files_to_read):
            for file in files_to_read:
                if file.isClosed:
                    continue
                bline, file.readPos = readln_line(file.fileObject,
                                                  file.readPos)
                if not bline:
                    # A falsy line is treated as end of file.
                    file.fileObject.close()
                    file.isClosed = True
                else:
                    mapping, writePosition, actualFilePosition = writeln_mmap(
                        mapping, writePosition, actualFilePosition,
                        actualBufferSize, totalSize, wFile, bline)
    finally:
        for file in files_to_read:
            if not file.isClosed:
                file.fileObject.close()
                file.isClosed = True
        # The original wrote `mapping.close` without parentheses, which
        # never closed the mapping.
        if mapping is not None:
            mapping.close()
        if wFile is not None:
            wFile.close()
Esempio n. 2
0
def generateSortedFiles(f, k, M, bufferSize):
    """
    Split file f into sorted chunk files of roughly M bytes each.

    Lines are read one at a time with readln_line and accumulated; as soon
    as the accumulated size exceeds M, the lines are handed to
    writeSortedFile (together with the column index k and bufferSize) and
    a new chunk is started. Whatever is left at end of file is written as
    a final chunk, even when no lines remain.

    Parameters:
        f: path of the input file.
        k: column index forwarded to writeSortedFile.
        M: size threshold in bytes; a chunk is flushed once it exceeds M.
        bufferSize: buffer size forwarded to writeSortedFile.

    Returns:
        A heapq-ordered list of (chunk_index, chunk_path, chunk_size_bytes)
        tuples, one per chunk written.

    Chunk files are named "/output/[ORIGINAL_FILE_NAME]_[INDEX].csv"
    (e.g. "/output/info_type_0.csv"), where ORIGINAL_FILE_NAME is the
    base name of f up to its first dot.
    NOTE(review): the "/output/" prefix is hard-coded; how it is resolved
    depends on writeSortedFile - confirm.
    """
    sortedFiles = []
    linesToWrite = []
    current_position = 0
    mFileSize = 0
    currentFileIndex = 0

    # The context manager closes the input even if reading or writing a
    # chunk raises.
    with open(f, 'r+b') as rFile:
        fileName = os.path.basename(rFile.name).split(".")[0]

        def chunkPath(index):
            # Path of the chunk file with the given index.
            return "/output/" + fileName + "_" + str(index) + ".csv"

        def flushChunk():
            # Write the pending lines as one chunk and record it in the heap.
            outputFile = chunkPath(currentFileIndex)
            writeSortedFile(linesToWrite, k, outputFile, bufferSize)
            heapq.heappush(sortedFiles,
                           (currentFileIndex, outputFile, mFileSize))

        while True:
            line, current_position = readln_line(rFile, current_position)
            if not line:
                # End of input: write the remaining content as the last chunk.
                flushChunk()
                break

            linesToWrite.append(line)
            mFileSize += len(line)

            if mFileSize > M:  # Max chunk size reached
                flushChunk()
                # Reset state for the next chunk.
                mFileSize = 0
                linesToWrite = []
                currentFileIndex += 1

    return sortedFiles
Esempio n. 3
0
def rrmerge_Line_Char(file_list, outputFile):
    """
    Round-robin merge of the files in file_list into outputFile, reading
    line by line (readln_line) and writing char by char (writeln_char).

    One line is taken from every still-open input per pass until all
    inputs are exhausted.

    Parameters:
        file_list: collection of inputs, handed to initializeFileObjects.
        outputFile: path of the file to create (truncated if it exists).

    The output file is closed when the merge finishes (the original left
    it open), and any input still open is closed if an error interrupts
    the merge.
    """
    files_to_read = initializeFileObjects(file_list)
    try:
        with open(outputFile, 'w+b') as file_to_write:
            while not all(x.isClosed for x in files_to_read):
                for file in files_to_read:
                    if file.isClosed:
                        continue
                    line, file.readPos = readln_line(file.fileObject,
                                                     file.readPos)
                    if not line:
                        # A falsy line is treated as end of file.
                        file.isClosed = True
                        file.fileObject.close()
                    else:
                        writeln_char(file_to_write, line)
    finally:
        # Only reached with open inputs when the merge was interrupted.
        for file in files_to_read:
            if not file.isClosed:
                file.fileObject.close()
                file.isClosed = True
Esempio n. 4
0
def length_line(fileName):
    """
    Read a file sequentially, line by line, and count its bytes.

    Parameters:
        fileName: path of the file to read.

    Returns:
        The sum of the lengths of all lines returned by readln_line
        before it yields a falsy line (treated as end of file).
    """
    total = 0  # renamed from `sum` to stop shadowing the builtin
    current_position = 0

    # The context manager closes the file even if readln_line raises.
    with open(fileName, 'r+b') as file:
        while True:
            line, current_position = readln_line(file, current_position)
            if not line:
                break
            total += len(line)

    return total
Esempio n. 5
0
def rrmerge_line_buffer(fileListArray, outputFilePath, bufferSize):
    """
    Round-robin merge of several files into outputFilePath, reading line
    by line (readln_line) and writing with the buffered approach
    (writeln_buffer).

    One line is taken from every still-open input per pass until all
    inputs are exhausted.

    Parameters:
        fileListArray: iterable of paths of the files to merge.
        outputFilePath: path of the file to create (truncated if it exists).
        bufferSize: buffer size forwarded to writeln_buffer.

    Each input is closed as soon as it is exhausted, and the output file
    is closed when the merge ends; the original only flagged inputs as
    closed and never closed any handle. Handles are also released if an
    error interrupts the merge.
    """
    fileObjectArray = []
    try:
        with open(outputFilePath, 'w+b') as outputFile:
            for path in fileListArray:
                fileObjectArray.append(
                    FileObject(open(path, 'r+b'), 0, False))

            while not all(x.isClosed for x in fileObjectArray):
                for file in fileObjectArray:
                    if file.isClosed:
                        continue
                    line, file.readPos = readln_line(file.fileObject,
                                                     file.readPos)
                    if not line:
                        # A falsy line is treated as end of file.
                        file.fileObject.close()
                        file.isClosed = True
                    else:
                        writeln_buffer(outputFile, line, bufferSize)
    finally:
        # Only reached with open inputs when the merge was interrupted.
        for file in fileObjectArray:
            if not file.isClosed:
                file.fileObject.close()
                file.isClosed = True
Esempio n. 6
0
def randjump_readln(f, j):
    """
    Perform j random jumps inside file f and read one line (line
    approach, via readln_line) at each landing position.

    Parameters:
        f: path of the file to read.
        j: number of jumps (an integer).

    Returns:
        The total number of bytes in the j lines read.

    Raises:
        ValueError: if the file is empty and j > 0, because there is no
            valid offset to jump to (random.randint gets an empty range).
    """
    total = 0  # renamed from `sum` to stop shadowing the builtin

    # Highest offset a jump may land on (the last byte of the file).
    lastOffset = os.stat(f).st_size - 1

    # The context manager closes the file even if a read raises.
    with open(f, "r+b") as file:
        for _ in range(j):
            randPosition = random.randint(0, lastOffset)
            line, _nextPosition = readln_line(file, randPosition)
            total += len(line)

    return total
Esempio n. 7
0
def loadBuffer(buffer, f, M, d, fileSize):
    """
    Fill buffer with lines from file f until it holds at least M / d bytes
    or the file is exhausted.

    Parameters:
        buffer: dict with the keys 'filePosition' (position to resume
            reading from; updated in place after every line) and 'buffer'
            (list that the lines read are appended to).
        f: file location, appended as-is to the current working directory.
        M: total memory size in bytes.
        d: divisor of M; each call loads up to roughly M / d bytes.
        fileSize: running byte count; the length of every line loaded is
            added to it.

    Returns:
        A (buffer, fileSize) tuple: the same dict that was passed in and
        the updated byte count.

    Raises:
        ZeroDivisionError: if d is 0.
    """
    # The path is built by plain string concatenation, so f has to start
    # with a path separator for the result to point inside the working
    # directory.
    with open(str(Path.cwd()) + str(f), 'r+b') as file:
        # Computed inside the `with` so the file is closed even if d is 0.
        maxBufferSize = M / d
        bufferSize = 0

        while bufferSize < maxBufferSize:
            nextLine, buffer['filePosition'] = readln_line(
                file, buffer['filePosition'])
            if nextLine == b'':
                # An empty line marks end of file.
                break
            buffer['buffer'].append(nextLine)
            bufferSize += len(nextLine)
            fileSize += len(nextLine)

    return buffer, fileSize