예제 #1
0
 def divideIntoChunks(self,filenameGenerator):
     """Split the lines of every file in ``self.files`` into chunk files.

     Lines are read one by one and buffered in a list. Each time the
     buffer grows past the size threshold it is handed to
     ``FileHelper.writeListInFile`` together with
     ``filenameGenerator(counter)``, and a fresh buffer is started.
     Whatever remains once all input files are consumed is written as a
     last chunk.

     Args:
         filenameGenerator: callable that receives the zero-based chunk
             index and returns the target passed to
             ``FileHelper.writeListInFile``.

     Side effects:
         Sets ``self.nChunks`` to the number of chunks written.
     """
     # 536870912 == 512 * 1024 * 1024. The previous comment said "64Mo".
     # NOTE(review): sys.getsizeof is shallow - it measures the list object
     # itself, not the line strings it references - so this bounds the
     # number of buffered lines rather than their byte size. Confirm the
     # intended limit.
     maxBufferSize = 536870912
     bufferedLines = []
     counter = 0
     for inputFileName in self.files:
         # The context manager closes the file even if reading or writing
         # a chunk raises.
         with open(inputFileName, "r") as filePointer:
             for line in filePointer:
                 bufferedLines.append(line)
                 if sys.getsizeof(bufferedLines) > maxBufferSize:
                     FileHelper.writeListInFile(filenameGenerator(counter), bufferedLines)
                     counter = counter + 1
                     # Start a fresh buffer. Without this, every later
                     # line would rewrite all lines seen so far into yet
                     # another chunk file.
                     bufferedLines = []
     if bufferedLines:  # flush the lines left over after the last file
         FileHelper.writeListInFile(filenameGenerator(counter), bufferedLines)
         counter = counter + 1
     self.nChunks = counter
     return
예제 #2
0
파일: Grouper.py 프로젝트: neosky2142/PyMR
    def writeDictio(self,dicFromKeyToListOfValues):
        """Write one chunk per key and record it in that key's node file.

        For every ``(key, listOfValues)`` pair:

        1. ``self.nDifferentChunks`` is incremented and the values are
           written through ``FileHelper.writeListInFile`` to the name
           returned by ``self.chunkNameGenerator``.
        2. If the key is present in ``self.oldDictFromKeyToNodeFile`` its
           node-file index is reused and ``FileHelper.copyFile`` is called
           with the old and the new node file names; otherwise
           ``self.nDifferentKeys`` is incremented, used as the new index,
           and an empty node file is created.
        3. The chunk file name followed by a newline is appended to the
           node file.

        Finally, every key of ``self.oldDictFromKeyToNodeFile`` that is not
        in ``dicFromKeyToListOfValues`` keeps its index and has its node
        file passed to ``FileHelper.copyFile`` as well.

        Args:
            dicFromKeyToListOfValues: mapping from key to the list of
                values to store for that key.

        Side effects:
            Updates ``self.dictFromKeyToNodeFile``, ``self.nDifferentChunks``
            and ``self.nDifferentKeys``; creates and appends to files.
        """
        # ``in`` and ``.items()`` replace ``has_key``/``iteritems``, which
        # no longer exist on Python 3 dicts; both spellings also work on
        # Python 2.
        for key, listOfValues in dicFromKeyToListOfValues.items():
            # Write the list of values into a new chunk (for a given key).
            self.nDifferentChunks = self.nDifferentChunks + 1
            chunkFilename = self.chunkNameGenerator(self.nDifferentChunks)
            FileHelper.writeListInFile(chunkFilename, listOfValues)

            if key in self.oldDictFromKeyToNodeFile:
                # Key already known: keep its index and carry the old node
                # file over to the new node file name.
                nodeFileIdx = self.oldDictFromKeyToNodeFile[key]
                self.dictFromKeyToNodeFile[key] = nodeFileIdx
                oldNodeFileName = self.oldNodeFileNameGenerator(nodeFileIdx)
                nodeFileName = self.nodeFileNameGenerator(nodeFileIdx)
                FileHelper.copyFile(oldNodeFileName, nodeFileName)
            else:
                # First time this key is seen: allocate the next index and
                # create an empty node file.
                self.nDifferentKeys = self.nDifferentKeys + 1
                nodeFileIdx = self.nDifferentKeys
                self.dictFromKeyToNodeFile[key] = nodeFileIdx
                nodeFileName = self.nodeFileNameGenerator(nodeFileIdx)
                with open(nodeFileName, 'w+'):
                    pass

            # Append the new chunk name to the node file.
            with open(nodeFileName, 'a') as nodePointer:
                nodePointer.write(chunkFilename + "\n")

        # Keys seen before but absent from this batch still need their node
        # file carried over. The values of oldDictFromKeyToNodeFile are
        # node-file indices.
        for key, nodeFileIdx in self.oldDictFromKeyToNodeFile.items():
            if key not in dicFromKeyToListOfValues:
                self.dictFromKeyToNodeFile[key] = nodeFileIdx
                oldNodeFileName = self.oldNodeFileNameGenerator(nodeFileIdx)
                nodeFileName = self.nodeFileNameGenerator(nodeFileIdx)
                FileHelper.copyFile(oldNodeFileName, nodeFileName)

        return