Beispiel #1
0
class BalanedData:
    """Accumulate rows for a seeded set of users plus a capped number of others.

    The instance is seeded with ``clickedUsers``; every seeded id is added to
    a ``BloomFilter(filterSize, hashCount)``. Rows passed to ``addUserRow``
    are kept when the filter reports the user as already present. Rows for
    users the filter does not report are kept only while the number of such
    newly admitted users (``noClickedCounter``) is below the number of seeded
    users (``clickedCounter``); each admitted user is then added to the
    filter, so later rows for that user are kept as well.

    NOTE(review): ``BloomFilter`` is defined outside this block; presumably
    its ``contains`` is probabilistic and may report false positives --
    confirm against its implementation.
    """

    def __init__(self, filterSize, hashCount, clickedUsers):
        """Create the filter and register every id in ``clickedUsers``.

        ``clickedUsers`` must support both ``len()`` and iteration.
        """
        # Rows accepted so far, in the order they were accepted.
        self.allData = []
        # Upper bound on how many previously unseen users may be admitted.
        self.clickedCounter = len(clickedUsers)
        # Number of previously unseen users admitted so far.
        self.noClickedCounter = 0
        self.collectedDataUsersFilter = BloomFilter(filterSize, hashCount)
        self.__addUsers(clickedUsers)

    def __addUsers(self, clickedUsersIds):
        """Register each id of ``clickedUsersIds`` in the filter."""
        for uid in clickedUsersIds:
            self.__addUser(uid)

    def __addUser(self, userId):
        """Register a single ``userId`` in the filter."""
        self.collectedDataUsersFilter.add(userId)

    def addUserRow(self, userId, row):
        """Append ``row`` to ``allData`` if ``userId`` is eligible.

        A user is eligible when the filter already reports it, or when the
        quota of newly admitted users has not been used up yet (in which
        case the user is registered and counted). Otherwise the row is
        dropped silently.
        """
        alreadyKnown = self.collectedDataUsersFilter.contains(userId)

        if not alreadyKnown:
            # Quota for unseen users exhausted: drop the row.
            if not (self.clickedCounter > self.noClickedCounter):
                return
            self.__addUser(userId)
            self.noClickedCounter += 1

        self.allData.append(row)
def sampleData(file1, file2, column, column2='fc20',
               filterSize=13419082, hashCount=23):
    """Count how many user ids of ``file2`` are also reported for ``file1``.

    The ids read from ``file1`` (via ``userIds(file1, column)``) are
    stringified and added to a ``BloomFilter``; each id read from ``file2``
    (via ``userIds(file2, column2)``) is then looked up in that filter.

    Args:
        file1: Source passed to ``userIds`` to build the filter.
        file2: Source passed to ``userIds`` to probe the filter.
        column: Column argument used for ``file1``.
        column2: Column argument used for ``file2``. Defaults to ``'fc20'``,
            the value that was previously hard-coded.
        filterSize: First argument of ``BloomFilter``; defaults to the
            previously hard-coded ``13419082``.
        hashCount: Second argument of ``BloomFilter``; defaults to the
            previously hard-coded ``23``.

    Returns:
        A ``(same, diff)`` tuple: the number of ``file2`` ids the filter
        reports as present, and the number it reports as absent.

    NOTE(review): ``BloomFilter`` and ``userIds`` are defined outside this
    block; if the filter is probabilistic, ``same`` may include false
    positives -- confirm against its implementation.
    """
    # Named ``seenUsers`` rather than ``filter`` to avoid shadowing the builtin.
    seenUsers = BloomFilter(filterSize, hashCount)

    for user in userIds(file1, column):
        seenUsers.add(str(user))

    same = 0
    diff = 0
    for user in userIds(file2, column2):
        if seenUsers.contains(str(user)):
            same += 1
        else:
            diff += 1

    return same, diff