Example 1
def reduce_number_of_files(fileType):
    """Halve the number of ``<fileType>-N.csv`` shards in reducedDataDir.

    Consecutive pairs of shards (1&2, 3&4, ...) are concatenated into a
    single new shard, the source files are deleted, and the merged shards
    are renumbered from 1. An odd trailing shard is rewritten on its own.

    fileType: filename prefix of the CSV shards to merge.
    """
    # Count only files that exactly match "<fileType>-<number>.csv".
    # re.fullmatch + re.escape fix the original pattern, which left '.'
    # unescaped and the end unanchored (it also matched e.g.
    # "<fileType>-1Xcsv" or "<fileType>-1.csv.bak").
    shardPattern = re.compile(re.escape(fileType) + r'-[0-9]+\.csv')
    numberOfFiles = len([
        name for name in os.listdir(reducedDataDir)
        if shardPattern.fullmatch(name)
    ])
    # loop through the files pairwise
    newFileIndex = 1
    for workIndex in range(1, numberOfFiles + 1,
                           2):  # from 1 to numberOfFiles - 1 by 2
        infilePath = get_existing_path(reducedDataDir, fileType, workIndex)
        df = pd.read_csv(infilePath, header=None)
        if workIndex < numberOfFiles:
            # a partner shard exists: merge it in
            infilePath2 = get_existing_path(reducedDataDir, fileType,
                                            workIndex + 1)
            df2 = pd.read_csv(infilePath2, header=None)
            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # supported replacement
            df = pd.concat([df, df2])
            # remove file 2
            os.remove(infilePath2)
            print('delete ' + infilePath2)
        # remove file 1
        os.remove(infilePath)
        print('delete ' + infilePath)
        # write the merged shard under its new index
        df.to_csv(get_path(reducedDataDir, fileType, newFileIndex),
                  index=False,
                  header=None)
        print('save file ' + get_path(reducedDataDir, fileType, newFileIndex))
        newFileIndex = newFileIndex + 1
Example 2
def sort_files(fileType, index_for_second_sort):
    """Globally sort the ``<fileType>-N.csv`` shards by id (column 0).

    Works like a bubble-sort over files: each backward pass merges,
    sorts and re-splits adjacent shard pairs so that after the pass the
    trailing shards hold the largest, fully sorted id ranges. Finally
    writes the first id of every shard to ``<fileType>_first_ids.csv``.

    fileType: filename prefix of the CSV shards.
    index_for_second_sort: optional secondary sort column index, or
        None to sort on column 0 only.
    Raises ValueError when only a single shard exists.
    """
    # Count only files that exactly match "<fileType>-<number>.csv"
    # (re.fullmatch + re.escape fix the original unescaped-dot,
    # end-unanchored pattern).
    shardPattern = re.compile(re.escape(fileType) + r'-[0-9]+\.csv')
    numberOfFiles = len([
        name for name in os.listdir(reducedDataDir)
        if shardPattern.fullmatch(name)
    ])
    # hard cap on shards processed — presumably a memory/time bound; TODO confirm
    numberOfFiles = min(numberOfFiles, 81)
    if numberOfFiles == 1:
        # the original raised a bare string, which is itself a TypeError
        # in Python 3; raise a proper exception instead
        raise ValueError("Sort single file separately")
    # start the loop where at the end of an iteration all the following files are sorted (the "backward loop")
    for indexOfTheSortedFileAtTheEndOfThisLoop in range(
            numberOfFiles, 0, -1):  # this goes from numberOfFiles to 1
        # start the forward loop, where we merge sort and split
        smallIdsFilePath = get_existing_path(reducedDataDir, fileType, 1)
        # put the file with small ids into a data frame
        dfSmall = pd.read_csv(smallIdsFilePath, header=None)
        for fileNumber in range(
                1, indexOfTheSortedFileAtTheEndOfThisLoop
        ):  # this goes from 1 to indexOfTheSortedFileAtTheEndOfThisLoop -1
            print(str(fileNumber) + ' at ' + str(datetime.now()))
            bigIdsFilePath = get_existing_path(reducedDataDir, fileType,
                                               fileNumber + 1)
            # put the file with large ids into a data frame
            dfBig = pd.read_csv(bigIdsFilePath, header=None)
            # merge (DataFrame.append was removed in pandas 2.0; pd.concat
            # is the supported replacement)
            df = pd.concat([dfSmall, dfBig])
            # sort
            if index_for_second_sort is None:
                df.sort_values(by=0, inplace=True)
            else:
                df.sort_values(by=[0, index_for_second_sort], inplace=True)
            # split (being careful that one id is not in two files)
            mid = int(df.shape[0] / 2)
            while df[0].values[mid - 1] == df[0].values[mid]:
                mid = mid + 1
            dfs = np.split(df, [mid], axis=0)
            # write the small ids
            dfs[0].to_csv(smallIdsFilePath, index=False, header=None)
            # also write the large ids on the last iteration
            if fileNumber + 1 == indexOfTheSortedFileAtTheEndOfThisLoop:
                dfs[1].to_csv(bigIdsFilePath, index=False, header=None)
                print(bigIdsFilePath + ' is now sorted at ' +
                      str(datetime.now()))
            else:  # prepare the next iteration: the big ids become the small ids
                dfSmall = dfs[1]
                smallIdsFilePath = bigIdsFilePath
    # now write the first id of each file into a file
    res = []
    for fileNumber in range(1, numberOfFiles + 1):
        filePath = get_existing_path(reducedDataDir, fileType, fileNumber)
        with open(filePath, 'r') as infile:
            firstRow = next(csv.reader(infile))
            res = res + [firstRow[0]]
    df = pd.DataFrame(res)
    outFilePath = os.path.join(reducedDataDir, fileType + '_first_ids.csv')
    df.to_csv(outFilePath, index=False, header=None)
    return
Example 3
def resplit_user_log_files():
    """Split every ``user_logs-N.csv`` shard into two half-shards.

    Shard N becomes shards 2N-1 and 2N (processed from the highest
    number down so the new names never collide with unprocessed ones).
    The split point is nudged forward so a single id never straddles
    the two halves. The first id of each new shard is collected,
    sorted, and written to ``user_logs_first_ids.csv``.
    """
    firstIds = []
    numberOfFiles = len([
        name for name in os.listdir(reducedDataDir)
        if re.match('user_logs-[0-9]+' + '.csv', name)
    ])
    # walk the shards backwards: numberOfFiles down to 1
    for fileNumber in range(numberOfFiles, 0, -1):
        sourcePath = get_existing_path(reducedDataDir, 'user_logs',
                                       fileNumber)
        frame = pd.read_csv(sourcePath, header=None)
        ids = frame[0].values
        # start at the midpoint, then advance while the id would be cut in two
        splitAt = int(frame.shape[0] / 2)
        while ids[splitAt - 1] == ids[splitAt]:
            splitAt = splitAt + 1
        lowerHalf, upperHalf = np.split(frame, [splitAt], axis=0)
        lowerHalf.to_csv(get_path(reducedDataDir, 'user_logs',
                                  2 * fileNumber - 1),
                         index=False,
                         header=None)
        upperHalf.to_csv(get_path(reducedDataDir, 'user_logs',
                                  2 * fileNumber),
                         index=False,
                         header=None)
        firstIds = firstIds + [lowerHalf[0].values[0],
                               upperHalf[0].values[0]]
    # persist the sorted list of first ids for later lookups
    summary = pd.DataFrame(firstIds)
    summary.sort_values(by=0, inplace=True)
    outFilePath = os.path.join(reducedDataDir, 'user_logs_first_ids.csv')
    summary.to_csv(outFilePath, index=False, header=None)
    return
Example 4
def reduce_number_of_files_and_ids(fileType):
    """Pairwise-merge ``<fileType>-N.csv`` shards while remapping user ids.

    Each shard from splitDataDir is passed through reduce_ids with the
    known/unknown sorted user-id mappings, consecutive pairs are merged,
    sorted by column 0, and written to reducedDataDir renumbered from 1.
    Source shards are deleted only when splitDataDir == reducedDataDir
    (i.e. when the merge is in place).

    fileType: filename prefix of the CSV shards to process.
    """
    # get the id mappings
    knownSortedUserIdsPath = os.path.join(splitDataDir,
                                          'sorted_member_ids.csv')
    knownSortedUserIds = pd.read_csv(knownSortedUserIdsPath,
                                     header=None,
                                     dtype=str)
    unknownSortedUserIdsPath = os.path.join(splitDataDir,
                                            'unknown_user_ids_sorted.csv')
    unknownSortedUserIds = pd.read_csv(unknownSortedUserIdsPath,
                                       header=None,
                                       dtype=str)

    # Count only files that exactly match "<fileType>-<number>.csv"
    # (re.fullmatch + re.escape fix the original unescaped-dot,
    # end-unanchored pattern).
    shardPattern = re.compile(re.escape(fileType) + r'-[0-9]+\.csv')
    numberOfFiles = len([
        name for name in os.listdir(splitDataDir)
        if shardPattern.fullmatch(name)
    ])
    # loop through the files pairwise
    newFileIndex = 1
    for workIndex in range(1, numberOfFiles + 1,
                           2):  # from 1 to numberOfFiles - 1 by 2
        infilePath = get_existing_path(splitDataDir, fileType, workIndex)
        df = reduce_ids(infilePath, knownSortedUserIds[0],
                        unknownSortedUserIds[0])
        if workIndex < numberOfFiles:
            infilePath2 = get_existing_path(splitDataDir, fileType,
                                            workIndex + 1)
            df2 = reduce_ids(infilePath2, knownSortedUserIds[0],
                             unknownSortedUserIds[0])
            # merge (DataFrame.append was removed in pandas 2.0; pd.concat
            # is the supported replacement)
            df = pd.concat([df, df2])
            # remove file 2 only when merging in place
            if splitDataDir == reducedDataDir:
                os.remove(infilePath2)
        # remove file 1 only when merging in place
        if splitDataDir == reducedDataDir:
            os.remove(infilePath)
        # sort by id and write the merged shard under its new index
        df.sort_values(by=0, inplace=True)
        df.to_csv(get_path(reducedDataDir, fileType, newFileIndex),
                  index=False,
                  header=None)
        newFileIndex = newFileIndex + 1
Example 5
def get_unknown_user_ids_sorted(fileType, knownSortedUserIds,
                                current_unknow_user_ids):
    """Collect, shard by shard, the user ids not in knownSortedUserIds.

    For each ``<fileType>-N.csv`` shard, the ids in column 0 that are
    neither known nor already collected are appended (as a sorted batch
    per file) to current_unknow_user_ids.

    fileType: filename prefix of the CSV shards to scan.
    knownSortedUserIds: iterable of already-known user ids
        (assumed to contain the id *values* — TODO confirm callers
        don't rely on pandas index-membership semantics).
    current_unknow_user_ids: list of unknown ids collected so far.
    Returns the extended list of unknown user ids.
    """
    # Count only files that exactly match "<fileType>-<number>.csv"
    # (re.fullmatch + re.escape fix the original unescaped-dot,
    # end-unanchored pattern).
    shardPattern = re.compile(re.escape(fileType) + r'-[0-9]+\.csv')
    numberOfFiles = len([
        name for name in os.listdir(reducedDataDir)
        if shardPattern.fullmatch(name)
    ])
    # Hoist membership structures out of the loop: the original tested
    # `x not in knownSortedUserIds` and rebuilt set(current_unknow_user_ids)
    # per file, making each shard scan O(rows * len(known)).
    knownIdSet = set(knownSortedUserIds)
    seenIdSet = set(current_unknow_user_ids)
    for fileIndex in range(1, numberOfFiles + 1):  # from 1 to numberOfFiles
        infilePath = get_existing_path(reducedDataDir, fileType, fileIndex)
        df = pd.read_csv(infilePath, header=None, dtype={0: object})
        new_unknow_user_ids = {
            x for x in df[0] if x not in knownIdSet
        }
        freshIds = sorted(new_unknow_user_ids - seenIdSet)
        current_unknow_user_ids = current_unknow_user_ids + freshIds
        seenIdSet.update(freshIds)
    return current_unknow_user_ids