def reduce_number_of_files(fileType):
    """Halve the number of '<fileType>-<n>.csv' files in reducedDataDir.

    Consecutive pairs of files are concatenated into a single new file,
    renumbered sequentially from 1; the source files are deleted.  An odd
    trailing file is rewritten alone under its new index.

    Args:
        fileType: filename prefix of the CSV shards to merge.
    """
    # Count the existing shards of this type (names like 'fileType-12.csv').
    # The dot is escaped: the original '.csv' pattern would also match
    # e.g. 'fileType-1Xcsv'.
    numberOfFiles = len([
        name for name in os.listdir(reducedDataDir)
        if re.match(fileType + r'-[0-9]+\.csv', name)
    ])
    newFileIndex = 1
    for workIndex in range(1, numberOfFiles + 1, 2):  # 1, 3, 5, ...
        infilePath = get_existing_path(reducedDataDir, fileType, workIndex)
        df = pd.read_csv(infilePath, header=None)
        if workIndex < numberOfFiles:
            # There is a partner file: merge it in, then delete it.
            infilePath2 = get_existing_path(reducedDataDir, fileType,
                                            workIndex + 1)
            df2 = pd.read_csv(infilePath2, header=None)
            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # supported equivalent.
            df = pd.concat([df, df2])
            os.remove(infilePath2)
            print('delete ' + infilePath2)
        # Remove file 1 in all cases: its content is rewritten below under
        # the new sequential index.
        os.remove(infilePath)
        print('delete ' + infilePath)
        outPath = get_path(reducedDataDir, fileType, newFileIndex)
        df.to_csv(outPath, index=False, header=None)
        print('save file ' + outPath)
        newFileIndex = newFileIndex + 1
def sort_files(fileType, index_for_second_sort):
    """Sort the rows of the '<fileType>-<n>.csv' shards globally by id.

    Performs a bubble-sort-like sequence of pairwise merge/sort/split passes
    over the numbered files so that, after the outer loop, file k holds only
    ids smaller than those in file k+1 and each file is internally sorted.
    Finally writes the first id of every file to
    '<fileType>_first_ids.csv' as a lookup index.

    Args:
        fileType: filename prefix of the CSV shards to sort.
        index_for_second_sort: optional secondary sort column index, or None
            to sort by column 0 only.

    Raises:
        ValueError: if there is only one shard (sort it separately).
    """
    numberOfFiles = len([
        name for name in os.listdir(reducedDataDir)
        if re.match(fileType + r'-[0-9]+\.csv', name)
    ])
    numberOfFiles = min(numberOfFiles, 81)
    if numberOfFiles == 1:
        # Raising a bare string is a TypeError in Python 3; raise a real
        # exception type instead.
        raise ValueError("Sort single file separately")
    # Backward loop: at the end of each iteration, all files with an index
    # >= indexOfTheSortedFileAtTheEndOfThisLoop are fully sorted.
    for indexOfTheSortedFileAtTheEndOfThisLoop in range(numberOfFiles, 0, -1):
        # Forward loop: repeatedly merge, sort and re-split adjacent files.
        smallIdsFilePath = get_existing_path(reducedDataDir, fileType, 1)
        # data frame holding the small-id half carried forward
        dfSmall = pd.read_csv(smallIdsFilePath, header=None)
        for fileNumber in range(1, indexOfTheSortedFileAtTheEndOfThisLoop):
            print(str(fileNumber) + ' at ' + str(datetime.now()))
            bigIdsFilePath = get_existing_path(reducedDataDir, fileType,
                                               fileNumber + 1)
            dfBig = pd.read_csv(bigIdsFilePath, header=None)
            # merge (DataFrame.append was removed in pandas 2.0)
            df = pd.concat([dfSmall, dfBig])
            # sort by id, optionally with a secondary key
            if index_for_second_sort is None:
                df.sort_values(by=0, inplace=True)
            else:
                df.sort_values(by=[0, index_for_second_sort], inplace=True)
            # split, being careful that one id never spans two files
            mid = int(df.shape[0] / 2)
            while df[0].values[mid - 1] == df[0].values[mid]:
                mid = mid + 1
            dfs = np.split(df, [mid], axis=0)
            # write the small ids
            dfs[0].to_csv(smallIdsFilePath, index=False, header=None)
            if fileNumber + 1 == indexOfTheSortedFileAtTheEndOfThisLoop:
                # last iteration of this pass: the big-id half is final
                dfs[1].to_csv(bigIdsFilePath, index=False, header=None)
                print(bigIdsFilePath + ' is now sorted at ' +
                      str(datetime.now()))
            else:
                # prepare the next iteration: the big ids become the small ids
                dfSmall = dfs[1]
                smallIdsFilePath = bigIdsFilePath
    # now write the first id of each file into an index file
    res = []
    for fileNumber in range(1, numberOfFiles + 1):
        filePath = get_existing_path(reducedDataDir, fileType, fileNumber)
        with open(filePath, 'r') as infile:
            firstRow = next(csv.reader(infile))
        res = res + [firstRow[0]]
    df = pd.DataFrame(res)
    outFilePath = os.path.join(reducedDataDir, fileType + '_first_ids.csv')
    df.to_csv(outFilePath, index=False, header=None)
    return
def resplit_user_log_files():
    """Split every 'user_logs-<n>.csv' shard in reducedDataDir into two
    halves, doubling the number of files, and write the first id of each
    new file (sorted) to 'user_logs_first_ids.csv'.

    File n becomes files 2n-1 and 2n; the split point is adjusted so that
    all rows sharing an id stay in the same half.
    """
    first_ids = []
    file_count = len([
        name for name in os.listdir(reducedDataDir)
        if re.match('user_logs-[0-9]+' + '.csv', name)
    ])
    # Walk backwards so the doubled output indices never collide with
    # input files that have not been split yet.
    for src_index in range(file_count, 0, -1):
        source = pd.read_csv(
            get_existing_path(reducedDataDir, 'user_logs', src_index),
            header=None)
        # choose a midpoint, then push it forward until it falls on an
        # id boundary
        ids = source[0].values
        cut = int(source.shape[0] / 2)
        while ids[cut - 1] == ids[cut]:
            cut = cut + 1
        lower, upper = np.split(source, [cut], axis=0)
        lower.to_csv(get_path(reducedDataDir, 'user_logs', 2 * src_index - 1),
                     index=False, header=None)
        upper.to_csv(get_path(reducedDataDir, 'user_logs', 2 * src_index),
                     index=False, header=None)
        first_ids.extend([lower[0].values[0], upper[0].values[0]])
    index_df = pd.DataFrame(first_ids)
    index_df.sort_values(by=0, inplace=True)
    outFilePath = os.path.join(reducedDataDir, 'user_logs_first_ids.csv')
    index_df.to_csv(outFilePath, index=False, header=None)
    return
def reduce_number_of_files_and_ids(fileType):
    """Merge pairs of '<fileType>-<n>.csv' shards from splitDataDir into
    reducedDataDir while mapping their user ids via reduce_ids.

    Each output file is the sorted concatenation of two consecutive input
    files (or the odd trailing file alone), renumbered from 1.  Input files
    are deleted only when splitDataDir and reducedDataDir are the same
    directory (i.e. an in-place pass).

    Args:
        fileType: filename prefix of the CSV shards to process.
    """
    # Load the id mappings used by reduce_ids (ids kept as strings).
    knownSortedUserIdsPath = os.path.join(splitDataDir,
                                          'sorted_member_ids.csv')
    knownSortedUserIds = pd.read_csv(knownSortedUserIdsPath, header=None,
                                     dtype=str)
    unknownSortedUserIdsPath = os.path.join(splitDataDir,
                                            'unknown_user_ids_sorted.csv')
    unknownSortedUserIds = pd.read_csv(unknownSortedUserIdsPath, header=None,
                                       dtype=str)
    # Count the input shards; the dot is escaped so '.csv' is matched
    # literally.
    numberOfFiles = len([
        name for name in os.listdir(splitDataDir)
        if re.match(fileType + r'-[0-9]+\.csv', name)
    ])
    newFileIndex = 1
    for workIndex in range(1, numberOfFiles + 1, 2):  # 1, 3, 5, ...
        infilePath = get_existing_path(splitDataDir, fileType, workIndex)
        df = reduce_ids(infilePath, knownSortedUserIds[0],
                        unknownSortedUserIds[0])
        if workIndex < numberOfFiles:
            infilePath2 = get_existing_path(splitDataDir, fileType,
                                            workIndex + 1)
            df2 = reduce_ids(infilePath2, knownSortedUserIds[0],
                             unknownSortedUserIds[0])
            # merge (DataFrame.append was removed in pandas 2.0)
            df = pd.concat([df, df2])
            # remove file 2 only when working in place
            if splitDataDir == reducedDataDir:
                os.remove(infilePath2)
        # remove file 1 only when working in place
        if splitDataDir == reducedDataDir:
            os.remove(infilePath)
        # write the sorted merge under its new sequential index
        df.sort_values(by=0, inplace=True)
        df.to_csv(get_path(reducedDataDir, fileType, newFileIndex),
                  index=False, header=None)
        newFileIndex = newFileIndex + 1
def get_unknown_user_ids_sorted(fileType, knownSortedUserIds,
                                current_unknow_user_ids):
    """Scan all '<fileType>-<n>.csv' shards and append any user ids not in
    knownSortedUserIds to current_unknow_user_ids (new ids sorted, no
    duplicates).

    Args:
        fileType: filename prefix of the CSV shards to scan.
        knownSortedUserIds: iterable of already-known user ids.
        current_unknow_user_ids: list of unknown ids accumulated so far.

    Returns:
        The extended list of unknown user ids.
    """
    numberOfFiles = len([
        name for name in os.listdir(reducedDataDir)
        if re.match(fileType + r'-[0-9]+\.csv', name)
    ])
    # Build the membership set once: 'x not in knownSortedUserIds' inside the
    # per-row comprehension was O(n) per lookup, i.e. O(rows * known) total.
    # NOTE(review): if a pandas Series is ever passed here, the original
    # 'in' tested index labels, not values — set(...) tests values, which is
    # presumably the intent; confirm against callers.
    knownIdSet = set(knownSortedUserIds)
    for fileIndex in range(1, numberOfFiles + 1):  # 1 .. numberOfFiles
        infilePath = get_existing_path(reducedDataDir, fileType, fileIndex)
        # keep column 0 as strings so ids are not coerced to numbers
        df = pd.read_csv(infilePath, header=None, dtype={0: object})
        new_unknow_user_ids = [x for x in df[0] if x not in knownIdSet]
        # append only genuinely new ids, sorted, preserving earlier order
        current_unknow_user_ids = current_unknow_user_ids + sorted(
            set(new_unknow_user_ids) - set(current_unknow_user_ids))
    return current_unknow_user_ids