def getGraphIterator(dir, nb_user=None, nb_purchases_per_it=None):
    """
    Load the Bemol purchases stored under ``dir`` and wrap them in a
    ``DatedPurchasesGraphListIterator``.

    * ``dir``: directory of the data files. The file actually read is the one
      named by ``BemolData.get_file_name(dir, nb_user)``.
    * ``nb_user``: the dataset will only contain the ``nb_user`` first users.
      ``None`` means all users (``BemolData.nb_max_user()``).
    * ``nb_purchases_per_it``: forwarded unchanged to
      ``DatedPurchasesGraphListIterator``, where it is used to split the
      purchases list in several iterations (see that class's documentation).

    Not all the data is read, only the 10^n version of the data with n big
    enough.

    Returns the ``DatedPurchasesGraphListIterator`` built from the kept
    purchases; each purchase is the list of integers parsed from one line of
    the data file.

    Raises ``RGIOError`` when the data file cannot be opened or read.
    """
    BemolData.assert_nb_user(nb_user)
    # Identity test: ``None`` is a singleton, and ``==`` could be overridden
    # by the argument's type.
    if nb_user is None:
        nb_user = BemolData.nb_max_user()

    # read data
    f_data_name = BemolData.get_file_name(dir, nb_user)
    purchasesList = []
    dict_user = MyDictionary()
    # Only the file access is guarded, so that the "generate the data file"
    # hint is attached to data-file failures and nothing else.
    try:
        with gzip.open(f_data_name, 'rb') as f_data:
            for line in f_data:
                # One purchase per line: whitespace-separated integers, the
                # first one being the user identifier.
                vals = list(map(int, line.split()))
                # presumably MyDictionary.index numbers users in first-seen
                # order, which is what makes this keep the first nb_user
                # users -- TODO confirm against MyDictionary.
                if dict_user.index(vals[0]) < nb_user:
                    purchasesList.append(vals)
    except IOError as error:
        raise RGIOError(error, RGIOError.indent() + 'consider running BemolData.generate_data_file(...)')
    logging.info("File read " + f_data_name)

    # graph iterator
    return DatedPurchasesGraphListIterator(purchasesList, nb_purchases_per_it)
# generate_data_file(dir, nb_user=None)
#
# Create the gzip data file(s) for a reduced number of users inside directory
# `dir`, by filtering already existing, larger data files.
#
# Visible steps:
#   1. Copy the file named by BemolData.get_file_name(dir, None) to
#      `f_data_name`.
#   2. Starting from BemolData.get_nb_user_to_read(nb_user), collect every
#      user count whose file does not exist yet, moving on each time to
#      BemolData.get_nb_user_to_read(current_nb_user+1), then reverse that
#      list so it is processed in the opposite order of discovery.
#   3. For each collected count, read the file named for `current_nb_user+1`
#      and write to the file named for `current_nb_user` only the lines whose
#      first numeric field maps, through a fresh MyDictionary, to an index
#      lower than `current_nb_user`.
#
# Raises RGIOError when the IOError's filename matches the source file (see
# the note below); any other IOError is re-raised as is.
#
# NOTE(review): in this copy of the source, the text between `"nb_user: "` and
#   `"creating file "` is masked by `******`. The statements that define
#   `f_data_name` and decide whether the copy of step 1 happens are not
#   visible here -- restore them from version control before editing.
# NOTE(review): `error.filename == f_existing_data` compares a file name with
#   the object returned by gzip.open; `f_existing_data_name` looks like the
#   intended operand -- confirm. As written the RGIOError branch is
#   presumably never taken.
# NOTE(review): no close() call or `with` block is visible for
#   `f_existing_data` / `f_to_create_data`; the written gzip file may not be
#   flushed before the next iteration reads it -- confirm.
# NOTE(review): the files are opened in 'rb' mode while the regular
#   expression is a str pattern; under Python 3 the lines are bytes and
#   re.match raises TypeError -- confirm the targeted Python version.
# NOTE(review): the existing "TOCTTOU" remark refers to the os.path.exists
#   test, which is separated in time from the later gzip.open calls.
def generate_data_file(dir, nb_user=None): logging.debug("nb_user: "******"creating file " + str(f_data_name)) shutil.copy(BemolData.get_file_name(dir, None), f_data_name) # other files to generate nb_user_to_generate = [] current_nb_user = BemolData.get_nb_user_to_read(nb_user) logging.debug("current_nb_user before while: " + str(current_nb_user)) # !!!!! security failure TOCTTOU while (not os.path.exists(BemolData.get_file_name(dir, current_nb_user))): logging.debug("current_nb_user in while: " + str(current_nb_user)) nb_user_to_generate.append(current_nb_user) current_nb_user = BemolData.get_nb_user_to_read(current_nb_user+1) nb_user_to_generate.reverse() # generate other files for current_nb_user in nb_user_to_generate: # read data f_existing_data_name = BemolData.get_file_name(dir, current_nb_user+1) f_to_create_data_name = BemolData.get_file_name(dir, current_nb_user) logging.info("creating file " + f_to_create_data_name) dict_user = MyDictionary() try: f_existing_data = gzip.open(f_existing_data_name, 'rb') f_to_create_data = gzip.open(f_to_create_data_name, 'wb') i = 0 i_max = BemolData.get_nb_line(f_existing_data_name) for line in f_existing_data: Util.printIteration(i, 1000, i_max); i += 1 m = re.match("(\d+)\s(\d+)\s(\d+)\s(\d+)", line) if dict_user.index(int(m.group(1))) < current_nb_user: f_to_create_data.write(line) except IOError as error: if error.filename == f_existing_data: raise RGIOError(error, RGIOError.indent() + 'it disappeared in the meanwhile') else: raise error