Esempio n. 1
0
    def getGraphIterator(dir, nb_user=None, nb_purchases_per_it=None):
        """
        Build a graph iterator over the Bemol purchases stored in directory dir.

        * dir: directory containing the gzip-compressed data files.
        * nb_user: the dataset will only contain the $nb_user$ first users.
          None means all users (BemolData.nb_max_user()).
        * nb_purchases_per_it: forwarded to DatedPurchasesGraphListIterator,
          which uses it to split the purchases list in several iterations.
        * Not all the data is read: only the file selected by
          BemolData.get_file_name(dir, nb_user), i.e. the $10^n$ version of
          the data with $n$ big enough.

        Returns the DatedPurchasesGraphListIterator built on the purchases
        that were kept.
        Raises RGIOError when the data file cannot be opened or read.
        """
        BemolData.assert_nb_user(nb_user)
        if nb_user is None:
            nb_user = BemolData.nb_max_user()

        f_data_name = BemolData.get_file_name(dir, nb_user)
        purchasesList = []
        dict_user = MyDictionary()
        # Only the file access is guarded: the hint attached to RGIOError is
        # about a missing data file and would be misleading for other errors.
        try:
            with gzip.open(f_data_name, 'rb') as f_data:
                for line in f_data:
                    # Lines are whitespace-separated integers; the first one
                    # is the user identifier.
                    vals = [int(val) for val in line.split()]
                    if not vals:
                        # Blank line (e.g. trailing newline): nothing to keep.
                        continue
                    if dict_user.index(vals[0]) < nb_user:
                        purchasesList.append(vals)
        except IOError as error:
            raise RGIOError(error, RGIOError.indent() + 'consider running BemolData.generate_data_file(...)')
        logging.info("File read " + f_data_name)

        return DatedPurchasesGraphListIterator(purchasesList, nb_purchases_per_it)
Esempio n. 2
0
    def generate_data_file(dir, nb_user=None):
        """
        Generate, in directory dir, the gzip data files that do not exist yet
        for the requested number of users.

        Each missing file is produced by filtering the lines of the file named
        for current_nb_user+1: a line is kept when the index that
        MyDictionary.index gives to its first field is below current_nb_user.

        Raises RGIOError when the IOError's filename matches the source file
        (see the review note on that comparison below); any other IOError is
        re-raised unchanged.
        """
        # NOTE(review): the next statement looks corrupted in this copy: the
        # code between the two string literals is missing (presumably the
        # definition of f_data_name and the condition enclosing the copy
        # below) -- restore it from the original source.
        logging.debug("nb_user: "******"creating file " + str(f_data_name))
            # Copy the file named for nb_user=None to f_data_name.
            shutil.copy(BemolData.get_file_name(dir, None), f_data_name)

        # Collect the user counts whose data file still has to be generated.
        nb_user_to_generate = []
        current_nb_user = BemolData.get_nb_user_to_read(nb_user)
        logging.debug("current_nb_user before while: " + str(current_nb_user))
        # NOTE: time-of-check/time-of-use race (TOCTTOU): a file seen as
        # existing here may be gone when it is opened further down.
        # Walk through successive user counts until one has an existing file.
        while (not os.path.exists(BemolData.get_file_name(dir, current_nb_user))):
            logging.debug("current_nb_user in while: " + str(current_nb_user))
            nb_user_to_generate.append(current_nb_user)
            current_nb_user = BemolData.get_nb_user_to_read(current_nb_user+1)
        # Process the counts in the opposite order of discovery, presumably so
        # that each file is built after the file it is read from -- TODO confirm.
        nb_user_to_generate.reverse()

    
        # Generate each missing file from the file named for current_nb_user+1.
        for current_nb_user in nb_user_to_generate:
            # Source file to read and destination file to write.
            f_existing_data_name = BemolData.get_file_name(dir, current_nb_user+1)
            f_to_create_data_name = BemolData.get_file_name(dir, current_nb_user)
            logging.info("creating file " + f_to_create_data_name)
            dict_user = MyDictionary()
            try:
                # NOTE(review): neither gzip handle is closed in the visible
                # code -- confirm whether a cleanup follows in the full file.
                f_existing_data = gzip.open(f_existing_data_name, 'rb')
                f_to_create_data = gzip.open(f_to_create_data_name, 'wb')

                # i and i_max are only passed to the progress display.
                i = 0
                i_max = BemolData.get_nb_line(f_existing_data_name)
                for line in f_existing_data:
                    Util.printIteration(i, 1000, i_max); i += 1
                    # NOTE(review): the file is opened in 'rb' mode, so on
                    # Python 3 line is bytes while the pattern is a (non-raw)
                    # str, which makes re.match raise TypeError -- confirm the
                    # targeted Python version.
                    m = re.match("(\d+)\s(\d+)\s(\d+)\s(\d+)", line)
                    # Keep the line when the index of its first field is below
                    # current_nb_user.
                    if dict_user.index(int(m.group(1))) < current_nb_user:
                        f_to_create_data.write(line)
            except IOError as error:
                # NOTE(review): error.filename is compared with the file object
                # f_existing_data, not with f_existing_data_name; this looks
                # like it can never be equal -- confirm the intent.
                if error.filename == f_existing_data:
                    raise RGIOError(error, RGIOError.indent() + 'it disappeared in the meanwhile')
                else:
                    raise error