    def __getitem__(self, index):
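        """Return one (text, mask, features, labels) sample by index.

        The tokens CSV is read lazily in chunks: whenever `index` crosses a
        chunk boundary, the next chunk is pulled from the reader, truncated
        or padded to a common length, and cached as `self.tensors`; all
        following indices inside the chunk are served from that cache.
        """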

        # refresh the cached tensors whenever index crosses a chunk boundary
        if index % self.df_tokens_reader_original.chunksize == 0:
            sep_tok_id = 102  # [SEP] token id in BERT-style vocabularies
            if index == 0:
                self.count = 0
                # a new pass over the data starts: re-open the chunked CSV
                # reader (the original reader keeps the file path in `.f`)
                self.df_tokens_reader_current = pd.read_csv(
                    self.df_tokens_reader_original.f,
                    chunksize=self.df_tokens_reader_original.chunksize,
                    index_col=0,
                    header=0,
                )

                # fast-forward the reader past the first batches_to_skip chunks
                for _ in range(self.batches_to_skip):
                    self.df_tokens_reader_current.get_chunk()
            else:
                self.count += 1

            df_tokens_cache = self.df_tokens_reader_current.get_chunk()
            df_tokens_cache.columns = ['tokens']

            # shift the chunk's index back so it lines up with this dataset's
            # local sample index (accounting for the skipped chunks)
            df_tokens_cache.set_index(df_tokens_cache.index -
                                      self.batches_to_skip *
                                      self.df_tokens_reader_original.chunksize,
                                      inplace=True)

            # slice the feature/label frames to the rows of this chunk
            start = index
            end = start + self.df_tokens_reader_current.chunksize
            df_features_cache = self.df_features.iloc[start:end]
            df_label_cache = self.df_label.iloc[start:end]

            text_series = df_tokens_cache['tokens'].map(
                lambda x: x.split('\t'))
            max_len, is_capped = get_max_len_cap(text_series, self.cap)
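            # attention mask: 1 for real tokens, 0 for padding (set below)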
            attention_masks = np.ones((len(text_series), max_len),
                                      dtype=np.int8)
            if is_capped:
                # some sequences exceed max_len: truncate them (keeping a
                # trailing SEP token); pad the shorter ones with zeros
                for i in range(len(text_series)):
                    i_shifted = i + index
                    if len(text_series[i_shifted]) > max_len:
                        # pop down to max_len - 1 tokens, then re-append the
                        # SEP token so the final length is exactly max_len
                        while len(text_series[i_shifted]) >= max_len:
                            text_series[i_shifted].pop()
                        text_series[i_shifted].append(sep_tok_id)

                    elif len(text_series[i_shifted]) < max_len:
                        # zero-pad and mask out the padded positions
                        initial_len = len(text_series[i_shifted])
                        miss = max_len - initial_len
                        text_series[i_shifted] += [0] * miss
                        attention_masks[i, initial_len:max_len] = 0

            else:  # no sequence exceeds max_len: zero-pad everything
                for i in range(len(text_series)):
                    i_shifted = i + index
                    initial_len = len(text_series[i_shifted])
                    miss = max_len - initial_len
                    text_series[i_shifted] += [0] * miss
                    attention_masks[i, initial_len:max_len] = 0

            # column-wise feature matrix; .values.T is equivalent to the
            # previous per-column copy loop but avoids the temporary list
            feature_mat = df_features_cache.values.T

            # token strings -> one int32 array per sample
            text_series = text_series.map(
                lambda toks: np.array([int(t) for t in toks], dtype=np.int32))

            # print(f"text_series : {text_series}")
            # print(f"text_series type: {type(text_series)}")
            # print(f"text_series to numpy: {text_series.to_numpy()}")

            text_np_mat = np.stack(text_series)
            #print(f"text_np_mat :\n {text_np_mat}")
            #print(f"text_np_mat shape :\n {text_np_mat.shape}")
            #print(f"text_np_mat type : {type(text_np_mat)}")
            #print(f"text_np_mat dtype : {text_np_mat.dtype}")
            #print(f"text_np_mat 0 type : {type(text_np_mat[0])}")

            #print(f"text_np_mat 0 : {text_np_mat[0]}")
            #print(f"text_np_mat 0 dtype : {text_np_mat[0].dtype}")
            text_tensor = torch.tensor(text_np_mat, dtype=torch.int64)
            attention_masks = torch.tensor(attention_masks, dtype=torch.int8)

            # the two label columns become a (batch, 2) int8 tensor
            df_label_cache.columns = [0, 1]
            arr_0 = df_label_cache[0].astype(np.int8).values
            arr_1 = df_label_cache[1].astype(np.int8).values
            labels_mat = np.vstack([arr_0, arr_1])
            labels = torch.tensor(labels_mat.T, dtype=torch.int8)

            features = torch.tensor(feature_mat.T)
            self.tensors = [text_tensor, attention_masks, features, labels]

        # serve the sample from the cached tensors, mapping the global index
        # to a position within the current chunk
        return tuple(
            tensor[index -
                   self.count * self.df_tokens_reader_current.chunksize]
            for tensor in self.tensors)

    def __getitem__(self, index):
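        """Return one sample from a randomly subsampled chunk.

        Variant of the method above: only a `batch_subsample` fraction of
        each chunk is kept (rows chosen by a random boolean mask), and a
        single binary engagement label selected by `self.class_label`
        replaces the two-column label matrix.
        """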

        current_batch_size = self.df_tokens_reader_original.chunksize
        current_subsampled_batch_size = int(current_batch_size *
                                            self.batch_subsample)

        #print("index :", index)

        # refresh the cached tensors whenever index crosses a
        # subsampled-chunk boundary
        if index % current_subsampled_batch_size == 0:
            sep_tok_id = 102  # [SEP] token id in BERT-style vocabularies
            if index == 0:
                self.count = 0
                # a new pass over the data starts: re-open the chunked CSV
                # reader (the original reader keeps the file path in `.f`)
                self.df_tokens_reader_current = pd.read_csv(
                    self.df_tokens_reader_original.f,
                    chunksize=self.df_tokens_reader_original.chunksize,
                    index_col=0,
                    header=0,
                )
            else:
                self.count += 1

            df_tokens_cache = self.df_tokens_reader_current.get_chunk()
            df_tokens_cache.columns = ['tokens']

            # the last chunk can be shorter than chunksize, so recompute the
            # (subsampled) batch size from the chunk actually read
            current_batch_size = len(df_tokens_cache)
            current_subsampled_batch_size = int(current_batch_size *
                                                self.batch_subsample)

            # slice the feature/label frames to the rows of this chunk
            start = self.count * self.df_tokens_reader_current.chunksize
            end = start + current_batch_size
            df_features_cache = self.df_features.iloc[start:end]
            df_label_cache = self.df_label.iloc[start:end]

            if self.batch_subsample is not None:
                # random boolean mask that keeps exactly
                # current_subsampled_batch_size rows of the chunk
                mask = np.zeros(current_batch_size, dtype=int)
                mask[:current_subsampled_batch_size] = 1
                np.random.shuffle(mask)
                mask = mask.astype(bool)
                df_tokens_cache = df_tokens_cache[mask]
                df_features_cache = df_features_cache[mask]
                df_label_cache = df_label_cache[mask]
                # re-index the kept rows so they line up with the global
                # sample index
                new_index = pd.Series(
                    range(index, index + current_subsampled_batch_size))
                df_tokens_cache.set_index(new_index, inplace=True)
                df_features_cache.set_index(new_index, inplace=True)
                df_label_cache.set_index(new_index, inplace=True)
            else:
                # failing loudly beats silently returning None from __getitem__
                raise ValueError(
                    f"Invalid batch_subsample value: {self.batch_subsample!r}")

            text_series = df_tokens_cache['tokens'].map(
                lambda x: x.split('\t'))
            max_len, is_capped = get_max_len_cap(text_series, self.cap)
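            # attention mask: 1 for real tokens, 0 for padding (set below)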
            attention_masks = np.ones((len(text_series), max_len),
                                      dtype=np.int8)
            if is_capped:
                # some sequences exceed max_len: truncate them (keeping a
                # trailing SEP token); pad the shorter ones with zeros
                for i in range(len(text_series)):
                    i_shifted = i + index
                    if len(text_series[i_shifted]) > max_len:
                        # pop down to max_len - 1 tokens, then re-append the
                        # SEP token so the final length is exactly max_len
                        while len(text_series[i_shifted]) >= max_len:
                            text_series[i_shifted].pop()
                        text_series[i_shifted].append(sep_tok_id)

                    elif len(text_series[i_shifted]) < max_len:
                        # zero-pad and mask out the padded positions
                        initial_len = len(text_series[i_shifted])
                        miss = max_len - initial_len
                        text_series[i_shifted] += [0] * miss
                        attention_masks[i, initial_len:max_len] = 0

            else:  # no sequence exceeds max_len: zero-pad everything
                for i in range(len(text_series)):
                    i_shifted = i + index
                    initial_len = len(text_series[i_shifted])
                    miss = max_len - initial_len
                    text_series[i_shifted] += [0] * miss
                    attention_masks[i, initial_len:max_len] = 0

            # column-wise feature matrix; .values.T is equivalent to the
            # previous per-column copy loop but avoids the temporary list
            feature_mat = df_features_cache.values.T

            # token strings -> one int32 array per sample
            text_series = text_series.map(
                lambda toks: np.array([int(t) for t in toks], dtype=np.int32))

            # print(f"text_series : {text_series}")
            # print(f"text_series type: {type(text_series)}")
            # print(f"text_series to numpy: {text_series.to_numpy()}")

            text_np_mat = np.stack(text_series)
            #print(f"text_np_mat :\n {text_np_mat}")
            #print(f"text_np_mat shape :\n {text_np_mat.shape}")
            #print(f"text_np_mat type : {type(text_np_mat)}")
            #print(f"text_np_mat dtype : {text_np_mat.dtype}")
            #print(f"text_np_mat 0 type : {type(text_np_mat[0])}")

            #print(f"text_np_mat 0 : {text_np_mat[0]}")
            #print(f"text_np_mat 0 dtype : {text_np_mat[0].dtype}")
            text_tensor = torch.tensor(text_np_mat, dtype=torch.int64)
            attention_masks = torch.tensor(attention_masks, dtype=torch.int8)
            # single binary label for the selected engagement class
            labels = torch.tensor(
                df_label_cache[
                    f'tweet_feature_engagement_is_{self.class_label}'].map(
                        lambda x: 1 if x else 0).values,
                dtype=torch.int8)
            features = torch.tensor(feature_mat.T)
            self.tensors = [text_tensor, attention_masks, features, labels]

        # serve the sample from the cached tensors, mapping the global index
        # to a position within the current subsampled chunk
        return tuple(tensor[index - self.count * current_subsampled_batch_size]
                     for tensor in self.tensors)
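

# --- Illustrative sketches (assumptions, not part of the original code) ---

# `get_max_len_cap` is called above but not defined in this file. The sketch
# below is a reconstruction inferred purely from its call sites: it returns
# the padded sequence length (the longest sequence, capped at `cap`) plus a
# flag saying whether the cap was hit, i.e. whether truncation will be
# needed. Drop this if the real helper is imported elsewhere in the repo.
def get_max_len_cap(text_series, cap):
    # longest tokenised sequence in the chunk (assumes a non-empty series)
    longest = max(len(tokens) for tokens in text_series)
    if cap is not None and longest > cap:
        return cap, True   # some sequences exceed the cap: truncation needed
    return longest, False  # padding alone is enough

# Hypothetical usage (dataset construction is outside this section): because
# __getitem__ caches whole chunks and assumes monotonically increasing
# indices, the dataset has to be read sequentially, i.e. with shuffle=False
# and num_workers=0:
#
#     from torch.utils.data import DataLoader
#     loader = DataLoader(dataset, batch_size=64, shuffle=False,
#                         num_workers=0)
#     for text, attention_mask, features, labels in loader:
#         ...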