def __getitem__(self, idx):
    try:
        # Skip the header row plus every row before this example, then read
        # a single chunk of `chunksize` rows as raw strings.
        x = next(
            pd.read_csv(self.filename,
                        skiprows=idx * self.chunksize + 1,
                        chunksize=self.chunksize,
                        header=None,
                        dtype=str)).fillna(NO_CONTEXT_WORD).values
        # Something is broken in rows with the wrong column count, so give
        # filler by retrying on a random example instead.
        if len(x[0]) != self.num_cols:
            return self.__getitem__(np.random.randint(0, self.len))
    except Exception:
        # Broad fallback for malformed rows: re-read with a
        # whitespace-tolerant separator and full quoting, then repair the
        # quoted fields.
        x = next(
            pd.read_csv(self.filename,
                        skiprows=idx * self.chunksize + 1,
                        chunksize=self.chunksize,
                        header=None,
                        sep=r',\s+',
                        quoting=csv.QUOTE_ALL,
                        dtype=str)).fillna(NO_CONTEXT_WORD).values
        x = np.array(fix_quote_strings(x[0, 0]))
    x_tokens = preprocess_tokens(tokenize_fine_grained(x[0, 0]), self.max_dim)
    y_tokens = preprocess_tokens(tokenize_fine_grained(x[0, 1]), self.max_dim)
    # x_tokens = [word2idx.get(token, UNKNOWN_IDX) for token in x_tokens]
    # y_tokens = [word2idx.get(token, UNKNOWN_IDX) for token in y_tokens]
    return x_tokens, y_tokens
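# A minimal usage sketch, not from the original source: assuming this method
# lives on a torch.utils.data.Dataset subclass (called CsvPairDataset here
# purely for illustration) and that pandas as pd, numpy as np, torch, and csv
# are imported at module level, the (x_tokens, y_tokens) pairs can be batched
# with a standard DataLoader:
#
#     from torch.utils.data import DataLoader
#
#     dataset = CsvPairDataset('train.csv')  # hypothetical constructor
#     loader = DataLoader(dataset, batch_size=32, shuffle=True)
#     for x_tokens, y_tokens in loader:
#         pass  # feed each tokenized source/target pair to the model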
def words2tokens(self, x):
    # Build the input either from the retrieved-context columns or from the
    # single source column, then map every token to its vocabulary index.
    x_tokens = (preprocess_context(x, self.n_retrieved, self.max_dim)
                if self.retrieve_context else
                preprocess_tokens(tokenize_fine_grained(x[0, 0]), self.max_dim))
    y_tokens = preprocess_tokens(tokenize_fine_grained(x[0, 1]), self.max_dim)
    x_tokens = [word2idx.get(token, UNKNOWN_IDX) for token in x_tokens]
    y_tokens = [word2idx.get(token, UNKNOWN_IDX) for token in y_tokens]
    return x_tokens, y_tokens
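# A small illustration of the out-of-vocabulary fallback above, with a toy
# vocabulary that is an assumption, not the project's real word2idx:
# dict.get returns UNKNOWN_IDX for any token missing from the mapping, so
# unseen tokens become the unknown index instead of raising KeyError.
#
#     word2idx = {'<unk>': 0, 'def': 1, 'return': 2}
#     UNKNOWN_IDX = 0
#     [word2idx.get(t, UNKNOWN_IDX) for t in ['def', 'frobnicate', 'return']]
#     # -> [1, 0, 2]; 'frobnicate' falls back to UNKNOWN_IDX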
def __getitem__(self, idx):
    try:
        x = self.read_pandas_line(idx)
        # Something is broken in rows with the wrong column count, so give
        # filler by backing off to a neighboring example (wrapping to the
        # last example from index 0).
        if len(x[0]) != self.num_cols:
            idx = max(0, idx - 1)
            return self.__getitem__(self.len - 1 if idx == 0 else idx)
    except Exception:
        # Broad fallback for malformed rows: re-read with the quote-aware
        # reader and repair the quoted context fields.
        x = self.read_pandas_line_quote(idx)
        x = np.array(fix_quote_strings_context(x[0, 0], self.n_retrieved))

    # Column layout: query_x, then n_retrieved (support_x, support_y) pairs,
    # then query_y in the final column.
    query_x = [
        word2idx.get(token, UNKNOWN_IDX) for token in preprocess_tokens(
            tokenize_fine_grained(x[0, 0]), self.max_dim)
    ]
    support_list_x = []
    support_list_y = []
    for i in range(self.n_retrieved):
        support_list_x.append([
            word2idx.get(token, UNKNOWN_IDX) for token in preprocess_tokens(
                tokenize_fine_grained(x[0, i * 2 + 1]), self.max_dim)
        ])
        support_list_y.append([
            word2idx.get(token, UNKNOWN_IDX) for token in preprocess_tokens(
                tokenize_fine_grained(x[0, i * 2 + 2]), self.max_dim)
        ])
    query_y = [
        word2idx.get(token, UNKNOWN_IDX) for token in preprocess_tokens(
            tokenize_fine_grained(x[0, -1]), self.max_dim)
    ]

    # Stack the accumulated support lists into (n_retrieved, max_dim)
    # tensors and reshape each query into a (1, max_dim) row.
    support_x = torch.LongTensor(
        pd.DataFrame(support_list_x).values.astype('int64'))
    support_y = torch.LongTensor(
        pd.DataFrame(support_list_y).values.astype('int64'))
    query_x = torch.LongTensor(
        pd.DataFrame(query_x).values.astype('int64')).contiguous().view(1, -1)
    query_y = torch.LongTensor(
        pd.DataFrame(query_y).values.astype('int64')).contiguous().view(1, -1)
    return support_x, support_y, query_x, query_y
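# A minimal consumption sketch under stated assumptions: the class name
# RetrievedContextDataset and its constructor arguments are hypothetical,
# not from the original source. Each example already bundles its n_retrieved
# support pairs, so a batch_size-1 DataLoader suffices to inspect the shapes
# produced above:
#
#     from torch.utils.data import DataLoader
#
#     dataset = RetrievedContextDataset('train.csv', n_retrieved=5, max_dim=100)
#     loader = DataLoader(dataset, batch_size=1)
#     support_x, support_y, query_x, query_y = next(iter(loader))
#     # support_x, support_y: (1, n_retrieved, max_dim)
#     # query_x,   query_y:   (1, 1, max_dim)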