Example #1
    def __getitem__(self, idx):
        try:
            x = next(
                pd.read_csv(self.filename,
                            skiprows=idx * self.chunksize + 1,
                            chunksize=self.chunksize,
                            header=None,
                            dtype=str)).fillna(NO_CONTEXT_WORD).values

            # something is broken here so just give filler
            if len(x[0]) != self.num_cols:
                # idx = max(0, idx-1)
                return self.__getitem__(np.random.randint(0, self.len))
        except Exception:  # comma-in-quotes rows break the default parse; retry with a regex separator
            x = next(
                pd.read_csv(self.filename,
                            skiprows=idx * self.chunksize + 1,
                            chunksize=self.chunksize,
                            header=None,
                            sep=r',\s+',
                            engine='python',  # a regex separator requires the python engine
                            quoting=csv.QUOTE_ALL,
                            dtype=str)).fillna(NO_CONTEXT_WORD).values

            x = np.array(fix_quote_strings(x[0, 0]))

        x_tokens = preprocess_tokens(tokenize_fine_grained(x[0, 0]),
                                     self.max_dim)
        y_tokens = preprocess_tokens(tokenize_fine_grained(x[0, 1]),
                                     self.max_dim)

        # x_tokens = [word2idx.get(token, UNKNOWN_IDX) for token in x_tokens]
        # y_tokens = [word2idx.get(token, UNKNOWN_IDX) for token in y_tokens]

        return x_tokens, y_tokens
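The __getitem__ above is only a fragment: it assumes the object already carries filename, chunksize, num_cols, max_dim and len attributes. The sketch below shows one plausible way to set those up and drive the dataset with a PyTorch DataLoader; the class name ChunkedCSVDataset, the constructor defaults and the DataLoader call are assumptions, not code from the original project.

from torch.utils.data import Dataset, DataLoader

NO_CONTEXT_WORD = '<no_context>'          # assumed fill value for empty cells

class ChunkedCSVDataset(Dataset):         # hypothetical wrapper around the __getitem__ above
    def __init__(self, filename, chunksize=1, num_cols=2, max_dim=128):
        self.filename = filename
        self.chunksize = chunksize
        self.num_cols = num_cols
        self.max_dim = max_dim
        # count data rows once so the random-index fallback stays cheap
        with open(filename) as f:
            self.len = sum(1 for _ in f) - 1

    def __len__(self):
        return self.len

    # __getitem__ as in Example #1 above

# loader = DataLoader(ChunkedCSVDataset('pairs.csv'), batch_size=32, shuffle=True)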
Example #2
    def get_line(self, idx, not_idx, data_list):
        if idx == not_idx:
            return self.get_line(np.random.randint(0, self.len), not_idx,
                                 data_list)
        try:
            x = self.read_pandas_line(idx)

            if len(x[0]) != self.num_cols:
                return self.get_line(np.random.randint(0, self.len), not_idx,
                                     data_list)
        except Exception:  # fall back to the quote-aware parse when the default parse fails
            x = self.read_pandas_line_quote(idx)

            x = np.array(
                fix_quote_strings(x[0, 0]) if self.retrieve_context else
                fix_quote_strings_context(x[0, 0], self.n_retrieved))

        x_tokens, y_tokens = self.words2tokens(x)

        data_list.append((x_tokens, y_tokens))
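This example and Example #3 lean on helpers that are not shown on this page: read_pandas_line, read_pandas_line_quote and words2tokens. The reconstruction below is inferred from their call sites and from Example #1, so the exact signatures, the NO_CONTEXT_WORD value and the stand-in tokenizer functions are assumptions rather than the project's real code.

import csv
import pandas as pd

NO_CONTEXT_WORD = '<no_context>'               # assumed fill value

def tokenize_fine_grained(text):               # stand-in for the project's tokenizer (assumed)
    return text.split()

def preprocess_tokens(tokens, max_dim):        # stand-in pad/truncate helper (assumed)
    return (tokens + ['<pad>'] * max_dim)[:max_dim]

class CSVLineHelpers:                          # hypothetical mixin holding the shared helpers
    def read_pandas_line(self, idx):
        # Read one chunk starting after the header row plus idx * chunksize data rows.
        return next(pd.read_csv(self.filename,
                                skiprows=idx * self.chunksize + 1,
                                chunksize=self.chunksize,
                                header=None,
                                dtype=str)).fillna(NO_CONTEXT_WORD).values

    def read_pandas_line_quote(self, idx):
        # Fallback parse for rows whose fields contain quoted, comma-separated text.
        return next(pd.read_csv(self.filename,
                                skiprows=idx * self.chunksize + 1,
                                chunksize=self.chunksize,
                                header=None,
                                sep=r',\s+',
                                engine='python',   # regex separators need the python engine
                                quoting=csv.QUOTE_ALL,
                                dtype=str)).fillna(NO_CONTEXT_WORD).values

    def words2tokens(self, x):
        # Tokenize the (source, target) pair and fix both sequences to max_dim tokens.
        x_tokens = preprocess_tokens(tokenize_fine_grained(x[0, 0]), self.max_dim)
        y_tokens = preprocess_tokens(tokenize_fine_grained(x[0, 1]), self.max_dim)
        return x_tokens, y_tokens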
Example #3
    def __getitem__(self, idx):
        query_list = []
        data_threads = []
        support_indices = np.random.randint(0, self.len, self.k_shot)
        for support_idx in support_indices:
            data_threads.append(
                Thread(target=self.get_line,
                       args=(support_idx, idx, query_list)))
            data_threads[-1].start()

        try:
            x = self.read_pandas_line(idx)

            # something is broken here so just give filler
            if len(x[0]) != self.num_cols:
                idx = max(0, idx - 1)
                return self.__getitem__(self.len - 1 if idx == 0 else idx)
        except Exception:  # fall back to the quote-aware parse when the default parse fails
            x = self.read_pandas_line_quote(idx)

            x = np.array(
                fix_quote_strings(x[0, 0]) if self.retrieve_context else
                fix_quote_strings_context(x[0, 0]))

        query_x, query_y = self.words2tokens(x)

        for dt in data_threads:
            dt.join()

        support_x, support_y = zip(*query_list)
        # support_x = torch.LongTensor(pd.DataFrame(support_x).values.astype('int64'))
        # support_y = torch.LongTensor(pd.DataFrame(support_x).values.astype('int64'))

        # query_x = torch.LongTensor(pd.DataFrame(query_x).values.astype('int64')).contiguous().view(1, -1)
        # query_y = torch.LongTensor(pd.DataFrame(query_y).values.astype('int64')).contiguous().view(1, -1)

        support_x = pd.DataFrame(support_x).values.astype('int64')
        support_y = pd.DataFrame(support_y).values.astype('int64')

        query_x = pd.DataFrame(query_x).values.astype('int64').reshape(1, -1)
        query_y = pd.DataFrame(query_y).values.astype('int64').reshape(1, -1)

        return support_x, support_y, query_x, query_y

    def get_line(self, idx, data_list, query=False):
        # if idx == not_idx:
        # return self.get_line(np.random.randint(0, self.len), not_idx)
        try:
            x = self.read_pandas_line(idx)

            if len(x[0]) != self.num_cols:
                if query:
                    idx = self.query_indices.pop(0)
                else:
                    idx = np.random.randint(0, self.split_idx)

                self.get_line(idx, data_list)
                return
        except Exception:  # fall back to the quote-aware parse when the default parse fails
            x = self.read_pandas_line_quote(idx)

            x = np.array(
                fix_quote_strings(x[0, 0]) if self.retrieve_context else
                fix_quote_strings_context(x[0, 0], self.n_retrieved))

        x_tokens, y_tokens = self.words2tokens(x)

        data_list.append((x_tokens, y_tokens))
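To make the shapes in Example #3 concrete, here is a small self-contained check of the support/query assembly using dummy token ids; k_shot=5 and max_dim=8 are arbitrary, and the final LongTensor conversion only mirrors the commented-out torch lines, it is not something the original code executes.

import pandas as pd
import torch

k_shot, max_dim = 5, 8                                        # arbitrary sizes for the demo
query_list = [(list(range(max_dim)), list(range(max_dim)))    # dummy (x_tokens, y_tokens) pairs
              for _ in range(k_shot)]
query_x = query_y = list(range(max_dim))                      # dummy query pair

support_x, support_y = zip(*query_list)
support_x = pd.DataFrame(support_x).values.astype('int64')    # shape (k_shot, max_dim)
support_y = pd.DataFrame(support_y).values.astype('int64')

query_x = pd.DataFrame(query_x).values.astype('int64').reshape(1, -1)   # shape (1, max_dim)
query_y = pd.DataFrame(query_y).values.astype('int64').reshape(1, -1)

# The commented-out lines in Example #3 suggest these arrays become LongTensors downstream:
support_x = torch.LongTensor(support_x)
query_x = torch.LongTensor(query_x).contiguous().view(1, -1)
print(support_x.shape, support_y.shape, query_x.shape, query_y.shape)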