def generate_compounded_merger(self, f, width, ascending, inline):
    """Write the source text of ``sort_<width>v_merge_<suffix>`` to ``f``.

    The emitted function exchanges each register ``d<x>`` (``x`` in
    ``1..w2``) with its partner ``d<x + w1>`` through a min/max pair, then
    calls the narrower ``sort_<w1>v_merge_<suffix>`` and
    ``sort_<w2>v_merge_<suffix>`` routines.  ``w1`` is half of
    ``next_power_of_2(width)`` and ``w2`` is ``width - w1``.

    For ``int64_t``/``uint64_t`` element types an extra ``cmp`` variable is
    declared next to ``tmp``; for ``uint64_t`` a ``topBit`` constant is
    declared as well.

    Args:
        f: Writable text stream; every fragment is sent to it via ``print``.
        width: Integer width of the merger; registers ``d01`` through
            ``d<width>`` are referenced in the emitted body.
        ascending: Selects the ``ascending`` or ``descending`` name suffix.
            Within this method it influences nothing else.
        inline: Emit the ``INLINE`` qualifier when true, ``NOINLINE``
            otherwise.
    """
    elem_type = self.type

    # Integer halving; avoids the float round-trip of int(x / 2).
    w1 = int(next_power_of_2(width)) // 2
    w2 = width - w1

    suffix = "ascending" if ascending else "descending"
    qualifier = "INLINE" if inline else "NOINLINE"

    param_defs = self.generate_param_def_list(width)
    vec_type = self.vector_type()
    cmp_decl = ", cmp" if elem_type in ("int64_t", "uint64_t") else ""
    top_bit_decl = (
        f"\n {self.vector_type()} topBit = _mm256_set1_epi64x(1LLU << 63);"
        if elem_type == "uint64_t"
        else ""
    )

    # NOTE(review): spacing inside the emitted fragments is kept exactly as
    # found in this method; only the topBit declaration starts with an
    # explicit newline. Confirm the intended layout of the generated code.
    header = (
        f" static {qualifier} void sort_{width:02d}v_merge_{suffix}"
        f"({param_defs}) {{ {vec_type} tmp{cmp_decl};{top_bit_decl}"
    )
    print(header, file=f)

    for low_idx in range(1, w2 + 1):
        low = f"d{low_idx:02d}"
        high = f"d{low_idx + w1:02d}"
        # `tmp` keeps the old low value so the max can still see it after
        # the low register has been overwritten with the min.  Whatever
        # crappity_crap_crap returns is emitted ahead of each min/max
        # statement (presumably the compare feeding `cmp` - TODO confirm).
        exchange = (
            f" tmp = {low};"
            f" {self.crappity_crap_crap(high, low)}"
            f" {low} = {self.generate_min(high, low)};"
            f" {self.crappity_crap_crap(high, 'tmp')}"
            f" {high} = {self.generate_max(high, 'tmp')};"
        )
        print(exchange, file=f)

    tail = (
        f" sort_{w1:02d}v_merge_{suffix}({self.generate_param_list(1, w1)});"
        f" sort_{w2:02d}v_merge_{suffix}"
        f"({self.generate_param_list(w1 + 1, w2)});"
    )
    print(tail, file=f)
    print(" }", file=f)
def generate_compounded_merger(self, f, width, ascending, inline):
    """Write the source text of ``sort_<width>v_merge_<suffix>`` to ``f``.

    The emitted function exchanges each register ``d<x>`` (``x`` in
    ``1..w2``) with its partner ``d<x + w1>`` through a min/max pair, then
    calls the narrower ``sort_<w1>v_merge_<suffix>`` and
    ``sort_<w2>v_merge_<suffix>`` routines.  ``w1`` is half of
    ``next_power_of_2(width)`` and ``w2`` is ``width - w1``.

    Args:
        f: Writable text stream; every fragment is sent to it via ``print``.
        width: Integer width of the merger; registers ``d01`` through
            ``d<width>`` are referenced in the emitted body.
        ascending: Selects the ``ascending`` or ``descending`` name suffix.
            Within this method it influences nothing else.
        inline: Emit the ``INLINE`` qualifier when true, ``NOINLINE``
            otherwise.
    """
    # Integer halving; avoids the float round-trip of int(x / 2).
    w1 = int(next_power_of_2(width)) // 2
    w2 = width - w1

    suffix = "ascending" if ascending else "descending"
    qualifier = "INLINE" if inline else "NOINLINE"

    # NOTE(review): spacing inside the emitted fragments is kept exactly as
    # found in this method. Confirm the intended layout of the generated
    # code.
    header = (
        f" static {qualifier} void sort_{width:02d}v_merge_{suffix}"
        f"({self.generate_param_def_list(width)})"
        f" {{ {self.vector_type()} tmp;"
    )
    print(header, file=f)

    for low_idx in range(1, w2 + 1):
        low = f"d{low_idx:02d}"
        high = f"d{low_idx + w1:02d}"
        # `tmp` keeps the old low value so the max can still see it after
        # the low register has been overwritten with the min.
        exchange = (
            f" tmp = {low};"
            f" {low} = {self.generate_min(high, low)};"
            f" {high} = {self.generate_max(high, 'tmp')};"
        )
        print(exchange, file=f)

    tail = (
        f" sort_{w1:02d}v_merge_{suffix}({self.generate_param_list(1, w1)});"
        f" sort_{w2:02d}v_merge_{suffix}"
        f"({self.generate_param_list(w1 + 1, w2)});"
    )
    print(tail, file=f)
    print(" }", file=f)
def largest_merge_variant_needed(self):
    """Return half of ``next_power_of_2(self.max_bitonic_sort_vectors())``.

    The halving uses true division, so for ordinary int or float inputs the
    result is a float (e.g. ``8`` becomes ``4.0``).
    """
    vector_limit = self.max_bitonic_sort_vectors()
    rounded_up = next_power_of_2(vector_limit)
    return rounded_up / 2