def compare(self, others, scores, dtype=np.float16, plot=False):
    """Compare this object's score distributions against one or more others.

    Scores are computed for ``self`` and for every element of ``others``
    through ``compute``; each pair of same-named score arrays is then
    passed to ``dist_auc``.

    :param others: a single object exposing ``compute`` or an iterable of
        such objects (a non-iterable value is wrapped in a list)
    :param scores: score specification forwarded unchanged to ``compute``
    :param dtype: dtype forwarded to ``compute``
    :param plot: when true, show overlaid histograms of the non-NaN values
        of both score arrays for every score name
    :return: one dict per element of ``others``, mapping each score name
        to the value returned by ``dist_auc``
    :rtype: list
    """
    result0 = self.compute(scores, dtype=dtype)
    if not isiterable(others):
        others = [others]

    if plot:
        # Plotting packages are only needed on request, so they are
        # imported lazily. Neither the import nor the palette lookup
        # depends on the loop below, so both happen once.
        from matplotlib import pyplot as plt
        from palettable import colorbrewer
        colors = colorbrewer.get_map("Set1", "qualitative", 9).mpl_colors

    result_grid = []
    for other in others:
        result1 = other.compute(scores, dtype=dtype)
        result_row = {}
        # items() instead of iteritems(): works on both Python 2 and 3.
        for score_name, scores0 in result0.items():
            scores1 = result1[score_name]
            auc_score = dist_auc(scores0, scores1)
            result_row[score_name] = auc_score
            if plot:
                # NaN entries are excluded from the histograms only; the
                # AUC above is computed from the unfiltered arrays.
                scores0p = [x for x in scores0 if not np.isnan(x)]
                scores1p = [x for x in scores1 if not np.isnan(x)]
                hmin0, hmax0 = minmaxr(scores0p)
                hmin1, hmax1 = minmaxr(scores1p)
                # Shared bin edges so both histograms are comparable.
                bins = np.linspace(min(hmin0, hmin1), max(hmax0, hmax1), 50)
                plt.hist(scores0p, bins, alpha=0.5, label="0",
                         color=colors[0], edgecolor="none")
                plt.hist(scores1p, bins, alpha=0.5, label="1",
                         color=colors[1], edgecolor="none")
                plt.legend(loc="upper right")
                plt.title("%s: AUC=%.4f" % (score_name, auc_score))
                plt.show()
        result_grid.append(result_row)
    return result_grid
def __getitem__(self, index):
    """
    Get the item at a given index.

    If `index` is a slice, you will get back that slice of items. If it's
    the slice [:], exactly the same object is returned. (If you want an
    independent copy of an OrderedSet, use `OrderedSet.copy()`.)

    If `index` is an iterable, you'll get the OrderedSet of items
    corresponding to those indices. This is similar to NumPy's
    "fancy indexing".

    Raises TypeError when `index` is none of the supported kinds.
    """
    # Check the type before comparing: `==` between an arbitrary index
    # object and a slice is not guaranteed to yield a plain boolean
    # (array-like objects may compare elementwise).
    if isinstance(index, slice) and index == SLICE_ALL:
        return self
    elif hasattr(index, '__index__') or isinstance(index, slice):
        # Materialize the keys so positional indexing also works where
        # keys() returns a view instead of a list.
        result = list(self._mapping.keys())[index]
        if isinstance(result, list):
            return OrderedSet(result)
        else:
            return result
    elif isiterable(index):
        keys = list(self._mapping.keys())
        return OrderedSet([keys[i] for i in index])
    else:
        raise TypeError("Don't know how to index an OrderedSet by %r" %
                        index)
def get_shingles(self, input_text, prefix=None):
    """Build shingles from a source text.

    The input is normalized when a normalizer is configured, and tokenized
    unless the (normalized) input is already iterable. In non-unique mode
    with a positive ``kmin``, the token sequence is repeated cyclically
    when it is too short to provide ``kmin`` shingles.

    :param input_text: Input sequence
    :type input_text: collections.Iterable
    :param prefix: an object to prepend to token sequence
    :type prefix: object
    :return: shingles as a set in unique mode, otherwise as a list
    :rtype: set, list
    """
    normalizer = self._normalizer
    if normalizer is None:
        text = input_text
    else:
        text = normalizer.normalize(input_text)

    if isiterable(text):
        tokens = text
    else:
        tokens = self._tokenize(text)

    span = self._span
    unique = self._unique
    kmin = self._kmin
    has_prefix = prefix is not None

    if not unique and kmin > 0:
        # Number of shingles obtainable from the tokens (plus the prefix,
        # which counts as one extra token) and how many are still missing.
        n_tokens = len(tokens)
        available = n_tokens - span + (1 if has_prefix else 0) + 1
        shortfall = kmin - available
        if shortfall > 0:
            tokens = take(n_tokens + shortfall, cycle(tokens))

    if has_prefix:
        sequence = chain([prefix], tokens)
    else:
        sequence = tokens

    shingles = self._shinglify(sequence, span, skip=self._skip)
    return set(shingles) if unique else list(shingles)
def compute(self, scores, show_progress=False, dtype=np.float16):
    """Collect the requested scores from every item of ``iter_matrices``.

    :param scores: a single score name or an iterable of score names; each
        one is passed to ``get_score`` of every yielded item
    :param show_progress: when true, write a progress line to
        ``sys.stderr`` each time another 5% step has been reached
    :param dtype: dtype of the per-score result arrays
    :return: mapping from score name to an array of length ``self.n``
        indexed by the ``idx`` values from ``iter_matrices``; when
        ``get_score`` returns an iterable, each component ``j`` is stored
        under the key ``"<score>-<j>"``
    :rtype: collections.defaultdict
    """
    # Arrays are created on first access with np.empty, so any position
    # that is never assigned keeps arbitrary contents.
    result = defaultdict(partial(np.empty, (self.n, ), dtype=dtype))
    if not isiterable(scores):
        scores = [scores]
    # Index of the last 5% step that was reported. Comparing steps instead
    # of testing ``pct_done % 5 == 0`` keeps progress visible when the
    # percentage never lands exactly on a multiple of five.
    # NOTE(review): assumes idx values arrive in increasing order.
    last_step = -1
    for idx, conf in self.iter_matrices():
        if show_progress:
            pct_done = 100 * idx / float(self.n)
            step = int(pct_done // 5)
            if step > last_step:
                last_step = step
                sys.stderr.write("%d%% done\n" % pct_done)
        for score in scores:
            score_arr = conf.get_score(score)
            if isiterable(score_arr):
                for j, val in enumerate(score_arr):
                    result["%s-%d" % (score, j)][idx] = val
            else:
                result[score][idx] = score_arr
    return result
def compute(self, scores, show_progress=False, dtype=np.float16):
    """Evaluate scores on every item produced by ``iter_matrices``.

    :param scores: one score name or an iterable of score names, each
        handed to ``get_score`` of every yielded item
    :param show_progress: when true, report progress on ``sys.stderr``
        whenever a new 5% step is reached
    :param dtype: dtype used for the result arrays
    :return: mapping from score name to an array of length ``self.n``
        filled at the positions given by ``iter_matrices``; an iterable
        score value is spread over the keys ``"<score>-<j>"``
    :rtype: collections.defaultdict
    """
    # np.empty does not initialize memory: positions that are never
    # written keep arbitrary values.
    result = defaultdict(partial(np.empty, (self.n,), dtype=dtype))
    if not isiterable(scores):
        scores = [scores]
    # Track the last reported 5% step rather than requiring the float
    # percentage to be an exact multiple of five, which rarely happens
    # unless ``self.n`` divides evenly.
    # NOTE(review): assumes idx values arrive in increasing order.
    reported_step = -1
    for idx, conf in self.iter_matrices():
        if show_progress:
            pct_done = 100 * idx / float(self.n)
            current_step = int(pct_done // 5)
            if current_step > reported_step:
                reported_step = current_step
                sys.stderr.write("%d%% done\n" % pct_done)
        for score in scores:
            score_arr = conf.get_score(score)
            if isiterable(score_arr):
                for j, val in enumerate(score_arr):
                    result["%s-%d" % (score, j)][idx] = val
            else:
                result[score][idx] = score_arr
    return result
def split_or_whole(dirname):
    """Resolve ``dirname`` to a collection of file names.

    An iterable argument is returned unchanged. Any other argument is
    treated as a path that must exist; the result is then the list of
    entries matching ``part-*`` directly beneath it.

    :param dirname: a path, or an iterable that is passed through as is
    :return: the input iterable, or a list of matching paths
    :raises ValueError: if a non-iterable ``dirname`` does not exist
    """
    if isiterable(dirname):
        return dirname
    if not os.path.exists(dirname):
        raise ValueError(u"Specified path does not exist: {}".format(dirname))
    return glob.glob(os.path.join(dirname, 'part-*'))
def split_or_whole(dirname):
    """Return the ``part-*`` files under a path, or pass an iterable through.

    :param dirname: either an iterable (returned untouched) or a single
        path whose ``part-*`` children are looked up with ``glob``
    :return: the given iterable, or the list produced by ``glob.glob``
    :raises ValueError: if ``dirname`` is not iterable and the path does
        not exist
    """
    already_listed = isiterable(dirname)
    if already_listed:
        fnames = dirname
    elif os.path.exists(dirname):
        fnames = glob.glob(os.path.join(dirname, 'part-*'))
    else:
        raise ValueError(
            u"Specified path does not exist: {}".format(dirname))
    return fnames
def read_json_lines(finput, logger=logging, encoding='utf-8'):
    """Yield one decoded JSON object per line of the input.

    Lines that cannot be decoded or parsed are reported through
    ``logger.error`` and skipped.

    :param finput: an iterable, which is wrapped with ``joint_context``,
        or anything else, which is opened with ``open_gz(finput, 'r')``
    :param logger: object providing ``error``; defaults to the
        ``logging`` module
    :param encoding: encoding used to decode lines that arrive as bytes
    :rtype: generator
    """
    if isiterable(finput):
        ctx = joint_context(finput)
    else:
        ctx = open_gz(finput, 'r')
    with ctx as fhandle:
        for idx, line in enumerate(fhandle, start=1):
            try:
                # json.loads() no longer accepts an ``encoding`` keyword on
                # current Python versions, so byte strings are decoded
                # explicitly. UnicodeDecodeError is a ValueError subclass,
                # so undecodable lines are logged and skipped like
                # unparsable ones.
                if isinstance(line, bytes):
                    line = line.decode(encoding or 'utf-8')
                obj = json.loads(line)
            except ValueError as err:
                logger.error("Could not parse line %d: %s", idx, err)
                continue
            yield obj
def append_scores(cm, pairs, metrics):
    """Append ``(name, score)`` tuples for the given metrics to ``pairs``.

    :param cm: object whose ``get_score(metric)`` provides the values
    :param pairs: list extended in place; an iterable score contributes
        one ``("<metric>-<idx>", value)`` tuple per component, a
        non-iterable score contributes a single ``(metric, value)`` tuple
    :param metrics: iterable of metric names
    """
    for metric in metrics:
        try:
            scores = cm.get_score(metric)
        except AttributeError:
            # logging.warn is a deprecated alias; logging.warning emits
            # the same record.
            logging.warning("Method %s not defined", metric)
            continue
        if isiterable(scores):
            for idx, score in enumerate(scores):
                pairs.append(("%s-%d" % (metric, idx), score))
        else:
            pairs.append((metric, scores))
def read_json_lines(finput, logger=logging, show_progress=None):
    """Yield one parsed JSON object per line of the input.

    Unparsable lines are reported through ``logger.error`` and skipped.

    :param finput: an iterable, which is wrapped with ``joint_context``,
        or anything else, which is opened with ``open_gz(finput, 'r')``
    :param logger: object providing ``info`` and ``error``; defaults to
        the ``logging`` module
    :param show_progress: when truthy, an info message is logged every
        ``show_progress`` lines
    :rtype: generator
    """
    if isiterable(finput):
        ctx = joint_context(finput)
    else:
        ctx = open_gz(finput, 'r')

    with ctx as fhandle:
        line_no = 0
        for line in fhandle:
            line_no += 1
            # The truthiness test comes first so that a disabled
            # (None or 0) interval is never used as a modulus.
            if show_progress and line_no > 1 \
                    and line_no % show_progress == 0:
                logger.info("Processed %d lines", line_no)
            try:
                parsed = json.loads(line)
            except ValueError as err:
                logger.error("Could not parse line %d: %s", line_no, err)
            else:
                yield parsed
def pd_row_iter(datasets, chunksize=1000):
    """Iterate over rows of the given datasets, reading them in chunks

    Every dataset is read through ``read_tsv`` in chunk mode and each
    item produced by ``iterrows()`` of every chunk is yielded in order.

    @param datasets: a filename or file handle, or a list of them
    @param chunksize: how many lines to read at once
    """
    # a falsy chunk size (0, None) falls back to one line at a time
    chunksize = chunksize or 1
    sources = datasets if isiterable(datasets) else [datasets]
    for source in sources:
        reader = read_tsv(source, iterator=True, chunksize=chunksize)
        for chunk in reader:
            for row in chunk.iterrows():
                yield row
def buildParamGrid(gridSpec):
    """Expand a grid specification into every parameter combination.

    :param gridSpec: mapping from parameter name to either a single value
        or an iterable of candidate values; a falsy spec is treated as
        having no parameters
    :return: one entry per combination, each a list of ``(name, value)``
        tuples in the iteration order of ``gridSpec``; a spec without
        parameters yields a single empty combination
    :rtype: list
    """
    if gridSpec:
        ks, vs = zip(*gridSpec.items())
    else:
        ks, vs = (), ()
    # Wrap non-iterable values so every parameter contributes a sequence
    # of candidates to the Cartesian product.
    all_vals = [v if isiterable(v) else [v] for v in vs]
    # Builtin zip instead of itertools.izip: same pairs, and available on
    # both Python 2 and Python 3.
    return [list(zip(ks, vv)) for vv in product(*all_vals)]
def compare(self, others, scores, dtype=np.float16, plot=False):
    """Compute pairwise distribution AUCs between ``self`` and ``others``.

    ``compute`` is called on ``self`` once and on each element of
    ``others``; same-named score arrays are then compared with
    ``dist_auc``.

    :param others: one object exposing ``compute`` or an iterable of them
        (a non-iterable value is wrapped in a list)
    :param scores: score specification forwarded unchanged to ``compute``
    :param dtype: dtype forwarded to ``compute``
    :param plot: when true, display overlaid histograms of the non-NaN
        values of both arrays for each score name
    :return: list with one dict per element of ``others``; each dict maps
        a score name to the value returned by ``dist_auc``
    :rtype: list
    """
    result0 = self.compute(scores, dtype=dtype)
    if not isiterable(others):
        others = [others]

    if plot:
        # Optional plotting dependencies are imported only on demand.
        # The import and the palette lookup are loop-invariant, so they
        # are performed once rather than per compared object.
        from matplotlib import pyplot as plt
        from palettable import colorbrewer
        colors = colorbrewer.get_map('Set1', 'qualitative', 9).mpl_colors

    result_grid = []
    for other in others:
        result1 = other.compute(scores, dtype=dtype)
        result_row = {}
        # items() instead of iteritems(): works on both Python 2 and 3.
        for score_name, scores0 in result0.items():
            scores1 = result1[score_name]
            auc_score = dist_auc(scores0, scores1)
            result_row[score_name] = auc_score
            if plot:
                # Only the plotted samples are NaN-filtered; the AUC above
                # uses the arrays as returned by compute().
                scores0p = [x for x in scores0 if not np.isnan(x)]
                scores1p = [x for x in scores1 if not np.isnan(x)]
                hmin0, hmax0 = minmaxr(scores0p)
                hmin1, hmax1 = minmaxr(scores1p)
                # Common bin edges covering both samples.
                bins = np.linspace(min(hmin0, hmin1), max(hmax0, hmax1), 50)
                plt.hist(scores0p, bins, alpha=0.5, label='0',
                         color=colors[0], edgecolor="none")
                plt.hist(scores1p, bins, alpha=0.5, label='1',
                         color=colors[1], edgecolor="none")
                plt.legend(loc='upper right')
                plt.title("%s: AUC=%.4f" % (score_name, auc_score))
                plt.show()
        result_grid.append(result_row)
    return result_grid
def write_text_resource(foutput, text, encoding='utf-8'):
    """Write a text resource

    When ``foutput`` is a writable object, ``text`` is encoded with
    ``codecs.iterencode`` and the resulting chunks are written as they
    are (no newlines are added). Otherwise ``foutput`` is opened as a
    path; an iterable ``text`` is then written one item per line, and a
    non-iterable ``text`` is written in a single call.

    :param foutput: path or file handle
    :type foutput: str, file
    :param text: content to write
    :type text: str, unicode, iterable
    :param encoding: which encoding to use (default: UTF-8)
    :type encoding: str
    """
    # Duck-type the handle instead of testing for the Python 2-only
    # ``file`` builtin: any object with a write() method is accepted,
    # and the check also works where ``file`` does not exist.
    if hasattr(foutput, 'write'):
        for chunk in codecs.iterencode(text, encoding=encoding):
            foutput.write(chunk)
    else:
        with codecs.open(foutput, 'w', encoding=encoding) as fhandle:
            if isiterable(text):
                for line in text:
                    fhandle.write(u"%s\n" % line)
            else:
                fhandle.write(text)
def read_text_resource(finput, encoding='utf-8', ignore_prefix='#'):
    """Read a text resource, dropping comments and blank lines

    Each line is cut at the first occurrence of ``ignore_prefix`` (if one
    is given), stripped of surrounding whitespace, and yielded only when
    something remains.

    :param finput: path or file handle
    :type finput: str, file
    :param encoding: which encoding to use (default: UTF-8)
    :type encoding: str
    :param ignore_prefix: marker after which the rest of a line is
        discarded; pass None to keep lines intact
    :type ignore_prefix: str, unicode
    :rtype: generator
    """
    if isiterable(finput):
        decoded = codecs.iterdecode(finput, encoding=encoding)
        ctx = joint_context(decoded)
    else:
        ctx = codecs.open(finput, 'r', encoding=encoding)

    with ctx as fhandle:
        for raw_line in fhandle:
            content = raw_line
            if ignore_prefix is not None:
                # keep only the text in front of the marker
                content = content.partition(ignore_prefix)[0]
            content = content.strip()
            if content:
                yield content
def wrap_scalar(a):
    """Return ``a`` unchanged if it is iterable, else as a 1-tuple"""
    if isiterable(a):
        return a
    return (a,)
def wrap_scalar(a):
    """Wrap a non-iterable value in a one-element tuple

    Iterable arguments are handed back as they are.
    """
    if not isiterable(a):
        a = (a, )
    return a