Example #1
0
    def compare(self, others, scores, dtype=np.float16, plot=False):
        """Compare this object's score distributions with those of others.

        ``compute(scores, dtype=dtype)`` is called on ``self`` and on each
        element of ``others``. For every score name in this object's result,
        ``dist_auc`` is evaluated on the two corresponding score arrays.

        :param others: one object, or an iterable of objects, exposing
            ``compute``; a non-iterable value is wrapped in a list
        :param scores: score name(s), passed through to ``compute``
        :param dtype: dtype passed through to ``compute``
        :param plot: if true, show overlaid histograms of the non-NaN
            values of each pair of score arrays
        :return: one dict per element of ``others``, mapping score name to
            the value returned by ``dist_auc``
        :rtype: list
        """
        result0 = self.compute(scores, dtype=dtype)

        if not isiterable(others):
            others = [others]

        if plot:
            # Plotting dependencies are optional, so they are imported only
            # on request -- and once, rather than on every loop iteration.
            from matplotlib import pyplot as plt
            from palettable import colorbrewer

            colors = colorbrewer.get_map("Set1", "qualitative", 9).mpl_colors

        result_grid = []
        for other in others:
            result1 = other.compute(scores, dtype=dtype)

            result_row = {}
            # items() exists on both Python 2 and 3 (iteritems() does not).
            for score_name, scores0 in result0.items():
                scores1 = result1[score_name]
                auc_score = dist_auc(scores0, scores1)
                result_row[score_name] = auc_score
                if plot:
                    # NaN entries are dropped before binning
                    scores0p = [x for x in scores0 if not np.isnan(x)]
                    scores1p = [x for x in scores1 if not np.isnan(x)]
                    hmin0, hmax0 = minmaxr(scores0p)
                    hmin1, hmax1 = minmaxr(scores1p)
                    # shared bin edges so both histograms are comparable
                    bins = np.linspace(min(hmin0, hmin1), max(hmax0, hmax1), 50)
                    plt.hist(scores0p, bins, alpha=0.5, label="0", color=colors[0], edgecolor="none")
                    plt.hist(scores1p, bins, alpha=0.5, label="1", color=colors[1], edgecolor="none")
                    plt.legend(loc="upper right")
                    plt.title("%s: AUC=%.4f" % (score_name, auc_score))
                    plt.show()
            result_grid.append(result_row)
        return result_grid
Example #2
0
    def __getitem__(self, index):
        """
        Get the item at a given index.

        If `index` is a slice, you will get back that slice of items. If it's
        the slice [:], exactly the same object is returned. (If you want an
        independent copy of an OrderedSet, use `OrderedSet.copy()`.)

        If `index` is an iterable, you'll get the OrderedSet of items
        corresponding to those indices. This is similar to NumPy's
        "fancy indexing".

        Raises TypeError if `index` is none of the supported kinds.
        """
        # Only a slice can be the "everything" slice. Comparing other index
        # types against SLICE_ALL is unnecessary, and for objects whose `==`
        # is elementwise the result is not usable as a plain boolean.
        if isinstance(index, slice) and index == SLICE_ALL:
            return self
        elif hasattr(index, '__index__') or isinstance(index, slice):
            # Materialize the keys so that positional indexing also works
            # where keys() returns a non-subscriptable view.
            result = list(self._mapping.keys())[index]
            if isinstance(result, list):
                # a slice of keys: wrap it back into an OrderedSet
                return OrderedSet(result)
            else:
                return result
        elif isiterable(index):
            keys = list(self._mapping.keys())
            return OrderedSet([keys[i] for i in index])
        else:
            raise TypeError("Don't know how to index an OrderedSet by %r" % index)
Example #3
0
    def get_shingles(self, input_text, prefix=None):
        """Build shingles from a source text or token sequence

        The input goes through ``self._normalizer`` when one is set, and is
        tokenized with ``self._tokenize`` unless it is already iterable.

        :param input_text: Input sequence
        :type input_text: collections.Iterable
        :param prefix: an object to prepend to token sequence
        :type prefix: object
        :return: the output of ``self._shinglify`` collected into a set when
            ``self._unique`` is true, otherwise into a list
        :rtype: set, list
        """
        norm = self._normalizer
        if norm is None:
            text = input_text
        else:
            text = norm.normalize(input_text)

        if isiterable(text):
            tokens = text
        else:
            tokens = self._tokenize(text)

        span, unique, kmin = self._span, self._unique, self._kmin
        if not unique and kmin > 0:
            # repeat the tokens cyclically until kmin shingles fit
            n_tokens = len(tokens)
            n_prefix = 0 if prefix is None else 1
            available = n_tokens - span + n_prefix + 1
            shortfall = kmin - available
            if shortfall > 0:
                tokens = take(n_tokens + shortfall, cycle(tokens))

        if prefix is None:
            sequence = tokens
        else:
            sequence = chain([prefix], tokens)
        shingles = self._shinglify(sequence, span, skip=self._skip)
        if unique:
            return set(shingles)
        return list(shingles)
Example #4
0
    def __getitem__(self, index):
        """
        Get the item at a given index.

        If `index` is a slice, you will get back that slice of items. If it's
        the slice [:], exactly the same object is returned. (If you want an
        independent copy of an OrderedSet, use `OrderedSet.copy()`.)

        If `index` is an iterable, you'll get the OrderedSet of items
        corresponding to those indices. This is similar to NumPy's
        "fancy indexing".

        A TypeError is raised for any other kind of index.
        """
        # Test for the "everything" slice only when the index really is a
        # slice: `==` on other index types (for instance objects that
        # compare elementwise) need not produce a plain boolean.
        if isinstance(index, slice) and index == SLICE_ALL:
            return self
        elif hasattr(index, '__index__') or isinstance(index, slice):
            # keys() may be a view that cannot be indexed; copy to a list.
            keys = list(self._mapping.keys())
            result = keys[index]
            if isinstance(result, list):
                # slicing produced a list of keys -> return an OrderedSet
                return OrderedSet(result)
            else:
                return result
        elif isiterable(index):
            keys = list(self._mapping.keys())
            return OrderedSet([keys[i] for i in index])
        else:
            raise TypeError("Don't know how to index an OrderedSet by %r" %
                            index)
Example #5
0
    def get_shingles(self, input_text, prefix=None):
        """Turn a source text (or token sequence) into shingles

        ``self._normalizer``, when set, is applied first; the result is
        tokenized via ``self._tokenize`` only if it is not already iterable.

        :param input_text: Input sequence
        :type input_text: collections.Iterable
        :param prefix: an object to prepend to token sequence
        :type prefix: object
        :return: a set of the shingles from ``self._shinglify`` if
            ``self._unique`` is true, else a list of them
        :rtype: set, list

        """
        normalizer = self._normalizer
        if normalizer is None:
            normalized = input_text
        else:
            normalized = normalizer.normalize(input_text)
        token_seq = normalized if isiterable(normalized) \
            else self._tokenize(normalized)

        width = self._span
        want_unique = self._unique
        min_count = self._kmin
        if not want_unique and min_count > 0:
            # pad by cycling through the tokens until min_count shingles
            # can be taken
            total = len(token_seq)
            extra_for_prefix = 0 if prefix is None else 1
            missing = min_count - (total - width + extra_for_prefix + 1)
            if missing > 0:
                token_seq = take(total + missing, cycle(token_seq))

        stream = token_seq if prefix is None else chain([prefix], token_seq)
        produced = self._shinglify(stream, width, skip=self._skip)
        return set(produced) if want_unique else list(produced)
 def compute(self, scores, show_progress=False, dtype=np.float16):
     """Collect the requested scores from every matrix into arrays.

     :param scores: a score name or an iterable of score names; each is
         passed to ``get_score`` of the objects from ``iter_matrices``
     :param show_progress: if true, progress is written to stderr whenever
         the percentage done is an exact multiple of 5
     :param dtype: dtype of the result arrays
     :return: defaultdict mapping a score name (or ``"<name>-<j>"`` for the
         j-th component of an iterable score) to an array of length
         ``self.n`` created with ``np.empty``
     """
     make_column = partial(np.empty, (self.n, ), dtype=dtype)
     columns = defaultdict(make_column)
     score_names = scores if isiterable(scores) else [scores]
     for idx, conf in self.iter_matrices():
         if show_progress:
             percent = 100 * idx / float(self.n)
             if percent % 5 == 0:
                 sys.stderr.write("%d%% done\n" % percent)
         for name in score_names:
             value = conf.get_score(name)
             if not isiterable(value):
                 columns[name][idx] = value
                 continue
             # multi-valued score: one column per component
             for j, component in enumerate(value):
                 columns["%s-%d" % (name, j)][idx] = component
     return columns
Example #7
0
 def compute(self, scores, show_progress=False, dtype=np.float16):
     """Gather per-matrix scores into arrays keyed by score name.

     :param scores: one score name, or an iterable of them, each handed to
         ``get_score`` on every object yielded by ``iter_matrices``
     :param show_progress: when true, write "<pct>% done" to stderr each
         time the completed percentage is exactly divisible by 5
     :param dtype: dtype used for the ``np.empty`` result arrays
     :return: defaultdict from score name (``"<name>-<j>"`` for component j
         of an iterable score) to an array of length ``self.n``
     """
     table = defaultdict(partial(np.empty, (self.n,), dtype=dtype))
     if isiterable(scores):
         wanted = scores
     else:
         wanted = [scores]
     for row, matrix in self.iter_matrices():
         if show_progress:
             done = 100 * row / float(self.n)
             if done % 5 == 0:
                 sys.stderr.write("%d%% done\n" % done)
         for metric in wanted:
             outcome = matrix.get_score(metric)
             if isiterable(outcome):
                 # store each component under its own "<metric>-<j>" key
                 for j, part in enumerate(outcome):
                     key = "%s-%d" % (metric, j)
                     table[key][row] = part
             else:
                 table[metric][row] = outcome
     return table
Example #8
0
def split_or_whole(dirname):
    """Resolve ``dirname`` to a collection of file names.

    An iterable argument is returned unchanged. Anything else is treated
    as a path that must exist; the result is then the list of entries
    matching ``part-*`` directly beneath it.

    :raises ValueError: if a non-iterable ``dirname`` does not exist
    """
    if isiterable(dirname):
        return dirname
    if not os.path.exists(dirname):
        raise ValueError(u"Specified path does not exist: {}".format(dirname))
    return glob.glob(os.path.join(dirname, 'part-*'))
Example #9
0
def split_or_whole(dirname):
    """Return the file names denoted by ``dirname``.

    If ``dirname`` is iterable it is handed back as is; otherwise it must
    be an existing path, and the ``part-*`` entries inside it are globbed.

    :raises ValueError: when a non-iterable ``dirname`` is not an
        existing path
    """
    already_listed = isiterable(dirname)
    if already_listed:
        return dirname

    missing = not os.path.exists(dirname)
    if missing:
        message = u"Specified path does not exist: {}".format(dirname)
        raise ValueError(message)

    part_pattern = os.path.join(dirname, 'part-*')
    return glob.glob(part_pattern)
Example #10
0
def read_json_lines(finput, logger=logging, encoding='utf-8'):
    """Lazily yield the objects decoded from a JSON-lines input.

    :param finput: an iterable (wrapped with ``joint_context``) or any
        other value, which is opened via ``open_gz(finput, 'r')``
    :param logger: object whose ``error`` method receives parse failures
    :param encoding: forwarded to ``json.loads`` as ``encoding``
    :return: generator of decoded objects; a line that raises
        ``ValueError`` is logged with its 1-based number and skipped
    """
    # NOTE(review): json.loads no longer accepts the `encoding` keyword
    # from Python 3.9 on -- confirm the interpreter this module targets.
    if isiterable(finput):
        source = joint_context(finput)
    else:
        source = open_gz(finput, 'r')
    with source as lines:
        for lineno, raw in enumerate(lines, start=1):
            try:
                parsed = json.loads(raw, encoding=encoding)
            except ValueError as exc:
                logger.error("Could not parse line %d: %s", lineno, exc)
                continue
            yield parsed
Example #11
0
def read_json_lines(finput, logger=logging, encoding='utf-8'):
    """Generate one decoded object per parseable line of the input.

    :param finput: if iterable, passed to ``joint_context``; otherwise
        passed to ``open_gz`` with mode ``'r'``
    :param logger: receives an ``error`` call for each unparseable line
    :param encoding: handed to ``json.loads`` through its ``encoding``
        keyword
    :return: generator; lines on which ``json.loads`` raises
        ``ValueError`` are reported (numbered from 1) and skipped
    """
    # NOTE(review): the `encoding` keyword of json.loads was removed in
    # Python 3.9 -- verify against the supported Python versions.
    use_joint = isiterable(finput)
    context = joint_context(finput) if use_joint else open_gz(finput, 'r')
    with context as stream:
        line_number = 0
        for text in stream:
            line_number += 1
            try:
                decoded = json.loads(text, encoding=encoding)
            except ValueError as problem:
                logger.error("Could not parse line %d: %s",
                             line_number, problem)
                continue
            yield decoded
Example #12
0
def append_scores(cm, pairs, metrics):
    """Append ``(name, score)`` tuples for each metric to ``pairs`` in place.

    :param cm: object exposing ``get_score(metric)``
    :param pairs: list that receives the tuples; it is mutated
    :param metrics: iterable of metric names
    :return: None

    A metric whose ``get_score`` call raises ``AttributeError`` is logged
    as a warning and skipped. An iterable score contributes one tuple per
    component, named ``"<metric>-<idx>"``; any other score contributes a
    single ``(metric, score)`` tuple.
    """
    for metric in metrics:
        try:
            scores = cm.get_score(metric)
        except AttributeError:
            # logging.warn is a deprecated alias; logging.warning is the
            # supported spelling.
            logging.warning("Method %s not defined", metric)
            continue
        if isiterable(scores):
            pairs.extend(
                ("%s-%d" % (metric, idx), score)
                for idx, score in enumerate(scores)
            )
        else:
            pairs.append((metric, scores))
Example #13
0
def read_json_lines(finput, logger=logging, show_progress=None):
    """Lazily yield the objects decoded from a JSON-lines input.

    :param finput: an iterable (wrapped with ``joint_context``) or any
        other value, which is opened via ``open_gz(finput, 'r')``
    :param logger: object whose ``info`` / ``error`` methods receive
        progress messages and parse failures
    :param show_progress: if truthy, a progress message is logged at every
        line whose 1-based number is a multiple of this value (line 1
        excepted)
    :return: generator of decoded objects; a line that raises
        ``ValueError`` is logged and skipped
    """
    if isiterable(finput):
        source = joint_context(finput)
    else:
        source = open_gz(finput, 'r')
    with source as lines:
        for lineno, raw in enumerate(lines, start=1):
            if show_progress and lineno % show_progress == 0 and lineno > 1:
                logger.info("Processed %d lines", lineno)
            try:
                parsed = json.loads(raw)
            except ValueError as exc:
                logger.error("Could not parse line %d: %s", lineno, exc)
                continue
            yield parsed
Example #14
0
def read_json_lines(finput, logger=logging, show_progress=None):
    """Generate one decoded object per parseable line of the input.

    :param finput: if iterable, passed to ``joint_context``; otherwise
        passed to ``open_gz`` with mode ``'r'``
    :param logger: gets ``info`` calls for progress and ``error`` calls
        for unparseable lines
    :param show_progress: reporting interval in lines; falsy disables
        progress messages
    :return: generator; lines on which ``json.loads`` raises
        ``ValueError`` are reported (numbered from 1) and skipped
    """
    use_joint = isiterable(finput)
    context = joint_context(finput) if use_joint else open_gz(finput, 'r')
    with context as stream:
        line_number = 0
        for text in stream:
            line_number += 1
            if show_progress and line_number % show_progress == 0 \
                    and line_number > 1:
                logger.info("Processed %d lines", line_number)
            try:
                decoded = json.loads(text)
            except ValueError as problem:
                logger.error("Could not parse line %d: %s",
                             line_number, problem)
                continue
            yield decoded
Example #15
0
def pd_row_iter(datasets, chunksize=1000):
    """Iterate over data frame rows while reading the
    underlying files chunk by chunk

    @param datasets: a list of filenames or file handles (a single
        non-iterable value is wrapped in a list)
    @param chunksize: how many lines to read at once; a falsy value is
        replaced by 1
    @return: generator of the items produced by ``iterrows()`` on every
        chunk returned by ``read_tsv``
    """
    # a falsy chunksize (0, None) falls back to one line per chunk
    lines_per_chunk = chunksize or 1
    sources = datasets if isiterable(datasets) else [datasets]
    for source in sources:
        reader = read_tsv(source, iterator=True, chunksize=lines_per_chunk)
        for frame in reader:
            for record in frame.iterrows():
                yield record
Example #16
0
def pd_row_iter(datasets, chunksize=1000):
    """Yield rows from one or more datasets, reading each
    from disk in chunks rather than all at once

    @param datasets: a list of filenames or file handles; a lone
        non-iterable value is treated as a one-element list
    @param chunksize: how many lines to read at once (falsy means 1)
    @return: generator over whatever ``iterrows()`` yields for each chunk
        that ``read_tsv`` produces
    """
    if isiterable(datasets):
        inputs = datasets
    else:
        inputs = [datasets]
    # guard against chunk sizes such as 0 or None
    step = 1 if not chunksize else chunksize
    for item in inputs:
        chunks = read_tsv(item, iterator=True, chunksize=step)
        for block in chunks:
            rows = block.iterrows()
            for entry in rows:
                yield entry
Example #17
0
def buildParamGrid(gridSpec):
    """Expand a parameter specification into every combination of values.

    :param gridSpec: mapping from parameter name to either an iterable of
        candidate values or a single value (treated as the only candidate);
        a falsy ``gridSpec`` is treated as having no parameters
    :return: a list with one entry per combination, each entry being a list
        of ``(name, value)`` tuples in the iteration order of ``gridSpec``;
        with no parameters the result is ``[[]]`` (the single empty
        combination)
    :rtype: list
    """
    ks, vs = zip(*gridSpec.items()) if gridSpec else ((), ())

    # ensure every value set is iterable so it can be fed to product()
    all_vals = [v if isiterable(v) else [v] for v in vs]

    # builtin zip behaves the same on Python 2 and 3 here, so the
    # Python-2-only izip is not needed
    return [list(zip(ks, vv)) for vv in product(*all_vals)]
    def compare(self, others, scores, dtype=np.float16, plot=False):
        """Compute a distribution AUC per score between self and others.

        ``compute(scores, dtype=dtype)`` is run on ``self`` and on every
        element of ``others``; ``dist_auc`` is then applied, score name by
        score name, to the arrays from ``self`` and from the other object.

        :param others: a single object or an iterable of objects that
            provide ``compute``; a non-iterable value is wrapped in a list
        :param scores: score name(s), forwarded to ``compute``
        :param dtype: dtype forwarded to ``compute``
        :param plot: if true, display overlaid histograms of the non-NaN
            values for each score
        :return: list with one dict per element of ``others``; each dict
            maps score name to the ``dist_auc`` result
        :rtype: list
        """
        result0 = self.compute(scores, dtype=dtype)

        if not isiterable(others):
            others = [others]

        if plot:
            # Import the optional plotting stack only when requested, and
            # resolve the palette once instead of once per compared object.
            from matplotlib import pyplot as plt
            from palettable import colorbrewer
            colors = colorbrewer.get_map('Set1', 'qualitative',
                                         9).mpl_colors

        result_grid = []
        for other in others:
            result1 = other.compute(scores, dtype=dtype)

            result_row = {}
            # items() is available on Python 2 and 3; iteritems() is not.
            for score_name, scores0 in result0.items():
                scores1 = result1[score_name]
                auc_score = dist_auc(scores0, scores1)
                result_row[score_name] = auc_score
                if plot:
                    # histogram only the values that are not NaN
                    scores0p = [x for x in scores0 if not np.isnan(x)]
                    scores1p = [x for x in scores1 if not np.isnan(x)]
                    hmin0, hmax0 = minmaxr(scores0p)
                    hmin1, hmax1 = minmaxr(scores1p)
                    # common bin edges spanning both samples
                    bins = np.linspace(min(hmin0, hmin1), max(hmax0, hmax1),
                                       50)
                    plt.hist(scores0p,
                             bins,
                             alpha=0.5,
                             label='0',
                             color=colors[0],
                             edgecolor="none")
                    plt.hist(scores1p,
                             bins,
                             alpha=0.5,
                             label='1',
                             color=colors[1],
                             edgecolor="none")
                    plt.legend(loc='upper right')
                    plt.title("%s: AUC=%.4f" % (score_name, auc_score))
                    plt.show()
            result_grid.append(result_row)
        return result_grid
Example #19
0
def write_text_resource(foutput, text, encoding='utf-8'):
    """Write text to a path or to an already open file

    When ``foutput`` is a ``file`` object, ``text`` is encoded
    incrementally with ``codecs.iterencode`` and the chunks are written as
    they come. Otherwise ``foutput`` is opened for writing with
    ``codecs.open``: an iterable ``text`` is written one item per line
    (each followed by a newline), anything else is written as is.

    :param foutput: path or file handle
    :type foutput: str, file
    :param text: content to write
    :type text: str, unicode, iterable
    :param encoding: which encoding to use (default: UTF-8)
    :type encoding: str
    """
    if not isinstance(foutput, file):
        with codecs.open(foutput, 'w', encoding=encoding) as sink:
            if not isiterable(text):
                sink.write(text)
            else:
                for entry in text:
                    sink.write(u"%s\n" % entry)
        return
    for piece in codecs.iterencode(text, encoding=encoding):
        foutput.write(piece)
Example #20
0
def write_text_resource(foutput, text, encoding='utf-8'):
    """Store text in a file given by path or by handle

    A ``file`` instance receives the chunks that ``codecs.iterencode``
    produces from ``text``. Any other ``foutput`` is opened with
    ``codecs.open`` in write mode; iterable ``text`` is then emitted item
    by item with a trailing newline each, non-iterable ``text`` in a
    single write.

    :param foutput: path or file handle
    :type foutput: str, file
    :param text: content to write
    :type text: str, unicode, iterable
    :param encoding: which encoding to use (default: UTF-8)
    :type encoding: str
    """
    have_handle = isinstance(foutput, file)
    if have_handle:
        encoded_chunks = codecs.iterencode(text, encoding=encoding)
        for encoded in encoded_chunks:
            foutput.write(encoded)
    else:
        with codecs.open(foutput, 'w', encoding=encoding) as out:
            if isiterable(text):
                for item in text:
                    out.write(u"%s\n" % item)
            else:
                out.write(text)
Example #21
0
def read_text_resource(finput, encoding='utf-8', ignore_prefix='#'):
    """Yield the non-empty, stripped lines of a text resource

    Every line is cut at the first occurrence of ``ignore_prefix``, so
    trailing comments are removed as well as whole comment lines; what
    remains is stripped of surrounding whitespace and yielded unless empty.

    :param finput: path or file handle; an iterable is decoded through
        ``codecs.iterdecode``, anything else is opened with ``codecs.open``
    :type finput: str, file
    :param encoding: which encoding to use (default: UTF-8)
    :type encoding: str
    :param ignore_prefix: marker after which the rest of a line is
        discarded; ``None`` disables the cut
    :type ignore_prefix: str, unicode
    :rtype: generator
    """
    if isiterable(finput):
        source = joint_context(codecs.iterdecode(finput, encoding=encoding))
    else:
        source = codecs.open(finput, 'r', encoding=encoding)
    with source as lines:
        for raw in lines:
            if ignore_prefix is None:
                content = raw
            else:
                content = raw.split(ignore_prefix)[0]
            content = content.strip()
            if not content:
                continue
            yield content
Example #22
0
def read_text_resource(finput, encoding='utf-8', ignore_prefix='#'):
    """Read a text resource, dropping comments and blank lines

    The part of each line from the first ``ignore_prefix`` onwards is
    thrown away (this covers both full-line and end-of-line comments);
    the remainder is whitespace-stripped and yielded if anything is left.

    :param finput: path or file handle; iterables go through
        ``codecs.iterdecode``, other values through ``codecs.open``
    :type finput: str, file
    :param encoding: which encoding to use (default: UTF-8)
    :type encoding: str
    :param ignore_prefix: comment marker; with ``None`` lines are only
        stripped
    :type ignore_prefix: str, unicode
    :rtype: generator
    """
    from_iterable = isiterable(finput)
    context = joint_context(codecs.iterdecode(finput, encoding=encoding)) \
        if from_iterable \
        else codecs.open(finput, 'r', encoding=encoding)
    keep_everything = ignore_prefix is None
    with context as stream:
        for text in stream:
            if not keep_everything:
                pieces = text.split(ignore_prefix)
                text = pieces[0]
            cleaned = text.strip()
            if cleaned:
                yield cleaned
Example #23
0
def wrap_scalar(a):
    """Return ``a`` unchanged if it is iterable, else as a 1-tuple"""
    if isiterable(a):
        return a
    return (a,)
Example #24
0
def wrap_scalar(a):
    """Wrap a non-iterable value in a one-element tuple

    Iterable arguments are passed through untouched.
    """
    if not isiterable(a):
        wrapped = (a, )
        return wrapped
    return a