Code Example #1
File: nlp.py  Project: Redwa/twip
def vocab_freq(docs, limit=1e6, verbose=1, tokenizer=generate_tokens):
    """Get the set of words used anywhere in a sequence of documents and count occurrences

    >>> gen = ('label: ' + chr(ord('A') + i % 3)*3 for i in range(11))
    >>> vocab_freq(gen, verbose=0)
    Counter({'label': 11, 'AAA': 4, 'BBB': 4, 'CCC': 3})
    """
    total = Counter()
    try:
        # Django-style querysets expose .count() and .iterator(); other iterables fall through
        limit = min(limit, docs.count())
        docs = docs.iterator()
    except (AttributeError, TypeError):
        pass
    for i, doc in enumerate(docs):
        try:
            doc = doc.values()
        except AttributeError:
            if not isinstance(doc, basestring):
                doc = ' '.join([stringify(v) for v in doc])
            else:
                doc = stringify(doc)
        if i >= limit:
            break
        c = Counter(tokenizer(doc, strip=True, nonwords=False))
        if verbose and (verbose < 1e-3 or not i % int(limit * verbose)):
            print('{}: {} ... {}'.format(
                i,
                c.keys()[:3],
                c.keys()[-3:] if len(c.keys()) > 6 else ''))
        total += c
    return total
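A minimal usage sketch, not from the repository: the try/except around docs.count() and docs.iterator() suggests docs may be a Django-style queryset, but any iterable of strings works. The import path twip.nlp and the whitespace tokenizer below are assumptions for illustration.

# Hypothetical usage sketch; the import path and tokenizer are assumed.
from twip.nlp import vocab_freq

tweets = ['a cat sat', 'a dog sat', 'a cat ran']

def whitespace_tokenizer(doc, strip=True, nonwords=False):
    # mimic the keyword interface vocab_freq passes to its tokenizer
    return doc.split()

counts = vocab_freq(tweets, verbose=0, tokenizer=whitespace_tokenizer)
print(counts.most_common(3))  # e.g. [('a', 3), ('cat', 2), ('sat', 2)]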
Code Example #2
File: nlp.py  Project: Redwa/twip
def friendly(obj):
    """Make the representation of an object (mainly numbers) more human friendly

    >>> friendly(1e6)
    '1M'
    >>> friendly(-1e3)
    '-1k'
    >>> friendly(1.23456789e9)
    '1.23G'
    >>> friendly(0)
    '0'
    >>> friendly('whatever')
    'whatever'
    """
    powers = 'TGMk munp'  # SI prefixes from tera down to pico; the space means no prefix
    if isinstance(obj, (float, int, long)):
        sign = 1 - 2 * int(obj < 0)
        obj = abs(obj)
        i = 0
        mid = 4  # index of the "no prefix" slot in `powers`
        # scale large magnitudes down by factors of 1000 (k, M, G, T)
        while np.inf > safe_log(obj, 1000) >= .9999999 and i <= mid and obj > 0 and np.isfinite(obj):
            obj = obj * .001
            i += 1
        # scale small magnitudes up by factors of 1000 (m, u, n, p)
        while np.inf > safe_log(obj, 1000) < -.25 and i > mid - len(powers) and obj > 0 and np.isfinite(obj):
            obj = obj * 1000.
            i -= 1
        return '{:.3g}{}'.format(sign * obj, powers[mid - i] if i else '')
    return stringify(obj)
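A quick demonstration sketch, not from the repository; the import path is assumed, and the exact outputs beyond the doctest above depend on safe_log, so they are printed rather than asserted.

# Hypothetical usage sketch; the import path is assumed.
from twip.nlp import friendly

for value in (1e6, -1e3, 1.23456789e9, 0.00042, 7):
    print(value, '->', friendly(value))
# values >= ~1000 get k/M/G/T suffixes; values well below 1 get m/u/n/p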
Code Example #3
File: nlp.py  Project: Redwa/twip
def make_filename(s,
                  allow_whitespace=False,
                  allow_underscore=False,
                  allow_hyphen=False,
                  limit=255,
                  lower=False):
    r"""Make sure the provided string is a valid filename, and optionally remove whitespace

    >>> make_filename('Not so great!')
    'Notsogreat'
    >>> make_filename('')
    'empty'
    >>> make_filename('EOF\x00 EOL\n')
    'EOFEOL'
    >>> make_filename('EOF\x00 EOL\n', allow_whitespace=True)
    'EOF EOL\n'
    """
    s = stringify(s)
    s = CRE_BAD_FILENAME.sub('', s)
    if not allow_whitespace:
        s = CRE_WHITESPACE.sub('', s)
    if lower:
        s = str.lower(s)
    if not allow_hyphen:
        s = s.replace('-', '')
    if not allow_underscore:
        s = s.replace('_', '')
    if limit is not None:
        s = s[:limit]
    return s or 'empty'[:limit]
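A minimal sketch of the keyword options, not from the repository; the import path is assumed, and the exact characters removed depend on CRE_BAD_FILENAME and CRE_WHITESPACE, so results are printed rather than asserted.

# Hypothetical usage sketch; the import path is assumed.
from twip.nlp import make_filename

raw = 'Quarterly Report: Q3_2015 draft!'
print(make_filename(raw))            # whitespace, hyphens, and underscores removed by default
print(make_filename(raw, allow_whitespace=True, allow_underscore=True, lower=True))
print(make_filename(raw, limit=10))  # result truncated to at most 10 characters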
Code Example #4
File: nlp.py  Project: Redwa/twip
def make_named_stemmer(stem=None, min_len=3):
    """Construct a callable object and a string sufficient to reconstruct it later (unpickling)

    >>> make_named_stemmer('str_lower')
    ('str_lower', <function str_lower at ...>)
    >>> make_named_stemmer('Lancaster')
    ('lancaster', <Stemmer object at ...>)
    """
    name, stem = stringify(stem), make_stemmer(stem=stem, min_len=min_len)
    if hasattr(stem, '__name__'):
        return stem.__name__, stem
    if name.strip().lower() in STEMMER_TYPES:
        return name.strip().lower(), stem
    if hasattr(stem, 'pattern'):
        return stem.pattern, stem
    return stringify(stem), stem
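The returned (name, stemmer) pair is meant to survive pickling: the string half can be fed back to make_stemmer to rebuild an equivalent stemmer. A small round-trip sketch, with the import path assumed:

# Hypothetical round-trip sketch; the import path is assumed.
from twip.nlp import make_named_stemmer, make_stemmer

name, stemmer = make_named_stemmer('Lancaster')  # -> ('lancaster', <Stemmer object ...>)
# store or pickle the lightweight `name` instead of the stemmer object, then later:
rebuilt = make_stemmer(name)                     # reconstruct an equivalent stemmer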
Code Example #5
File: nlp.py  Project: Redwa/twip
def make_stemmer(stem=None, min_len=3):
    """Build a nltk.stem.StemmerI instance from regex, named stemmer ('Lancaster', 'Porter', None), or function

    Arguments:
      min_len (int): Don't stem anything shorter than this, e.g. for min_len=4 don't stem the token "I'm" to "I"

    >>> make_stemmer()
    <Stemmer object at ...>
    >>> make_stemmer(str_lower)
    <function str_lower at ...>
    >>> make_stemmer('str_lower')
    <function str_lower at ...>
    >>> make_stemmer('Lancaster')
    <Stemmer object at ...>
    >>> make_stemmer('WordNet')
    <Stemmer object at ...>
    >>> make_stemmer('ing$|s$')
    <Stemmer object at ...>
    """
    if not stem or stem == 'passthrough':
        stem = Stemmer()
        # FIXME: this is unnecessary?! and will make the object less picklable?
        stem.stem = passthrough
        return stem
    if isinstance(stem, basestring):
        stem = globals().get(stem, None) or locals().get(stem, stem)
    # if stem is already a compiled regex, wrap it directly in an nltk RegexpStemmer
    if hasattr(stem, 'pattern'):
        stem = Stemmer(nltk.stem.RegexpStemmer(stem, min=min_len))
    if isinstance(stem, basestring):
        # normalize the name: strip non-ASCII and whitespace, lowercase, and default to 'porter'
        name = stringify(stem).lower().strip() or 'porter'
        if name in STEMMER_TYPES:
            dataset = STEMMER_DATASETS.get(name, None)
            if dataset is not None:
                nltk_download(dataset)
            stem = STEMMER_TYPES.get(name, stem)
        else:
            stem = Stemmer(nltk.stem.RegexpStemmer(stem, min=min_len))
    if isinstance(stem, type):
        stem = stem()
    if (hasattr(stem, 'stem') and callable(stem.stem)) or (hasattr(stem, 'lemmatize') and callable(stem.lemmatize) or stem is True):
        return Stemmer(stem)
    elif callable(stem):
        return stem
    raise ValueError("Unable to make {} into a stemmer. ".format(stem) +
                     "Try 'porter', 'lancaster', None, a regular expression, a callable function, " +
                     "or an object with a stem method.")
Code Example #6
File: nlp.py  Project: Redwa/twip
def nonnull_fields(obj, pretty=True):
    """Generate `.values()` dict from a table record, removing non-informative values

    Noninformative values include:
      date < 1970
      date > 2100
      False
      None
      0
      0.0
      '0'
      '0.0'
      '0.0000'
    """
    return PrettyDict(
        (k, v)
        for k, v in [(f.attname, getattr(obj, f.attname, None)) for f in obj._meta.fields]
        if (v and v not in NULL_VALUES
            and stringify(v).strip().lower()[:MAX_NULL_REPR_LEN] not in NULL_REPR_VALUES
            and not is_invalid_date(v)))
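The obj._meta.fields attribute implies a Django model instance. A minimal sketch, with the model and import path below purely hypothetical:

# Hypothetical usage sketch; the Tweet model and import path are assumed.
from twip.nlp import nonnull_fields
from myapp.models import Tweet  # any Django model works

record = Tweet.objects.first()
print(nonnull_fields(record))  # only the fields holding informative (non-null-ish) values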
Code Example #7
File: stats.py  Project: totalgood/pug-nlp
def infer_pos_label(neg_label=None):
    """Try to guess a positive classification label from a negative label
    Basis for an NLP function to find the "opposite" of a string
    (yes->no, true->false, etc.)
    >>> [infer_pos_label(x) for x in ('1', '-1', '0', '2', 1, 0, 'F', 'False', False, True)]
    ['0', '1', '1', '3', 0, 1, 'T', 'True', True, False]
    """
    # A class label should be None, a bool, an int, or a str
    if neg_label is None:
        return True
    typ = type(neg_label)
    # If class label isn't a bool or None then make it an int or str
    try:
        neg_label = int(float(neg_label))
        if neg_label in (0, 1):
            return typ(int(not neg_label))
        if neg_label < 0:
            return typ(-neg_label)
        return typ(neg_label + 1)
    except (TypeError, ValueError):
        # not numeric: fall back to the string label lookup tables below
        neg_label = stringify(neg_label).strip()
    for xform, label_dict in zip((lambda x: x, lambda x: x,       str.lower,        str.lower,),
                                 (POS_LABELS, POS_LABELS_INVERSE, POS_LABELS_LOWER, POS_LABELS_LOWER_INVERSE)):
        try:
            return typ(label_dict[xform(neg_label)])
        except KeyError:
            pass
    # neg_label = neg.lower()
    # for labels in (POS_LABELS_LOWER, POS_LABELS_LOWER_INVERSE):
    #     try:
    #         return typ(labels[neg_label])
    #     except KeyError:
    #         pass
    # if not neg_label:
    #     return True
    # neg = neg[0]
    # try:
    #     return POS_LABELS_LOWER_FIRST[neg]
    # except KeyError:
    #     pass
    return 'P'
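A small sketch of how the guess might be used when only the negative class of a binary target is known, not from the repository; the import path is assumed.

# Hypothetical usage sketch; the import path is assumed.
from pug.nlp.stats import infer_pos_label

labels = ['spam', 'ham', 'ham', 'spam']
neg = 'ham'
pos = infer_pos_label(neg)  # falls back to 'P' unless 'ham' appears in the label lookup tables
binary = [int(label != neg) for label in labels]
print(pos, binary)  # e.g. P [1, 0, 0, 1]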