def vocab_freq(docs, limit=1e6, verbose=1, tokenizer=generate_tokens):
    """Count occurrences of every token used anywhere in a sequence of documents

    Args:
      docs: iterable of documents; each doc may be a string, a mapping (its
        `.values()` are used), or another iterable of values. A Django-style
        queryset is also accepted (`.count()`/`.iterator()` used when present).
      limit (int): maximum number of documents to process
      verbose (float): 0 to silence progress printouts; otherwise the fraction
        of `limit` between printouts (values < 1e-3 print every document)
      tokenizer (callable): called as ``tokenizer(doc, strip=True, nonwords=False)``

    Returns:
      collections.Counter: mapping of token -> total occurrence count

    >>> gen = ('label: ' + chr(ord('A') + i % 3)*3 for i in range(11))
    >>> vocab_freq(gen, verbose=0)
    Counter({'label': 11, 'AAA': 4, 'BBB': 4, 'CCC': 3})
    """
    total = Counter()
    try:
        # Django queryset: bound the limit by the row count and stream rows
        limit = min(limit, docs.count())
        docs = docs.iterator()
    except (AttributeError, TypeError):
        # not a queryset: generators lack .count() (AttributeError) and
        # list.count() requires an argument (TypeError)
        pass
    for i, doc in enumerate(docs):
        try:
            doc = doc.values()
        except AttributeError:
            if not isinstance(doc, basestring):
                doc = ' '.join([stringify(v) for v in doc])
            else:
                doc = stringify(doc)
        if i >= limit:
            break
        c = Counter(tokenizer(doc, strip=True, nonwords=False))
        # `or 1` avoids ZeroDivisionError when int(limit * verbose) == 0
        if verbose and (verbose < 1e-3 or not i % (int(limit * verbose) or 1)):
            keys = list(c.keys())  # list() so slicing works on Python 3
            print('{}: {} ... {}'.format(i, keys[:3], keys[-3:] if len(keys) > 6 else ''))
        total += c
    return total
def vocab_freq(docs, limit=1e6, verbose=1, tokenizer=generate_tokens):
    """Count occurrences of every token used anywhere in a sequence of documents

    Args:
      docs: iterable of documents; each doc may be a string, a mapping (its
        `.values()` are used), or another iterable of values. A Django-style
        queryset is also accepted (`.count()`/`.iterator()` used when present).
      limit (int): maximum number of documents to process
      verbose (float): 0 to silence progress printouts; otherwise the fraction
        of `limit` between printouts (values < 1e-3 print every document)
      tokenizer (callable): called as ``tokenizer(doc, strip=True, nonwords=False)``

    Returns:
      collections.Counter: mapping of token -> total occurrence count

    >>> gen = ('label: ' + chr(ord('A') + i % 3)*3 for i in range(11))
    >>> vocab_freq(gen, verbose=0)
    Counter({'label': 11, 'AAA': 4, 'BBB': 4, 'CCC': 3})
    """
    total = Counter()
    try:
        # Django queryset: bound the limit by the row count and stream rows
        limit = min(limit, docs.count())
        docs = docs.iterator()
    except (AttributeError, TypeError):
        # not a queryset: generators lack .count() (AttributeError) and
        # list.count() requires an argument (TypeError)
        pass
    for i, doc in enumerate(docs):
        try:
            doc = doc.values()
        except AttributeError:
            if not isinstance(doc, basestring):
                doc = ' '.join([stringify(v) for v in doc])
            else:
                doc = stringify(doc)
        if i >= limit:
            break
        c = Counter(tokenizer(doc, strip=True, nonwords=False))
        # `or 1` avoids ZeroDivisionError when int(limit * verbose) == 0
        if verbose and (verbose < 1e-3 or not i % (int(limit * verbose) or 1)):
            keys = list(c.keys())  # list() so slicing works on Python 3
            print('{}: {} ... {}'.format(i, keys[:3], keys[-3:] if len(keys) > 6 else ''))
        total += c
    return total
def friendly(obj):
    """Render an object (mainly numbers) in human-friendly form with SI suffixes

    >>> friendly(1e6)
    '1M'
    >>> friendly(-1e3)
    '-1k'
    >>> friendly(1.23456789e9)
    '1.23G'
    >>> friendly(0)
    '0'
    >>> friendly('whatever')
    'whatever'
    """
    # non-numeric objects are just stringified
    if not isinstance(obj, (float, int, long)):
        return stringify(obj)
    suffixes = 'TGMk munp'  # index 4 (the space) means "no suffix"
    sign = -1 if obj < 0 else 1
    obj = abs(obj)
    step, center = 0, 4
    # scale large magnitudes down by factors of 1000 (towards 'T')
    while np.inf > safe_log(obj, 1000) >= .9999999 and step <= center and obj > 0 and np.isfinite(obj):
        obj = obj * .001
        step += 1
    # scale small magnitudes up by factors of 1000 (towards 'p')
    while np.inf > safe_log(obj, 1000) < -.25 and step > center - len(suffixes) and obj > 0 and np.isfinite(obj):
        obj = obj * 1000.
        step -= 1
    return '{:.3g}{}'.format(sign * obj, suffixes[center - step] if step else '')
def make_filename(s, allow_whitespace=False, allow_underscore=False, allow_hyphen=False, limit=255, lower=False):
    r"""Make sure the provided string is a valid filename, and optionally remove whitespace

    Args:
      s: value to sanitize (stringified first)
      allow_whitespace (bool): keep whitespace characters instead of stripping them
      allow_underscore (bool): keep '_' characters
      allow_hyphen (bool): keep '-' characters
      limit (int or None): truncate the result to this many characters (None = no limit)
      lower (bool): lowercase the result

    Returns:
      str: sanitized filename; 'empty' (possibly truncated) if nothing survives

    >>> make_filename('Not so great!')
    'Notsogreat'
    >>> make_filename('')
    'empty'
    >>> make_filename('EOF\x00 EOL\n')
    'EOFEOL'
    >>> make_filename('EOF\x00 EOL\n', allow_whitespace=True)
    'EOF EOL\n'
    """
    s = stringify(s)
    s = CRE_BAD_FILENAME.sub('', s)
    if not allow_whitespace:
        s = CRE_WHITESPACE.sub('', s)
    if lower:
        # was str.lower(s), which raises TypeError for unicode under Python 2;
        # the bound method works for both str and unicode
        s = s.lower()
    if not allow_hyphen:
        s = s.replace('-', '')
    if not allow_underscore:
        s = s.replace('_', '')
    if limit is not None:
        s = s[:limit]
    return s or 'empty'[:limit]
def make_named_stemmer(stem=None, min_len=3):
    """Construct a callable object and a string sufficient to reconstruct it later (unpickling)

    >>> make_named_stemmer('str_lower')
    ('str_lower', <function str_lower at ...>)
    >>> make_named_stemmer('Lancaster')
    ('lancaster', <Stemmer object at ...>)
    """
    # remember the requested name before it is turned into a stemmer object
    name = stringify(stem)
    stem = make_stemmer(stem=stem, min_len=min_len)
    # plain functions carry their own reconstructable name
    if hasattr(stem, '__name__'):
        return stem.__name__, stem
    # normalized name of a known stemmer type (e.g. 'lancaster')
    normalized = name.strip().lower()
    if normalized in STEMMER_TYPES:
        return normalized, stem
    # regex-based stemmers are identified by their pattern string
    if hasattr(stem, 'pattern'):
        return stem.pattern, stem
    return stringify(stem), stem
def friendly(obj):
    """Make the representation of an object (mainly numbers) more human friendly

    >>> friendly(1e6)
    '1M'
    >>> friendly(-1e3)
    '-1k'
    >>> friendly(1.23456789e9)
    '1.23G'
    >>> friendly(0)
    '0'
    >>> friendly('whatever')
    'whatever'
    """
    prefixes = 'TGMk munp'  # prefixes[4] == ' ' corresponds to no prefix
    if isinstance(obj, (float, int, long)):
        negative = obj < 0
        magnitude = abs(obj)
        shift, anchor = 0, 4
        # repeatedly divide big numbers by 1000, walking left towards 'T'
        while np.inf > safe_log(magnitude, 1000) >= .9999999 and shift <= anchor and magnitude > 0 and np.isfinite(magnitude):
            magnitude = magnitude * .001
            shift += 1
        # repeatedly multiply small numbers by 1000, walking right towards 'p'
        while np.inf > safe_log(magnitude, 1000) < -.25 and shift > anchor - len(prefixes) and magnitude > 0 and np.isfinite(magnitude):
            magnitude = magnitude * 1000.
            shift -= 1
        signed = -magnitude if negative else magnitude
        return '{:.3g}{}'.format(signed, prefixes[anchor - shift] if shift else '')
    return stringify(obj)
def make_stemmer(stem=None, min_len=3):
    """Build a nltk.stem.StemmerI instance from regex, named stemmer ('Lancaster', 'Porter', None), or function

    Arguments:
      min (int): Dont stem anything short than this. e.g. for min=4 don't stem token "I'm" to "I"

    >>> make_stemmer()
    <Stemmer object at ...>
    >>> make_stemmer(str_lower)
    <function str_lower at ...>
    >>> make_stemmer('str_lower')
    <function str_lower at ...>
    >>> make_stemmer('Lancaster')
    <Stemmer object at ...>
    >>> make_stemmer('WordNet')
    <Stemmer object at ...>
    >>> make_stemmer('ing$|s$')
    <Stemmer object at ...>
    """
    if not stem or stem == 'passthrough':
        # no-op stemmer that returns tokens unchanged
        stem = Stemmer()
        # FIXME: this is unnecessary?! and will make the object less picklable?
        stem.stem = passthrough
        return stem
    if isinstance(stem, basestring):
        # resolve a name like 'str_lower' to a function defined in this module, else keep the string
        stem = globals().get(stem, None) or locals().get(stem, stem)
    # in case stem is a compiled stemmer regex, make it a string so it can be compiled by the nltk.RegexStemmer
    if hasattr(stem, 'pattern'):
        stem = Stemmer(nltk.stem.RegexpStemmer(stem, min=min_len))
    if isinstance(stem, basestring):
        # strip nonascii and whitespace, match case-insensitively; blank names default to 'porter'
        name = stringify(stem).lower().strip() or 'porter'
        if name in STEMMER_TYPES:
            # some named stemmers need an nltk corpus/model downloaded first
            dataset = STEMMER_DATASETS.get(name, None)
            if dataset is not None:
                nltk_download(dataset)
            stem = STEMMER_TYPES.get(name, stem)
        else:
            # unrecognized name: treat the string itself as a suffix-stripping regex
            stem = Stemmer(nltk.stem.RegexpStemmer(stem, min=min_len))
    if isinstance(stem, type):
        # a stemmer class (not instance) was supplied: instantiate it
        stem = stem()
    # objects exposing .stem() or .lemmatize() get wrapped; bare callables pass through
    if (hasattr(stem, 'stem') and callable(stem.stem)) or (hasattr(stem, 'lemmatize') and callable(stem.lemmatize) or stem is True):
        return Stemmer(stem)
    elif callable(stem):
        return stem
    raise(ValueError("Unable to make {} into a stemmer. ".format(stem) +
                     "Try 'porter', 'lancaster', None, a regular expression, a callable function, " +
                     "or an object with a stem method."))
def make_stemmer(stem=None, min_len=3):
    """Build a nltk.stem.StemmerI instance from regex, named stemmer ('Lancaster', 'Porter', None), or function

    Arguments:
      min (int): Dont stem anything short than this. e.g. for min=4 don't stem token "I'm" to "I"

    >>> make_stemmer()
    <Stemmer object at ...>
    >>> make_stemmer(str_lower)
    <function str_lower at ...>
    >>> make_stemmer('str_lower')
    <function str_lower at ...>
    >>> make_stemmer('Lancaster')
    <Stemmer object at ...>
    >>> make_stemmer('WordNet')
    <Stemmer object at ...>
    >>> make_stemmer('ing$|s$')
    <Stemmer object at ...>
    """
    if not stem or stem == 'passthrough':
        # no-op stemmer that returns tokens unchanged
        stem = Stemmer()
        # FIXME: this is unnecessary?! and will make the object less picklable?
        stem.stem = passthrough
        return stem
    if isinstance(stem, basestring):
        # resolve a name like 'str_lower' to a function defined in this module, else keep the string
        stem = globals().get(stem, None) or locals().get(stem, stem)
    # in case stem is a compiled stemmer regex, make it a string so it can be compiled by the nltk.RegexStemmer
    if hasattr(stem, 'pattern'):
        stem = Stemmer(nltk.stem.RegexpStemmer(stem, min=min_len))
    if isinstance(stem, basestring):
        # strip nonascii and whitespace, match case-insensitively; blank names default to 'porter'
        name = stringify(stem).lower().strip() or 'porter'
        if name in STEMMER_TYPES:
            # some named stemmers need an nltk corpus/model downloaded first
            dataset = STEMMER_DATASETS.get(name, None)
            if dataset is not None:
                nltk_download(dataset)
            stem = STEMMER_TYPES.get(name, stem)
        else:
            # unrecognized name: treat the string itself as a suffix-stripping regex
            stem = Stemmer(nltk.stem.RegexpStemmer(stem, min=min_len))
    if isinstance(stem, type):
        # a stemmer class (not instance) was supplied: instantiate it
        stem = stem()
    # objects exposing .stem() or .lemmatize() get wrapped; bare callables pass through
    if (hasattr(stem, 'stem') and callable(stem.stem)) or (hasattr(stem, 'lemmatize') and callable(stem.lemmatize) or stem is True):
        return Stemmer(stem)
    elif callable(stem):
        return stem
    raise(ValueError("Unable to make {} into a stemmer. ".format(stem) +
                     "Try 'porter', 'lancaster', None, a regular expression, a callable function, " +
                     "or an object with a stem method."))
def nonnull_fields(obj, pretty=True):
    """Generate `.values()` dict from a table record, removing non-informative values

    Noninformative values include:
      date < 1970
      date > 2100
      False
      None
      0
      0.0
      '0'
      '0.0'
      '0.0000'
    """
    # NOTE: `pretty` is accepted for interface compatibility but not consulted here
    pairs = []
    for field in obj._meta.fields:
        value = getattr(obj, field.attname, None)
        # falsy values and known null sentinels are uninformative
        if not value or value in NULL_VALUES:
            continue
        # string representations like '0.0000' are also treated as null
        if stringify(value).strip().lower()[:MAX_NULL_REPR_LEN] in NULL_REPR_VALUES:
            continue
        # implausible dates (before 1970 or after 2100) are dropped
        if is_invalid_date(value):
            continue
        pairs.append((field.attname, value))
    return PrettyDict(pairs)
def nonnull_fields(obj, pretty=True):
    """Generate `.values()` dict from a table record, removing non-informative values

    Noninformative values include:
      date < 1970
      date > 2100
      False
      None
      0
      0.0
      '0'
      '0.0'
      '0.0000'
    """
    # NOTE: `pretty` is accepted for interface compatibility but not consulted here
    def _informative(v):
        # reject falsy values, known null sentinels, null-looking strings, and implausible dates
        if not v or v in NULL_VALUES:
            return False
        if stringify(v).strip().lower()[:MAX_NULL_REPR_LEN] in NULL_REPR_VALUES:
            return False
        return not is_invalid_date(v)

    attnames = [f.attname for f in obj._meta.fields]
    return PrettyDict((name, value)
                      for name, value in ((a, getattr(obj, a, None)) for a in attnames)
                      if _informative(value))
def infer_pos_label(neg_label=None):
    """Try to guess a positive classification label from a negative label

    Basis for an NLP function to find the "opposite" of a string yes->no, true->false, etc)

    Args:
      neg_label: the negative class label (None, bool, int, str, or numeric string)

    Returns:
      The guessed positive label, coerced back to the input's type where
      possible; 'P' when no opposite can be inferred.

    >>> [infer_pos_label(x) for x in ('1', '-1', '0', '2', 1, 0, 'F', 'False', False, True)]
    ['0', '1', '1', '3', 0, 1, 'T', 'True', True, False]
    """
    # A class label should be a None, bool, int, or str
    if neg_label is None:
        return True
    typ = type(neg_label)
    # If class label isn't a bool or None then make it an int or str
    try:
        neg_label = int(float(neg_label))
        if neg_label in (0, 1):
            # binary labels: flip 0 <-> 1
            return typ(int(not neg_label))
        if neg_label < 0:
            # negative labels: the opposite is the positive of the same magnitude
            return typ(-neg_label)
        # other positive labels: next label up
        return typ(neg_label + 1)
    except (TypeError, ValueError, OverflowError):
        # not numeric (or 'inf'): fall through to the string-opposite lookup tables
        neg_label = stringify(neg_label).strip()
    # try exact-case then lowercase lookups, in both label directions
    for xform, label_dict in zip((lambda x: x, lambda x: x, str.lower, str.lower,),
                                 (POS_LABELS, POS_LABELS_INVERSE, POS_LABELS_LOWER, POS_LABELS_LOWER_INVERSE)):
        try:
            return typ(label_dict[xform(neg_label)])
        except KeyError:
            pass
    # no opposite found: generic "positive" marker
    return 'P'