def build_vocab(df, degree):
    vocab = set()
    attr = deg_to_attr[degree]
    for _, x in df.iterrows():
        vocab.update(x[attr])
    vocab = util.IxDict(list(vocab))
    return vocab

def word_dict():
    """Return the token IxDict for the fake dataset's tweets, cached on disk."""
    fp = 'tmp/fake_vocab.json'
    if os.path.exists(fp):
        return util.IxDict.load(fp)
    counts = collections.Counter()
    df = fake.load()
    for _, x in df.iterrows():
        counts.update(text.tokenize(x.tweet))
    wd = util.IxDict(counts.keys())
    wd.save(fp)
    return wd

def __init__(self, k, n):
    """Create a new Vectorizer.

    Args:
      k: Integer, number of top n-grams to keep.
      n: Integer, degree of n-grams.
    """
    self.k = k
    self.top_k = get_top_k(k, n)
    self.word_dict = util.IxDict(self.top_k.keys())
    self.doc_ent = top_k_doc_ent(k, n)
    self.scaling = special.softmax(1 / self.doc_ent)

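# A minimal usage sketch for the constructor above, assuming the enclosing
# module defines the Vectorizer class (only __init__ is shown here) together
# with get_top_k and top_k_doc_ent; the k and n values are illustrative.
#
#     vec = Vectorizer(k=1000, n=1)   # top 1,000 unigrams
#     len(vec.word_dict)              # -> 1000
#     vec.scaling.sum()               # softmax weights sum to 1.0
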
def get_auth_dict():
    path = 'data/auth/auth_ix_dict'
    if not os.path.exists(path):
        authors = set()
        _, truth = load_small()
        with tqdm(total=len(truth)) as pbar:
            for y in truth:
                authors.update(y['authors'])
                pbar.update()
        ix_dict = util.IxDict(authors)
        ix_dict.save(path)
        return ix_dict
    else:
        return util.IxDict.load(path)

def get_fandom_dict():
    path = 'data/auth/fd_ix_dict'
    if not os.path.exists(path):
        fandoms = set()
        X, _ = load_small()
        with tqdm(total=len(X)) as pbar:
            for x in X:
                fandoms.update(x['fandoms'])
                pbar.update()
        ix_dict = util.IxDict(fandoms)
        ix_dict.save(path)
        return ix_dict
    else:
        return util.IxDict.load(path)

def top_k_counts(k, n):
    """Return a (k, n_docs) matrix of top-k n-gram counts per document, cached as .npy."""
    fp = f'tmp/top_{k}_{n}_counts.npy'
    if os.path.exists(fp):
        return np.load(fp)
    else:
        top_k = get_top_k(k, n)
        word_dict = util.IxDict(top_k.keys())
        toks = auth.small_toks(n)
        counts = np.zeros((k, auth.n_docs))
        with tqdm(total=auth.n_docs, desc=f'Top {k} Counts') as pbar:
            for doc_ix, doc_toks in enumerate(toks):
                doc_counts = collections.Counter(doc_toks)
                for word_ix, word in word_dict.items():
                    if word in doc_counts:
                        counts[word_ix, doc_ix] = doc_counts[word]
                pbar.update()
        np.save(fp, counts)
        return counts

def calculate_entropy(df, degree=1):
    attr = deg_to_attr[degree]

    # get vocabulary
    V = set()
    for _, x in df.iterrows():
        V.update(x[attr])
    V = util.IxDict(list(V))

    # calculate probabilities
    n_date_labels = len(df.date_label.unique())
    P = np.zeros((n_date_labels, len(V)))
    with tqdm(total=n_date_labels) as pbar:
        for date_label in range(n_date_labels):
            dfd = df[df.date_label == date_label]
            counts = collections.Counter()
            for _, x in dfd.iterrows():
                counts.update(x[attr])
            n = sum(counts.values())
            for token, count in counts.items():
                token_ix = V[token]
                P[date_label, token_ix] = count / n
            pbar.update()

    # calculate entropies
    h = util.entropy(np.clip(P, a_min=1e-16, a_max=1.), 0)

    # create a DataFrame
    dfe = []
    for ix, v in V.items():
        dfe.append({
            'token': v,
            'h': h[ix],
        })
    dfe = pd.DataFrame(dfe)
    return dfe

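# Hedged usage sketch for calculate_entropy: it expects a DataFrame with a
# date_label column plus the token-list column named by deg_to_attr[degree],
# and returns one row per vocabulary token with its entropy over date labels.
# The loader used below is an assumption for illustration only.
#
#     df = fake.load()                       # assumed to already carry date_label
#     dfe = calculate_entropy(df, degree=1)
#     dfe.sort_values('h').head(10)          # tokens most concentrated in a few dates
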
"""Bill Noble's function word list.""" from pan20 import util from pan20.util import text cats = [ 'adverbs', 'articles', 'auxiliary_verbs', 'conjunctions', 'impersonal_pronouns', 'personal_pronouns', 'prepositions', 'quantifiers', ] cat_ix_dict = util.IxDict(cats) class NobleDict: def __init__(self): self.word_dict = {} self.cat_dict = {} for cat in cats: self.cat_dict[cat] = [] with open(f'data/noble/{cat}.txt') as f: for line in f.readlines(): word = line.strip() if word not in self.word_dict.keys(): self.word_dict[word] = [] self.word_dict[word].append(cat) self.cat_dict[cat].append(word)