Example #1
def build_vocab(df, degree):
    """Collect the vocabulary for the given n-gram degree.

    Args:
      df: DataFrame whose rows carry token lists under the column named
        by deg_to_attr[degree].
      degree: key into deg_to_attr selecting which token column to read.

    Returns:
      A util.IxDict indexing every distinct token seen in df.
    """
    attr = deg_to_attr[degree]
    tokens = set()
    for _, row in df.iterrows():
        tokens |= set(row[attr])
    return util.IxDict(list(tokens))
Example #2
def word_dict():
    """Build an IxDict over all tweet tokens, caching it on disk.

    Returns:
      A util.IxDict over every token produced by text.tokenize across
      the fake-news tweets; loaded from the cache file when present.
    """
    cache_path = 'tmp/fake_vocab.json'
    if os.path.exists(cache_path):
        return util.IxDict.load(cache_path)
    token_counts = collections.Counter()
    for _, row in fake.load().iterrows():
        token_counts.update(text.tokenize(row.tweet))
    vocab = util.IxDict(token_counts.keys())
    vocab.save(cache_path)
    return vocab
Example #3
    def __init__(self, k, n):
        """Create a new Vectorizer.

        Args:
          k: Integer, presumably the number of top-ranked n-grams to keep
            (passed straight to get_top_k) — TODO confirm.
          n: Integer, degree of n-grams.
        """
        self.k = k
        # Top-k n-grams and an index dict over them.
        self.top_k = get_top_k(k, n)
        self.word_dict = util.IxDict(self.top_k.keys())
        # Per-word document entropies for the top-k n-grams.
        self.doc_ent = top_k_doc_ent(k, n)
        # Softmax of inverse entropy: lower-entropy (more concentrated)
        # words receive proportionally larger scaling weights.
        self.scaling = special.softmax(1 / self.doc_ent)
Example #4
def get_auth_dict():
    """Return an IxDict over all author ids, caching it on disk.

    Loads from data/auth/auth_ix_dict when the cache exists; otherwise
    scans the small truth set, builds the dict, and saves it.
    """
    path = 'data/auth/auth_ix_dict'
    if os.path.exists(path):
        return util.IxDict.load(path)
    authors = set()
    _, truth = load_small()
    with tqdm(total=len(truth)) as pbar:
        for entry in truth:
            authors.update(entry['authors'])
            pbar.update()
    auth_dict = util.IxDict(authors)
    auth_dict.save(path)
    return auth_dict
Example #5
def get_fandom_dict():
    """Return an IxDict over all fandom labels, caching it on disk.

    Loads from data/auth/fd_ix_dict when the cache exists; otherwise
    scans the small data set, builds the dict, and saves it.
    """
    path = 'data/auth/fd_ix_dict'
    if os.path.exists(path):
        return util.IxDict.load(path)
    fandoms = set()
    X, _ = load_small()
    with tqdm(total=len(X)) as pbar:
        for sample in X:
            fandoms.update(sample['fandoms'])
            pbar.update()
    fandom_dict = util.IxDict(fandoms)
    fandom_dict.save(path)
    return fandom_dict
Example #6
def top_k_counts(k, n):
    """Per-document occurrence counts of the top-k n-grams, cached as .npy.

    Returns:
      A (k, auth.n_docs) ndarray where entry [i, j] is the number of
      times the i-th top word occurs in document j.
    """
    fp = f'tmp/top_{k}_{n}_counts.npy'
    if os.path.exists(fp):
        return np.load(fp)
    word_dict = util.IxDict(get_top_k(k, n).keys())
    counts = np.zeros((k, auth.n_docs))
    with tqdm(total=auth.n_docs, desc=f'Top {k} Counts') as pbar:
        for doc_ix, doc_toks in enumerate(auth.small_toks(n)):
            doc_counts = collections.Counter(doc_toks)
            # Record the count of each top-k word present in this doc.
            for word_ix, word in word_dict.items():
                if word in doc_counts:
                    counts[word_ix, doc_ix] = doc_counts[word]
            pbar.update()
    np.save(fp, counts)
    return counts
Example #7
def calculate_entropy(df, degree=1):
    """Entropy of each token's distribution across date labels.

    Args:
      df: DataFrame with a `date_label` column and per-row token lists
        under the column named by deg_to_attr[degree].
      degree: key into deg_to_attr selecting the token column.

    Returns:
      DataFrame with columns `token` and `h` (the token's entropy).
    """
    attr = deg_to_attr[degree]

    # Vocabulary over the whole frame.
    vocab = set()
    for _, row in df.iterrows():
        vocab.update(row[attr])
    vocab = util.IxDict(list(vocab))

    # Token probabilities conditioned on each date label.
    n_date_labels = len(df.date_label.unique())
    probs = np.zeros((n_date_labels, len(vocab)))
    with tqdm(total=n_date_labels) as pbar:
        for date_label in range(n_date_labels):
            label_counts = collections.Counter()
            for _, row in df[df.date_label == date_label].iterrows():
                label_counts.update(row[attr])
            total = sum(label_counts.values())
            for token, count in label_counts.items():
                probs[date_label, vocab[token]] = count / total
            pbar.update()

    # Per-token entropy over date labels; clipping avoids log(0).
    h = util.entropy(np.clip(probs, a_min=1e-16, a_max=1.), 0)

    # Assemble the result frame.
    rows = [{'token': tok, 'h': h[ix]} for ix, tok in vocab.items()]
    return pd.DataFrame(rows)
Example #8
"""Bill Noble's function word list."""
from pan20 import util
from pan20.util import text

# Word categories in Noble's function word list; one data file
# (`data/noble/<cat>.txt`) is expected per category — see NobleDict below.
cats = [
    'adverbs',
    'articles',
    'auxiliary_verbs',
    'conjunctions',
    'impersonal_pronouns',
    'personal_pronouns',
    'prepositions',
    'quantifiers',
]
# Bidirectional category-name <-> integer-index lookup.
cat_ix_dict = util.IxDict(cats)


class NobleDict:
    """Two-way lookup between function words and their categories.

    Reads one word-per-line files from data/noble/<cat>.txt and builds:
      word_dict: word -> list of categories that include it
      cat_dict:  category -> list of words it contains
    """

    def __init__(self):
        self.word_dict = {}
        self.cat_dict = {}

        for cat in cats:
            self.cat_dict[cat] = []
            with open(f'data/noble/{cat}.txt') as f:
                for line in f:
                    word = line.strip()
                    # A word may belong to more than one category.
                    self.word_dict.setdefault(word, []).append(cat)
                    self.cat_dict[cat].append(word)