def vector_maker(datum):
    """Build a normalized tf-idf vector for one document (pool-worker entry).

    Parameters
    ----------
    datum : tuple
        ``(dclass, dname, dpath, df, u)`` where ``dclass``/``dname`` label
        the document, ``dpath`` is the file path handed to ``WCDocument``,
        ``df`` is the document-frequency index (provides ``vocab_size``,
        ``ndocs``, ``word_index(word)`` and per-word counts via
        ``df[word]``), and ``u`` is either ``None`` or a 1 x vocab_size
        matrix subtracted entry-wise from the tf-idf weights
        (presumably a mean vector for centering — confirm with caller).

    Returns
    -------
    dict or None
        ``{'class', 'disaster', 'file', 'vector'}`` with ``'vector'`` an
        L2-normalized 1 x vocab_size sparse (lil) matrix; ``None`` when a
        KeyboardInterrupt is absorbed.
    """
    try:
        dclass, dname, dpath, df, u = datum
        doc = WCDocument(dpath)
        v = sp.sparse.lil_matrix((1, df.vocab_size))
        norm = 0
        for word, count in doc.wc_dict().items():
            idx = df.word_index(word)
            if idx is not None:
                # Log-scaled term frequency times inverse document frequency.
                tfidf = (1 + log(count, 2)) * log(float(df.ndocs) / df[word])
                if u is not None:
                    # Center the weight against the supplied vector.
                    tfidf -= u[0, idx]
                v[0, idx] = tfidf
                norm += tfidf ** 2
        norm = sqrt(norm)
        if norm != 0:
            v /= norm
        return {'class': dclass, 'disaster': dname, 'file': dpath, 'vector': v}
    except KeyboardInterrupt:
        # Was `except KeyboardInterrupt, e:` (Python-2-only syntax, `e`
        # unused). Pool-worker convention: absorb Ctrl-C so the parent
        # process manages shutdown; implicitly returns None.
        pass
def mean_vec_worker(datum):
    """Compute tf-idf triples for one document (pool-worker entry).

    Unlike ``vector_maker`` this does not build a sparse matrix; it
    returns coordinate triples suitable for accumulating a mean vector
    in the parent process.

    Parameters
    ----------
    datum : tuple
        ``(dclass, dname, dpath, df)`` — document labels, file path for
        ``WCDocument``, and the document-frequency index (``ndocs``,
        ``word_index(word)``, ``df[word]``). ``dclass``/``dname`` are
        unpacked but unused here.

    Returns
    -------
    list of (int, int, float) or None
        ``(0, column_index, tfidf)`` triples, un-normalized; ``None``
        when a KeyboardInterrupt is absorbed.
    """
    try:
        dclass, dname, dpath, df = datum
        doc = WCDocument(dpath)
        results = []
        for word, count in doc.wc_dict().items():
            idx = df.word_index(word)
            if idx is not None:
                # Same weighting as vector_maker: log tf times idf.
                tfidf = (1 + log(count, 2)) * log(float(df.ndocs) / df[word])
                results.append((0, idx, tfidf))
        return results
    except KeyboardInterrupt:
        # Was `except KeyboardInterrupt, e:` (Python-2-only syntax, `e`
        # unused). Absorb Ctrl-C so the parent pool shuts down cleanly;
        # implicitly returns None.
        pass
# NOTE(review): this definition is shadowed by the later `file2vector`
# in this module (same name, same scope), so as written it is dead code.
# It also differs from its replacement: it builds a *binary*
# presence/absence COLUMN vector (vocab_size x 1, CSR), not a tf-idf
# row vector — confirm which variant callers expect before removing.
def file2vector(filename, df):
    """Return an L2-normalized binary term-presence column vector.

    ``filename`` is passed to ``WCDocument``; ``df`` supplies
    ``vocab_size`` and ``word_index(word)``. Each in-vocabulary word
    gets weight 1 (term counts are ignored), then the vector is divided
    by its Euclidean norm (sqrt of the number of distinct hits).
    Returns a ``vocab_size x 1`` CSR matrix.
    """
    doc = WCDocument(filename)
    vsize = df.vocab_size
    v = sp.sparse.lil_matrix((vsize, 1))
    norm = 0
    for word, count in doc.wc_dict().items():
        idx = df.word_index(word)
        if idx is not None:
            v[idx] = 1          # presence only; `count` intentionally unused
            norm += 1           # norm accumulates 1^2 per distinct hit
    norm = sqrt(norm)
    if norm != 0:
        v /= norm
    return v.tocsr()
def file2vector(filename, df):
    """Convert *filename* into an L2-normalized tf-idf row vector.

    ``filename`` is handed to ``WCDocument``; ``df`` provides
    ``vocab_size``, ``ndocs``, ``word_index(word)`` and per-word
    document counts via ``df[word]``. Words missing from the vocabulary
    are skipped. Returns a ``1 x vocab_size`` sparse (lil) matrix,
    divided by its Euclidean norm unless the norm is zero.
    """
    doc = WCDocument(filename)
    vec = sp.sparse.lil_matrix((1, df.vocab_size))
    sq_sum = 0
    for word, count in doc.wc_dict().items():
        col = df.word_index(word)
        if col is None:
            # Out-of-vocabulary word: contributes nothing.
            continue
        # Log-scaled term frequency times inverse document frequency.
        weight = (1 + log(count, 2)) * log(float(df.ndocs) / df[word])
        vec[0, col] = weight
        sq_sum += weight * weight
    length = sqrt(sq_sum)
    if length != 0:
        vec /= length
    return vec