Esempio n. 1
0
def vector_maker(datum):
    """Build a length-normalised tf-idf row vector for a single document.

    ``datum`` is unpacked as the 5-tuple ``(dclass, dname, dpath, df, u)``:

    * ``dclass``, ``dname`` -- labels copied unchanged into the result.
    * ``dpath`` -- path handed to ``WCDocument``.
    * ``df`` -- document-frequency table; this function reads
      ``df.vocab_size`` and ``df.ndocs`` and calls ``df.word_index(word)``
      and ``df[word]``.
    * ``u`` -- either ``None`` or an object indexed as ``u[0, idx]`` whose
      entries are subtracted from the raw weights (presumably a mean
      vector -- confirm against the caller).

    Each word that ``df.word_index`` knows gets the weight
    ``(1 + log2(count)) * log(ndocs / df[word])``, minus ``u[0, idx]`` when
    ``u`` is given.  The vector is then divided by its Euclidean norm unless
    that norm is zero.

    Returns a dict with keys ``'class'``, ``'disaster'``, ``'file'`` and
    ``'vector'`` (a ``1 x df.vocab_size`` ``lil_matrix``), or ``None`` when
    a ``KeyboardInterrupt`` arrives while the vector is being built.
    """
    try:
        dclass, dname, dpath, df, u = datum

        doc = WCDocument(dpath)
        v = sp.sparse.lil_matrix((1, df.vocab_size))

        sq_norm = 0
        for word, count in doc.wc_dict().items():
            idx = df.word_index(word)
            if idx is None:
                # Out-of-vocabulary word: it has no slot in the vector, so it
                # must not contribute to the norm either.
                continue
            tfidf = (1 + log(count, 2)) * log(float(df.ndocs) / df[word])
            if u is not None:
                tfidf -= u[0, idx]
            v[0, idx] = tfidf
            # Accumulate only for words actually stored in the vector.
            sq_norm += tfidf ** 2

        norm = sqrt(sq_norm)
        if norm != 0:
            v /= norm

        return {'class': dclass,
                'disaster': dname,
                'file': dpath,
                'vector': v}

    except KeyboardInterrupt:
        # Swallowed on purpose (as in the original); the caller receives None.
        # NOTE(review): presumably this runs as a pool worker and the parent
        # process handles Ctrl-C -- confirm.
        return None
Esempio n. 2
0
def mean_vec_worker(datum):
    """Compute the tf-idf weight of every in-vocabulary word of a document.

    ``datum`` is unpacked as the 4-tuple ``(dclass, dname, dpath, df)``.
    Only ``dpath`` (handed to ``WCDocument``) and ``df`` are used; the two
    labels are accepted so the tuple shape stays the same for callers.
    ``df`` must provide ``ndocs``, ``word_index(word)`` and ``df[word]``.

    Returns a list of ``(0, idx, tfidf)`` triples, one per word for which
    ``df.word_index`` returns an index, with
    ``tfidf = (1 + log2(count)) * log(ndocs / df[word])``.  The leading ``0``
    is presumably a row index for the consumer -- confirm against the
    caller.  Returns ``None`` when a ``KeyboardInterrupt`` arrives while the
    weights are being computed.
    """
    try:
        # The labels are unpacked (so a wrongly sized tuple still fails
        # loudly) but intentionally unused.
        _dclass, _dname, dpath, df = datum
        doc = WCDocument(dpath)

        results = []
        for word, count in doc.wc_dict().items():
            idx = df.word_index(word)
            if idx is None:
                # Word is not in the vocabulary; nothing to record.
                continue
            tfidf = (1 + log(count, 2)) * log(float(df.ndocs) / df[word])
            results.append((0, idx, tfidf))
        return results
    except KeyboardInterrupt:
        # Swallowed on purpose (as in the original); the caller receives None.
        # NOTE(review): presumably this runs as a pool worker and the parent
        # process handles Ctrl-C -- confirm.
        return None
Esempio n. 3
0
def file2vector(filename, df):
    """Encode a document as a scaled binary bag-of-words column vector.

    ``filename`` is handed to ``WCDocument``; ``df`` supplies ``vocab_size``
    and ``word_index(word)``.  Every word for which ``df.word_index`` returns
    an index gets a 1 in that row; word counts are ignored.  The vector is
    then divided by the square root of the number of such words, unless no
    word matched.

    Returns a ``df.vocab_size x 1`` sparse matrix in CSR format.
    """
    document = WCDocument(filename)
    column = sp.sparse.lil_matrix((df.vocab_size, 1))

    matched = 0
    for term, _count in document.wc_dict().items():
        row = df.word_index(term)
        if row is None:
            # Term is not in the vocabulary.
            continue
        column[row] = 1
        matched += 1

    length = sqrt(matched)
    if length != 0:
        column /= length
    return column.tocsr()
Esempio n. 4
0
def file2vector(filename, df):
    """Encode a document as a length-normalised tf-idf row vector.

    ``filename`` is handed to ``WCDocument``; ``df`` supplies ``vocab_size``,
    ``ndocs``, ``word_index(word)`` and ``df[word]``.  Each word for which
    ``df.word_index`` returns an index is weighted
    ``(1 + log2(count)) * log(ndocs / df[word])``.  The vector is divided by
    its Euclidean norm unless that norm is zero.

    Returns a ``1 x df.vocab_size`` ``lil_matrix``.
    """
    document = WCDocument(filename)
    row = sp.sparse.lil_matrix((1, df.vocab_size))

    squared_length = 0
    for term, occurrences in document.wc_dict().items():
        column = df.word_index(term)
        if column is None:
            # Term is not in the vocabulary.
            continue
        term_weight = 1 + log(occurrences, 2)
        rarity = log(float(df.ndocs) / df[term])
        weight = term_weight * rarity
        row[0, column] = weight
        squared_length += weight ** 2

    length = sqrt(squared_length)
    if length != 0:
        row /= length
    return row