Example #1
def generate_lsh_graph(data_set, num_hashes=3, num_bits=5, verbose=False):
    hashers = MultiLSHasher(num_hashes, num_bits)
    if verbose: print 'Hashers initialized'

    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]

    doc_features = {}
    word_counts = Counter()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0])
            word = int(row[1])
            count = float(row[2])
            word_counts[word] += 1
            if doc not in doc_features:
                doc_features[doc] = []
            doc_features[doc].append((word, count))
    if verbose: print 'Loaded doc features'

    for doc, features in doc_features.items():
        if type(features[0]) is float:
            break
        feature_tfidf = []
        for w, c in features:
            tfidf = math.log(c + 1) * math.log(
                num_docs / float(word_counts[w]))
            feature_tfidf.append((w, tfidf))
        doc_features[doc] = feature_tfidf

    hashers.compute_stream(doc_features)
    signatures = hashers.compute_signatures()
    if verbose: print 'Computed signatures'

    doc_features = {}
    words_doc_count = Counter()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0])
            count = float(row[2]) if '.' in row[2] else int(row[2])
            for hl, s in signatures.items():
                word = str(row[1]) + hl + s[doc]
                words_doc_count[word] += 1
                if doc not in doc_features:
                    doc_features[doc] = []
                doc_features[doc].append((word, count))
    if verbose: print 'Generated hashed doc features'

    filename = '%s-lsh-h%db%d' % (data_set, num_hashes, num_bits)
    with open_graph_file(filename) as graph:
        datawriter = csv.writer(graph, delimiter='\t')
        for doc, feature_counts in doc_features.items():
            for feature, count in feature_counts:
                tfidf = math.log(count + 1) * math.log(
                    num_docs / float(words_doc_count[feature]))
                datawriter.writerow([doc, feature, tfidf])
    if verbose: print 'Wrote graph file %s' % filename
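
The weighting used throughout these examples is a log-scaled TF-IDF: log(count + 1) * log(num_docs / document_frequency). Below is a minimal self-contained sketch of that formula; the corpus statistics and words are made up for illustration and are not taken from any data set.

import math

# Illustrative corpus statistics (made-up values).
num_docs = 100
doc_freq = {'apple': 10, 'banana': 3}    # documents containing each word
term_counts = {'apple': 4, 'banana': 1}  # counts of each word in one document

for word, count in term_counts.items():
    # Same weighting as in generate_lsh_graph above:
    # log-damped term frequency times inverse document frequency.
    tfidf = math.log(count + 1) * math.log(num_docs / float(doc_freq[word]))
    print('%s %.3f' % (word, tfidf))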
Example #2
def generate_lsh_graph(data_set, num_hashes=3, num_bits=5, verbose=False):
    hashers = MultiLSHasher(num_hashes, num_bits)
    if verbose: print 'Hashers initialized'

    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]

    doc_features = {}
    word_counts = Counter()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0])
            word = int(row[1])
            count = float(row[2])
            word_counts[word] += 1
            if doc not in doc_features:
                doc_features[doc] = []
            doc_features[doc].append((word, count))
    if verbose: print 'Loaded doc features'

    for doc, features in doc_features.items():
        if type(features[0]) is float:
            break
        feature_tfidf = []
        for w, c in features:
            tfidf = math.log(c+1) * math.log(num_docs/float(word_counts[w]))
            feature_tfidf.append((w,tfidf))
        doc_features[doc] = feature_tfidf

    hashers.compute_stream(doc_features)
    signatures = hashers.compute_signatures()
    if verbose: print 'Computed signatures'

    doc_features = {}
    words_doc_count = Counter()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0])
            count = float(row[2]) if '.' in row[2] else int(row[2])
            for hl, s in signatures.items():
                word = str(row[1]) + hl + s[doc]
                words_doc_count[word] += 1
                if doc not in doc_features:
                    doc_features[doc] = []
                doc_features[doc].append((word, count))
    if verbose: print 'Generated hashed doc features'

    filename = '%s-lsh-h%db%d' % (data_set, num_hashes, num_bits)
    with open_graph_file(filename) as graph:
        datawriter = csv.writer(graph, delimiter='\t')
        for doc, feature_counts in doc_features.items():
            for feature, count in feature_counts:
                tfidf = math.log(count+1) * math.log(num_docs/float(
                    words_doc_count[feature]))
                datawriter.writerow([doc, feature, tfidf])
    if verbose: print 'Wrote graph file %s' % filename
Example #3
def get_hash_docs(graph_file):
    assert 'lsh' in graph_file
    hash_docs = {}
    with util.open_graph_file(graph_file, 'rb') as graph:
        datareader = csv.reader(graph, delimiter='\t')
        for row in datareader:
            doc = int(row[0])
            for i in xrange(len(row[1])):
                if row[1][i] in string.ascii_lowercase:
                    hl = row[1][i]
                    h = row[1][(i + 1):]
                    if hl not in hash_docs:
                        hash_docs[hl] = {}
                    if not h in hash_docs[hl]:
                        hash_docs[hl][h] = set()
                    hash_docs[hl][h].add(doc)
                    break
    return hash_docs
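
get_hash_docs groups documents by the (hash label, signature) pair embedded in the feature strings that generate_lsh_graph writes, i.e. strings of the form word-id + hash-label + signature, where each hash label is assumed to be a single lowercase letter (hence the string.ascii_lowercase check). A small sketch of that parse on a made-up feature string:

import string

# Hypothetical feature string in the format written by generate_lsh_graph:
# '<word id><hash label (one lowercase letter)><bit signature>'.
feature = '4271b01101'

# Same scan as in get_hash_docs: split at the first lowercase letter.
for i, ch in enumerate(feature):
    if ch in string.ascii_lowercase:
        word_id, hash_label, signature = feature[:i], ch, feature[i + 1:]
        break

print('%s %s %s' % (word_id, hash_label, signature))  # prints: 4271 b 01101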
Example #4
def get_hash_docs(graph_file):
    assert 'lsh' in graph_file
    hash_docs = {}
    with util.open_graph_file(graph_file, 'rb') as graph:
        datareader = csv.reader(graph, delimiter='\t')
        for row in datareader:
            doc = int(row[0])
            for i in xrange(len(row[1])):
                if row[1][i] in string.ascii_lowercase:
                    hl = row[1][i]
                    h = row[1][(i + 1):]
                    if hl not in hash_docs:
                        hash_docs[hl] = {}
                    if not h in hash_docs[hl]:
                        hash_docs[hl][h] = set()
                    hash_docs[hl][h].add(doc)
                    break
    return hash_docs
Example #5
def get_doc_hashes(graph_file):
    assert 'lsh' in graph_file
    doc_hashes = {}
    with util.open_graph_file(graph_file, 'rb') as graph:
        datareader = csv.reader(graph, delimiter='\t')
        for row in datareader:
            doc = int(row[0])
            for i in xrange(len(row[1])):
                if row[1][i] in string.ascii_lowercase:
                    hl = row[1][i]
                    h = row[1][(i + 1):]
                    if doc not in doc_hashes:
                        doc_hashes[doc] = {}
                    if hl in doc_hashes[doc]:
                        assert h == doc_hashes[doc][hl]
                    else:
                        doc_hashes[doc][hl] = h
                    break
    return doc_hashes
Example #6
def get_doc_hashes(graph_file):
    assert 'lsh' in graph_file
    doc_hashes = {}
    with util.open_graph_file(graph_file, 'rb') as graph:
        datareader = csv.reader(graph, delimiter='\t')
        for row in datareader:
            doc = int(row[0])
            for i in xrange(len(row[1])):
                if row[1][i] in string.ascii_lowercase:
                    hl = row[1][i]
                    h = row[1][(i + 1):]
                    if doc not in doc_hashes:
                        doc_hashes[doc] = {}
                    if hl in doc_hashes[doc]:
                        assert h == doc_hashes[doc][hl]
                    else:
                        doc_hashes[doc][hl] = h
                    break
    return doc_hashes
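
get_doc_hashes is the inverse view of get_hash_docs: instead of mapping (hash label, signature) to the set of documents in that bucket, it records each document's signature under every label. Since both functions parse the same graph file the same way, one would expect the consistency check below to hold; the graph file name is made up.

# Hedged consistency check; 'mydata-lsh-h3b5' is a made-up graph file name.
hash_docs = get_hash_docs('mydata-lsh-h3b5')
doc_hashes = get_doc_hashes('mydata-lsh-h3b5')
for doc, hashes in doc_hashes.items():
    for hl, h in hashes.items():
        assert doc in hash_docs[hl][h]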
Example #7
def generate_labeled_baseline_graph(output_file, percentile=95, verbose=False):
    data_set = output_file.split('-')[0]
    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]
    test_data = []

    words_doc_count = Counter()
    for doc, features in get_new_doc_features(data_set, output_file, percentile).items():
        for word, count in features:
            words_doc_count[word] += 1
            test_data.append([doc, word, count])
    if verbose: print 'Loaded doc features'

    with open_graph_file(output_file) as graph:
        datawriter = csv.writer(graph, delimiter='\t')
        for d, features in get_new_doc_features(data_set, output_file, percentile).items():
            for w, c in features:
                tfidf = math.log(c+1) * math.log(num_docs/float(words_doc_count[w]))
                datawriter.writerow([d, w, tfidf])
        if verbose: print 'Wrote graph file %s' % output_file
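
Note that the data set name is recovered from the output file name itself (everything before the first '-'). A hedged usage sketch with a made-up file name, assuming the helper functions used above are importable from the same project:

# Hypothetical call; 'webkb-labeled-p95' is a made-up output file name, and
# generate_labeled_baseline_graph takes its data set name ('webkb') from the
# part of the name before the first '-'.
generate_labeled_baseline_graph('webkb-labeled-p95', percentile=95, verbose=True)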
Example #8
def generate_baseline_graph(data_set, filename=None, verbose=False):
    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]
    test_data = []

    words_doc_count = Counter()
    for doc, features in get_doc_features(data_set).items():
        for word, count in features:
            words_doc_count[word] += 1
            test_data.append([doc, word, count])
    if verbose: print 'Loaded doc features'

    if not filename: filename = data_set + '-baseline'
    with open_graph_file(filename) as graph:
        datawriter = csv.writer(graph, delimiter='\t')
        for d,w,c in test_data:
            if type(c) is float:
                datawriter.writerow([str(d), str(w) + 'w', c])
            else:
                tfidf = math.log(c+1) * math.log(num_docs/float(words_doc_count[w]))
                datawriter.writerow([str(d), str(w) + 'w', tfidf])
        if verbose: print 'Wrote graph file %s' % filename
Example #9
def generate_labeled_baseline_graph(output_file, percentile=95, verbose=False):
    data_set = output_file.split('-')[0]
    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]
    test_data = []

    words_doc_count = Counter()
    for doc, features in get_new_doc_features(data_set, output_file,
                                              percentile).items():
        for word, count in features:
            words_doc_count[word] += 1
            test_data.append([doc, word, count])
    if verbose: print 'Loaded doc features'

    with open_graph_file(output_file) as graph:
        datawriter = csv.writer(graph, delimiter='\t')
        for d, features in get_new_doc_features(data_set, output_file,
                                                percentile).items():
            for w, c in features:
                tfidf = math.log(c + 1) * math.log(
                    num_docs / float(words_doc_count[w]))
                datawriter.writerow([d, w, tfidf])
        if verbose: print 'Wrote graph file %s' % output_file
Example #10
def generate_baseline_graph(data_set, filename=None, verbose=False):
    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]
    test_data = []

    words_doc_count = Counter()
    for doc, features in get_doc_features(data_set).items():
        for word, count in features:
            words_doc_count[word] += 1
            test_data.append([doc, word, count])
    if verbose: print 'Loaded doc features'

    if not filename: filename = data_set + '-baseline'
    with open_graph_file(filename) as graph:
        datawriter = csv.writer(graph, delimiter='\t')
        for d, w, c in test_data:
            if type(c) is float:
                datawriter.writerow([str(d), str(w) + 'w', c])
            else:
                tfidf = math.log(c + 1) * math.log(
                    num_docs / float(words_doc_count[w]))
                datawriter.writerow([str(d), str(w) + 'w', tfidf])
        if verbose: print 'Wrote graph file %s' % filename
Example #11
def generate_knn_graphs(data_set, ks=[5,10,20,30,50,100], verbose=False):
    '''
    since we get a list of *all* the neighbors ordered by "nearness",
    it makes more sense to iterate through the different k's within
    the function rather than outside it
    '''
    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]

    max_k = max(ks)

    assert max_k < num_docs

    feature_matrix = np.matrix(np.zeros((num_docs, num_features)))
    words_doc_count = np.zeros(num_features)
    is_tfidf = False
    docs = set()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0]) - 1
            word = int(row[1]) - 1
            if is_tfidf:
                count = float(row[2])
            elif '.' in row[2]:
                count = float(row[2])
                is_tfidf = True
            else:
                count = int(row[2])
            words_doc_count[word] += 1
            docs.add(doc)
            feature_matrix.itemset((doc, word), count)
    if verbose: print 'Loaded test data'

    if verbose: print 'Generating feature matrix'
    if not is_tfidf:
        for doc in xrange(num_docs):
            if doc in docs:
                for word in xrange(num_features):
                    if words_doc_count[word] != 0:
                        count = feature_matrix.item((doc,word))
                        tfidf = math.log(count+1) * math.log(num_docs/float(words_doc_count[word]))
                        feature_matrix.itemset((doc,word), tfidf)
            if doc % 10 == 9:
                if verbose: print 'Processed %d out of %d documents' % (doc+1, num_docs)
    if verbose: print 'Generated feature matrix'

    normalizing_matrix = np.matrix(np.zeros((num_docs, num_docs)))
    for i in xrange(num_docs):
        f = feature_matrix[i]
        fft = math.sqrt(f * f.transpose())
        if fft < 1e-9:
            normalizing_matrix.itemset((i,i), 0.0)
        else:
            normalizing_matrix.itemset((i,i), 1.0 / fft)
    if verbose: print 'Generated normalizing matrix'

    if verbose: print 'Generating folded graph'
    edges = []
    N = normalizing_matrix
    F = feature_matrix
    doc_neighbors = {}
    for doc in xrange(num_docs):
        Nv = np.matrix(np.zeros((num_docs,1)))
        Nv.itemset(doc, N.item((doc, doc)))
        FtNv = F[doc].transpose() * N.item((doc,doc))
        doc_weights = np.array(N * (F * FtNv)).transpose()
        neighbors = np.argsort(doc_weights)[0]
        doc_neighbors[doc] = [(neighbor, doc_weights.item(neighbor)) for neighbor in neighbors[-max_k:]]
        if doc % 10 == 9:
            if verbose: print 'Processed %d out of %d documents' % (doc+1, num_docs)
    if verbose: print 'Generated folded graph'

    for k in ks:
        filename = '%s-knn-k%d' % (data_set, k)
        with open_graph_file(filename) as graph:
            datawriter = csv.writer(graph, delimiter='\t')
            for doc in xrange(num_docs):
                for neighbor, weight in doc_neighbors[doc][-k:]:
                    if weight >= 1e-9:
                        datawriter.writerow([str(doc+1), str(neighbor+1), weight])
            if verbose: print 'Wrote graph file %s' % filename
Example #12
def generate_knn_graph(data_set, k, verbose=False):
    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]

    assert k < num_docs

    feature_matrix = np.matrix(np.zeros((num_docs, num_features)))
    words_doc_count = np.zeros(num_features)
    is_tfidf = False
    docs = set()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0]) - 1
            word = int(row[1]) - 1
            if is_tfidf:
                count = float(row[2])
            elif '.' in row[2]:
                count = float(row[2])
                is_tfidf = True
            else:
                count = int(row[2])
            words_doc_count[word] += 1
            docs.add(doc)
            feature_matrix.itemset((doc, word), count)
    if verbose: print 'Loaded test data'

    if verbose: print 'Generating feature matrix'
    if not is_tfidf:
        for doc in xrange(num_docs):
            if doc in docs:
                for word in xrange(num_features):
                    if words_doc_count[word] != 0:
                        count = feature_matrix.item((doc,word))
                        tfidf = math.log(count+1) * math.log(num_docs/float(words_doc_count[word]))
                        feature_matrix.itemset((doc,word), tfidf)
            if doc % 10 == 9:
                if verbose: print 'Processed %d out of %d documents' % (doc+1, num_docs)
    if verbose: print 'Generated feature matrix'

    normalizing_matrix = np.matrix(np.zeros((num_docs, num_docs)))
    for i in xrange(num_docs):
        f = feature_matrix[i]
        fft = math.sqrt(f * f.transpose())
        if fft < 1e-9:
            normalizing_matrix.itemset((i,i), 0.0)
        else:
            normalizing_matrix.itemset((i,i), 1.0 / fft)
    if verbose: print 'Generated normalizing matrix'

    if verbose: print 'Generating folded graph'
    edges = []
    N = normalizing_matrix
    F = feature_matrix
    for doc in xrange(num_docs):
        Nv = np.matrix(np.zeros((num_docs,1)))
        Nv.itemset(doc, N.item((doc, doc)))
        FtNv = F[doc].transpose() * N.item((doc,doc))
        doc_weights = np.array(N * (F * FtNv)).transpose()
        nearest_neighbors = np.argsort(doc_weights)
        for neighbor in nearest_neighbors[0][-k:]:
            if doc_weights.item(neighbor) < 1e-9:
                continue
            edges.append(((doc+1, int(neighbor)+1), doc_weights.item(neighbor)))
        if doc % 10 == 9:
            if verbose: print 'Processed %d out of %d documents' % (doc+1, num_docs)
    if verbose: print 'Generated folded graph'

    filename = '%s-knn-k%d' % (data_set, k)
    with open_graph_file(filename) as graph:
        datawriter = csv.writer(graph, delimiter='\t')
        for edge, weight in edges:
            datawriter.writerow([edge[0], edge[1], weight])
    if verbose: print 'Wrote graph file %s' % filename
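
The folded-graph step computes, for each document, N * F * F^T * N * e_doc, which is simply the cosine similarity between that document's TF-IDF row and every other row (N carries the inverse row norms on its diagonal). A minimal numpy sketch of the same computation on a made-up feature matrix:

import numpy as np

# Toy TF-IDF matrix (made-up values): 4 documents x 3 features.
F = np.array([[1.0, 0.0, 2.0],
              [0.5, 1.0, 0.0],
              [0.0, 0.0, 1.0],
              [2.0, 0.1, 4.0]])

# Inverse row norms; zero rows get weight 0, mirroring the 1e-9 guard above.
norms = np.linalg.norm(F, axis=1)
inv_norms = np.where(norms < 1e-9, 0.0, 1.0 / norms)

doc = 0
# N F F^T N e_doc: cosine similarity of document `doc` against every document
# (including itself, which is why a document appears in its own top-k).
weights = inv_norms * F.dot(F[doc] * inv_norms[doc])
k = 2
neighbors = np.argsort(weights)[-k:]
print('%s %s' % (neighbors, weights[neighbors]))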
Example #13
def generate_knn_graphs(data_set, ks=[5, 10, 20, 30, 50, 100], verbose=False):
    '''
    since we get a list of *all* the neighbors ordered by "nearness",
    it makes more sense to iterate through the different k's within
    the function rather than outside it
    '''
    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]

    max_k = max(ks)

    assert max_k < num_docs

    feature_matrix = np.matrix(np.zeros((num_docs, num_features)))
    words_doc_count = np.zeros(num_features)
    is_tfidf = False
    docs = set()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0]) - 1
            word = int(row[1]) - 1
            if is_tfidf:
                count = float(row[2])
            elif '.' in row[2]:
                count = float(row[2])
                is_tfidf = True
            else:
                count = int(row[2])
            words_doc_count[word] += 1
            docs.add(doc)
            feature_matrix.itemset((doc, word), count)
    if verbose: print 'Loaded test data'

    if verbose: print 'Generating feature matrix'
    if not is_tfidf:
        for doc in xrange(num_docs):
            if doc in docs:
                for word in xrange(num_features):
                    if words_doc_count[word] != 0:
                        count = feature_matrix.item((doc, word))
                        tfidf = math.log(count + 1) * math.log(
                            num_docs / float(words_doc_count[word]))
                        feature_matrix.itemset((doc, word), tfidf)
            if doc % 10 == 9:
                if verbose:
                    print 'Processed %d out of %d documents' % (doc + 1,
                                                                num_docs)
    if verbose: print 'Generated feature matrix'

    normalizing_matrix = np.matrix(np.zeros((num_docs, num_docs)))
    for i in xrange(num_docs):
        f = feature_matrix[i]
        fft = math.sqrt(f * f.transpose())
        if fft < 1e-9:
            normalizing_matrix.itemset((i, i), 0.0)
        else:
            normalizing_matrix.itemset((i, i), 1.0 / fft)
    if verbose: print 'Generated normalizing matrix'

    if verbose: print 'Generating folded graph'
    edges = []
    N = normalizing_matrix
    F = feature_matrix
    doc_neighbors = {}
    for doc in xrange(num_docs):
        Nv = np.matrix(np.zeros((num_docs, 1)))
        Nv.itemset(doc, N.item((doc, doc)))
        FtNv = F[doc].transpose() * N.item((doc, doc))
        doc_weights = np.array(N * (F * FtNv)).transpose()
        neighbors = np.argsort(doc_weights)[0]
        doc_neighbors[doc] = [(neighbor, doc_weights.item(neighbor))
                              for neighbor in neighbors[-max_k:]]
        if doc % 10 == 9:
            if verbose:
                print 'Processed %d out of %d documents' % (doc + 1, num_docs)
    if verbose: print 'Generated folded graph'

    for k in ks:
        filename = '%s-knn-k%d' % (data_set, k)
        with open_graph_file(filename) as graph:
            datawriter = csv.writer(graph, delimiter='\t')
            for doc in xrange(num_docs):
                for neighbor, weight in doc_neighbors[doc][-k:]:
                    if weight >= 1e-9:
                        datawriter.writerow(
                            [str(doc + 1),
                             str(neighbor + 1), weight])
            if verbose: print 'Wrote graph file %s' % filename
Example #14
def generate_knn_graph(data_set, k, verbose=False):
    data_counts = get_counts(data_set)
    num_docs = data_counts[0]
    num_features = data_counts[1]

    assert k < num_docs

    feature_matrix = np.matrix(np.zeros((num_docs, num_features)))
    words_doc_count = np.zeros(num_features)
    is_tfidf = False
    docs = set()
    with open_data_file(data_set) as data:
        datareader = csv.reader(data, delimiter=' ')
        for row in datareader:
            doc = int(row[0]) - 1
            word = int(row[1]) - 1
            if is_tfidf:
                count = float(row[2])
            elif '.' in row[2]:
                count = float(row[2])
                is_tfidf = True
            else:
                count = int(row[2])
            words_doc_count[word] += 1
            docs.add(doc)
            feature_matrix.itemset((doc, word), count)
    if verbose: print 'Loaded test data'

    if verbose: print 'Generating feature matrix'
    if not is_tfidf:
        for doc in xrange(num_docs):
            if doc in docs:
                for word in xrange(num_features):
                    if words_doc_count[word] != 0:
                        count = feature_matrix.item((doc, word))
                        tfidf = math.log(count + 1) * math.log(
                            num_docs / float(words_doc_count[word]))
                        feature_matrix.itemset((doc, word), tfidf)
            if doc % 10 == 9:
                if verbose:
                    print 'Processed %d out of %d documents' % (doc + 1,
                                                                num_docs)
    if verbose: print 'Generated feature matrix'

    normalizing_matrix = np.matrix(np.zeros((num_docs, num_docs)))
    for i in xrange(num_docs):
        f = feature_matrix[i]
        fft = math.sqrt(f * f.transpose())
        if fft < 1e-9:
            normalizing_matrix.itemset((i, i), 0.0)
        else:
            normalizing_matrix.itemset((i, i), 1.0 / fft)
    if verbose: print 'Generated normalizing matrix'

    if verbose: print 'Generating folded graph'
    edges = []
    N = normalizing_matrix
    F = feature_matrix
    for doc in xrange(num_docs):
        Nv = np.matrix(np.zeros((num_docs, 1)))
        Nv.itemset(doc, N.item((doc, doc)))
        FtNv = F[doc].transpose() * N.item((doc, doc))
        doc_weights = np.array(N * (F * FtNv)).transpose()
        nearest_neighbors = np.argsort(doc_weights)
        for neighbor in nearest_neighbors[0][-k:]:
            if doc_weights.item(neighbor) < 1e-9:
                continue
            edges.append(
                ((doc + 1, int(neighbor) + 1), doc_weights.item(neighbor)))
        if doc % 10 == 9:
            if verbose:
                print 'Processed %d out of %d documents' % (doc + 1, num_docs)
    if verbose: print 'Generated folded graph'

    filename = '%s-knn-k%d' % (data_set, k)
    with open_graph_file(filename) as graph:
        datawriter = csv.writer(graph, delimiter='\t')
        for edge, weight in edges:
            datawriter.writerow([edge[0], edge[1], weight])
    if verbose: print 'Wrote graph file %s' % filename
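
Assuming the helper functions these examples rely on (get_counts, open_data_file, open_graph_file, MultiLSHasher, and the get_*_features loaders) are importable from the same project, a driver might look like the sketch below; the data set name '20news' is made up.

# Hypothetical driver; '20news' is a made-up data set name and the functions
# are the ones defined in the examples above.
generate_baseline_graph('20news', verbose=True)
generate_lsh_graph('20news', num_hashes=3, num_bits=5, verbose=True)
generate_knn_graph('20news', k=10, verbose=True)
generate_knn_graphs('20news', ks=[5, 10, 20], verbose=True)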