/
KohonenWordCategorizer.py
101 lines (82 loc) · 3.43 KB
/
KohonenWordCategorizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# KohonenWordCategorizer
# Sam Goree
# Uses PyBrain's implementation of Kohonen Self-Organizing Maps to sort a given body of data into categories
# This version reverses it: it clusters words based on their occurrence in files.
# Usage: python KohonenWordCategorizer.py arg1 arg2 [arg3]
# arg1 is the file of data, separated onto separate lines, if there is no file, it will read from stdin
# arg2 is the number of categories, should be a perfect square (e.g. 4, 9, 16), default is 4
# arg3 is --words if you want to print out the top words in each cluster rather than each word associated with a cluster
import pylab
from scipy import random
from pybrain.structure.modules import KohonenMap
import argparse
import sys
from math import sqrt
import numpy as np
from types import *
# Module-level defaults. The __main__ block at the bottom of the file rebinds
# infile and categories from the parsed command-line arguments.
infile = sys.stdin
# NOTE(review): categorize() derives the map's side length from its own
# `categories` argument and does not read this global.
nnodes = 2
categories = 4
NWORDS = 7 # number of words to print out per category for the -words option
# reads and returns data from a file
# infile is the file of input
def readData(infile):
    """Read comma-separated lines and return them transposed as a numpy array.

    Every non-blank line of ``infile`` is split on ',' into fields.  The rows
    are then transposed, so ``result[j][i]`` holds field ``j`` of input line
    ``i``: each input column becomes one row of the result.

    Args:
        infile: an iterable of text lines (an open file, ``sys.stdin`` or a
            list of strings).

    Returns:
        A 2-D numpy array of strings with one row per input column, or an
        empty array when the input contains no non-blank lines.

    Raises:
        ValueError: if a line does not have the same number of fields as the
            first non-blank line.
    """
    # process the data
    rows = []
    for lineno, line in enumerate(infile, start=1):
        # Drop the line terminator so it does not stay glued to the last field.
        line = line.rstrip('\r\n')
        if not line.strip():
            # Blank lines (typically a trailing one) carry no data.
            continue
        fields = line.split(',')
        if rows and len(fields) != len(rows[0]):
            raise ValueError(
                'line %d has %d fields, expected %d'
                % (lineno, len(fields), len(rows[0]))
            )
        rows.append(fields)

    if not rows:
        return np.array([])

    # swap the axes: zip(*rows) groups the j-th field of every line together
    return np.array([list(column) for column in zip(*rows)])
# runs the kohonen map and prints either raw results or words for Koushik's GA (based on the value of words)
# categories is the number of nodes in the kohonen map
def categorize(categories, data, words, iterations=25000):
    """Train a Kohonen map on the rows of ``data`` and print the clustering.

    Row 0 and the last row of ``data`` are never classified.  For every other
    row, element 0 is the label (the word) and the remaining elements are the
    values fed to the map.

    Args:
        categories: number of nodes in the Kohonen map; should be a perfect
            square because the map is a square grid with side
            ``int(sqrt(categories))``.
        data: sequence of rows as described above (e.g. the array returned by
            ``readData``).
        words: if false, print one ``label,winner`` line per classified row;
            if true, print ``NWORDS`` lines, each listing one word per
            cluster, ordered by descending row total.
        iterations: number of training steps (one forward and one backward
            pass each).  Defaults to 25000.

    Raises:
        ValueError: if ``categories`` is smaller than 1 or ``data`` has fewer
            than three rows.
    """
    if categories < 1:
        raise ValueError('categories must be at least 1')
    if len(data) < 3:
        raise ValueError('data needs at least three rows to categorize')

    # sqrt() returns a float; the side length is used as a grid dimension,
    # so it has to be an integer.
    side = int(sqrt(categories))

    # NOTE(review): the last row is skipped throughout — presumably it is the
    # empty column produced by input lines ending in ','; confirm against the
    # data files.
    points = data[1:-1]

    # make a kohonen map
    som = KohonenMap(len(data[1]) - 1, side)

    # train the network
    for _ in range(iterations):
        # one forward and one backward (training) pass on a random line
        # NOTE(review): assumes `random` is numpy-style with an exclusive
        # upper bound, so the last row is never drawn — confirm.
        som.activate(data[random.randint(1, len(data) - 1)][1:])
        som.backward()

    if not words:
        for point in points:
            # find the category for a point of data
            print(point[0] + ',' + str(som.activate(point[1:])))
        return

    # make wordlist output similar to ICA for use in GAs
    clusters = [[] for _ in range(categories)]
    for point in points:
        # find the cluster for this word and add in the word's data
        winner = som.activate(point[1:])
        clusters[int(winner[0] * side + winner[1])].append(point)

    def negTotal(point):
        # Convert on the fly instead of writing floats back into the row:
        # the rows may be views into a string array, where such a write-back
        # would store text again and break the summation.
        return -sum(float(value) for value in point[1:])

    # Rank every cluster once, up front, rather than once per output line.
    ranked = [
        [str(point[0]) for point in sorted(cluster, key=negTotal)]
        for cluster in clusters
    ]

    # print out the clusters
    for i in range(NWORDS):
        for names in ranked:
            if not names:
                # NOTE(review): unlike the words below, 'EMPTY' is written
                # without a trailing ','; kept as-is because the consumer of
                # this output is not visible here.
                sys.stdout.write('EMPTY')
                continue
            # wrap around when a cluster has fewer than NWORDS words
            sys.stdout.write(names[i % len(names)] + ',')
        sys.stdout.write('\n')
if __name__ == "__main__":
    # handle arguments
    parser = argparse.ArgumentParser(description='Sort a body of data into categories')
    # Both positionals are optional and fall back to the module-level defaults
    # (stdin and 4 categories), matching the usage notes in the file header.
    parser.add_argument('infile', nargs='?', type=argparse.FileType('r'), default=infile, help='the file of data, separated onto separate lines')
    parser.add_argument('categories', nargs='?', type=int, default=categories, help='the number of categories, should be a perfect square')
    parser.add_argument('--words', '-w', '-words', dest='words', action='store_true', help='Prints the words associated with each category at the end of all of the categories')
    args = parser.parse_args()
    infile = args.infile
    categories = args.categories
    words = args.words
    try:
        data = readData(infile)
    finally:
        # argparse opened the file for us; release it once it has been read,
        # but leave stdin alone.
        if infile is not sys.stdin:
            infile.close()
    categorize(categories, data, words)