#!/usr/local/python-2.7.5/bin/python
""" feature1.py
    -----------
    @author = Ankai Lou
"""
import operator
from tfidf import tfidf
###############################################################################
########## global variables for single-point of control over change ###########
###############################################################################
datafile = 'dataset1.csv'
###############################################################################
############### function for printing dataset to .csv document ################
###############################################################################
def generate_csv(documents, features, weights):
    """ function: generate_csv
        ----------------------
        print feature vectors & class labels to .csv file
        :param documents: list of document objects to represent
        :param features: sorted list of features to represent
        :param weights: dictionary of tf-idf scores keyed by document index
    """
    dataset = open(datafile, "w")
    # header row: document id, one column per feature, two class-label columns
    dataset.write('id\t')
    for feature in features:
        dataset.write(feature)
        dataset.write('\t')
    dataset.write('class-label:topics\t')
    dataset.write('class-label:places\t')
    dataset.write('\n')
    # feature vector for each document
    for i, document in enumerate(documents):
        # document id number
        dataset.write(str(i))
        dataset.write('\t')
        # each tf-idf score
        for feature in features:
            dataset.write(str(weights[i][feature]))
            dataset.write('\t')
        # topics/places class labels, tab-separated to match the header
        dataset.write(str(document['topics']))
        dataset.write('\t')
        dataset.write(str(document['places']))
        dataset.write('\n')
    dataset.close()
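# For illustration, with a hypothetical feature list ['cocoa', 'trade'], the
# rows written above would look like this (values assumed, not real output):
#
#   id      cocoa   trade   class-label:topics      class-label:places
#   0       0.41    0.0     ['cocoa']               ['ghana']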
###############################################################################
###################### function(s) for feature selection ######################
###############################################################################
def select_features(weights):
    """ function: select_features
        -------------------------
        generate reduced feature list for vector generation
        :param weights: dictionary from results of the tf-idf calculations
        :returns: sorted list of terms representing the selected features
    """
    features = set()
    for doc, doc_dict in weights.iteritems():
        # keep the five highest-scoring terms per document
        top = dict(sorted(doc_dict.iteritems(), key=operator.itemgetter(1), reverse=True)[:5])
        for term, score in top.iteritems():
            if score > 0.0:
                features.add(term)
    # sort set into list
    return sorted(features)
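# A minimal sketch of the selection, assuming hypothetical scores:
#
#   weights = { 0 : { 'cocoa' : 0.41, 'trade' : 0.07, 'the' : 0.0 } }
#   select_features(weights)  # -> ['cocoa', 'trade']
#
# 'the' is dropped because its score is not > 0.0; the union of each
# document's top five nonzero terms becomes the global feature list.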
###############################################################################
############## function(s) for generating weighted tf-idf scores ##############
###############################################################################
def generate_weights(documents, lexicon):
    """ function: generate_weights
        --------------------------
        perform tf-idf to generate importance scores for words in documents
        :param documents: list of documents to use in calculations
        :param lexicon: dictionary of 'title' and 'body' word-stem sets
        :returns: dictionary of dictionaries: {"id_" : {"word" : score,...}}
    """
    # weights = { 'document' : { 'word' : score,... },... }
    weights = dict()
    m = tfidf()
    print('Adding documents for TF-IDF...')
    for i, document in enumerate(documents):
        m.addDocument(i, document['words']['title'] + document['words']['body'])
        weights[i] = dict()
    # generate dictionary of { "word" : score } pairs for each document
    print('Generating weight scores for words; this will take time...')
    for word in lexicon['title'] | lexicon['body']:
        # UNCOMMENT FOR SANITY
        # print('Generating weights for word:', word)
        m.get_similarities(word, weights)
    return weights
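# After this step, @weights maps each document index to a term-score
# dictionary, e.g. (hypothetical values):
#
#   { 0 : { 'cocoa' : 0.41, 'rise' : 0.12, ... },
#     1 : { 'grain' : 0.38, 'rise' : 0.09, ... } }
#
# Note: get_similarities() is assumed to fill @weights in place for the
# given word across all registered documents, per the local tfidf module.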
###############################################################################
################ main function for generating refined dataset #################
###############################################################################
def generate_dataset(documents, lexicon):
    """ function: generate_dataset
        --------------------------
        select features from @lexicon for feature vectors
        generate dataset of feature vectors for @documents
        :param documents: list of well-formatted, processable documents
        :param lexicon: dictionary of 'title'/'body' word-stem sets for selecting features
    """
    print('\nGenerating dataset @ %s' % datafile)
    weights = generate_weights(documents, lexicon)
    # generate feature list
    print('Selecting features for the feature vectors @ %s' % datafile)
    features = select_features(weights)
    # write vectors to dataset1.csv
    print('Writing feature vector data @ %s' % datafile)
    generate_csv(documents, features, weights)
    print('Finished generating dataset @ %s' % datafile)
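###############################################################################
############### example usage (sketch) -- not part of pipeline ################
###############################################################################
# A minimal, hypothetical driver showing the input shapes this module expects.
# The real pipeline builds @documents and @lexicon during preprocessing; the
# structures below are assumptions inferred from how they are used above, and
# running this still depends on the local tfidf module.
if __name__ == '__main__':
    sample_documents = [
        { 'words': { 'title': ['cocoa'], 'body': ['price', 'rise'] },
          'topics': ['cocoa'], 'places': ['ghana'] },
        { 'words': { 'title': ['grain'], 'body': ['export', 'rise'] },
          'topics': ['grain'], 'places': ['usa'] },
    ]
    sample_lexicon = { 'title': set(['cocoa', 'grain']),
                       'body': set(['price', 'export', 'rise']) }
    generate_dataset(sample_documents, sample_lexicon)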