-
Notifications
You must be signed in to change notification settings - Fork 0
/
train.py
82 lines (67 loc) · 2.93 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import transcript
import scdb
import simplejson
import scores
from os import listdir
from os.path import isfile, join
# Feature-extraction script (Python 2: uses raw_input and dict.iteritems).
#
# For every regular file in `mypath` it:
#   1. derives a docket number from the file name,
#   2. prints the winning party reported by scdb for that docket,
#   3. waits for the user to press Enter,
#   4. parses the transcript and appends one JSON-encoded feature vector per
#      advocate (petitioners first, then respondents) to the scores file.

mypath = "transcripts"
SCORES_PATH = "scores"


def _docket_number_from_filename(filename):
    """Return the part of *filename* that precedes its first underscore.

    If the name contains no underscore the whole name is returned (the
    previous find()-based slice silently chopped the last character in
    that case).
    """
    # NOTE(review): an earlier revision also probed the name for 'q' but
    # discarded the result; if such file names need special handling,
    # it belongs here.
    return filename.partition('_')[0]


def _append_feature_vectors(arguments_for_side):
    """Score every advocate of one side and append the results to SCORES_PATH.

    arguments_for_side is iterated as (advocate, argument) pairs. Each
    advocate's statements are turned into features, flattened, normalized
    and converted to a feature vector, which is written as one JSON line.
    """
    for advocate, argument in arguments_for_side.iteritems():
        statements = transcript.get_statements_in_argument(argument, advocate)
        # The return value is not used; the call is kept in case it has
        # side effects -- TODO confirm and remove if it is pure.
        scores.get_number_of_words_per_speaker(statements)
        features = scores.get_features_from_statements(statements)
        flat_features = scores.flatten_features(features)
        # Normalize *this* advocate's features. The respondent branch used
        # to skip this step and reuse whatever `normalized` was left over
        # from the last petitioner.
        normalized = scores.normalize_feature_list(flat_features)
        myscores = scores.get_feature_vector(normalized)
        print("Number of features: %d" % len(myscores))
        # Append per advocate so already-computed vectors are on disk even
        # if a later transcript fails.
        with open(SCORES_PATH, "a") as scores_file:
            scores_file.write(simplejson.dumps(myscores) + "\n")


filenames = [name for name in listdir(mypath) if isfile(join(mypath, name))]
files = [join(mypath, name) for name in filenames]

# Start every run from a fresh scores file containing only the header line.
with open(SCORES_PATH, "w") as scores_file:
    scores_file.write("Scores:\n")

for filename, path in zip(filenames, files):
    print("Now processing " + path)
    docket_number = _docket_number_from_filename(filename)
    print("Checking database for docket #%s" % docket_number)
    print("Winner identified: %s" % scdb.get_winning_party(docket_number))
    # Interactive pause: wait for Enter before parsing the transcript.
    raw_input()
    the_transcript = transcript.get_transcript_from_PDF(path)
    petitioners, respondents = transcript.get_petitioners_and_respondents(the_transcript)
    argument = transcript.get_argument(the_transcript)
    arguments_by_advocate = transcript.get_arguments_by_advocate(petitioners, respondents, argument)
    _append_feature_vectors(arguments_by_advocate["petitioner"])
    _append_feature_vectors(arguments_by_advocate["respondent"])