/
statread.py
216 lines (203 loc) · 9.37 KB
/
statread.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
import MySQLdb as mdb
import itertools
import os
import keywordadd as ka
import matplotlib.pyplot as plt
import argparse
import collections
# Command-line interface. `keyword` and `database` are required positionals
# (in that order); everything else is optional. None of the options declares
# an action, so each one expects a value; --year and --ngram are only ever
# tested for truthiness further down, so any non-empty value switches them on.
parser = argparse.ArgumentParser()
for _cli_name in ('keyword', '--keyword2', 'database', '--file', '--year', '--ngram'):
    parser.add_argument(_cli_name)
args = parser.parse_args()

# One shared connection/cursor for the whole script; every helper below reads
# the module-level `cur`.
dbs = mdb.connect(user="root", passwd="root", db=args.database)
cur = dbs.cursor()

# One shared figure/axes pair that plotOne()/plotTwo() draw on and save.
fig, ax = plt.subplots()
def uniTable(table_char):
    """Return the name of a table whose name contains *table_char*.

    uniTable() and its sister function uniCol() help bridge the gap between
    MySQL databases that hold similar information but are formatted
    separately: instead of hard-coding a table name, the caller passes a
    fragment that the name is expected to contain. The databases still need
    some level of naming similarity for this to be useful.

    table_char -- substring to search for among the table names of the
                  database given on the command line (args.database).

    Returns the first column of the first row MySQL reports for the pattern.
    Raises IndexError when no table name contains the fragment.
    """
    cur.execute("SHOW TABLES FROM %s LIKE '%%%s%%'" % (args.database, table_char))
    matches = cur.fetchall()
    if not matches:
        # Same exception type as the bare val[0][0] lookup used to raise, but
        # with a message that names what was being searched for.
        raise IndexError(
            "no table in database %r has a name containing %r"
            % (args.database, table_char)
        )
    return matches[0][0]
def uniCol(table, col_char, type):
    """Return the name of a column of *table* matched by name fragment and type.

    table    -- table whose columns are listed.
    col_char -- substring the column name must contain.
    type     -- substring the column's type description must contain
                (the callers in this file pass 'int', 'text' or 'varchar').

    Returns the first element of the first matching SHOW COLUMNS row, or
    None when no listed column has a matching type.
    """
    cur.execute("SHOW COLUMNS FROM %s LIKE '%%%s%%'" % (table, col_char))
    for column in cur.fetchall():
        # column[0] is what gets returned, column[1] is what `type` is
        # checked against.
        if type in column[1]:
            return column[0]
    return None
def tup2str(tup):
    """Return the items of *tup*, each converted with str(), joined by commas."""
    pieces = map(str, tup)
    return ','.join(pieces)
def tup2int(tup):
    """Return int() of the comma-joined string form of the items of *tup*.

    In practice this is only meaningful for a one-element tuple: with more
    elements the joined text contains a comma and int() raises ValueError.
    """
    joined = ','.join(map(str, tup))
    return int(joined)
def plotUnion(lst):
    """Write a text report identifying every document listed in *lst*.

    Despite its name this function draws nothing. For each document id it
    looks up the title, journal and year columns in the documents table and
    the last/first name columns in the authors table, prints the collected
    records and writes them, one per line, to "<keyword>and<keyword2>.txt".

    lst -- document ids (anything int() accepts) of the documents in which
           both keywords occur; intended to come from the union file that
           makeUnion() in keywordadd.py produces.

    Relies on the module-level names cur, table, year, docId and args.
    """
    author_table = uniTable('utho')
    title = uniCol(table, 'itl', 'text')
    journal = uniCol(table, 'our', 'text')
    last = uniCol(author_table, 'ast', 'varchar')
    first = uniCol(author_table, 'irst', 'varchar')

    combine = []
    for doc in lst:
        ID = int(doc)
        # Identification of the article itself.
        cur.execute("SELECT %s, %s, %s FROM %s WHERE %s=%d" % (title, journal, year, table, docId, ID))
        info = tup2str(cur.fetchall())
        # Its authors. NOTE(review): this reuses the documents table's id
        # column name for the authors table -- confirm both share it.
        cur.execute("SELECT %s, %s FROM %s WHERE %s=%d" % (last, first, author_table, docId, ID))
        authors = [tup2str(author) for author in cur.fetchall()]
        combine.append([info, authors])
    print(combine)

    # The context manager guarantees the report is flushed and closed; the
    # handle used to be left open.
    with open("%sand%s.txt" % (args.keyword, args.keyword2), "w") as report:
        for item in combine:
            report.write("%s\n" % item)
def plotOne(arg):
    """Plot how often args.keyword occurs, grouped by the column *arg*.

    arg -- name of the column of `table` to group by: the journal-id column,
           or the publication-year column when --year is given.

    The keyword search itself is delegated to keywordadd (ka.writeOne) so
    that other programs can reuse it; this function then reads
    "<keyword>.txt", one document id per line (presumably written by that
    call -- verify against keywordadd). Each document's *arg* value is
    looked up and tallied against the module-level `rows` (the distinct
    values of *arg*). With --year a line plot is drawn, otherwise a
    histogram; the figure is saved to args.file, or "<keyword>.png" when no
    file was given, and then shown.
    """
    if args.ngram:
        # Was `ngram=true`, which raises NameError as soon as --ngram is used.
        ka.writeOne(args.keyword, args.database, "%s" % docId, ngram=True)
    else:
        ka.writeOne(args.keyword, args.database, "%s" % docId)

    with open("%s.txt" % args.keyword) as handle:
        docnumbers = [line.strip() for line in handle]

    # Look every document up exactly once. The query does not depend on the
    # row being compared, so repeating it for each entry of `rows` only
    # multiplied the number of round trips.
    found = []
    for number in docnumbers:
        cur.execute("SELECT %s FROM %s WHERE %s = %d" % (arg, table, docId, int(number)))
        check = cur.fetchall()
        if check:
            found.append(check[0])

    # Same append order as before: by row first, then by document.
    stats = []
    for row in rows:
        for value in found:
            if row == value:
                # A NULL group value is recorded as -1.
                stats.append(-1 if row[0] is None else tup2int(row))
    print(stats)

    # `rows` is a tuple of 1-tuples; flatten it so matplotlib can use it.
    rows_ = list(itertools.chain(*rows))
    if args.year:
        years = [stats.count(y) for y in rows_]
        plt.plot(rows_, years, marker='o', linestyle='--', color='r', label=args.keyword)
        ax.set_xticklabels(rows_, rotation=90, fontsize='small')
        plt.ylim([0, max(years)])
    else:
        # Unit-wide bins from the smallest to the largest group value.
        ax.hist(stats, bins=range(min(rows_), max(rows_) + 2, 1), alpha=0.5, label=args.keyword)
    ax.legend(loc='upper right')
    ax.set_xticks(rows_)
    # NOTE(review): the axis is labelled "JournalID" even in --year mode,
    # where the x values are publication years -- confirm whether intended.
    ax.set_xlabel("JournalID")
    ax.set_ylabel("Frequency")
    ax.set_title("Occurrences of %s" % args.keyword)
    if args.file:
        fig.savefig(args.file)
    else:
        fig.savefig("%s.png" % args.keyword)
    plt.show()
def _read_ids(path):
    """Return the stripped lines of the text file *path* and close it."""
    with open(path) as handle:
        return [line.strip() for line in handle]


def _keyword_stats(doc_ids, arg):
    """Return the *arg* value of each document in *doc_ids*, matched against `rows`.

    Every document is queried once. A value is kept only when it equals one
    of the module-level `rows`; a NULL value is recorded as -1. The result is
    ordered by row first, then by document.
    """
    found = []
    for doc_id in doc_ids:
        cur.execute("SELECT %s FROM %s WHERE %s = %d" % (arg, table, docId, int(doc_id)))
        check = cur.fetchall()
        if check:
            found.append(check[0])
    tally = []
    for row in rows:
        for value in found:
            if row == value:
                tally.append(-1 if row[0] is None else tup2int(row))
    return tally


def plotTwo(arg):
    """Plot the occurrences of two keywords, and of their union, grouped by *arg*.

    arg -- name of the column of `table` to group by: the journal-id column,
           or the publication-year column when --year is given.

    Follows the same process as plotOne() for args.keyword and
    args.keyword2, and additionally tallies the documents that contain both
    keywords. ka.makeUnion() is called on the two id files; if "union.txt"
    exists afterwards its ids are tallied and the file is deleted. With
    --year three line plots are drawn, otherwise overlaid histograms. The
    figure is saved to args.file, or "<keyword>and<keyword2>.png" when no
    file was given, and then shown.
    """
    if args.ngram:
        # Was `ngram=true`, which raises NameError as soon as --ngram is used.
        ka.writeTwo(args.keyword, args.keyword2, args.database, "%s" % docId, ngram=True)
    else:
        ka.writeTwo(args.keyword, args.keyword2, args.database, "%s" % docId)
    doc = "%s.txt" % args.keyword
    doc2 = "%s.txt" % args.keyword2
    ka.makeUnion(doc, doc2)

    docnumbers = _read_ids(doc)
    docnum = _read_ids(doc2)

    union_ids = []
    stats3 = []
    if os.path.isfile("union.txt"):
        union_ids = _read_ids("union.txt")
        stats3 = _keyword_stats(union_ids, arg)
        # Remove the file once its contents are in memory so that a stale
        # union cannot produce false positives in a later search.
        os.remove("union.txt")

    # Each keyword gets its own tally. The NULL marker (-1) used to be
    # appended to the first keyword's list from all three loops, and its
    # guard (`rows[i] is None`) could never be true for a row tuple.
    stats2 = _keyword_stats(docnum, arg)
    stats = _keyword_stats(docnumbers, arg)

    # `rows` is a tuple of 1-tuples; flatten it so matplotlib can use it.
    rows_ = list(itertools.chain(*rows))
    if args.year:
        years = [stats.count(y) for y in rows_]
        plt.plot(rows_, years, marker='o', linestyle='--', color='r', label=args.keyword)
        years2 = [stats2.count(y) for y in rows_]
        plt.plot(rows_, years2, marker='o', linestyle='--', color='b', label=args.keyword2)
        if stats3:
            years3 = [stats3.count(y) for y in rows_]
            plt.plot(rows_, years3, marker='o', linestyle='--', color='g',
                     label="%s and %s" % (args.keyword, args.keyword2))
        # Scale to the taller curve. The old test compared the lengths of the
        # two lists, which are always equal, so keyword 1 could be clipped.
        plt.ylim([0, max(years + years2)])
    else:
        bins = range(min(rows_), max(rows_) + 2, 1)
        ax.hist(stats, bins=bins, alpha=0.5, label=args.keyword)
        ax.hist(stats2, bins=bins, alpha=0.5, label=args.keyword2)
        if stats3:
            ax.hist(stats3, bins=bins, alpha=0.5,
                    label="%s and %s" % (args.keyword, args.keyword2))
            # plotUnion() looks documents up by id, so hand it the union's
            # document ids rather than the tallied column values.
            # NOTE(review): the original nesting of this call was lost with
            # the file's indentation; confirm it belongs to this branch.
            plotUnion(union_ids)
    ax.legend(loc='upper right')
    ax.set_xticks(rows_)
    if args.year:
        # The earlier rotation=90 labelling was overridden by this call, so
        # only this one is kept, after the tick positions are fixed.
        ax.set_xticklabels(rows_, rotation=30, fontsize='small')
    # NOTE(review): the axis is labelled "JournalID" even in --year mode,
    # where the x values are publication years -- confirm whether intended.
    ax.set_xlabel("JournalID")
    ax.set_ylabel("Frequency")
    ax.set_title("Occurrences of %s and %s" % (args.keyword, args.keyword2))
    if args.file:
        fig.savefig(args.file)
    else:
        fig.savefig("%sand%s.png" % (args.keyword, args.keyword2))
    plt.show()
def plotYear():
    """Plot by publication year: two keywords if --keyword2 was given, else one.

    Passes the module-level `year` column name to plotTwo() or plotOne().
    """
    plotter = plotTwo if args.keyword2 else plotOne
    plotter(year)
# Script entry: discover the schema, load the distinct group values into the
# module-level `rows`, then dispatch to the matching plot function.
#
# The table and column lookups are identical for every mode, so they are done
# once here. These names are module-level on purpose: the plot functions read
# table, year, docId and rows as globals.
table = uniTable('oc')
year = uniCol(table, 'ear', 'int')
docId = uniCol(table, 'id', 'int')

if args.year:
    # Group by publication year, skipping documents without one.
    # The discovered `table` is used instead of a hard-coded "docs" so these
    # queries hit the same table as every per-document lookup.
    cur.execute("SELECT DISTINCT %s FROM %s WHERE %s IS NOT NULL ORDER BY %s ASC" % (year, table, year, year))
    rows = cur.fetchall()
    plotYear()
else:
    # Group by journal id.
    jID = uniCol(table, 'our', 'int')
    cur.execute("SELECT DISTINCT %s FROM %s" % (jID, table))
    rows = cur.fetchall()
    arg = jID
    if args.keyword2:
        plotTwo(arg)
    else:
        plotOne(arg)