/
similarity.py
238 lines (199 loc) · 8.57 KB
/
similarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
"""
http://www.megamillions.com/numbers/
https://bitcointalk.org/index.php?topic=34586.msg2028001;topicseen#msg2028001
<textarea id="wpTextbox1" name="wpTextbox1" cols="80" rows="25" readonly="">
==Articles==
===Collated===
*[[Home Remedy]] - Source: [http://en.wikibooks.org/wiki/Ethnomedicine/Ethnomedicine_by_Illness Ethnomedicine by Illness] and [http://en.wikibooks.org/wiki/Ethnomedicine/Home_Remedies Home Remedies] - Improvement: Combined wikibooks with wikipedia articles
===Original===
*[[:Arthritis]] - Source: Original
*[[:Devcoin]] - Source: [https://github.com/Unthinkingbit/charity/blob/master/devcoin.html https://github.com/Unthinkingbit/charity/blob/master/devcoin.html]
==Link==
https://raw.github.com/Unthinkingbit/charity/master/devcoin.html
==Tip==
Coin Address: 17vec4jQGCzMEsTnivizHPaowE715tu2CB
</textarea>
Account is a program to generate a devcoin receiver file from a bitcoinshare, bounty, devcoinshare and peer file.
This is meant to be used by devcoin accountants and auditors to create and check the receiver files. The account file has a list of addresses and shares. Anything after a dash is a comment.
==Commands==
===Help===
The -h option, the -help option, will print the help, which is this document. The example follows:
python account.py -h
===Input===
Default is https://raw.github.com/Unthinkingbit/charity/master/account_3.csv
The -input option sets the input file name. The example follows:
python account.py -input https://raw.github.com/Unthinkingbit/charity/master/account_3.csv
An example of an account information input file is at:
https://raw.github.com/Unthinkingbit/charity/master/account_3.csv
===Output===
Default is test_receiver.csv
The -output option sets the output. If the output ends with stderr, the output will be sent to stderr. If the output ends with stdout, the output will be sent to stdout. If the output does not end with stderr or stdout, the output will be written to a file by that name, with whatever suffix the input file has. The example follows:
python genereceiver.py -output test_receiver.csv
An example of a genereceiver output file is at:
https://raw.github.com/Unthinkingbit/charity/master/test_receiver_3.csv
==Install==
For genereceiver to run, you need Python 2.x; almoner will probably not run with Python 3.x. To check if it is on your machine, in a terminal type:
python
If python 2.x is not on your machine, download the latest python 2.x, which is available from:
http://www.python.org/download/
"""
import almoner
import cStringIO
import devtome
import os
import shutil
import sys
import zipfile
__license__ = 'MIT'
def addToAuthorDictionary(authorDictionary, name, text):
    """Record *name* as the author of every article listed in *text*.

    The text is scanned line by line.  A line containing '==' is treated as
    a heading: it switches article collection on when it mentions
    'collated' or 'original' and off otherwise.  While collection is on,
    each lower-cased line is handed to devtome.getLinkName and every
    non-empty title it returns is mapped to *name* in authorDictionary.

    authorDictionary is modified in place; nothing is returned.
    """
    collecting = False
    for rawLine in almoner.getTextLines(text):
        lowered = rawLine.strip().lower()
        if '==' in lowered:
            # A heading decides whether the lines that follow list articles.
            collecting = 'collated' in lowered or 'original' in lowered
        if not collecting:
            continue
        # presumably getLinkName returns '' when the line holds no link -- verify in devtome
        title = devtome.getLinkName(lowered, name)
        if title != '':
            authorDictionary[title] = name
def getArticles():
    """Build the list of Article objects from the devtome_articles.zip archive.

    The archive is extracted into a 'devtome_articles' directory in the
    current working directory.  Files whose name starts with 'wiki:user:'
    are treated as author pages and feed the title -> author dictionary;
    every other file becomes an Article when it has more than 40 long words.
    Each article whose name appears in the author dictionary then gets its
    author attribute set.

    The extraction directory is removed before returning, and also when
    reading or parsing a file raises, so a failed run does not leave the
    extracted tree behind.

    Returns the list of Article objects.
    """
    articles = []
    authorDictionary = {}
    fileNameRoot = 'devtome_articles'
    archiveFileName = fileNameRoot + '.zip'
    userPrefix = 'wiki:user:'
    # The with statement closes the archive even if extraction fails.
    with zipfile.ZipFile(archiveFileName, 'r') as zipArchive:
        zipArchive.extractall(fileNameRoot)
    try:
        for name in os.listdir(fileNameRoot):
            filePath = os.path.join(fileNameRoot, name)
            text = almoner.getFileText(filePath)
            if name.startswith(userPrefix):
                addToAuthorDictionary(authorDictionary, name[len(userPrefix):], text)
            else:
                longWords = getLongWords(text)
                # Short pages are skipped; they carry too few words to compare.
                if len(longWords) > 40:
                    articles.append(Article(longWords, name))
    finally:
        # Always clean up the extracted files, even when processing raised.
        shutil.rmtree(fileNameRoot)
    for article in articles:
        if article.name in authorDictionary:
            article.author = authorDictionary[article.name]
    return articles
def getGreatestSimilarity(article):
    """Return article.greatestSimilarity; used as a sort key for articles."""
    greatest = article.greatestSimilarity
    return greatest
def getLongWords(text):
    """Return the lower-cased alphabetic words of *text* longer than 4 characters.

    Commas, semicolons, newlines, carriage returns and tabs are treated as
    separators.  A single trailing period is removed from each word before
    it is checked; words that still contain any non-letter are dropped.
    Duplicates are kept and the original order is preserved.
    """
    cleaned = text
    for separator in (',', ';', '\n', '\r', '\t'):
        cleaned = cleaned.replace(separator, ' ')
    cleaned = cleaned.lower()
    # Strip at most one trailing period, so 'word..' is still rejected below.
    candidates = [token[:-1] if token.endswith('.') else token for token in cleaned.split()]
    return [candidate for candidate in candidates if len(candidate) > 4 and candidate.isalpha()]
def getSimilarityText(articles):
    """Return CSV text naming the most similar other article for each article.

    Steps:
    1. Sum the word counts of all articles into one dictionary and
       normalize it with normalizeFrequencyDictionary.
    2. Let every article derive its distinct word set from that dictionary.
    3. Compare every article against all the others via setSimilar.
    4. Sort *articles* in place, greatest similarity first, and write one
       line per article under the 'Name,Other,Similarity (%)' header.

    Note that *articles* is reordered and its elements are updated as a
    side effect.
    """
    totalFrequencyDictionary = {}
    for article in articles:
        for word, count in article.frequencyDictionary.items():
            totalFrequencyDictionary[word] = totalFrequencyDictionary.get(word, 0) + count
    normalizeFrequencyDictionary(totalFrequencyDictionary)
    for article in articles:
        article.setDistinct(totalFrequencyDictionary)
    for position in range(len(articles)):
        # Every article is compared with all articles except itself.
        others = articles[:position] + articles[position + 1:]
        articles[position].setSimilar(others)
    # Ascending sort followed by a reverse puts the most similar pairs first.
    articles.sort(key=getGreatestSimilarity)
    articles.reverse()
    output = cStringIO.StringIO()
    output.write('Name,Other,Similarity (%)\n')
    for article in articles:
        article.addLine(output)
    return output.getvalue()
def getSockpuppetText(articles):
    """Return CSV text of similar article pairs written by different authors.

    Each article decides in addSockpuppetLine whether it contributes a row,
    so the articles must already have been compared (mostSimilar set).
    Rows follow the current order of *articles*.
    """
    output = cStringIO.StringIO()
    header = 'Author,Other Author,Name,Other,Similarity (%)\n'
    output.write(header)
    for candidate in articles:
        candidate.addSockpuppetLine(output)
    return output.getvalue()
def normalizeFrequencyDictionary(frequencyDictionary):
    """Divide each frequency by the total count, in place.

    After the call the values are floats that sum to 1.0.  When the
    dictionary is empty or its values sum to zero there is nothing to
    scale by, so it is left unchanged instead of raising
    ZeroDivisionError.
    """
    totalCount = sum(frequencyDictionary.values())
    if totalCount == 0:
        return
    totalCountInverse = 1.0 / float(totalCount)
    # Only existing keys are reassigned, so iterating the dictionary is safe.
    for frequencyKey in frequencyDictionary:
        frequencyDictionary[frequencyKey] = totalCountInverse * float(frequencyDictionary[frequencyKey])
def writeOutput(arguments):
    """Print the help text, or build and write both similarity reports.

    When *arguments* is empty or contains '-h' or '-help', the module
    docstring is printed and nothing else happens.  Otherwise the articles
    are loaded, both CSV texts are generated and each is passed to
    almoner.sendOutputTo; a confirmation is printed for each one that call
    reports as written.
    """
    wantsHelp = '-h' in arguments or '-help' in arguments
    if wantsHelp or len(arguments) == 0:
        print(__doc__)
        return
    # NOTE(review): both destinations are read from the same 'output' option, so a
    # user-supplied value presumably applies to both files -- confirm this is intended.
    outputSimilarityTo = almoner.getParameter(arguments, 'similarity.csv', 'output')
    outputSockpuppetTo = almoner.getParameter(arguments, 'similarity_sockpuppet.csv', 'output')
    articles = getArticles()
    # The similarity pass must run first; it computes what the sockpuppet pass reads.
    similarityText = getSimilarityText(articles)
    sockpuppetText = getSockpuppetText(articles)
    similarityWritten = almoner.sendOutputTo(outputSimilarityTo, similarityText)
    if similarityWritten:
        print('The similarity file has been written to:\n%s\n' % outputSimilarityTo)
    sockpuppetWritten = almoner.sendOutputTo(outputSockpuppetTo, sockpuppetText)
    if sockpuppetWritten:
        print('The sockpuppet file has been written to:\n%s\n' % outputSockpuppetTo)
class Article:
    """An article reduced to its word frequencies, for similarity comparison.

    Attributes:
        author: author name, '' until assigned by the caller.
        name: article name.
        frequencyDictionary: word -> count; setDistinct rewrites the values
            as relative distinctiveness ratios.
        distinctSet: the more distinctive half of the words (set by setDistinct).
        greatestSimilarity: best similarity found by setSimilar, -1 before
            any comparison or when there was nothing to compare with.
        mostSimilar: the Article that produced greatestSimilarity, or None.
    """

    def __init__(self, longWords, name):
        """Count the occurrences of each word in *longWords*."""
        self.author = ''
        self.name = name
        self.frequencyDictionary = {}
        for longWord in longWords:
            self.frequencyDictionary[longWord] = self.frequencyDictionary.get(longWord, 0) + 1
        # Defined up front so the object is usable before setDistinct / setSimilar run.
        self.distinctSet = set()
        self.greatestSimilarity = -1
        self.mostSimilar = None

    def __repr__(self):
        """Return 'name, author'."""
        return '%s, %s' % (self.name, self.author)

    def addLine(self, cString):
        """Write 'name,other name,similarity percent' to the similarity csv cString.

        Nothing is written when no other article was available to compare with.
        """
        if self.mostSimilar is None:
            return
        cString.write('%s,%s,%s\n' % (self.name, self.mostSimilar.name, round(100.0 * self.greatestSimilarity, 1)))

    def addSockpuppetLine(self, cString):
        """Write a sockpuppet csv row when the most similar article has another author.

        Nothing is written when the authors match or when there is no most
        similar article.
        """
        if self.mostSimilar is None:
            return
        if self.author != self.mostSimilar.author:
            similarity = round(100.0 * self.greatestSimilarity, 1)
            cString.write('%s,%s,%s,%s,%s\n' % (self.author, self.mostSimilar.author, self.name, self.mostSimilar.name, similarity))

    def setDistinct(self, totalFrequencyDictionary):
        """Set distinctSet to the words most characteristic of this article.

        The article's counts are normalized, then each is divided by the
        word's frequency in totalFrequencyDictionary.  Words whose ratio is
        at least the median ratio form the distinct set.
        """
        if not self.frequencyDictionary:
            # No words: nothing to normalize and no median to take.
            self.distinctSet = set()
            return
        normalizeFrequencyDictionary(self.frequencyDictionary)
        for frequencyKey in self.frequencyDictionary:
            self.frequencyDictionary[frequencyKey] /= totalFrequencyDictionary[frequencyKey]
        # sorted() and // keep this working where values() is a view and / is true division.
        frequencies = sorted(self.frequencyDictionary.values())
        minimumDistinctiveness = frequencies[len(frequencies) // 2]
        self.distinctSet = set()
        for frequencyKey, frequency in self.frequencyDictionary.items():
            if frequency >= minimumDistinctiveness:
                self.distinctSet.add(frequencyKey)

    def setSimilar(self, articles):
        """Find the article in *articles* whose distinct words overlap most.

        Similarity is the size of the intersection of the two distinct sets
        divided by the size of the larger set, or 0.0 when both are empty.
        The first article with the greatest similarity wins.  With an empty
        *articles*, greatestSimilarity stays -1 and mostSimilar stays None.
        """
        self.greatestSimilarity = -1
        self.mostSimilar = None
        for article in articles:
            intersection = len(self.distinctSet.intersection(article.distinctSet))
            largerSize = max(len(self.distinctSet), len(article.distinctSet))
            # Two empty sets share nothing; avoid dividing by zero.
            similarity = float(intersection) / float(largerSize) if largerSize else 0.0
            if similarity > self.greatestSimilarity:
                self.greatestSimilarity = similarity
                self.mostSimilar = article
def main():
    """Run writeOutput with the command line arguments in sys.argv."""
    commandLine = sys.argv
    writeOutput(commandLine)
# Run main only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()