-
Notifications
You must be signed in to change notification settings - Fork 0
/
dictionary_ubgrades.py
82 lines (61 loc) · 1.98 KB
/
dictionary_ubgrades.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
# -*- coding: utf-8 -*-
"""
Created on Sun May 12 17:09:36 2013
Just checking on authors' numbers, etc.; sandbox.
@author: Vasya
"""
import gensim
import collections
import genSimLDAlib as gslib
import someBrandFiltering as bf
# Fetch the full thread collection from the brand-filtering helper and keep
# the first hundred entries aside as a small sample.
allthreads = bf.init()
somethreads = allthreads[:100]

# Tally posts per author: every item of allthreads exposes getPosts(), and
# every post carries its author in the msgAuthor attribute.
authPostCount = collections.defaultdict(int)
for thread in allthreads:
    for message in thread.getPosts():
        authPostCount[message.msgAuthor] += 1

# Number of distinct authors; the original run noted 375,569.
len(authPostCount)
# Run every author name through gslib.textCleanUp and then gslib.wordCleanUp
# and collect the results in a set, so names that clean up to the same string
# are counted once.
authorStems = set(
    gslib.wordCleanUp(gslib.textCleanUp(name)) for name in authPostCount
)

# Number of distinct cleaned-up names; the original note reads "27,3418"
# (digit grouping looks mistyped -- TODO confirm the actual figure).
len(authorStems)
# List, in sorted order, every author name that contains a space character.
for name in sorted(authPostCount):
    if ' ' in name:
        print(name)
# Dump all author names in sorted order, one per line.
with open('authList.txt', 'w') as names_out:
    names_out.writelines(name + '\n' for name in sorted(authPostCount))

# Same layout for the de-duplicated, cleaned-up names.
with open('authStemsList.txt', 'w') as stems_out:
    stems_out.writelines(stem + '\n' for stem in sorted(authorStems))
# Build (post count, author) pairs -- count first, so sorting the pairs
# orders them by count, with the author name breaking ties.
revlist = [(count, name) for name, count in authPostCount.items()]

# Write "author count" lines, most prolific authors first.
with open('authCount.txt', 'w') as counts_out:
    for count, name in sorted(revlist, reverse=True):
        counts_out.write(name + " " + str(count) + '\n')

# Number of authors with more than five posts; the original run noted 53,068.
sum(1 for count in authPostCount.values() if count > 5)
# Path of the previously saved gensim dictionary (hard-coded to the original
# author's network drive).
s = r"Z:\ermunds\results\1 prices paid\5-6-2013\PricesStemmed20passes_20topics.dict"

# load() is a classmethod, so call it on the class directly instead of first
# constructing an empty Dictionary that is immediately thrown away.
dict1 = gensim.corpora.dictionary.Dictionary.load(s)

# One (document frequency, token) pair per id in the dictionary, ordered from
# the highest frequency down; dict1.dfs maps id -> document frequency and
# dict1[ID] maps id -> token.
tupl = sorted(
    ((dict1.dfs[ID], dict1[ID]) for ID in dict1.keys()),
    reverse=True,
)

# Each pair is written using its tuple string form, one per line.
with open('wordCounts', 'w') as f:
    for t in tupl:
        f.write(str(t) + '\n')
# For every make returned by bf.getMakes(), look up how many documents mention
# its cleaned-up token in dict1 and collect (frequency, make) pairs.
lst = []
for b in bf.getMakes():
    # Apply the same gslib clean-up chain to the make before looking it up
    # (presumably the normalisation the dictionary tokens went through --
    # TODO confirm).
    token = gslib.wordCleanUp(gslib.textCleanUp(b))
    try:
        # Only the two lookups are guarded, so a KeyError raised anywhere
        # else is not misreported as a missing token.
        ID = dict1.token2id[token]
        fr = dict1.dfs[ID]
    except KeyError:
        # Token is not in the dictionary: report it and move on.
        # Single-argument print(...) parses on both Python 2 and Python 3.
        print('%s fail %s' % (b, token))
    else:
        print('%s %s %s' % (b, fr, token))
        lst.append((fr, b))

# Ascending by frequency, make name breaking ties.
lst.sort()

# Write one "make:frequency" line per make that was found.
fname = 'brand_mentions_count.txt'
with open(fname, 'w') as outfile:
    for t in lst:
        outfile.write(t[1] + ":" + str(t[0]) + '\n')