/
getDatafromSite.py
160 lines (145 loc) · 4.92 KB
/
getDatafromSite.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#encoding=utf-8
import urllib2
import urllib
import cookielib
import json
import uniout
from openpyxl import load_workbook
from openpyxl import Workbook
from openpyxl.compat import range
from openpyxl.cell import get_column_letter
from BeautifulSoup import BeautifulSoup
import os
idx = 0
txt = ''
order ='desc'
oderby = 'd.id'
total = 0
totalPage = 0
auditStatus = ''
status = ''
baseUrl = "http://adexchange.cn.miaozhen.com"
getDeals = baseUrl+"/deal/list" #
viewOne = baseUrl+'/deal/view?id='
baseNum = 500
maxPages = 5
class JSONHelper(object):
"""docstring for JSONHelper"""
def __init__(self,jc):
super(JSONHelper, self).__init__()
self.jsonContent = json.loads(jc[jc.find('{') : ])
def getAllUNAudiIds(self):
jdata = self.jsonContent
total = jdata['total']
mapper = jdata["tableData"]
idList = list()
for o in mapper:
idList.append(str(o['Id'])+'_'+str(o['AuditStatus'] ))
return idList
def auditStatus(self):
jdata = self.jsonContent
total = jdata['AuditStatus']
return total
class xlsxHelper(object):
"""docstring for outputRes"""
def __init__(self, allDetails,filename):
super(xlsxHelper, self).__init__()
self.allDetails = allDetails
self.filename = filename
def exportAsXls(self):
firstC = ['DealID','Deal名称*','CPM价格(分/千次曝光)*','对应上游媒体','上游DealID','总预算(元)','每日预算(元)','投放日期*','优先级','投放地区','投放设备类型','投放操作系统','DSP*','广告位*','广告主*','流量分布','审核状态','链接']
tarwb = Workbook()
dest_filename = self.filename
ws1 = tarwb.active
ws1.title = "all"
#生成列名
fcn = 1
for fc in firstC:
ws1.cell(column=fcn, row=1).value = str(fc)
fcn = fcn + 1
#填写数据
r = 2
for row in self.allDetails:
c = 1
for cell in row:
ws1.cell(column=c, row=r).value = cell
c = c +1
r = r +1
tarwb.save(filename = dest_filename)
class Details(object):
"""docstring for getAllDetails"""
def __init__(self, html):
super(Details, self).__init__()
self.html = html
def getOneRow(self):
soup = BeautifulSoup(self.html )
tables = soup.findChildren('table')
my_table = tables[0]
rows = my_table.findChildren(['th', 'tr'])
res = list()
for row in rows:
cells = row.findChildren('td')
listM = cells[::-2]
for cell in listM:
temp = cell.string
if temp:
res.append(temp.replace(' ','').replace('\n','').strip().encode('utf-8'))
else:
res.append('empty')
# print 'empty'
return res
def initCredential():
#获得一个cookieJar实例,
cj = cookielib.CookieJar()
#定义一个opener 作为后面带着cookie访问的对象
opener=urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
#用户名和,密码,明文,注意安全哦
user = ''
password = ''
login_page = "http://adexchange.cn.miaozhen.com/j_spring_security_check"
try:
formdata = {'j_username':user, 'j_password':password }
#form data
ldata = urllib.urlencode(formdata)
opener.addheaders = [('User-agent','Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)')]
opener.open("http://adexchange.cn.miaozhen.com/j_spring_security_check",ldata)
return opener
except Exception,e:
print str(e)
def getJC(opener,url,pageNum,lines):
dealList = {'current':pageNum,'lines':lines,'idx':0,'order':order,'orderBy':oderby,'txt':txt,'total':0,'totalPage':0,'auditStatus':'','status':''}
dldata = urllib.urlencode(dealList);
request = urllib2.Request(url,dldata)
response = opener.open(request)
content = response.read()
return content
def getDetails(opener,viewOne,dId):
url = viewOne+str(dId)
request = urllib2.Request(url)
response = opener.open(request)
content = response.read()
return content
opener = initCredential()
for i in range(1,maxPages):
filename = 'dealsFile_'+str(i)+".xlsx"
allDetails = list ()
dealsContent = getJC(opener,getDeals,i,baseNum)
jh = JSONHelper(dealsContent)
temp = jh.getAllUNAudiIds()
if temp :
for Id in temp:
auditStatus = Id[-1]
html = getDetails(opener,viewOne,Id[0:-2])
link = viewOne + Id[0:-2]
hd = Details(html)
tempList = hd.getOneRow()
tempList.append(auditStatus)
tempList.append(link)
allDetails.append(tempList)
else:
#程序结束
break
gXls = xlsxHelper(allDetails,filename)
gXls.exportAsXls()
# print temp
# print jh.getTotal()