def main():
    """Scrape per-stock financial tables for selected Beijing-listed companies.

    Reads the set of target company names from ../dataset/target_inc.json,
    walks the rows of ../dataset/beijing_inc.csv, fetches and parses the
    stock page of each matching company, and writes the combined records
    (first 9 columns plus a trailing company-name column) to
    ../dataset/stock.csv.
    """
    with open('../dataset/target_inc.json') as f0:
        targetInc = json.loads(f0.read())
    with open('../dataset/beijing_inc.csv') as f:
        result = []
        dataSource = csv.DictReader(f)
        fieldnames = []
        i = 0
        for row in dataSource:
            if row['证券简称'] in targetInc:
                try:
                    print(str(i) + ':' + row['证券简称'])
                    # Stock code comes as e.g. "000001.SZ"; the URL wants only
                    # the numeric part before the dot.
                    url = getStockURL(row['证券代码'].split('.')[0])
                    res = getHtml(url)
                    csvData = parseHtml(res)
                    # Drop the header row and keep only the first 9 columns.
                    sliceData = [sliceRow[:9] for sliceRow in csvData[1:]]
                    for record in sliceData:
                        record.append(row['证券简称'])
                    if i == 0:
                        # Capture the header once, from the first company's table.
                        fieldnames = copy.deepcopy(csvData[0][:9])
                        fieldnames.append('company_name')
                    # Prepend this company's records to the accumulated result.
                    result[0:0] = sliceData
                except Exception as e:
                    # Bug fix: previously a bare `except:` that printed only
                    # 'err', swallowing every failure (even KeyboardInterrupt)
                    # with no indication of the cause.
                    print('err: ' + repr(e))
                    continue
                i += 1
            else:
                print(row['证券简称'] + ' not found')
    with open('../dataset/stock.csv', 'w+') as csvfile:
        csvfile.write(','.join(fieldnames) + '\n')
        for row in result:
            csvfile.write(','.join(row) + '\n')
def generateTimetable(format, date, programId, year, branchId):
    """Fetch a timetable and render it in the requested format.

    format is one of 'txt', 'pdf' or 'ical'. Returns the generated file
    name, or False when the timetable could not be retrieved.
    """
    timetable = getTimetable(date, programId, year, branchId)
    if timetable == False:
        # Bug fix: the original only guarded the parsing with
        # `if timetable != False:` and then fell through to
        # `return fileName` with fileName unbound, raising
        # UnboundLocalError. Return False explicitly instead
        # (matching the corrected sibling definition in this file).
        return False
    (startDate, endDate, data) = parse.parseHtml(timetable)
    if format == 'txt':
        fileName = generateTxt(startDate, endDate, data)
    elif format == 'pdf':
        fileName = generatePdf(startDate, endDate, data)
    elif format == 'ical':
        fileName = generateIcal(startDate, endDate, data)
    return fileName
def generateTimetable(format, date, programId, year, branchId):
    """Build a timetable file in the given format ('txt', 'pdf' or 'ical').

    Returns False when the raw timetable cannot be fetched; otherwise
    returns the name of the file produced by the matching generator.
    """
    raw = getTimetable(date, programId, year, branchId)
    # Guard clause: nothing to render if the fetch failed.
    if raw == False:
        return False
    startDate, endDate, data = parse.parseHtml(raw)
    if format == 'txt':
        fileName = generateTxt(startDate, endDate, data)
    elif format == 'pdf':
        fileName = generatePdf(startDate, endDate, data)
    elif format == 'ical':
        fileName = generateIcal(startDate, endDate, data)
    return fileName
def main():
    """Search Baidu for security-incident news about each Beijing company.

    Reads company names from ../dataset/beijing_inc.csv, queries Baidu for
    each name combined with '网络攻击', filters the hits for attack-related
    keywords, and dumps the collected records as JSON to
    ../dataset/bj_safe_baidu.json.
    """
    print('test')
    with open('../dataset/beijing_inc.csv') as f:
        result = []
        dataSource = csv.DictReader(f)
        # NOTE(review): row count is hard-coded (the commented-out
        # len(f.readlines()) would have exhausted the file handle the
        # DictReader is reading). Only used for the progress display,
        # so an inaccurate value is harmless — confirm 304 is current.
        totalRows = 304
        i = 0
        for row in dataSource:
            i += 1
            try:
                print(row['证券简称'], str(i * 100 / totalRows) + '%')
                url = getBaiduURL(row['证券简称'] + '网络攻击')
                res = getHtml(url)
                fullData = parseHtml(res)
                filterData = dataFilter(row['证券简称'], ['网络攻击', '病毒', 'DDos', '勒索'], fullData)
                # Prepend this company's filtered hits to the result.
                result[0:0] = filterData
            except Exception as e:
                # Bug fix: previously a bare `except: continue` that
                # silently swallowed every error, including
                # KeyboardInterrupt; report the failure and keep going.
                print('err: ' + repr(e))
                continue
            # Throttle requests to avoid hammering Baidu.
            time.sleep(1)
    with open('../dataset/bj_safe_baidu.json', 'w+') as w:
        w.write(json.dumps(result))
req = urllib2.Request(str(url)) req.add_header("User-Agent", "couchmap 0.1") request = None try: request = urllib2.urlopen(req) except urllib2.URLError, e: continue except urllib2.HTTPError, e: status = e.code if status == 0: status = 200 html = request.read() data = parseHtml(html) if data is None: continue try: data["address"] = url data["http_status"] = status data["member_name"] = url.split("/")[-1] cdb.enqueue( [ u for u in data["urls"] if (not cdb.isInQueue(u)) and (not cdb.hasCrawled(u)) and u != settings.START_URL ]