コード例 #1
0
def main():
  """Scrape per-stock tables for companies listed in target_inc.json and
  write the combined rows (first 9 columns + company name) to
  ../dataset/stock.csv.

  Relies on module-level helpers getStockURL/getHtml/parseHtml and the
  json/csv/copy imports at the top of the file.
  """
  with open('../dataset/target_inc.json') as f0:
    # Set/list of company short names we care about.
    targetInc = json.loads(f0.read())
  with open('../dataset/beijing_inc.csv') as f:
    result = []
    dataSource = csv.DictReader(f)
    fieldnames = []
    i = 0  # counts successfully scraped companies, not input rows
    for row in dataSource:
      if row['证券简称'] in targetInc:
        try:
          print(str(i) + ':' + row['证券简称'])
          url = getStockURL(row['证券代码'].split('.')[0])
          res = getHtml(url)
          csvData = parseHtml(res)
          # csvData[0] is the header row; keep only the first 9 columns
          # of each data row and tag it with the company name.
          sliceData = [sliceRow[:9] for sliceRow in csvData[1:]]
          for record in sliceData:
            record.append(row['证券简称'])
          if i == 0:
            # Capture the header once, from the first successful scrape.
            fieldnames = copy.deepcopy(csvData[0][:9])
            fieldnames.append('company_name')
          result[0:0] = sliceData  # prepend this company's rows
        except Exception as e:
          # Narrowed from a bare `except:` (which also swallowed
          # SystemExit/KeyboardInterrupt) and report the actual error.
          print('err: ' + repr(e))
          continue
        i += 1  # only incremented on success, so i==0 retries the header
      else:
        print(row['证券简称'] + ' not found')
    with open('../dataset/stock.csv', 'w+') as csvfile:
      # NOTE(review): plain join does no CSV quoting; assumes fields
      # contain no commas — csv.writer would be safer. TODO confirm.
      csvfile.write(','.join(fieldnames) + '\n')
      for row in result:
        csvfile.write(','.join(row) + '\n')
コード例 #2
0
def generateTimetable(format, date, programId, year, branchId):
    """Fetch a timetable and render it in the requested format.

    Returns the generated file name, or False when the timetable could
    not be fetched or when `format` is not one of 'txt'/'pdf'/'ical'.
    """
    timetable = getTimetable(date, programId, year, branchId)

    # Original fell through when getTimetable returned False, then used
    # unbound startDate/endDate/data -> NameError. Bail out early instead.
    if timetable == False:
        return False

    (startDate, endDate, data) = parse.parseHtml(timetable)

    if format == 'txt':
        fileName = generateTxt(startDate, endDate, data)
    elif format == 'pdf':
        fileName = generatePdf(startDate, endDate, data)
    elif format == 'ical':
        fileName = generateIcal(startDate, endDate, data)
    else:
        # Unknown format previously raised UnboundLocalError on fileName.
        return False

    return fileName
コード例 #3
0
def generateTimetable(format, date, programId, year, branchId):
    """Fetch a timetable and render it in the requested format.

    Returns the generated file name, or False when the timetable could
    not be fetched or when `format` is not one of 'txt'/'pdf'/'ical'.
    """
    timetable = getTimetable(date, programId, year, branchId)

    if timetable == False:
        return False
    else:
        (startDate, endDate, data) = parse.parseHtml(timetable)

    if format == 'txt':
        fileName = generateTxt(startDate, endDate, data)
    elif format == 'pdf':
        fileName = generatePdf(startDate, endDate, data)
    elif format == 'ical':
        fileName = generateIcal(startDate, endDate, data)
    else:
        # An unrecognized format previously left fileName unbound and
        # raised UnboundLocalError; follow the same False convention as
        # the failed-fetch path above.
        return False

    return fileName
コード例 #4
0
def main():
  """Search Baidu for security-incident news about each company in
  ../dataset/beijing_inc.csv and dump the filtered hits to
  ../dataset/bj_safe_baidu.json.

  Relies on module-level helpers getBaiduURL/getHtml/parseHtml/dataFilter
  and the csv/json/time imports at the top of the file.
  """
  print('test')
  with open('../dataset/beijing_inc.csv') as f:
    result = []
    dataSource = csv.DictReader(f)
    # Hard-coded row count for the progress display only; reading
    # f.readlines() here would exhaust the file before DictReader runs.
    totalRows = 304
    i = 0
    for row in dataSource:
      i += 1
      try:
        print(row['证券简称'], str(i * 100 / totalRows) + '%')
        url = getBaiduURL(row['证券简称'] + '网络攻击')
        res = getHtml(url)
        fullData = parseHtml(res)
        filterData = dataFilter(row['证券简称'], ['网络攻击', '病毒', 'DDos', '勒索'], fullData)
        result[0:0] = filterData  # prepend this company's hits
      except Exception as e:
        # Narrowed from a bare `except:` that silently dropped every
        # failure (including SystemExit/KeyboardInterrupt) with no trace.
        print('err: ' + repr(e))
        continue
      # Throttle between successful requests; failures retry immediately.
      time.sleep(1)

    with open('../dataset/bj_safe_baidu.json', 'w+') as w:
      w.write(json.dumps(result))
コード例 #5
0
ファイル: couchcrawler.py プロジェクト: ralphite/couchmap
        req = urllib2.Request(str(url))
        req.add_header("User-Agent", "couchmap 0.1")

        request = None

        try:
            request = urllib2.urlopen(req)
        except urllib2.URLError, e:
            continue
        except urllib2.HTTPError, e:
            status = e.code
        if status == 0:
            status = 200
        html = request.read()

        data = parseHtml(html)

        if data is None:
            continue

        try:
            data["address"] = url
            data["http_status"] = status
            data["member_name"] = url.split("/")[-1]

            cdb.enqueue(
                [
                    u
                    for u in data["urls"]
                    if (not cdb.isInQueue(u)) and (not cdb.hasCrawled(u)) and u != settings.START_URL
                ]