Beispiel #1
0
import requests as req  # assumed: 'req' is the requests library


def verity_http(self):
    # Check which scanned hosts/ports actually answer HTTP(S);
    # relies on self.port() and self.http from the surrounding class.
    self.port()
    http = []
    http_ok = [200, 302, 404, 500]
    f = open('/root/project/info/port/portscan.txt', 'r')
    f_http = open('/root/project/info/port/http.txt', 'w')
    f_no_http = open('/root/project/info/port/no_http.txt', 'w')
    f_domain = open('/root/project/info/domain_all.txt', 'r')
    f_ip = open('/root/project/info/domain_all.txt', 'r')  # note: same file as f_domain in the original
    for line in f:
        http.append(line.strip())
    for line in f_domain:
        http.append(line.strip())
    for line in f_ip:
        http.append(line.strip())
    for line in http:
        if '443' in line:
            url = 'https://' + line + '/'
        else:
            url = 'http://' + line + '/'
        try:
            res = req.get(url)
            if res.status_code in http_ok:
                print("{} http is ok".format(url))
                self.http.append(url)
                f_http.write(url + '\n')
        except Exception:
            print("sorry, this is not http")
            f_no_http.write(line + '\n')
Beispiel #2
0
def run(av_id):
    # Fetch the HTML of the video page
    AV_URL = r"https://www.bilibili.com/video/av{}"
    av_url = AV_URL.format(av_id)
    res = get(av_url)  # HTML
    # 1. Extract the basic info and cid
    (title, class1, class2, time, rank, uid) = getInfo(res)
    if title is None:
        print("Title NOT FOUND")
        return
    cid = getCid(res)
    # Fetch the HTML of the API page
    # 2. Get total views, cumulative danmaku count, replies, favorites, coins, shares,
    #    current rank, highest historical rank, likes, dislikes, number of versions, and copyright status
    A_URL = r"https://api.bilibili.com/archive_stat/stat?aid={}"
    a_url = A_URL.format(av_id)
    data = get(a_url, decode=False).json()["data"]
    # Fetch the comment XML
    COMMENT_URL = "https://comment.bilibili.com/{}.xml"
    comment_url = COMMENT_URL.format(cid)
    res_d = get(comment_url, decode=False).content  # the danmaku XML
    # 3. Extract the danmaku
    d = getDannmaku(res_d)
    # 4. Save
    name = "{}".format(av_id)
    s = {
        "title": title,
        "class1": class1,
        "class2": class2,
        "time": time,
        "rank": rank,
        "uid": uid,
        "data": data,
        "av_id": av_id,
        "cid": cid,
        "d": d
    }
    save(name, s)
    print("{} Save Successful! ".format(av_id))
Beispiel #3
0
# Imports needed below; 'get' is assumed to come from myrequest as in the other examples.
import os
import cx_Oracle as oci
from myrequest import get
from bs4 import BeautifulSoup


# Implementation steps
# 1. Open the listing page.
# 2. Find the link of each destination page.
# repeat for each link:
# 3. Open the page found in step 2.
# 4. Crawl its text or images.
# 5. Insert the data from step 4 into the database (a sketch follows this snippet).

domain = 'http://info.hanatour.com'
#          http://info.hanatour.com/dest/list/all/1?page=3
total = 'http://info.hanatour.com/dest/list/all/1'
totalHtml = get(total)
totalSoup = BeautifulSoup(totalHtml, 'html.parser')

print(len(totalSoup.select('.listArea a')))


# Connect to the DB


# Handle Hangul (Korean) encoding in cx_Oracle
os.environ["NLS_LANG"] = ".AL32UTF8"
 
START_VALUE = u"Unicode \u3042 3".encode('utf-8')
END_VALUE = u"Unicode \u3042 6".encode('utf-8')
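# Step 5 above (writing the crawled data to the database) is not implemented in this
# snippet. A minimal sketch of one way to do it with cx_Oracle is below; the connection
# string and the travel_info table/columns are placeholders, not part of the original code.
conn = oci.connect('scott/tiger@localhost:1521/xe')
cursor = conn.cursor()

title = 'example destination'
content = 'crawled description text'

# Bind by position so the driver handles quoting and encoding.
cursor.execute(
    "INSERT INTO travel_info (title, content) VALUES (:1, :2)",
    (title, content)
)
conn.commit()

cursor.close()
conn.close()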
Beispiel #4
0
# ex13.py
from myrequest import get
from bs4 import BeautifulSoup

html = get('http://naver.com')
soup = BeautifulSoup(html, 'html.parser')

result = soup.find_all('a')
print(type(result))

print(len(result))

for link in result:
    print(link.get_text(), '-', link.get('href'))
Beispiel #5
0
# ex15.py
# http://weather.naver.com

# import requests
from myrequest import get  # fetch the page source
from bs4 import BeautifulSoup  # parse the source

url = 'https://weather.naver.com/period/weeklyFcast.nhn'

html = get(url)

# print(html)

soup = BeautifulSoup(html, 'html.parser')

# Search method 1 - look up a single element by tag name (a selector-based variant is sketched at the end of this example)
h5 = soup.find('h5')
print(h5)

h6 = soup.find('h6')
print(h6)

strong = soup.find('strong')
print(strong)

dd = soup.find('dd')
print(dd)

td = soup.find('td')
print(td)
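# For comparison with the tag-based lookups above, the same document can also be
# searched with CSS selectors; a brief sketch (the selectors are illustrative only).
# select_one() returns the first matching element, select() returns a list of all matches.
print(soup.select_one('td'))
print(len(soup.select('strong')))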
Beispiel #6
0
# ex12.py

from myrequest import get
from bs4 import BeautifulSoup

html = get('http://www.example.com/')

# Analyzing and splitting up the source > parsing
soup = BeautifulSoup(html, 'html.parser')
print(type(soup))

result = soup.find("h1")
print(type(result))

print(result)
print(result.get_text())
Beispiel #7
0
# ex23.py

# Crawling static data
from myrequest import get
from bs4 import BeautifulSoup

# Right-click > View page source
html = get('http://211.63.89.31:8088/python/data.do')
soup = BeautifulSoup(html, 'html.parser')

staticdata = soup.select('.staticdata')  # four <li> elements
print(len(staticdata))

for sdata in staticdata:
    print(sdata.get_text())

print(soup.select('#name')[0].get_text())
print(soup.select('#age')[0].get_text())
print(soup.select('#address')[0].get_text())
print(soup.select('#gender')[0].get_text())

print('-----------------------')

# Crawling dynamic data
dynamicdata = soup.select('.dynamicdata')
print(len(dynamicdata))

for ddata in dynamicdata:
    print('data : ', ddata.get_text())
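# Note: the '.dynamicdata' rows above are only found if they are already present in the
# served HTML. If the page fills them in with JavaScript, the plain get() call returns
# zero elements; a common workaround is to render the page first. A minimal sketch using
# Selenium (Selenium and a matching ChromeDriver are assumptions, not part of this example):
from selenium import webdriver

driver = webdriver.Chrome()
driver.get('http://211.63.89.31:8088/python/data.do')

# Parse the DOM after the page's JavaScript has run, then select the dynamic rows.
rendered = BeautifulSoup(driver.page_source, 'html.parser')
for ddata in rendered.select('.dynamicdata'):
    print('data : ', ddata.get_text())

driver.quit()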