Code Example #1
File: sj25.py Project: iml1111/SOOJLE_Crawler
def Parsing_post_data(bs, post_url, URL):
	return_data = []
	post_data = {}
	domain = Domain_check(URL['url'])

	title = bs.find("div", {"class": "body contest-detail"}).find("span", {"class": "title"}).get_text(" ", strip = True)
	author = bs.find("div", {"class": "contest-overview"}).find("tbody").find("tr").text.strip()
	if author.find("관리자") != -1:
		author = "0"
	date = bs.find("th", text="접수기간").parent.find("td").text.strip()
	date = date[13:] + " 00:00:00"
	date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
	post = bs.find("div", {"class": "info-cont"}).get_text(" ", strip = True)
	post = post_wash(post)		# strip all whitespace from post
	tag_done = tag.tagging(URL, title)
	if bs.find("img", {"id": "poster"}) is None:
		img = 7
	else:
		try:
			img = bs.find("img", {"id": "poster"})['src']		#게시글의 첫번째 이미지를 가져옴.
			if 1000 <= len(img):
				img = 7
			else:
				if img.startswith("http://") or img.startswith("https://"):		# decide whether img is an internal or an external link.
					pass
				elif img.startswith("//"):
					img = "http:" + img
				else:
					img = domain + img
		except:
			img = 7
	if img != 7:
		if img_size(img):
			pass
		else:
			img = 7

	# post_data takes the form {'title': , 'author': , 'date': , 'post': , 'tag': [], 'img': 1, 'view': 0}
	post_data['title'] = title.upper()
	post_data['author'] = author.upper()
	post_data['date'] = date
	post_data['post'] = post.lower()
	post_data['tag'] = tag_done 	# holds a tag string of the form tag1/tag2/tag3/tag4/.../
	post_data['img'] = img
	post_data['url'] = post_url

	return_data.append(post_data)
	return_data.append(title)
	return_data.append(date)
	return return_data
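
Every parser variant on this page repeats the same image-src normalization branch. As a minimal sketch, that logic could live in one shared helper; the name normalize_img_src and its fallback parameter are hypothetical, not part of either project:

def normalize_img_src(src, domain, fallback=1):
    # Return an absolute image URL, or the sentinel the crawlers store when
    # no usable image exists (7 in the example above, 1 in most of the others).
    if src is None or len(src) >= 1000:  # missing or suspiciously long src
        return fallback
    if src.startswith("http://") or src.startswith("https://"):
        return src  # already an absolute (external) link
    if src.startswith("//"):
        return "http:" + src  # protocol-relative link
    return domain + src  # site-relative link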
Code Example #2
File: sj7.py Project: donut0310/SIGNUS_Crawler
def Parsing_post_data(bs, post_url, URL):
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])  # site domain, used below when img is a relative link

    title = bs.find("div", {
        "class": "prop article bt1"
    }).find("div", {
        "class": "subject"
    }).get_text(" ", strip=True)
    date = bs.find("span", {"class": "date"}).text
    date = date + " 00:00:00"
    try:
        date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
    except:
        date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    post = bs.find("div", {"class": "phrase"}).get_text(" ", strip=True)
    post = post_wash(post)  # strip all whitespace from post
    try:
        img = bs.find("div", {
            "class": "phrase"
        }).find("img")['src']  #게시글의 첫번째 이미지를 가져옴.
        if 1000 <= len(img):
            img = 1
        else:
            if img.startswith("http://") or img.startswith(
                    "https://"):  # decide whether img is an internal or an external link.
                pass
            elif img.startswith("//"):
                img = "http:" + img
            else:
                img = domain + img
        if img != 1:
            if img_size(img):
                pass
            else:
                img = 1
    except:
        img = 1

    post_data['title'] = title.upper()
    post_data['author'] = "0"
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['img'] = img
    post_data['url'] = post_url

    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data
Code Example #3
def Parsing_post_data(bs, post_url, URL):
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])

    title = bs.find("strong", {"class": "tit"}).get_text(" ", strip=True)
    author = "0"
    date = bs.find("span", {"class": "each"}).text.strip()[6:]
    date = date + " 00:00:00"
    date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S"))
    post = bs.find("div", {
        "class": "board_view_con"
    }).get_text(" ", strip=True)
    post = post_wash(post)
    tag_done = tag.tagging(URL, title)
    if bs.find("div", {"class": "board_view_con"}).find("img") is None:
        img = 1
    else:
        img = bs.find("div", {
            "class": "board_view_con"
        }).find("img")['src']  #게시글의 첫번째 이미지를 가져옴.
        if 1000 <= len(img):
            img = 1
        else:
            if img.startswith("http://") or img.startswith(
                    "https://"):  # decide whether img is an internal or an external link.
                pass
            elif img.startswith("//"):
                img = "http:" + img
            else:
                img = domain + img
    if img != 1:
        if img_size(img):
            pass
        else:
            img = 1

    # post_data takes the form {'title': , 'author': , 'date': , 'post': , 'tag': [], 'fav_cnt': 0, 'view': 0}
    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['tag'] = tag_done  # holds a tag string of the form tag1/tag2/tag3/tag4/.../
    post_data['img'] = img
    post_data['url'] = post_url

    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data
Code Example #4
def Parsing_post_data(bs, post_url, URL):
	return_data = []
	post_data = {}
	domain = Domain_check(URL['url'])

	date = post_url[-8:]
	url = post_url.replace(date, "")
	driver = URLparser_UTF8(url)
	bs = BeautifulSoup(driver, 'html.parser')

	title = bs.find("div", {"id": "contents"}).find("div", {"class": "vi_subj"}).get_text(" ", strip = True)
	author = "0"
	date = "20" + date + " 00:00:00"
	date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
	post = bs.find("div", {"class": "vi_cont"}).get_text(" ", strip = True)
	post = post_wash(post)		# strip all whitespace from post
	if bs.find("div", {"class": "vi_cont"}).find("img") is None:
		img = 1
	else:
		img = bs.find("div", {"class": "vi_cont"}).find("img")['src']		#게시글의 첫번째 이미지를 가져옴.
		if 1000 <= len(img):
			img = 1
		else:
			if img.startswith("http://") or img.startswith("https://"):		# decide whether img is an internal or an external link.
				pass
			elif img.startswith("//"):
				img = "http:" + img
			else:
				img = domain + img
	if img != 1:
		if img_size(img):
			pass
		else:
			img = 1
	tag_done = tag.tagging(URL, title)

	# post_data takes the form {'title': , 'author': , 'date': , 'post': , 'tag': [], 'img': 1, 'view': 0}
	post_data['title'] = title.upper()
	post_data['author'] = author
	post_data['date'] = date
	post_data['post'] = post.lower()
	post_data['tag'] = tag_done 	# holds a tag string of the form tag1/tag2/tag3/tag4/.../
	post_data['img'] = img
	post_data['url'] = post_url

	return_data.append(post_data)
	return_data.append(title)
	return_data.append(date)
	return return_data
Code Example #5
def Parsing_post_data(bs, post_url, URL):
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])

    title = bs.find("thead").find("td", {
        "class": "subject-value"
    }).get_text(" ", strip=True)
    author = bs.find("thead").find("td", {"class": "writer"}).text.strip()
    if author.find("관리자") != -1:
        author = "0"
    date = bs.find("thead").find("td", {"class": "date"}).text.strip()
    date = date + " 00:00:00"
    date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S"))
    post = bs.find("tbody").find("td", {
        "class": "content"
    }).get_text(" ", strip=True)
    post = post_wash(post)  # strip all whitespace from post
    if bs.find("tbody").find("img") is None:
        img = 1
    else:
        img = bs.find("tbody").find("img")['src']  #게시글의 첫번째 이미지를 가져옴.
        if 1000 <= len(img):
            img = 1
        else:
            if img.startswith("http://") or img.startswith(
                    "https://"):  # decide whether img is an internal or an external link.
                pass
            elif img.startswith("//"):
                img = "http:" + img
            else:
                img = domain + img
    if img != 1:
        if img_size(img):
            pass
        else:
            img = 1

    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['img'] = img
    post_data['url'] = post_url

    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data
Code Example #6
def Parsing_post_data(post_url, URL):
	post_data_prepare = []
	end_date = date_cut_dict['sj4']		# pull the end_date cutoff


	# log in to udream
	s = udream.login()
	
	page = s.get(post_url).text
	bs = BeautifulSoup(page, "html.parser")

	posts = bs.find("tbody").findAll("tr")	# the tr rows
	for post in posts:
		# format: [title, author, post1, post2, date]
		post_infoes = post.findAll("td")	# the td cells

		post_data = {}
		title = post_infoes[0].get_text(" ", strip = True)
		author = post_infoes[0].find("div").text
		if author.find("관리자") != -1:
			author = "0"
		date = post_infoes[4].text + " 00:00:00"
		date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
		post = post_infoes[1].get_text(" ", strip = True) + post_infoes[2].get_text(" ", strip = True) + post_infoes[3].get_text(" ", strip = True) + "~" + post_infoes[4].get_text(" ", strip = True)
		post = post_wash(post)
		tag_done = tag.tagging(URL, title)
		post = post[:200]
		img = 1
		url = post_infoes[5].find("a")["href"]

		post_data['title'] = title.upper()
		post_data['author'] = author.upper()
		db_date = post_infoes[3].text + " 00:00:00"
		post_data['date'] = str(datetime.datetime.strptime(db_date, "%Y-%m-%d %H:%M:%S"))
		post_data['post'] = post.upper()
		post_data['tag'] = tag_done		# holds a tag string of the form tag1/tag2/tag3/tag4/.../
		post_data['img'] = img
		post_data['url'] = url

		print(date, "::::", title)

		# skip posts older than end_date; append anything newer
		if str(date) <= end_date:
			continue
		else:
			post_data_prepare.append(post_data)
	s.close()
			
	return post_data_prepare
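
The end_date cutoff above works on plain strings: both sides are zero-padded "%Y-%m-%d %H:%M:%S" timestamps, so lexicographic comparison matches chronological order. A minimal illustration with made-up values:

end_date = "2020-03-01 00:00:00"
old_post = "2020-02-14 00:00:00"   # older than the cutoff -> skipped
new_post = "2020-03-02 09:30:00"   # newer than the cutoff -> appended
assert old_post <= end_date
assert not (new_post <= end_date)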
Code Example #7
File: sj10.py Project: donut0310/SIGNUS_Crawler
def Parsing_post_data(bs, post_url, URL):
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])

    tables = bs.find("div", {"align": "center"}).findAll("table")
    title_table = tables[3]

    tds = title_table.findAll("td")

    title = tds[1].get_text(" ", strip=True)
    author = "0"
    date = tds[0].text.strip()
    date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
    post = bs.find("td", {"class": "sf_contents"}).get_text(" ", strip=True)
    post = post_wash(post)  # strip all whitespace from post
    if bs.find("td", {"class": "sf_contents"}).find("img") is None:
        img = 1
    else:
        img = bs.find("td", {
            "class": "sf_contents"
        }).find("img")['src']  #게시글의 첫번째 이미지를 가져옴.
        if 1000 <= len(img):
            img = 1
        else:
            if img.startswith("http://") or img.startswith(
                    "https://"):  # decide whether img is an internal or an external link.
                pass
            elif img.startswith("//"):
                img = "http:" + img
            else:
                img = domain + img
    if img != 1:
        if img_size(img):
            pass
        else:
            img = 1

    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['img'] = img
    post_data['url'] = post_url

    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data
Code Example #8
def Parsing_post_data(bs, post_url, URL):
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])

    title = bs.find("div", {
        "id": "ModuleBoardView"
    }).find("div", {
        "class": "title"
    }).find("h5").get_text(" ", strip=True)
    author = bs.find("span", {"rel": "author"}).text.strip()
    date = bs.find("li", {"class": "date"}).find("time").text.strip()
    date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
    post = bs.find("div", {"class": "content"}).get_text(" ", strip=True)
    post = post_wash(post)  # strip all whitespace from post
    if bs.find("article", {"data-role": "post"}).find("img")['src'] is None:
        img = 1
    else:
        try:
            img = bs.find("article", {"data-role": "post"}).find("img")['src']
            if 1000 <= len(img):
                img = 1
            else:
                if img.startswith("http://") or img.startswith(
                        "https://"):  # decide whether img is an internal or an external link.
                    pass
                elif img.startswith("//"):
                    img = "http:" + img
                else:
                    img = domain + img
        except:
            img = 1
    if img != 1:
        if img_size(img):
            pass
        else:
            img = 1

    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['img'] = img
    post_data['url'] = post_url

    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data
Code Example #9
def Parsing_post_data(bs, post_url, URL):
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])

    title = bs.find("td", {"class": "subject-value"}).get_text(" ", strip=True)
    author = bs.find("td", {"class": "writer"}).text.strip()
    if author.find("관리자") != -1:
        author = "0"
    date = bs.find("td", {"class": "date"}).text
    date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S"))
    post = bs.find("tbody").find("div").get_text(" ", strip=True)
    post = post_wash(post)  # strip all whitespace from post
    tag_done = tag.tagging(URL, title)
    if bs.find("tbody").find("tr").find("img"):
        img = bs.find("tbody").find("tr").find("img")["src"]
        if 1000 <= len(img):
            img = 1
        else:
            if img.startswith("http://") or img.startswith(
                    "https://") or img.startswith(
                        "data:"):  # decide whether img is an internal or an external link.
                pass
            elif img.startswith("//"):
                img = "http:" + img
            else:
                img = domain + img
    else:
        img = 1
    if img != 1:
        if img_size(img):
            pass
        else:
            img = 1

    # post_data takes the form {'title': , 'author': , 'date': , 'post': , 'tag': [], 'img': 1, 'view': 0}
    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['tag'] = tag_done  # holds a tag string of the form tag1/tag2/tag3/tag4/.../
    post_data['img'] = img
    post_data['url'] = post_url

    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data
Code Example #10
File: PK_today.py Project: iml1111/PKU_Crawler
def content_parse(domain, url):
	html = URLparser(url)
	bs0bj = BeautifulSoup(html.read(), "html.parser")
	db_record = {}
	db_record.update({"url":url})

	obj = bs0bj.find("table",{"class":"bbs-view-info"})
	obj2 = obj.find("tr").find("td")
	db_record.update({"title":obj2.get_text().strip()})
	obj2 = obj.find("tr").findNext("tr").find("td")
	db_record.update({"date":obj2.get_text().strip()})

	obj = bs0bj.find("table",{"class":"bbs-view"})
	db_record.update({"post":post_wash(str(obj.get_text().strip()))})

	return db_record
Code Example #11
def content_parse(url):
    html = URLparser(url)
    bs0bj = BeautifulSoup(html.read(), "html.parser")
    db_record = {}
    db_record.update({"url": url})

    obj = bs0bj.find("div", {"class": "read_header"}).h1
    db_record.update({"title": obj.get_text().strip()})

    obj = bs0bj.find("p", {"class": "time"}).get_text().strip()
    obj = obj.replace(".", "-")
    db_record.update({"date": obj})

    obj = bs0bj.find("div", {"class": "read_body"}).get_text().strip()
    db_record.update({"post": post_wash(obj)})
    return db_record
Code Example #12
File: sj2.py Project: donut0310/SIGNUS_Crawler
def Parsing_post_data(bs, post_url, URL):
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])

    title = bs.find("div", {
        "class": "col-lg-9 title"
    }).find("span").get_text(" ", strip=True)
    author = bs.find("span", {"name": "WRITENAME"}).text.strip()
    if author.find("관리자") != -1:
        author = "0"
    date = bs.find("span", {"name": "wdate"}).text
    date = date + " 00:00:00"
    date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
    post = bs.find("div", {"class": "form-group"}).get_text(" ", strip=True)
    post = post_wash(post)  # strip all whitespace from post
    # store the image URL if there is one, otherwise 1.
    if bs.find("img", {"align": "absmiddle"}) is None:
        img = 1
    else:
        img = domain + bs.find("img", {"align": "absmiddle"})['src']
        if 1000 <= len(img):
            img = 1
        else:
            if img.startswith("http://") or img.startswith(
                    "https://"):  # decide whether img is an internal or an external link.
                pass
            elif img.startswith("//"):
                img = "http:" + img
            else:
                img = domain + img
    if img != 1:
        if img_size(img):
            pass
        else:
            img = 1
    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['img'] = img
    post_data['url'] = post_url

    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data
Code Example #13
def content_parse(url):
    html = URLparser(url)
    bs0bj = BeautifulSoup(html.read(), "html.parser")
    db_record = {}
    db_record.update({"url": url})

    obj = bs0bj.find("td", {"class": "list_loop_left"})
    db_record.update({"title": obj.get_text().strip()})

    obj = obj.findNext("td", {"class": "list_loop_left"}).get_text().strip()
    obj = obj.replace(".", "-").split("(")[1].split(" ")[0]
    db_record.update({"date": obj})

    obj = bs0bj.find("td", {"class": "view_content"}).get_text().strip()
    db_record.update({"post": post_wash(obj)})

    return db_record
Code Example #14
File: sig36.py Project: donut0310/SIGNUS_Crawler
def Parsing_post_data(bs, post_url, URL):
	time.sleep(2)	# throttle to avoid overloading the server
	return_data = []
	post_data = {}
	domain = Domain_check(URL['url'])

	title = bs.find("div", {"class": "infoBx"}).find("h3").get_text(" ", strip = True)
	author = bs.find("p", {"class": "infoTx"}).find("span", {"class": "cate"}).text.strip()
	if author.find("관리자") != -1:
		author = "0"
	date = bs.find("p", {"class": "infoTx"}).find("span", {"class": "date"}).text.strip()
	date = date + ":00"
	date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
	post = bs.find("div", {"id": "view_text"}).get_text(" ", strip = True)
	post = post_wash(post)
	if bs.find("div", {"id": "view_text"}).find("img") is None:
		img = 1
	else:
		img = bs.find("div", {"id": "view_text"}).find("img")['src']		#게시글의 첫번째 이미지를 가져옴.
		if 1000 <= len(img):
			img = 1
		else:
			if img.startswith("http://") or img.startswith("https://"):		# decide whether img is an internal or an external link.
				pass
			elif img.startswith("//"):
				img = "http:" + img
			else:
				img = domain + img
	if img != 1:
		if img_size(img):
			pass
		else:
			img = 1

	post_data['title'] = title.upper()
	post_data['author'] = author.upper()
	post_data['date'] = date
	post_data['post'] = post.lower()
	post_data['img'] = img
	post_data['url'] = post_url

	return_data.append(post_data)
	return_data.append(title)
	return_data.append(date)
	return return_data
Code Example #15
File: sj41.py Project: donut0310/SIGNUS_Crawler
def Parsing_post_data(bs, post_url, URL):
	try:
		return_data = []
		post_data = {}
		domain = Domain_check(URL['url'])

		title = bs.find("span", {"class": "col_blue"}).get_text(" ", strip = True)
		author = "0"
		date = bs.find("dl", {"class": "explainInfoBx"}).find("dd").text.strip()
		date = date + " 00:00:00"
		date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S"))
		post = bs.find("p", {"class": "tx"}).get_text(" ", strip = True)
		post = post_wash(post)
		if bs.find("div", {"class": "img"}).find("img") is None:
			img = 1
		else:
			img = bs.find("div", {"class": "img"}).find("img")['src']		#게시글의 첫번째 이미지를 가져옴.
			if 1000 <= len(img):
				img = 1
			else:
				if img.startswith("http://") or img.startswith("https://"):		# decide whether img is an internal or an external link.
					pass
				elif img.startswith("//"):
					img = "http:" + img
				else:
					img = domain + img
		if img != 1:
			if img_size(img):
				pass
			else:
				img = 1

		post_data['title'] = title.upper()
		post_data['author'] = author.upper()
		post_data['date'] = date
		post_data['post'] = post.lower()
		post_data['img'] = img
		post_data['url'] = post_url

		return_data.append(post_data)
		return_data.append(title)
		return_data.append(date)
		return return_data
	except:
		return None
Code Example #16
def content_parse(domain, url):
    html = URLparser(url)
    bs0bj = BeautifulSoup(html.read(), "html.parser")
    db_record = {}
    db_record.update({"url": url})

    obj = bs0bj.find(text="제목")
    db_record.update({"title": obj.findNext('td').get_text().strip()})
    obj = bs0bj.find(text="작성일")
    db_record.update({"date": obj.findNext('td').get_text().strip()})

    try:
        obj = bs0bj.find("div", {'class': "bbs-body"})
        db_record.update({"post": post_wash(str(obj.get_text().strip()))})
    except:
        db_record.update({"post": 1})

    return db_record
Code Example #17
File: PK_aquacul.py Project: iml1111/PKU_Crawler
def content_parse(url):
	html = URLparser(url)
	bs0bj = BeautifulSoup(html.read(), "html.parser")
	db_record = {}
	db_record.update({"url":url})

	obj = bs0bj.find("td",{"class":"boardSub"})
	db_record.update({"title":obj.get_text().strip()})

	obj = obj.findNext("td").findNext("td").get_text().strip()
	obj = obj.replace(".","-")
	db_record.update({"date":obj})

	obj = bs0bj.find("td",{"class":"contens"}).get_text().strip()
	db_record.update({"post":post_wash(obj)})

	return db_record
	
Code Example #18
def Parsing_post_data(post_url, URL):
	post_data_prepare = []
	end_date = date_cut_dict['sj5']		# pull the end_date cutoff

	# log in to udream
	s = udream.login()
	
	page = s.get(post_url).text
	bs = BeautifulSoup(page, "html.parser")

	posts = bs.find("tbody").findAll("tr")	# the tr rows
	for post in posts:
		post_infoes = post.findAll("td")	# the td cells

		post_data = {}
		title = post_infoes[0].get_text(" ", strip = True)
		author = post.find("div").text.strip()
		if author.find("관리자") != -1:
			author = "0"
		date = post_infoes[3].text + " 00:00:00"
		date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
		phrase = post_infoes[1].text + post_infoes[2].get_text(" ", strip = True)
		phrase = post_wash(phrase)
		img = 1
		url_num = str(post_infoes[4].find("a")).split('"')[3]
		url = URL['post_url'] + url_num

		post_data['title'] = title.upper()
		post_data['author'] = author.upper()
		post_data['date'] = date
		post_data['post'] = phrase.lower()
		post_data['img'] = img
		post_data['url'] = url

		print(date, "::::", title)

		# skip posts older than end_date; append anything newer
		if str(date) <= end_date:
			continue
		else:
			post_data_prepare.append(post_data)
	s.close()
			
	return post_data_prepare
Code Example #19
def Parsing_post_data(bs, post_url, URL):
	return_data = []
	post_data = {}
	domain = URL['url'].split('/')[0] + '//' + URL['url'].split('/')[2]

	title = bs.find("span", {"class": "on"}).get_text(" ", strip = True)
	author = bs.find("table", {"class": "basic-table input-table"}).findAll("tr")[1].find("td").text.strip()
	if author.find("관리자") != -1:
		author = "0"
	date = bs.find("table", {"class": "basic-table input-table"}).findAll("tr")[3].find("td").text.strip()[:23].split('~')[1].strip()
	date = date + " 00:00:00"
	date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S"))
	post = bs.find("ul", {"class": "summary-info"}).get_text(" ", strip = True)
	post = post_wash(post)
	if bs.find("div", {"class": "poster"}).find("img") is None:
		img = 1
	else:
		img = bs.find("div", {"class": "poster"}).find("img")['src']		#게시글의 첫번째 이미지를 가져옴.
		if 1000 <= len(img):
			img = 1
		else:
			if img.startswith("http://") or img.startswith("https://"):		# decide whether img is an internal or an external link.
				pass
			elif img.startswith("//"):
				img = "http:" + img
			else:
				img = domain + img
	if img != 1:
		if img_size(img):
			pass
		else:
			img = 1

	post_data['title'] = title.upper()
	post_data['author'] = author.upper()
	post_data['date'] = date
	post_data['post'] = post.lower()
	post_data['img'] = img
	post_data['url'] = post_url

	return_data.append(post_data)
	return_data.append(title)
	return_data.append(date)
	return return_data
Code Example #20
File: PK_physics.py Project: iml1111/PKU_Crawler
def content_parse(url):
    html = URLparser(url)
    bs0bj = BeautifulSoup(html.read(), "html.parser")
    db_record = {}
    db_record.update({"url": url})

    obj = bs0bj.find("td", {"class": "title"})
    db_record.update({"title": obj.get_text().strip()})

    obj = obj.findNext("td").findNext("td")
    db_record.update({"date": obj.get_text().strip()})

    try:
        obj = bs0bj.find("td", {"class": "tdc"}).get_text().strip()
        db_record.update({"post": post_wash(obj)})
    except:
        db_record.update({"post": 1})

    return db_record
Code Example #21
File: PK_sh.py Project: iml1111/PKU_Crawler
def content_parse(url):
	html = URLparser(url)
	bs0bj = BeautifulSoup(html.read(), "html.parser").find("article",{"id":"bo_v"})
	db_record = {}
	db_record.update({"url":url})

	obj = bs0bj.find("h1",{"id":"bo_v_title"}).get_text().strip()
	db_record.update({"title":obj})

	obj = bs0bj.find("section",{"id":"bo_v_info"}).find("strong").find_next("strong")
	obj = "20" + obj.get_text().strip()
	db_record.update({"date":obj})

	try:
		obj = bs0bj.find("div",{"id":"bo_v_con"}).get_text().strip()
		db_record.update({"post":post_wash(obj)})
	except:
		db_record.update({"post":1})

	return db_record
Code Example #22
def content_parse(url):
	db_record = {}
	html = URLparser(url)
	bs0bj = BeautifulSoup(html.read(), "html.parser")
	bs0bj = bs0bj.find("div",{"id":"board_view"})
	db_record.update({"url":url})

	obj = bs0bj.find("h3").get_text().strip()
	db_record.update({"title":obj})

	obj = bs0bj.find("p",{"class":"writer"}).find("strong").get_text().strip()
	db_record.update({"date":obj})

	try:
		obj = bs0bj.find("div",{"class":"board_stance"}).get_text().strip()
		db_record.update({"post":post_wash(obj)})
	except:
		db_record.update({"post":1})

	return db_record
Code Example #23
def content_parse(domain, url):
    html = URLparser(url)
    bs0bj = BeautifulSoup(html.read(), "html.parser")
    db_record = {}
    db_record.update({"url": url})

    obj = bs0bj.find("tr", {
        "class": "head"
    }).find("td", {"class": "first txt-l"})
    db_record.update({"title": obj.get_text().strip()})
    obj = obj.find_next("td").find_next("td")
    db_record.update({"date": obj.get_text().strip()})

    try:
        obj = bs0bj.find("tr", {"class": "head"}).find_next("tr")
        db_record.update({"post": post_wash(str(obj.get_text().strip()))})
    except:
        db_record.update({"post": 1})

    return db_record
Code Example #24
File: PK_coop.py Project: iml1111/PKU_Crawler
def content_parse(url):
    html = URLparser(url)
    bs0bj = BeautifulSoup(html.read(), "html.parser")
    db_record = {}
    db_record.update({"url": url})

    obj = bs0bj.find("h3", {"class": "title"}).get_text().strip()
    db_record.update({"title": obj})

    obj = bs0bj.find("span", {"class": "date"}).get_text().strip()
    obj = obj.split('.')[0] + "-" + obj.split('.')[1] + "-" + obj.split('.')[2]
    db_record.update({"date": obj})

    try:
        obj = bs0bj.find("div", {"class": "boardReadBody"}).get_text().strip()
        db_record.update({"post": post_wash(obj)})
    except:
        db_record.update({"post": 1})

    return db_record
Code Example #25
def Parsing_post_data(bs, post_url, URL):
    return_data = []
    post_data = {}
    domain = Domain_check(URL['url'])

    title = bs.find("div", {
        "class": "tbl_container"
    }).find("th").get_text(" ", strip=True)
    author = bs.find("div", {
        "class": "tbl_container"
    }).findAll("tr")[1].findAll("td")[1].text.strip()
    if author.find("관리자") != -1:
        author = "0"
    date = bs.find("div", {
        "class": "tbl_container"
    }).findAll("tr")[1].findAll("td")[3].text.strip()
    date = date.replace(" 오전", "")
    date = date.replace(" 오후", "")
    if len(date.split(":")) == 2:
        date = date + ":00"
    date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
    post = bs.find("div", {
        "class": "tbl_container"
    }).findAll("tr")[2].get_text(" ", strip=True)
    post = post_wash(post)
    tag_done = tag.tagging(URL, title)
    img = 1

    # post_data takes the form {'title': , 'author': , 'date': , 'post': , 'tag': [], 'fav_cnt': 0, 'view': 0}
    post_data['title'] = title.upper()
    post_data['author'] = author.upper()
    post_data['date'] = date
    post_data['post'] = post.lower()
    post_data['tag'] = tag_done  # holds a tag string of the form tag1/tag2/tag3/tag4/.../
    post_data['img'] = img
    post_data['url'] = post_url

    return_data.append(post_data)
    return_data.append(title)
    return_data.append(date)
    return return_data
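
This example strips the Korean AM/PM markers (오전/오후) without shifting the hour, which would mis-parse afternoon posts if the board emits 12-hour times. A stricter conversion, sketched under the assumption (not confirmed by the source) that the raw value looks like "2020-03-01 오후 3:15":

import datetime

def parse_korean_ampm(raw):
    # Hypothetical helper: "2020-03-01 오후 3:15" -> "2020-03-01 15:15:00"
    is_pm = "오후" in raw
    raw = raw.replace(" 오전", "").replace(" 오후", "")
    if len(raw.split(":")) == 2:
        raw = raw + ":00"
    parsed = datetime.datetime.strptime(raw, "%Y-%m-%d %H:%M:%S")
    if is_pm and parsed.hour < 12:
        parsed += datetime.timedelta(hours=12)  # move PM hours into 24-hour time
    return str(parsed)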
Code Example #26
def content_parse(url):
    html = URLparser(url)
    bs0bj = BeautifulSoup(html.read(), "html.parser")
    db_record = {}
    db_record.update({"url": url})

    obj = bs0bj.find("span", {"class": "view_subj_core"})
    obj = obj.get_text().strip()
    db_record.update({"title": obj})

    obj = bs0bj.find("span", {"class": "view_subj_date"})
    obj = obj.get_text().strip()
    db_record.update({"date": obj})

    try:
        obj = bs0bj.find("div", {"class": "view_txt_container"})
        obj = obj.get_text().strip()
        db_record.update({"post": post_wash(str(obj))})
    except:
        db_record.update({"post": 1})
    return db_record
Code Example #27
File: sig37.py Project: donut0310/SIGNUS_Crawler
def Parsing_post_data(bs, post_url, URL):
    try:
        time.sleep(2)  # throttle to avoid overloading the server
        return_data = []
        post_data = {}
        domain = Domain_check(URL['url'])

        author = bs.find("div", {
            "class": "sumTit"
        }).find("h3").find("span").text.strip()
        title = bs.find("div", {
            "class": "sumTit"
        }).find("h3").get_text(" ", strip=True).replace(author, "").strip()
        if author.find("관리자") != -1:
            author = "0"
        date = bs.find("dl", {
            "class": "date"
        }).findAll("dd")[1].find("span").text.strip()
        date = date + " 00:00:00"
        date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S"))
        post = bs.find("div", {
            "class": "tbRow clear"
        }).get_text(" ", strip=True)
        post = post_wash(post)
        img = 1

        post_data['title'] = title.upper()
        post_data['author'] = author.upper()
        post_data['date'] = date
        post_data['post'] = post.lower()
        post_data['img'] = img
        post_data['url'] = post_url

        return_data.append(post_data)
        return_data.append(title)
        return_data.append(date)
        return return_data
    except:
        return None
Code Example #28
def Parsing_post_data(bs, post_url, URL):
	return_data = []
	post_data = {}
	domain = Domain_check(URL['url'])

	title = bs.find("div", {"class": "view_subject"}).find("h5").get_text(" ", strip = True)
	author = bs.find("ul", {"class": "data"}).find("li").text.strip()
	date = now		# 'now' is assumed to be a module-level "%Y-%m-%d %H:%M:%S" timestamp string
	date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
	post = bs.find("div", {"class": "view_contents"}).get_text(" ", strip = True)
	post = post_wash(post)
	img = 1
	post_data['title'] = title.upper()
	post_data['author'] = author.upper()
	post_data['date'] = date
	post_data['post'] = post.lower()
	post_data['img'] = img
	post_data['url'] = post_url

	return_data.append(post_data)
	return_data.append(title)
	return_data.append(date)
	return return_data
Code Example #29
File: PK_dorm.py Project: iml1111/PKU_Crawler
def content_parse(domain, url):
    html = URLparser(url)
    bs0bj = BeautifulSoup(html.read(), "html.parser")
    db_record = {}
    db_record.update({"url": url})

    bs0bj = bs0bj.find("table", {"class": "board_view"})
    obj = bs0bj.find("thead").get_text().strip()
    db_record.update({"title": obj})

    obj = bs0bj.find("tbody").find("tr").find("td").find_next("td").find_next(
        "td")
    obj = obj.get_text().strip().split(" ")[2]
    db_record.update({"date": obj})

    try:
        obj = bs0bj.find("tbody").find("td", {"class": "tdc"})
        obj = obj.get_text().strip()
        db_record.update({"post": post_wash(str(obj))})
    except:
        db_record.update({"post": 1})

    return db_record
Code Example #30
File: sj41.py Project: iml1111/SOOJLE_Crawler
def Parsing_post_data(bs, post_url, URL):
    try:
        return_data = []
        post_data = {}
        domain = Domain_check(URL['url'])

        title = bs.find("span", {
            "class": "txt_jobfair"
        }).get_text(" ", strip=True)
        author = bs.find("span", {"class": "tit_company_name"}).text.strip()
        date = bs.find("p", {"class": 'info'}).find("span").text.strip()
        date = date + " 00:00:00"
        date = str(datetime.datetime.strptime(date, "%Y.%m.%d %H:%M:%S"))
        post = ""
        posts = bs.findAll("dl", {"class": "qna_list"})
        for posts_one in posts:
            post += posts_one.get_text(" ", strip=True)
        post = post_wash(post)
        tag_done = tag.tagging(URL, title)
        img = 1
        # post_data takes the form {'title': , 'author': , 'date': , 'post': , 'tag': [], 'fav_cnt': 0, 'view': 0}
        post_data['title'] = title.upper()
        post_data['author'] = author.upper()
        post_data['date'] = date
        post_data['post'] = post.lower()
        post_data['tag'] = tag_done  # holds a tag string of the form tag1/tag2/tag3/tag4/.../
        post_data['img'] = img
        post_data['url'] = post_url

        return_data.append(post_data)
        return_data.append(title)
        return_data.append(date)
        return return_data
    except:
        return None