def Parsing_post_data(post_url, URL):
    """Scrape every post row of the 'sj4' udream board list page.

    Parameters:
        post_url (str): board list page to fetch.
        URL (dict): site descriptor; passed through to tag.tagging().

    Returns:
        list[dict]: one post_data dict per post newer than the configured
        cutoff date, with keys title/author/date/end_data/post/tag/img/url.
    """
    post_data_prepare = []
    end_date = date_cut_dict['sj4']  # cutoff: posts on/before this date are skipped
    s = udream.login()  # authenticated udream session (project helper)
    page = s.get(post_url).text
    bs = BeautifulSoup(page, "html.parser")
    posts = bs.find("tbody").findAll("tr")  # one <tr> per post
    for post in posts:
        # Row cell layout: [title, post1, post2, post3, end-date, link]
        post_infoes = post.findAll("td")
        post_data = {}
        title = post_infoes[0].get_text(" ", strip=True)
        author = post_infoes[0].find("div").text
        if author.find("관리자") != -1:  # admin posts are anonymized as "0"
            author = "0"
        end_data = post_infoes[4].text + " 00:00:00"  # event end date of this post
        # NOTE: renamed from `post` — the original reassigned (shadowed) the
        # loop variable here, which is error-prone.
        phrase = post_infoes[1].get_text(" ", strip=True) \
            + post_infoes[2].get_text(" ", strip=True) \
            + post_infoes[3].get_text(" ", strip=True) \
            + "~" + post_infoes[4].get_text(" ", strip=True)
        phrase = post_wash(phrase)  # project-wide text cleanup helper
        tag_done = tag.tagging(URL, title)
        phrase = phrase[:200]  # body text capped at 200 chars
        img = 1
        url = post_infoes[5].find("a")["href"]
        post_data['title'] = title.upper()
        post_data['author'] = author.upper()
        post_data['date'] = str(
            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
        post_data['end_data'] = datetime.datetime.strptime(
            end_data, "%Y-%m-%d %H:%M:%S")
        post_data['post'] = phrase.upper()
        post_data['tag'] = tag_done  # "tag1/tag2/tag3/.../" style tag string
        post_data['img'] = img
        post_data['url'] = url
        print(end_data, "::::", title)
        # BUGFIX: the original compared an undefined name `date` (NameError on
        # the first iteration). The only per-post date here is `end_data`, so
        # skip posts whose end date is on/before the cutoff, keep newer ones.
        if str(end_data) <= end_date:
            continue
        post_data_prepare.append(post_data)
    s.close()
    return post_data_prepare
def Parsing_post_data(post_url, URL):
    """Scrape every post row of the 'sj5' udream board list page.

    Parameters:
        post_url (str): board list page to fetch.
        URL (dict): site descriptor; 'post_url' is the post-link prefix and
            the dict is passed through to tag.tagging().

    Returns:
        list[dict]: one post_data dict per post newer than the configured
        cutoff date, with keys title/author/date/post/tag/img/url.
    """
    post_data_prepare = []
    end_date = date_cut_dict['sj5']  # cutoff: posts on/before this date are skipped
    s = udream.login()  # authenticated udream session (project helper)
    page = s.get(post_url).text
    bs = BeautifulSoup(page, "html.parser")
    posts = bs.find("tbody").findAll("tr")  # one <tr> per post
    for post in posts:
        post_infoes = post.findAll("td")
        post_data = {}
        title = post_infoes[0].get_text(" ", strip=True)
        author = post.find("div").text.strip()
        if author.find("관리자") != -1:  # admin posts are anonymized as "0"
            author = "0"
        date = post_infoes[3].text + " 00:00:00"  # post date, normalized to midnight
        date = str(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S"))
        phrase = post_infoes[1].text + post_infoes[2].get_text(" ", strip=True)
        phrase = post_wash(phrase)  # project-wide text cleanup helper
        tag_done = tag.tagging(URL, title)
        img = 1
        # NOTE(review): extracting the post id by splitting the <a> tag's repr
        # on '"' is fragile — it relies on attribute order; verify against the
        # live markup before trusting index 3.
        url_num = str(post_infoes[4].find("a")).split('"')[3]
        url = URL['post_url'] + url_num
        post_data['title'] = title.upper()
        post_data['author'] = author.upper()
        post_data['date'] = date
        # BUGFIX: was phrase.lower() — inconsistent with the sibling parsers,
        # which store every text field uppercased for case-insensitive search.
        post_data['post'] = phrase.upper()
        post_data['tag'] = tag_done  # "tag1/tag2/tag3/.../" style tag string
        post_data['img'] = img
        post_data['url'] = url
        print(date, "::::", title)
        # Keep only posts dated strictly after the cutoff.
        if str(date) <= end_date:
            continue
        post_data_prepare.append(post_data)
    s.close()
    return post_data_prepare
def Parsing_list_url(URL, page_url):
    """Collect the individual post URLs listed on a udream board page.

    Parameters:
        URL (dict): site descriptor; 'post_url' is the post-link prefix.
        page_url (str): board list page to fetch.

    Returns:
        list[str]: one full post URL per row on the page.
    """
    session = udream.login()  # authenticated udream session (project helper)
    html = session.get(page_url).text
    # lxml parser is required here: html.parser errors on this page.
    soup = BeautifulSoup(html, "lxml")
    rows = soup.findAll("tr", {"onmouseover": "hctrOn(this)"})
    # The post number is the first single-quoted token of the row's onclick.
    links = [
        URL['post_url'] + row.find("a")["onclick"].split("'")[1]
        for row in rows
    ]
    session.close()
    return links
def Parsing_list_url(URL, page_url):
    """Collect the individual post URLs from the board table on a udream page.

    Parameters:
        URL (dict): site descriptor; 'post_url' is the post-link prefix.
        page_url (str): board list page to fetch.

    Returns:
        list[str]: one full post URL per row of the board table.
    """
    # CLEANUP: the original computed `domain = Domain_check(URL['url'])` and
    # never used it; the dead local (and its call) was removed.
    url_list = []
    s = udream.login()  # authenticated udream session (project helper)
    page = s.get(page_url).text
    bs = BeautifulSoup(page, "html.parser")
    posts = bs.find("table", {
        "class": "table b-t b-light"
    }).find("tbody").findAll("tr")
    for post in posts:
        # The post number is the first single-quoted token of the onclick.
        num = post.find("div")["onclick"]
        post_num = num.split("'")[1]
        url_list.append(URL['post_url'] + post_num)
    s.close()
    return url_list