def get_link():
    """Fetch the Agora board index and record the newest numbered page.

    Scrapes the onion front page for ``<number>.html`` links, appends the
    newest page number to a tracking file, and returns the raw link text
    (e.g. ``"145.html"``).

    Returns:
        str: the last ``\\d+.html`` link found on the index page.

    Raises:
        IndexError: if the page contains no matching links.
    """
    agora_url = 'http://c2djzrn6qx6kupkn.onion/'
    with open('/home/kyw/agoraHTMLnumber/agoraHTMLnumber.txt', 'a') as wf:
        with requests.Session() as s:
            site = cr.Site(agora_url)
            tup = site.staticGet(s, site.stem)
            s, html = tup[0], tup[1]
            # Raw string: '\d' in a plain literal is an invalid escape
            # (SyntaxWarning on modern Python).
            links = re.compile(r'href="(\d+\.html)').findall(html.text)
            # NOTE: the original used .strip('.html'), which strips the
            # CHARACTER SET {., h, t, m, l} from both ends — only accidentally
            # correct for all-digit names. rsplit removes the extension safely.
            wf.write(links[-1].rsplit('.', 1)[0] + '\n')
            return links[-1]
def agoraMultiCrawler(new_html):
    """Crawl one Agora board page (``<new_html>.html``) and extract its posts.

    Args:
        new_html: page number (or number-like string) appended to the site
            stem as ``/{new_html}.html``.

    Returns:
        OrderedDict with keys:
            'html'    — raw page markup (str),
            'content' — list of per-post OrderedDicts with keys
                        author/title/id/date/message,
            'url'     — final response URL after redirects.
    """

    def _recode(text):
        # Pages are served mislabeled as iso-8859-1; round-trip the bytes to
        # recover the intended UTF-8, and drop stray newlines.
        return text.encode('iso-8859-1').decode('utf-8').strip('\n')

    with requests.Session() as s:
        return_data = OrderedDict()
        site = cr.Site('http://c2djzrn6qx6kupkn.onion/')
        tup = site.staticGet(s, site.stem + "/{}.html".format(new_html))
        s, html, soup = tup[0], tup[1].text, tup[2]

        messages = soup.find_all("div", {"class": "message"})
        labels = soup.find_all("label")
        # Renamed from `id` — avoid shadowing the builtin.
        reflinks = soup.find_all("span", {"class": "reflink"})

        content_data = list()
        for reflink, label, message in zip(reflinks, labels, messages):
            poster_span = label.find("span", {"class": "postername"})
            title_span = label.find("span", {"class": "filetitle"})
            posterman = _recode(poster_span.get_text()) if poster_span is not None else None
            filetitle = _recode(title_span.get_text()) if title_span is not None else None

            # Remove every <span> child so label.get_text() below yields
            # only the remaining date text.
            for lab in label("span"):
                lab.decompose()

            mid = reflink.find_all('a')[-1].get_text()
            date = _recode(label.get_text()).strip(' ')
            ms = _recode(message.get_text())

            temp_data = OrderedDict()
            temp_data['author'] = posterman
            temp_data['title'] = filetitle
            temp_data['id'] = mid
            temp_data['date'] = date
            temp_data['message'] = ms
            content_data.append(temp_data)

        return_data['html'] = html
        return_data['content'] = content_data
        return_data['url'] = tup[1].url
        return return_data
except Exception as e: error_data[title['titleURL']] = e cr.mkjson(error_data, '/json_datas/highkorea', 'hkContent_error.json') pass if __name__ == '__main__': tod = datetime.date.today() todstr = tod.isoformat() loginPage = 'http://highkorea5ou4wcy.onion/ucp.php?mode=login' mainpage = 'http://highkorea5ou4wcy.onion' ID = 'michin' passwd = 'michin' LOGIN_INFO = {'username': ID, 'password': passwd} start_time = time.time() highkorea = cr.Site(mainpage) session = highkorealogin(LOGIN_INFO, highkorea, loginPage) tup = getForums(session, highkorea, highkorea.stem) session, forumtitles, forumurls = tup[0], tup[1], tup[2] tup = getLastPage(session, highkorea, forumtitles, forumurls) session, lastpages = tup[0], tup[1] pool = Pool(processes=4) # 4개의 프로세스를 사용합니다. results = pool.starmap( getTitles, zip(repeat(session), repeat(highkorea), lastpages.values())) cr.mkjson(results, '/json_datas', 'hkTitle.json') for i, forum in enumerate(results): content = list() pool = Pool(processes=4) # 4개의 프로세스를 사용합니다. results = pool.starmap(getContent, zip(repeat(session), repeat(highkorea), forum)) cr.mkjson(results, '/json_datas/highkorea',
html_datas.append(html) temp_data['image'] = image_datas temp_data['html'] = html_datas temp_data['content'] = content_datas return_data[topicurl] = temp_data return return_data # except Exception as e: # error_data[title['titleURL']] = e # cr.mkjson(error_data, '/home/kyw/json_datas/zion', 'zion_Content_error.json') # pass if __name__ == '__main__': mainpage = 'http://hzionerlko3on77m.onion' loginPage = 'http://hzionerlko3on77m.onion/ucp.php?mode=login' zion = cr.Site(mainpage) ID = 'chickenS2' passwd = 'chickenS2' LOGIN_INFO = {'username': ID, 'password': passwd} start_time = time.time() session = zionlogin(LOGIN_INFO, zion, loginPage) print(time.time() - start_time) tup = hktc.getForums(session, zion, zion.stem) session, forumtitles, forumurls = tup[0], tup[1], tup[2] tup = getLastPage(session, zion, forumtitles, forumurls) session, lastpages = tup[0], tup[1] pool = Pool(processes=4) # 4개의 프로세스를 사용합니다. results = pool.starmap( getTitles, zip(repeat(session), repeat(zion), lastpages.values())) cr.mkjson(results, '/home/kyw/json_datas/zion', 'zion_Title.json') for i, title in enumerate(results):