예제 #1
0
def get_link():
    """Fetch the Agora index page and record its last numbered page link.

    Requests the site's stem URL through ``cr.Site.staticGet``, collects
    every ``href="<digits>.html`` target found in the response text, appends
    the numeric part of the last match (one number per line) to
    ``/home/kyw/agoraHTMLnumber/agoraHTMLnumber.txt`` and returns that match.

    Returns:
        str: The last matched link, including its ``.html`` suffix.

    Raises:
        IndexError: If the page contains no ``<digits>.html`` link.
    """
    suffix = '.html'
    with requests.Session() as s:
        site = cr.Site('http://c2djzrn6qx6kupkn.onion/')
        # staticGet returns a tuple; its second item exposes the page body
        # through ``.text``.
        response = site.staticGet(s, site.stem)[1]
        # Raw string: ``\d`` and ``\.`` are regex escapes, not str escapes.
        links = re.findall(r'href="(\d+\.html)', response.text)

    if not links:
        raise IndexError(
            'no numbered .html link found on the Agora index page')

    latest = links[-1]
    # The pattern guarantees the match ends with ".html", so slicing the
    # suffix off is exact (str.strip would remove a character *set* instead).
    number = latest[:-len(suffix)]

    # Open the log only once there is something to write, so a failed fetch
    # does not touch the file.
    with open('/home/kyw/agoraHTMLnumber/agoraHTMLnumber.txt', 'a') as wf:
        wf.write(number + '\n')
    return latest
예제 #2
0
def _agora_fix_encoding(text):
    """Re-interpret *text* as UTF-8 bytes that were decoded as ISO-8859-1.

    NOTE(review): presumably the page text arrives mis-decoded as
    ISO-8859-1; confirm against ``cr.Site.staticGet``.  When the round-trip
    is impossible (the text is not Latin-1 encodable, or the bytes are not
    valid UTF-8) the text is returned unchanged instead of aborting the
    whole crawl.
    """
    try:
        return text.encode('iso-8859-1').decode('utf-8')
    except UnicodeError:
        return text


def _agora_span_text(label, css_class):
    """Return the re-decoded text of ``label``'s ``<span class=css_class>``.

    Returns ``None`` when the label has no such span.
    """
    span = label.find("span", {"class": css_class})
    if span is None:
        return None
    return _agora_fix_encoding(span.get_text()).strip('\n')


def agoraMultiCrawler(new_html):
    """Download one numbered Agora page and extract its posts.

    Args:
        new_html: Value formatted into ``"/{}.html"`` and appended to the
            site's stem to build the page URL.

    Returns:
        OrderedDict: With keys, in order:
            ``'html'``    - the response text of the page,
            ``'content'`` - a list of OrderedDicts, one per post, holding
                            ``'author'``, ``'title'``, ``'id'``, ``'date'``
                            and ``'message'``,
            ``'url'``     - the ``url`` attribute of the response.
    """
    with requests.Session() as s:
        site = cr.Site('http://c2djzrn6qx6kupkn.onion/')
        # staticGet returns a tuple: item 1 is the response (``.text``,
        # ``.url``) and item 2 is the parsed soup.
        tup = site.staticGet(s, site.stem + "/{}.html".format(new_html))
        response, soup = tup[1], tup[2]

        reflinks = soup.find_all("span", {"class": "reflink"})
        labels = soup.find_all("label")
        messages = soup.find_all("div", {"class": "message"})

        content_data = []
        # zip() stops at the shortest of the three result lists.
        for reflink, label, message in zip(reflinks, labels, messages):
            # Read the named spans first: they are destroyed just below.
            author = _agora_span_text(label, "postername")
            title = _agora_span_text(label, "filetitle")

            # Remove every <span> from the label so that the remaining
            # label text (stored under 'date') excludes them.
            for span in label.find_all("span"):
                span.decompose()

            post_id = reflink.find_all('a')[-1].get_text()
            date = _agora_fix_encoding(label.get_text()).strip(
                '\n').strip('  ')
            body = _agora_fix_encoding(message.get_text()).strip('\n')

            content_data.append(OrderedDict([
                ('author', author),
                ('title', title),
                ('id', post_id),
                ('date', date),
                ('message', body),
            ]))

        return OrderedDict([
            ('html', response.text),
            ('content', content_data),
            ('url', response.url),
        ])
예제 #3
0
    except Exception as e:
        error_data[title['titleURL']] = e
        cr.mkjson(error_data, '/json_datas/highkorea', 'hkContent_error.json')
        pass


if __name__ == '__main__':
    tod = datetime.date.today()
    todstr = tod.isoformat()
    loginPage = 'http://highkorea5ou4wcy.onion/ucp.php?mode=login'
    mainpage = 'http://highkorea5ou4wcy.onion'
    ID = 'michin'
    passwd = 'michin'
    LOGIN_INFO = {'username': ID, 'password': passwd}
    start_time = time.time()
    highkorea = cr.Site(mainpage)
    session = highkorealogin(LOGIN_INFO, highkorea, loginPage)
    tup = getForums(session, highkorea, highkorea.stem)
    session, forumtitles, forumurls = tup[0], tup[1], tup[2]
    tup = getLastPage(session, highkorea, forumtitles, forumurls)
    session, lastpages = tup[0], tup[1]
    pool = Pool(processes=4)  # 4개의 프로세스를 사용합니다.
    results = pool.starmap(
        getTitles, zip(repeat(session), repeat(highkorea), lastpages.values()))
    cr.mkjson(results, '/json_datas', 'hkTitle.json')
    for i, forum in enumerate(results):
        content = list()
        pool = Pool(processes=4)  # 4개의 프로세스를 사용합니다.
        results = pool.starmap(getContent,
                               zip(repeat(session), repeat(highkorea), forum))
        cr.mkjson(results, '/json_datas/highkorea',
예제 #4
0
        html_datas.append(html)
    temp_data['image'] = image_datas
    temp_data['html'] = html_datas
    temp_data['content'] = content_datas
    return_data[topicurl] = temp_data
    return return_data
    # except Exception as e:
    #     error_data[title['titleURL']] = e
    #     cr.mkjson(error_data, '/home/kyw/json_datas/zion', 'zion_Content_error.json')
    #     pass


if __name__ == '__main__':
    mainpage = 'http://hzionerlko3on77m.onion'
    loginPage = 'http://hzionerlko3on77m.onion/ucp.php?mode=login'
    zion = cr.Site(mainpage)
    ID = 'chickenS2'
    passwd = 'chickenS2'
    LOGIN_INFO = {'username': ID, 'password': passwd}
    start_time = time.time()
    session = zionlogin(LOGIN_INFO, zion, loginPage)
    print(time.time() - start_time)
    tup = hktc.getForums(session, zion, zion.stem)
    session, forumtitles, forumurls = tup[0], tup[1], tup[2]
    tup = getLastPage(session, zion, forumtitles, forumurls)
    session, lastpages = tup[0], tup[1]
    pool = Pool(processes=4)  # 4개의 프로세스를 사용합니다.
    results = pool.starmap(
        getTitles, zip(repeat(session), repeat(zion), lastpages.values()))
    cr.mkjson(results, '/home/kyw/json_datas/zion', 'zion_Title.json')
    for i, title in enumerate(results):