Ejemplo n.º 1
0
# r is printed and advanced once per pass of the crawl loop below;
# num is the running index used as the prefix of each output file name.
r, num = 1, 0
start = time.time()  # wall-clock reference taken before crawling begins

# Startup banner.  The keyword is printed by its own call so that
# "Keyword is:" is already on screen if sys.argv[1] turns out to be missing.
print("Keyword is:")
print(sys.argv[1])
print("Start!",
      "=================================================================",
      sep="\n")

# Crawl loop (excerpt).  No exit condition appears in the lines shown, and the
# `try` opened below has no visible `except`/`finally` -- the snippet is cut off.
# NOTE(review): `page` and `keyword` are not assigned anywhere in this excerpt;
# presumably they are set earlier in the full script -- confirm.
while True:
    web = []
    print(r)  # show the loop counter before it is advanced
    r = r + 1

    html = _ptool.get_doc(page)
    web = _ptool.get_articles(keyword, html)
    try:
        for match_article in web:
            article_push = []

            # r was already incremented above, so the value embedded in the
            # file name is one higher than the value printed for this pass.
            file_name = str(num) + "_" + keyword + "_" + str(r) + ".txt"
            num = num + 1

            # One output file per article; the title is written first.
            with open(file_name, mode='w', encoding='utf8') as fin:
                fin.write(match_article['title'])

                # Fetch the article itself and run the _ptool extractors on it.
                article_page = _ptool.get_web(match_article['link'])
                article_html = _ptool.get_doc(article_page)
                article_push = _ptool.get_push(article_html)
                context = _ptool.get_in_article(article_html)
Ejemplo n.º 2
0
import _ptool
import sys
import os
from threading import Thread, Lock
from pymongo import MongoClient

# Board name: third command-line argument.
ptt_name = sys.argv[3]

# Fetch and parse the board's landing page; the page counter is seeded from it.
url = 'https://www.ptt.cc/bbs/' + ptt_name + '/index.html'
pagefornum = _ptool.get_web(url)
docfornum = _ptool.get_doc(pagefornum)

# Shared counters read and updated by the worker function:
#   data_n - running article counter, starts at 1
#   num    - next index page to claim, counted down by the workers
data_n = 1
num = _ptool.get_index(docfornum) + 1

# MongoDB target: database "ptt", one collection per board name.
client = MongoClient()
db = client['ptt']
postin = db[ptt_name]

print('home page is :%d' % num)
print('home url is : %s' % url,
      '====================================================================',
      sep='\n')




def mul_ptt(tid, keyword, lock):
    """Worker loop over the module-level page counter ``num`` (truncated excerpt).

    Only the start of the body is visible here: the loop takes ``lock`` and
    tests whether ``num`` is still positive.  ``tid`` and ``keyword`` are not
    used in the lines shown.
    """
    while True:

        # Everything visible in this excerpt runs while holding the lock.
        with lock:
            global num
            if num > 0:
                # Declared global here; the excerpt ends before any use of it.
                global ptt_name
Ejemplo n.º 3
0
def mul_ptt(tid, keyword, lock):
    """Crawl PTT index pages for *keyword* and store each match in MongoDB.

    Each pass claims the current value of the module-level counter ``num`` as
    a page number, decrements the counter, downloads that index page and asks
    ``_ptool.get_articles`` for the matching articles.  Every article that
    can be downloaded is inserted as one document into the module-level
    ``postin`` collection.  The loop ends when ``num`` is no longer positive.

    Args:
        tid: Integer id of the calling thread.  Used in log messages, in the
            names of the per-thread error files and in the stored document.
        keyword: Keyword handed to ``_ptool.get_articles`` and stored with
            every document.
        lock: Lock guarding the shared counters ``num`` and ``data_n``.

    Returns:
        None.

    Side effects:
        Decrements ``num`` and increments ``data_n``; appends the URL of an
        index page whose article list came back as ``False`` to
        ``WebError_<tid>.txt`` and the link of an article whose document came
        back as ``False`` to ``Error_<tid>.txt``; prints progress messages.
    """
    global num, data_n

    while True:
        # Hold the lock only while touching the shared page counter so the
        # slow download and database work below can overlap across threads.
        # NOTE(review): assumes the _ptool helpers may be called from several
        # threads at once -- confirm.
        with lock:
            # "<= 0" rather than "== 0": a counter that is already negative
            # must end the worker instead of spinning forever.
            if num <= 0:
                print("Thread %d done" % tid)
                break
            page_num = num
            num = num - 1

        ptt_url = ('https://www.ptt.cc/bbs/' + sys.argv[3] + '/index' +
                   str(page_num) + '.html')
        print('thread id %d at link : %s' % (tid, ptt_url))

        page = _ptool.get_web(ptt_url)
        html = _ptool.get_doc(page)
        # Find all the articles on this page that match the keyword.
        web = _ptool.get_articles(keyword, html)

        if web is False:
            # Record the page that could not be processed and move on.
            with open("WebError_" + str(tid) + ".txt", mode='a',
                      encoding='utf8') as f:
                f.write(ptt_url)
                f.write("\n")
            continue

        try:
            for match_article in web:
                # data_n is shared between threads, so bump it under the lock.
                with lock:
                    data_n = data_n + 1

                article_page = _ptool.get_web(match_article['link'])
                article_html = _ptool.get_doc(article_page)

                if article_html is False:
                    # Record the article that could not be downloaded.
                    with open("Error_" + str(tid) + ".txt", mode='a',
                              encoding='utf8') as f:
                        f.write(match_article['link'])
                        f.write("\n")
                    continue

                article_push = _ptool.get_push(article_html)
                _dict = _ptool.get_in_article(article_html)
                _detail = _ptool.get_detail(match_article['title'],
                                            _dict['time'])
                retext = _ptool.get_retext(article_html)

                post = {
                    "Tid": tid,
                    "Keyword": keyword,
                    "Kind": _detail['Kind'],
                    "Isre": _detail['Isre'],
                    "Title": match_article['title'],
                    "Author": match_article['author'],
                    "Week": _detail['Week'],
                    "Month": _detail['Month'],
                    "Date": _detail['Date'],
                    "Time": _detail['Time'],
                    "Year": _detail['Year'],
                    "Link": match_article['link'],
                    "Push": article_push[0],
                    "Re": article_push[1],
                    "F**k": article_push[2],
                    "Text": _dict['text'],
                    "Retext": retext,
                    "Postive": 0,
                    "Negative": 0,
                    "Score": 0,
                }

                postin.insert_one(post)
                print("Thread %d post in database" % tid)

        except TypeError as e:
            # A TypeError ends work on this page (presumably a helper handed
            # back an unexpected value); report it rather than hiding it.
            print("Thread %d abandoned page %s: %s" % (tid, ptt_url, e))
Ejemplo n.º 4
0
def mul_ptt(tid, keyword, lock, dir_path):
    """Crawl PTT index pages for *keyword* and write each match to a text file.

    Each pass claims the current value of the module-level counter ``num`` as
    a page number, decrements the counter, downloads that index page and asks
    ``_ptool.get_articles`` for the matching articles.  Every article that
    can be downloaded is written to
    ``<dir_path>/<data_n>_<keyword>_<tid>.txt``.  The loop ends when ``num``
    is no longer positive.

    Args:
        tid: Integer id of the calling thread.  Used in log messages and in
            the names of the output and per-thread error files.
        keyword: Keyword handed to ``_ptool.get_articles``; also part of each
            output file name.
        lock: Lock guarding the shared counters ``num`` and ``data_n``.
        dir_path: Directory (as a string) that receives the article files.

    Returns:
        None.

    Side effects:
        Decrements ``num`` and increments ``data_n``; appends the URL of an
        index page whose article list came back as ``False`` to
        ``WebError_<tid>.txt`` and the link of an article whose document came
        back as ``False`` to ``Error_<tid>.txt`` (no output file is created
        for that article); prints progress messages.
    """
    global num, data_n

    while True:
        # Hold the lock only while touching the shared page counter so the
        # slow download and file work below can overlap across threads.
        # NOTE(review): assumes the _ptool helpers may be called from several
        # threads at once -- confirm.
        with lock:
            # "<= 0" rather than "== 0": a counter that is already negative
            # must end the worker instead of spinning forever.
            if num <= 0:
                print("done")
                break
            page_num = num
            num = num - 1

        ptt_url = ('https://www.ptt.cc/bbs/' + sys.argv[3] + '/index' +
                   str(page_num) + '.html')
        print('thread id %d at link : %s' % (tid, ptt_url))

        page = _ptool.get_web(ptt_url)
        html = _ptool.get_doc(page)
        # Find all the articles on this page that match the keyword.
        web = _ptool.get_articles(keyword, html)

        if web is False:
            print("Error happened at thread id = %s" % str(tid))
            with open("WebError_" + str(tid) + ".txt", mode='a',
                      encoding='utf8') as f:
                f.write(ptt_url)
                f.write("\n")
            continue

        file_error = "Error_" + str(tid) + ".txt"
        try:
            for match_article in web:
                # data_n is shared between threads: reserve this article's
                # number and advance the counter in one locked step.
                with lock:
                    article_no = data_n
                    data_n = article_no + 1
                file_name = (dir_path + "/" + str(article_no) + "_" +
                             keyword + "_" + str(tid) + ".txt")

                # Download first, so a failed download never leaves a file
                # behind that has to be removed again.
                article_page = _ptool.get_web(match_article['link'])
                article_html = _ptool.get_doc(article_page)

                if article_html is False:
                    with open(file_error, mode='a', encoding='utf8') as f:
                        f.write(match_article['link'])
                        f.write("\n")
                    continue

                article_push = _ptool.get_push(article_html)
                retext = _ptool.get_retext(article_html)
                _dict = _ptool.get_in_article(article_html)
                _detail = _ptool.get_detail(match_article['title'],
                                            _dict['time'])

                # Assemble the whole file body up front: a problem with any
                # field then surfaces before the file is created, so no
                # half-written file is left on disk.
                content = "".join([
                    match_article['title'],
                    _dict['text'],
                    "\n\n",
                    "\nLink:",
                    match_article['link'],
                    "\n\nAuthor: %s" % match_article['author'],
                    "\n\n推文數:",
                    str(article_push[0]),
                    "\n回文數:",
                    str(article_push[1]),
                    "\n噓文數:",
                    str(article_push[2]),
                    "\n",
                    _detail['Week'],
                    "\n",
                    _detail['Month'],
                    "\n",
                    _detail['Date'],
                    "\n",
                    _detail['Time'],
                    "\n",
                    _detail['Year'],
                    "\n",
                    _detail['Isre'],
                    "\n",
                    _detail['Kind'],
                    retext,
                ])

                with open(file_name, mode='w', encoding='utf8') as fin:
                    fin.write(content)

                # The with-block has closed the file; report its state.
                print('thread id %d closed status :' % (tid))
                print(fin.closed)  # check_file_closed

        except TypeError as e:
            # A TypeError ends work on this page (presumably a helper handed
            # back an unexpected value); report it rather than hiding it.
            print("Thread %d abandoned page %s: %s" % (tid, ptt_url, e))