def main():
    """Entry point: ask the user for a page count and crawl that many pages.

    For more than one page the crawl is delegated to a worker thread and
    timed; for a single page getContent is called directly.
    """
    pageNum = int(input("输入爬取的页数:"))
    url = "https://www.mzitu.com/jiepai/"
    if pageNum > 1:
        start_time = time.time()
        worker = MyThread(pageNum, url)
        worker.start()
        worker.join()
        print("运行时间为:", time.time() - start_time)
    else:
        getContent.getContent(url)
def test_getContent3(self):
    """Smoke test: fetch three O*NET result pages and print what comes back.

    Fix: the original used a Python 2 print *statement*; `print(...)` with a
    single argument behaves identically on Python 2 and 3.
    """
    print(getContent([
        (1, u'http://www.onetonline.org/link/result/11-3071.01?c=tk&n_tk=0&e_tk=1&c_tk=50&s_tk=IM&n_tt=20&s_tt=s&e_tt=L&e_tt=C&n_kn=10&c_kn=50&s_kn=IM&n_sk=10&c_sk=50&s_sk=IM&n_ab=10&c_ab=50&s_ab=IM&n_wa=10&c_wa=50&s_wa=IM&n_dw=10&a_iw=g&a_iw=i&a_iw=d&a_iw=t&n_cx=10&c_cx=50&c_in=50&n_ws=10&c_ws=50&c_wv=50&n_wn=10&c_wn=50&n_cw=10&s_cw=CIP&g=Go'),
        (1, u'http://www.onetonline.org/link/result/11-3071.02?c=tk&n_tk=0&e_tk=1&c_tk=50&s_tk=IM&n_tt=20&s_tt=s&e_tt=L&e_tt=C&n_kn=10&c_kn=50&s_kn=IM&n_sk=10&c_sk=50&s_sk=IM&n_ab=10&c_ab=50&s_ab=IM&n_wa=10&c_wa=50&s_wa=IM&n_dw=10&a_iw=g&a_iw=i&a_iw=d&a_iw=t&n_cx=10&c_cx=50&c_in=50&n_ws=10&c_ws=50&c_wv=50&n_wn=10&c_wn=50&n_cw=10&s_cw=CIP&g=Go'),
        (1, u'http://www.onetonline.org/link/result/11-3071.03?c=tk&n_tk=0&e_tk=1&c_tk=50&s_tk=IM&n_tt=20&s_tt=s&e_tt=L&e_tt=C&n_kn=10&c_kn=50&s_kn=IM&n_sk=10&c_sk=50&s_sk=IM&n_ab=10&c_ab=50&s_ab=IM&n_wa=10&c_wa=50&s_wa=IM&n_dw=10&a_iw=g&a_iw=i&a_iw=d&a_iw=t&n_cx=10&c_cx=50&c_in=50&n_ws=10&c_ws=50&c_wv=50&n_wn=10&c_wn=50&n_cw=10&s_cw=CIP&g=Go'),
    ]))
def getText():
    """Fetch the current article and save it as text/<title>.txt.

    Returns:
        The article title on success, or -1 if a file for that title
        already exists (nothing is written in that case).
    """
    article = getContent()
    article_title = article['title']
    article_content = article['content']
    files = [f for f in listdir('text') if isfile(join('text', f))]
    if article_title + '.txt' in files:
        # Fix: py2 print statement -> call form, identical on py2/py3.
        print("already there")
        return -1
    # Fix: context manager guarantees the handle is closed even if the
    # write raises (the original leaked it on error).
    # NOTE(review): writing encode("utf-8") bytes to a text-mode handle is
    # Python 2 behavior; under py3 this would need mode 'wb' — confirm the
    # target interpreter.
    with open('text/' + article_title + '.txt', 'w') as file_out:
        file_out.write(article_content.encode("utf-8"))
    return article_title
def ToMysql(q):
    """Drain "id url" strings from queue *q*, crawl each URL and insert the
    result into the Content table.

    Fixes:
    - `while (q.empty)` never *called* the method, so the condition was the
      always-truthy bound method itself (an infinite loop) — and the intent
      is clearly to loop while items remain: `while not q.empty():`.
    - The local variable shadowed the builtin `str`; renamed to `record`.
    - `db.close()` now runs even if a crawl or query raises.
    """
    db = pymysql.connect("localhost", "root", "123456", "testdb")
    cursor = db.cursor()
    try:
        while not q.empty():
            record = q.get().split(' ')
            sql = "SELECT enterpriseName FROM temp_icp_web2 where autoID = %s"
            cursor.execute(sql, int(record[0]))
            result = cursor.fetchone()
            print(record[0], '内容获取...')
            record.append(getContent(record[1], 1))
            print(record[0], '爬取完成,读入数据库...')
            sql = "INSERT INTO Content (id,company,url,content) VALUES (%s,%s,%s,%s)"
            cursor.execute(sql, (int(record[0]), result[0], record[1], record[2]))
            db.commit()
            print(record[0], '读入成功')
    finally:
        db.close()
def controller(keyword, numberOfLinks):
    """Search *keyword*, scrape up to *numberOfLinks* result pages and hand
    the collected contents to prepareDoc.

    Returns "done" once the document has been saved.
    """
    links = getLinks.start(keyword, numberOfLinks)
    contents = {}
    index = 1
    for link in links:
        page_text = getContent.getContent(link)
        # Failed scrapes are reported as the literal string "error"; skip them.
        if page_text != "error":
            contents[index] = page_text
            index += 1
    if prepareDoc.prepareDoc(contents) == "done":
        print("File saved")
        return "done"
def test_getContent2(self):
    """Smoke test: fetch one O*NET result page and print what comes back.

    Fix: the original used a Python 2 print *statement*; `print(...)` with a
    single argument behaves identically on Python 2 and 3.
    """
    print(getContent([
        (2, 'http://www.onetonline.org/link/result/11-1031.00?c=ta&n_ta=0&n_tt=20&s_tt=s&e_tt=L&e_tt=C&n_kn=10&c_kn=50&s_kn=IM&n_sk=10&c_sk=50&s_sk=IM&n_ab=10&c_ab=50&s_ab=IM&n_wa=10&c_wa=50&s_wa=IM&n_dw=10&a_iw=g&a_iw=i&a_iw=d&a_iw=t&n_wc=10&c_wc=50&c_in=50&n_ws=10&c_ws=50&c_wv=50&n_wn=10&c_wn=50&n_cw=10&s_cw=CIP&g=Go'),
    ]))
def __init__(self, url):
    # Fetch the content for *url* and keep it on the instance, then run the
    # main routine immediately.
    # NOTE(review): calling self.main() from the constructor means all work
    # (and any I/O it does) starts at object creation — confirm intended.
    self.gc = getContent(url)
    self.main()
from getContent import getContent
from getURL import getURL
import MySQLdb

# Build "code|title|description" records from job_id_titles.txt, where each
# input line is "code, title...".
#
# Fixes: both file handles are now closed via context managers (the input
# file was never closed and the output leaked on error); the output variable
# no longer shadows the builtin `file`; the Python 2 print statement is now
# the call form, which works on py2 and py3 alike.
with open("job_id_titles.txt", "r") as src:
    codeDescriptionList = [line.strip() for line in src]

with open("job_id_title_description", "w+") as out_file:
    for each_item in codeDescriptionList:
        gensimIndex = each_item.index(',')
        code = each_item[:gensimIndex].strip()
        jobtitles = each_item[gensimIndex + 1:]
        try:
            url = getURL(code)
            description = getContent(url)
            out_file.write(code + "|" + jobtitles + "|" + description + '\n')
        except ValueError:
            # Best-effort: skip lines whose code can't be resolved.
            print("value err")
import sys

sys.path.append('../')

from getContent import getContent
from getData import getData
from writeData import writeData

if __name__ == '__main__':
    # Fetch the weather page, parse it, and dump the result to CSV.
    target_url = 'http://www.weather.com.cn/weather/101210101.shtml'
    page_html = getContent(target_url)
    parsed = getData(page_html)
    writeData(parsed, 'D:/weather.csv')
    print('mytest')
passwd = "8269202"
DBName = "bullhorn"
db = MySQLdb.connect(mySQLUrl, userName, passwd, DBName, charset='utf8', use_unicode=True)
cursor = db.cursor()

# Each input line is "code, title...".  Fix: the file is now closed via a
# context manager (the original handle was never closed).
with open("job_id_titles.txt", "r") as src:
    codeDescriptionList = [line.strip() for line in src]

for each_item in codeDescriptionList:
    gensimIndex = each_item.index(',')
    code = each_item[:gensimIndex].strip()
    jobtitles = each_item[gensimIndex + 1:]
    try:
        url = getURL(code)
        description = getContent(url)
        integerCode = int(code.replace('-', ''))
        # Fix: the original built the INSERT via %-interpolation and stripped
        # single quotes by hand — that corrupts stored text and is SQL
        # injection prone.  Parameterized execution lets the driver escape
        # values correctly, so the quote-stripping is no longer needed.
        sql = "INSERT INTO SOC_JOBTITLE VALUES (%s, %s, %s)"
        print(sql)
        cursor.execute(sql, (integerCode, jobtitles, description))
        db.commit()
    except ValueError:
        # Best-effort: skip lines whose code can't be parsed/resolved.
        print("value err")

cursor.close()
db.close()
def test_getContent(self):
    """Smoke test: fetch one O*NET result page and print what comes back.

    Fix: the original used a Python 2 print *statement*; `print(...)` with a
    single argument behaves identically on Python 2 and 3.
    """
    print(getContent([
        (1, 'http://www.onetonline.org/link/result/11-1021.00?c=tk&n_tk=0&e_tk=1&c_tk=50&s_tk=IM&n_tt=20&s_tt=s&e_tt=L&e_tt=C&n_kn=10&c_kn=50&s_kn=IM&n_sk=10&c_sk=50&s_sk=IM&n_ab=10&c_ab=50&s_ab=IM&n_wa=10&c_wa=50&s_wa=IM&n_dw=10&a_iw=g&a_iw=i&a_iw=d&a_iw=t&n_cx=10&c_cx=50&c_in=50&n_ws=10&c_ws=50&c_wv=50&n_wn=10&c_wn=50&n_cw=10&s_cw=CIP&g=Go'),
    ]))
def run(self):
    """Crawl self.pageNum pages, starting at self.url and following the
    value each crawl returns as the next target.
    """
    next_page = getContent.getContent(self.url)
    # First page already fetched above; fetch the remaining pageNum - 1.
    for _ in range(self.pageNum - 1):
        # Move back up one directory before each follow-up crawl —
        # presumably getContent chdirs into a per-page folder; confirm.
        os.chdir("../")
        next_page = getContent.getContent(next_page)
import getLinks
import getContent
import prepareDoc

keyword = input("Enter keyword to be searched - ")
links = getLinks.start(keyword)

# Collect successfully scraped pages, keyed by a running 1-based counter.
contents = {}
counter = 1
for link in links:
    page_text = getContent.getContent(link)
    # Failed scrapes are reported as the literal string "error"; skip them.
    if page_text != "error":
        contents[counter] = page_text
        counter += 1

# prepareDoc persists the collected contents and reports a status string.
if prepareDoc.prepareDoc(contents) == "done":
    print("File saved")