Example #1
0
def main():
    """Prompt for a page count and crawl starting from the fixed gallery URL.

    When more than one page is requested, the crawl runs in a ``MyThread``
    worker and the elapsed wall-clock time is printed once it finishes.
    Otherwise a single ``getContent.getContent`` call is made on the
    start URL.
    """
    start_url = "https://www.mzitu.com/jiepai/"
    page_count = int(input("输入爬取的页数:"))

    if page_count <= 1:
        getContent.getContent(start_url)
        return

    # Multi-page crawl: hand off to the worker thread and time it.
    started_at = time.time()
    worker = MyThread(page_count, start_url)
    worker.start()
    worker.join()
    print("运行时间为:", time.time() - started_at)
 def test_getContent3(self):
     """Print what ``getContent`` returns for three ``(1, url)`` pairs."""
     job_links = [
         (1, u'http://www.onetonline.org/link/result/11-3071.01?c=tk&n_tk=0&e_tk=1&c_tk=50&s_tk=IM&n_tt=20&s_tt=s&e_tt=L&e_tt=C&n_kn=10&c_kn=50&s_kn=IM&n_sk=10&c_sk=50&s_sk=IM&n_ab=10&c_ab=50&s_ab=IM&n_wa=10&c_wa=50&s_wa=IM&n_dw=10&a_iw=g&a_iw=i&a_iw=d&a_iw=t&n_cx=10&c_cx=50&c_in=50&n_ws=10&c_ws=50&c_wv=50&n_wn=10&c_wn=50&n_cw=10&s_cw=CIP&g=Go'),
         (1, u'http://www.onetonline.org/link/result/11-3071.02?c=tk&n_tk=0&e_tk=1&c_tk=50&s_tk=IM&n_tt=20&s_tt=s&e_tt=L&e_tt=C&n_kn=10&c_kn=50&s_kn=IM&n_sk=10&c_sk=50&s_sk=IM&n_ab=10&c_ab=50&s_ab=IM&n_wa=10&c_wa=50&s_wa=IM&n_dw=10&a_iw=g&a_iw=i&a_iw=d&a_iw=t&n_cx=10&c_cx=50&c_in=50&n_ws=10&c_ws=50&c_wv=50&n_wn=10&c_wn=50&n_cw=10&s_cw=CIP&g=Go'),
         (1, u'http://www.onetonline.org/link/result/11-3071.03?c=tk&n_tk=0&e_tk=1&c_tk=50&s_tk=IM&n_tt=20&s_tt=s&e_tt=L&e_tt=C&n_kn=10&c_kn=50&s_kn=IM&n_sk=10&c_sk=50&s_sk=IM&n_ab=10&c_ab=50&s_ab=IM&n_wa=10&c_wa=50&s_wa=IM&n_dw=10&a_iw=g&a_iw=i&a_iw=d&a_iw=t&n_cx=10&c_cx=50&c_in=50&n_ws=10&c_ws=50&c_wv=50&n_wn=10&c_wn=50&n_cw=10&s_cw=CIP&g=Go'),
     ]
     # Single-argument print(...) emits the same text under Python 2 and 3.
     print(getContent(job_links))
Example #3
0
def getText():
    """Fetch an article with ``getContent()`` and save it as ``text/<title>.txt``.

    The article is expected to be a mapping with ``'title'`` and
    ``'content'`` entries. The content is written UTF-8 encoded.

    Returns:
        The article title once the file has been written, or ``-1`` when a
        file with that name already exists in the ``text`` directory (in
        which case nothing is written).
    """
    article = getContent()
    article_title = article['title']
    article_content = article['content']
    file_name = article_title + '.txt'

    # Only regular files count as "already saved"; sub-directories are ignored.
    files = [f for f in listdir('text') if isfile(join('text', f))]
    if file_name in files:
        print("already there")
        return -1

    # ``with`` closes the handle even if encoding or writing raises, which
    # the previous explicit open()/close() pair did not guarantee.
    with open('text/' + file_name, 'w') as file_out:
        file_out.write(article_content.encode("utf-8"))
    return article_title
Example #4
0
def ToMysql(q):
    """Drain ``q`` and store each fetched page in the ``Content`` table.

    Every queue item is a string whose first two space-separated fields are
    an ``autoID`` and a URL. For each item the enterprise name is looked up
    in ``temp_icp_web2``, the page is fetched with ``getContent(url, 1)``,
    and one row is inserted into ``Content`` and committed.

    Args:
        q: Queue-like object providing ``empty()`` and ``get()``.
    """
    # NOTE(review): connection credentials are hard-coded here; consider
    # loading them from configuration instead.
    # Keyword arguments are used because PyMySQL >= 1.0 rejects positional
    # connection parameters.
    db = pymysql.connect(host="localhost", user="root",
                         password="123456", database="testdb")
    try:
        cursor = db.cursor()
        # ``q.empty`` must be *called*: the bare bound method is always
        # truthy, which made the loop spin forever and block on ``q.get()``
        # once the queue ran dry.
        while not q.empty():
            fields = q.get().split(' ')
            record_id, url = fields[0], fields[1]

            sql = "SELECT enterpriseName FROM temp_icp_web2 where autoID = %s"
            cursor.execute(sql, (int(record_id),))
            # fetchone() yields None when no row matches; ``row[0]`` below
            # then raises TypeError, same as before.
            row = cursor.fetchone()

            print(record_id, '内容获取...')
            # Kept in its own variable rather than appended to ``fields`` so
            # an item with extra fields cannot be mistaken for the content.
            content = getContent(url, 1)
            print(record_id, '爬取完成,读入数据库...')

            sql = "INSERT INTO Content (id,company,url,content) VALUES (%s,%s,%s,%s)"
            cursor.execute(sql, (int(record_id), row[0], url, content))
            db.commit()
            print(record_id, '读入成功')
    finally:
        # Release the connection even when a fetch or query fails.
        db.close()
Example #5
0
def controller(keyword, numberOfLinks):
    """Collect page contents for a keyword search and pass them to prepareDoc.

    Links come from ``getLinks.start(keyword, numberOfLinks)``. Each link is
    fetched with ``getContent.getContent``; results equal to ``"error"`` are
    dropped and the remaining ones are numbered consecutively from 1.

    Returns:
        Always the string ``"done"``, regardless of what ``prepareDoc``
        reports.
    """
    found_links = getLinks.start(keyword, numberOfLinks)

    # Lazily fetch each link and keep only the successful results.
    fetched = (getContent.getContent(link) for link in found_links)
    usable = (page for page in fetched if page != "error")
    contents = dict(enumerate(usable, 1))

    if prepareDoc.prepareDoc(contents) == "done":
        print("File saved")

    return "done"
 def test_getContent2(self):
     """Print what ``getContent`` returns for a single ``(2, url)`` pair."""
     job_link = (2, 'http://www.onetonline.org/link/result/11-1031.00?c=ta&n_ta=0&n_tt=20&s_tt=s&e_tt=L&e_tt=C&n_kn=10&c_kn=50&s_kn=IM&n_sk=10&c_sk=50&s_sk=IM&n_ab=10&c_ab=50&s_ab=IM&n_wa=10&c_wa=50&s_wa=IM&n_dw=10&a_iw=g&a_iw=i&a_iw=d&a_iw=t&n_wc=10&c_wc=50&c_in=50&n_ws=10&c_ws=50&c_wv=50&n_wn=10&c_wn=50&n_cw=10&s_cw=CIP&g=Go')
     # Single-argument print(...) emits the same text under Python 2 and 3.
     print(getContent([job_link]))
Example #7
0
 def __init__(self, url):
     """Build a ``getContent`` object for ``url``, keep it, and run ``main``.

     Args:
         url: Passed unchanged to ``getContent``.
     """
     content_source = getContent(url)
     self.gc = content_source
     # Construction immediately kicks off the instance's main routine.
     self.main()
from getContent import getContent
from getURL import getURL
import MySQLdb


# Read "code,job titles" lines from job_id_titles.txt and write one
# "code|job titles|description" line per entry to job_id_title_description.
# Both files are opened with ``with`` so their handles are released even if
# a lookup fails part-way through.
with open("job_id_titles.txt", "r") as titles_file:
    codeDescriptionList = [line.strip() for line in titles_file]

with open("job_id_title_description", "w+") as out_file:
    for each_item in codeDescriptionList:
        try:
            # index() raises ValueError for a line without a comma (e.g. a
            # blank line); keeping it inside the try lets such lines be
            # reported and skipped instead of aborting the whole run.
            gensimIndex = each_item.index(',')
            code = each_item[:gensimIndex].strip()
            jobtitles = each_item[gensimIndex + 1:]
            url = getURL(code)
            description = getContent(url)
            out_file.write(code + "|" + jobtitles + "|" + description + '\n')
        except ValueError:
            print("value err")
Example #9
0
import sys
sys.path.append('../')
from getContent import getContent
from getData import getData
from writeData import writeData
if __name__ == '__main__':
    # Fetch the forecast page, extract the data from it, and write the
    # result to a CSV file.
    page_html = getContent('http://www.weather.com.cn/weather/101210101.shtml')
    writeData(getData(page_html), 'D:/weather.csv')
    print('mytest')
# NOTE(review): the database password is hard-coded in source; consider
# moving it to configuration or the environment.
passwd = "8269202"
DBName = "bullhorn"


db = MySQLdb.connect(mySQLUrl, userName, passwd, DBName, charset='utf8', use_unicode=True)
cursor = db.cursor()

# Placeholders let the driver quote every value. The scraped title and
# description therefore no longer need their single quotes stripped, and
# text containing quotes or backslashes cannot break or alter the statement.
insert_sql = """
    INSERT INTO SOC_JOBTITLE
    VALUES (%s, %s, %s);
"""

try:
    with open("job_id_titles.txt", "r") as titles_file:
        codeDescriptionList = [line.strip() for line in titles_file]

    for each_item in codeDescriptionList:
        try:
            # index() raises ValueError for a line without a comma; inside
            # the try such lines are reported and skipped rather than
            # aborting the load.
            gensimIndex = each_item.index(',')
            code = each_item[:gensimIndex].strip()
            jobtitles = each_item[gensimIndex + 1:]
            url = getURL(code)
            description = getContent(url)
            # int() also raises ValueError when the code is not purely
            # digits once the dashes are removed.
            integerCode = int(code.replace('-', ''))
            params = (integerCode, jobtitles, description)
            print("%s %r" % (insert_sql, params))
            cursor.execute(insert_sql, params)
            db.commit()
        except ValueError:
            print("value err")
finally:
    # Always release the cursor and connection, even after a failure.
    cursor.close()
    db.close()
 def test_getContent(self):
     """Print what ``getContent`` returns for a single ``(1, url)`` pair."""
     job_link = (1, 'http://www.onetonline.org/link/result/11-1021.00?c=tk&n_tk=0&e_tk=1&c_tk=50&s_tk=IM&n_tt=20&s_tt=s&e_tt=L&e_tt=C&n_kn=10&c_kn=50&s_kn=IM&n_sk=10&c_sk=50&s_sk=IM&n_ab=10&c_ab=50&s_ab=IM&n_wa=10&c_wa=50&s_wa=IM&n_dw=10&a_iw=g&a_iw=i&a_iw=d&a_iw=t&n_cx=10&c_cx=50&c_in=50&n_ws=10&c_ws=50&c_wv=50&n_wn=10&c_wn=50&n_cw=10&s_cw=CIP&g=Go')
     # Single-argument print(...) emits the same text under Python 2 and 3.
     print(getContent([job_link]))
Example #12
0
 def run(self):
     """Fetch ``self.url`` and then follow up for ``self.pageNum - 1`` more pages.

     Each ``getContent.getContent`` result is fed into the next call
     (presumably it is the next page's URL; confirm against getContent).
     """
     next_page = getContent.getContent(self.url)
     remaining = self.pageNum - 1
     # One extra fetch per additional page the user asked for.
     for _ in range(remaining):
         # Step back to the parent directory before each further fetch.
         os.chdir("../")
         next_page = getContent.getContent(next_page)
Example #13
0
import getLinks
import getContent
import prepareDoc

# Ask for a search keyword, fetch the content behind every link found for
# it, and hand the successful results (numbered from 1) to prepareDoc.
keyword = input("Enter keyword to be searched - ")
links = getLinks.start(keyword)

contents = {}
for link in links:
    page = getContent.getContent(link)
    if page == "error":
        # Skip links whose content could not be fetched.
        continue
    # Keys stay consecutive, so the next key is always the current size + 1.
    contents[len(contents) + 1] = page

if prepareDoc.prepareDoc(contents) == "done":
    print("File saved")