Example #1
    def __init__(self, url):
        self._result = []
        # fixed identifiers sent along with each request
        self._uid = "1000"
        self._bid = "28e05cbd"
        # current Unix timestamp, seconds only
        self._cid = str(int(time.time()))
        self._did = "4509435"

        self._url = url
        # fetch the page and parse it right away
        getter = GetHtml()
        getter.set(self._url)
        self._data = getter.get()
        self._process()
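GetHtml itself is not defined anywhere in this listing. Examples #1, #2, and #6 use it as a class with set()/get(), where get() returns the raw response bytes (Example #2 writes them straight to a .jpg, Example #6 decodes them). A minimal sketch under those assumptions:

import urllib.request

class GetHtml:
    def __init__(self):
        self._url = None

    def set(self, url):
        # remember the target URL for the next get()
        self._url = url

    def get(self):
        # fetch the URL and return the raw response body as bytes
        with urllib.request.urlopen(self._url) as resp:
            return resp.read()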
Example #2
import os

def download_img_list(img_dir, img_url, page):
    print("Start download", img_dir)
    img_list = GetImgList(img_url)
    img_getter = GetHtml()

    for s, img in enumerate(img_list):
        print('Page:', page, '**', img_dir, 'Now downloading', s + 1, "/",
              len(img_list))
        img_getter.set(str(img))
        # write the raw image bytes; the file is closed even if the write fails
        with open(os.path.join(img_dir, str(s) + '.jpg'), 'wb') as fs:
            fs.write(img_getter.get())
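GetImgList is not shown either. Example #2 iterates over its result and takes len() of it, so assuming it scrapes an album page and returns a list of image URLs, it might look like the sketch below; the <img src="..."> pattern is an assumption, not the original site's markup:

import re

def GetImgList(img_url):
    getter = GetHtml()
    getter.set(img_url)
    html = getter.get().decode('utf-8', 'ignore')
    # assumed pattern: collect every <img src="..."> on the album page
    return re.findall(r'<img src="(.*?)"', html)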
Example #3
def run(self):
    # collect users
    html = GetHtml(self.url)
    # groups: id, level, certificated, mainpage, portrait, nickname, address, follows
    # (粉丝 in the pattern means "followers")
    users = re.findall(r'divEditOperate_(\d*?)\".*?<span class=\"mainColor weight700\">.*?>(.*?)</span></span>(<br/>)?.*?href=(.*?) hidefocus.*?src=\"(.*?)\".*?alt=(.*?) title=.*?<p class="font12 lesserColor">(.*?)&nbsp;&nbsp;&nbsp;&nbsp;粉丝&nbsp;&gt;<span class="font12 mainColor">(\d*?)</span', html, re.S)
    # write each user to the database; a captured <br/> (group 2) marks a certificated account
    for user in users:
        certificated = "False" if user[2] == "" else "True"
        self.Write2database(user[0], user[1], certificated, "http://www.moko.cc" + user[3], user[4], user[5], user[6], user[7])
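Note that this example and the next ones call GetHtml as a plain function that returns the page HTML, or None on failure (the retry loops below depend on that). That variant is not shown in the listing either; a minimal sketch under those assumptions:

import urllib.request

def GetHtml(url, timeout=10):
    try:
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            # return the decoded page, or None if the request failed
            return resp.read().decode('utf-8', 'ignore')
    except OSError:
        return None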
Example #4
def run(self):
    while Producer_Flag or self.url_list != []:
        if self.url_list != []:
            url = self.url_list[0]
            # retry until the page is actually fetched
            html = GetHtml(url)
            while html is None:
                html = GetHtml(url)
            del self.url_list[0]

            try:
                bs = BeautifulSoup(html, 'html.parser')  # standard-library parser
                title = self.getTitle(bs)
                content = self.getContent(bs)
                print(" [+] Spider{0}: {1} parsed".format(self.id, title))
                writer.write(url, title, content)
            except Exception as e:
                print(" [-] Spider{0}: {1}\n\twhen parsing {2}".format(self.id, e, url))
                # push the url back to the front and parse it again
                self.url_list.insert(0, url)

    print(" [-] Spider {0} finished...".format(self.id))
Example #5
def run(self):
    html = GetHtml(self.url, timeout=3)
    while html is None:
        html = GetHtml(self.url, timeout=4)
    # webPage (set outside this method) selects the site-specific parsing rules
    if webPage == "https://qxs.la/":
        chapters = re.findall(r'<div class=\"chapter\">.*?<a href=\"(.*?)\"', html, re.S)
        # strip the "https://qxs.la" prefix (14 chars) so both urls match the relative chapter links
        self.start_url = self.start_url[14:]
        self.end_url = self.end_url[14:]
        start = chapters.index(self.start_url)
        end = chapters.index(self.end_url) + 1
        chapters = chapters[start:end]
        for s in chapters:
            url = "https://qxs.la" + s
            writer.addUrl(url)
            # hand the url to the least busy consumer
            minn = 6666
            minid = -1
            for consumer in consumer_list:
                if consumer.GetRemainedTasks() < minn:
                    minn = consumer.GetRemainedTasks()
                    minid = consumer.GetId()
            consumer_list[minid].addUrl(url)
    elif webPage == "http://www.clewx.com/":
        # without re.S the dot matches within a single line only; with re.S it also matches newlines
        chapters = re.findall(r'<dd><a href=\"(.*?)\" title', html)
        print(chapters)
        start = chapters.index(self.start_url)
        end = chapters.index(self.end_url) + 1
        chapters = chapters[start:end]
        for url in chapters:
            writer.addUrl(url)
            # hand the url to the least busy consumer
            minn = 6666
            minid = -1
            for consumer in consumer_list:
                if consumer.GetRemainedTasks() < minn:
                    minn = consumer.GetRemainedTasks()
                    minid = consumer.GetId()
            consumer_list[minid].addUrl(url)
    elif webPage == "http://www.k6uk.com/":
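The least-busy-consumer selection is repeated verbatim in each site branch above. Assuming the consumer objects expose GetRemainedTasks() and addUrl() as in the example, the same choice collapses to one min() call:

least_busy = min(consumer_list, key=lambda c: c.GetRemainedTasks())
least_busy.addUrl(url)

This also drops the example's implicit assumption that GetId() equals the consumer's index in consumer_list.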
Example #6
import os
import re

base_t_url = "https://l.bdear.xyz/"

find_tid = re.compile(r'<a href="thread-.*?</a>')
re_tid = re.compile(r'thread-.*?/')
re_title = re.compile(r'>.*?<')

if not os.path.exists('.\\Download'):
    os.mkdir('.\\Download')

threads = []
t_sum = 0
m_sum = 0
# base_url (the forum's list-page prefix) is defined outside this excerpt
for i in range(int(start), int(end) + 1):
    url = base_url + str(i) + '.html'

    ht = GetHtml()
    ht.set(url)
    html = ht.get().decode('utf-8', 'ignore')

    all_tid = find_tid.findall(html)
    for data in all_tid:
        if re_title.search(data) is not None:
            tid = re_tid.search(data).group(0).split('/')[0]
            # sanitise the title so it can be used as a directory name
            title = re_title.search(data).group(
                0).split('>')[1].split('<')[0].replace('.', '').replace(
                    ':', '-').replace('/', '-').replace('?', '-')

            # skip purely numeric titles
            if title.isdigit():
                continue
            img_dir = '.\\Download\\' + title
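A quick sanity check of the three patterns on an anchor of the shape they expect (the thread id and title below are made up):

sample = '<a href="thread-12345-1-1/">Some Title</a>'
data = find_tid.findall(sample)[0]
tid = re_tid.search(data).group(0).split('/')[0]                    # 'thread-12345-1-1'
title = re_title.search(data).group(0).split('>')[1].split('<')[0]  # 'Some Title'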
Example #7
def GetUserPages(url):
    html = GetHtml(url)
    pageNums = re.findall(r'onfocus=\"this\.blur\(\)\">(\d*?)<', html, re.S)
    # compare numerically, not as strings ("9" would otherwise beat "10")
    return max(int(n) for n in pageNums)
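A hedged usage sketch; the member-list URL is a placeholder, not taken from the original:

total = GetUserPages("http://www.moko.cc/subscribe/someuser/1.html")
print("pages:", total)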