def get_article(index_url):
    '''
    Collect every original post on the current index page and return a dict
    of lists: 'title' = post titles, 'name' = sanitized folder names,
    'urls' = post URLs.
    '''
    title_list = []
    path_list = []
    url_list = []

    html = down.get(index_url, 3)
    html.encoding = 'gbk'
    all_font = BeautifulSoup(html.text, 'lxml').find_all(color="green")
    for font in all_font:
        title = font.get_text()
        title_list.append(title)
        print('Found post:', title)  # a little progress output so the run is not silent
        path = str(title).replace(
            '?', '')  # some titles contain '?', which Windows forbids in folder names, so strip it
        path_list.append(path)
        mkdir(path)  # create a folder for this post; note that path is just the sanitized title
        href = font.parent['href']
        if dagaier_collection.find_one(
            {'主题页面': href}):  # skip topics already in the database; new ones fall through to else
            print('This page has already been crawled')
        else:
            url_list.append('https://liuyouba.tk/' + href)

    Aticles = dict(title=title_list, name=path_list, urls=url_list)

    return Aticles
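All of the snippets in this listing call an undefined download middleware down (as down.get(url, retries)) and, in this first example, a mkdir() helper. A minimal sketch of plausible implementations, assuming down.get is just a retrying wrapper around requests.get; none of this is taken from the original project:

import os
import time
import requests


class DownloadMiddleware:
    """Hypothetical sketch: retry a GET a few times and return the response."""

    def get(self, url, retries=3):
        for attempt in range(retries):
            try:
                return requests.get(url, timeout=10)
            except requests.RequestException:
                time.sleep(1)  # brief back-off before retrying
        raise RuntimeError('giving up on ' + url)


down = DownloadMiddleware()


def mkdir(path):
    """Hypothetical sketch: create the folder if it does not already exist."""
    os.makedirs(path, exist_ok=True)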
Example #2
 def img(self, url):
     girl = down.get(url, 3)
     img_soup = BeautifulSoup(girl.text, 'lxml')
     img_url = img_soup.find('div', class_='main-image').find('img')['src']
     self.img_urls.append(img_url)
     name = img_url[-9:-4]
     self.save(img_url, name)
Example #3
 def save(self, img_url):
     name = img_url[-9:-4]
     print('Saving:', img_url)
     img = down.get(img_url, 3)
     with open(name + '.jpg', 'ab') as f:
         f.write(img.content)
Example #4
def start(url):
    respone = down.get(url, 3)
    Soup = BeautifulSoup(respone.text, 'lxml')
    imgs_list = Soup.find('ul', id='pins').select('span > a')
    for imgs in imgs_list:
        title = imgs.get_text()
        href = imgs['href']
        spider_queue.push(href, title)
Example #5
def start(url):
    response = down.get(url, 3)
    all_a = BeautifulSoup(response.text,
                          'lxml').find('div', class_='all').find_all('a')
    for a in all_a:
        title = a.get_text()
        href = a['href']
        spider_queue.push(href, title)
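Examples #4 and #5 push work onto a spider_queue that is not shown; in the original project it is probably backed by a database. A purely illustrative in-memory stand-in:

class SpiderQueue:
    """Hypothetical stand-in: the real queue is likely persisted elsewhere."""

    def __init__(self):
        self._items = []

    def push(self, url, title):
        # record a topic page to be crawled later
        self._items.append((url, title))

    def pop(self):
        return self._items.pop(0) if self._items else None


spider_queue = SpiderQueue()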
Example #6
 def html(self, href):
     img_a_girl_html = down.get(href, 3)
     img_a_girl = BeautifulSoup(img_a_girl_html.text, 'lxml')
     max_span = img_a_girl.find(
         'div', class_='pagenavi').find_all('span')[-2].get_text()
     self.img(href)
     for page in range(2, int(max_span) + 1):
         page_url = href + '/' + str(page)
         self.img(page_url)
Example #7
 def html(self, href):
     html = down.get(href, 3)
     max_span = BeautifulSoup(html.text,
                              'lxml').find_all('span')[-2].get_text()
     page_num = 0  # counter used to tell when the last image has been downloaded
     for page in range(1, int(max_span) + 1):
         page_num = page_num + 1  # increments once per loop; when it reaches max_span we are on the last image
         page_url = href + '/' + str(page)
         self.img(page_url, max_span, page_num)  # hand both values on to the next function
Example #8
def SavePic(item):
    '''
    Save one fetched picture to disk via the down middleware.
    '''
    url = item.url
    filename = item.name
    basepath = 'E:/result/dagaier/'
    content = down.get(url, 3).content
    with open(basepath + filename, 'wb') as f:
        f.write(content)
    print('Finished downloading {}'.format(basepath + filename))
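SavePic() above and get_pic() in Example #11 pass around Item objects that only need url and name attributes. A minimal sketch, assuming Item is nothing more than a plain attribute container:

class Item:
    """Hypothetical sketch: a bare container for one picture to download."""

    def __init__(self, url=None, name=None):
        self.url = url    # direct image URL
        self.name = name  # relative path (folder/number.jpg) to save it under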
Example #9
 def all_url(self, url):
     html = down.get(url, 3)
     all_a = BeautifulSoup(html.text,
                           'lxml').find('div', class_='all').find_all('a')
     for a in all_a:
         title = a.get_text()
         self.title = title  # remember the topic title on self
         print('Saving:', title)
         path = str(title).replace("?", '_')
         self.mkdir(path)
         os.chdir("C:\\mzitu\\" + path)
         href = a['href']
         self.url = href  # remember the topic page URL on self
         if self.meizitu_collection.find_one(
             {'主题页面': href}):  # skip topics already in the database; new ones fall through to else
             print('This page has already been crawled')
         else:
             self.html(href)
Example #10
 def img(self, page_url, max_span, page_num):  # receives the values passed down from html()
     img_html = down.get(page_url, 3)
     img_url = BeautifulSoup(img_html.text, 'lxml').find(
         'div', class_='main-image').find('img')['src']
     self.img_urls.append(
         img_url
     )  # every image URL found by the page loop in html() gets collected in img_urls
     if int(max_span) == page_num:  # page_num equals max_span on the last image: save it and write the record to the database
         self.save(img_url)
         post = {  # build the record; the keys are the Chinese field names used throughout the database
             '标题': self.title,
             '主题页面': self.url,
             '图片地址': self.img_urls,
             '获取时间': datetime.datetime.now()
         }
         self.meizitu_collection.save(post)  # write the record to the database
         print('Inserted into the database')
     else:  # not the last image yet: just save it
         self.save(img_url)
Example #11
def get_pic(Aticles):
    '''
    Open each post URL, find the image addresses,
    record them in the database, and return them as a list of Item objects.
    '''
    # list used to collect the Item objects
    aticles_urls = []

    # split the titles, folder names and post links out of the dict
    aticle_list = Aticles['urls']
    basedir = Aticles['name']
    mongo_title = Aticles['title']
    # print(len(basedir))
    # print(len(aticle_list))
    pathNum = 0
    for url in aticle_list:
        img_html = down.get(url, 3)
        img_urls = BeautifulSoup(img_html.text, 'lxml').find_all('input',
                                                                 type='image')
        post = {  # build the record; the keys are the Chinese field names used throughout the database
            '标题': mongo_title[pathNum],
            '主题页面': url,
            '图片地址': [img['src'] for img in img_urls],
            '获取时间': datetime.datetime.now()
        }
        dagaier_collection.save(post)  # write the record to the database
        print('Inserted into the database')
        picNum = 1
        for img_url in img_urls:
            item = Item()
            item.url = img_url['src']
            item.name = str(basedir[pathNum]) + '/' + str(picNum) + '.jpg'
            aticles_urls.append(item)
            picNum = picNum + 1
        pathNum = pathNum + 1

    return aticles_urls
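get_article(), get_pic() and SavePic() look like one pipeline: index page to post URLs, post URLs to image URLs, image URLs to files on disk. A minimal driver sketch under that assumption; the index URL is purely illustrative:

if __name__ == '__main__':
    index_url = 'https://liuyouba.tk/forum.php'  # hypothetical index page
    articles = get_article(index_url)            # titles, folder names and post URLs
    for item in get_pic(articles):               # one Item per picture
        SavePic(item)                            # download it to E:/result/dagaier/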
Example #12
 def all_url(self, url):
     html = down.get(url, 3)
     Soup = BeautifulSoup(html.text, 'lxml')
     imgs_list = Soup.find('ul', id='pins').select('span > a')
     for imgs in imgs_list:
         title = imgs.get_text()
         self.title = title
         self.mkdir(title)
         href = imgs['href']
         self.url = href
         if self.mzitu_collection.find_one({'主题页面': href}):
             print('This page has already been crawled')
         else:
             self.html(href)
             post = {
                 '标题': self.title,
                 '主题页面': self.url,
                 '图片地址': self.img_urls,
                 '获取时间': datetime.datetime.now()
             }
             self.mzitu_collection.save(post)
             self.title = ''
             self.url = ''
             self.img_urls = []
Example #13
 def save(self, img_url, name):
     img = down.get(img_url, 3)
     print('Saving image as ' + name + '.jpg')
     with open(name + '.jpg', 'ab') as f:
         f.write(img.content)
Example #14
def main():
  if len(sys.argv) > 1:
    username = sys.argv[1]
  else:
    username = input('type the username you want to crawl: ')
  printWithTime('connecting database...')
  connect('douban_album')
  link='https://www.douban.com/people/'+username
  # http get
  main_page = down.get(link)
  # parse
  main_tree = html.fromstring(main_page.text)
  name = main_tree.xpath('//div[@class="user-info"]/div[@class="pl"]/text()')[0].strip()
  intro = main_tree.xpath('//span[@id="intro_display"]/text()')
  photo_link=link.rstrip('/')+'/photos'

  try:
    os.mkdir('data')
  except FileExistsError:
    pass
  os.chdir('data')
  try:
    os.mkdir(name)
  except FileExistsError:
    pass
  os.chdir(name)
  try:
    os.mkdir('albums')
  except FileExistsError:
    pass
  os.chdir('albums')

  # db
  try:
    user=userRecord.objects.get(username=username)
  except DoesNotExist:
    # not found -> create one
    user = userRecord(username=username)
    user.save()

  albumDict={}
  i=0
  while True:
    printWithTime('get album link at page '+str(i)+'...')
    photo_page = down.get(photo_link)
    time.sleep(1)
    photo_tree=html.fromstring(photo_page.text)
    albumList=photo_tree.xpath('//div[@class="wr"]/div[@class="albumlst"]')

    for album in albumList:
      link=album.xpath('.//div[@class="pl2"]/a/@href')[0]
      title = album.xpath('.//div[@class="pl2"]/a/text()')[0]
      #album filter
      total = int(album.xpath('.//span[@class="pl"]/text()')[0].split('张')[0].strip())  # get the total number of pictures of an album
      albumID = int(link.split('/')[-2])
      if total == albumRecord.objects(albumID=albumID,status=True).count():
        printWithTime("jump album {}".format(title))
        continue  # next album
      else:
        # not finished album
        pass
      albumDict[title]=[link,total]
    try:
      i+=1
      photo_link=photo_tree.xpath('//link[@rel="next"]/@href')[0]
    except IndexError: # the end of album page
      break

  for title, linkAndTotal in albumDict.items():
    getAllPhotoInAlbum(linkAndTotal[0],title,linkAndTotal[1])
  printWithTime("Mission Complete!")
  os.chdir('../../..')
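main() and getAllPhotoInAlbum() both log through a printWithTime() helper that is not included in the listing. A plausible sketch, assuming it only prefixes each message with a timestamp:

import datetime


def printWithTime(*args):
    """Hypothetical sketch: print the message prefixed with the current time."""
    print(datetime.datetime.now().strftime('[%Y-%m-%d %H:%M:%S]'), *args)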
Example #15
def getAllPhotoInAlbum(albumLink,title,total):
  photoIDList=[]
  albumID=int(albumLink.split('/')[-2])
  i=0
  printWithTime('{} begin...'.format(title))
  # if all in db (including status==False)
  if total == albumRecord.objects(albumID=albumID).count():
    pass # jump collecting
  else:
    # else continue collecting (add those not in db)
    while True:
      printWithTime('get photo link at page ' + str(i) + '...')
      page = down.get(albumLink)
      time.sleep(1)
      tree = html.fromstring(page.text)
      photoListInOnePage=tree.xpath('//a[@class="photolst_photo"]/@href')
      for photo in photoListInOnePage:
        picID = photo.split('/')[-2]
        try:
          albumRecord.objects.get(albumID=albumID, picID=picID)
        except DoesNotExist:  # not in db yet, add it with status=False
          temp = albumRecord(albumID=albumID, picID=picID, status=False)
          temp.save()
        else:  # already in db, skip
          continue
      try:
        albumLink = tree.xpath('//link[@rel="next"]/@href')[0]
        if albumLink[-3:]=='sep': # in order to distinguish the 'next page' of album from the 'next page' of reviews
          break
        i += 1
      except IndexError:
        break
  for queryResult in albumRecord.objects(albumID=albumID, status=False):
    photoIDList.append(queryResult['picID'])
  # todo display percentage
  for i,picID in enumerate(tqdm(photoIDList)):
    # #pic filter
    # try:
    #   albumRecord.objects.get(albumID=albumID, picID=picID)
    # except DoesNotExist:
    #   #crawl
    #   # and record
    #   temp = albumRecord(albumID=albumID, picID=picID)
    #   # todo append album to user (error in python3.5, non local variable)
    #   # user.album.append(temp)
    #   temp.save()
    # else:
    #   # visited
    #   continue
    photoURL='https://img3.doubanio.com/view/photo/l/public/p{}.webp'.format(picID)
    try:
      os.mkdir(title)
    except FileExistsError:
      pass
    filename = title+'/'+str(picID)+'.webp'
    with open(filename,'wb') as f:
      print('Download ' + photoURL.split('/')[-1] + ' of album "' + title + '"')
      f.write(down.get(photoURL).content)
    temp_img = Image.open(filename).convert("RGB")
    temp_img.save(title+'/'+str(picID)+'.jpg')
    os.remove(filename)
    time.sleep(1)
    # update status to True
    albumRecord.objects(albumID=albumID, picID=picID).update_one(status=True)
  printWithTime('done with album: '+title)
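Both Douban examples query userRecord and albumRecord with mongoengine-style .objects calls, but the document classes themselves are not shown. A sketch inferred from the fields actually used (username, albumID, picID, status); the field types are assumptions. DoesNotExist, caught in main(), is the standard mongoengine exception:

from mongoengine import Document, StringField, IntField, BooleanField


class userRecord(Document):
    # one record per crawled Douban user
    username = StringField(required=True)


class albumRecord(Document):
    # one record per photo; status becomes True once the file has been downloaded
    albumID = IntField(required=True)
    picID = StringField(required=True)
    status = BooleanField(default=False)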