コード例 #1
0
 def down_cover(self, message):
     """Download one book cover and report the outcome through the worker callbacks.

     Args:
         message: Sequence whose first item is the book id and whose second
             item is the cover image URL.

     When the target JPEG already exists, or the image is fetched and
     converted successfully, ``self.senddistributefinish('process_cover',
     bookid)`` is called.  When the request raises or the conversion
     fails, the unchanged message is handed to
     ``self.sendwork('down_cover', message)`` so the job is attempted again.
     """
     HEADER = {
         'User-Agent':
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
     }
     bookid = message[0]
     cover_url = message[1]
     filename = self.cover_path + '/' + bookid + '.jpg'
     if os.path.exists(filename):
         self.senddistributefinish('process_cover', bookid)
         return
     try:
         proxy = self.getproxy()
         proxies = {'http': proxy, 'https': proxy}
         resp = requests.get(cover_url,
                             headers=HEADER,
                             timeout=20,
                             proxies=proxies)
     except Exception:
         # Only ordinary errors lead to a retry.  A bare ``except`` would also
         # trap KeyboardInterrupt/SystemExit and requeue instead of stopping.
         self.sendwork('down_cover', message)
         return
     if utils.Img2Jpg(resp.content, filename):
         utils.printf('下载图片%s成功' % filename)
         self.senddistributefinish('process_cover', bookid)
     else:
         self.sendwork('down_cover', message)
コード例 #2
0
def downcover():
    """Download the journal covers linked from the AIP Scitation home page.

    Every ``div.publicationCoverImage`` element on the page contributes one
    cover.  The image is stored in a ``cover`` directory next to this file,
    named after the lower-cased last path segment of the element's link.
    Files that already exist are skipped.

    Returns:
        bool: ``False`` when the home page cannot be fetched or does not
        contain ``</html>``; ``True`` otherwise, even if individual covers
        could not be downloaded.
    """
    url = 'https://aip.scitation.org'
    cover_dir_fullpath = os.path.dirname(os.path.abspath(__file__)) + '/cover'
    if not os.path.exists(cover_dir_fullpath):
        os.makedirs(cover_dir_fullpath)
    try:
        resp = utils.get_html(url)
    except Exception:
        return False
    if not resp:
        return False
    if resp.text.find('</html>') < 0:
        return False
    soup = BeautifulSoup(resp.content.decode('utf8'), 'lxml')
    for divtag in soup.select('div.publicationCoverImage'):
        coverurl = url + divtag.a.img['src']
        covername = cover_dir_fullpath + '/' + divtag.a['href'].split(
            '/')[-1].lower() + '.jpg'
        if os.path.exists(covername):
            continue
        # The home-page fetch above is guarded against exceptions and empty
        # responses; apply the same guard here so one bad cover does not
        # abort the remaining downloads.
        try:
            cover_resp = utils.get_html(coverurl)
        except Exception:
            cover_resp = None
        if not cover_resp:
            utils.printf('下载', covername, '失败...')
            continue
        if utils.Img2Jpg(cover_resp.content, covername):
            utils.printf('下载', covername, '成功...')
            # Pause between successful downloads.
            time.sleep(3)

    return True
コード例 #3
0
def down_cover():
    """Download covers for ``video`` rows that still have ``cover_stat = 0``.

    Images are written to ``<cover_path>/<YYYYMMDD>/<rawid>.jpg``.  Rows whose
    file already exists or was saved successfully are collected and passed to
    ``utils.parse_results_to_sql`` together with an ``update ... set
    cover_stat = 1`` statement.  Rows are read in batches of up to 1000 until
    the select returns nothing.

    Relies on the module-level names ``DBHOST``, ``DBUSER``, ``DBPWD``,
    ``DB``, ``cover_path`` and ``proxy``.
    """
    # Keyword arguments: PyMySQL 1.0+ no longer accepts positional ones.
    conn = pymysql.connect(host=DBHOST,
                           user=DBUSER,
                           password=DBPWD,
                           database=DB)
    try:
        now_time = time.strftime('%Y%m%d')
        dirpath = cover_path + '/' + now_time
        if not os.path.exists(dirpath):
            os.makedirs(dirpath)
        sql_up = "update video set cover_stat = 1 where rawid = %s"
        sql = "select rawid,cover_url from video where cover_stat=0 limit 1000"
        result = []
        # NOTE(review): a row whose download keeps failing is never updated,
        # so it is selected again on the next pass and the loop cannot end
        # while such rows exist.  Consider a failure counter column.
        while True:
            cur = conn.cursor()
            try:
                cur.execute(sql)
                rows = cur.fetchall()
            finally:
                cur.close()
            if not rows:
                break
            for rawid, cover_url in rows:
                path = dirpath + '/%s.jpg' % rawid
                if os.path.exists(path):
                    # Check the disk first so an existing cover is not
                    # fetched again just to be thrown away.
                    result.append(rawid)
                    utils.printf("该", rawid, "存在...")
                else:
                    res = utils.get_html(cover_url, proxies=proxy, timeout=50)
                    if res:
                        if utils.Img2Jpg(res.content, path):
                            result.append(rawid)
                            utils.printf("下载", rawid, "成功...")
                        else:
                            print('%s -- down cover error' % rawid)
                # Presumably flushes once 100 results are pending and returns
                # a truthy value when it did - confirm in utils.
                if utils.parse_results_to_sql(conn, sql_up, result, 100):
                    total = len(result)
                    result.clear()
                    print('更新 ', total, ' 个结果到数据库成功')
            utils.parse_results_to_sql(conn, sql_up, result)
            print('更新 ', len(result), ' 个结果到数据库成功')
            # Without this the same ids are sent again after every batch.
            result.clear()
    finally:
        conn.close()
コード例 #4
0
 def down_cover(self, message):
     """Download every journal cover listed in the database, then queue ``mapcover``.

     Args:
         message: Not used by this method.

     Reads ``journal_id`` and ``cover_url`` for all ``journal`` rows with a
     non-empty ``cover_url``.  Each missing ``<cover_path>/<journal_id>.jpg``
     is fetched through a proxy from ``self.getproxy()`` and retried until
     ``utils.Img2Jpg`` succeeds.  Finally ``self.sendwork('mapcover')`` is
     called.
     """
     utils.printf('开始下载图片')
     if not self.cover_path:
         self.initpath()
     self.refreshproxypool()
     conn = utils.init_db('mysql', 'hepengineeringjournal', 4)
     cur = conn.cursor()
     cur.execute(
         "select journal_id,cover_url from journal where cover_url!=''")
     rows = cur.fetchall()
     HEADER = {
         'User-Agent':
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
     }
     for journal_id, cover_url in rows:
         filename = self.cover_path + '/' + journal_id + '.jpg'
         if os.path.exists(filename):
             continue
         # NOTE(review): retries are unbounded; a URL that never yields a
         # valid image keeps this loop running forever.
         while True:
             try:
                 proxy = self.getproxy()
                 proxies = {'http': proxy, 'https': proxy}
                 resp = requests.get(cover_url,
                                     headers=HEADER,
                                     timeout=20,
                                     proxies=proxies)
             except Exception:
                 # Not a bare ``except``: inside an endless retry loop that
                 # would also swallow Ctrl-C and make the worker unstoppable.
                 utils.printf(filename)
                 continue
             if utils.Img2Jpg(resp.content, filename):
                 utils.printf('下载图片%s成功' % filename)
                 break
     self.sendwork('mapcover')
コード例 #5
0
 def down_cover(self, message):
     """Download one cover from arc.aiaa.org and report the outcome.

     Args:
         message: Sequence whose first item is a URL (its last path segment
             names the image file) and whose second item is the cover path
             appended to ``https://arc.aiaa.org``.

     The image is stored as ``<cover_path>/10.2514/<last segment>.jpg``.
     When that file already exists or is written successfully,
     ``self.senddistributefinish('process_cover', url)`` is called.  When
     the request raises or the conversion fails, the unchanged message is
     handed to ``self.sendwork('down_cover', message)`` again.
     """
     HEADER = {
         'User-Agent':
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
     }
     url = message[0]
     cover_url = message[1]
     filename = '%s/10.2514/%s.jpg' % (self.cover_path, url.split('/')[-1])
     if os.path.exists(filename):
         self.senddistributefinish('process_cover', url)
         return
     try:
         proxy = self.getproxy()
         proxies = {'http': proxy, 'https': proxy}
         resp = requests.get('https://arc.aiaa.org' + cover_url,
                             headers=HEADER,
                             timeout=20,
                             proxies=proxies)
     except Exception:
         # Only ordinary errors lead to a retry.  A bare ``except`` would also
         # trap KeyboardInterrupt/SystemExit and requeue instead of stopping.
         exMsg = '* ' + traceback.format_exc()
         print(exMsg)
         self.sendwork('down_cover', message)
         return
     if utils.Img2Jpg(resp.content, filename):
         utils.printf('下载图片%s成功' % filename)
         self.senddistributefinish('process_cover', url)
     else:
         self.sendwork('down_cover', message)
コード例 #6
0
def down_cover():
    """Download the current-issue covers of four ASPET journals.

    Each cover is saved as ``<name>.jpg`` under the hard-coded cover
    directory.  For every image that was saved, one line of the form
    ``aspetjournal@<name>★/smartlib/aspetjournal/<name>.jpg`` is appended to
    ``aspetjournal_cover_<now_time>.txt`` in the same directory.

    Relies on the module-level name ``now_time``.
    """
    provider = 'aspetjournal'
    path = r'E:\work\DMAD\cover'
    # makedirs also creates missing parent directories, which os.mkdir does
    # not, and exist_ok avoids a separate existence check.
    os.makedirs(path, exist_ok=True)
    pathtxt = '%s/%s_cover_%s.txt' % (path, provider, now_time)
    dic_cover = {
        "dmd":
        "http://dmd.aspetjournals.org/current-issue/cover-image-medium/cover.jpg",
        "jpet":
        "http://jpet.aspetjournals.org/current-issue/cover-image-medium/cover.jpg",
        "molpharm":
        "http://molpharm.aspetjournals.org/current-issue/cover-image-medium/cover.jpg",
        "pharmrev":
        "http://pharmrev.aspetjournals.org/current-issue/cover-image-medium/cover.jpg"
    }
    for name, url in dic_cover.items():
        res = utils.get_html(url)
        if not res:
            continue
        jpg_path = '%s/%s.jpg' % (path, name)
        # Only record and report a cover that was really written; the
        # conversion result used to be ignored.
        if not utils.Img2Jpg(res.content, jpg_path):
            utils.printf('下载%s图片失败' % name)
            continue
        s = provider + '@' + name + '★' + '/smartlib/' + provider + '/' + name + '.jpg' + '\n'
        with open(pathtxt, 'a', encoding='utf-8') as f:
            f.write(s)
        utils.printf('下载%s图片成功' % name)
コード例 #7
0
 def down_cover(self):
     """Download covers for ``list`` rows with ``cover_stat = 0`` and ``fail_count < 10``.

     Images are written to ``<coverPath>\\<now_time>/<rawid>.jpg``.  A row is
     marked done (``cover_stat = 1``) when its file already exists, when its
     URL is one of the two known placeholder values, or when the image was
     downloaded and converted.  Every other outcome, including an exception
     during download or conversion, stores ``fail_count + 1`` for the row so
     that it stops being selected after repeated failures.
     """
     now_path = self.coverPath + '\\' + self.now_time
     if not os.path.exists(now_path):
         os.makedirs(now_path)
     sql_up = "update list set cover_stat = 1 where rawid = %s"
     sql_fail = "update list set fail_count = %s where rawid = %s"
     sql = "select rawid, cover_url, title, fail_count from list where cover_stat = 0 and fail_count < 10 limit 1000 "
     result = []

     def record_failure(rawid, fail_count):
         # Send just this row.  The previous shared list was never cleared,
         # so every earlier failure was written again on each new failure.
         utils.parse_results_to_sql(self.conn, sql_fail,
                                    [(fail_count + 1, rawid)])

     while True:
         cur = self.conn.cursor()
         cur.execute(sql)
         rows = cur.fetchall()
         if len(rows) == 0:
             break
         for rawid, cover_url, title, fail_count in rows:
             path = now_path + '/%s.jpg' % rawid
             if os.path.exists(path):
                 result.append(rawid)
                 print('%s exists' % path)
             elif ('zhlx.png' in cover_url
                   or cover_url == 'http://lib.fifedu.com/upload_dir/'):
                 # Placeholder URLs: nothing to download, mark as handled.
                 result.append(rawid)
                 print('%s no ' % path)
             else:
                 saved = False
                 try:
                     # Same 20 s timeout as the other downloaders, so a
                     # stalled server cannot block the run indefinitely.
                     res = requests.get(cover_url,
                                        headers=self.headers,
                                        timeout=20)
                     if res.status_code != 200:
                         print('status_code != 200 ~')
                     elif utils.Img2Jpg(res.content, path):
                         print('%s -- down cover right' % title)
                         saved = True
                     else:
                         print('%s -- down cover error' % title)
                 except Exception as e:
                     print(e)
                 if saved:
                     result.append(rawid)
                 else:
                     # Exceptions count as failures too; otherwise a URL that
                     # always raises is selected again forever.
                     record_failure(rawid, fail_count)
             # Presumably flushes once 100 results are pending - confirm in
             # utils.
             if utils.parse_results_to_sql(self.conn, sql_up, result,
                                           100):
                 print('插入%s条' % len(result))
                 result.clear()
         utils.parse_results_to_sql(self.conn, sql_up, result)
         print('插入剩下%s条' % len(result))
         result.clear()
コード例 #8
0
def down_cover():
    """Download covers for ``detail`` rows with ``cover_stat = 0`` and ``failcount < 5``.

    Images are fetched from ``http://www.360elib.com:2100`` + ``cover_url``
    and written to ``<cover_path>/<now_time>/<id>.jpg``, where ``<id>`` is
    the lower-cased row ``url`` with the ``Details.aspx?id=`` prefix removed.
    Successful rows get ``cover_stat = 1``; failed rows get ``failcount``
    incremented.  Both kinds of update are handed to
    ``utils.parse_results_to_sql``.

    Relies on the module-level names ``DBHOST``, ``DBUSER``, ``DBPWD``,
    ``DB``, ``cover_path`` and ``now_time``.
    """
    # Keyword arguments: PyMySQL 1.0+ no longer accepts positional ones.
    conn = pymysql.connect(host=DBHOST,
                           user=DBUSER,
                           password=DBPWD,
                           database=DB)
    try:
        sql_up = "update detail set cover_stat = 1 where cover_url = %s"
        sql_fail = "update detail set failcount =failcount+1 where cover_url = %s"
        sql = "select url,cover_url from detail where cover_stat = 0 and failcount < 5 limit 1000"
        result = []
        result_fail = []
        fdir = cover_path + '/' + now_time
        # makedirs also creates missing parent directories, which os.mkdir
        # does not.
        os.makedirs(fdir, exist_ok=True)
        while True:
            cur = conn.cursor()
            try:
                cur.execute(sql)
                rows = cur.fetchall()
            finally:
                cur.close()
            if not rows:
                break
            for url, cover_url in rows:
                rawid = url.replace("/chinese/web/Details.aspx?id=", "")
                new_id = rawid.lower()
                fname = "%s/%s.jpg" % (fdir, new_id)
                res = utils.get_html("http://www.360elib.com:2100" + cover_url)
                # A failed fetch and a failed conversion are treated alike.
                if res and utils.Img2Jpg(res.content, fname):
                    result.append(cover_url)
                    utils.printf("%s封面下载成功" % new_id)
                else:
                    result_fail.append(cover_url)
                    utils.printf("错误")
                # Presumably flushes once 100 entries are pending - confirm
                # in utils.
                if utils.parse_results_to_sql(conn, sql_fail, result_fail, 100):
                    print("错误%s条成功" % len(result_fail))
                    result_fail.clear()
                if utils.parse_results_to_sql(conn, sql_up, result, 100):
                    print("更新%s条成功" % len(result))
                    result.clear()
            utils.parse_results_to_sql(conn, sql_up, result)
            print("更新剩下%s条成功" % len(result))
            result.clear()
            utils.parse_results_to_sql(conn, sql_fail, result_fail)
            print("错误剩下%s条成功" % len(result_fail))
            result_fail.clear()
    finally:
        conn.close()
コード例 #9
0
def down_cover():
    """Download the PNAS cover image and append its record line.

    The cover URL is read from the ``cover-issue-image`` block of
    ``https://www.pnas.org/content/by/year`` and the image is saved to a
    hard-coded path.  Whether or not the download worked, the line
    ``pnasjournal@pnas★/smartlib/pnasjournal/pnas.jpg`` is appended to the
    hard-coded record file, as before.
    """
    from urllib.parse import urljoin

    path = r'E:\work\PNAS\cover\pnas.jpg'
    url = "https://www.pnas.org/content/by/year"
    res = utils.get_html(url)
    if res:
        html = Selector(res.text, 'html')
        cover_url = html.xpath("//div[@class='cover-issue-image']/a/img/@src").extract_first()
        utils.printf(cover_url)
        # extract_first() yields None when the page has no such element;
        # passing that to requests.get would raise instead of skipping.
        if cover_url:
            # Resolve relative and protocol-relative sources against the
            # page URL; absolute URLs are returned unchanged.
            cover = requests.get(urljoin(url, cover_url))
            if cover:
                utils.Img2Jpg(cover.content, path)
    provider = 'pnasjournal'
    pathtxt = r'E:\work\PNAS\cover\pnasjournal_cover_20191212.txt'
    s = provider + '@' + 'pnas' + '★' + '/smartlib/' + provider + '/' + 'pnas' + '.jpg' + '\n'
    with open(pathtxt, 'a', encoding='utf-8') as f:
        f.write(s)
コード例 #10
0
 def down_cover(self, message):
     """Locate and download one journal cover from www.cambridge.org.

     Args:
         message: Sequence of ``(url, filename, count)``: the journal path
             appended to ``https://www.cambridge.org``, the target image
             file, and the number of attempts made so far.

     ``self.senddistributefinish('process_cover')`` is called when the file
     already exists, when ``count`` has reached 5, when no usable image tag
     is found, and after the final conversion attempt (successful or not).
     When a page or image request returns nothing, the job is queued again
     through ``self.sendwork`` with ``count + 1``.
     """
     from urllib.parse import urljoin

     url = message[0]
     filename = message[1]
     count = message[2]
     if os.path.exists(filename):
         self.senddistributefinish('process_cover')
         return
     # ``>=`` rather than ``==`` so a count that is already past the limit
     # cannot slip through and retry without end.
     if count >= 5:
         self.senddistributefinish('process_cover')
         return
     feature = 'on-the-cover-homepage'
     utils.logerror(url)
     utils.printf(url)
     resp = self.gethtml('https://www.cambridge.org' + url)
     if not resp:
         self.sendwork('down_cover', (url, filename, count + 1))
         return
     html = resp.content.decode('utf-8')
     if html.find(feature) < 0:
         # The journal page has no cover block; try its latest-issue page.
         print("can not find on-the-cover-homepage")
         feature = 'row cover collapse'
         resp = self.gethtml(
             'https://www.cambridge.org' + url + '/latest-issue', feature)
         if not resp:
             self.sendwork('down_cover', (url, filename, count + 1))
             return
         html = resp.content.decode('utf-8')
     soup = BeautifulSoup(html, 'lxml')
     imgtag = soup.select_one('div.image.on-the-cover-homepage > a > img')
     if not imgtag:
         imgtag = soup.select_one(
             'div.large-4.medium-4.small-12.columns > div > div > div > a > img'
         )
     src = imgtag.get('src') if imgtag else None
     if not src:
         # No image tag, or a tag without ``src``: nothing to download.
         utils.logerror('%s:图片错误' % url)
         self.senddistributefinish('process_cover')
         return
     # Protocol-relative and https sources resolve exactly as before; http
     # and site-relative sources now produce a valid URL as well.
     cover_url = urljoin('https://www.cambridge.org', src)
     resp = self.gethtml(cover_url, feature=None, coverflag=True)
     if not resp:
         self.sendwork('down_cover', (url, filename, count + 1))
         return
     if utils.Img2Jpg(resp.content, filename):
         utils.printf('下载图片%s成功' % filename)
         self.senddistributefinish('process_cover')
     else:
         utils.printf('%s:图片错误' % url)
         utils.logerror('%s:图片错误' % url)
         self.senddistributefinish('process_cover')
コード例 #11
0
 def down_cover(self, message):
     """Download one cover from ascelibrary.org, retrying until it is saved.

     Args:
         message: Sequence whose first item is the target image file and
             whose second item is the cover path appended to
             ``https://ascelibrary.org``.

     ``self.senddistributefinish('process_cover', cover_url)`` is called
     once the file exists, either beforehand or after a successful
     download and conversion.
     """
     fname = message[0]
     cover_url = message[1]
     if os.path.exists(fname):
         self.senddistributefinish('process_cover', cover_url)
         return
     url = 'https://ascelibrary.org' + cover_url
     # NOTE(review): retries are unbounded; a URL that never yields a valid
     # image keeps this loop running forever.
     while True:
         try:
             proxy = self.getproxy()
             proxies = {'http': proxy, 'https': proxy}
             # Same 20 s timeout as the other downloaders, so a dead proxy
             # leads to a retry instead of hanging.
             resp = requests.get(url, proxies=proxies, timeout=20)
         except Exception:
             # Not a bare ``except``: inside an endless retry loop that
             # would also swallow Ctrl-C and make the worker unstoppable.
             exMsg = '* ' + traceback.format_exc()
             print(exMsg)
             continue
         if utils.Img2Jpg(resp.content, fname):
             utils.printf('下载图片%s成功' % fname)
             self.senddistributefinish('process_cover', cover_url)
             return
コード例 #12
0
 def down_cover(self, message):
     """Download the journal covers listed on sciencemag.org, then queue ``mapcover``.

     Args:
         message: Not used by this method.

     Every ``div.media__icon > a`` link that contains an image is saved as
     ``<cover_path>/<name>.jpg``, where ``<name>`` is the link target with
     ``http://`` removed, cut at the first dot.  The page fetch and each
     image download are retried until they succeed.
     """
     url = 'http://www.sciencemag.org/journals'
     # NOTE(review): both retry loops below are unbounded.
     while True:
         resp = self.gethtml(url)
         if resp:
             break
     html = resp.content.decode('utf-8')
     soup = BeautifulSoup(html, 'lxml')
     for aTag in soup.select('div.media__icon > a'):
         imageTag = aTag.select_one('img')
         if not imageTag:
             continue
         src = imageTag.get('src')
         if not src:
             # An <img> without ``src`` cannot be downloaded.
             continue
         # Build the file name only for links that carry an image, so a
         # link without ``href`` and without an image no longer crashes.
         filename = self.cover_path + '/' + aTag.get('href').replace(
             'http://', '').split('.')[0] + '.jpg'
         imageurl = 'http:' + src
         while True:
             try:
                 # Same 20 s timeout as the other downloaders, so a stalled
                 # server leads to a retry instead of hanging.
                 resp = requests.get(imageurl, timeout=20)
             except Exception:
                 # Not a bare ``except``: inside an endless retry loop that
                 # would also swallow Ctrl-C and make the worker unstoppable.
                 continue
             if utils.Img2Jpg(resp.content, filename):
                 break
     utils.printf('下载图片成功')
     self.sendwork('mapcover')