import json
import random
import re
import threading
import time

import pymysql
import requests
from lxml import etree

from Mysql_helper import MYSQL
from utils.Random_UserAgent import get_request_headers

count = 0  # shared progress counter updated by the worker threads


class Producer(threading.Thread):
    def __init__(self, antibody_url_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.antibody_url_queue = antibody_url_queue
        self.mysql = MYSQL('new_antibodies_info')

    def run(self):
        while True:
            if self.antibody_url_queue.empty():
                break
            url = self.antibody_url_queue.get()
            self.parse_page(url)
            time.sleep(random.uniform(1.5, 2.5))

    def parse_page(self, url):
        global count
        try:
            r = requests.get(url, headers=get_request_headers(), timeout=10)
            r.raise_for_status()
        except Exception as e:
            self.antibody_url_queue.put(url)
            print('Failed to fetch the antibody list page; putting the URL back in the queue')
            print(e)
        else:
            html = r.text

            try:
                catanum_list = re.findall('"ProductCode":"(.*?)"', html, re.S)
                catanum = get_first_item(catanum_list, 40)

                size_price_list = re.findall(
                    '"Size":"(.*?)".*?"Price":"(.*?)"', html, re.S)
                for size_price in size_price_list:
                    size = size_price[0]
                    price = size_price[1]
                    price_sql = 'insert into Abcam_Antibody_price(Catalog_Number,Size,Price) values("{}","{}","{}");'.format(
                        catanum, size, price)
                    self.mysql.insert_into_table(price_sql)

                # data = json.loads(html)
                # for i in data:
                #     title = i['Title']
                #     pmid = i['PubmedID']
                #     application = i['ApplicationsShortName']
                #     species = i['Species']
                #     citation_sql = 'insert into Abcam_Antibody_citations (Catalog_Number, PMID , Application, Species, Article_title) values ("{}","{}","{}","{}","{}");'.format(
                #         catanum, pmid, application, species, pymysql.escape_string(title))
                #     self.mysql.insert_into_table(citation_sql)

            except Exception as es:
                print(es)
                print(url)

            else:
                update_status_sql = 'update Abcam_Antibody_detail set Price_Status = "1" where Price_url = "%s";' % url
                self.mysql.insert_into_table(update_status_sql)
                count += 1
                print("\r获得抗体详情页进度: %d" % count, end="")
class Producer(threading.Thread):

    def __init__(self, antibody_url_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.antibody_url_queue = antibody_url_queue
        self.mysql = MYSQL('new_antibodies_info')

    def run(self):
        while True:
            if self.antibody_url_queue.empty():
                break
            url = self.antibody_url_queue.get()
            self.parse_page(url)
            time.sleep(random.uniform(1.5, 2.5))

    def parse_page(self, url):
        global count
        try:
            r = requests.get(url, headers=get_request_headers(), timeout=10)
            r.raise_for_status()
        except Exception as e:
            self.antibody_url_queue.put(url)
            print('Failed to fetch the antibody list page; putting the URL back in the queue')
            print(e)
        else:
            html = r.text
            try:
                catanum = url.split('=')[1]

                data = json.loads(html)
                for i in data:
                    title = i['Title']
                    pmid = i['PubmedID']
                    application = i['ApplicationsShortName']
                    species = i['Species']
                    citation_sql = 'insert into Abcam_Antibody_citations (Catalog_Number, PMID , Application, Species, Article_title) values ("{}","{}","{}","{}","{}");'.format(
                        catanum, pmid, application, species, pymysql.escape_string(title))
                    self.mysql.insert_into_table(citation_sql)

            except Exception as es:
                print(es)
                print(url)

            else:
                update_status_sql = 'update Abcam_Antibody_detail set Citations_Status = "1" where Citations_url = "%s";' % url
                self.mysql.insert_into_table(update_status_sql)
                count += 1
                print("\r获得抗体详情页进度: %d" % count, end="")
import requests
from utils.Random_UserAgent import get_request_headers
import re
from lxml import etree

from Mysql_helper import MYSQL

mysql = MYSQL('new_antibodies_info')
for x in range(1, 3061):
    url = 'https://www.abcam.cn/products/loadmore?selected.productType=Primary+antibodies&pagenumber=%d' % x
    sql = 'insert into Abcam_Antibody_list_url (Antibody_list_URL) values("{}");'.format(
        url)
    mysql.insert_into_table(sql)
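
# None of these snippets include the driver that fills the queue and starts
# the worker threads. A minimal sketch of how one of the Producer classes
# above might be run (the run_workers name and the thread count of 5 are
# assumptions; the original entry point is not part of these examples):
import queue


def run_workers():
    pageurl_queue = queue.Queue()
    for x in range(1, 3061):
        pageurl_queue.put(
            'https://www.abcam.cn/products/loadmore?selected.productType=Primary+antibodies&pagenumber=%d' % x)
    workers = [Producer(pageurl_queue) for _ in range(5)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()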
class Producer(threading.Thread):
    def __init__(self, pmid_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.pmid_queue = pmid_queue
        self.mysql = MYSQL('antibodies_info')

    def run(self):
        while True:
            if self.pmid_queue.empty():
                break
            pmid = self.pmid_queue.get()
            self.parse_page(pmid)
            time.sleep(random.uniform(0.5, 2))

    def parse_page(self, pmid):
        global count
        try:
            url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={}&mode=xml&_=1572314980717'.format(
                pmid)
            r = requests.get(url, headers=get_request_headers(), timeout=10)
            r.raise_for_status()
        except Exception as e:
            self.pmid_queue.put(pmid)
            print('Failed to fetch the PubMed record; putting {} back in the queue'.format(pmid))
            print(e)
        else:
            html = r.text
            #pmid_list = re.findall('<PMID Version="1">(.*?)</PMID>', html, re.S)
            authors = re.findall(
                '<LastName>(.*?)</LastName>.*?<ForeName>(.*?)</ForeName>',
                html, re.S)
            if not authors:
                authors = [('', '')]
            # Join the authors as "LastName ForeName, LastName ForeName, ...".
            author = ', '.join(last + ' ' + fore for last, fore in authors)
            pubdate = re.findall(
                '<PubDate>.*?<Year>(.*?)</Year>.*?<Month>(.*?)</Month>.*?<Day>(.*?)</Day>',
                html, re.S)
            if not pubdate:
                pubdate = [('', '', '')]
            # Keep the last match, formatted as "Year-Month-Day".
            pubdatetime = '-'.join(pubdate[-1])
            institution = re.findall('<Affiliation>(.*?)</Affiliation>', html,
                                     re.S)
            if not institution:
                institution = ['']
            journal_list = re.findall('<Title>(.*?)</Title>', html, re.S)
            journal = get_first_item(journal_list, 100)
            try:
                insert_sql = 'insert into abcam_citations_details (pmid, journal, pub_date, institution, author) values ("{}","{}","{}","{}","{}");'.format(
                    pmid, journal, pubdatetime, institution[0], author)
                self.mysql.insert_into_table(insert_sql)
            except Exception as es:
                print(es)
                print(url)
            else:
                update_sql = 'update abcam_pmid set pmid_Status = "1" where  pmid = {};'.format(
                    pmid)
                self.mysql.insert_into_table(update_sql)
                count += 1
                print("\r获得citation详情页进度: %d" % count, end="")
class Producer(threading.Thread):
    def __init__(self, antibody_url_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.antibody_url_queue = antibody_url_queue
        self.mysql = MYSQL('new_antibodies_info')

    def run(self):
        while True:
            if self.antibody_url_queue.empty():
                break
            url = self.antibody_url_queue.get()
            self.parse_page(url)
            time.sleep(random.uniform(2.5, 4.5))

    def parse_page(self, url):
        global count
        try:
            r = requests.get(url, headers=get_request_headers(), timeout=10)
            r.raise_for_status()
        except Exception as e:
            self.antibody_url_queue.put(url)
            print('Failed to fetch the antibody list page; putting the URL back in the queue')
            print(e)
        else:
            html = r.text
            element = etree.HTML(html)
            try:
                antiname_list = re.findall(
                    '产品名称</h3>.*?<div class="value">(.*?)<', html, re.S)
                antiname = antiname_list[0]
                cataNo_list = re.findall('data-product-code="(.*?)">', html,
                                         re.S)
                cataNo = cataNo_list[0]
                antibody_type_list = re.findall(
                    '克隆</h3>.*?<div class="value">(.*?)</div>', html, re.S)
                antibody_type = get_first_item(antibody_type_list, 30)
                lis = element.xpath('.//li[@class="attribute list"]')
                if lis:
                    li = lis[-1]
                    synonyms_list = li.xpath('.//li/text()')
                    synonyms = ','.join(synonyms_list)
                    synonyms = get_string(synonyms, 3000)
                else:
                    synonyms = ''
                # Select the sibling element of the heading with the given text
                divs = element.xpath('.//h3[.="经测试应用"]/following-sibling::div')
                if divs:
                    div = divs[0]
                    application_lists = div.xpath('./abbr/text()')
                    application = ','.join(application_lists)
                else:
                    application = ''
                conjugation_list = re.findall('<li>Conjugation: (.*?)</li>',
                                              html, re.S)
                conjugation = get_first_item(conjugation_list, 200)
                clone_number_list = re.findall(
                    '克隆编号</h3>.*?div class="value">(.*?)</div>', html, re.S)
                clone_number = get_first_item(clone_number_list, 40)
                recombinant_list = re.findall(
                    'product-label product-label--recombinant">(.*?)</div>',
                    html, re.S)
                if recombinant_list:  # check whether the recombinant label is present
                    recombinant = "yes"
                else:
                    recombinant = ''
                modify_list = re.findall('(描述</h3>.*?</div>)', html, re.S)
                if modify_list:
                    modify_list = re.findall('描述</h3>.*?\((.*?)\)',
                                             modify_list[0], re.S)
                    modify = get_first_item(modify_list, 200)
                else:
                    modify = ''
                host_list = re.findall(
                    '宿主</h3>.*?<div class="value">(.*?)</div>', html, re.S)
                host = get_first_item(host_list, 200)
                species_reactivity_list = re.findall(
                    '种属反应性</h3>.*?</strong>(.*?)<', html, re.S)
                species_reactivity = get_first_item(species_reactivity_list,
                                                    1000)
                abid = re.findall('data-track-value="(.*?)"', html, re.S)
                if abid:
                    price_url = 'https://www.abcam.cn/datasheetproperties/availability?abId=' + abid[
                        0]
                    sellable = 'yes'
                else:
                    price_url = ''
                    sellable = 'no'
                geneid_list = re.findall('Entrez Gene:(.*?)</li>', html, re.S)
                if geneid_list:
                    geneid_list = list(
                        map(lambda geneid: geneid.strip(), geneid_list))
                    geneid = ','.join(geneid_list)
                    if len(geneid) > 499:
                        geneid = geneid[0:480]
                else:
                    geneid = ''
                siRNA_list = re.findall('alt="使用(.*?)细胞株进行验证', html, re.S)
                if siRNA_list:  # check whether knockout (KO) validation is present
                    siRNA = "yes"
                else:
                    siRNA = ''
                swisprot_list = re.findall('SwissProt:(.*?)</li>', html, re.S)
                if swisprot_list:
                    swisprot_list = list(
                        map(lambda swisprot: swisprot.strip(), swisprot_list))
                    swisprot = ','.join(swisprot_list)
                    if len(swisprot) > 499:
                        swisprot = swisprot[0:480]
                else:
                    swisprot = ''
                predicted_mw_list = re.findall(
                    'Predicted band size:</b>(.*?)<br>', html, re.S)
                predicted_mw = get_first_item(predicted_mw_list, 200)
                observed_mw_list = re.findall('Observed band size:</b> (.*?)<',
                                              html, re.S)
                observed_mw = get_first_item(observed_mw_list, 200)
                isotype_list = re.findall(
                    '同种型</h3>.*?div class="value">(.*?)</div>', html, re.S)
                isotype = get_first_item(isotype_list, 100)
                citations_list = re.findall('被引用在 (.*?)文献中', html, re.S)
                citations = get_first_item(citations_list, 100)
                if citations_list:
                    reference_url = 'https://www.abcam.cn/DatasheetProperties/References?productcode=' + cataNo_list[
                        0]
                else:
                    reference_url = ''
                pdf_url_list = re.findall(
                    'class="pdf-links">.*?<li><a target="_blank" href="(.*?)"',
                    html, re.S)
                if pdf_url_list:
                    pdf_url = 'https://www.abcam.cn' + get_first_item(
                        pdf_url_list, 300)
                else:
                    pdf_url = ''
                review_list = re.findall('"reviewCount": "(.*?)"', html, re.S)
                review = get_first_item(review_list, 100)
                lis = element.xpath(
                    '//*[@id="description_images"]/div[2]/ul/li')
                image_qty = len(lis)
                # Insert the validation image records here
                if lis:
                    for li in lis:
                        image_url_list = li.xpath('./div/a/@href')
                        image_url = get_first_item(image_url_list, 500)
                        description_list = li.xpath('./div[1]/div/div//text()')
                        description = get_first_item(description_list, 1000)
                        image_sql = 'insert into Abcam_Antibody_images (Catalog_Number, Image_url, Image_description) values ("{}","{}","{}");'.format(
                            cataNo, image_url, description)
                        self.mysql.insert_into_table(image_sql)
                # Insert the application information here
                trs = element.xpath(
                    '//*[@id="description_applications"]/div[2]/table/tbody/tr'
                )
                if trs:
                    for tr in trs:
                        application_list = tr.xpath('./td[1]//text()')
                        application2 = get_first_item(application_list, 200)
                        dillution_list = tr.xpath('./td[3]//text()')
                        dillution = get_first_item(dillution_list, 1000)
                        application_sql = 'insert into Abcam_Antibody_application (Catalog_Number, Application, Dilution) values ("{}","{}","{}");'.format(
                            cataNo, application2, dillution)
                        self.mysql.insert_into_table(application_sql)
                detail_sql = (
                    'insert into Abcam_Antibody_detail(Sellable, Catalog_Number, Product_Name, Antibody_Type, '
                    'Synonyms, Application, Conjugated, Clone_Number, Recombinant_Antibody, Modified, Host_Species, '
                    'Antibody_detail_URL, GeneId, KO_Validation, Species_Reactivity, SwissProt, Predicted_MW, '
                    'Observed_MW, Isotype, Citations, Citations_url, DataSheet_URL, Review, Price_url, Image_qty) '
                    'values ("{}", "{}", "{}", "{}", "{}", "{}", "{}", "{}", "{}", "{}", "{}", "{}", "{}", "{}", '
                    '"{}", "{}", "{}", "{}", "{}", "{}", "{}", "{}", "{}", "{}", "{}");'.format(
                        sellable, cataNo, antiname, antibody_type, synonyms, application, conjugation,
                        clone_number, recombinant, modify, host, url, pymysql.escape_string(geneid), siRNA,
                        species_reactivity, pymysql.escape_string(swisprot), predicted_mw, observed_mw, isotype,
                        citations, reference_url, pdf_url, review, price_url, image_qty))
                self.mysql.insert_into_table(detail_sql)

            except Exception as es:
                print(es)
                print(url)

            else:
                update_status_sql = 'update Abcam_Antibody_list set Antibody_Status = "1" where  Antibody_detail_URL = "%s";' % url
                self.mysql.insert_into_table(update_status_sql)
                count += 1
                print("\r获得抗体详情页进度: %d" % count, end="")
class Producer(threading.Thread):
    headers = {
        'Cookie':
        'PP=1; _ga=GA1.2.1635461879.1565096809; _gid=GA1.2.133744421.1565496241; Hm_lvt_30fac56dd55db0f4c94ac3995'
        '5a1d1f1=1565268961,1565389179,1565496240,1565510325; Qs_lvt_186141=1565096808%2C1565268960%2C1565389178%2C'
        '1565496240%2C1565510324; _sp_ses.4591=*; mediav=%7B%22eid%22%3A%2295155%22%2C%22ep%22%3A%22%22%2C%22vid%22%'
        '3A%22-XCl%25T1HEC%3A4I9xGrX(9%22%2C%22ctn%22%3A%22%22%7D; _gat_UA-367099-9=1; _dc_gtm_UA-367099-9=1; C2LC=CN'
        '; JSESSIONID=C88C68C292A8E0EF6D45F465F0D12E1C.Pub1; Hm_lpvt_30fac56dd55db0f4c94ac39955a1d1f1=1565510331; Qs_p'
        'v_186141=3753158417549899000%2C4277747906829928000%2C349668107189188000%2C3100574197771999000%2C23251166996415'
        '81000; _sp_id.4591=0c85e63041a8e8b7.1565096808.6.1565510332.1565501036.a1a460db-b738-431c-9eb1-7f0dc0ed348b',
        'DPR':
        '1',
        'Host':
        'www.abcam.cn',
        'Referer':
        'https://www.abcam.cn/products?selected.productType=Primary+antibodies',
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 '
        'Safari/537.36',
        'X-Requested-With':
        'XMLHttpRequest',
        'Connection':
        'close'
    }

    def __init__(self, pageurl_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.pageurl_queue = pageurl_queue
        self.mysql = MYSQL('new_antibodies_info')

    def run(self):
        while True:
            if self.pageurl_queue.empty():
                break
            url = self.pageurl_queue.get()
            self.parse_page(url)
            time.sleep(random.uniform(1, 3))

    def parse_page(self, url):
        global count
        try:
            r = requests.get(url, headers=self.headers, timeout=10)
            r.raise_for_status()
            # r = gethtml(url)
        except Exception as e:
            self.pageurl_queue.put(url)
            print('Failed to fetch the antibody list page; trying another proxy')
            print(e)
        else:
            html = r.text
            element = etree.HTML(html)
            # print(html)
            divs = element.xpath('.//div[@class="pws-item-info"]')
            try:
                for div in divs:
                    anti_name = div.xpath('.//h3/a/text()')[0]
                    anti_catanum = div.xpath('.//h3/a/span/text()')[0]
                    href_list = div.xpath('.//h3/a/@href')
                    href = 'https://www.abcam.cn/' + href_list[0]
                    # print(anti_name, anti_catanum, href)
                    sql = 'insert into Abcam_Antibody_list (Catalog_Number, Product_Name, Antibody_detail_URL)' \
                          'values("{}","{}","{}");'.format(anti_catanum, anti_name, href)
                    self.mysql.insert_into_table(sql)
            except Exception as es:
                print(url)
                print(es)
            else:
                update_status_sql = 'update Abcam_Antibody_list_url set Antibody_Status = "1" where Antibody_list_URL  = "%s";' % url
                self.mysql.insert_into_table(update_status_sql)
                count += 1
                print("\r获得抗体详情页进度: %d" % count, end="")