class Producer(threading.Thread):
    def __init__(self, antibody_url_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.antibody_url_queue = antibody_url_queue
        self.mysql = MYSQL('new_antibodies_info')

    def run(self):
        while True:
            if self.antibody_url_queue.empty():
                break
            url = self.antibody_url_queue.get()
            self.parse_page(url)
            time.sleep(random.uniform(1.5, 2.5))

    def parse_page(self, url):
        global count
        try:
            r = requests.get(url, headers=get_request_headers(), timeout=10)
            r.raise_for_status()
        except Exception as e:
            # Fetch failed: requeue the url so another worker can retry it
            self.antibody_url_queue.put(url)
            print('Failed to fetch the price page, putting the url back into the queue')
            print(e)
        else:
            html = r.text
            try:
                catanum_list = re.findall('"ProductCode":"(.*?)"', html, re.S)
                catanum = get_first_item(catanum_list, 40)
                # One (size, price) pair per available pack size
                size_price_list = re.findall(
                    '"Size":"(.*?)".*?"Price":"(.*?)"', html, re.S)
                for size, price in size_price_list:
                    price_sql = ('insert into Abcam_Antibody_price '
                                 '(Catalog_Number, Size, Price) '
                                 'values ("{}","{}","{}");').format(catanum, size, price)
                    self.mysql.insert_into_table(price_sql)
            except Exception as es:
                print(es)
                print(url)
            else:
                update_status_sql = ('update Abcam_Antibody_detail set Price_Status = "1" '
                                     'where Price_url = "%s";' % url)
                self.mysql.insert_into_table(update_status_sql)
                count += 1
                print('\rPrice page progress: %d' % count, end='')
class Producer(threading.Thread):
    def __init__(self, antibody_url_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.antibody_url_queue = antibody_url_queue
        self.mysql = MYSQL('new_antibodies_info')

    def run(self):
        while True:
            if self.antibody_url_queue.empty():
                break
            url = self.antibody_url_queue.get()
            self.parse_page(url)
            time.sleep(random.uniform(1.5, 2.5))

    def parse_page(self, url):
        global count
        try:
            r = requests.get(url, headers=get_request_headers(), timeout=10)
            r.raise_for_status()
        except Exception as e:
            self.antibody_url_queue.put(url)
            print('Failed to fetch the citations page, putting the url back into the queue')
            print(e)
        else:
            try:
                # The catalogue number is the value of the url's single query parameter
                catanum = url.split('=')[1]
                # The citations endpoint returns JSON, one object per citing article
                data = json.loads(r.text)
                for item in data:
                    citation_sql = ('insert into Abcam_Antibody_citations '
                                    '(Catalog_Number, PMID, Application, Species, Article_title) '
                                    'values ("{}","{}","{}","{}","{}");').format(
                                        catanum, item['PubmedID'],
                                        item['ApplicationsShortName'], item['Species'],
                                        pymysql.escape_string(item['Title']))
                    self.mysql.insert_into_table(citation_sql)
            except Exception as es:
                print(es)
                print(url)
            else:
                update_status_sql = ('update Abcam_Antibody_detail set Citations_Status = "1" '
                                     'where Citations_url = "%s";' % url)
                self.mysql.insert_into_table(update_status_sql)
                count += 1
                print('\rCitations page progress: %d' % count, end='')
from Mysql_helper import MYSQL

# Seed the Abcam_Antibody_list_url table with one "load more" URL per page
# (3060 pages of primary antibodies).
mysql = MYSQL('new_antibodies_info')
for x in range(1, 3061):
    url = ('https://www.abcam.cn/products/loadmore'
           '?selected.productType=Primary+antibodies&pagenumber=%d' % x)
    sql = 'insert into Abcam_Antibody_list_url (Antibody_list_URL) values("{}");'.format(url)
    mysql.insert_into_table(sql)
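# All of the inserts in these scripts build SQL by string formatting, which
# breaks on embedded quotes and invites SQL injection. A minimal sketch of the
# same insert using pymysql placeholders instead; the connection parameters
# here are placeholders, and insert_list_url is an illustrative helper, not
# part of the original source:
import pymysql

def insert_list_url(url):
    conn = pymysql.connect(host='localhost', user='root',
                           password='secret', database='new_antibodies_info')
    try:
        with conn.cursor() as cursor:
            # %s placeholders let the driver quote and escape the value itself
            cursor.execute(
                'insert into Abcam_Antibody_list_url (Antibody_list_URL) values (%s);',
                (url,))
        conn.commit()
    finally:
        conn.close()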
class Producer(threading.Thread):
    def __init__(self, pmid_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.pmid_queue = pmid_queue
        self.mysql = MYSQL('antibodies_info')

    def run(self):
        while True:
            if self.pmid_queue.empty():
                break
            pmid = self.pmid_queue.get()
            self.parse_page(pmid)
            time.sleep(random.uniform(0.5, 2))

    def parse_page(self, pmid):
        global count
        url = ('https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
               '?db=pubmed&id={}&mode=xml&_=1572314980717').format(pmid)
        try:
            r = requests.get(url, headers=get_request_headers(), timeout=10)
            r.raise_for_status()
        except Exception as e:
            self.pmid_queue.put(pmid)
            print('Failed to fetch the PubMed record, putting {} back into the queue'.format(pmid))
            print(e)
        else:
            html = r.text
            # Author list: "LastName ForeName" pairs joined with commas
            authors = re.findall(
                '<LastName>(.*?)</LastName>.*?<ForeName>(.*?)</ForeName>', html, re.S)
            if not authors:
                authors = [('', '')]
            author = ', '.join(last + ' ' + fore for last, fore in authors)
            # Publication date: use the first <PubDate> in the record
            pubdate = re.findall(
                '<PubDate>.*?<Year>(.*?)</Year>.*?<Month>(.*?)</Month>.*?<Day>(.*?)</Day>',
                html, re.S)
            if not pubdate:
                pubdate = [('', '', '')]
            pubdatetime = '-'.join(pubdate[0])
            institution = re.findall('<Affiliation>(.*?)</Affiliation>', html, re.S)
            if not institution:
                institution = ['']
            journal_list = re.findall('<Title>(.*?)</Title>', html, re.S)
            journal = get_first_item(journal_list, 100)
            try:
                insert_sql = ('insert into abcam_citations_details '
                              '(pmid, journal, pub_date, institution, author) '
                              'values ("{}","{}","{}","{}","{}");').format(
                                  pmid, journal, pubdatetime, institution[0], author)
                self.mysql.insert_into_table(insert_sql)
            except Exception as es:
                print(es)
                print(url)
            else:
                update_sql = 'update abcam_pmid set pmid_Status = "1" where pmid = {};'.format(pmid)
                self.mysql.insert_into_table(update_sql)
                count += 1
                print('\rCitation detail progress: %d' % count, end='')
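# Regex over XML is brittle: <Day> is optional, collective authors have no
# <LastName>, and tag order is not guaranteed. A sketch of the same field
# extraction with xml.etree.ElementTree from the standard library; this is an
# alternative to the regex approach above, not the original author's code:
import xml.etree.ElementTree as ET

def parse_pubmed_xml(xml_text):
    root = ET.fromstring(xml_text)
    # Each <Author> carries optional <LastName>/<ForeName> children
    authors = [
        '{} {}'.format(a.findtext('LastName', ''), a.findtext('ForeName', ''))
        for a in root.iter('Author')
    ]
    date = root.find('.//PubDate')
    pubdate = ('-'.join(date.findtext(tag, '') for tag in ('Year', 'Month', 'Day'))
               if date is not None else '')
    return {
        'author': ', '.join(authors),
        'pub_date': pubdate,
        'institution': root.findtext('.//Affiliation', ''),
        'journal': root.findtext('.//Journal/Title', ''),
    }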
class Producer(threading.Thread):
    def __init__(self, antibody_url_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.antibody_url_queue = antibody_url_queue
        self.mysql = MYSQL('new_antibodies_info')

    def run(self):
        while True:
            if self.antibody_url_queue.empty():
                break
            url = self.antibody_url_queue.get()
            self.parse_page(url)
            time.sleep(random.uniform(2.5, 4.5))

    def parse_page(self, url):
        global count
        try:
            r = requests.get(url, headers=get_request_headers(), timeout=10)
            r.raise_for_status()
        except Exception as e:
            self.antibody_url_queue.put(url)
            print('Failed to fetch the antibody detail page, putting the url back into the queue')
            print(e)
        else:
            html = r.text
            element = etree.HTML(html)
            try:
                # 产品名称 = product name
                antiname_list = re.findall(
                    '产品名称</h3>.*?<div class="value">(.*?)<', html, re.S)
                antiname = antiname_list[0]
                cataNo_list = re.findall('data-product-code="(.*?)">', html, re.S)
                cataNo = cataNo_list[0]
                # 克隆 = clonality
                antibody_type_list = re.findall(
                    '克隆</h3>.*?<div class="value">(.*?)</div>', html, re.S)
                antibody_type = get_first_item(antibody_type_list, 30)
                lis = element.xpath('.//li[@class="attribute list"]')
                if lis:
                    synonyms_list = lis[-1].xpath('.//li/text()')
                    synonyms = get_string(','.join(synonyms_list), 3000)
                else:
                    synonyms = ''
                # Sibling divs of the 经测试应用 (tested applications) heading
                divs = element.xpath('.//h3[.="经测试应用"]/following-sibling::div')
                if divs:
                    application = ','.join(divs[0].xpath('./abbr/text()'))
                else:
                    application = ''
                conjugation_list = re.findall('<li>Conjugation: (.*?)</li>', html, re.S)
                conjugation = get_first_item(conjugation_list, 200)
                # 克隆编号 = clone number
                clone_number_list = re.findall(
                    '克隆编号</h3>.*?div class="value">(.*?)</div>', html, re.S)
                clone_number = get_first_item(clone_number_list, 40)
                # Recombinant-antibody product label present?
                recombinant_list = re.findall(
                    'product-label product-label--recombinant">(.*?)</div>', html, re.S)
                recombinant = 'yes' if recombinant_list else ''
                # 描述 = description; the modification, if any, is in parentheses
                modify_list = re.findall('(描述</h3>.*?</div>)', html, re.S)
                if modify_list:
                    modify_list = re.findall(r'描述</h3>.*?\((.*?)\)', modify_list[0], re.S)
                    modify = get_first_item(modify_list, 200)
                else:
                    modify = ''
                # 宿主 = host species
                host_list = re.findall('宿主</h3>.*?<div class="value">(.*?)</div>', html, re.S)
                host = get_first_item(host_list, 200)
                # 种属反应性 = species reactivity
                species_reactivity_list = re.findall('种属反应性</h3>.*?</strong>(.*?)<', html, re.S)
                species_reactivity = get_first_item(species_reactivity_list, 1000)
                abid = re.findall('data-track-value="(.*?)"', html, re.S)
                if abid:
                    price_url = 'https://www.abcam.cn/datasheetproperties/availability?abId=' + abid[0]
                    sellable = 'yes'
                else:
                    price_url = ''
                    sellable = 'no'
                geneid_list = re.findall('Entrez Gene:(.*?)</li>', html, re.S)
                if geneid_list:
                    geneid = ','.join(g.strip() for g in geneid_list)
                    if len(geneid) > 499:
                        geneid = geneid[:480]
                else:
                    geneid = ''
                # 使用...细胞株进行验证 = validated with a knockout cell line
                siRNA_list = re.findall('alt="使用(.*?)细胞株进行验证', html, re.S)
                siRNA = 'yes' if siRNA_list else ''
                swisprot_list = re.findall('SwissProt:(.*?)</li>', html, re.S)
                if swisprot_list:
                    swisprot = ','.join(s.strip() for s in swisprot_list)
                    if len(swisprot) > 499:
                        swisprot = swisprot[:480]
                else:
                    swisprot = ''
                predicted_mw_list = re.findall('Predicted band size:</b>(.*?)<br>', html, re.S)
                predicted_mw = get_first_item(predicted_mw_list, 200)
                observed_mw_list = re.findall('Observed band size:</b> (.*?)<', html, re.S)
                observed_mw = get_first_item(observed_mw_list, 200)
                # 同种型 = isotype
                isotype_list = re.findall('同种型</h3>.*?div class="value">(.*?)</div>', html, re.S)
                isotype = get_first_item(isotype_list, 100)
                # 被引用在...文献中 = cited in ... publications
                citations_list = re.findall('被引用在 (.*?)文献中', html, re.S)
                citations = get_first_item(citations_list, 100)
                if citations_list:
                    reference_url = ('https://www.abcam.cn/DatasheetProperties/References'
                                     '?productcode=' + cataNo)
                else:
                    reference_url = ''
                pdf_url_list = re.findall(
                    'class="pdf-links">.*?<li><a target="_blank" href="(.*?)"', html, re.S)
                if pdf_url_list:
                    pdf_url = 'https://www.abcam.cn' + get_first_item(pdf_url_list, 300)
                else:
                    pdf_url = ''
                review_list = re.findall('"reviewCount": "(.*?)"', html, re.S)
                review = get_first_item(review_list, 100)
                # Product images
                lis = element.xpath('//*[@id="description_images"]/div[2]/ul/li')
                image_qty = len(lis)
                for li in lis:
                    image_url = get_first_item(li.xpath('./div/a/@href'), 500)
                    description = get_first_item(li.xpath('./div[1]/div/div//text()'), 1000)
                    image_sql = ('insert into Abcam_Antibody_images '
                                 '(Catalog_Number, Image_url, Image_description) '
                                 'values ("{}","{}","{}");').format(cataNo, image_url, description)
                    self.mysql.insert_into_table(image_sql)
                # Application/dilution table
                trs = element.xpath('//*[@id="description_applications"]/div[2]/table/tbody/tr')
                for tr in trs:
                    application2 = get_first_item(tr.xpath('./td[1]//text()'), 200)
                    dillution = get_first_item(tr.xpath('./td[3]//text()'), 1000)
                    application_sql = ('insert into Abcam_Antibody_application '
                                       '(Catalog_Number, Application, Dilution) '
                                       'values ("{}","{}","{}");').format(
                                           cataNo, application2, dillution)
                    self.mysql.insert_into_table(application_sql)
                detail_sql = (
                    'insert into Abcam_Antibody_detail('
                    'Sellable,Catalog_Number,Product_Name,Antibody_Type,Synonyms,Application,'
                    'Conjugated,Clone_Number,Recombinant_Antibody,Modified,Host_Species,'
                    'Antibody_detail_URL,GeneId,KO_Validation,Species_Reactivity,SwissProt,'
                    'Predicted_MW,Observed_MW,Isotype,Citations,Citations_url,DataSheet_URL,'
                    'Review,Price_url,Image_qty) values ('
                    '"{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}",'
                    '"{}","{}","{}","{}","{}","{}","{}","{}","{}","{}","{}");').format(
                        sellable, cataNo, antiname, antibody_type, synonyms, application,
                        conjugation, clone_number, recombinant, modify, host, url,
                        pymysql.escape_string(geneid), siRNA, species_reactivity,
                        pymysql.escape_string(swisprot), predicted_mw, observed_mw, isotype,
                        citations, reference_url, pdf_url, review, price_url, image_qty)
                self.mysql.insert_into_table(detail_sql)
            except Exception as es:
                print(es)
                print(url)
            else:
                update_status_sql = ('update Abcam_Antibody_list set Antibody_Status = "1" '
                                     'where Antibody_detail_URL = "%s";' % url)
                self.mysql.insert_into_table(update_status_sql)
                count += 1
                print('\rAntibody detail page progress: %d' % count, end='')
class Producer(threading.Thread):
    # Static headers copied from a logged-in browser session; the Cookie value
    # is session-bound and will need refreshing when it expires.
    headers = {
        'Cookie': 'PP=1; _ga=GA1.2.1635461879.1565096809; _gid=GA1.2.133744421.1565496241; Hm_lvt_30fac56dd55db0f4c94ac3995'
                  '5a1d1f1=1565268961,1565389179,1565496240,1565510325; Qs_lvt_186141=1565096808%2C1565268960%2C1565389178%2C'
                  '1565496240%2C1565510324; _sp_ses.4591=*; mediav=%7B%22eid%22%3A%2295155%22%2C%22ep%22%3A%22%22%2C%22vid%22%'
                  '3A%22-XCl%25T1HEC%3A4I9xGrX(9%22%2C%22ctn%22%3A%22%22%7D; _gat_UA-367099-9=1; _dc_gtm_UA-367099-9=1; C2LC=CN'
                  '; JSESSIONID=C88C68C292A8E0EF6D45F465F0D12E1C.Pub1; Hm_lpvt_30fac56dd55db0f4c94ac39955a1d1f1=1565510331; Qs_p'
                  'v_186141=3753158417549899000%2C4277747906829928000%2C349668107189188000%2C3100574197771999000%2C23251166996415'
                  '81000; _sp_id.4591=0c85e63041a8e8b7.1565096808.6.1565510332.1565501036.a1a460db-b738-431c-9eb1-7f0dc0ed348b',
        'DPR': '1',
        'Host': 'www.abcam.cn',
        'Referer': 'https://www.abcam.cn/products?selected.productType=Primary+antibodies',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 '
                      'Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'close'
    }

    def __init__(self, pageurl_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.pageurl_queue = pageurl_queue
        self.mysql = MYSQL('new_antibodies_info')

    def run(self):
        while True:
            if self.pageurl_queue.empty():
                break
            url = self.pageurl_queue.get()
            self.parse_page(url)
            time.sleep(random.uniform(1, 3))

    def parse_page(self, url):
        global count
        try:
            r = requests.get(url, headers=self.headers, timeout=10)
        except Exception as e:
            self.pageurl_queue.put(url)
            print('Failed to fetch the antibody list page, putting the url back into the queue')
            print(e)
        else:
            element = etree.HTML(r.text)
            divs = element.xpath('.//div[@class="pws-item-info"]')
            try:
                for div in divs:
                    anti_name = div.xpath('.//h3/a/text()')[0]
                    anti_catanum = div.xpath('.//h3/a/span/text()')[0]
                    href = 'https://www.abcam.cn/' + div.xpath('.//h3/a/@href')[0]
                    sql = ('insert into Abcam_Antibody_list '
                           '(Catalog_Number, Product_Name, Antibody_detail_URL) '
                           'values("{}","{}","{}");').format(anti_catanum, anti_name, href)
                    self.mysql.insert_into_table(sql)
            except Exception as es:
                print(url)
                print(es)
            else:
                update_status_sql = ('update Abcam_Antibody_list_url set Antibody_Status = "1" '
                                     'where Antibody_list_URL = "%s";' % url)
                self.mysql.insert_into_table(update_status_sql)
                count += 1
                print('\rAntibody list page progress: %d' % count, end='')
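# None of the snippets above show how the Producer threads are launched. A
# minimal driver sketch under the assumption that pending URLs come from the
# corresponding *_url table; fetch_pending_urls is a hypothetical helper (the
# MYSQL class only shows insert_into_table here), not part of the original:
import queue
import threading

count = 0  # shared progress counter read via `global count` in parse_page

def main(urls, thread_count=5):
    pageurl_queue = queue.Queue()
    for url in urls:
        pageurl_queue.put(url)
    # Each Producer drains the queue until empty, then its run() loop exits
    workers = [Producer(pageurl_queue) for _ in range(thread_count)]
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

if __name__ == '__main__':
    urls = fetch_pending_urls()  # hypothetical: select rows where Antibody_Status != "1"
    main(urls)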