    def parse_page(self, url):
        global count
        try:
            r = requests.get(url, headers=get_request_headers(), timeout=10)
            r.raise_for_status()
        except Exception as e:
            self.antibody_url_queue.put(url)
            print('Failed to fetch the citations page, putting the url back in the queue')
            print(e)
        else:
            html = r.text
            try:
                catanum = url.split('=')[1]

                data = json.loads(html)  # the citations endpoint returns JSON
                for i in data:
                    title = i['Title']
                    pmid = i['PubmedID']
                    application = i['ApplicationsShortName']
                    species = i['Species']
                    citation_sql = 'insert into Abcam_Antibody_citations (Catalog_Number, PMID , Application, Species, Article_title) values ("{}","{}","{}","{}","{}");'.format(
                        catanum, pmid, application, species, pymysql.escape_string(title))
                    self.mysql.insert_into_table(citation_sql)

            except Exception as es:
                print(es)
                print(url)

            else:
                update_status_sql = 'update Abcam_Antibody_detail set Citations_Status = "1" where Citations_url = "%s";' % url
                self.mysql.insert_into_table(update_status_sql)
                count += 1
                print("\r获得抗体详情页进度: %d" % count, end="")
def gethtml(url):
    # Fetch a url with up to 5 attempts; returns a requests.Response,
    # or None if every attempt raises a RequestException.
    attempts = 0
    while attempts < 5:
        try:
            response = requests.get(url, headers=get_request_headers(), timeout=5)
            time.sleep(random.uniform(2, 3))  # throttle between requests
            return response
        except requests.exceptions.RequestException:
            attempts += 1
    return None
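Example use of gethtml; despite the name it returns a requests.Response, and it returns None after five failed attempts, so callers should check for that:

resp = gethtml('https://www.abcam.cn/datasheetproperties/availability?abId=8226')
if resp is not None and resp.ok:
    print(resp.text[:200])
else:
    print('gave up after 5 attempts')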
Example no. 3
    def parse_page(self, pmid):
        global count
        try:
            url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id={}&mode=xml&_=1572314980717'.format(
                pmid)
            r = requests.get(url, headers=get_request_headers(), timeout=10)
            r.raise_for_status()
        except Exception as e:
            self.pmid_queue.put(pmid)
            print('Failed to fetch the PubMed page, putting {} back in the queue'.format(pmid))
            print(e)
        else:
            html = r.text
            # pmid_list = re.findall('<PMID Version="1">(.*?)</PMID>', html, re.S)
            authors = re.findall(
                '<LastName>(.*?)</LastName>.*?<ForeName>(.*?)</ForeName>',
                html, re.S)
            if not authors:
                authors = [('', '')]
            author = ', '.join('{} {}'.format(last, fore) for last, fore in authors)
            pubdate = re.findall(
                '<PubDate>.*?<Year>(.*?)</Year>.*?<Month>(.*?)</Month>.*?<Day>(.*?)</Day>',
                html, re.S)
            if not pubdate:
                pubdate = [('', '', '')]
            # If several PubDate blocks match, the last one wins.
            for year, month, day in pubdate:
                pubdatetime = '{}-{}-{}'.format(year, month, day)
            institution = re.findall('<Affiliation>(.*?)</Affiliation>', html,
                                     re.S)
            if not institution:
                institution = ['']
            journal_list = re.findall('<Title>(.*?)</Title>', html, re.S)
            journal = get_first_item(journal_list, 100)
            try:
                insert_sql = 'insert into abcam_citations_details (pmid, journal, pub_date, institution, author) values ("{}","{}","{}","{}","{}");'.format(
                    pmid, journal, pubdatetime, institution[0], author)
                self.mysql.insert_into_table(insert_sql)
            except Exception as es:
                print(es)
                print(url)
            else:
                update_sql = 'update abcam_pmid set pmid_Status = "1" where pmid = {};'.format(
                    pmid)
                self.mysql.insert_into_table(update_sql)
                count += 1
                print("\rProgress fetching citation detail pages: %d" % count, end="")
Example no. 4
    def parse_page(self, url):
        global count
        try:
            r = requests.get(url, headers=get_request_headers(), timeout=10)
            r.raise_for_status()
        except Exception as e:
            self.antibody_url_queue.put(url)
            print('Failed to fetch the price page, putting the url back in the queue')
            print(e)
        else:
            html = r.text

            try:
                catanum_list = re.findall('"ProductCode":"(.*?)"', html, re.S)
                catanum = get_first_item(catanum_list, 40)

                size_price_list = re.findall(
                    '"Size":"(.*?)".*?"Price":"(.*?)"', html, re.S)
                for size_price in size_price_list:
                    size = size_price[0]
                    price = size_price[1]
                    price_sql = 'insert into Abcam_Antibody_price(Catalog_Number,Size,Price) values("{}","{}","{}");'.format(
                        catanum, size, price)
                    self.mysql.insert_into_table(price_sql)

            except Exception as es:
                print(es)
                print(url)

            else:
                update_status_sql = 'update Abcam_Antibody_detail set Price_Status = "1" where Price_url = "%s";' % url
                self.mysql.insert_into_table(update_status_sql)
                count += 1
                print("\r获得抗体详情页进度: %d" % count, end="")
Example no. 5
import re
import requests
import json
from utils.Random_UserAgent import get_request_headers

# Quick probe of the availability endpoint for a single antibody (abId=8226).
r = requests.get('https://www.abcam.cn/datasheetproperties/availability?abId=8226', headers=get_request_headers())
html = r.text
print(html)
size_list = re.findall('"Size":"(.*?)".*?"Price":"(.*?)"', html, re.S)
# title = re.findall('"Title":"(.*?)".*?"PubmedID":(.*?),.*?"ApplicationsShortName":(.*?),.*?"Species":"(.*?)"', html , re.S)
print(size_list)
data = json.loads(html)
print(data["size-information"]['Sizes'][0]['Size'])
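Since the endpoint returns JSON (as the json.loads call confirms), the Size/Price pairs can be read from the parsed structure instead of regexing the raw body; key names beyond those printed above are assumptions:

for item in data['size-information']['Sizes']:
    print(item['Size'], item.get('Price'))  # 'Price' key assumed from the regex above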
Example no. 6
    def parse_page(self, url):
        global count
        try:
            r = requests.get(url, headers=get_request_headers(), timeout=10)
            r.raise_for_status()
        except Exception as e:
            self.antibody_url_queue.put(url)
            print('Failed to fetch the antibody detail page, putting the url back in the queue')
            print(e)
        else:
            html = r.text
            element = etree.HTML(html)
            try:
                # The Chinese strings below ('产品名称', '克隆', ...) match section
                # headings on the abcam.cn datasheet pages, so they stay as-is.
                antiname_list = re.findall(
                    '产品名称</h3>.*?<div class="value">(.*?)<', html, re.S)
                antiname = antiname_list[0]
                cataNo_list = re.findall('data-product-code="(.*?)">', html,
                                         re.S)
                cataNo = cataNo_list[0]
                antibody_type_list = re.findall(
                    '克隆</h3>.*?<div class="value">(.*?)</div>', html, re.S)
                antibody_type = get_first_item(antibody_type_list, 30)
                lis = element.xpath('.//li[@class="attribute list"]')
                if lis:
                    li = lis[-1]
                    synonyms_list = li.xpath('.//li/text()')
                    synonyms = ','.join(synonyms_list)
                    synonyms = get_string(synonyms, 3000)
                else:
                    synonyms = ''
                # Take the sibling elements of the heading with the given text
                divs = element.xpath('.//h3[.="经测试应用"]/following-sibling::div')
                if divs:
                    div = divs[0]
                    application_lists = div.xpath('./abbr/text()')
                    application = ','.join(application_lists)
                else:
                    application = ''
                conjugation_list = re.findall('<li>Conjugation: (.*?)</li>',
                                              html, re.S)
                conjugation = get_first_item(conjugation_list, 200)
                clone_number_list = re.findall(
                    '克隆编号</h3>.*?div class="value">(.*?)</div>', html, re.S)
                clone_number = get_first_item(clone_number_list, 40)
                recombinant_list = re.findall(
                    'product-label product-label--recombinant">(.*?)</div>',
                    html, re.S)
                if recombinant_list:  # flag recombinant antibodies
                    recombinant = "yes"
                else:
                    recombinant = ''
                modify_list = re.findall('(描述</h3>.*?</div>)', html, re.S)
                if modify_list:
                    modify_list = re.findall('描述</h3>.*?\((.*?)\)',
                                             modify_list[0], re.S)
                    modify = get_first_item(modify_list, 200)
                else:
                    modify = ''
                host_list = re.findall(
                    '宿主</h3>.*?<div class="value">(.*?)</div>', html, re.S)
                host = get_first_item(host_list, 200)
                species_reactivity_list = re.findall(
                    '种属反应性</h3>.*?</strong>(.*?)<', html, re.S)
                species_reactivity = get_first_item(species_reactivity_list,
                                                    1000)
                abid = re.findall('data-track-value="(.*?)"', html, re.S)
                if abid:
                    price_url = 'https://www.abcam.cn/datasheetproperties/availability?abId=' + abid[
                        0]
                    sellable = 'yes'
                else:
                    price_url = ''
                    sellable = 'no'
                geneid_list = re.findall('Entrez Gene:(.*?)</li>', html, re.S)
                if geneid_list:
                    geneid_list = list(
                        map(lambda geneid: geneid.strip(), geneid_list))
                    geneid = ','.join(geneid_list)
                    if len(geneid) > 499:
                        geneid = geneid[0:480]
                else:
                    geneid = ''
                siRNA_list = re.findall('alt="使用(.*?)细胞株进行验证', html, re.S)
                if siRNA_list:  # check whether KO validation exists
                    siRNA = "yes"
                else:
                    siRNA = ''
                swisprot_list = re.findall('SwissProt:(.*?)</li>', html, re.S)
                if swisprot_list:
                    swisprot_list = list(
                        map(lambda swisprot: swisprot.strip(), swisprot_list))
                    swisprot = ','.join(swisprot_list)
                    if len(swisprot) > 499:
                        swisprot = swisprot[0:480]
                else:
                    swisprot = ''
                predicted_mw_list = re.findall(
                    'Predicted band size:</b>(.*?)<br>', html, re.S)
                predicted_mw = get_first_item(predicted_mw_list, 200)
                observed_mw_list = re.findall('Observed band size:</b> (.*?)<',
                                              html, re.S)
                observed_mw = get_first_item(observed_mw_list, 200)
                isotype_list = re.findall(
                    '同种型</h3>.*?div class="value">(.*?)</div>', html, re.S)
                isotype = get_first_item(isotype_list, 100)
                citations_list = re.findall('被引用在 (.*?)文献中', html, re.S)
                citations = get_first_item(citations_list, 100)
                if citations_list:
                    reference_url = 'https://www.abcam.cn/DatasheetProperties/References?productcode=' + cataNo_list[
                        0]
                else:
                    reference_url = ''
                pdf_url_list = re.findall(
                    'class="pdf-links">.*?<li><a target="_blank" href="(.*?)"',
                    html, re.S)
                if pdf_url_list:
                    pdf_url = 'https://www.abcam.cn' + get_first_item(
                        pdf_url_list, 300)
                else:
                    pdf_url = ''
                review_list = re.findall('"reviewCount": "(.*?)"', html, re.S)
                review = get_first_item(review_list, 100)
                lis = element.xpath(
                    '//*[@id="description_images"]/div[2]/ul/li')
                image_qty = len(lis)
                # Insert the validation images here
                if lis:
                    for li in lis:
                        image_url_list = li.xpath('./div/a/@href')
                        image_url = get_first_item(image_url_list, 500)
                        description_list = li.xpath('./div[1]/div/div//text()')
                        description = get_first_item(description_list, 1000)
                        image_sql = 'insert into Abcam_Antibody_images (Catalog_Number, Image_url, Image_description) values ("{}","{}","{}");'.format(
                            cataNo, image_url, description)
                        self.mysql.insert_into_table(image_sql)
                # Insert the application info here
                trs = element.xpath(
                    '//*[@id="description_applications"]/div[2]/table/tbody/tr'
                )
                if trs:
                    for tr in trs:
                        application_list = tr.xpath('./td[1]//text()')
                        application2 = get_first_item(application_list, 200)
                        dilution_list = tr.xpath('./td[3]//text()')
                        dilution = get_first_item(dilution_list, 1000)
                        application_sql = 'insert into Abcam_Antibody_application (Catalog_Number, Application, Dilution) values ("{}","{}","{}");'.format(
                            cataNo, application2, dilution)
                        self.mysql.insert_into_table(application_sql)
                detail_sql = 'insert into Abcam_Antibody_detail(Sellable,Catalog_Number, Product_Name, Antibody_Type, Synonyms, Application, Conjugated, Clone_Number, Recombinant_Antibody, Modified, Host_Species, Antibody_detail_URL, GeneId, KO_Validation, Species_Reactivity, SwissProt,Predicted_MW,Observed_MW,Isotype,Citations,Citations_url,DataSheet_URL,Review,Price_url,Image_qty) values ("{}", "{}", "{}","{}", ' \
                             '"{}","{}","{}","{}","{}", "{}", "{}","{}", "{}", "{}","{}", "{}","{}","{}", "{}", "{}","{}", "{}", "{}","{}", "{}");'.format(sellable,cataNo, antiname,antibody_type,synonyms,application,conjugation,clone_number,recombinant,modify,host,url,pymysql.escape_string(geneid),siRNA,species_reactivity,pymysql.escape_string(swisprot),predicted_mw,observed_mw,isotype,citations,reference_url,pdf_url,review,price_url,image_qty)
                self.mysql.insert_into_table(detail_sql)

            except Exception as es:
                print(es)
                print(url)

            else:
                update_status_sql = 'update Abcam_Antibody_list set Antibody_Status = "1" where Antibody_detail_URL = "%s";' % url
                self.mysql.insert_into_table(update_status_sql)
                count += 1
                print("\r获得抗体详情页进度: %d" % count, end="")