Example #1
    print('\n##########正在下载第{}页数据##########\n'.format(page))  # "Downloading data for page {}"
    base_url = 'https://www.zhipin.com/c100010000-p100109/?page={}&ka=page-{}'.format(
        page, page)
    if page > 1:
        sleep(5)

    cookie = 'Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1596689319,1596706170; lastCity=100010000; __g=-; __zp_stoken__=a22daJB4DI1lsZkxcNEotc3AEMXhEbHRqFFdAYwB7JnIMWydyTWshf2pXaGIpFndlLUcoPGVnDFJ0PTAYFwhsHnJqKx0nInloej8bZVR9OyoNIBtUZ1xOB31HTgcZKwkub35tQxcGDVg2eT4%3D; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1596706456; __zp_sseed__=+ESIwp4DFQO7vkpLz5T9FtTBTD9zO5XO5H3GNSZMuTc=; __zp_sname__=cfe88225; __zp_sts__=1596706745034; __c=1596706172; __l=l=%2Fwww.zhipin.com%2Fc100010000-p100109%2F%3Fpage%3D10%26ka%3Dpage-10&r=&g=; __a=84984979.1596689321.1596689321.1596706172.20.2.12.20'
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
        'cookie': cookie,
    }

    html_data = get(url=base_url, headers=headers).text

    selector = Selector(html_data)
    result_list = selector.css("#main > div > div.job-list > ul > li")
    for sel in result_list:
        Job_benefits = sel.css(
            "div > div.info-append.clearfix > div.info-desc ::text"
        ).extract_first()  # job benefits

        job_name = sel.css(
            "div > div.info-primary > div.primary-wrapper > div > div.job-title > span.job-name > a ::text"
        ).extract_first()

        Working_data_1 = sel.css(
            "div > div.info-append.clearfix > div.tags > span:nth-child(1) ::text"
        ).extract_first()  # job data field 1
        Working_data_2 = sel.css(
            "div > div.info-append.clearfix > div.tags > span:nth-child(2) ::text"
        ).extract_first()  # job data field 2
Example #2
html = '''
<div>
    <ul>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
from parsel import Selector
selector = Selector(text=html)
result = selector.css('.item-0').re('link.*')
print(result)
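
A note on the snippet above: .re() runs the regular expression against the serialized content of each matched element and returns plain strings, skipping elements with no match. A minimal alternative sketch that keeps the extraction in CSS (using the same selector object; the expected output is inferred from the HTML above):

result = selector.css('.item-0 a::attr(href)').getall()
print(result)  # expected: ['link3.html', 'link5.html']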

def view_bot(browser):

    # Use pandas to import the Excel file and build a basic list of URLs.
    df = pd.read_excel(
        r'C:\Users\Owner\PycharmProjects\LinkedInVeiwBot\URLS.xlsx')
    url_list = df["URLS"].tolist()
    # Open a CSV file in write mode ('w') for the scraped results.
    writer = csv.writer(open('Linkedinfile.csv', 'w', encoding='utf-8'))
    # writerow() writes the header row to the file object
    writer.writerow([
        'Name', 'Job Title', 'Company', 'Last jobs', 'College', 'Location',
        'URL'
    ])
    for url in url_list:

        # Sleep to make sure everything loads; randomize the delay so the browsing looks more human.

        browser.get(url)
        time.sleep(random.randint(5, 10))
        # assigning the source code for the web page to variable sel
        sel = Selector(text=browser.page_source)

        # Find the button 'view more'
        browser.execute_script(
            "window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(2)
        try:
            element = browser.find_element_by_xpath(
                '//section[@id="experience-section"]/div/button')
            button_text = element.text
            while button_text != "Show fewer experiences":
                browser.execute_script("return arguments[0].scrollIntoView();",
                                       element)
                browser.execute_script("window.scrollBy(0,-200);")
                time.sleep(2)
                element.click()
                time.sleep(2)
                element = browser.find_element_by_xpath(
                    '//section[@id="experience-section"]/div/button')
                button_text = element.text
        except Exception:
            print("No button found")

        # xpath to extract the text from the class containing the name
        name = sel.xpath(
            '//*[starts-with(@class, "pv-top-card-section__name")]/text()'
        ).extract_first()

        # if name exists
        if name:
            # .strip() removes the newline \n and surrounding whitespace
            name = name.strip()

        # xpath to extract the text from the class containing the job title
        job_title = sel.xpath(
            '//*[starts-with(@class, "pv-top-card-section__headline")]/text()'
        ).extract_first()

        if job_title:
            job_title = job_title.strip()

        # xpath to extract the text from the class containing the company
        company = sel.xpath(
            '//*[starts-with(@class,"pv-entity__secondary-title")]/text()'
        ).extract_first()

        if company:
            company = company.strip()

        # Try to collect the full list of previous positions and their companies.
        last_positions_array = \
            browser.find_elements_by_xpath('//a[@data-control-name = "background_details_company"]/div/h3')
        last_jobs_array = browser.find_elements_by_xpath(
            '//span[@class = "pv-entity__secondary-title"]')
        last_jobs = ''
        for index, job in enumerate(last_jobs_array):
            last_jobs += last_positions_array[index].text + ' at '
            last_jobs += job.text
            last_jobs += ' || '
        last_jobs = last_jobs[:-4]  # drop the trailing ' || ' separator
        # xpath to extract the text from the class containing the college
        college = sel.xpath(
            '//*[starts-with(@class, "pv-entity__school-name t-16 t-black t-bold")]/text()'
        ).extract_first()

        if college:
            college = college.strip()

        # xpath to extract the text from the class containing the location
        location = sel.xpath(
            '//*[starts-with(@class, "pv-top-card-section__location")]/text()'
        ).extract_first()

        if location:
            location = location.strip()

        # assignment of the current URL
        linkedin_url = browser.current_url

        # validating if the fields exist on the profile
        name = validate_field(name)
        job_title = validate_field(job_title)
        company = validate_field(company)
        college = validate_field(college)
        location = validate_field(location)
        linkedin_url = validate_field(linkedin_url)

        # printing the output to the terminal
        print('\n')
        print('Name: ' + name)
        print('Job Title: ' + job_title)
        print('Company: ' + company)
        print('Previous Jobs: ' + last_jobs)
        print('College: ' + college)
        print('Location: ' + location)
        print('URL: ' + linkedin_url)
        print('\n')

        # Write all of the fields below into the CSV file.
        writer.writerow([
            name, job_title, company, last_jobs, college, location,
            linkedin_url
        ])
    tree = etree.ElementTree(root)
    if not os.path.exists(ARTICLE_STORAGE + categoryName):
        os.makedirs(ARTICLE_STORAGE + categoryName)
    tree.write(ARTICLE_STORAGE + categoryName + "/" + str(uuid.uuid4()) + ".xml", encoding='utf-8', pretty_print=True)


driver = webdriver.Chrome()

fileList = os.listdir(path=REF_STORAGE)
countHref = 1028
for k in range(N, len(fileList)):
    f = open(REF_STORAGE + fileList[k])
    for line in f:
        driver.get(URL_VALUE + line)
        sel = Selector(text=driver.find_element_by_xpath("//*").get_attribute("outerHTML"))

        textName = sel.xpath("//div[@class='main']//h1/i/text()").extract_first()

        textList = sel.xpath("//div[@class='ocr']/p/text()").extract()
        textArticle = ''

        # if set(item.lower() for item in textList).isdisjoint(END_ARTICLE):
        #   continue
        checkGoodContent = True
        for i in range(0, len(textList)):
            textList[i] = textList[i].replace('\ufeff', "")
            if textName.lower() in textList[i].lower() and i < len(textList) - 1:
                i = i + 1
                while i < len(textList) and textList[i].lower() not in END_ARTICLE:
                    textArticle = textArticle + textList[i] + " "
Example #5
import requests
from parsel import Selector
url = 'http://www.porters.vip/confusion/recruit.html'
# Send a request to the target URL
resp = requests.get(url)
# Initialize a Selector with the response body
sel = Selector(resp.text)
# Extract the company name from the response body
company = sel.css('h1.interval::text').get()
print(company)
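
For comparison, the same value could also be pulled with an XPath query; a small sketch against the same sel object (the class name is taken from the CSS selector above):

company = sel.xpath('//h1[contains(@class, "interval")]/text()').get()
print(company)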
Example #6
 def __get_download_file_name(self, response):
     selector = Selector(response.text)
     file_name = selector.xpath('//*[@id="footer"]/button/@onclick').get()
     file_name = file_name.replace('location.href=', "").replace("\'", "")
     return file_name
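
For reference, the method above strips the location.href= prefix and the surrounding quotes from an inline onclick handler. A hedged mini-example with a hypothetical onclick value:

onclick = "location.href='report_2020.zip'"
file_name = onclick.replace('location.href=', '').replace("'", '')
print(file_name)  # report_2020.zip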
Example #7
async def get_cities() -> dict:
    response = await request('GET', DOMAIN)
    if response:
        tree = Selector(response.text)
        return Dict.name_link(tree, XPATH_TO_CITIES)
Example #8
 def __init__(self, page: str):
     self._sel = Selector(page)
     self._result = None
Example #9
 def page_ok(page: str):
     sel = Selector(text=page)
     if len(sel.css('.error_Block')):
         return False
     return True
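
A quick usage sketch for the check above, assuming page_ok is available as a plain function (the HTML strings are hypothetical):

print(page_ok('<div class="error_Block">Something went wrong</div>'))  # False
print(page_ok('<div class="content">All good</div>'))                  # True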
Example #10
def getAndParseURL(result):
    #result = requests.get(url)
    soup = BeautifulSoup(result, 'html.parser')
    
    child = soup.find_all('a')
    for i in range(0, len(child)):
        if 'Next' in child[i].get_text():
            nextlink = child[i]['href']
            print(nextlink)
            return nextlink
    return None

all_links = []
base_path = 'http://example.webscraping.com'
response = requests.get(base_path)
selector = Selector(response.text)
href_links = selector.xpath('//a/@href').getall()
all_links += href_links
# get URL 
r = requests.get("http://example.webscraping.com/") 
next = ''
#data = r.text 
#soup = BeautifulSoup(data) 
#table =  soup.findAll('td')
while True:
    r = requests.get("http://example.webscraping.com/" + next)

    data = r.text
    soup = BeautifulSoup(data, 'html.parser')
    for link in soup.find_all('td'):
        print(link.find('a')['href'])
    # advance to the next page; stop when there is no 'Next' link left
    next = getAndParseURL(data)
    if next is None:
        break
Example #11
def search_char(char):
    r = requests.get(f'https://guildstats.eu/character?nick={char.name}#tab2')
    sel = Selector(r.text)

    char.online_time = sel.xpath('//table[@id="myTable"]//td[2]/text()').extract_first()
    return char
Example #12
def start_urls(html):
    selector = Selector(html)
    base_url = selector.xpath('//div[@class="hezi"]//li/a/@href').getall()
    titles = selector.xpath('//ul[@class="img"]//p/a/text()').getall()
    return base_url,titles
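
Assuming the two lists returned above are parallel (one link per title), a caller could pair them up; a minimal usage sketch where html holds the listing page source:

links, titles = start_urls(html)
for link, title in zip(links, titles):
    print(title, link)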
Example #13
def get_html(html,url):
    selector = Selector(html)
    next_url = selector.xpath('//div[@id="pages"]//a[@class="a1"][2]/@href').get() 
    img_urls = selector.xpath('//div[@class="content"]/img/@src').getall()
    title = selector.xpath('//div[@class="content"]/img/@alt').get()
    return next_url,img_urls
Example #14
def getdata():
    url = "https://iqmining.com/pricing"
    logger.info(f"get page {url}")
    z1 = s.get(url, timeout=60)
    response = Selector(text=z1.text)
    jscode = response.xpath(
        '//script[contains(.,"pricesConfig")]/text()').extract_first()
    parse_js = js2xml.parse(jscode)
    pricesConfig = js2xml.jsonlike.getall(parse_js)
    ret = []
    for k, v in pricesConfig[0].items():
        gold, silver, bronze = {"t": "gold"}, {"t": "silver"}, {"t": "bronze"}
        gold.update(v)
        silver.update(v)
        bronze.update(v)
        del gold["fee"]
        del gold["options"]
        del gold["new_price"]
        del bronze["fee"]
        del bronze["options"]
        del bronze["new_price"]
        del silver["fee"]
        del silver["options"]
        del silver["new_price"]

        coin = ""
        if k in ["sha256", "shapro"]:
            coin = "BTC"
        elif k == "shabch":
            coin = "BCH"
        elif k == "eth":
            coin = "ETH"
        else:
            continue
        if coin == "ETH":
            gold["contract_size"] = v["mingold"]
            silver["contract_size"] = v["minsilver"]
            bronze["contract_size"] = v["mincalc"]
        else:
            # For BTC/BCH the value here is in GH/s; the base unit is 1000 GH = 1 TH
            gold["contract_size"] = v["mingold"] / 1000
            silver["contract_size"] = v["minsilver"] / 1000
            bronze["contract_size"] = v["mincalc"] / 1000
        gold["coin"] = coin
        silver["coin"] = coin
        bronze["coin"] = coin
        if v["fee"]:
            # For BTC/BCH the value here is per 10 GH/s; the base unit is 1000 GH
            gold["electricity_fee"] = float(v["fee"]["gold"]) * 100
            silver["electricity_fee"] = float(v["fee"]["silver"]) * 100
            bronze["electricity_fee"] = float(v["fee"]["bronze"]) * 100
        else:
            gold["electricity_fee"] = 0
            silver["electricity_fee"] = 0
            bronze["electricity_fee"] = 0
        if v.get("new_price", ""):  # discounted price available
            price_info = v["new_price"]
        else:
            price_info = v["options"]
        for y, p in price_info.items():
            if y == "y0":
                continue
            elif y == "y1":
                gold["duration"] = 365
                silver["duration"] = 365
                bronze["duration"] = 365
            elif y == "y2":
                gold["duration"] = 365 * 2
                silver["duration"] = 365 * 2
                bronze["duration"] = 365 * 2
            elif y == "y5":
                gold["duration"] = 365 * 5
                silver["duration"] = 365 * 5
                bronze["duration"] = 365 * 5
            if coin == "ETH":
                # For ETH the value here is per 0.1 MH/s; the base unit is 1 MH
                gold["upfront_fee"] = float(p["gold"]) * 10
                silver["upfront_fee"] = float(p["silver"]) * 10
                bronze["upfront_fee"] = float(p["bronze"]) * 10
            else:
                # For BTC/BCH the value here is per 10 GH/s; the base unit is 1000 GH
                gold["upfront_fee"] = float(p["gold"]) * 100
                silver["upfront_fee"] = float(p["silver"]) * 100
                bronze["upfront_fee"] = float(p["bronze"]) * 100
            ret.append(gold.copy())
            ret.append(silver.copy())
            ret.append(bronze.copy())
    return ret
Example #15
     except urllib.error.URLError:
         pass
     except urllib.error.HTTPError:
         pass
     except timeout:
         pass
 else:
     fail.append(s[i]) 
     print("failed to retrieve info from ", s[i], i)
     flag = True
 if flag:
     pass
 else:
     clap = response.read()
     clap = clap.decode("utf-8") 
     h = Selector(text=clap)
     date = h.xpath('//meta[@content][@name="pub_date"]/@content').extract()
     if date:
         pass
     else:
         date = h.xpath('//meta[@content][@name="parsely-pub-date"]/@content').extract()
     key = h.xpath('//meta[@content][@name="keywords"]/@content').extract() 
     info = h.xpath('//div[@id = "article_body"]/p//text()').extract()
     if not info:
         info = h.xpath('//div[@class = "article-body__content"]/p//text()').extract()
     if len(info)>1:
         info = ' '.join(str(r) for r in info)
         info = info.replace(u"\xa0", u" ")
     if "T" in date[0]:
         date,t = date[0].split('T')
     else:
Example #16
 def home_ok(page: str):
     sel = Selector(text=page)
     if len(sel.css('#mySignin')):
         return False
     return True
Example #17
 def test_make_links_absolute(self):
     text = u'<a href="file.html">link to file</a>'
     sel = Selector(text=text, base_url='http://example.com')
     sel.root.make_links_absolute()
     self.assertEqual(u'http://example.com/file.html',
                      sel.xpath('//a/@href').extract_first())
Example #18
 def lang_url(page: str):
     sel = Selector(text=page)
     url = sel.xpath('//*[@id="englishLanguage"]/@href').get()
     return f'{MiCubacelParser.url_base}{url}'
Example #19
import requests
from parsel import Selector
import time
import pandas as pd

start = time.time()

all_images = {}

result = []
response = requests.get('https://www.tk421.net/lotr/film/')
selector = Selector(response.text)

href_links = selector.xpath('//a/@href').getall()
del href_links[-1]


def moviename(tag):
    if ('fotr' in tag):
        return 'The Fellowship of the Ring'
    elif ('ttt' in tag):
        return 'The Two Towers'
    elif ('rotk' in tag):
        return 'The Return of the King'
    else:
        return None


txtflag = 0
for link in href_links:
    try:
Example #20
import time

import requests
from parsel import Selector

from headers import COMMENTS_HEADERS

base_url = "http://www.dianping.com/shop/67408602/review_all/p{}"

for i in range(1, 10):
    if i > 1:
        COMMENTS_HEADERS["Referer"] = base_url.format(i - 1)
    res = requests.get(base_url.format(i), headers=COMMENTS_HEADERS)
    selector = Selector(text=res.text)
    if selector.css(".review-recommend").getall():
        print(selector.css(".review-recommend").getall())
    else:
        print(base_url.format(i))
        print(res.content.decode("u8"))
    time.sleep(5)
Example #21
def scrappyprofile(url, user):

    driver = StartSelenium(user)
    driver.get(url)

    time.sleep(2)

    scheight = .1
    while scheight < 9.9:
        driver.execute_script(
            "window.scrollTo(0, document.body.scrollHeight/%s);" % scheight)
        scheight += .01

    time.sleep(1)

    pagina = driver.page_source
    page = open("source.txt", "w")
    selec = Selector(text=pagina)
    page.write(pagina)

    url_imagen = selec.xpath(
        '//*[starts-with(@class,  "pv-top-card-section__photo presence-entity__image EntityPhoto-circle-9 lazy-image loaded ember-view")]/@src'
    ).extract()

    if len(url_imagen) > 0:
        b = 1
    else:
        b = 0

    name = selec.xpath(
        '//*[starts-with(@class, "inline t-24 t-black t-normal break-words")]/text()'
    ).extract()
    title = selec.xpath(
        '//*[starts-with(@class, "mt1 t-18 t-black t-normal")]/text()'
    ).extract()
    address = selec.xpath(
        '//*[starts-with(@class, "t-16 t-black t-normal inline-block")]/text()'
    ).extract()
    contacts = selec.xpath(
        '//*[starts-with(@class, "ember-view")]/text()').extract()
    extracto = selec.xpath(
        '//*[starts-with(@class, "pv-about__summary-text mt4 t-14 ember-view")]/span/text()'
    ).extract()
    # Get the job titles
    cargos = []
    css1cargo = selec.xpath(
        '//*[starts-with(@class, "pv-entity__summary-info pv-entity__summary-info--background-section ")]/h3/text()'
    ).extract()
    cargos += css1cargo
    cargo = selec.xpath(
        '//*[starts-with(@class, "t-16 t-black t-bold")]/span/text()').extract(
        )
    cargomas = selec.xpath(
        '//*[starts-with(@class, "t-14 t-black t-bold")]/span/text()').extract(
        )
    cargomas += cargo
    i = 1
    while i < len(cargomas):
        cargos.append(cargomas[i])
        i += 2

    # Get the companies
    empresas = selec.xpath(
        '//*[starts-with(@class, "pv-entity__secondary-title t-14 t-black t-normal")]/text()'
    ).extract()
    # Get the dates for each company
    fechas = selec.xpath(
        '//*[starts-with(@class, "pv-entity__date-range t-14 t-black--light t-normal")]/span/text()'
    ).extract()

    #print("name: " + name[0])
    #print("title: " + title[0])
    #print("address: " + address[0])
    #print("contacts: " + str(contacts))
    #print("extracto: " + str(extracto))
    if b == 1:
        imagen = url[28:-1] + ".jpg"
    else:
        imagen = "default.jpg"

    perfil = Usuario(name[0], title[0], url, imagen)
    if len(extracto) > 0:
        perfil.extracto = extracto[0]
    trabajos = []
    if len(cargos) != len(empresas):
        for i in range(len(cargos) - len(empresas)):
            empresas.append("")
    if len(cargos) != int(len(fechas) / 2):
        for i in range(len(cargos) - int(len(fechas) / 2)):
            fechas.append("")
            fechas.append("")

    for i in range(len(cargos)):
        fechas.pop(0)
        trabajo = Cargo(cargos[i], empresas[i], "", fechas[i])
        trabajos.append(trabajo)
        #print("insertado: "+trabajo.fecha)
    perfil.cargos = trabajos
    #print("Cargos: "+ str(len(perfil.cargos)))

    # Get the education history
    escuelas = selec.xpath(
        '//*[starts-with(@class, "pv-entity__school-name t-16 t-black t-bold")]/text()'
    ).extract()
    #print("escuelas: "+ str(escuelas))
    if len(escuelas) > 0:

        titulos = selec.xpath(
            '//*[starts-with(@class, "pv-entity__secondary-title pv-entity__degree-name t-14 t-black t-normal")]/span/text()'
        ).extract()
        disciplinas = selec.xpath(
            '//*[starts-with(@class, "pv-entity__secondary-title pv-entity__fos t-14 t-black t-normal")]/span/text()'
        ).extract()
        fechas = selec.xpath(
            '//*[starts-with(@class, "pv-entity__dates t-14 t-black--light t-normal")]/span/time/text()'
        ).extract()
        educacion = []

        for long in range(len(escuelas) - len(disciplinas)):
            disciplinas.append("")
            disciplinas.append("")
        #print("longitudes: " + str(len(escuelas)) + " : " + str(len(titulos)) + ":" + str(len(disciplinas)))
        for i in range(len(escuelas)):
            escuela = Escuela(escuelas[i])

            if len(titulos) > 0:
                titulos.pop(0)
                escuela.titulacion = titulos[i]
            if len(disciplinas) > 0:
                disciplinas.pop(0)
                escuela.disciplina = disciplinas[i]
            escuela.fecha = fechas[i] + " - " + fechas[i + 1]
            #print("Escuelas: " + escuela.name)
            #print("Titulos: " + escuela.titulacion)
            #print("Disciplinas: " + escuela.disciplina)
            #print("fechas: " + str(fechas))
            educacion.append(escuela)
        perfil.escuelas = educacion
    # Get the skills
    aptitudes = selec.xpath(
        '//*[starts-with(@class, "pv-skill-category-entity__name-text t-16 t-black t-bold")]/text()'
    ).extract()
    #print("aptitudes: " + str(aptitudes))
    perfil.aptitudes = aptitudes
    # Get the interests
    intereses = selec.xpath(
        '//*[starts-with(@class, "pv-entity__summary-info ember-view")]/h3/span/text()'
    ).extract()
    #print("Intereses: " + str(intereses))
    perfil.intereses = intereses
    contactos, datosModelo = extractContacts(driver, url)

    perfil.contactos = contactos

    # Get licenses and certifications
    certificaciones = selec.xpath(
        '//*[starts-with(@class, "pv-certifications__summary-info pv-entity__summary-info pv-entity__summary-info--background-section pv-certifications__summary-info--has-extra-details")]/h3/text()'
    ).extract()
    #print("Certificaciones: " + str(certificaciones))
    perfil.certificaciones = certificaciones
    # Get accomplishments
    logrosTitles = selec.xpath(
        '//*[starts-with(@class, "pv-accomplishments-block__count t-32 t-black t-normal pr3")]/span/text()'
    ).extract()
    logros = selec.xpath(
        '//*[starts-with(@class, "pv-accomplishments-block__list-container")]/ul/li/text()'
    ).extract()
    #print("Logros T: " + str(logrosTitles))
    #print("logros: "+ str(logros))
    conjuntoLogros = []
    i = 1
    while i < len(logrosTitles):
        aux = ""
        for j in range(int(logrosTitles[i])):
            if j == 0:
                aux = logros.pop(0)
            else:
                aux = aux + ", " + logros.pop(0)

        conjuntoLogros.append(aux)
        i += 2

    #print("conjuntoLogros: " + str(conjuntoLogros))
    perfil.logrosTitles = logrosTitles[0::2]
    perfil.logros = conjuntoLogros
    perfil.datosModelo = datosModelo
    connector.insertarUsuario(perfil)
    driver.close()

    return perfil
Example #22
    else:

        hucreIci = oku['A{}'.format(hucreNo)].value

        urunAdresi = hucreIci

        baslik = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        }

        toplama = requests.get(urunAdresi, headers=baslik)

        if toplama.status_code == 200:

            secici = Selector(toplama.text)

            duzenle = re.compile(
                '<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')

            urun = secici.css(
                '#product-detail-app > div > div.pr-cn > div.pr-cn-in > div.pr-in-w > div:nth-child(1) > div.pr-in-cn > h1'
            ).get()

            satici = secici.css(
                '#product-detail-app > div > div.pr-cn > div.pr-cn-in > div.pr-in-at > div.pr-in-sl-cnt > div > div.sl-nm > a'
            ).get()

            pfiyat = secici.xpath(
                '/html/body/div[3]/div/div/div[2]/div[2]/div[1]/div[1]/div[1]/div[2]/div/div/span[1]'
            ).get()
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
from parsel import Selector
import openpyxl
import pandas as pd
import matplotlib.pyplot as plt

driver=webdriver.Chrome('C:/Users/gunjan/Desktop/Web_Scraping/chromedriver')
driver.get('http://quotes.toscrape.com/')


sel=Selector(text=driver.page_source)
#quotes=sel.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "text", " " ))]/text()').extract()
#author=sel.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "author", " " ))]/text()').extract()
tags=sel.xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "tag", " " ))]/text()').extract()
#about_links=sel.xpath('//span//a').extract()

#next_btn=driver.find_element_by_xpath('//*[contains(concat( " ", @class, " " ), concat( " ", "next", " " ))]//a')
#next_btn.click()

path="C:/Users/gunjan/Desktop/quotes_web_scrap/tags_data.xlsx"

workbook=openpyxl.load_workbook(path)
sheet=workbook.active

for r in range(2,len(tags)+1):
        #for c in range(1,4):
        #sheet.cell(row=r+1,column=1).value=quotes[r-1]
        #sheet.cell(row=r+1,column=2).value=author[r-1]
        sheet.cell(row=r,column=1).value=tags[r-1]

workbook.save(path)
Example #24
 def selector(self):
     return Selector(self.text)
Example #25
def main():
    url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/index.html'
    res = get_html_content(url)
    enc = chardet.detect(res)
    html = res.decode(enc['encoding'], errors='ignore')
    xpath_css = Selector(text=html)
    all_urls = xpath_css.xpath('//tr[@class="provincetr"]/td/a')
    base = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2017/'
    for url in all_urls[17:18]:
        province_url = base + url.xpath('./@href').extract_first()
        province = url.xpath('./text()').extract_first()
        res = get_html_content(province_url)
        enc = chardet.detect(res)
        html = res.decode(enc['encoding'], errors='ignore')
        xpath_css = Selector(text=html)
        city_code = xpath_css.xpath(
            '//tr[@class="citytr"]/td[1]/a/text()').extract()
        city_list = xpath_css.xpath(
            '//tr[@class="citytr"]/td[2]/a/text()').extract()
        city_urls = xpath_css.xpath(
            '//tr[@class="citytr"]/td[1]/a/@href').extract()
        # for i in range(len(city_urls[12])):
        url1 = base + city_urls[1]
        # print(url1)
        res = get_html_content(url1)
        enc = chardet.detect(res)
        html = res.decode(enc['encoding'], errors='ignore')
        xpath_css = Selector(text=html)
        county_code = xpath_css.xpath(
            '//tr[@class="countytr"]/td[1]/a/text()').extract()
        county_list = xpath_css.xpath(
            '//tr[@class="countytr"]/td[2]/a/text()').extract()
        county_urls = xpath_css.xpath(
            '//tr[@class="countytr"]/td[1]/a/@href').extract()
        for j in range(len(county_urls)):
            # print('省:{}  市:{}  县:{}'.format(province,city_list[20],county_list[j]))
            url2 = url1[0:-9] + county_urls[j]
            res = get_html_content(url2)
            enc = chardet.detect(res)
            try:
                html = res.decode("gbk").encode("utf-8")
            except UnicodeDecodeError:
                html = res.decode("gb2312").encode("utf-8")
            real_html = html.decode('utf-8')
            xpath_css = Selector(text=real_html)
            town_code = xpath_css.xpath(
                '//tr[@class="towntr"]/td[1]/a/text()').extract()
            town_list = xpath_css.xpath(
                '//tr[@class="towntr"]/td[2]/a/text()').extract()
            town_urls = xpath_css.xpath(
                '//tr[@class="towntr"]/td[1]/a/@href').extract()
            for k in range(len(town_urls)):
                # print('省:{}   市:{}   县:{}   镇:{}'.format(province,city_list[20],county_list[j],town_list[k]))
                url3 = url2[0:-11] + town_urls[k]
                print(url3)
                res = get_html_content(url3)
                # enc = chardet.detect(res)
                # html = res.decode(enc['encoding'],errors = 'ignore')
                try:
                    html = res.decode("gbk").encode("utf-8")
                except UnicodeDecodeError:
                    html = res.decode("gb2312").encode("utf-8")
                real_html = html.decode('utf-8')
                xpath_css = Selector(text=real_html)
                villagetr_code = xpath_css.xpath(
                    '//tr[@class="villagetr"]/td[1]/text()').extract()
                villagetr_code1 = xpath_css.xpath(
                    '//tr[@class="villagetr"]/td[2]/text()').extract()
                villagetr_list = xpath_css.xpath(
                    '//tr[@class="villagetr"]/td[3]/text()').extract()
                for x in range(len(villagetr_list)):
                    print('省:{}  市:{}  县:{}  镇:{}  村:{}'.format(
                        province,
                        str(city_list[1]).replace('市辖区', province) + '--' +
                        city_code[1], county_list[j] + '--' + county_code[j],
                        town_list[k] + '--' + town_code[k], villagetr_list[x] +
                        '--' + villagetr_code[x] + '--' + villagetr_code1[x]))
                    save_to_mysql(
                        province,
                        str(city_list[1]).replace('市辖区', province) + '--' +
                        city_code[1], county_list[j] + '--' + county_code[j],
                        town_list[k] + '--' + town_code[k], villagetr_list[x] +
                        '--' + villagetr_code[x] + '--' + villagetr_code1[x])
Beispiel #26
0
            hxs = Selector(text=data)
            posts = hxs.xpath(
                '//ul[@class="archive"]/li/span[@class="channel markets_and_finance"]/following-sibling::h1/a/@href'
            ).extract()
            posted.append(posts)
    return posted


if __name__ == '__main__':
    print("in main")
    totalWeeks = []
    totalPosts = []
    url = 'http://www.businessweek.com/archive/news.html#r=404'
    data = urllib.request.urlopen(url).read()
    data = data.decode("utf-8")
    sel = Selector(text=data)
    months = sel.xpath('//ul/li/a').re(
        'http://www.businessweek.com/archive/\\d+-\\d+/news.html')
    #admittMonths = 12*(2015-1991) + 8
    m = []
    for i in months:
        m.append([i])
    totalWeeks = []
    pool = Pool(8)
    totalWeeks = pool.map(mon, m)
    totalWeeks = [ent for sublist in totalWeeks for ent in sublist]
    print(len(totalWeeks))
    #club = [ent for sublist in totalWeeks for ent in sublist]
    #print (len(club))
    club = [ent for sublist in totalWeeks for ent in sublist]
    print(len(club))
Beispiel #27
0
from parsel import Selector
import requests

URL_BASE = "http://books.toscrape.com/catalogue/"

next_page_url = 'page-1.html'
while next_page_url:
    # Fetch the content of the next page
    response = requests.get(URL_BASE + next_page_url)
    selector = Selector(text=response.text)
    # Print the products of a given page
    for product in selector.css(".product_pod"):
        # Fetch and extract the title and the price
        # title = product.css("h3 a::attr(title)").get()
        # price = product.css(".price_color::text").get()
        # print(title, price)

        # Get the link to a product's detail page
        detail_href = product.css("h3 a::attr(href)").get()
        detail_page_url = URL_BASE + detail_href

        # Download the content of the detail page
        detail_response = requests.get(detail_page_url)
        detail_selector = Selector(text=detail_response.text)

        # Extract the product description
        description = detail_selector.css(
            "#product_description ~ p::text").get()
        print(description)

    # Find out which page comes next
Beispiel #28
0
import requests
from parsel import Selector

response = requests.get(
    "http://books.toscrape.com/catalogue/the-grand-design_405/index.html")

selector = Selector(text=response.text)
titles = selector.css(".product_page > .row > .product_main > h1::text").get()
price = selector.css(
    ".product_page > .row > .product_main > p::text").re_first(r"\d+\.\d{2}")
description = selector.css(".product_page > p::text").get()
url = selector.css("img::attr(src)").get()
quantity = selector.css(".instock").re_first(r"\d{1,}")

suffix = "...more"
if description.endswith(suffix):
    description = description[:-len(suffix)]

print(titles, price, description, url, quantity, sep=",")
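
On Python 3.9 and newer, the suffix trimming above could also be written with str.removesuffix, which is a no-op when the suffix is absent; a small sketch:

description = description.removesuffix("...more")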
Example #29
def linkedin(screen_name):
    for person in people:
        if person.screen_name == screen_name:
            return person.__dict__

    chromedriver = 'C:/Users/DELL/chromedriver_win32/chromedriver.exe'
    options = webdriver.ChromeOptions()
    options.add_argument('headless')

    driver = webdriver.Chrome(executable_path=chromedriver,
                              chrome_options=options)

    # driver.get() navigates to the page at the given URL
    driver.get('https://www.linkedin.com')

    # locate email form by_class_name
    username = driver.find_element_by_class_name('login-email')

    # send_keys() to simulate key strokes
    username.send_keys('*****@*****.**')

    # sleep for 0.5 seconds
    sleep(0.5)

    # locate password form by_class_name
    password = driver.find_element_by_class_name('login-password')

    # send_keys() to simulate key strokes
    password.send_keys('meriam12345')
    sleep(0.5)

    # locate submit button by_xpath
    sign_in_button = driver.find_element_by_xpath('//*[@type="submit"]')

    # .click() to mimic button click
    sign_in_button.click()
    sleep(0.5)

    # driver.get() navigates to the page at the given URL
    driver.get('https://www.google.com')
    sleep(3)

    # locate search form by_name
    search_query = driver.find_element_by_name('q')

    # send_keys() to simulate the search text key strokes
    nn = 'site:linkedin.com/in/ AND ' + screen_name
    search_query.send_keys(nn)  # to be adapted to use the function's parameter
    sleep(0.5)

    # navigate using the search string built above
    driver.get(nn)  # to be adapted to use the function's parameter

    # .send_keys() to simulate the return key
    search_query.send_keys(Keys.RETURN)
    sleep(3)

    # locate URL by_class_name
    linkedin_urls = driver.find_elements_by_class_name('iUh30')

    # variable linkedin_url is equal to the list comprehension
    linkedin_urls = [url.text for url in linkedin_urls]
    sleep(0.5)

    # take the first URL from the list returned by the Google search

    linkedin_url = linkedin_urls[0]
    # get the profile URL
    driver.get(linkedin_url)
    sleep(5)

    # assigning the source code for the web page to variable sel
    sel = Selector(text=driver.page_source)

    # xpath to extract the text from the class containing the name
    name = sel.xpath(
        '//*[starts-with(@class, "pv-top-card-section__name")]/text()'
    ).extract_first()

    # if name exists
    if name:
        # .strip() removes the newline \n and surrounding whitespace
        name = name.strip()

        # xpath to extract the text from the class containing the job title
    job_title = sel.xpath(
        '//*[starts-with(@class, "pv-top-card-section__headline")]/text()'
    ).extract_first()

    if job_title:
        job_title = job_title.strip()

    postes = sel.xpath(
        '//*[starts-with(@class, "t-16 t-black t-bold")]/text()').getall()
    for poste in postes:
        if poste:
            poste = poste.strip()

    societes = sel.xpath(
        '//*[starts-with(@class, "pv-entity__secondary-title")]/text()'
    ).getall()
    for societe in societes:
        if societe:
            societe = societe.strip()

    descriptions = sel.xpath(
        '//*[starts-with(@class, "lt-line-clamp__line")]/text()').getall()
    for description in descriptions:
        if description:
            description = description.strip()

    universites = sel.xpath(
        '//*[starts-with(@class, "pv-entity__school-name t-16 t-black t-bold")]/text()'
    ).getall()
    for universite in universites:
        if universite:
            universite = universite.strip()

    linkedin_url = driver.current_url

    name = validate_field(name)
    job_title = validate_field(job_title)
    postes = validate_field(postes)
    societes = validate_field(societes)
    descriptions = validate_field(descriptions)
    universites = validate_field(universites)
    linkedin_url = validate_field(linkedin_url)

    driver.quit()
    person = Person(name, job_title, postes, societes, descriptions,
                    universites, linkedin_url)
    people.append(person)
    return person.__dict__
def parse(response):
    # date of birth
    shengri = re.search(
        '出生日期.*?</td>.*?<td class="data_tb_content".*?>(.*?)</td>',
        response,
        flags=re.S)
    if shengri:
        shengri = shengri.group(1)
    else:
        shengri = ''
    response = Selector(text=response)
    # name
    xingming = response.xpath(
        '//td[contains(text(),"姓名")]/following-sibling::td[1]/text()').get(
            default='')
    # gender
    xingbie = response.xpath(
        '//td[contains(text(),"性别")]/following-sibling::td[1]/text()').get(
            default='')
    # position within the firm
    zhiwu = response.xpath(
        '//td[contains(text(),"所内职务")]/following-sibling::td[1]/text()').get(
            default='')
    # Party member or not
    dangyuan = response.xpath(
        '//td[contains(text(),"是否党员")]/following-sibling::td[1]/text()').get(
            default='')
    # education level
    xueli = response.xpath(
        '//td[contains(text(),"学历")]/following-sibling::td[1]/text()').get(
            default='')
    # academic degree
    xuewei = response.xpath(
        '//td[contains(text(),"学位")]/following-sibling::td[1]/text()').get(
            default='')
    # major studied
    zhuanye = response.xpath(
        '//td[contains(text(),"所学专业")]/following-sibling::td[1]/text()').get(
            default='')
    # graduating school
    xuexiao = response.xpath(
        '//td[contains(text(),"毕业学校")]/following-sibling::td[1]/text()').get(
            default='')
    # how the qualification was obtained (exam / assessment)
    kaohe = response.xpath(
        '//td[contains(text(),"资格取得方式(考试/考核)")]/following-sibling::td[1]/text()'
    ).get(default='')
    # certificate number for passing all exam subjects
    shuhao = response.xpath(
        '//td[contains(text(),"全科合格证书号")]/following-sibling::td[1]/text()'
    ).get(default='')
    # year all exam subjects were passed
    nianfen = response.xpath(
        '//td[contains(text(),"全科合格年份")]/following-sibling::td[1]/text()').get(
            default='')
    # CPA certificate number
    bianhao = response.xpath(
        '//td[contains(text(),"注册会计师证书编号")]/following-sibling::td[1]/text()'
    ).get(default='')
    # partner (shareholder) or not
    gudong = response.xpath(
        '//td[contains(text(),"是否合伙人(股东)")]/following-sibling::td[1]/text()'
    ).get(default='')
    # registration approval document number
    jianhao = response.xpath(
        '//td[contains(text(),"批准注册文件号")]/following-sibling::td[1]/text()'
    ).get(default='')
    # registration approval date
    shijian = response.xpath(
        '//td[contains(text(),"批准注册时间")]/following-sibling::td[1]/text()').get(
            default='')
    # accounting firm where employed
    wusuo = response.xpath(
        '//td[contains(text(),"所在事务所")]/following-sibling::td[1]/text()').get(
            default='')
    # training hours required this year
    xueshi = response.xpath(
        '//td[contains(text(),"本年度应完成学时")]/following-sibling::td[1]/text()'
    ).get(default='')
    # training hours completed this year
    yixueshi = response.xpath(
        '//td[contains(text(),"本年度已完成学时")]/following-sibling::td[1]/text()'
    ).get(default='')
    # penalty / disciplinary information
    xinxi = response.xpath(
        '//td[contains(text(),"处罚/惩戒信息")]/following-sibling::td[1]/text()'
    ).get(default='')
    # participation in public-welfare activities
    huodong = response.xpath(
        '//td[contains(text(),"参加公益活动")]/following-sibling::td[1]/text()').get(
            default='')
    # collect all extracted fields
    items = {
        'xingming': xingming.strip(),
        'xingbie': xingbie.strip(),
        'zhiwu': zhiwu.strip(),
        'dangyuan': dangyuan.strip(),
        'xueli': xueli.strip(),
        'xuewei': xuewei.strip(),
        'zhuanye': zhuanye.strip(),
        'xuexiao': xuexiao.strip(),
        'kaohe': kaohe.strip(),
        'shuhao': shuhao.strip(),
        'nianfen': nianfen.strip(),
        'bianhao': bianhao.strip(),
        'gudong': gudong.strip(),
        'jianhao': jianhao.strip(),
        'shijian': shijian.strip(),
        'wusuo': wusuo.strip(),
        'xueshi': xueshi.strip(),
        'yixueshi': yixueshi.strip(),
        'xinxi': xinxi.strip(),
        'huodong': huodong.strip(),
        'shengri': shengri.strip(),
    }
    print(items)
    pipeline(items)