Example #1
def push(site):
    print('push site is opening now....')
    time.sleep(0.5)
    sitemap_url = site["sitemapLink"]

    try:
        print('get sitemap link....')
        data_ = bp(requests.get(sitemap_url).content, 'lxml')
    except Exception as e:
        print(e)

    list_url = []

    print('---------------------------------')
    for x, y in enumerate(data_.find_all('loc')):
        print(x, y.string)
        list_url.append(y.string.replace('http://', 'https://www.'))

    print('---------------------------------')

    print('pushing....')

    for x in list_url:
        print('now we push the siteWeb counts are:', x)
        get_(site["baiduAPIUrl"], x)
    print("推送完成,6s后自动关闭....")
    time.sleep(6)
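
Example #1 calls a helper named get_ that is not part of the snippet; Examples #15 and #28 further down show the same push pattern built on requests.post. A minimal Python 3 sketch of what such a helper might look like, assuming the push endpoint accepts a single URL as a text/plain body (only the call signature comes from Example #1, the rest is an assumption):

import requests

def get_(api_url, page_url):
    # Hypothetical helper: POST one URL as a plain-text body, mirroring Examples #15 and #28.
    headers = {'User-Agent': 'curl/7.12.1', 'Content-Type': 'text/plain'}
    try:
        r = requests.post(url=api_url, data=page_url, headers=headers)
        print(r.status_code, r.content)
    except Exception as e:
        print(e)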
Example #2
def push(site):
    print('push site is opening now....')
    time.sleep(0.5)
    sitemap_url = site["sitemapLink"]

    try:
        print('get sitemap link....')
        data_ = bp(requests.get(sitemap_url).content,'lxml')
    except Exception as e:
        print(e)

    list_url=[]

    print('---------------------------------')
    for x,y in enumerate(data_.find_all('loc')):
        print(x,y.string)
        list_url.append(y.string.replace('http://','https://www.'))

    print('---------------------------------')

    print('pushing....')
    xmlData = {"siteUrl":site["url"],"urlList":list_url}
    postTheXML(site["baiduAPIUrl"],xmlData)
    print("推送完成,6s后自动关闭....")
    time.sleep(6)
Example #3
def getSitemap(sitemapLink,saveAddress):
    print('push site is opening now....')
    time.sleep(0.5)
    site_url = sitemapLink

    try:
    print('get sitemap link....')
        data_ = bp(requests.get(site_url).content,'lxml')
    except Exception as e:
        print(e)

    list_url = []
    print('---------------------------------')
    for x,y in enumerate(data_.find_all('loc')):
        print(x,y.string)
        list_url.append(y.string.replace('http://','https://www.'))

    print('---------------------------------')

    print('List', list_url)


    target = saveAddress
    try:
        with open(target,"w") as f:
            for i in range(len(list_url)):
                f.write(list_url[i]+"\n")
        print("写入完成,路径{}\n 6s后自动关闭".format(target))
        time.sleep(5)
    except Exception as e:
        print("ERROR! ",e.__str__())
Example #4
 def start_1(self,text):
     s = bp(text,'html.parser')
     e = s.find_all('a',class_='position_link')
     a = []
     for x in e:
         a.append('http:'+x.get('href'))
     return a
Example #5
def process_wrapper(lineID):
    outfile1 = open("bb.2", "a+")
    outfile2 = open("bb.3", "a+")
    with open("bb.1") as infile:
        for i, line in enumerate(infile):
            if i != lineID:
                continue
            else:
                line = infile.readline()
                my_url = 'http://astrogeo.org/cgi-bin/imdb_get_source.csh?source=' + line[
                    0:(len(line) - 1)]
                #print(lineID)
                uClient = uReq(my_url)
                page_html = uClient.read()
                uClient.close()
                page_parse = bp(page_html, "html.parser")
                containers = page_parse.findAll('tr', {"valign": "TOP"})
                if len(containers) != 0:
                    container = containers[0]
                    sub_containers = container.findAll("td",
                                                       {'align': 'RIGHT'})
                    sub_container1 = sub_containers[1]
                    sub_container2 = sub_containers[2]
                    print(sub_container1.text, file=outfile1)
                    print(sub_container2.text, file=outfile2)
                else:
                    print('11:11:11.1111', file=outfile1)
                    print('+11:11:11.111', file=outfile2)
                    logging.info(
                        'Source not found in the Astrogeo catalogue: %s' %
                        line)
    outfile1.close()
    outfile2.close()
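
The lineID parameter and the early continue suggest process_wrapper is meant to be fanned out over line indices by a worker pool; the driver code is not included in the snippet. A minimal sketch of one possible driver, assuming the bb.1 input file from above and Python's multiprocessing.Pool:

import multiprocessing as mp

if __name__ == '__main__':
    # Count the input lines, then run process_wrapper once per line index in a pool.
    with open("bb.1") as f:
        n_lines = sum(1 for _ in f)
    with mp.Pool() as pool:
        pool.map(process_wrapper, range(n_lines))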
Example #6
def qidian_fengtui_list(url):
    r=rq.get(url)
    soup=bp(r.text,'lxml')
    lst=soup.find_all('a')
    with open('fengtui.txt','a',encoding='utf-8') as tar:
        for i in lst:
            if "book.qidian.com" in i.attrs['href']:
                if i.text=='':
                    pass
                else:
                    tar.write(i.text+'\n')
                    print(i.text)
Example #7
def get_news(url, n_pages=1):
    """ Collect news from a given web page """
    news = []
    while n_pages:
        print("Collecting data from page: {}".format(url))
        response = requests.get(url)
        soup = bp(response.content, "html.parser")
        news_list = extract_news(soup)
        next_page = extract_next_page(soup)
        url = "https://news.ycombinator.com/" + next_page
        news.extend(news_list)
        n_pages -= 1
    return news
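
get_news relies on extract_news and extract_next_page, neither of which is shown here (Example #26 below contains part of an extract_news implementation). A minimal sketch of extract_next_page, assuming Hacker News still marks its "More" link with the morelink class:

def extract_next_page(parser):
    # Return the relative URL of the next page; assumes HN's <a class="morelink"> markup.
    link = parser.find('a', class_='morelink')
    return link['href'] if link else ''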
Example #8
 def start_2(self,text):
     s = bp(text,'html.parser')
     e = s.find('dd', class_='job_request')
     fromsite = '拉勾网'
     #
     positionType = s.find('li',class_='labels').string
     salary = s.find('span',class_='salary').string
     positionName = s.find('span',class_='ceil-job').string
     #
     c = ''
     for x in e.find_all('span'):
         c = c+x.string
     d = c.split('/')
     #
     city = d[1]
Example #9
def process_wrapper(lineID):
    outfile = open("aa2.tmp", "a+")
    with open("aa1") as infile:
        for i, line in enumerate(infile):
            if i != lineID:
                continue
            else:
                my_url = 'http://astrogeo.org/cgi-bin/imdb_get_source.csh?source=' + line[
                    0:(len(line) - 1)]
                uClient = uReq(my_url)
                page_html = uClient.read()
                uClient.close()
                page_parse = bp(page_html, "html.parser")
                containers = page_parse.findAll("td", {
                    "align": "RIGHT",
                    "nowrap": ""
                })
                out = open("kk.{}".format(i), "w")
                for x in range(6, len(containers), 3):
                    container = containers[x]
                    out.write(container.text)
                out.close()
                #remove the blank lines for a text file:
                with open("kk.{}".format(i), 'r') as inp:
                    with open("ll.{}".format(i), 'w') as out:
                        for line1 in inp:
                            if not line1.isspace():
                                out.write(line1)
                #Check for the nearly simultaneous and different bands observations:
                df = pd.read_csv("ll.{}".format(i),
                                 sep=' ',
                                 header=None,
                                 names=['Date', 'Band'])
                for j, k in zip(range(len(df)), range(1, len(df))):
                    d1 = df['Date'][j]
                    d2 = df['Date'][k]
                    b1 = df['Band'][j]
                    b2 = df['Band'][k]
                    if days_between(d1, d2) < 365 and b1 != b2:
                        str_line = '%s %s' % (str(i), line)
                        outfile.write(str_line)
    outfile.close()
Example #10
 def parse(self, response):
     item = ItDataItem()
     s = bp(response.body, 'html.parser')
     a = s.find_all('table', class_='newlist')
     a.pop(0)
     print a
     for x in a:
         item['fromsite'] = '智联招聘'
         item['positionType'] = x.b.get_text().encode('utf8')
         item['positionName'] = x.a.get_text().encode('utf8')
         item['salary'] = x.find_all(
             'td', class_='zwyx')[0].get_text().encode('utf8')
         item['city'] = x.find_all(
             'td', class_='gzdd')[0].get_text().encode('utf8')
         if u'学历' in x.li.find_all('span')[3].get_text():
             item['education'] = x.li.find_all('span')[3].get_text().encode(
                 'utf8')
             item['workYear'] = ''
         else:
             if u'经验' in x.li.find_all('span')[3].get_text():
                 item['workYear'] = x.li.find_all(
                     'span')[3].get_text().encode('utf8')
                 item['education'] = x.li.find_all(
                     'span')[4].get_text().encode('utf8')
             else:
                 pass
         item['jobDes'] = x.li.li.get_text().encode('utf8')
         item['company'] = x.find_all(
             'td', class_='gsmc')[0].get_text().encode('utf8')
         item['companySize'] = x.li.find_all('span')[2].get_text().encode(
             'utf8')
         item['financeStage'] = x.li.find_all('span')[1].get_text().encode(
             'utf8')
         item['industryField'] = ''
         #-------#
         item['rate'] = 0
         item['number'] = 0
         yield item
Example #11
File: app.py  Project: maomao622/job_spider
def extract(response):
    ip_port = {'ip': '', 'port': ''}
    try:
        s = bp(response, 'html.parser')
        a = s.find_all('tr')
        a.pop(0)
        for x in a:
            ip = x.find_all('td')[0].get_text()
            port = x.find_all('td')[1].get_text()
            ip_port['ip'] = ip
            ip_port['port'] = port
            if test_for_list(ip_port):
                if input('http://' + ip_port['ip'] + ':' + ip_port['port']):
                    print '插入成功'
                else:
                    print '插入失败'
                    continue
            else:
                print '验证失败'
                continue
        return True
    except:
        return False
Example #12
def start(url):
    z = 1
    for i in range(97, 153):
        urlx = url + str(i)
        #time.sleep(random.randint(29,50))
        print 'Crawl : ' + urlx
        UA = random.choice(headerss)
        headers = {'User-Agent': UA}
        proxies = {'http': '115.237.13.176:8118'}
        req = requests.get(url=urlx, headers=headers, proxies=proxies)
        #time.sleep(random.randint(29,60))
        #print req.headers
        reqe = req.content.decode('utf-8')
        bs = bp(reqe, 'lxml')
        a = bs.find_all('a', class_='xlistju')
        for x in a:
            try:
                print str(z) + ' : ' + x.string
                with open('longzu.txt', 'a+') as f:
                    f.write(str(z) + '  ' + str(x.string) + '\n' + '\n')
                    #time.sleep(random.randint(2, 8))
                    z += 1
            except:
                pass
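
headerss is used above but never defined in the snippet; it is presumably a pool of User-Agent strings for random.choice to pick from. A placeholder definition (the strings below are examples, not values from the original project):

headerss = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
    'Mozilla/5.0 (X11; Linux x86_64)',
]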
Example #13
 def parse_detail(self,response):
     s = bp(response.body,'html.parser')
     item = ItDataItem()
     a = s.find_all('div',class_='tHeader')[0]
     item['fromsite'] = '前程无忧'.encode('utf8')
     item['positionType'] = 0
     item['positionName'] = a.h1.get_text().encode('utf8')
     item['salary'] = a.strong.get_text().encode('utf8')
     item['city'] = a.span.get_text().encode('utf8')
     try:
         item['education'] = s.find_all('span',class_='sp2')[0].get_text()
         item['workYear'] = s.find_all('span',class_='sp1')[0].get_text()
     except:
         item['education'] = ''
         item['workYear'] = ''
     item['jobDes'] = a.find_all('div',class_='bmsg')[0].get_text().encode('utf8')
     item['company'] = a.find_all('p',class_='cname')[0].get_text().encode('utf8')
     item['companySize'] = a.find_all('p',class_='msg')[0].get_text().split('|')[1]
     item['financeStage'] = a.find_all('p',class_='msg')[0].get_text().split('|')[0]
     item['industryField'] = a.find_all('p',class_='msg')[0].get_text().split('|')[2]
     #-----#
     item['rate'] = ''
     item['number'] = ''
     yield item
Example #14
import smtplib
import requests
from bs4 import BeautifulSoup as bp

vanpeople_URL = 'https://www.vanpeople.com/c/s_W1/1/0/0/0/0/0.html'
vansky_URL = 'https://www.vansky.com/info/ZFBG08.html?page=1&location=&year=&title=w1'
"""Vanpeople Part"""

vanpeople_page = requests.get(vanpeople_URL)

soup = bp(vanpeople_page.content, 'html.parser')

vanpeople_macbook_title_list = []
vanpeople_macbook_date_list = []
hyper_link_list = []

date_bp = soup.find_all('div', class_='f-fl time')
title_link_bp = soup.find_all('a', href=True, class_='ahtitle')

for i in range(10):
    temp = title_link_bp[i]
    temp2 = date_bp[i]
    vanpeople_macbook_title_list.append(temp.text.strip())
    hyper_link_list.append(temp['href'])
    vanpeople_macbook_date_list.append(temp2.text.strip())

vanpeople_result = zip(vanpeople_macbook_title_list,
                       vanpeople_macbook_date_list, hyper_link_list)

for i in vanpeople_result:
    print(i)
Example #15
#coding:utf-8
import requests
import time
from bs4 import BeautifulSoup as bp

print unicode('Langzi.Fun 自动推送开启....', 'utf-8')
time.sleep(0.5)
site_url = 'https://code.boyumanong.top/baidusitemap.xml'

try:
    print unicode('Langzi.Fun 获取sitemap链接....', 'utf-8')
    data_ = bp(requests.get(site_url).content, 'lxml')
except Exception,e:
    print e

list_url = []


def get_(data):
    headers = {'User-Agent': 'curl/7.12.1 ', 'Content-Type': 'text/plain '}
    try:
        r = requests.post(
            url=
            'http://data.zz.baidu.com/urls?site=code.boyumanong.top&token=这里改写成你的token',
            data=data)
        print r.status_code
        print r.content
    except Exception, e:
        print e

Example #16
import requests
from bs4 import BeautifulSoup as bp
page = requests.get("http://dataquestio.github.io/web-scraping-pages/simple.html")
soup = bp(page.content, "html.parser")
print(page.status_code)
print("\n")

print(soup.prettify())
Example #17
# BeautifulSoup's next_siblings attribute makes it easier to collect data from tables,
# especially ones that have a header row

from urllib.request import urlopen
from bs4 import BeautifulSoup as bp

html = urlopen("http://www.pythonscraping.com/pages/page3.html")
bsObj = bp(html, "lxml")

for sibling in bsObj.find("table", {"id": "giftList"}).tr.next_siblings:
    print(sibling)

    # does not capture the header row itself (siblings exclude the element), only the rows that follow it
Example #18
def parseHtml(html):  # take an HTML page and return the div element holding the translation result
    soup = bp(html, features="lxml")
    explain = soup.find('div', {'class': "trans-container"})
    return explain
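
parseHtml only takes an HTML string, so it needs a caller that fetches the page first. A possible usage sketch, assuming the target is a Youdao web-dictionary result page (the URL and the looked-up word are assumptions, not part of the example):

import requests

# Fetch a dictionary page and print the text of the translation block, if one is found.
html = requests.get('https://dict.youdao.com/w/hello').text
explain = parseHtml(html)
if explain is not None:
    print(explain.get_text(strip=True))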
Example #19
import requests as rq
from bs4 import BeautifulSoup as bp
import itertools as it
url = 'http://www.100ppi.com/mprice/mlist-1.html'
r = rq.get(url)
soup = bp(r.text, 'lxml')
print(soup)
Example #20
from urllib.request import urlopen as uReq

#Make the website url as a variable:
my_url = 'http://astrogeo.org/cgi-bin/imdb_get_source.csh'

#Open the url:
uClient = uReq(my_url)

#Read the url and save its content as a variable:
page_html = uClient.read()

#Close the url:
uClient.close()

#Parse (i.e analyze) the html page into its parts:
page_parse = bp(page_html, "html.parser")

#See the parts of the html page within the variable 'page_parse' in
#a structured way (or organized format) so that the sub-parts of this
#variable having the required informations can be identified:
#print(bp.prettify(page_parse))

#find the sections or parts of the parsed html page which has
#all the required information and save it as a variable:
containers = page_parse.findAll("table", {"cellpadding": "3%"})

#Check the length of the variable 'containers' to know the
#number of sections of the same type has the required information.
#print(len(containers))

#Select the first section within container and save it as a variable:
container = containers[0]
Example #21
''' 
Process: scrape the disaster types from https://www.emdat.be/classification
Input:   website link: https://www.emdat.be/classification
Output:  CSV file containing the disaster types '''

import requests
import csv
from bs4 import BeautifulSoup as bp

url = 'https://www.emdat.be/classification'
res = requests.get(url)
soap = bp(res.text, 'html.parser')
a = []
rows = soap.find_all('tr')
data = []
for row in range(len(rows)):
    team_row = []
    columns = rows[row].findAll('td')
    for column in columns:
        team_row.append(column.getText())
    if len(team_row) == 4:
        data.append(team_row)
    else:
        d1 = []
        m = 4 - len(team_row)
        for i in range(0, m):
            d1.append('NONE')
        d1 = d1 + team_row
        data.append(d1)

with open('filename.csv', 'w') as csvfile:
    # write the collected rows to the CSV file
    writer = csv.writer(csvfile)
    writer.writerows(data)
Example #22
        ch_it = 0.0
        for i in range(nodes):
            old = score[i]
            score[i] = 1 - d
            for j in range(nodes):
                score[i] += d * similarty[i][j] * score[j] / total[j]
            old = score[i] - old
            if ch_it < old:
                ch_it = old
        iterations += 1


files = os.listdir(path)
for file in files:  # read all the files and generate the sentences and their corresponding vectors in fullsentences and sentences respectively
    doc = io.open(path + file, 'r', encoding='utf-8')
    soup = bp(doc.read(), 'html.parser')
    fulls = [word.string for word in (soup.find_all('p'))[1:]]
    for full in fulls:
        temp = full.encode('ascii').split('.')
        initial = len(fullsentences)
        fullsentences += temp
        sent = [x.translate(None, string.punctuation).strip() for x in temp]
        i = 0
        for line in sent:
            if line == '':
                fullsentences.pop(i + initial)
                continue
            temp = defaultdict(lambda: 0)
            j = 0
            for word in nltk.word_tokenize(line):
                temp[word.lower()] += 1
Example #23
 def parse(self, response):
     s = bp(response.body,'html.parser')
     a = s.find_all('div',class_='el')[10:]
     for x in a:
         link = x.a.get('href')
         yield scrapy.Request(link,callback=self.parse_detail)
Example #24
from bs4 import BeautifulSoup as bp
import requests
import csv


csv_file = open('./dados/fornecedores/fornecedores.csv',
                'w', newline='', encoding='utf-8')
csv_write = csv.writer(csv_file)
csv_write.writerow(['id', 'empresa', 'telefone', 'site', 'email'])

url = 'http://www.abimad.com.br/associados/pesquisar#menu-top'

r = requests.get(url)

sup = bp(r.content, 'html.parser')

todos_elementos = sup.find_all('figure')

id = 0

for elemento in todos_elementos:
    empresa = elemento.h5.text
    telefone = elemento.find(
        'i', class_='fa-phone').next_element.next_element.text
    site = elemento.find('i', class_='fa-link').next_element.next_element.text
    email = elemento.find('i', class_='fa-at').next_element.next_element.text

    csv_write.writerow([id, empresa, telefone, site, email])
    id += 1

csv_file.close()
Example #25
from urllib.request import urlopen as uReq

#Make the website url as a variable:
my_url = 'http://astrogeo.org/cgi-bin/imdb_get_source.csh'

#Open the url:
uClient = uReq(my_url)

#Read the url and save its content as a variable:
page_html = uClient.read()

#Close the url:
uClient.close()

#Parse (i.e analyze) the html page into its parts:
page_parse = bp(page_html, "html.parser")

#See the parts of the html page within the variable 'page_parse' in
#a structured way (or organized format) so that the sub-parts of this
#variable having the required informations can be identified:
#print(bp.prettify(page_parse))

#find the sections or parts of the parsed html page which has
#all the required information and save it as a variable:
containers = page_parse.findAll("table", {"cellpadding": "3%"})

#Check the length of the variable 'containers' to know the
#number of sections of the same type has the required information.
#print(len(containers))

#Select the first section within container and save it as a variable:
container = containers[0]
Example #26
import requests
from pprint import pprint as pp
from bs4 import BeautifulSoup as bp

url = 'https://news.ycombinator.com'
r = requests.get('https://news.ycombinator.com/news?p=2')
page = bp(r.content, 'html.parser')

def extract_news(parser):
    """ Extract news from a given web page """
    news_list = []
    titles = []

    tables = parser.table.find_all('table')  # find all the tables on the page
    needed_table = tables[1]  # pick the one we need
    tds = needed_table.find_all('td', attrs={'class': 'subtext'})  # the rows holding each story's metadata
    ttls = needed_table.find_all('a', attrs={'class': 'storylink'})  # all the title links
    for title in ttls:  # collect the title text
        titles.append(title.text)
    
    for i, td in enumerate(tds):
        tr = td.find_all('a')
        url = ttls[i]['href']
        title = titles[i]
        
        author = td.find('a', attrs={'class': 'hnuser'})
        if author is None:
            author = '--'
        else:
            author = author.text
Example #27
def process_third_step(lineID):
    z_out = open("cc.1", "a+")
    with open('bb.6', 'r') as f1, open('bb.7', 'r') as f2:
        for ID, line in enumerate(zip(f1, f2)):
            if ID != lineID:
                continue
            else:
                x = line[0]
                y = line[1]
                my_url = 'http://ned.ipac.caltech.edu/cgi-bin/nph-objsearch?in_csys=Equatorial&in_equinox=J2000.0&lon=' + x[
                    0:(len(x) - 1)] + '&lat=' + y[0:(
                        len(y) - 1
                    )] + '&radius=0.1&out_csys=Equatorial&out_equinox=J2000.0&obj_sort=Distance+to+search+center&of=pre_text&zv_breaker=30000.0&list_limit=32&img_stamp=YES&z_constraint=Unconstrained&z_value1=&z_value2=&z_unit=z&ot_include=ANY&in_objtypes3=Radio&nmp_op=ANY&search_type=Near+Position+Search'
                #print(my_url)
                uClient = uReq(my_url)
                page_html = uClient.read()
                uClient.close()
                page_parse = bp(page_html, "html.parser")
                containers = page_parse.findAll('td', {"bgcolor": "lightgrey"})
                if len(containers) != 0:
                    container = containers[0]
                else:
                    my_url1 = 'http://ned.ipac.caltech.edu/cgi-bin/nph-objsearch?in_csys=Equatorial&in_equinox=J2000.0&lon=' + x[
                        0:(len(x) - 1)] + '&lat=' + y[0:(
                            len(y) - 1
                        )] + '&radius=0.5&out_csys=Equatorial&out_equinox=J2000.0&obj_sort=Distance+to+search+center&of=pre_text&zv_breaker=30000.0&list_limit=32&img_stamp=YES&z_constraint=Unconstrained&z_value1=&z_value2=&z_unit=z&ot_include=ANY&in_objtypes3=Radio&nmp_op=ANY&search_type=Near+Position+Search'
                    uClient1 = uReq(my_url1)
                    page_html1 = uClient1.read()
                    uClient1.close()
                    page_parse1 = bp(page_html1, "html.parser")
                    containers1 = page_parse1.findAll('td',
                                                      {"bgcolor": "lightgrey"})
                    if len(containers1) != 0:
                        container = containers1[0]
                    else:
                        my_url2 = 'http://ned.ipac.caltech.edu/cgi-bin/nph-objsearch?in_csys=Equatorial&in_equinox=J2000.0&lon=' + x[
                            0:(len(x) - 1)] + '&lat=' + y[0:(
                                len(y) - 1
                            )] + '&radius=2.0&out_csys=Equatorial&out_equinox=J2000.0&obj_sort=Distance+to+search+center&of=pre_text&zv_breaker=30000.0&list_limit=32&img_stamp=YES&z_constraint=Unconstrained&z_value1=&z_value2=&z_unit=z&ot_include=ANY&in_objtypes3=Radio&nmp_op=ANY&search_type=Near+Position+Search'
                        uClient2 = uReq(my_url2)
                        page_html2 = uClient2.read()
                        uClient2.close()
                        page_parse2 = bp(page_html2, "html.parser")
                        containers2 = page_parse2.findAll(
                            'td', {"bgcolor": "lightgrey"})
                        if len(containers2) != 0:
                            container = containers2[0]
                        else:
                            my_url3 = 'http://ned.ipac.caltech.edu/cgi-bin/nph-objsearch?in_csys=Equatorial&in_equinox=J2000.0&lon=11h11m11.1111&lat=+11d11m11.111&radius=2.0&out_csys=Equatorial&out_equinox=J2000.0&obj_sort=Distance+to+search+center&of=pre_text&zv_breaker=30000.0&list_limit=32&img_stamp=YES&z_constraint=Unconstrained&z_value1=&z_value2=&z_unit=z&ot_include=ANY&in_objtypes3=Radio&nmp_op=ANY&search_type=Near+Position+Search'
                            uClient3 = uReq(my_url3)
                            page_html3 = uClient3.read()
                            uClient3.close()
                            page_parse3 = bp(page_html3, "html.parser")
                            containers3 = page_parse3.findAll(
                                'td', {"bgcolor": "lightgrey"})
                            container = containers3[0]
                            logging.info(
                                'Radio source not found within 2.0 arcmin radius from NED around coordinates:\n%s %s'
                                % (x, y))
                #i=i+1
                outfile = open("kk.{}".format(ID), "w")
                outfile.write(container.pre.text)
                outfile.close()
                #Removing the first 3 lines of a text file:
                lines = open("kk.{}".format(ID), "r").readlines()
                open("ll.{}".format(ID), 'w').writelines(lines[3:])
                #Selecting particular columns based on their positions in a text file table:
                inp = open("ll.{}".format(ID), 'r')
                lines = inp.readlines()
                out = open("mm.{}".format(ID), 'w')
                for line in lines:
                    out.write(line[74:83])
                    out.write(line[120:123])
                    out.write("\n")
                out.close()
                inp.close()
                #Sorting a text file table with respect to a column of numbers:
                with open("mm.{}".format(ID), 'r') as inp:
                    with open("nn.{}".format(ID), 'w') as out:
                        lines = inp.readlines()
                        lines.sort(key=lambda line: int(line.split()[1]),
                                   reverse=True)
                        out.writelines(lines)
                #Selecting a particular element of text file table:
                inp = open("nn.{}".format(ID), 'r').readlines()
                line = inp[0].split()
                if line[0] == '...':
                    z_value = 'NaN'  #We can make it '0.0' also.
                else:
                    z_value = line[0]
                z_out.write(z_value)
                z_out.write('\n')
    z_out.close()
Example #28
#coding:utf-8
import requests
import time
from bs4 import BeautifulSoup as bp

print unicode('Langzi.Fun 自动推送开启....','utf-8')
time.sleep(0.5)
site_url = 'https://yuwangi.github.io/baidusitemap.xml'

try:
    print unicode('Langzi.Fun 获取sitemap链接....','utf-8')
    # print requests.get(site_url).content
    data_ = bp(requests.get(site_url).content,"html.parser")
    print "2222"
    print data_.url
    
except Exception,e:
    print "111111"
    print e

list_url=[]

def get_(data):
    headers={'User-Agent':'curl/7.12.1 ',
             'Content-Type':'text/plain '}
    try:
        r = requests.post(url='http://data.zz.baidu.com/urls?site=yuwangi.github.io&token=kViymNiCmZjrxxoa',data=data)
        print r.status_code
        print r.content
    except Exception,e:
        print e
Example #29
import requests
from bs4 import BeautifulSoup as bp
import pandas as pd

page = requests.get("https://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168")
soup = bp(page.content, 'html.parser')
seven_day = soup.find(id="seven-day-forecast")
forecast_items = seven_day.find_all(class_="tombstone-container")
today = forecast_items[0]

#print(today.prettify())

period = today.find(class_="period-name").get_text()
short_desc = today.find(class_="short-desc").get_text()
temp = today.find(class_="temp").get_text()

#print(period)
#print(short_desc)
#print(temp)

img = today.find("img")
desc = img['title']

#print(desc)

period_tags = seven_day.select(".tombstone-container .period-name")
periods = [pt.get_text() for pt in period_tags]
#print(periods)

short_descs = [sd.get_text() for sd in seven_day.select(".tombstone-container .short-desc")]
temps = [t.get_text() for t in seven_day.select(".tombstone-container .temp")]
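
pandas is imported above but the snippet stops before it is used; the usual next step in this forecast example is to put the three parallel lists into a DataFrame. A minimal sketch, assuming the lists stay aligned:

import pandas as pd

weather = pd.DataFrame({
    "period": periods,
    "short_desc": short_descs,
    "temp": temps,
})
print(weather.head())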
Example #30
import requests
from bs4 import BeautifulSoup as bp

url = 'http://news.qq.com/'
newdata = requests.get(url).text
soup = bp(newdata, 'lxml')
titles = soup.select("div.text > em.f14 > a.linkto")
for n in titles:
    my_title = n.get_text()
    my_link = n.get("href")
    dataset = {my_title, my_link}
    print(dataset)