Example #1
import sys

import requests
from bs4 import BeautifulSoup


def main():
    """Main entry point for the script."""
    url = 'https://blog.docker.com/category/engineering/'
    response = requests.get(url)

    response_data = BeautifulSoup(response.text, "html.parser")
    if len(response_data) == 0:
        print("no data returned")
        sys.exit(1)

    blog_list = []
    # 'entry-title' is (presumably) a CSS class here, not a tag name,
    # so match on class_ rather than on the tag.
    for each_title in response_data.find_all(class_='entry-title'):
        print(each_title.text)
        blog_list.append(each_title.text)

    print('End main')
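The listing stops at the function body. If the snippet is meant to run as a standalone script, the conventional entry-point guard (an addition, not part of the source) would be:

if __name__ == '__main__':
    main()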
Example #2
def get_content(self):
    # Strip the HTML and truncate the plain text to 60 characters
    text = BeautifulSoup(self.content, 'html.parser').text
    if text and len(text) > 60:
        return text[:60] + '...'
    return text
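For comparison, the standard library's textwrap.shorten covers a similar need. It is not a drop-in replacement, since it collapses runs of whitespace and truncates at word boundaries rather than at exactly 60 characters:

import textwrap

def get_content(self):
    text = BeautifulSoup(self.content, 'html.parser').text
    # shorten() collapses whitespace and breaks on word boundaries
    return textwrap.shorten(text, width=60, placeholder='...')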
Example #3
def get_article_content():
    print("Fetching the article index...")
    content_list = []
    for page in range(1, 26):
        # Fetch and parse one listing page
        htmltext = get_conn(url_jichupian + "list_1_" + str(page) + ".html").text
        soup = BeautifulSoup(htmltext, 'html.parser')
        # The titles live in <h3> tags inside the second <ul>
        for heading in soup.find_all("ul")[1].find_all("h3"):
            content_list.append(heading.get_text())
    save_list(content_list)
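get_conn, url_jichupian, and save_list are not defined anywhere in this listing. A minimal sketch of what they might look like, assuming get_conn wraps requests.get and save_list writes one title per line (the base URL and both helper bodies below are hypothetical):

import requests

url_jichupian = "https://example.com/jichupian/"  # hypothetical base URL

def get_conn(url):
    # Assumed thin wrapper around requests.get; a real version might also
    # set headers or a User-Agent.
    resp = requests.get(url, timeout=10)
    resp.encoding = resp.apparent_encoding
    return resp

def save_list(items, path="titles.txt"):
    # Assumed to persist one entry per line
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(items))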
Example #4
    def load_page(self):
        conn = self.connect_database()
        cursor = conn.cursor()

        try:
            browser = AcademyRankInfo.browser
            # Leave time for the operator to click "load more" by hand;
            # there is no programmatic way around it here.
            print("Please click 'load more' on the page; finish within 90 s")
            time.sleep(90)
            print("Scraping has started; please do not touch the page")

            html = browser.page_source
            soup = BeautifulSoup(html, 'html.parser')
            trs = soup.select('.tbody-container tr')
            print('number of rows:', len(trs))
            #           SELECT aca_no FROM academy_info WHERE aca_name = '北京大学'
            for tr in trs:
                try:
                    aca_ranking = int(tr.select('.t1')[0].get_text())
                    aca_name = tr.select('.t2')[0].get_text()
                    query = Query()
                    aca_id = str(query.query_acaIdByacaName(aca_name))
                    cur_sql_academyRank = (
                        "insert into " + AcademyRankInfo.insert_db +
                        "(aca_id,aca_name,aca_ranking) values('" + aca_id +
                        "','" + aca_name + "','" + str(aca_ranking) + "')")
                    print(cur_sql_academyRank)
                    cursor.execute(cur_sql_academyRank)
                    conn.commit()
                except Exception as e:
                    print(repr(e))

        except TimeoutException:
            print("Failed to scrape the academy list")
Example #5
def query(self, query):
    page = 1
    thumbs = []
    print("Fetching results")
    while True:
        r = requests.get("https://alpha.wallhaven.cc/search?q=" +
                         query.replace(" ", "+") + "&page=" + str(page))
        results = BeautifulSoup(r.content, "html.parser").find_all(
            "a", {"class": "preview"})
        r.close()
        if len(results) == 0:
            break
        thumbs.extend(results)
        page += 1
        print("Fetched {} results".format(len(thumbs)))
    print("Found {} results".format(len(thumbs)))
    self.results = thumbs
    self.handle()
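Hand-assembling the query string duplicates work requests can do itself; passing params lets the library URL-encode the values, spaces included:

r = requests.get("https://alpha.wallhaven.cc/search",
                 params={"q": query, "page": page})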
Example #6
import urllib.request
from bs4 import BeautifulSoup

def lyricsMint(song, album, save):
    print(song, "Searching...")
    # Search for the song
    url = "http://www.lyricsmint.com/search?q=" + "+".join(song.split())
    req = urllib.request.Request(url=url)
    html = urllib.request.urlopen(req)
    soup = BeautifulSoup(html, 'html.parser')
    posts = soup.select('.blog-posts.hfeed .date-outer')

    # No lyrics found
    if len(posts) == 0:
        print("No Lyrics found")
        return
    lyric = None
    href = []
    song = ''.join(ch for ch in song if ch.isalnum() or ch.isspace())
    album = ''.join(ch for ch in album if ch.isalnum() or ch.isspace())

    # Keep the posts whose title matches a word of the song name
    for s in posts:
        temp = s.select('.post-title a')[0]
        for name in song.split():
            if name.lower() in temp.text.lower():
                href.append((temp.text, temp['href']))
                break

    if len(href) == 1:
        lyric = href[0]
    else:
        # Nothing matched the song name: fall back to every post title
        if len(href) == 0:
            for post in posts:
                title = post.select('.post-title a')[0]
                href.append((title.text, title['href']))

        # Narrow the candidates down by album name
        temp = []
        for text, link in href:
            for name in album.split():
                if name.lower() in text.lower():
                    temp.append((text, link))
                    break
        if len(temp) == 1:
            lyric = temp[0]
        else:
            # Ask the user to pick when the match is still ambiguous
            for i, (text, _) in enumerate(temp):
                print(i, text)
            choice = int(input("Enter the choice: "))
            lyric = temp[choice]

    url = lyric[1]
    req = urllib.request.Request(url=url)
    html = urllib.request.urlopen(req)
    soup = BeautifulSoup(html, 'html.parser')
    with open(save, 'w') as f:
        f.write(soup.select('#lyric')[0].select('p')[0].text)
    print('Downloaded!')
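A hypothetical invocation (song, album, and output path are made-up values):

lyricsMint("Tum Hi Ho", "Aashiqui 2", "tum_hi_ho.txt")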
Example #7
def get_equip(self, link):
    # The list we will return
    equip = []
    # Make the request
    response = requests.get(link)
    # Everything sits in a try/except because some pages have no team at
    # all and, given how the page is built, there is no way to check in
    # advance.
    try:
        # Grab the text block that contains the team
        prima_parte = BeautifulSoup(
            response.content, "html.parser").find(
            id="accordion-content-equipe-%c2%bb").contents[1].getText()
        # If it contains ':' the page uses the simple structure
        if ":" in prima_parte:
            # Only two pages differ from the normal layout, so special-case
            # them below instead of writing generic parsing code.
            if len(prima_parte) < 3000 and "Antonio RAMPONI" not in prima_parte:
                # Normal case: take the text after ':', split it on
                # newlines, strip each entry, drop empty cells and the
                # known exceptions, then drop single-word entries.
                parts = [val.strip()
                         for val in prima_parte.split(":")[1].split("\n")]
                excluded = {"", "Struttura semplice", "Strutture semplici",
                            "Coordinatore Infermieristico"}
                equip = [val for val in parts
                         if val not in excluded and len(val.split()) != 1]
            # For these two pages, add the names "manually"
            elif "Antonio RAMPONI" in prima_parte:
                equip = [
                    "Cristiana BOZZOLA", "Francesca FOTI",
                    "Angela GIACALONE", "Monica LEUTNER",
                    "Emanuela UGLIETTI", "Guido VALENTE"
                ]
            else:
                equip = [
                    "Patrizia NOTARI", "Matteo VIDALI",
                    "Vessellina KRUOMOVA", "Giuseppina ANTONINI",
                    "Ilaria CRESPI", "Luisa DI TRAPANI", "Lucia FRANCHINI",
                    "Roberta Rolla", "Marco Bagnati", "Patrizia PERGOLONI"
                ]
        else:
            # Without ':' a plain comma split is enough
            equip = prima_parte.strip().split(",")
    except AttributeError:
        pass
    # Workaround for one malformed page: a single long entry is really a
    # comma-separated list
    if len(equip) == 1 and len(equip[0]) > 20:
        equip = equip[0].split(',')
    # Close the connection and return the list
    response.close()
    return equip
Example #8
import csv
import requests
from itertools import zip_longest
from bs4 import BeautifulSoup
import boto3  # unused in this snippet, presumably for a later upload step
from botocore.config import Config
from botocore.exceptions import NoCredentialsError

registrationNumber = []
result = []
names = []
marks = []
missingNumbers = []
for x in range(1400000001, 1499999999):
    try:
        url = "https://www.vidyavision.com/results/ssc2014.aspx?h=" + str(x)
        page = requests.get(url).text
        soup = BeautifulSoup(page, "html.parser")
        if len(soup) != 0:
            # The page body is a single '~'-separated record
            fields = str(soup).split('~')
            registrationNumber.append(fields[0])
            names.append(fields[1])
            result.append(fields[-1])
            marks.append(fields[-3])
            print(str(x) + " OK")
        else:
            missingNumbers.append(x)
            print(str(x) + " Not OK")
    except Exception:
        print("Error " + str(x))

d = [registrationNumber, names, marks, result]
export_data = zip_longest(*d, fillvalue='')
with open('output.csv', 'w', newline='') as myfile:
    wr = csv.writer(myfile)
    wr.writerow(("registrationNumber", "names", "marks", "result"))
    wr.writerows(export_data)
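Sweeping a range of nearly 100 million IDs with bare requests.get calls opens a new connection per request. A requests.Session reuses connections across calls; a minimal sketch of the same fetch (fetch is a hypothetical helper name):

session = requests.Session()

def fetch(x):
    # The session keeps the underlying TCP connection alive between calls
    url = "https://www.vidyavision.com/results/ssc2014.aspx?h=" + str(x)
    return session.get(url, timeout=10).text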