Example #1
    def start(self):
        while True:
            # Get links from subreddit we are interested in
            for submission in self.subreddit.get_hot(limit=10):
                # Check for the cases where we will skip a submission:
                if "imgur.com/" not in submission.url:
                    continue # skip non-imgur submissions
                if 'http://imgur.com/a/' in submission.url:
                    # This is an album submission.
                    albumId = submission.url[len('http://imgur.com/a/'):]
                    htmlSource = requests.get(submission.url).text

                    soup = BeautifulSoup(htmlSource)
                    try:
                        matches = soup.select('.album-view-image-link a')
                        for match in matches:
                            imageUrl = match['href']
                            if '?' in imageUrl:
                                imageFile = imageUrl[imageUrl.rfind('/') + 1:imageUrl.rfind('?')]
                            else:
                                imageFile = imageUrl[imageUrl.rfind('/') + 1:]
                            self.checkHigherRes('http:' + match['href'])
                    except Exception as e:
                        print e

                elif 'http://i.imgur.com/' in submission.url:
                    # The URL is a direct link to the image.
                    mo = self.imgurUrlPattern.search(submission.url) # using regex here instead of BeautifulSoup because we are parsing a URL, not HTML

                    imgurFilename = mo.group(2)
                    if '?' in imgurFilename:
                        # The regex doesn't catch a "?" at the end of the filename, so we remove it here.
                        imgurFilename = imgurFilename[:imgurFilename.find('?')]

                    self.checkHigherRes(submission.url)

                elif 'http://imgur.com/' in submission.url:
                    # This is an Imgur page with a single image.
                    htmlSource = requests.get(submission.url).text # download the image's page
                    soup = BeautifulSoup(htmlSource)
                    try:
                        imageUrl = soup.select('.image div img')[0]['src']
                        if imageUrl.startswith('//'):
                            # if no schema is supplied in the url, prepend 'http:' to it
                            imageUrl = 'http:' + imageUrl
                        imageId = imageUrl[imageUrl.rfind('/') + 1:imageUrl.rfind('.')]

                        if '?' in imageUrl:
                            imageFile = imageUrl[imageUrl.rfind('/') + 1:imageUrl.rfind('?')]
                        else:
                            imageFile = imageUrl[imageUrl.rfind('/') + 1:]

                        self.checkHigherRes(imageUrl)
                    except Exception as e:
                        print e

            time.sleep(1800)
def get_elections():
    r = requests.get('http://elections.sos.state.tx.us/index.htm')
    soup = BeautifulSoup(r.text)
    return [{
        'election_code': o['value'],
        'title': o.text
    } for o in soup.select('option')]
Example #3
def getSeasonPage(seasonLink):
    selected_episode = None
    sleep(0.5)
    request_season_page = requests.get(seasonLink, headers=headers)
    soup = BeautifulSoup(request_season_page.text, 'html.parser')
    seasonEpisodes = soup.select('[class*="EpisodeListItem__title"]')
    minEntryIndex = 1
    maxEntryIndex = len(seasonEpisodes)
    for index, seasonEpisode in enumerate(seasonEpisodes):
        print(seasonEpisode.text)
        print('Index : %d' % (index + 1))

    select_number = input(
        'Please select a number from %d to %d for the desired episode: ' %
        (minEntryIndex, maxEntryIndex))

    # Keep prompting until the user enters a number inside the valid range
    while (not select_number.strip().isdigit()
           or not minEntryIndex <= int(select_number) <= maxEntryIndex):
        select_number = input(
            'Please select a number from %d to %d for the desired episode: ' %
            (minEntryIndex, maxEntryIndex))

    episodeURI = seasonEpisodes[int(select_number) - 1].find('a')

    getEpisodeSong(episodeURI['href'])
Example #4
def fetchGoal(link):
    page = fetchHtmlContent(link)
    soup = BeautifulSoup(page, 'lxml')

    description = ''

    # Main goal
    tags = soup.select('h1.goal-title')
    for tag in tags:
        description += tag.text + '\n\n'

    # Long description
    tags = soup.select('div.single-goal-long-description-text')
    for tag in tags:
        description += tag.text
    description += '\n'

    # Sub-goals
    tags = soup.select('div.container div.row div.col-8 h4')
    for tag in tags:
        description += tag.text + '\n'
    description += '\n'

    # Sub-goal descriptions
    tags = soup.select(
        'div.container div.row div.col-8 div.target-description')
    for tag in tags:
        description += tag.text
    description += '\n'

    # Quick tips
    tags = soup.select('div.single-tip-inner h4')
    for tag in tags:
        description += tag.text + '\n'

    # additional tips
    tags = soup.select('div.single-additional-tip-content h4')
    for tag in tags:
        description += tag.text + '\n'
    description += '\n'

    # Quick tip descriptions
    tags = soup.select('div.single-tip-inner div.tip-description p')
    for tag in tags:
        description += tag.text + '\n'

    # Additional tips descriptions
    tags = soup.select('div.single-additional-tip-content p')
    odd = False
    for tag in tags:
        if (odd):
            description += tag.text + '\n'
        odd = not odd

    return description
def getContent(html):
    try:
        soup = BeautifulSoup(html, "html.parser")
        result = soup.select('div.articulum > p')
        content = result[0].text.encode("utf-8")
        if len(content) != 0:
            return content
        # calling getContent(html) again with the same html would recurse forever,
        # so give up and return None instead
        return None
    except:
        print("soup exception")
Example #6
def crawler():
    counter = 1
    for url_ref in config.FULL_URLS:
        resp = requests.get(url_ref)
        if resp.status_code == 200:
            _, name = get_name(url_ref)
            # Ensure folder exists
            folder_path = create_folder([config.LYRICS_FOLDER, name])
            # Get all links
            parsed_html = BeautifulSoup(resp.content, features='html.parser')
            lyrics_links = parsed_html.select('.listalbum-item a')
            LOG.info(f"Number of {name.upper()} songs: {len(lyrics_links)}")

            lyric_paths = [extract_link(link) for link in lyrics_links]

            for lyric_path in lyric_paths:

                try:
                    writer, song_name = get_name(lyric_path)
                    if name != writer:
                        alt_folder = create_folder(
                            [config.LYRICS_FOLDER, writer])
                        lyrics_file = alt_folder.joinpath(song_name + '.txt')
                        file_found = lyrics_file.is_file()
                    else:
                        writer = name
                        lyrics_file = folder_path.joinpath(song_name + '.txt')
                        file_found = lyrics_file.is_file()

                    if not file_found:
                        # url = config.BASE_URL + lyric_path
                        text = get_lyrics(lyric_path).strip()
                        LOG.info("Downloading (" + str(counter).zfill(3) +
                                 f") [{writer}]: {song_name}")
                        counter += 1

                        with open(lyrics_file, "w") as f:
                            f.write(text)
                        time.sleep(config.CRAWLER_WAIT +
                                   config.CRAWLER_WAIT * random.random())

                except IndexError:
                    LOG.error(
                        f"Access denied while scraping: {lyric_path}\n"
                        f"Try increasing the waiting time.\n"
                        f"Stopping the scraping for now. Open the site in your browser to unblock access."
                    )
                    return
                except Exception as err:
                    print(f"ERROR: {lyric_path}: {err}")

        else:
            LOG.warning(f"Unable to load: {url_ref}")
Example #7
def getUserInfo(shared_url, **headers):
    html_doc = getHtml(shared_url, **headers)
    result = {}
    if html_doc:
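        # replace '&#' so numeric character references are not decoded by the parser;
        # the obfuscated glyph codes appear to be mapped back to real digits later via woff2tff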
        html_doc = html_doc.replace('&#', 'hzsd')
        soup = BeautifulSoup(html_doc, 'html.parser')
        header_url = soup.select("[class~=avatar]")[0]['src']
        nickname = soup.select("[class~=nickname]")[0].string
        uid = soup.select("[class~=shortid]")[0].get_text()
        uid = uid.split(" ")
        id = woff2tff(uid)
        sign = soup.select("[class~=signature]")[0].string
        dataInfo = soup.select("[class~=follow-info]")[0]
        dataInfo = splitByChinese(dataInfo.get_text())
        dataInfo = [d for d in dataInfo if len(d) > 0]
        focus = dataInfo[0].split(' ')
        focus = woff2tff(focus)
        fans = dataInfo[1].split(' ')
        fans = woff2tff(fans)
        liked = dataInfo[2].split(' ')
        liked = woff2tff(liked)
        works = soup.select(
            "[class='user-tab active tab get-list']")[0].get_text()
        works = woff2tff(works.split(' '))
        result['avatar'] = header_url
        result['nickname'] = nickname
        result['id'] = id
        result['sign'] = sign
        result['focus'] = focus
        result['fans'] = fans
        result['liked'] = liked
        result['works'] = works
    return result
def login(br):
    br.set_handle_robots(False)
    # Set cookie container
    cj = cookielib.CookieJar()
    br.set_cookiejar(cj)
    # Allow refresh of the content
    br.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
    # Set the fake user-agent and rest of headers to emulate the browser
    br.addheaders = [('User-agent','Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11'),
                     ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
                     ('Accept-Encoding', 'gzip,deflate,sdch'),                  
                     ('Accept-Language', 'en-US,en;q=0.8'),                     
                     ('Accept-Charset', 'ISO-8859-1,utf-8;q=0.7,*;q=0.3'),
                    ]

    logger = logging.getLogger("mechanize")
    logger.addHandler(logging.StreamHandler(sys.stdout))
    logger.setLevel(logging.DEBUG)

    # Set the login url for coursera, the final url for the lectures is enclosed within the url here.
    br.open('https://accounts.coursera.org/signin?course_id=973439&r=https%3A%2F%2Fclass.coursera.org%2Fprogfun-005%2Flecture&user_action=class&topic_name=Functional%20Programming%20Principles%20in%20Scala')

    soup = BeautifulSoup(br.response().get_data())

    # Fill in the login form and submit it (BeautifulSoup only parses the page;
    # mechanize is what actually fills in and submits the form)
    br.select_form(nr=0)
    br['signin-email'] = 'my-username'
    br['signin-password'] = 'my-password'
    br.submit()
Example #9
def populateGoalNames(dbname, goal_links):
    goals = []
    for ix, goal_link in enumerate(goal_links):
        print(goal_link)
        # print(fetchGoal(goal_link))
        page = fetchHtmlContent(goal_link)
        soup = BeautifulSoup(page, 'lxml')
        tags = soup.select('h1.goal-title')
        goals.append(tags[0].text)
        sql_insert = 'INSERT INTO Goals VALUES ("{}", "{}", "")'.format(
            ix + 1, tags[0].text)
        print(sql_insert)
        writeToSQliteDB(dbname, sql_insert)
Example #10
def Categories():
        BASE = 'http://thepiratebay.org/browse/'
        addDir('Search','',icon,4, 0)
        result = cache.cacheFunction(cache_cats)
        soup = BeautifulSoup(result[0], 'html.parser')
        cats = soup.select('optgroup')
        for i in cats:
            main_name = i['label']
            main_id = i.option['value'][0]+'00'
            if sort == "1":
                BASE = BASE.replace('browse', 'top')
            addDir(main_name, BASE+main_id, icon, 1, 0)
            for sub in i('option'):
                sub_name = sub.string
                sub_id = sub['value']
                if sort == "1":
                    BASE = BASE.replace('browse', 'top')
                addDir(main_name+' - '+sub_name, BASE+sub_id, icon, 1, 0)
Example #11
def getIngredientNames(index):
    # get = request.GET
    # index = int(get.get('index'))
    #
    #
    # from recipes.views import *
    # getIngredientNames(8279)
    #
    urlBase = 'http://cooking.nytimes.com/recipes/'
    while index < 2000000:
        url = urlBase + str(index)
        print index
        index += 1
        try:
            req = urllib2.Request(url.encode("utf8"),
                                  headers={
                                      'accept': '*/*',
                                      'User-Agent': "Magic Browser"
                                  })
            html = urllib2.urlopen(req, timeout=10)
        except:
            continue
        soup = BeautifulSoup(html, "html5lib")
        ingredients = soup.select('.ingredient-name span')
        for i in ingredients:
            i = i.text.lower()
            if not 'nutritional information' in i:
                if ' and ' in i:
                    i = i.split(' and ')
                elif ' or ' in i:
                    i = i.split(' or ')
                elif ', ' in i:
                    i = i.split(', ')
                else:
                    i = [i]
                for part in i:
                    if 'our' in part:
                        Ingredient.objects.get_or_create(name=part)
                    else:
                        if part != singularize(part):
                            print part, singularize(part)
                        Ingredient.objects.get_or_create(
                            name=singularize(part))
    print 'DONE'
Example #12
def getSeason(content):
    selectedSeason = None
    sleep(0.5)
    request_content_page = requests.get(tunefind_search_uri + content['uri'],
                                        headers=headers)
    soup = BeautifulSoup(request_content_page.text, 'html.parser')
    allSeasons = soup.select('[class*="MainList__item"]')

    minEntryIndex = 1
    maxEntryIndex = len(allSeasons)

    if minEntryIndex == maxEntryIndex:
        seasonLink = tunefind_search_uri + allSeasons[0].find('a')['href']
        getSeasonPage(seasonLink)
        return

    for index, season in enumerate(allSeasons):
        season_link = season.find('a')

        if season_link is None:
            continue

        print('Title: %s' % (season_link.text))
        print('Index: %d' % (int(index) + 1))

    select_number = input('Please select a number from %d to %d: ' %
                          (minEntryIndex, maxEntryIndex))

    # Keep prompting until the user enters a number inside the valid range
    while (not select_number.strip().isdigit()
           or not minEntryIndex <= int(select_number) <= maxEntryIndex):
        select_number = input('Please select a number from %d to %d: ' %
                              (minEntryIndex, maxEntryIndex))

    selectedSeason = allSeasons[int(select_number) - 1]
    seasonLink = str(tunefind_search_uri + selectedSeason.find('a')['href'])
    getSeasonPage(seasonLink)
Example #14
    def get_page_details(self, page_url, thread_id):
        self.check_html_getter()
        html = self.html_getter.get_html(page_url)
        soup = BeautifulSoup(html, "lxml")
        items = soup.select(".gl-item")
        for item in items:
            # select() returns a list, so use select_one() and read attributes from the tag's attrs dict
            item_id = item.select_one(".gl-i-wrap.j-sku-item").attrs["data-sku"]
            item_name = item.select_one(".p-name em").get_text()
            price_tag = item.select_one(".J_price.js_ys")
            if price_tag is None or not price_tag.get_text():
                continue
            item_price = price_tag.get_text()
            item_url = item.select_one(".p-name a").attrs["href"]
            img_url = item.select_one(".p-img img").attrs.get("src")
            if img_url is None:
                img_url = item.select_one(".p-img img").attrs.get("data-lazy-img")
            yield {
                'item_id': item_id,
                'item_name': item_name,
                'item_price': item_price,
                'item_url': item_url,
                'img_url': img_url,
            }
def getGroups(url, f):
  log.write('-defGroups-\n')
  groups = []
  try:
    html = requests.get(url)
    content = BeautifulSoup(html.text, "lxml")
    log.write('->Got main page content\n')
    #Selecting all the links that lead to a group page
    for link in content.select('a[href*="index.cfm?CatID="]'):
      #Selecting the groups
      if link.strong:
        try:
          group = Group()
          group.setName(link.text.replace("on Twitter", "").replace("players", "").encode('utf-8'))
          group.setLink(link['href'])
          getSubGroups(url, group, f)
          groups.append(group)
        except Exception, e:
          log.write('erro: '+str(e)+'\n')
      #Resting for 10 seconds to avoid overloading the site
      time.sleep(10)
  except Exception, e:
    log.write('erro: '+str(e)+'\n')
Example #16
from soupselect import select
from bs4 import BeautifulSoup
import re
import sys
import urllib2
import tweepy, time
import random
reload(sys)
sys.setdefaultencoding('utf-8')

url = "http://press.unian.ua/announcement/"
page = urllib2.urlopen(url)
soup = BeautifulSoup(page.read(), "lxml")

# read the date, title, link and publication time from the site
title_data = soup.select('div.other_news span.title')
time_data = soup.select('div.other_news span.time')
date_link = soup.select('div.other_news span.date')
link_data = soup.select('div.other_news ul li a')

# full_dict is a dictionary whose keys are the article links
full_dict = {"http://press.unian.ua" + a['href']: [] for a in link_data}

# a list of dictionaries; each one maps a link to a pseudo-sociologist's record ('url') to a list of all possible spellings of the surname and first name ('keywords')
fake_sociologists = [{
    'url': 'http://goo.gl/6i4K4g',
    'keywords': ["Радчук", "апро"]
}, {
    'url':
    "",
    'keywords':
def scraping_documenti():

    # Browser mechanize
    br = mechanize.Browser()

    # Empty list of scraped documents
    lista_docs = []

    """
        articles from Statistica, volume 74, no. 1
        -http://rivista-statistica.unibo.it/issue/view/467
    """
    # find the article URLs
    resp = br.open("http://rivista-statistica.unibo.it/issue/view/467")
    raw_html = resp.read()  # raw html source code
    soup = BeautifulSoup(raw_html)
    divclass = soup.findAll('div', attrs={'class': 'tocTitle'})  # list of title divs

    for div in divclass:
        results = div.findAll("a")
        for res in results:
            url = res["href"]
            data = {}
            data['url'] = url
            data['title'] = res.text
            lista_docs.append(data)



    """
        articles from volume 20, issue 11/12
        -http://www.dlib.org/dlib/november14/11contents.html
    """
    url_base = "http://www.dlib.org/dlib/november14/11contents.html"
    resp = br.open("http://www.dlib.org/dlib/november14/11contents.html")
    raw_html = resp.read()  # raw html source code
    soup = BeautifulSoup(raw_html)
    divclass = soup.findAll('p', attrs={'class': 'contents'})  # list of 'contents' paragraphs

    for div in divclass:
        results = div.findAll("a")
        for res in results:
            url = res["href"]
            url = urlparse.urljoin(url_base, url)
            data = {}
            data['url'] = url
            data['title'] = res.text
            lista_docs.append(data)


    """
        all the articles from a chosen issue on D-Lib
        -http://www.dlib.org/dlib/july15/07contents.html
    """

    url_base = "http://www.dlib.org/dlib/july15/07contents.html"
    resp = br.open(url_base)
    raw_html = resp.read()  # raw html source code
    soup = BeautifulSoup(raw_html)
    divclass = soup.findAll('p', attrs={'class': 'contents'})  # list of 'contents' paragraphs

    for div in divclass:
        results = div.findAll("a")
        for res in results:
            url = res["href"]
            url = urlparse.urljoin(url_base, url)
            data = {}
            data['url'] = url
            data['title'] = res.text
            lista_docs.append(data)


    """
        all the articles from this issue
        http://almatourism.unibo.it/issue/view/512
    """

    url_base = "http://almatourism.unibo.it/issue/view/512"
    resp = br.open("http://almatourism.unibo.it/issue/view/512")
    raw_html = resp.read()  # raw html source code
    soup = BeautifulSoup(raw_html)
    results = soup.select("div.tocTitle a")

    for res in results:
        url = res["href"]
        url = urlparse.urljoin(url_base, url)
        data = {}
        data["url"] = url
        data["title"] = res.text
        lista_docs.append(data)


    """
        all the articles from this issue
        http://antropologiaeteatro.unibo.it/issue/view/513
    """

    url_base = "http://antropologiaeteatro.unibo.it/issue/view/513"
    resp = br.open(url_base)
    raw_html = resp.read()  # raw html source code
    soup = BeautifulSoup(raw_html)
    divclass = soup.findAll('div', attrs={'class': 'tocTitle'})  # list of title divs

    for div in divclass:
        results = div.findAll("a")
        for res in results:
            url = res["href"]
            data = {}
            data['url'] = url
            data['title'] = res.text
            lista_docs.append(data)


    #print json.dumps(lista_docs)
    return lista_docs
Example #18
    def get_page_nums(self, main_url):
        html = get_html(main_url)
        soup = BeautifulSoup(html, "lxml")
        # select() returns a list, so grab the single matching element with select_one()
        page = soup.select_one(".p-skip em b")
        print "page number: " + page.get_text()
        return int(page.get_text())
def get_counties():
    r = requests.get('http://elections.sos.state.tx.us/elchist175_countyselect.htm')
    soup = BeautifulSoup(r.text)
    return [{'id': o['value'], 'name': o.text} for o in soup.select('option')]
Example #21
while not search_param.strip():
    search_param = input(
        'Please enter the name of the TV show, movie or artist: ')

search_param = urllib.parse.quote(search_param).lower()

search_results = requests.get(
    '%s/search/site?q=%s' % (tunefind_search_uri, search_param), headers=headers)

if 'no results found' in search_results.text.lower():
    print('No results found for your query')
    sys.exit(1)

soup = BeautifulSoup(search_results.text, 'html.parser')

results_table = soup.select_one('.pageSearchWrapper + .container')  # select_one returns None when nothing matches

if results_table is None:
    print('Cannot find any search results for some reason.')
    sys.exit(1)

#results_columns = results_table.find_all(class_='col-md-')
results_columns = results_table.select('[class*="col-md-"]')

if not results_columns:
    print('Cannot find any search results for some reason.')
    sys.exit(1)

for results_column in results_columns:
    if not results_column.find('a'): continue
    results_items = results_column.find_all('li')
    def getQuestions(self, page_num):
        page = self.getPageByNum(page_num)
        soup = BeautifulSoup(page)

        questions = soup.select("div.question_list ul li")
        for question in questions:
            info = self.getQuestionInfo(question)
            if info:
                # get the question's URL
                url = "http://iask.sina.com.cn/" + info[1]
                ans = self.page_spider.getAnswer(url)
                print self.getCurrentTime(), "crawling page", page_num, "- found a question:", info[2], "answer count:", info[3]
                # build the question dict and insert the question
                ques_dict = {
                    "text": info[2],
                    "questioner": info[0],
                    "date": info[4],
                    "ans_num": info[3],
                    "url": url
                }

                insert_id = self.mysql.insertData('iask_question', ques_dict)
                good_ans = ans[0]
                print self.getCurrentTime(), "saved to the database, this question's ID is", insert_id

                # if there is a best answer, insert it too
                if good_ans:
                    print self.getCurrentTime(), insert_id, "has a best answer:", good_ans[0]
                    # build the best-answer dict
                    good_ans_dict = {
                        "text": good_ans[0],
                        "answerer": good_ans[1],
                        "date": good_ans[2],
                        "is_good": str(good_ans[3]),
                        "question_id": str(insert_id)
                    }
                    # insert the best answer
                    if self.mysql.insertData("iask_answers", good_ans_dict):
                        print self.getCurrentTime(), "best answer saved successfully"
                    else:
                        print self.getCurrentTime(), "failed to save the best answer"

                # get the other answers
                other_anses = ans[1]
                # iterate over each of the other answers
                for other_ans in other_anses:
                    # if the answer exists
                    if other_ans:
                        print self.getCurrentTime(), insert_id, "has another answer:", other_ans[0]
                        # build the dict for this answer
                        other_ans_dict = {
                            "text": other_ans[0],
                            "answerer": other_ans[1],
                            "date": other_ans[2],
                            "is_good": str(other_ans[3]),
                            "question_id": str(insert_id)
                        }
                        # insert this answer
                        if self.mysql.insertData("iask_answers", other_ans_dict):
                            print self.getCurrentTime(), "answer saved successfully"
                        else:
                            print self.getCurrentTime(), "failed to save the answer"
    def _downloadFiles(self):
        message("Creating URL", INFORMATION)
        parser = self._getSelectedParser()
        url = parser.getDownloadUrl(
            self._selected,
            self._domain
        )

        message("Downloading page contains download link", INFORMATION)
        response = urllib2.urlopen(url)
        html = response.read()

        #remove CDATA
        e = etree.XML(html)
        html = etree.tostring(e)

        message("Parsing page", INFORMATION)
        soup = Soup(html)
        fileUrl = select(soup, 'fileurl')
        if len(fileUrl) == 0:
            fileUrl = soup.select('fileUrl')
        edv = select(soup, 'edv')

        if len(fileUrl) == 0:
            message("The page does not have a download link. ", ERROR)
            return False

        if len(edv) == 0:
            message("The page does not have 'edv' key", ERROR)
            return False

        #name = self.fileName[self.selected-1]
        name = parser.getFileName(self._selected)
        fileKeyName = "%s.key" % name

        edv = edv[0]

        if html.find("<edv/>") != -1:
            message("The .key file is not necessary.", INFORMATION)
        else:
            message("Creating .key file", INFORMATION)

            f = open(str(fileKeyName), 'w')
            f.write(str(edv.string))
            f.close()
            message("Created", DONE)

        fileUrl = fileUrl[0]
        downloadUrl = fileUrl.string

        # contains html entities
        if downloadUrl.find("&amp") >= 0:
            downloadUrl = HTMLParser.HTMLParser().unescape(downloadUrl)

        listUrl = downloadUrl.split('.')

        items = len(listUrl)
        iterationPosition = items - 2

        if items < 5 and len(listUrl[iterationPosition]) == 0:
            message("URL is wrong. Actual URL is: %s" % fileUrl, ERROR)
            return False

        i = 1
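        # The file is hosted as numbered parts: substitute "01", "02", ... into the URL
        # and keep downloading until a part is missing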
        while(True):
            part = str(i)
            if i < 10:
                part = "0%i" % i

            listUrl[iterationPosition] = part
            fileName = "%s.part%s" % (name, part)
            url = ".".join(listUrl)
            if self._downloadFile(url, fileName) == False:
                break
            i += 1
        if i == 1:
            self._glueNeeded = False
            if self._downloadFile(downloadUrl, name) == False:
                message("Something is wrong, probably URL", ERROR)
                return False

        message("Downloaded.", INFORMATION)
        return True
Example #24
    # Read html file to url
    try:
        htmlpage = requests.get(url).content
    except Exception, e:
        log("Error while opening:\t" + url)
        log(str(e))
        return
    try:
        soup = BeautifulSoup(htmlpage)
    except Exception, e:
        log("Error while parsing:\t" + url)
        log(str(e))
        return

    # Extract image links from html
    urls = soup.select('.album-view-image-link a')
    processed = []
    for url in urls:
        url = url['href']

        # Download image
        url = format_url(url)
        processed.append(url)

    return processed


def format_url(url):
    """
    Formats url strings by front adding "http:" if needed and removing ?s
    :param url: "//imgur.com/XYZ123?1"
    """
    # strip any "?" suffix and add the scheme if it is missing, as described above
    url = url.split('?')[0]
    return 'http:' + url if url.startswith('//') else url
def get_countylist(election_code):
    r = requests.get(
        'http://elections.sos.state.tx.us/elchist%s_countyselect.htm' %
        election_code)
    soup = BeautifulSoup(r.text)
    return [{'id': o['value'], 'name': o.text} for o in soup.select('option')]
        print(crime_id.attrs['href'])
        crime_id_unique = crime_id.attrs['href']
        crime_id_url = crime_base_url + crime_id_unique
        crime_links.append(crime_id_url)
    except:
        print("Official report hasn't been filed for {}".format(crime_id))

print(crime_links)
if len(crime_links) > 0:
    with open('crimes.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['incidentId', 'reported', 'occured', 'building', 'location', 'locationCode', 'incidentCode', 'person_status', 'person_name', 'person_RG', 'person_affiliation', 'person_crime', 'person_age', 'synopsis'])
        for crime in crime_links:
            html = urllib.request.urlopen(crime)
            soup = BeautifulSoup(html, 'lxml')
            incident_id = soup.select('#ctl00_ContentPlaceHolder1__IncidentNumber')
            incident_id = incident_id[0].text
            print(incident_id)
            reported = soup.select('#ctl00_ContentPlaceHolder1_DateReported')
            reported = reported[0].text
            print(reported)
            occured = soup.select('#ctl00_ContentPlaceHolder1_OccurredDate')
            occured = occured[0].text
            building = soup.select('#ctl00_ContentPlaceHolder1_Building')
            building = building[0].text
            location = soup.select('#ctl00_ContentPlaceHolder1_Location')
            location = location[0].text
            location_code = soup.select('#ctl00_ContentPlaceHolder1_LocationCode')
            location_code = location_code[0].text
            incident_code = soup.select('#ctl00_ContentPlaceHolder1_IncidentCode')
            incident_code = incident_code[0].text
Example #27
def get_lyrics(url):
    resp = requests.get(url)
    parsed_html = BeautifulSoup(resp.content, features="html.parser")
    text = parsed_html.select('.col-xs-12.col-lg-8.text-center')[0].text
    return text
Example #29
import requests
import pymysql
from bs4 import BeautifulSoup

a = []
conn = pymysql.connect(host='52.78.104.59',
                       user='******',
                       password='******',
                       db='',
                       charset='utf8')
curs = conn.cursor()

req = requests.get("https://www.naver.com/")  # connection
html = req.text  # grab the page source

soup = BeautifulSoup(html, 'html.parser')
sillsigan = soup.select(
    'div.ah_roll.PM_CL_realtimeKeyword_rolling_base > div > ul > li')
b = []
for sill in sillsigan:
    b.append(sill.text)  # append the text inside each tag to the list b
k = 1
list_sillsigan = []
for i in b:  # keep only the keyword part of each string in list_sillsigan
    if k > 9:
        list_sillsigan.append(i[5:-2])
    else:
        list_sillsigan.append(i[4:-2])
    k += 1
for keyword in list_sillsigan:
    a.append(keyword)

curs.execute(
Example #30
from urlparse import urljoin
from bs4 import BeautifulSoup
import requests


BASE_URL = "http://genius.com"
artist_url = "http://genius.com/artists/Andre-3000/"

# response = requests.get(artist_url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36'})

response = requests.get(artist_url)
# response.encoding = "utf-8"

text = response.text

soup = BeautifulSoup(text)
# print soup
print soup.select(".song_title")

# for song_link in soup.select("ul.song_list > li > a"):
#    link = urljoin(BASE_URL, song_link['href'])
#    response = requests.get(link)
#    text = response.text
#    soup = BeautifulSoup(text)
#    lyrics = soup.find('div', class_='lyrics').text.strip()

# tokenize `lyrics` with nltk
Example #31
pages = []

books = []
genres = []
ebook_genres = []

MAX_PAGES = 37
for npage in range(1, MAX_PAGES + 1):
    print("Scanning page {}".format(npage))
    req = requests.get(BASE_URL + "ebooks?page=" + str(npage))

    dir_html = req.text

    parsed_dir_html = BeautifulSoup(dir_html)

    for link in parsed_dir_html.select("main > ol li > a[href]"):
        pages.append(link['href'])

total = len(pages)
current = 1
current_genre = 1
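# Visit each ebook page collected above and pull out its title, author, description, download link and tags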
for page in pages:
    try:
        page_html = requests.get(BASE_URL + page)
        page_html = BeautifulSoup(page_html.text)
        title = page_html.select_one(".ebook hgroup > h1").text
        author = page_html.select_one(".ebook hgroup > h2").text
        description = page_html.select_one("#description p").text
        dl_link = page_html.select_one('#download .epub')['href']
        tags = [
            tag.text for tag in page_html.select("#reading-ease .tags li a")