Code example #1
    def _get_reviews(self, query, conn, cursor, num_scrolls=1, sleep=5, proxies=None):

        if proxies:
            handler = urllib.request.ProxyHandler(proxies)
            opener = urllib.request.build_opener(handler)
            urllib.request.install_opener(opener)

        id_ = self._get_id(query)
        url = f'https://www.imdb.com/title/{id_}/reviews/?ref_=tt_ql_urv'
        req = urllib.request.Request(url, headers=self.headers)
        resp = urllib.request.urlopen(req).read()
        bs4_page = _BeautifulSoup(resp, 'lxml')

        try:
            page_key = bs4_page.find('div', {'class': 'load-more-data'})['data-key']
        except TypeError:
            print("ERROR: Excessive requests. Increase sleep time or using proxies for longer scraping")
            exit(1)

        for review in bs4_page.findAll('div', {'class': 'review-container'}):
            # Sometimes the rating doesn't render appropriately
            try:
                rating = review.find('span', {'class': 'rating-other-user-rating'}).find('span').text
            except Exception:
                rating = None

            cursor.execute("INSERT INTO REVIEWS VALUES (?,?,?,?,?)",
                           (None, review.find('div', {'class': 'text'}).text, rating,
                            review.find('span', {'class': 'display-name-link'}).text,
                            review.find('a', {'class': 'title'}).text))
        conn.commit()
        print(f"Sleeping for {sleep} seconds to avoid excessive requests")
        time.sleep(sleep)

        for _ in tqdm(range(num_scrolls)):
            pagination_url = f'https://www.imdb.com/title/{id_}/reviews/_ajax?ref_=undefined&paginationKey={page_key}'
            # Reuse the custom headers for the AJAX pagination request as well
            req = urllib.request.Request(pagination_url, headers=self.headers)
            resp = urllib.request.urlopen(req).read()
            bs4_page = _BeautifulSoup(resp, 'lxml')
            try:
                page_key = bs4_page.find('div', {'class': 'load-more-data'})['data-key']
            except TypeError:
                print("ERROR: Excessive requests. Increase sleep time or using proxies for longer scraping")
                exit(1)

            for review in bs4_page.findAll('div', {'class': 'review-container'}):
                try:
                    rating = review.find('span', {'class': 'rating-other-user-rating'}).find('span').text
                except Exception:
                    rating = None

                cursor.execute("INSERT INTO REVIEWS VALUES (?,?,?,?,?)",
                               (None, review.find('div', {'class': 'text'}).text, rating,
                                review.find('span', {'class': 'display-name-link'}).text,
                                review.find('a', {'class': 'title'}).text))
            conn.commit()
            print(f"Sleeping for {sleep} seconds to avoid excessive requests")
            time.sleep(sleep)
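
The INSERT statements above assume a five-column REVIEWS table. Below is a minimal sketch of a schema that would satisfy them; the table name comes from the query itself, while the column names and types are assumptions rather than part of the original project:

    import sqlite3

    conn = sqlite3.connect('imdb_reviews.db')
    cursor = conn.cursor()
    # Five columns to match "INSERT INTO REVIEWS VALUES (?,?,?,?,?)": an
    # auto-assigned id (the None passed above), review text, user rating,
    # reviewer display name and review title.
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS REVIEWS (
            id INTEGER PRIMARY KEY,
            review TEXT,
            rating TEXT,
            reviewer TEXT,
            title TEXT
        )
    """)
    conn.commit()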
Code example #2
    def _update_usa(self):
        """
        Update whitelist based on usa.gov
        """
        print 'Getting agencies from usa.gov...'

        url_base = 'https://www.usa.gov'
        letters = _string.ascii_lowercase

        agency_dic = {}

        for letter in letters:
            url = url_base + '/federal-agencies/' + letter
            soup = _BeautifulSoup(_urlopen(url).read())

            links_content = [l for l in soup.find_all('ul') if 'class' in l.attrs and 'one_column_bullet' in l['class']]
            if len(links_content) == 1:
                links_list = links_content[0].find_all('a')
                for agency_html in links_list:
                    name_short = self._preprocess_name(agency_html.string)
                    agency_dic[name_short] = {'html': agency_html,
                                              'url': url_base + agency_html['href'],
                                              'name_full': agency_html.string,
                                              'source': 'usa.gov'}

                    print agency_html.string

            elif len(links_content) == 0:
                pass

            else:
                raise ValueError('Too many list elements found! Please modify the HTML parser.')

        self.agency_dictionary.update(agency_dic)
Code example #3
    def __init__(self, parent, login=False, username=None, password=None):
        self._parent = parent

        self.download_page_soup = None
        self.current_archive_id = None
        self.session = s = _requests.Session()
        self.login = l = login

        # If login requested, populate login info
        if l:
            # Set post parameters
            login_data = {
                'username': username,
                'password': password,
                'action': 'auth',
                'redirect': '/'
            }

            self._login_credentials_present(username, password)

            self._parent.throttle.throttle('page')
            r = s.post(_LOGIN_URL, data=login_data)

            if r.status_code != 200:
                raise ConnectionError(
                    f'Encountered a problem connecting '
                    f'during ArchiveDownloader initialization:'
                    f' code = {r.status_code}, login = {l}')
            # Check successful login
            soup = _BeautifulSoup(r.text, 'lxml')

            if soup(text='Log in Failed!'):
                raise NavigatorException(f'Login credentials rejected by the '
                                         f'server.')
Code example #4
    def _scrape_contents(self):
        ### Scrape the contents of the currently displayed calendar
        # Scrape the nav page
        soup = _BeautifulSoup(self._browser.page_source, 'lxml')

        # Isolate & store the calendar contents
        self._contents = soup.find('table', {'class': 'table-condensed'})
Code example #5
    def _scrape_contents(self):
        ### Scrape the contents of the currently displayed calendar
        # Scrape the nav page
        soup = _BeautifulSoup(self._browser.page_source, 'lxml')

        # Isolate & store the ATT contents
        self._contents = soup.find('table', attrs={
            'id': 'archiveTimes'
        }).find('tbody')
Code example #6
    def _get_id(self, query):

        url = 'https://www.imdb.com/find?' + urllib.parse.urlencode({'q': query, 'ref_': 'nv_sr_sm'})
        req = urllib.request.Request(url, headers=self.headers)
        response = urllib.request.urlopen(req)

        bs4_page = _BeautifulSoup(response.read(), 'lxml')
        anchor_tag = bs4_page.find('td', {'class': 'result_text'}).find('a')['href']
        return anchor_tag.split('/')[2]
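
For reference, the href scraped above has the form '/title/<imdb_id>/...', so the method returns the id segment. A tiny illustration on a hard-coded href (the example value is an assumption about IMDb's result-page markup):

    href = '/title/tt0111161/?ref_=fn_al_tt_1'   # hypothetical example value
    print(href.split('/')[2])                    # -> tt0111161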
Code example #7
def soup(content='', headers=None):
    if content.startswith('http'):
        content = url_get(content, headers=headers)

    if content:
        _soup = _BeautifulSoup(content, 'lxml')
        # soup.find('title', attrs={'itemprop': "name"})
        return _soup
    return None
Code example #8
    def _get_version_ids(self):
        base_url = 'http://uscode.house.gov/download/annualhistoricalarchives/downloadxhtml.shtml'
        soup = _BeautifulSoup(_urllib2.urlopen(base_url))

        tags = [t for t in soup.find_all('a') if '.zip' in t['href']]

        id_vals = [_re.search('[0-9]+', t['href']).group(0) for t in tags]

        return id_vals
Code example #9
def _get_feed_name(feed_id):
    s = _requests.Session()
    with s:
        r = s.get(_FEED_URL_STEM + feed_id)
        if r.status_code != 200:
            raise ConnectionError(f'Problem connecting: {r.status_code}')

        soup = _BeautifulSoup(r.text, 'lxml')

        return soup.find('span', attrs={'class': 'px13'}).text
Code example #10
    def _update_wikipedia(self):
        # do a little bit of name preprocessing here too
        from requests import ConnectionError
        from wikipedia import PageError

        print('Getting data from Wikipedia...')

        page_current = _wikipedia.page('List_of_federal_agencies_in_the_United_States')
        html = page_current.html()
        subset = html[_re.search('<h2>.*?Legislative Branch', html).start():_re.search('<h2>.*?See also', html).start()]
        soup = _BeautifulSoup(subset)

        links = soup.find_all(lambda x: x.name == 'a' and x.has_attr('href') and '/wiki/' in x['href'] and
                              'File:' not in x['href'])

        agency_dic = {self._preprocess_name(link['title']): {'html': link,
                                                             'url': 'https://en.wikipedia.org' + link['href'],
                                                             'name_full': link['title'],
                                                             'source': 'wikipedia'}
                      for link in links}

        category_pages = ['https://en.wikipedia.org/w/api.php?action=query&list=categorymembers&' +
                          'cmtitle=Category:Defunct_agencies_of_the_United_States_government&cmlimit=500&format=json',
                          'https://en.wikipedia.org/w/api.php?action=query&list=categorymembers&' +
                          'cmtitle=Category:Committees_of_the_United_States_Senate&cmlimit=500&format=json',
                          'https://en.wikipedia.org/w/api.php?action=query&list=categorymembers&' +
                          'cmtitle=Category:Joint_committees_of_the_United_States_Congress' +
                          '&cmlimit=500&format=json',
                          'https://en.wikipedia.org/w/api.php?action=query&list=categorymembers&' +
                          'cmtitle=Category:Committees_of_the_United_States_House_of_Representatives' +
                          '&cmlimit=500&format=json',
                          ]

        for category_page in category_pages:
            content_defunct = _json.loads(_urlopen(category_page).read())

            for result in content_defunct['query']['categorymembers']:
                if result['ns'] == 0:
                    url_defunct = 'https://en.wikipedia.org/wiki/' + _re.sub(' ', '_', result['title'])
                    print(result['title'])
                    try:
                        page_defunct = _wikipedia.page(result['title'])

                        name_short = self._preprocess_name(result['title'])

                        agency_dic[name_short] = {'html': page_defunct.html(),
                                                  'url': url_defunct,
                                                  'name_full': result['title'],
                                                  'source': 'wikipedia'}

                    except (ConnectionError, PageError):
                        print('Failed to get agency HTML!')

        self.agency_dictionary.update(agency_dic)
Code example #11
    def get_download_soup(self, archive_id):
        self.current_archive_id = archive_id
        s = self.session

        self._parent.throttle.throttle()
        r = s.get(_ARCHIVE_DOWNLOAD_STEM + archive_id)
        if r.status_code != 200:
            raise ConnectionError(
                f'Problem connecting while getting soup from '
                f'{_ARCHIVE_DOWNLOAD_STEM + archive_id}: {r.status_code}')

        self.download_page_soup = _BeautifulSoup(r.text, 'lxml')

        return self.download_page_soup
Code example #12
    def login(self, login, password):
        """
        Log in to your router - you need to do this before you request any other page.

        :param login: Your login. It can't be anything other than 'admin', so...
        :param password: Your admin password
        """

        # Get main site to get public RSA key
        login_r = self._session.get(self._url + '/loginpage.htm', verify=False)
        if not login_r.ok:
            raise ConnectionError
        # Scrape the key
        login_soup = _BeautifulSoup(login_r.content, features='html.parser')
        pub_key_txt = login_soup.find(id='divpem').text
        pub_key_txt = pub_key_txt.replace('\n', '').strip()

        # This is what the site does (in aes.js) before sending password
        # Why does it generate 16 random digits before password?
        # ¯\_(ツ)_/¯
        pwdv = password + ':' + ''.join(
            _random.choice(_string.digits) for i in range(16))

        # We need to use JavaScript engine
        ctx = _py_mini_racer.MiniRacer()
        # ...to execute code that encrypts password before sending.
        # I couldn't get it working in Python, so I just stole all required JS
        # and execute it :)
        ctx.eval((_pathlib.Path(__file__).parent /
                  'stolen_javascript.js').read_text())
        pwd_hash = ctx.eval(f"""
        var key = RSA.getPublicKey("{pub_key_txt}");
        RSA.encrypt("{pwdv}", key);
        """)

        # Get cookies by logging in
        auth_r = self._session.post(
            self._url +
            f'/log/in?un={_urllib_parse.quote(login)}&pw={_urllib_parse.quote(pwd_hash)}'
            f'&rd=%2Fuir%2Fdwrhome.htm&rd2=%2Fuir%2Floginpage.htm&Nrd=1&Nlmb=',
            verify=False)

        def _is_redirect_ok(location: str):
            question = location.index('?')
            return location[:question] != '/uir/loginpage.htm'

        if not auth_r.ok or not _is_redirect_ok(
                auth_r.history[0].headers['Location']):
            raise ConnectionError
Code example #13
File: lyricwikia.py  Project: enricobacis/lyricwikia
def get_all_lyrics(artist, song, linesep='\n', timeout=None):
    """Retrieve a list of all the lyrics versions of a song."""
    url = create_url(artist, song)
    response = _requests.get(url, timeout=timeout)
    soup = _BeautifulSoup(response.content, "html.parser")
    lyricboxes = soup.findAll('div', {'class': 'lyricbox'})

    if not lyricboxes:
        raise LyricsNotFound('Cannot download lyrics')

    for lyricbox in lyricboxes:
        for br in lyricbox.findAll('br'):
            br.replace_with(linesep)

    return [lyricbox.text.strip() for lyricbox in lyricboxes]
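
A minimal usage sketch for the function above, assuming it is exposed by the lyricwikia package together with the LyricsNotFound exception; the artist and song are purely illustrative:

    import lyricwikia

    try:
        # One string per lyrics version found on the page
        versions = lyricwikia.get_all_lyrics('Led Zeppelin', 'Stairway to Heaven')
        print(f'{len(versions)} version(s) found')
    except lyricwikia.LyricsNotFound:
        print('No lyrics found for this song')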
Code example #14
File: new_lyricwikia.py  Project: Swat712/lyricwikia
def get_all_lyrics(artist, song, language='', linesep=' \n ', timeout=None):
    """Retrieve a list of all the lyrics versions of a song."""
    url = create_url(artist, song, language)
    response = _requests.get(url, timeout=timeout)
    soup = _BeautifulSoup(response.content, "html.parser")
    lyricboxes = soup.findAll('div', {'class': 'lyricbox'})

    if not lyricboxes:
        raise LyricsNotFound('Cannot download lyrics')

    for lyricbox in lyricboxes:
        for br in lyricbox.findAll('br'):
            br.replace_with(linesep)

    return [lyricbox.text.strip() for lyricbox in lyricboxes]
Code example #15
    def _get_feed_name(self, feed_id):
        s = _requests.Session()
        with s:
            r = s.get(_FEED_URL_STEM + feed_id)
            if r.status_code != 200:
                raise ConnectionError(
                    f'Problem connecting while getting feed name: '
                    f'{r.status_code}')

            soup = _BeautifulSoup(r.text, 'lxml')
            try:
                feed_name = soup.find('span', attrs={'class': 'px13'}).text
            except AttributeError:
                raise NavigatorException(f'Invalid feed_id ({feed_id}).')

            self.feed_name = feed_name
Code example #16
    def _get_ids(self):
        id_vals = []
        years = range(1988, 2017)

        for year in years:
            soup = _BeautifulSoup(
                _urlopen(
                    'http://www.legislation.gov.uk/ukpga/{0}'.format(year)))
            n_results = _re.search('has returned ([0-9]+) results',
                                   soup.text.lower()).group(1)
            id_vals += [str(year) + '_' + str(i)
                        for i in range(1, int(n_results) + 1)]

        return id_vals
Code example #17
def bgip():
    'http://www.89ip.cn/index_<2,2>.html'
    load = _plug.load('bgip')
    allip = set()
    for html in load:
        soup = _BeautifulSoup(html, 'lxml')
        tbody = soup.find('tbody')
        for tr in tbody:
            l = []
            for td in tr:
                for td_str in td:
                    l.append(td_str.strip())
            result = _plug.ip_match(l, protocol='http')
            if result:
                allip.add(result)
    return allip
Code example #18
    def _get_data(self, publication_id):

        max_attempts = 10
        attempts = 0

        xml_content = None
        soup = None

        while attempts < max_attempts:
            search_id = _re.sub('_', '/', publication_id)
            try:
                xml_content = _urlopen(
                    'http://www.legislation.gov.uk/ukpga/{0}/data.xml'.format(
                        search_id)).read()
                soup = _BeautifulSoup(xml_content, 'xml')
                break
            except:
                attempts += 1

        if 'amendment' in soup.title.text.lower():
            amend = True
        else:
            amend = False

        if 'repeal' in soup.title.text.lower():
            repeal = True
        else:
            repeal = False

        if soup.EnactmentDate is not None:
            date = soup.EnactmentDate['Date']
        elif soup.PrimaryPrelims is not None:
            date = soup.PrimaryPrelims['RestrictStartDate']
        else:
            date = None
            print 'warning! No date found.'

        meta = _format_meta_entry(country=u'united_kingdom',
                                  title=soup.title.text,
                                  id=publication_id,
                                  date=date,
                                  type=u'annual',
                                  xml=xml_content,
                                  amendment=amend,
                                  repealed=repeal)

        return meta
Code example #19
def xici():
    'http://www.xicidaili.com/nn/<1,2>'
    load = _plug.load('xici')
    allip = set()
    for html in load:
        soup = _BeautifulSoup(html, 'lxml')
        td = soup.find_all('td', class_='country')
        for b in td:
            l = []
            for d in b.next_siblings:
                if d.string != '\n':
                    l.append(str(d.string))
            if l != []:
                result = _plug.ip_match(l)
                if result:
                    allip.add(result)
    return allip
Code example #20
def llip():
    'http://www.66ip.cn/<2,2>.html'
    load = _plug.load('llip')
    allip = set()
    for html in load:
        soup = _BeautifulSoup(html, 'lxml')
        tab = soup.find('table', bordercolor='#6699ff')
        tr_all = tab.find_all('tr')
        for tr in tr_all:
            l = []
            for td in tr:
                for td_str in td.children:
                    l.append(td_str)
            result = _plug.ip_match(l, protocol='http')
            if result:
                allip.add(result)
    return allip
Code example #21
    def __scrape_nav_page(self):
        if self.verbose: print('\tScraping navigation page...')
        self.__check_browser()

        # Wait for page to render
        element = _WebDriverWait(self.browser, 10).until_not(
            _EC.text_to_be_present_in_element(
                (_By.XPATH, _first_uri_in_att_xpath), self.current_first_uri))

        self.current_first_uri = self.__get_current_first_uri()

        # Scrape page content
        soup = _BeautifulSoup(self.browser.page_source, 'lxml')

        # Isolate the calendar and the archiveTimes table
        self.calendar_soup = soup.find('table', {'class': 'table-condensed'})
        self.att_soup = soup.find('table', attrs={
            'id': 'archiveTimes'
        }).find('tbody')
Code example #22
    def _update_register(self):
        print 'Getting agencies from the federal register...'

        url_base = 'https://www.federalregister.gov/agencies'
        soup = _BeautifulSoup(_urlopen(url_base))
        links = soup.find_all(lambda x: x.name == 'li' and x.has_attr('data-filter-live') and not x.has_attr('class'))

        agency_dic = {}

        for link in links:
            agency = link.find('a')
            name_short = self._preprocess_name(agency.string)
            agency_dic[name_short] = {'html': agency,
                                      'url': agency['href'],
                                      'name_full': agency.string,
                                      'source': 'federal_register'}

            print agency.string

        self.agency_dictionary.update(agency_dic)
Code example #23
File: new_lyricwikia.py  Project: Swat712/lyricwikia
def get_lyrics_for_all_languages(artist, song, linesep='\n', timeout=None):
    """Retrieve the lyrics of the song in all languages available"""
    url = create_url(artist, song, '')
    response = _requests.get(url, timeout=timeout)
    soup = _BeautifulSoup(response.content, "html.parser")
    lyricboxes = soup.find('table', {'class': 'banner banner-song'})
    result = dict()
    result['default'] = get_lyrics_by_language(artist, song, '',
                                               linesep=linesep,
                                               timeout=timeout)

    for a in lyricboxes.findAll('a', href=True):
        result[a.getText()] = get_lyrics_by_language(artist, song,
                                                     a['href'].split('/')[-1],
                                                     linesep=linesep,
                                                     timeout=timeout)

    return result
Code example #24
File: new_lyricwikia.py  Project: Swat712/lyricwikia
def parse_page_now(url, df, timeout=None):
    response = _requests.get(url, timeout=timeout)
    soup = _BeautifulSoup(response.content, "html.parser")
    data = soup.findAll('li', attrs={'class': 'category-page__member'})

    if not data:
        raise LanguageNotFound('No such language')

    for div in data:
        links = div.findAll('a')
        for a in links:

            # str.strip() removes a set of characters, not a prefix, so drop '/wiki/' explicitly
            lyric_link = a['href'].replace('/wiki/', '', 1)
            artist = lyric_link.split(":")[0]
            title = lyric_link.split(":")[1]
            if (artist == "Category"):
                continue
            df = df.append({
                'Artist': artist,
                'Title': title
            },
                           ignore_index=True)

    if soup.find('div', attrs={'class': 'category-page__pagination'}) is None:
        return df

    next_page_text = soup.find(
        'div', attrs={
            'class': 'category-page__pagination'
        }).find(
            'a',
            attrs={
                'class':
                'category-page__pagination-next wds-button wds-is-secondary'
            })
    if next_page_text is not None:
        next_page_url = next_page_text['href']
        df = parse_page_now(next_page_url, df, timeout=timeout)
    return df
Code example #25
def scrape(login: str, password: str):
    session = _requests.Session()
    login_site_r = session.get('https://mobilevikings.pl/en/account/login/')
    login_soup = _BeautifulSoup(login_site_r.content, features='html.parser')
    csrf_middle_token = login_soup.find('input',
                                        attrs={'name':
                                               'csrfmiddlewaretoken'})['value']
    payload = {
        'csrfmiddlewaretoken': csrf_middle_token,
        'next': '/mysims/',
        'username': login,
        'password': password
    }
    login_r = session.post(
        'https://mobilevikings.pl/en/account/login/',
        data=payload,
        headers={'Referer': 'https://mobilevikings.pl/en/account/login/'})
    json_r = session.get(
        'https://mobilevikings.pl/mysims/sim/148128/balance/json/',
        headers={'Referer': login_r.url})

    return _json.loads(json_r.content)
Code example #26
def scrape(no_cache=False, cache_file_name='vote-results.json', cache_expire_time=24 * 60 * 60):
    """
    This function downloads the site http://ewybory.eu/sondaze and scrapes it for party support data.
    It then saves the results in a cache file (named vote-results.json by default)
    and uses that file for the next cache_expire_time seconds (24 hours by default).

    If a party's support can't be read for some reason, it is reported as -1.

    :param no_cache: Skip the cache entirely (neither read nor write the cache file)
    :param cache_file_name: Alternative cache file path
    :param cache_expire_time: Time (in seconds) after which the cache file is discarded and the site is downloaded again
    :return: Dict with results
    """
    result = {
        'success': False,
        'support': {
            'pis': -1,
            'ko': -1,
            'lewica': -1,
            'konfederacja': -1,
            'psl': -1,
            'polska2050': -1
        },
        'growth': {
            'pis': 0,
            'ko': 0,
            'lewica': 0,
            'konfederacja': 0,
            'psl': 0,
            'polska2050': 0
        }
    }

    get_site = False
    try:
        if no_cache:
            raise ValueError
        modify = _os.path.getmtime(cache_file_name)
        if modify < _time.time() - cache_expire_time:  # If the cache file is older than cache_expire_time
            raise IOError
        with open(cache_file_name, 'rb') as f:
            print('Got results from cache')
            return _json.load(f)
    except FileNotFoundError:
        print('Cache file not found!')
        get_site = True
    except IOError:
        print('Cache file too old!')
        get_site = True
    except ValueError:
        print('No-cache set to true, not touching cache files!')
        get_site = True

    if get_site:
        print('Getting the site from internet...')
        res = _get_site()
        if res is None:
            print("Can't get the site! Most probably no internet :/")
            return result
        soup = _BeautifulSoup(res.content, 'html.parser')

    print('Parsing with soup...')

    div = soup.find('div', class_='entry-content clearfix')
    table = div.find('table')
    tr = table.find('tr')
    ths = tr.find_all('th', class_='name_party_poll')

    name_party = {
        'pis': 'pis',
        'ko': 'ko',
        'lewica': 'lewica',
        'konfederacja': 'konfederacja',
        'psl': 'psl',
        'polska 2050': 'polska2050',
        'n.solidarność': 'nowasolidarnosc'
    }

    for th in ths:
        party = name_party.get(_get_name(th).lower(), None)
        if party is None:
            print("Looks like some unknown party is on the graph? "
                  "It's possible that this means that this repo needs an update - "
                  "feel free to open an issue on GitHub about that :)")
            continue
        sup = _get_sup(th)
        result['support'][party] = sup[0]
        result['growth'][party] = sup[1]

    result['success'] = True

    if not no_cache:
        with open(cache_file_name, 'w') as f:
            _json.dump(result, f)

    return result
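
A hedged usage sketch for the poll scraper above; the keys follow the result dict built in the function, and a support value of -1 means it could not be read:

    results = scrape(cache_file_name='vote-results.json')
    if results['success']:
        for party, support in results['support'].items():
            print(f"{party}: {support}% (growth: {results['growth'][party]})")
    else:
        print('Scrape failed; all support values default to -1')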
Code example #27
    def _get_data(self, publication_id):
        import bs4

        search_term = _re.sub('_', '/', publication_id)

        text_soup = None
        text_content = None

        try:
            text_url = 'https://www.congress.gov/bill/{0}/text'.format(
                search_term)
            text_soup = _BeautifulSoup(_urlopen(text_url))
        except:
            pass

        if text_soup is not None:
            if text_soup.find('pre') is not None:
                text_content = str(text_soup.find('pre'))
            else:
                text_content = str(
                    text_soup.find('table',
                                   attrs={'class': 'lbexTableStyleEnr'}))

        meta_url = 'https://www.congress.gov/bill/{0}/all-info'.format(
            search_term)
        meta_soup = _BeautifulSoup(_urlopen(meta_url))

        title = _re.search(
            ': (.*)',
            meta_soup.find('meta', attrs={'name': 'dc.title'})['content'])
        if title is not None:
            title = title.group(1)

        date = meta_soup.find('meta', attrs={'name': 'dc.date'})['content']
        sponsor = meta_soup.find('meta', attrs={'name': 'dc.creator'})
        if sponsor is not None:
            sponsor = sponsor['content']

            sponsor_party = _re.search(_re.escape(sponsor) + r' \[([A-Z])', meta_soup.text)
            if sponsor_party is not None:
                sponsor_party = sponsor_party.group(1)
        else:
            sponsor_party = None

        cosponsors = [
            tag.text for tag in meta_soup.find_all('a', href=True)
            if 'member/' in tag['href'] and sponsor not in tag.text
        ]

        policy_area = _re.search(r'Policy Area:\s*(.*)', meta_soup.text)
        if policy_area is not None:
            policy_area = policy_area.group(1)

        committee_entries = meta_soup.find_all('tr', class_='committee')
        referred = [entry.find('th').text for entry in committee_entries]
        hearings_held = []

        for entry in committee_entries:
            committee_name = entry.find('th').text
            actions = [entry.find_all('td')[1].text]

            entry = entry.next_sibling
            while type(entry) == bs4.element.Tag and (
                    'class' not in entry.attrs
                    or 'committee' not in entry['class']):
                actions.append(entry.find_all('td')[1].text)
                entry = entry.next_sibling

                if type(entry) == bs4.element.NavigableString:
                    break

            hearings = [action for action in actions if 'Hearing' in action]
            hearings_held += [committee_name] * len(hearings)

        if 'amend' in title:
            amendment = True
        else:
            amendment = False

        if 'resolution' in publication_id:
            subtype = u'resolution'
        else:
            subtype = u'law'

        meta = _format_meta_entry(country=u'united_states',
                                  title=title,
                                  id=publication_id,
                                  date=date,
                                  type=u'annual',
                                  subtype=subtype,
                                  amendment=amendment,
                                  sponsor=sponsor,
                                  sponsor_party=sponsor_party,
                                  cosponsors=cosponsors,
                                  referred=referred,
                                  hearings=hearings_held,
                                  policy_area=policy_area,
                                  html=text_content)

        return meta
Code example #28
import logging

from bs4 import BeautifulSoup as _BeautifulSoup

from v2ex.errors import Need2FA, NeedLogin, SigninFailed
from v2ex.utils import parse_cookies

DEFAULT_HEADERS = {
    'referer':
    'https://www.v2ex.com/',
    'accept-language':
    'en,zh;q=0.9',
    'accept-encoding':
    'gzip, deflate, br',
    'user-agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
}

log = logging.getLogger(__name__)
BeautifulSoup = lambda text: _BeautifulSoup(text, features='lxml')


def check_session(response):
    if '你要查看的页面需要先登录' in response.text:
        raise NeedLogin
    if '两步验证登录' in response.text:
        raise Need2FA


def logged_in(response):
    if response.url.path == '/2fa':
        return False
    if '确定要从 V2EX 登出?' in response.text:
        return True
    return False
Code example #29
    def _get_data(self, publication_id):
        from urllib2 import HTTPError

        def get_xml(xml_link):
            try:
                xml_data = _urlopen(xml_link).read()

                if 'xml' in xml_data[0:100]:
                    return xml_data
                else:
                    return None

            except HTTPError:
                return None

        def get_html(html_link):
            html_response = _urlopen(html_link)
            html_data = html_response.read()

            return html_data

        parl_url = 'http://www.parl.gc.ca'

        html_base = 'http://www.parl.gc.ca/HousePublications/Publication.aspx?Language=E&Mode=1&DocId={0}&Col=1'

        html_docs = []
        xml_docs = []

        initial_html = _urlopen(html_base.format(publication_id)).read()
        initial_soup = _BeautifulSoup(initial_html)
        full_doc_links = [
            tag for tag in initial_soup.find_all('a')
            if 'Click here for the entire document' in tag.text
        ]
        next_links = [
            tag for tag in initial_soup.find_all('a')
            if 'Next Page' in repr(tag)
        ]

        full_link_success = False
        if len(full_doc_links) == 1:
            url = parl_url + full_doc_links[0]['href']
            try:
                print 'full link...'
                print publication_id, url
                html_local = get_html(url)
                xml_local = get_xml(url + '&xml=true')

                html_docs.append(html_local)
                xml_docs.append(xml_local)

                full_link_success = True
            except:
                pass

        next_link_success = False
        if full_link_success is False and len(next_links) > 0:
            try:
                while len(next_links) > 0:
                    file_regex = _re.search('File=[0-9]+',
                                            next_links[0]['href'])

                    # occasionally pages are malformed with "next" links that don't actually go anywhere
                    if file_regex is not None:
                        url = html_base.format(
                            publication_id) + '&' + file_regex.group(0)
                        print 'next links...'
                        print publication_id, url

                        html_docs.append(get_html(url))
                        xml_docs.append(get_xml(url + '&xml=true'))

                        next_links = [
                            tag for tag in _BeautifulSoup(
                                html_docs[-1]).find_all('a')
                            if 'Next Page' in repr(tag)
                        ]
                    else:
                        break
                next_link_success = True
            except:
                pass

        if full_link_success is False and next_link_success is False:
            print 'failsafe'
            print publication_id, html_base
            xml_docs.append(
                get_xml(html_base.format(publication_id) + '&xml=true'))
            html_docs.append(initial_html)

        self.data[publication_id].update({'html': html_docs, 'xml': xml_docs})
        return self.data[publication_id]
Code example #30
    def _get_ids(self):
        """ Note structure here is a little different than later classes - metadata and IDs retrieved at same time. """
        def get_meta(bill_content):
            """ Get metadata search results based on a given URL. """

            title = bill_content.BillTitle.find(name='Title',
                                                language='en').text

            if 'amend' in title.lower():
                amend = True
            else:
                amend = False

            # rarely, no published version of a bill is available
            publication_tags = [
                t for t in bill_content.find_all('Publication')
                if t.find(name='Title', language='en').text == 'Royal Assent'
            ]
            if len(publication_tags) == 1:
                publication_id = publication_tags[0]['id']
            else:
                publication_id = None

            # all other metadata appear to be consistently present
            date = bill_content.Events.LastMajorStageEvent.Event['date']
            session = bill_content.ParliamentSession['parliamentNumber']
            subtype = bill_content.BillType.find(name='Title',
                                                 language='en').text
            sponsor = bill_content.SponsorAffiliation.Person.FullName.text
            sponsor_party = bill_content.SponsorAffiliation.PoliticalParty.find(
                name='Title', language='en').text
            majority_party = bill_content.PrimeMinister.PoliticalParty.find(
                name='Title', language='en').text

            committee_tags = bill_content.find_all(name='Committee',
                                                   accronym=True)
            committee_names = [t['accronym'] for t in committee_tags]
            committee_data = {
                c: committee_names.count(c)
                for c in set(committee_names)
            }

            metadata = _format_meta_entry(country=u'canada',
                                          title=title,
                                          id=publication_id,
                                          date=date,
                                          session=session,
                                          type=u'annual',
                                          subtype=subtype,
                                          amendment=amend,
                                          sponsor=sponsor,
                                          sponsor_party=sponsor_party,
                                          majority_party=majority_party,
                                          hearings=committee_data)

            return metadata

        base_url = 'http://www.parl.gc.ca{0}'
        bill_types = [
            '/LegisInfo/Result.aspx?BillType=Senate%20Government%20Bill' +
            '&BillStatus=RoyalAssentGiven&Language=E&Mode=1',
            '/LegisInfo/Result.aspx?BillType=Private%20Member%E2%80%99s%20Bill'
            + '&BillStatus=RoyalAssentGiven&Language=E&Mode=1',
            '/LegisInfo/Result.aspx?BillType=House%20Government%20Bill' +
            '&BillStatus=RoyalAssentGiven&Language=E&Mode=1',
            '/LegisInfo/Result.aspx?BillType=Senate%20Public%20Bill'
        ]

        searches = []
        for bill_type in bill_types:
            search_content = _BeautifulSoup(
                _urlopen(base_url.format(bill_type)))
            sessions = [
                _re.sub('&Page=1', '&download=xml', tag['href'])
                for tag in search_content.find_all('a') if _re.search(
                    r'[0-9]{2}-[0-9]\s*\([0-9]+\)', tag.text) is not None
            ]
            searches += sessions

        id_vals = []
        for s in searches:
            url = base_url.format(s)
            content = _BeautifulSoup(_urlopen(url).read(), features='xml')

            bills = content.find_all('Bill')
            for bill in bills:
                meta = get_meta(bill)

                if meta['id'] not in self.log_data['Annual']['Canada']:
                    id_vals.append(meta['id'])
                    self.data[meta['id']] = meta

        return id_vals
Code example #31
    def _get_ids(self):
        # URL corresponding to the following search:
        # - All congresses from 1989 forward (first date with full bill text and metadata)
        # - Only legislation that can become law
        # - Only public bills/laws
        # - Only actual laws

        max_attempts = 10

        id_vals = []

        search_url = 'https://www.congress.gov/advanced-search/legislation?query=%7B%22congresses%22%3A%5B%22114%22%2C'\
                     '%22113%22%2C%22112%22%2C%22111%22%2C%22110%22%2C%22109%22%2C%22108%22%2C%22107%22%2C%22106%22%2C'\
                     '%22105%22%2C%22104%22%2C%22103%22%2C%22102%22%2C%22101%22%5D%2C%22restrictionType%22%3A%22field%'\
                     '22%2C%22restrictionFields%22%3A%5B%22billSummary%22%2C%22allBillTitles%22%5D%2C%22wordVariants%2'\
                     '2%3A%22true%22%2C%22legislationTypes%22%3A%5B%22hr%22%2C%22hjres%22%2C%22s%22%2C%22sjres%22%5D%2'\
                     'C%22legislationScope%22%3A%22Public%22%2C%22legislativeAction%22%3A%22115%22%2C%22legislativeAct'\
                     'ionWordVariants%22%3A%22true%22%2C%22sponsorTypes%22%3A%5B%22sponsor%22%2C%22sponsor%22%5D%2C%22'\
                     'sponsorTypeBool%22%3A%22Or%22%2C%22committeeBoolType%22%3A%22Or%22%2C%22legislationCanBecomeLaw%'\
                     '22%3A%22true%22%2C%22sponsorState%22%3A%22One%22%2C%22sourceTab%22%3A%22legislation%22%7D'

        driver = _webdriver.Firefox()
        driver.get(search_url)

        n_results = _Select(driver.find_element_by_name('pageSize'))
        n_results.select_by_visible_text('250 per page')

        while True:
            soup = _BeautifulSoup(driver.page_source)
            result_tags = [
                t for t in soup.find_all('span')
                if 'class' in t.attrs and 'result-heading' in t['class']
            ]
            result_urls = list(set([t.a['href'] for t in result_tags]))

            new_ids = [
                _re.search(r'bill/(.*?)\?', url).group(1) for url in result_urls
            ]
            new_ids = [_re.sub('/', '_', e) for e in new_ids]
            print(new_ids)
            id_vals += new_ids

            attempts = 0
            while attempts < max_attempts:
                try:
                    next_button = driver.find_element_by_class_name('next')
                    next_button.click()
                    _sleep(5)
                    break
                except:
                    print('reattempting pageforward...')
                    _sleep(5)
                    attempts += 1

            attempts = 0
            closed = False
            while attempts < max_attempts:
                print('checking closure...')
                try:
                    closure_check = driver.find_element_by_class_name(
                        'next').get_attribute('outerHTML')
                    if 'next off' in closure_check:
                        closed = True

                    break
                except:
                    attempts += 1
                    print('reattempting closure check...')
                    _sleep(5)

            if closed:
                break

        return id_vals