コード例 #1
0
def error_check_prepare_netloc():
    """Check that ``um.prepare_netloc`` raises ``InvalidUrlError`` on bad input.

    Every candidate is passed to ``um.prepare_netloc`` on its own inside a
    ``pytest.raises`` context, so each one must raise individually.
    """
    bad_inputs = (
        None,                # not a string
        23,                  # not a string
        '//www.reddit.com',  # no scheme in front of the slashes
        'www.google.com',    # no scheme at all
    )
    for candidate in bad_inputs:
        with pytest.raises(InvalidUrlError):
            um.prepare_netloc(candidate)
コード例 #2
0
ファイル: requester.py プロジェクト: pharvie/scraper
def internal(url, host):
    """Return True when ``url`` and ``host`` have the same prepared netloc.

    Both arguments are checked with ``validate_url`` first (``url`` before
    ``host``); the comparison itself is done on the values returned by
    ``um.prepare_netloc``.

    Raises:
        InvalidUrlError: if ``validate_url`` rejects ``url`` or ``host``.
    """
    checks = (
        (url, 'Cannot define internal status of invalid url: '),
        (host, 'Cannot define internal status with invalid base: '),
    )
    for candidate, message_prefix in checks:
        if not validate_url(candidate):
            raise InvalidUrlError(message_prefix + str(candidate))

    return um.prepare_netloc(url) == um.prepare_netloc(host)
コード例 #3
0
def add_hosts_to_database():
    """Add every host in ``hosts`` that ``host_list`` does not already hold.

    Each host is first reduced to its network location with
    ``um.prepare_netloc``; the lookup and the insertion both use that
    prepared value.
    """
    for raw_host in hosts:
        netloc = um.prepare_netloc(raw_host)
        if host_list.entry_from_host(netloc) is not None:
            # An entry already exists for this host; nothing to add.
            continue
        host_list.add_to_hosts(netloc)
コード例 #4
0
def test_document_from_ip_address():
    """Check that ``Streamer.document_from_ip_address`` returns what was inserted.

    For each sample url the network location is resolved to its IP addresses
    (this performs real DNS lookups). One document per IP address is inserted
    straight into the streamer's collection and then read back through
    ``document_from_ip_address``.

    The streamer is deleted in a ``finally`` clause so that a failing
    assertion or a failed lookup does not leave the test data behind.
    """
    streamer = Streamer('test_document_from_ip_address')
    host = 'http://list-iptv.com'
    urls = [
        'http://62.210.245.19:8000/live/testapp/testapp/2.ts',
        'http://clientportalpro.com:2500/live/VE5DWv4Ait/7KHLqRRZ9E/2160.ts',
        'http://ndasat.pro:8000/live/exch/exch/1227.ts',
    ]

    def network_locations(netloc):
        # Build a fresh list on every call so the inserted document and the
        # expected value never share objects.
        return [
            SON([('network_location', netloc), ('linked_by', [host]),
                 ('working_link', True)])
        ]

    try:
        for url in urls:
            netloc = um.prepare_netloc(url)
            # gethostbyname_ex returns (hostname, aliases, ip_addresses).
            ip_addresses = socket.gethostbyname_ex(
                um.remove_schema(netloc))[2]
            for ip_address in ip_addresses:
                data = {
                    'ip_address': ip_address,
                    'network_locations': network_locations(netloc),
                }
                streamer.collection().insert(data)
                doc = streamer.document_from_ip_address(ip_address)
                assert doc['ip_address'] == ip_address
                assert doc['network_locations'] == network_locations(netloc)
    finally:
        # Always clean up, even when an assertion above fails.
        streamer.delete()
コード例 #5
0
ファイル: database.py プロジェクト: pharvie/scraper
 def add_to_hosts(self, host, running=False):
     """Insert a new entry for ``host`` into the hosts collection.

     The stored document has two keys: ``'host'`` (the value returned by
     ``um.prepare_netloc(host)``) and ``'running'``.

     Args:
         host: url of the host to add.
         running: value stored under ``'running'``; defaults to False.

     Raises:
         InvalidUrlError: if ``requester.validate_url`` rejects ``host``.
         EntryInDatabaseError: if ``entry_from_host`` already finds an
             entry for ``host``.
     """
     host_is_valid = requester.validate_url(host)
     if not host_is_valid:
         raise InvalidUrlError(
             'Cannot add a url to streams with an invalid host: %s' % host)

     if self.entry_from_host(host):
         raise EntryInDatabaseError(
             'The following host is already in the database: %s' % host)

     # Build the document first, then hand it to the collection.
     record = dict(host=um.prepare_netloc(host), running=running)
     self._db.hosts.insert(record)
コード例 #6
0
ファイル: database.py プロジェクト: pharvie/scraper
 def entry_from_host(self, host):
     """Return the hosts-collection entry for ``host``, or None if absent.

     The lookup key is the value returned by ``um.prepare_netloc(host)``.

     Args:
         host: url of the host to look up.

     Returns:
         The single matching document, or None when nothing matches.

     Raises:
         InvalidUrlError: if ``requester.validate_url`` rejects ``host``.
         MultipleEntriesInDatabaseError: if more than one document matches.
     """
     if not requester.validate_url(host):
         raise InvalidUrlError(
             'Cannot retrieve entry with an invalid host: %s' % host)
     cursor = self._db.hosts.find({'host': um.prepare_netloc(host)})
     # Count once and reuse the result: the original asked the cursor twice,
     # which costs an extra query and lets the two branches below observe
     # different counts if the collection changes in between.
     match_count = cursor.count()
     if match_count > 1:
         raise MultipleEntriesInDatabaseError(
             'There are multiple entries in the hosts database with the same host: %s'
             % host)
     if match_count == 1:
         return cursor[0]  # the one document matching the host
     return None  # no document matches the host
コード例 #7
0
def test_add_to_database_by_ip_address():
    """Check ``Streamer.add_to_database_by_ip_address`` followed by a read-back.

    For each sample url the network location is resolved to its IP addresses
    (this performs real DNS lookups). Every IP address is added through
    ``add_to_database_by_ip_address`` and then fetched again with
    ``document_from_ip_address``.

    The streamer is deleted in a ``finally`` clause so that a failing
    assertion or a failed lookup does not leave the test data behind.
    """
    streamer = Streamer('test_add_to_database_by_ip_address')
    host = 'http://list-iptv.com'
    urls = [
        'http://62.210.245.19:8000/live/testapp/testapp/2.ts',
        'http://clientportalpro.com:2500/live/VE5DWv4Ait/7KHLqRRZ9E/2160.ts',
        'http://ndasat.pro:8000/live/exch/exch/1227.ts',
    ]
    try:
        for url in urls:
            netloc = um.prepare_netloc(url)
            # gethostbyname_ex returns (hostname, aliases, ip_addresses).
            ip_addresses = socket.gethostbyname_ex(
                um.remove_schema(netloc))[2]
            for ip_address in ip_addresses:
                streamer.add_to_database_by_ip_address(ip_address, netloc,
                                                       host, None)
                doc = streamer.document_from_ip_address(ip_address)

                assert doc['ip_address'] == ip_address
                # NOTE(review): each entry is compared directly with the
                # netloc string; if entries are stored as sub-documents this
                # should compare a field of the entry instead - confirm.
                for entry in doc['network_locations']:
                    assert entry == netloc
    finally:
        # Always clean up, even when an assertion above fails.
        streamer.delete()
コード例 #8
0
def test_prepare_netloc():
    """Check the value ``um.prepare_netloc`` returns for a set of sample urls.

    The expected values show the port, path, query string and a leading
    ``www.`` being dropped while the scheme is kept.
    """
    cases = (
        ('http://s4.bossna-caffe.com:80/hls/1200.m3u8?channelId=1200&deviceMac=00:1A:79:3A:2D:B9&uid=35640',
         'http://s4.bossna-caffe.com'),
        ('http://www.s4.bossna-caffe.com:80/hls/1200.m3u8?channelId=1200&deviceMac=00:1A:79:3A:2D:B9&uid=35640/',
         'http://s4.bossna-caffe.com'),
        ('http://192.68.132.1:800', 'http://192.68.132.1'),
        ('http://192.68.132.1:400', 'http://192.68.132.1'),
        ('http://www.soledge7.dogannet.tv/S1/HLS_LIVE/tv2/1000/prog_index.m3u8',
         'http://soledge7.dogannet.tv'),
        ('http://soledge7.dogannet.tv/S1/HLS_LIVE/tv2/1000/prog_index.m3u8',
         'http://soledge7.dogannet.tv'),
        ('http://reddit.com', 'http://reddit.com'),
    )

    # Prepare every url before checking any of them.
    prepared = [um.prepare_netloc(raw_url) for raw_url, _ in cases]

    for actual, (_, expected) in zip(prepared, cases):
        assert actual == expected
コード例 #9
0
ファイル: database.py プロジェクト: pharvie/scraper
 def add_to_streams(self, url, host, ext_title=None):
     """Evaluate a stream url and record it per IP address of its network location.

     Flow, as implemented below:

     * ``url`` and ``host`` are validated with ``requester.validate_url``.
     * The network location of ``url`` is obtained from ``um.prepare_netloc``.
     * If that network location is in neither ``self.working_stream_links``
       nor ``self.broken_stream_links``, its IP addresses are resolved
       (cached in ``self.ip_addresses``) and, for each address, the stream is
       evaluated and stored via ``add_to_database_by_ip_address`` - but only
       when the per-(ip, netloc) attempt counter currently holds a value that
       is in ``self.fibs``. The counter is incremented on every call.
     * If the network location is already in ``self.working_stream_links``,
       only ``add_to_titles`` is called for each cached IP address.
     * If it is only in ``self.broken_stream_links``, nothing is done.

     Args:
         url: stream url to add.
         host: url of the host the stream was found on.
         ext_title: optional title forwarded to
             ``add_to_database_by_ip_address`` / ``add_to_titles``.

     Raises:
         InvalidUrlError: if ``requester.validate_url`` rejects ``url`` or
             ``host``.
     """
     if not requester.validate_url(url):
         raise InvalidUrlError('Cannot add an invalid url to streams: %s' %
                               url)
     if not requester.validate_url(host):
         raise InvalidUrlError(
             'Cannot add a url to streams with an invalid host: %s' % host)
     netloc = um.prepare_netloc(url)
     if netloc not in self.broken_stream_links and netloc not in self.working_stream_links:  # working-link status not known yet
         # Network locations are only added to broken_stream_links or
         # working_stream_links once their working-link status is known.
         # Per the original author: visitor classes are unique to crawler
         # classes, so each visitor deals with a single host; once a stream
         # has been stored with a known status there is no need to evaluate
         # it again, which avoids redundant requests to the database.
         if netloc not in self.ip_addresses:  # no IP addresses cached for this network location yet
             try:
                 ip_addresses = socket.gethostbyname_ex(
                     um.remove_schema(netloc)
                 )[2]  # third element of the result is the list of IP addresses
                 self.ip_addresses[netloc] = ip_addresses
             except socket.gaierror:  # resolution failed; treated as "network location is down"
                 ip_addresses = None
         else:
             ip_addresses = self.ip_addresses[netloc]
         if ip_addresses:
             # Per-call cache so the url is evaluated at most once even when
             # the network location resolves to several IP addresses.
             stream_statuses = {}
             for ip_address in ip_addresses:
                 # NOTE(review): reset for every IP address and only set to
                 # True in the branch that evaluates the url, so addresses
                 # served from the stream_statuses cache never pass `url`
                 # to add_to_database_by_ip_address - confirm intended.
                 playable_url = False
                 if (ip_address, netloc) not in self.connection_attempts:
                     self.connection_attempts[(ip_address, netloc)] = 1
                 # Only act when the attempt counter is one of the values in
                 # self.fibs (presumably a Fibonacci back-off schedule -
                 # confirm against where self.fibs is defined).
                 if self.connection_attempts[(ip_address,
                                              netloc)] in self.fibs:
                     if url not in stream_statuses:
                         try:
                             stream_status = requester.evaluate_stream(url)
                         except StreamTimedOutError:
                             # A timeout marks the link as broken.
                             stream_statuses[url] = working_link = False
                         else:
                             if stream_status:
                                 stream_statuses[url] = working_link = True
                                 # Extra request decides whether the url
                                 # itself is stored alongside the entry.
                                 r = requester.make_request(url)
                                 if r and r.ok:
                                     playable_url = True
                             elif self.connection_attempts[(
                                     ip_address, netloc)] == self.fibs[-1]:
                                 # Counter reached the last value in
                                 # self.fibs: give up and mark as broken.
                                 stream_statuses[url] = working_link = False
                             else:
                                 # Status still unknown; later calls may
                                 # evaluate the stream again.
                                 stream_statuses[url] = working_link = None
                     else:
                         working_link = stream_statuses[url]
                     if not playable_url:
                         self.add_to_database_by_ip_address(
                             ip_address, netloc, host, working_link,
                             ext_title)
                     else:
                         self.add_to_database_by_ip_address(
                             ip_address, netloc, host, working_link,
                             ext_title, url)
                     # Remember a known status so later calls skip the
                     # evaluation; a None status is recorded in neither set.
                     if working_link:
                         self.working_stream_links.add(netloc)
                     elif working_link is False:
                         self.broken_stream_links.add(netloc)
                 # Counted on every call, whether or not the stream was evaluated.
                 self.connection_attempts[(ip_address, netloc)] += 1
     elif netloc in self.working_stream_links:
         # Known-working network location: only record the title for each
         # of its cached IP addresses.
         ip_addresses = self.ip_addresses[netloc]
         for ip_address in ip_addresses:
             self.add_to_titles(ip_address, ext_title)