Example #1
0
 def test_url_titles_without_ssl_verification(self):
     """Check expected titles for URLs with bad SSL certificates, reading with SSL verification disabled."""
     title_reader = URLTitleReader(verify_ssl=False)
     for test_url, known_title in TEST_CASES_WITH_BAD_SSL.items():
         # Honor the optional substring filter; run every case when it is unset.
         if (not URL_FILTER) or (URL_FILTER in test_url):
             with self.subTest(url=test_url):
                 self.assertEqual(known_title, title_reader.title(test_url))
Example #2
0
 def test_url_titles(self):
     """Check that each test-case URL yields its expected title."""
     title_reader = URLTitleReader()
     for test_url, known_title in TEST_CASES.items():
         # Honor the optional substring filter; run every case when it is unset.
         if (not URL_FILTER) or (URL_FILTER in test_url):
             with self.subTest(url=test_url):
                 self.assertEqual(known_title, title_reader.title(test_url))
Example #3
0
from urltitle import config, URLTitleReader

config.configure_logging()

TEST_URL = 'https://www.google.com/'

# Fetch the same URL twice; the second call should be served from the reader's cache.
title_reader = URLTitleReader()
title_reader.title(TEST_URL)
title_reader.title(TEST_URL)  # Should use cache.
Example #4
0
from urltitle import config, URLTitleReader

config.configure_logging()

TEST_URL = 'https://www.google.com/'

# Assorted Amazon product URLs, including ones with ref segments and query strings.
TEST_URLS = [
    'https://www.amazon.com/Natures-Plus-Chewable-Iron-Supplement/dp/B00014DAFM',
    'https://www.amazon.com/Bluebonnet-Earth-Vitamin-Chewable-Tablets/dp/B00ENYUIO2/',
    'https://www.amazon.com/dp/B0749WVS7J/ref=ods_gw_ha_h1_d_rr_021519?pf_rd_p=8bf51e9c-a499-47ad-829e-a0b4afcae72e&pf_rd_r=9SHQNHFS1W35WG02P75M',
    'https://www.amazon.com/dp/B0794W1SKP/ref=ods_mccc_lr',
    'https://www.amazon.com/ProsourceFit-Tri-Fold-Folding-Exercise-Carrying/dp/B07NCJDHBM?',
]

# Read the title of each URL in turn (titles are logged by the reader).
title_reader = URLTitleReader()
for test_url in TEST_URLS:
    title_reader.title(test_url)
Example #5
0
"""Read and log the title of a URL."""
import logging

from urltitle import URLTitleReader, config

config.configure_logging()
log = logging.getLogger(f"{config.PACKAGE_NAME}.{__name__}")

URL = "https://www.google.com"

reader = URLTitleReader()  # pylint: disable=invalid-name
log.info(f"{URL} has title: {reader.title(URL)}")
log.info("Testing cache.")
log.info(f"{URL} has title: {reader.title(URL)}")  # Should use cache.
# NOTE(review): this looks like a fragment of a larger header-probing script;
# TEST_URL, Dict (typing), and URLTitleError are referenced but not defined or
# imported in the visible lines — confirm they come from earlier in the file.
config.configure_logging()
log = logging.getLogger(__name__)

# Candidate HTTP request headers to be added one at a time in the loop below.
EXTRA_HEADERS = {
    "Accept": "*/*",
    "Accept-Language": "en-US,en;q=0.5",
    "Accept-Encoding": "gzip",
    "Referer": "https://google.com/",
    "DNT": 1,  # NOTE(review): int value — header values are usually strings; confirm intended.
    "Connection": "keep-alive",
    "Cookie": "",
    "Upgrade-Insecure-Requests": 1,  # NOTE(review): int value — see DNT note above.
    "Pragma": "no-cache",
    "Cache-Control": "no-cache",
}
# Network location (host) of the test URL, as resolved by the reader.
NETLOC = URLTitleReader().netloc(TEST_URL)
log.info("Netloc for %s is %s.", TEST_URL, NETLOC)

titles: Dict[str, str] = {}  # Result accumulator; presumably filled past the visible lines.
# Start with no extra headers configured for this netloc, then grow the set.
config.NETLOC_OVERRIDES[NETLOC] = {"extra_headers": {}}
EXTRA_CONFIG_HEADERS = config.NETLOC_OVERRIDES[NETLOC]["extra_headers"]
for h_key, h_val in EXTRA_HEADERS.items():
    log.debug("Adding header: %s=%s", h_key, h_val)
    # Cumulative: headers added in earlier iterations remain in effect.
    EXTRA_CONFIG_HEADERS[h_key] = h_val
    reader = URLTitleReader()  # Fresh instance avoids cache.
    try:
        title = reader.title(TEST_URL)
    except URLTitleError as exc:
        # Best-effort probe: log the failure for this header and move on.
        log.error("Ignoring exception after adding header %s=%s: %s", h_key,
                  h_val, exc)
        continue