def scrape_page(url):
    '''
    NOTE: this throws away any links that can't be addons
    (i.e. assumes we're not going any deeper)
    '''
    resp = None
    links = set()
    if url.endswith('.jpg') or url.endswith('.png') or url.endswith('.gif') or url.endswith('.rar'):
        return set()
    head = time_wrapper(requests.head, (url,), t=3)
    if head:
        try:
            cl = int(head.headers['Content-Length'])
        except (KeyError, ValueError):
            cl = -1
        if cl < 1000000:
            resp = time_wrapper(requests.get, (url,), t=3)
    if not resp:
        return set()
    netloc = urlparse(url).netloc.split(':')[0]
    http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, from_encoding=encoding)
    for link in soup.find_all('a', href=True):
        if ".zip" in link['href'] or 'github' in link['href']:
            href = link['href']
            if not href.startswith('http'):
                href = 'http://' + netloc + '/' + href
            if can_be_repo(href):
                links.add(href)
    return links

def get_all_uic_links_from_url(base_url, h=None):
    resp = requests.get(base_url, headers=headers)
    base_url = resp.url
    if is_url_end_point(base_url):
        return [], ""
    http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, from_encoding=encoding)
    uic_link_list = []
    for link in soup.find_all('a', href=True):
        if is_url_end_point(link['href']):
            continue
        target_url = ''
        o = urlparse(link['href'])
        if "uic.edu" in o.netloc:
            target_url = link['href'].rstrip('/')
        elif not is_absolute(link['href']):
            target_url = urllib.parse.urljoin(base_url, link['href']).rstrip('/')
        target_url = target_url.replace("http:", "https:")
        if target_url != '':
            uic_link_list.append(target_url)
    return list(set(uic_link_list)), h.handle(resp.text)

def prepare_markup(self, markup, user_specified_encoding=None,
                   exclude_encodings=None,
                   document_declared_encoding=None):
    """
    :yield: A series of 4-tuples.
     (markup, encoding, declared encoding,
      has undergone character replacement)

    Each 4-tuple represents a strategy for parsing the document.
    """
    # Instead of using UnicodeDammit to convert the bytestring to
    # Unicode using different encodings, use EncodingDetector to
    # iterate over the encodings, and tell lxml to try to parse
    # the document as each one in turn.
    is_html = not self.is_xml
    if is_html:
        self.processing_instruction_class = ProcessingInstruction
    else:
        self.processing_instruction_class = XMLProcessingInstruction

    if isinstance(markup, str):
        # We were given Unicode. Maybe lxml can parse Unicode on
        # this system?
        yield markup, None, document_declared_encoding, False

    if isinstance(markup, str):
        # No, apparently not. Convert the Unicode to UTF-8 and
        # tell lxml to parse it as UTF-8.
        yield (markup.encode("utf8"), "utf8",
               document_declared_encoding, False)

    try_encodings = [user_specified_encoding, document_declared_encoding]
    detector = EncodingDetector(
        markup, try_encodings, is_html, exclude_encodings)
    for encoding in detector.encodings:
        yield (detector.markup, encoding, document_declared_encoding, False)

def prepare_complete_links(url):
    http_regex = re.compile(r'http')
    page = requests.get(url)
    http_encoding = page.encoding if 'charset' in page.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(page.content, is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(page.content, 'html.parser', from_encoding=encoding)
    complete_links = []
    for alink in soup.find_all('a', href=True):
        if http_regex.search(alink['href']) is not None:
            complete_links.append(alink['href'])
            print(http_regex.search(alink['href']).group() + "---" + alink['href'])
        elif 'javascript' not in alink['href'] and len(alink['href'].strip()) > 0:
            if alink['href'][:1] == '/':
                temp_link = TWM_DOMAIN + alink['href']
                complete_links.append(temp_link)
                print("need http" + "---" + alink['href'])
            else:
                temp_link = TWM_DOMAIN + "/" + alink['href']
                complete_links.append(temp_link)
    return list(set(complete_links))

def from_warc(warc_record, decode_errors="replace"):
    """
    Extracts relevant information from a WARC record. This function does not
    invoke scrapy but only uses the article extractor.
    :return:
    """
    raw_stream = warc_record.raw_stream.read()
    encoding = None
    try:
        encoding = warc_record.http_headers.get_header('Content-Type').split(';')[1].split('=')[1]
    except (AttributeError, IndexError):
        pass
    if not encoding:
        encoding = EncodingDetector.find_declared_encoding(raw_stream, is_html=True)
    if not encoding:
        # assume utf-8
        encoding = 'utf-8'

    try:
        html = raw_stream.decode(encoding, errors=decode_errors)
    except LookupError:
        # non-existent encoding: fall back to utf-8
        html = raw_stream.decode('utf-8', errors=decode_errors)

    if not html:
        raise EmptyResponseError()

    url = warc_record.rec_headers.get_header('WARC-Target-URI')
    download_date = warc_record.rec_headers.get_header('WARC-Date')
    article = NewsPlease.from_html(html, url=url, download_date=download_date)
    return article

def detect_encoding(data, encoding=None, fallback='latin1', is_html=False):
    '''Detect the character encoding of the data.

    Returns:
        str: The name of the codec.

    Raises:
        ValueError: The codec could not be detected. This error can only
            occur if fallback is not a "lossless" codec.
    '''
    if encoding:
        encoding = normalize_codec_name(encoding)

    bs4_detector = EncodingDetector(
        data,
        override_encodings=(encoding,) if encoding else (),
        is_html=is_html
    )
    candidates = itertools.chain(bs4_detector.encodings, (fallback,))

    for candidate in candidates:
        if not candidate:
            continue

        candidate = normalize_codec_name(candidate)

        if not candidate:
            continue

        if try_decoding(data, candidate):
            return candidate

    raise ValueError('Unable to detect encoding.')

def scrape_politifact_article(story_url):
    resp = requests.get(story_url)
    http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, 'lxml', from_encoding=encoding)
    return soup.find("div", "article__text").get_text()

def from_warc(warc_record):
    """
    Extracts relevant information from a WARC record. This function does not
    invoke scrapy but only uses the article extractor.
    :return:
    """
    raw_stream = warc_record.raw_stream.read()
    encoding = None
    try:
        encoding = warc_record.http_headers.get_header('Content-Type').split(';')[1].split('=')[1]
    except (AttributeError, IndexError):
        pass
    if not encoding:
        encoding = EncodingDetector.find_declared_encoding(raw_stream, is_html=True)
    if not encoding:
        # assume utf-8
        encoding = 'utf-8'
    html = raw_stream.decode(encoding)
    url = warc_record.rec_headers.get_header('WARC-Target-URI')
    download_date = warc_record.rec_headers.get_header('WARC-Date')
    article = NewsPlease.from_html(html, url=url, download_date=download_date)
    return article

def getHTML(url, verb=False):
    '''
    This function takes a URL as input and returns the corresponding bs4 object
    '''
    from bs4.dammit import EncodingDetector
    try:
        resp = session.get(url, headers=headers, timeout=(10, 30))
    except Exception:
        print('problem here')
        return None
    else:
        if resp.status_code == 200:
            # dealing with encoding
            http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
            html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
            encoding = html_encoding or http_encoding
            # generating BeautifulSoup object
            bsObj = BeautifulSoup(resp.content, 'html5lib', from_encoding=encoding)
            if verb:
                print("The title of html is %s" % bsObj.title.getText())
            return bsObj
        else:
            return None

def compile_links(web_address):
    '''
    compile_links accesses a webpage at a given address, finds all of the
    links on that page, and appends certain links to a list called links_list.
    compile_links works together with find_diffraction_files to get only the
    relevant links. Input is a web address; the matching links are returned
    in links_list.
    '''
    html_page = requests.get(web_address)
    http_encoding = html_page.encoding if 'charset' in html_page.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(html_page.content, is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(html_page.content, from_encoding=encoding, features="html.parser")
    links_list = []
    permutation_attempt = soup(text=re.compile("Now trying variations on your request:"))
    if len(permutation_attempt) != 0:
        return links_list
    for link in soup.find_all(href=find_diffraction_files):
        links_list.append('http://rruff.geo.arizona.edu' + link['href'])
    return links_list

def top4leagues(leagueList, index):
    rangeOfWork = team_qulfied[index]
    defaultLst = []
    temIndex = sample(range(12), rangeOfWork)
    for i in range(rangeOfWork):
        clubsIndex = randint(0, 1)
        try:
            tempClubsLst = []
            url = 'https://www.worldfootball.net' + leagueList[clubsIndex]
            source = requests.get(url, headers=header)
            http_encoding = source.encoding if 'charset' in source.headers.get('content-type', '').lower() else None
            html_encoding = EncodingDetector.find_declared_encoding(source.content, is_html=True)
            encoding = html_encoding or http_encoding
            soup = BeautifulSoup(source.content, 'lxml', from_encoding=encoding)
            find_boxS = soup.find('div', class_="scrollable_tables")
            the_team_table = find_boxS.find('table', {'class': 'standard_tabelle'})
            for theTeamAtag in the_team_table.find_all('a', href=True):
                if theTeamAtag.text:
                    tempClubsLst.append(theTeamAtag.text)
            y = temIndex[i]
            teamNames = tempClubsLst[y]
            defaultLst.append(teamNames)
            tempClubsLst.pop()
        except Exception as e:
            print(e)
    return defaultLst

def getSteam(self, q, size):
    querys = q.replace(" ", "+")
    url = ('https://store.steampowered.com/search/?term=' + str(querys) + '&category1=998')
    resp = requests.get(url)
    http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, from_encoding=encoding, features="lxml")
    print(url)
    SteamLinkList = []
    # find links to apps
    for link in soup.find_all('a', href=re.compile('app')):
        # remove duplicates
        if link['href'] not in SteamLinkList:
            SteamLinkList.append(link['href'])
    # remove first two irrelevant links
    return SteamLinkList[2:size + 2]

def getLinks():
    parser = 'html.parser'  # or 'lxml' (preferred) or 'html5lib', if installed
    for i in range(1, 100):
        if os.path.exists('pdfs/' + str(i)):
            print(str(i), 'already exists')
            continue
        resp = requests.get("https://quizbowlpackets.com/" + str(i))
        http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
        html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
        encoding = html_encoding or http_encoding
        soup = BeautifulSoup(resp.content, parser, from_encoding=encoding)
        links = []
        allLinks = soup.find_all('a', href=True)
        combined = ['pdf' in link['href'] for link in allLinks]
        if not any(combined):
            print(str(i), "doesn't exist")
            continue
        for link in allLinks:
            link = link['href']
            if 'Packet' in link:
                links.append(link)
        print(links)
        with open('pdfs/' + str(i), 'wb') as file:
            pickle.dump(links, file)

def get_soup(self, _page=0):
    """ scrape web-site page """

    # get request
    self.__response = self.get_request()
    if self.__verbose:
        _log.debug(f'self.__response={self.__response}')

    # get encoding
    _http_encoding = self.__response.encoding if 'charset' in self.__response.headers.get(
        'content-type', '').lower() else None
    _html_encoding = EncodingDetector.find_declared_encoding(self.__response.content, is_html=True)

    # get soup
    self.__soup = None
    try:
        if self.__verbose:
            _log.debug('Getting soup from self.__response.text')
        self.__soup = BeautifulSoup(self.__response.text, features='html5lib',
                                    from_encoding=(_html_encoding or _http_encoding))
        if self.__verbose:
            _log.debug('Got soup from self.__response.text OK')
    except Exception as e:
        self.__soup = None
        if self.__verbose:
            _log.error(f'Failed to get soup from self.__response.text, error={e}')

def desi_crawler(u_r_l):
    web_list = []
    url = u_r_l
    web_list.append(url)
    domain = url
    if "www." not in domain:
        div = domain.replace('//', ' ').replace('.', ' ').split()
        domain = div[1]
    else:
        div = domain.replace('//', ' ').replace('.', ' ').split()
        domain = div[2]
    for url in web_list:
        response = requests.get(url)
        http_encoding = response.encoding if 'charset' in response.headers.get('content-type', '').lower() else None
        html_encoding = EncodingDetector.find_declared_encoding(response.content, is_html=True)
        encoding = html_encoding or http_encoding
        soup = BeautifulSoup(response.content, from_encoding=encoding)
        for link in soup.find_all('a', href=True):
            if domain in link['href']:
                if link['href'] not in web_list:
                    web_list.append(link['href'])

def otherTeams(sumOfQ):
    remainigTeams = 16 - sumOfQ
    finalTeams = []
    if remainigTeams <= 0:
        return []
    else:
        randomTeams = sample(range(10), remainigTeams)
        randomTeamsIndex = sample(range(remainigTeams + 2), remainigTeams)
        randomTeamSelection = choices(range(len(league_qlf_list)), k=remainigTeams)
        for i in range(0, remainigTeams):
            temp_x = randomTeamSelection[i]
            try:
                tempClubsLst = []
                url = 'https://www.worldfootball.net' + league_qlf_list[temp_x]
                source = requests.get(url, headers=header)
                http_encoding = source.encoding if 'charset' in source.headers.get('content-type', '').lower() else None
                html_encoding = EncodingDetector.find_declared_encoding(source.content, is_html=True)
                encoding = html_encoding or http_encoding
                soup = BeautifulSoup(source.content, 'lxml', from_encoding=encoding)
                find_boxS = soup.find('div', class_="scrollable_tables")
                the_team_table = find_boxS.find('table', {'class': 'standard_tabelle'})
                for theTeamAtag in the_team_table.find_all('a', href=True):
                    if theTeamAtag.text:
                        tempClubsLst.append(theTeamAtag.text)
                y = randomTeamsIndex[i]
                teamNames = tempClubsLst[y]
                finalTeams.append(teamNames)
            except Exception as e:
                print(e)
        return finalTeams

def getIMDB(self, queryi):
    url = ('https://www.imdb.com/search/keyword/?keywords=' + str(queryi) +
           '&ref_=fn_kw_kw_1&mode=detail&page=1&sort=moviemeter,asc')
    resp = requests.get(url)
    http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, from_encoding=encoding, features="lxml")
    print(url)
    imdbLinkList = []
    # find links to titles
    for link in soup.find_all('a', href=re.compile('title')):
        # remove irrelevant links
        if "vote" not in link['href'] and "search" not in link['href'] and "plotsummary" not in link['href']:
            # remove duplicates
            if 'https://www.imdb.com' + link['href'] not in imdbLinkList:
                imdbLinkList.append('https://www.imdb.com' + link['href'])
    return imdbLinkList

def getOneEntry(searchTerm):
    searchTerm = searchTerm.replace('\n', '')
    response = requests.get(urlSearchTemplate.format(searchTerm.replace(' ', '%20')))
    if response.ok:
        http_encoding = response.encoding if 'charset' in response.headers.get('content-type', '').lower() else None
        html_encoding = EncodingDetector.find_declared_encoding(response.content, is_html=True)
        encoding = html_encoding or http_encoding
        soup = BeautifulSoup(response.content, 'lxml', from_encoding=encoding)
        result = processHtml(soup, searchTerm)
        if "/tpl" in result[0]:
            result = getOneEntry2(result[1], result[0])
        resultSplited = result.split(',')
        if len(resultSplited) == 3:
            resultSplited = [i.decode('utf-8').strip() for i in resultSplited]
            nome = resultSplited[0]
            status = resultSplited[1]
            nome_aceito = resultSplited[1]
            return nome, status, nome_aceito
        else:
            return '', '', ''
    else:
        return 'Bad Response!'

def doc_encoding(self) -> str:
    http_encoding = self.doc.encoding if "charset" in self.doc.headers.get("Content-Type", "").lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(self.doc.content, is_html=True)
    encoding: str = str(html_encoding or http_encoding)
    self.sdoc.encoding = encoding
    return encoding

def get_text(html):
    # Detect encoding and extract plain text from page
    encoding = EncodingDetector.find_declared_encoding(html, is_html=True)
    soup = BeautifulSoup(html, "lxml", from_encoding=encoding)
    for script in soup(["script", "style"]):
        script.extract()
    return soup.get_text(" ", strip=True)

def get_url_soup(url):
    url_request = requests.get(url, headers=headers, allow_redirects=True)
    http_encoding = url_request.encoding if 'charset' in url_request.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(url_request.content, is_html=True)
    encoding = html_encoding or http_encoding
    return BeautifulSoup(url_request.content, 'lxml', from_encoding=encoding)

def get_html_title(self, page, record):
    try:
        encoding = EncodingDetector.find_declared_encoding(page, is_html=True)
        soup = BeautifulSoup(page, "lxml", from_encoding=encoding)
        title = soup.title.string.strip()
        return title
    except Exception:
        return ""

def grab_projects(self, resp):
    http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, from_encoding=encoding)
    links = [self.BASE_URL + link['href'] for link in soup.find_all('a', href=True)
             if link['href'].startswith("/projects/")]
    self.add_to_queue(urls=links, website_name=self.NAME)
    return len(links)

def get_html(url):
    headers = {"User-Agent": USERAGENT}
    resp = requests.get(url, headers=headers)
    http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
    encoding = html_encoding or http_encoding
    if encoding:
        # decode the body with the detected encoding rather than requests' guess
        resp.encoding = encoding
    return resp.text

def get_soup_for_url(base_url):
    resp = requests.get(base_url)
    http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, from_encoding=encoding)
    return soup

def get_source_html(url):
    headers = {"User-Agent": 'Chrome'}
    resp = requests.get(url, headers=headers)
    http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
    encoding = html_encoding or http_encoding
    webpage = BeautifulSoup(resp.content, 'lxml', from_encoding=encoding)
    return webpage

def get_soup_html(url, headers=GET_HEADER):
    resp = SESSION.get(url, headers=headers)
    http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
    encoding = html_encoding or http_encoding
    soup = BeautifulSoup(resp.content, 'lxml', from_encoding=encoding)
    return soup

def get_html_text_body(self, page, record):
    try:
        encoding = EncodingDetector.find_declared_encoding(page, is_html=True)
        soup = BeautifulSoup(page, "lxml", from_encoding=encoding)
        for script in soup(["script", "style"]):
            script.extract()
        return soup.get_text(" ", strip=True)
    except Exception:
        return ""

def prepare_markup(
    self, markup, user_specified_encoding=None,
    exclude_encodings=None,
    document_declared_encoding=None,
):
    """Run any preliminary steps necessary to make incoming markup
    acceptable to the parser.

    lxml really wants to get a bytestring and convert it to
    Unicode itself. So instead of using UnicodeDammit to convert
    the bytestring to Unicode using different encodings, this
    implementation uses EncodingDetector to iterate over the
    encodings, and tell lxml to try to parse the document as each
    one in turn.

    :param markup: Some markup -- hopefully a bytestring.
    :param user_specified_encoding: The user asked to try this encoding.
    :param document_declared_encoding: The markup itself claims to be
        in this encoding.
    :param exclude_encodings: The user asked _not_ to try any of
        these encodings.

    :yield: A series of 4-tuples:
     (markup, encoding, declared encoding,
      has undergone character replacement)

     Each 4-tuple represents a strategy for converting the
     document to Unicode and parsing it. Each strategy will be tried
     in turn.
    """
    is_html = not self.is_xml
    if is_html:
        self.processing_instruction_class = ProcessingInstruction
    else:
        self.processing_instruction_class = XMLProcessingInstruction

    if isinstance(markup, str):
        # We were given Unicode. Maybe lxml can parse Unicode on
        # this system?
        yield markup, None, document_declared_encoding, False

    if isinstance(markup, str):
        # No, apparently not. Convert the Unicode to UTF-8 and
        # tell lxml to parse it as UTF-8.
        yield (markup.encode("utf8"), "utf8", document_declared_encoding, False)

    try_encodings = [user_specified_encoding, document_declared_encoding]
    detector = EncodingDetector(markup, try_encodings, is_html, exclude_encodings)
    for encoding in detector.encodings:
        yield (detector.markup, encoding, document_declared_encoding, False)

def getSoup(matchUrl):
    res = requests.get(matchUrl)
    res.raise_for_status()
    http_encoding = res.encoding if 'charset' in res.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(res.content, is_html=True)
    encoding = html_encoding or http_encoding
    return bs4.BeautifulSoup(res.content, 'lxml', from_encoding=encoding)

def get_html_encoding(html):
    return EncodingDetector.find_declared_encoding(html, is_html=True, search_entire_document=False)
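

# The snippets above all repeat the same idiom: prefer the encoding declared in
# the HTML itself (EncodingDetector.find_declared_encoding), fall back to the
# charset from the HTTP Content-Type header (resp.encoding), and hand the result
# to BeautifulSoup via from_encoding. Below is a minimal, self-contained sketch
# of that shared pattern; the function name, parser choice, and example URL are
# illustrative and not taken from any snippet above.
import requests
from bs4 import BeautifulSoup
from bs4.dammit import EncodingDetector


def fetch_soup(url, parser='html.parser'):
    # Fetch the page and build a soup using the declared encoding when one is
    # present, otherwise the charset advertised in the HTTP headers.
    resp = requests.get(url)
    http_encoding = resp.encoding if 'charset' in resp.headers.get('content-type', '').lower() else None
    html_encoding = EncodingDetector.find_declared_encoding(resp.content, is_html=True)
    return BeautifulSoup(resp.content, parser, from_encoding=html_encoding or http_encoding)


# Example usage (hypothetical URL):
#     soup = fetch_soup('https://example.com')
#     print(soup.title)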