def operater(pageQueue,resultQueue):
    # Worker loop: pull a page off the queue, parse it, push the result.
    while True:
        data = pageQueue.get(block = True)
        if not data:
            continue
        playerid = '/'.join(tool.getplayerid(data[0]))
        htmltext = data[1]
        # print htmltext,"at:operater"
        html = formatHTML(htmltext)
        hc = MyHTMLParser()
        hc.feed(html)
        playerinfo = hc.get_playerinfo()
        career = hc.get_career()
        if playerinfo:
            playerinfo = tool.addkeytodict(playerinfo,'player_id',playerid)
        if career:
            career = tool.addkeytodict(career,'player_id',playerid)
        if playerinfo:
            resultQueue.put((playerinfo,career),block = True)
            print "resultQueue has a new record!"
            print playerinfo
            print career
        print "I am working, boss."
def operater(pageQueue,resultQueue):
    # Single-pass variant: handles exactly one queued page.
    data = pageQueue.get(block = True)
    playerid = '/'.join(tool.getplayerid(data[0]))
    htmltext = data[1]
    html = formatHTML(htmltext)  # normalize the raw HTML before parsing
    hc = MyHTMLParser()
    hc.feed(html)
    playerinfo = hc.get_playerinfo()
    career = hc.get_career()
    playerinfo = tool.addkeytodict(playerinfo,'player_id',playerid)
    career = tool.addkeytodict(career,'player_id',playerid)
    resultQueue.put((playerinfo,career),block = True)
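# A minimal sketch of how `operater` could be driven as a queue worker.
# This wiring is an assumption: `tool`, `formatHTML`, and `MyHTMLParser` come
# from the surrounding project, and the (header_line, html_text) shape of the
# queued tuples is inferred from how `data` is indexed above. Python 2 style,
# matching the print statements in the functions it drives.
import threading
from Queue import Queue

pageQueue = Queue()
resultQueue = Queue()

for _ in range(4):  # a few workers draining the page queue concurrently
    worker = threading.Thread(target=operater, args=(pageQueue, resultQueue))
    worker.setDaemon(True)  # let the process exit once producers are done
    worker.start()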
def get_subtitle_download_url(response_content):
    download_url = ""
    parser = MyHTMLParser()
    parser.feed(response_content)
    for attr in parser.attrs:
        attr_name, attr_value = attr
        if attr_name == "src" and attr_value.find(SUBTITLE_PAGE) >= 0:
            download_url = SUBTITLE_DOWNLOAD_URL % attr_value
            break
    parser.close()
    return download_url
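# `get_subtitle_download_url` expects MyHTMLParser to expose an `attrs` list
# of (name, value) pairs gathered from every start tag, plus two module
# constants. A minimal sketch under those assumptions -- the constant names
# come from the function above, but their example values are hypothetical:
from html.parser import HTMLParser

SUBTITLE_PAGE = "/subtitles/"                    # hypothetical URL marker
SUBTITLE_DOWNLOAD_URL = "https://example.com%s"  # hypothetical URL template

class MyHTMLParser(HTMLParser):
    """Collects every (attribute, value) pair seen while parsing."""
    def __init__(self):
        super().__init__()
        self.attrs = []

    def handle_starttag(self, tag, attrs):
        # handle_starttag receives attrs as a list of (name, value) tuples
        self.attrs.extend(attrs)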
import csv
import os

from bs4 import BeautifulSoup
# MyHTMLParser and authenticate_access are project-local helpers.


def fetch_links(html_string, counter, library, append_number):
    soup = BeautifulSoup(html_string, "html.parser")
    # first check if the html string contains a frame tag;
    # if so, the string has to be forwarded to a different function
    append_number += 5
    base_url = 'http://doc2/FIISDEVTECPUB/techpubs/browser/'
    parser = MyHTMLParser(base_url)
    parser.feed(html_string)
    # fetch the dictionary created after html parsing
    dictionary_of_links = parser.send_links_and_data()
    if dictionary_of_links:
        print(dictionary_of_links)
        file = os.path.join(library, library + str(append_number) + '.csv')
        print(file)
        # create the file if needed, or overwrite an existing one,
        # starting from a fresh header row either way
        with open(file, 'w', newline="", encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(['Product Doc Suites', 'Link'])
            for x, y in dictionary_of_links.items():
                append_number += 2
                if x is None:
                    continue
                temp = x.split()
                temp[-1] = str(temp[-1]).lower().replace('_fram', '-1')
                print(temp)
                # converting list type to string using join
                writer.writerow([y, ''.join(temp)])
        # you have a dictionary consisting of product names and links from
        # the html table; crawl these sub pages to get all the nested links,
        # thus authenticate again and collect the html string
        for key, value in dictionary_of_links.items():
            if key is None or not key:
                continue  # checking if the key is empty or not
            elif key == 'mailto:www.com':
                continue
            elif counter >= 2:
                continue  # limit how deep the recursion goes
            else:
                second_html_string = authenticate_access(key)
                if second_html_string == "Page cannot be found":
                    print("The Url gave 404 Error")
                else:
                    counter += 1
                    append_number += 3  # randomly increasing the number
                    fetch_links(second_html_string, counter, library, append_number)
                    append_number += 2  # randomly increasing the number
                    counter -= 1  # so that it won't eliminate the rest of the keys after it
    counter -= 1
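# A possible entry point for the crawler above. The starting URL is the
# base_url already used inside fetch_links; the `library` folder name is a
# hypothetical example, and the folder must exist before the csv files are
# written into it. authenticate_access is assumed to take the URL and return
# the page's HTML, matching how fetch_links calls it.
import os

library = 'techpubs'  # hypothetical output folder
os.makedirs(library, exist_ok=True)

start_html = authenticate_access('http://doc2/FIISDEVTECPUB/techpubs/browser/')
if start_html != "Page cannot be found":
    fetch_links(start_html, counter=0, library=library, append_number=0)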
def get_url():
    parser = MyHTMLParser()
    with urllib.request.urlopen(URL) as response:
        parser.feed(response.read().decode('utf-8'))
    return parser.links
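# `get_url` assumes a module-level URL constant and a parser that exposes a
# `links` list. A minimal link-collecting sketch under those assumptions
# (the example URL is hypothetical):
import urllib.request
from html.parser import HTMLParser

URL = "https://example.com"  # hypothetical target page

class MyHTMLParser(HTMLParser):
    """Accumulates every href value found in anchor tags."""
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            self.links.extend(v for k, v in attrs if k == "href")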
from html_parser import MyHTMLParser
import urllib.request
from bs4 import BeautifulSoup
import requests
from language_detecter import LanguageDetector

parser = MyHTMLParser()
#url = "https://www.vpnverbinding.nl/beste-vpn/netflix/"
url = "https://www.vpnconexion.es/blog/mejor-vpn-para-netflix/?_ga=2.224715098.1306859094.1600959792-1235625754.1600959792"
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
response = urllib.request.urlopen(req)
html = response.read()

page = requests.get(url).text
soup = BeautifulSoup(page, "html.parser")
print("Analyzing....")
print(soup.title.string)

# get the webpage language
language = soup.html["lang"].replace("-", "_")
print("The webpage language is: " + language)
lang_validate = LanguageDetector(language)
print("-----------")

# find the titles h3, h2, h1, plus text in p, div and span inside the divs
contentTable = soup.find('div')
rows = contentTable.find_all(
    ['h3', 'h2', 'h1', 'p', 'div', 'span', 'img', 'li', 'ul'])
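# A possible next step, assuming the goal is to pull the visible text out of
# the matched tags. LanguageDetector's interface is not shown above, so only
# standard BeautifulSoup calls are used here.
for row in rows:
    text = row.get_text(strip=True)
    if text:
        print(text)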
# fragment: assumes name, qiud_cur (db cursor), and qiud_conn (db connection)
# are defined earlier in the script
print name
fd = open(name,'r')
data = filesplit(fd,'@newpage@')
print len(data)
for page in data[1:]:
    print type(page),len(page)
    if not tool.isplayerpage(page[0]):
        continue
    playerid = '/'.join(tool.getplayerid(page[0]))
    html = ''.join(page[1:])
    html = formatHTML(html)
    hc = MyHTMLParser()
    hc.feed(html)
    playerinfo = hc.get_playerinfo()
    career = hc.get_career()
    playerinfo = tool.addkeytodict(playerinfo,'player_id',playerid)
    career = tool.addkeytodict(career,'player_id',playerid)
    print playerinfo
    # insert record
    tool.insertplayerinfo(qiud_cur,playerinfo)
    for car_record in career:
        tool.insertcareer(qiud_cur,car_record)
    qiud_conn.commit()
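# `filesplit` is not defined in this fragment. A minimal sketch, assuming it
# groups the file's lines into pages separated by marker lines -- an
# assumption inferred from how page[0] and page[1:] are indexed above:
def filesplit(fd, marker):
    pages = [[]]
    for line in fd:
        if marker in line:
            pages.append([])  # start a new page at each marker line
        else:
            pages[-1].append(line)
    return pages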