Example No. 1
def operater(pageQueue,resultQueue):
    while True:

        data = pageQueue.get(block = True)  # block until a page record arrives
        if not data:
            continue

        # data[0] identifies the player page, data[1] holds the raw HTML text
        playerid = '/'.join(tool.getplayerid(data[0]))
        htmltext = data[1]
#        print htmltext,"at:operater"

        html = formatHTML(htmltext)

        hc = MyHTMLParser()
        hc.feed(html)

        playerinfo = hc.get_playerinfo()
        career = hc.get_career()

        if playerinfo:
            playerinfo = tool.addkeytodict(playerinfo,'player_id',playerid)
        if career:
            career = tool.addkeytodict(career,'player_id',playerid)

        if playerinfo:
            resultQueue.put((playerinfo,career),block = True)
            print "resultQueue have more record!"
            print playerinfo
            print career

        print "I am working ,boss."
Example No. 2
def operater(pageQueue,resultQueue):

    data = pageQueue.get(block = True)
    playerid = '/'.join(tool.getplayerid(data[0]))
    htmltext = data[1]

    html = formatHTML(htmltext)

    hc = MyHTMLParser()
    hc.feed(html)

    playerinfo = hc.get_playerinfo()
    career = hc.get_career()

    playerinfo = tool.addkeytodict(playerinfo,'player_id',playerid)
    career = tool.addkeytodict(career,'player_id',playerid)

    resultQueue.put((playerinfo,career),block = True)
Example No. 3
def get_subtitle_download_url(response_content):
    download_url = ""
    parser = MyHTMLParser()
    parser.feed(response_content)
    for attr in parser.attrs:
        attr_name, attr_value = attr
        if attr_name == "src" and attr_value.find(SUBTITLE_PAGE) >= 0:
            download_url = SUBTITLE_DOWNLOAD_URL % attr_value
            break
    parser.close()
    return download_url
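The MyHTMLParser class itself is not shown in any of these examples. For this snippet, which only iterates over parser.attrs, a minimal sketch built on the standard library's html.parser.HTMLParser might look like the following; the shape of the class is an assumption, not the project's actual implementation.

from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    # Hypothetical attribute collector: every (name, value) pair from every
    # start tag is appended to self.attrs, which the function above scans.
    def __init__(self):
        super().__init__()
        self.attrs = []

    def handle_starttag(self, tag, attrs):
        self.attrs.extend(attrs)   # attrs is already a list of (name, value) tuples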
Example No. 4
def fetch_links(html_string, counter, library, append_number):
    soup = BeautifulSoup(html_string, "html.parser")
    # first check if the html string contains frame tag.
    # If so, the string has to be forwarded to a different function
    append_number += 5
    base_url = 'http://doc2/FIISDEVTECPUB/techpubs/browser/'
    parser = MyHTMLParser(base_url)
    parser.feed(html_string)
    # fetch the dictionary created after HTML parsing
    # dictionary_of_links = dict()
    dictionary_of_links = parser.send_links_and_data()
    if dictionary_of_links:
        print(dictionary_of_links)
        file = os.path.join(library + '/' + library + str(append_number) +
                            '.csv')
        print(file)
        if not os.path.isfile(file):
            with io.open(file, "w", newline="", encoding="utf-8") as f:
                write_into_csv = csv.writer(f)
                write_into_csv.writerow(['Product Doc Suites', 'Link'])
        # even if the file exists, overwrite it and rewrite the header
        with open(file, 'w', newline="", encoding='utf-8') as csv_file:
            writer = csv.writer(csv_file)
            writer.writerow(['Product Doc Suites', 'Link'])
            for x, y in dictionary_of_links.items():
                append_number += 2
                if x is not None:
                    temp = x.split()
                    temp2 = str(temp[-1]).lower().replace('_fram', '-1')
                    temp[-1] = temp2
                    print(temp)
                    writer.writerow([
                        y, ''.join(temp)
                    ])  # converting list type to string using join

        # you have a dictionary consisting of product name and links from the html table
        # crawl these sub pages to get all the nested links
        # thus authenticate again and collect the html string
        for key, value in dictionary_of_links.items():
            if key is None or not key:
                pass  # checking if the key is empty or not
            elif key == 'mailto:www.com':
                pass
            elif counter >= 2:
                pass
            else:
                second_html_string = authenticate_access(key)
                if second_html_string == "Page cannot be found":
                    print("The Url gave 404 Error")
                else:
                    counter += 1
                    append_number += 3  # randomly increasing the number
                    fetch_links(second_html_string, counter, library,
                                append_number)
                    append_number += 2  # randomly increasing the number
                    counter -= 1  # so that it won't eliminate the rest of the keys after it
        counter -= 1
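fetch_links() is recursive and expects the caller to supply the first HTML page together with the counters that are threaded through the recursion. An illustrative top-level call might look like this; the library name and the starting values are assumptions, and authenticate_access() is defined elsewhere in the project.

start_url = 'http://doc2/FIISDEVTECPUB/techpubs/browser/'
html_string = authenticate_access(start_url)   # returns the page's HTML text
# the 'techpubs' directory must already exist, since the CSV files are written into it
fetch_links(html_string, counter=0, library='techpubs', append_number=0)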
Example No. 5
def get_url():
    parser = MyHTMLParser()
    with urllib.request.urlopen(URL) as response:
        parser.feed(response.read().decode('utf-8'))
    return parser.links
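As with the subtitle example, the parser is not shown. Since get_url() only reads parser.links, a plausible (assumed) shape is a parser that records the href of every anchor tag it encounters.

from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    # Hypothetical link collector: gathers every href seen on an <a> tag.
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.links.append(value)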
Example No. 6
from html_parser import MyHTMLParser
import urllib.request
from bs4 import BeautifulSoup
import requests

from language_detecter import LanguageDetector

parser = MyHTMLParser()
#url = "https://www.vpnverbinding.nl/beste-vpn/netflix/"
url = "https://www.vpnconexion.es/blog/mejor-vpn-para-netflix/?_ga=2.224715098.1306859094.1600959792-1235625754.1600959792"
req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
response = urllib.request.urlopen(req)
html = response.read()  # raw bytes; unused below, since the page is fetched again with requests
page = requests.get(url).text
soup = BeautifulSoup(page, "html.parser")
print("Analyzing....")
print(soup.title.string)

#get the webpage language
language = soup.html["lang"].replace("-", "_")

print("The language webpage is: " + language)

lang_validate = LanguageDetector(language)

print("-----------")

# find the h3, h2 and h1 titles, plus text in p, div and span elements inside the divs
contentTable = soup.find('div')
rows = contentTable.find_all(
    ['h3', 'h2', 'h1', 'p', 'div', 'span', 'img', 'li', 'ul'])
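A possible continuation of this script (assumed, not part of the original) is to walk the collected elements and pull out their visible text, for example to feed it to the language check.

for row in rows:
    text = row.get_text(strip=True)   # BeautifulSoup strips the tags and surrounding whitespace
    if text:
        print(text[:80])              # preview the first 80 characters of each element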
Example No. 7
    print name
    fd = open(name,'r')
    data = filesplit(fd,'@newpage@')
    print len(data)

    for page in data[1:]:
        print type(page),len(page)
        if not tool.isplayerpage(page[0]):
            continue
        else:
            playerid = '/'.join(tool.getplayerid(page[0]))

            html = ''.join(page[1:])
            html = formatHTML(html)

            hc = MyHTMLParser()
            hc.feed(html)
            playerinfo = hc.get_playerinfo()
            career = hc.get_career()
            playerinfo = tool.addkeytodict(playerinfo,'player_id',playerid)
            career = tool.addkeytodict(career,'player_id',playerid)
            print playerinfo

            # insert the player record, then each career row
            tool.insertplayerinfo(qiud_cur,playerinfo)

            for car_record in career:
                tool.insertcareer(qiud_cur,car_record)
            qiud_conn.commit()