Example #1
def webCrawl(urlList, pagesDeep, listOfWords, listOfCrawled):
    """ The webCrawl function is a recursive method that returns a list of
        strings from pages a given distance away from the starting URL. The
        pages do not repeat themselves. The inputs are a list of URLs, the 
        "depth" or distance away from the first URL, a list of strings
        compiled from URLs that have already been crawled, and a list of
        URLs that have already been crawled. """
        
    """ Test: webCrawl at DEPTH = 0 for the three test pages. Then, webCrawl at
        DEPTH = 1 for test page 2. The result should be a simple addition of
        all three pages. """
    
    if pagesDeep == DEPTH:
        return listOfWords
    else:
        pagesDeep = pagesDeep + 1
        newUrlList = []
        for url in urlList:
            if url not in listOfCrawled:
                # Fetch the page once and reuse its (text, links) tuple
                pageText, pageLinks = getHTML(url)
                listOfWords = listOfWords + pageText.split()
                listOfCrawled.append(url)
                for x in pageLinks:
                    if x not in newUrlList:
                        newUrlList.append(x)
        return webCrawl(newUrlList, pagesDeep, listOfWords, listOfCrawled)
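Every example in this set calls a helper named getHTML(url) that is not shown in any of the submissions; from the way it is used, it returns a two-tuple of the page's text and the list of URLs linked from that page (the constant DEPTH referenced in webCrawl is likewise assumed to be defined at module level). The sketch below is one standard-library way such a helper might look; the exact text-extraction and link-resolution rules are assumptions, not the course's actual implementation.

from html.parser import HTMLParser
from urllib.parse import urljoin
from urllib.request import urlopen

class _PageParser(HTMLParser):
    """Collects visible text and href targets from an HTML document."""
    def __init__(self, base_url):
        super().__init__()
        self.base_url = base_url
        self.text_parts = []
        self.links = []

    def handle_data(self, data):
        self.text_parts.append(data)

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(urljoin(self.base_url, value))

def getHTML(url):
    """Return (text, links): the page's text and the URLs it links to (assumed interface)."""
    with urlopen(url) as response:
        html = response.read().decode('utf-8', errors='replace')
    parser = _PageParser(url)
    parser.feed(html)
    return ' '.join(parser.text_parts), parser.links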
Example #2
def getstring(URL,DEPTH):
    """Retrieves the text from the given url and crawls to other pages based on
        the depth parameter, storing it all in one string.
            Input: url, a string, depth, the depth parameter
            Output: string, a string of all the text on the given webpages"""
    #Base case
    if DEPTH == 0:
        #The string is simply the text on that page
        string = getHTML(URL)[0]
        #Add that page to a list of visited pages
        visitedpage.append(URL)
        return string
    else:
        #get the string for the current page
        string = getHTML(URL)[0]
        #Add that page to visited pages
        visitedpage.append(URL)
        #Create a list of the other links on the page
        otherlinks = getHTML(URL)[1]
        #Loop through the list of links
        for link in otherlinks:
            #If the link has not been visited yet
            if link not in visitedpage:
                #Add the page to the list of visited pages
                visitedpage.append(link)
                #Recurse on that link and add the text onto the string
                string = string + getstring(link,DEPTH - 1)
        return string
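This version depends on a module-level list named visitedpage to remember which pages have already contributed text, so that list must exist before the first call. A minimal usage sketch (the starting URL is a placeholder):

# getstring assumes this global list already exists before the first call
visitedpage = []

# Crawl one link deep from the starting page and collect all of its text
allText = getstring('http://example.com/index.html', 1)
print(len(allText.split()), 'words collected from', len(visitedpage), 'pages')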
Example #3
def write_txt_from_challonge(url, file):
    """Writes the results from a Challonge URL to TxtFile.
Challonge: a string; the URL for a Challonge.
TxtFile: a string; the name of the file to be written.
Example: WriteTxtFromChallonge('http://apex2015melee.challonge.com/singles', 'Apex 2015')"""
    file = sf.add_txt(file)
    webpage = getHTML(url)[0].replace('Mark as In Progress\n\n\nUnmark as In Progress\n\n\n\n', '') \
        .replace('\n\n\n\n\n\nEdit\n\n\nReopen', '').split('\n\n\n\nMatch Details\n\n\n\n\n\n\n')[1:]

    parsed_matches = ""

    for item in webpage:
        item = item.splitlines()
        if item[2] == "" or item[7] == "":
            continue
        # Skip this match if either numeric field parses as a negative number
        try:
            if int(item[24]) < 0:
                continue
        except (ValueError, IndexError):
            pass
        try:
            if int(item[27]) < 0:
                continue
        except (ValueError, IndexError):
            pass

        line = item[2] + "," + item[24] + "," + item[7] + "," + item[27]
        line = sf.strip_match(line)
        if line is not None and sf.parse_match(line) != "":
            parsed_matches += sf.parse_match(line) + "\n"

    with open(file, 'a') as out_file:
        out_file.write(parsed_matches)
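The sf module used above (providing add_txt, strip_match, and parse_match) is the author's own helper module and is not shown here, so the exact parsing rules are unknown. Assuming it is importable, usage follows the docstring's example:

# Append the parsed match results from the bracket to the named text file
write_txt_from_challonge('http://apex2015melee.challonge.com/singles', 'Apex 2015')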
Example #5
def URLList(url, depth):
    """ Returns a list containing url plus every url reachable from it within
        depth links, with repeated urls removed. """
    L = [url]
    if depth == 0:
        return L
    else:
        C = getHTML(url)[1]
        for x in C:
            New_L = URLList(x, depth - 1)
            L += New_L
            removeRepeats(L)
    return L
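removeRepeats is called for its side effect on L but is not defined in the submission; from the call site it appears to drop duplicate URLs from the list in place. A minimal sketch under that assumption:

def removeRepeats(urlList):
    """Remove duplicate entries from urlList in place, keeping first occurrences."""
    seen = set()
    i = 0
    while i < len(urlList):
        if urlList[i] in seen:
            del urlList[i]      # drop the repeat without advancing i
        else:
            seen.add(urlList[i])
            i += 1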
Example #6
def getListOfWords(url,DEPTH,listVisited):
    ''' Returns a list of words from the given url and from all urls on that
        page up to a certain page depth, DEPTH, that also don't appear in the
        list of already visited urls, listVisited.
    '''
    
    siteTuple = getHTML(url)
    listOfWords = siteTuple[0][1:-1].split()
    repeatURL = False
    for urlS in siteTuple[1]:
        if urlS in listVisited:
            repeatURL = True
        if DEPTH <= 0 and repeatURL == False:
            return listOfWords
        if repeatURL == False:
            listVisited = listVisited + [urlS]
            listOfWords += getListOfWords(urlS, DEPTH - 1, listVisited)
    return listOfWords
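A minimal call starts with an empty visited list (the URL is a placeholder):

# Collect words from the starting page and from pages up to one link away
words = getListOfWords('http://example.com/index.html', 1, [])
print(len(words), 'words collected')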
Example #7
    print(dictionaryString + ' }')

def printTextCloud(sortedListTuple):
    """ The printTextCloud function prints a parallel list tuple in the format
        provided in the assignment. The input is a two-tuple of lists. """
        
    print('Here is the text cloud for your web page:')
    for i in range(len(sortedListTuple[0])):
        print(sortedListTuple[0][i] + ' (' + str(sortedListTuple[1][i]) + ')')
                   
# Execution

inputUrl = input("Enter URL to analyze: ")
while inputUrl[0:7] != 'http://':
    inputUrl = input("Invalid URL. Please ensure URL begins with 'http://': ")
    
urlTuple = getHTML(inputUrl)
textString = urlTuple[0]
wordList = urlTuple[0].split()

cleanList(wordList)
stemList(wordList)
removeBoringWords(wordList)
removeNonWords(wordList)

sortedListTuple = wordSorter(wordCounter(wordList))
print('')
printDictionary(sortedListTuple)
print('')
printTextCloud(sortedListTuple)
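The execution block relies on helpers defined elsewhere in the submission (cleanList, stemList, removeBoringWords, removeNonWords, wordCounter, wordSorter, printDictionary). As one example of the assumed interface, a minimal in-place cleanList might look like this; the exact cleanup rules are an assumption:

import string

def cleanList(wordList):
    """Lowercase each word and strip surrounding punctuation, in place."""
    for i in range(len(wordList)):
        wordList[i] = wordList[i].lower().strip(string.punctuation)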
Example #8
            if url not in listOfCrawled:
                # Fetch the page once and reuse its (text, links) tuple
                pageText, pageLinks = getHTML(url)
                listOfWords = listOfWords + pageText.split()
                listOfCrawled.append(url)
                for x in pageLinks:
                    if x not in newUrlList:
                        newUrlList.append(x)
        return webCrawl(newUrlList, pagesDeep, listOfWords, listOfCrawled)

# Execution

inputUrl = input("Enter URL to analyze: ")
while inputUrl[0:7] != 'http://':
    inputUrl = input("Invalid URL. Please ensure URL begins with 'http://': ")
    

wordList = getHTML(inputUrl)[0].split()
crawledUrls = [inputUrl]
wordList = webCrawl(getHTML(inputUrl)[1], 0, wordList, crawledUrls)

cleanList(wordList)
stemList(wordList)
removeBoringWords(wordList)
removeNonWords(wordList)

sortedListTuple = wordSorter(wordCounter(wordList))
print('')
printDictionary(sortedListTuple)
print('')
printTextCloud(sortedListTuple)
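wordCounter and wordSorter are defined elsewhere in the submission; the surrounding code treats wordSorter's result as a two-tuple of parallel lists (words, counts) ordered by frequency. A minimal sketch under that assumption:

def wordCounter(wordList):
    """Return a dictionary mapping each word to how many times it appears."""
    counts = {}
    for word in wordList:
        counts[word] = counts.get(word, 0) + 1
    return counts

def wordSorter(countDict):
    """Return ([words], [counts]) as parallel lists, most frequent first."""
    pairs = sorted(countDict.items(), key=lambda pair: pair[1], reverse=True)
    words = [word for word, count in pairs]
    counts = [count for word, count in pairs]
    return (words, counts)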
Example #9
def createList(url):
    """ createList returns a list of all of the words on a given webpage."""
    return getHTML(url)[0].split()
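A minimal call (the URL is a placeholder):

# Every whitespace-separated token on the page, in order of appearance
words = createList('http://example.com/index.html')
print(words[:10])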
Example #10
def makeInput(url):
    """Take the url as input and gets the contents of the website and convert it into list"""
    return list(getHTML(url))
Example #11
def textcloud(url, depth):
    """ generates a textcloud with the given link """
    b = getHTML(url)

    # for depth 0
    if depth == 0:
        # make a list of text in first tuple
        ListofText = b[0]
        # remove punctuation, \n, uninteresting text, and numbers from the list
        NoPunct = removePunct(ListofText)
        NoN = removeN(NoPunct)
        NoUnint = removeUnint(NoN)
        NoNum = removeNum(NoUnint)
        # stemming the words
        WordList = splitLine(NoNum)
        OrdList = OrderedLenList(WordList)
        StemmedWords = checkWords(OrdList)
        FreqList = NumList(StemmedWords)
        Top50 = MaxWords(FreqList)
        # return the most frequent words
        return Top50

    # for depth 1
    elif depth == 1:
        ListofText = [getHTML(x)[0] for x in b[1]]
        StringofText = ''.join(ListofText)
        StringofText += b[0]
        # with the string of text from pages connected with a depth of 1,
        # follow the same process as before
        NoPunct = removePunct(StringofText)
        NoN = removeN(NoPunct)
        NoUnint = removeUnint(NoN)
        NoNum = removeNum(NoUnint)
        # stemming the words
        WordList = splitLine(NoNum)
        OrdList = OrderedLenList(WordList)
        StemmedWords = checkWords(OrdList)
        FreqList = NumList(StemmedWords)
        Top50 = MaxWords(FreqList)
        # return the most frequent words
        return Top50

    # technically for depth 2
    else:
        # depth 0 and 1 urls
        ListofURL1 = [url] + b[1]
        # depth 2 urls
        ListofURL2 = [getHTML(x)[1] for x in b[1]]
        # all the urls, using a helper function
        ListofAll = ListofURL1 + LoLtoL(ListofURL2)
        # removing duplicates, using another helper function
        UniqueURL = uniqify(ListofAll)
        # getting a list of texts from all the URLs
        ListofText = [getHTML(x)[0] for x in UniqueURL]
        # processing the text
        StringofText = ''.join(ListofText)
        # from here on, everything is the same
        NoPunct = removePunct(StringofText)
        NoN = removeN(NoPunct)
        NoUnint = removeUnint(NoN)
        NoNum = removeNum(NoUnint)
        # stemming the words
        WordList = splitLine(NoNum)
        OrdList = OrderedLenList(WordList)
        StemmedWords = checkWords(OrdList)
        FreqList = NumList(StemmedWords)
        Top50 = MaxWords(FreqList)
        return Top50
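LoLtoL (flatten a list of lists into one list) and uniqify (drop duplicate entries) are helper functions defined elsewhere in the submission; their behavior here is inferred from how they are used above. A minimal sketch:

def LoLtoL(listOfLists):
    """Flatten a list of lists into a single list."""
    flattened = []
    for sublist in listOfLists:
        flattened += sublist
    return flattened

def uniqify(items):
    """Return a copy of items with duplicates removed, keeping first occurrences."""
    seen = set()
    unique = []
    for item in items:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique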