def webCrawl(urlList, pagesDeep, listOfWords, listOfCrawled):
    """
    The webCrawl function is a recursive method that returns a list of
    strings from pages a given distance away from the starting URL. The
    pages do not repeat themselves. The inputs are a list of URLs, the
    "depth" or distance away from the first URL, a list of strings
    compiled from URLs that have already been crawled, and a list of
    URLs that have already been crawled.

    Test: webCrawl at DEPTH = 0 for the three test pages. Then, webCrawl
    at DEPTH = 1 for test page 2. The result should be a simple addition
    of all three pages.
    """
    if pagesDeep == DEPTH:
        return listOfWords
    else:
        pagesDeep = pagesDeep + 1
        newUrlList = []
        for url in urlList:
            if url not in listOfCrawled:
                # Fetch the page once and reuse both the text and the links
                text, links = getHTML(url)
                listOfWords = listOfWords + text.split()
                listOfCrawled.append(url)
                for x in links:
                    if x not in newUrlList:
                        newUrlList.append(x)
        return webCrawl(newUrlList, pagesDeep, listOfWords, listOfCrawled)
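
# Every snippet in this file leans on a getHTML(url) helper that is
# defined elsewhere. The sketch below is an assumption about its
# interface only: it appears to return a two-tuple of (page text, list
# of URLs found on the page). The real course-provided implementation
# may differ in its details.
from urllib.request import urlopen
from urllib.parse import urljoin
from html.parser import HTMLParser

class _LinkAndTextParser(HTMLParser):
    """Collects visible text and href links from an HTML document."""
    def __init__(self, base):
        super().__init__()
        self.base = base
        self.text = []
        self.links = []

    def handle_data(self, data):
        self.text.append(data)

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(urljoin(self.base, value))

def getHTML(url):
    """Assumed contract: returns a (page text, list of links) two-tuple."""
    html = urlopen(url).read().decode('utf-8', errors='replace')
    parser = _LinkAndTextParser(url)
    parser.feed(html)
    return (' '.join(parser.text), parser.links)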
def getstring(URL, DEPTH):
    """Retrieves the text from the given url and crawls to other pages
    based on the depth parameter, storing it all in one string.
    Input: url, a string; depth, the depth parameter
    Output: string, a string of all the text on the given webpages"""
    # Base case
    if DEPTH == 0:
        # The string is simply the text on that page
        string = getHTML(URL)[0]
        # Add that page to a list of visited pages
        visitedpage.append(URL)
        return string
    else:
        # Get the text and the list of links for the current page in one fetch
        string, otherlinks = getHTML(URL)
        # Add that page to visited pages
        visitedpage.append(URL)
        # Loop through the list of links
        for link in otherlinks:
            # If the link has not been visited yet
            if link not in visitedpage:
                # Add the page to the list of visited pages
                visitedpage.append(link)
                # Recurse on that link and add its text onto the string
                string = string + getstring(link, DEPTH - 1)
        return string
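
# getstring accumulates visited URLs in a module-level visitedpage list
# that it never resets, so the caller must (re)initialize it before each
# top-level call. A minimal usage sketch; the URL is hypothetical:
visitedpage = []
pagetext = getstring('http://example.com/', 1)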
def write_txt_from_challonge(url, file):
    """Writes the match results from a Challonge URL to a text file.
    url: a string; the URL for a Challonge bracket.
    file: a string; the name of the file to be written.
    Example: write_txt_from_challonge('http://apex2015melee.challonge.com/singles', 'Apex 2015')"""
    file = sf.add_txt(file)
    # Strip the page's progress/edit widgets, then split the remaining
    # text into one chunk per match ("Match Details" separates them)
    webpage = getHTML(url)[0] \
        .replace('Mark as In Progress\n\n\nUnmark as In Progress\n\n\n\n', '') \
        .replace('\n\n\n\n\n\nEdit\n\n\nReopen', '') \
        .split('\n\n\n\nMatch Details\n\n\n\n\n\n\n')[1:]
    parsed_matches = ""
    for item in webpage:
        item = item.splitlines()
        # Skip matches that are missing a player name
        if item[2] == "" or item[7] == "":
            continue
        # Skip matches with a negative score (e.g. a DQ recorded as -1);
        # non-numeric or missing scores fall through to the parser below
        try:
            if int(item[24]) < 0:
                continue
        except (ValueError, IndexError):
            pass
        try:
            if int(item[27]) < 0:
                continue
        except (ValueError, IndexError):
            pass
        line = item[2] + "," + item[24] + "," + item[7] + "," + item[27]
        line = sf.strip_match(line)
        if line is not None and sf.parse_match(line) != "":
            parsed_matches += sf.parse_match(line) + "\n"
    with open(file, 'a') as outfile:
        outfile.write(parsed_matches)
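
# write_txt_from_challonge depends on an external sf module (add_txt,
# strip_match, parse_match) that is not shown in this file. As an
# illustration only, add_txt presumably normalizes the file name to end
# in '.txt'; this sketch is an assumption, not the module's actual code:
def add_txt(filename):
    """Appends '.txt' to filename if it does not already end with it."""
    return filename if filename.endswith('.txt') else filename + '.txt'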
def URLList(url, depth):
    """Returns a list containing the given url plus every url reachable
    from it within the given depth, with repeats removed."""
    L = [url]
    if depth == 0:
        return L
    else:
        C = getHTML(url)[1]
        for x in C:
            New_L = URLList(x, depth - 1)
            L += New_L
        removeRepeats(L)
        return L
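
# URLList calls removeRepeats for its side effect and ignores the return
# value, so the helper presumably deduplicates the list in place. A
# minimal sketch of that assumed behavior:
def removeRepeats(L):
    """Removes duplicate entries from L in place, keeping first occurrences."""
    seen = set()
    i = 0
    while i < len(L):
        if L[i] in seen:
            del L[i]
        else:
            seen.add(L[i])
            i += 1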
def getListOfWords(url, DEPTH, listVisited):
    '''
    Returns a list of words from the given url and from all urls on that
    page up to a certain page depth, DEPTH, that also don't appear in the
    list of already visited urls, listVisited.
    '''
    siteTuple = getHTML(url)
    listOfWords = siteTuple[0][1:-1].split()
    repeatURL = False
    for urlS in siteTuple[1]:
        if urlS in listVisited:
            repeatURL = True
        # Base case: once the depth is exhausted, stop before recursing
        if DEPTH <= 0 and repeatURL == False:
            return listOfWords
        if repeatURL == False:
            listVisited = listVisited + [urlS]
            # getListOfWords already returns a list of words, so extend
            # the word list with it directly
            listOfWords += getListOfWords(urlS, DEPTH - 1, listVisited)
    return listOfWords
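
# getListOfWords threads listVisited through its recursive calls, so the
# top-level call should pass an empty list. A hypothetical usage:
words = getListOfWords('http://example.com/', 1, [])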
    print(dictionaryString + ' }')

def printTextCloud(sortedListTuple):
    """
    The printTextCloud function prints a parallel list tuple in the
    format provided in the assignment. The input is a two-tuple of lists.
    """
    print('Here is the text cloud for your web page:')
    for i in range(0, len(sortedListTuple[0])):
        print(sortedListTuple[0][i] + ' (' + str(sortedListTuple[1][i]) + ')')

# Execution
inputUrl = input("Enter URL to analyze: ")
while inputUrl[0:7] != 'http://':
    inputUrl = input("Invalid URL. Please ensure URL begins with 'http://': ")
urlTuple = getHTML(inputUrl)
textString = urlTuple[0]
wordList = urlTuple[0].split()
cleanList(wordList)
stemList(wordList)
removeBoringWords(wordList)
removeNonWords(wordList)
sortedListTuple = wordSorter(wordCounter(wordList))
print('')
printDictionary(sortedListTuple)
print('')
printTextCloud(sortedListTuple)
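
# The pipeline above mutates wordList in place through helpers defined
# elsewhere (cleanList, stemList, removeBoringWords, removeNonWords).
# A minimal sketch of the assumed in-place contract, using cleanList as
# the example; the real helper may clean differently:
import string

def cleanList(wordList):
    """Strips surrounding punctuation and lowercases each word in place."""
    for i in range(len(wordList)):
        wordList[i] = wordList[i].strip(string.punctuation).lower()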
# Execution
inputUrl = input("Enter URL to analyze: ")
while inputUrl[0:7] != 'http://':
    inputUrl = input("Invalid URL. Please ensure URL begins with 'http://': ")
wordList = getHTML(inputUrl)[0].split()
crawledUrls = [inputUrl]
wordList = webCrawl(getHTML(inputUrl)[1], 0, wordList, crawledUrls)
cleanList(wordList)
stemList(wordList)
removeBoringWords(wordList)
removeNonWords(wordList)
sortedListTuple = wordSorter(wordCounter(wordList))
print('')
printDictionary(sortedListTuple)
print('')
printTextCloud(sortedListTuple)
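
# webCrawl stops by comparing pagesDeep against a module-level DEPTH
# constant that this driver never sets, so DEPTH must be defined before
# the driver above runs. A hypothetical setting:
DEPTH = 1  # crawl one link away from the starting page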
def createList(url):
    """createList returns a list of all of the words on a given webpage."""
    return getHTML(url)[0].split()
def makeInput(url):
    """Takes the url as input, gets the contents of the website, and
    converts them into a list."""
    return list(getHTML(url))
def textcloud(url, depth):
    """ generates a textcloud with the given link """

    def process(text):
        # remove punctuation, \n, uninteresting text, and numbers
        NoPunct = removePunct(text)
        NoN = removeN(NoPunct)
        NoUnint = removeUnint(NoN)
        NoNum = removeNum(NoUnint)
        # stem the words and count their frequencies
        WordList = splitLine(NoNum)
        OrdList = OrderedLenList(WordList)
        StemmedWords = checkWords(OrdList)
        FreqList = NumList(StemmedWords)
        # keep only the most frequent words
        return MaxWords(FreqList)

    b = getHTML(url)
    # for depth 0, the text cloud uses only the first page's text
    if depth == 0:
        return process(b[0])
    # for depth 1, add the text from every page linked on the first page
    elif depth == 1:
        ListofText = [getHTML(x)[0] for x in b[1]]
        StringofText = ''.join(ListofText) + b[0]
        return process(StringofText)
    # technically for depth 2
    else:
        # depth 0 and 1 urls
        ListofURL1 = [url] + b[1]
        # depth 2 urls
        ListofURL2 = [getHTML(x)[1] for x in b[1]]
        # all the urls, using a helper function to flatten the list of lists
        ListofAll = ListofURL1 + LoLtoL(ListofURL2)
        # removing duplicates, using another helper function
        UniqueURL = uniqify(ListofAll)
        # getting the text from all the URLs and processing it as one string
        ListofText = [getHTML(x)[0] for x in UniqueURL]
        return process(''.join(ListofText))
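
# A hypothetical call to textcloud; the URL is illustrative and the
# shape of the returned value depends on MaxWords, defined elsewhere:
print(textcloud('http://example.com/', 0))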