def _dependency_hrefs(page):
    """Return the first link target of each ``<dl>`` entry in the dependency list.

    The list is taken from the *second* ``<ul class="uldep">`` occurrence in
    *page* up to the next ``</ul>`` (the first occurrence is skipped, exactly
    as the original code did -- presumably it is a legend; TODO confirm).
    Entries without a complete ``<a href="...">`` are ignored instead of
    producing a junk link. An empty list is returned when the section or its
    closing tag is missing.
    """
    list_open = '<ul class="uldep">'
    href_open = '<a href="'
    entry_close = '</dl>'

    first = page.find(list_open)
    if first == -1:
        return []
    section_start = page.find(list_open, first + len(list_open))
    if section_start == -1:
        return []
    section_end = page.find('</ul>', section_start)
    if section_end == -1:
        return []
    section = page[section_start:section_end]

    hrefs = []
    pos = 0
    while True:
        entry_start = section.find('<dl>', pos)
        entry_end = section.find(entry_close, pos)
        if entry_start == -1 or entry_end == -1:
            # No further complete entry; a -1 must never be used as a slice index.
            break
        entry = section[entry_start:entry_end]
        link_start = entry.find(href_open)
        link_end = entry.find('">', link_start) if link_start != -1 else -1
        if link_end != -1:
            hrefs.append(entry[link_start + len(href_open):link_end])
        pos = entry_end + len(entry_close)
    return hrefs


def resolve(url, depends, recommends, suggests):
    """Crawl package pages starting at *url*, following dependency links.

    Parameters:
        url        -- page URL the crawl starts from.
        depends    -- dependency links are followed only when this equals 1.
        recommends -- currently unused (see TODO below).
        suggests   -- currently unused (see TODO below).

    Returns the list of fetched URLs in the order they were fetched, or -1
    (after printing an error) as soon as ``netfunc.get_page`` returns an
    empty string for any page.
    """
    print('\nDownloading (and creating) list of dependecies. Please wait...')

    site = 'http://packages.ubuntu.com'
    queued = set([url])   # every URL ever queued, so nothing is fetched twice
    tocrawl = [url]       # used as a stack: the most recently found link is fetched next
    crawled = []

    while tocrawl:
        popped = tocrawl.pop()
        page = netfunc.get_page(popped)
        # A URL enters the stack at most once (guarded by `queued`), so it can
        # never already be present in `crawled` here.
        crawled.append(popped)

        if page == '':
            print('Error: Could not connect to http://packages.ubuntu.com/')
            print('Script will exit now...')
            return -1

        # TODO: suggests and recommends
        if depends == 1:
            for href in _dependency_hrefs(page):
                link = site + href
                if link not in queued:
                    queued.add(link)
                    tocrawl.append(link)

    return crawled
def _href_targets(text, opener):
    """Return, in document order, every link target in *text* that follows *opener*.

    *opener* is the markup up to and including the opening quote of the
    ``href`` value (for example ``'<td><a href="'``); each target ends at the
    next ``'">'``.
    """
    closer = '">'
    targets = []
    start = text.find(opener)
    while start != -1:
        end = text.find(closer, start)
        if end == -1:
            # Unterminated attribute (e.g. truncated page). The original
            # re-sliced to the same string here and never terminated.
            break
        targets.append(text[start + len(opener):end])
        start = text.find(opener, end)
    return targets


def gen_download_url(url, flag):
    """Collect download links from the ``pdownload`` section of the page at *url*.

    When *flag* is ``'sourcenames'`` the result is the list of all
    ``<td><a href="...">`` targets of that section, as found (possibly empty).

    Otherwise *flag* is treated as a path component: the ``<th><a href="...">``
    targets are prefixed with ``http://packages.ubuntu.com``, the first one
    containing ``'/<flag>/'`` -- or, failing that, the first containing
    ``'/all/'`` -- is resolved through ``choose_url_mirror`` and returned as
    a one-element list. If none matches, the integer 1 is returned.
    """
    page = netfunc.get_page(url)
    part_of_page = stripstr.part_of_str(
        page, '<div id="pdownload">', '</div> <!-- end pdownload -->')

    # NOTE(review): `results` in this file compares part_of_str's result with
    # 1, which suggests 1 is its "not found" sentinel -- confirm against
    # stripstr. Without this guard a missing section raises AttributeError.
    if part_of_page == 1:
        return [] if flag == 'sourcenames' else 1

    if flag == 'sourcenames':
        return _href_targets(part_of_page, '<td><a href="')

    start_of_url = 'http://packages.ubuntu.com'
    candidates = [start_of_url + target
                  for target in _href_targets(part_of_page, '<th><a href="')]

    # Prefer a link for the requested flag; fall back to the '/all/' link.
    for wanted in ('/' + flag + '/', '/all/'):
        for item in candidates:
            if wanted in item:
                return [choose_url_mirror(item)]
    return 1
def choose_url_mirror(url):
    """Return the first mirror link listed on the download page at *url*.

    The page is narrowed to the text between
    ``'<p>You can download the requested file from the '`` and
    ``'<div id="pdownloadnotes">'``; the target of the first
    ``<li><a href="...">`` inside that text is returned.

    An empty string is returned when no complete link is present (the
    original built a meaningless substring from -1 indices in that case).
    """
    page = netfunc.get_page(url)
    part_of_page = stripstr.part_of_str(
        page,
        '<p>You can download the requested file from the ',
        '<div id="pdownloadnotes">')

    # NOTE(review): `results` in this file compares part_of_str's result with
    # 1, which suggests 1 is its "not found" sentinel -- confirm against
    # stripstr.
    if part_of_page == 1:
        return ''

    link_open = '<li><a href="'
    start = part_of_page.find(link_open)
    if start == -1:
        return ''
    end = part_of_page.find('">', start)
    if end == -1:
        return ''
    return part_of_page[start + len(link_open):end]
def release():
    """List the releases found on http://packages.ubuntu.com/ and ask for one.

    Release names are the ``<li><a href="NAME/">`` targets between the
    "Browse through the lists of packages" heading and the next ``</ul>``;
    the pseudo-release ``'all'`` is always appended. Each entry is printed
    with its positive and negative list index, either of which may be typed.

    Returns the chosen release name, or -1 (after printing an error) when
    the page could not be fetched or the typed value is not a valid index.
    """
    page = netfunc.get_page('http://packages.ubuntu.com/')
    if page == '':
        print('Error: Could not connect to http://packages.ubuntu.com/')
        print('Script will exit now...')
        return -1

    heading = '<h2>Browse through the lists of packages:</h2>'
    link_open = '<li><a href="'
    link_close = '/">'

    # Slice out the list that follows the heading. If either marker is
    # missing the section is empty rather than a slice built from -1.
    section = ''
    heading_loc = page.find(heading)
    if heading_loc != -1:
        list_end = page.find('</ul>', heading_loc)
        if list_end != -1:
            section = page[heading_loc + len(heading):list_end]

    # Walk the links themselves instead of relying on the remaining length:
    # the old `len(part_page) > 50` test appended junk for long tails and
    # could drop a short final entry.
    releases = []
    pos = section.find(link_open)
    while pos != -1:
        end = section.find(link_close, pos)
        if end == -1:
            break
        releases.append(section[pos + len(link_open):end])
        pos = section.find(link_open, end)
    releases.append('all')

    print('\nList of available releases:')
    total = len(releases)
    for index, item in enumerate(releases):
        print(' [ %s ] or [ %s ] %s'
              % (str(index).rjust(2), str(index - total).rjust(3), item))

    # Python 2's input() evaluates whatever is typed; read plain text and
    # convert it explicitly instead.
    try:
        read_line = raw_input   # Python 2
    except NameError:
        read_line = input       # Python 3: input() already returns text

    try:
        return releases[int(read_line('Enter release number = '))]
    except (ValueError, IndexError, EOFError):
        print('Invalid release number input!')
        print('Program will exit now...')
        return -1
def results(url, sflag):
    """Print the search results found at *url* and let the user pick one.

    Each result is printed as ``N. name: [link text]category description``.
    Headings are matched as ``<h3>Source Package `` when *sflag* is
    ``"sourcenames"`` and as ``<h3>Package `` otherwise; category and
    description are separated by ``": "`` and a newline respectively.

    Returns ``"<link text>/<package name>"`` for the chosen entry, or 1
    (after printing an error) when the typed value is not a valid number.

    NOTE(review): every iteration pairs the next heading with the next
    ``resultlink`` anchor, so this assumes one result link per heading --
    confirm against the pages actually requested.
    """
    page = netfunc.get_page(url)
    part_page = stripstr.part_of_str(page, '<div id="psearchres">', "</div>")

    if sflag == "sourcenames":
        heading = "<h3>Source Package "
        field_sep = ": "
    else:
        heading = "<h3>Package "
        field_sep = "\n"

    link_open = '<a class="resultlink" href="'
    anchor_close = "</a>"
    terminators = ('[<strong class="', "<br>")

    pac_list = []
    i = 0
    while True:
        pac_name = stripstr.part_of_str(part_page, heading, "</h3>")
        if pac_name == 1:
            break

        loc1 = part_page.find(link_open)
        if loc1 == -1:
            break
        rest = part_page[loc1:]

        # The entry ends at whichever terminator comes first. Only
        # terminators that are actually present may compete: the original
        # compared raw find() results, so an absent pattern (-1) "won".
        present = [(rest.find(mark), mark) for mark in terminators if mark in rest]
        if not present:
            break
        loc3, pat3 = min(present)

        tempstr = stripstr.part_of_str(rest, '">', pat3)
        if tempstr == 1:
            break

        loc4 = tempstr.find(anchor_close)
        link_text = tempstr[:loc4]

        # [category, description]; the description may be missing.
        cat_desc = tempstr[loc4 + len(anchor_close):].split(field_sep)
        pac_category = cat_desc[0]
        pac_desc = cat_desc[1].strip() if len(cat_desc) > 1 else ""

        info = pac_name + ": [" + link_text + "]" + pac_category + " " + pac_desc
        print(str(i) + ". " + info.rstrip("\n"))
        pac_list.append(link_text + "/" + pac_name)

        # Continue scanning after the terminator of this entry.
        part_page = part_page[loc1 + loc3:]
        i += 1

    # Python 2's input() evaluates whatever is typed; read plain text and
    # convert it explicitly instead.
    try:
        read_line = raw_input   # Python 2
    except NameError:
        read_line = input       # Python 3: input() already returns text

    try:
        selected = int(read_line("Enter the package number to download = "))
    except (ValueError, EOFError):
        selected = -1   # falls through to the "Invalid input!" branch

    if 0 <= selected < len(pac_list):
        return pac_list[selected]
    print("Invalid input!")
    print("Program will exit now...")
    return 1