def actualizar_diputados(contador=0):
    # Recursively scrapes the list of deputies page by page and appends
    # "name: link" lines to the output file (module globals: url, fichero).
    global url
    global fichero
    url2 = url
    if contador:
        url2 += '&paginaActual=' + str(contador)
        tipo = 'a'
    else:
        tipo = 'w'
    web = pattern.web.URL(url2)
    pagina = web.download(user_agent='Mozilla/5.0')
    soup = BeautifulSoup(pagina, 'html.parser')
    diputados = soup.findAll('span', class_='dorado')
    with open('files/' + fichero, tipo) as archivo:
        for diputado in diputados:
            nombre = diputado.previous_element.previous_element.encode('utf-8')
            enlace = 'http://www.congreso.es' + \
                diputado.previous_element.previous_element.parent['href'].encode('utf-8')
            archivo.write(nombre + ': ' + enlace + '\n')
    cuenta_total = soup.find('span', class_='sinEnlace')
    if cuenta_total:
        if cuenta_total.string != u'Página Siguiente >>':
            actualizar_diputados(contador + 1)
        else:
            tkMessageBox.showinfo('Información',
                                  'Todos los diputados han sido añadidos al fichero')
    else:
        actualizar_diputados(contador + 1)

def get_image():
    img = soup.img.parent.get('href')
    w = screen_width
    # h = screen_height
    img = website + img
    imgdw = download(img, cached=False, unicode=False)
    imgname = timestamp + "_apod.jpg"
    imgfile = open(imgname, "wb")
    imgfile.write(imgdw)
    imgfile.close()
    wall = Image.open(imgname)
    draw = ImageDraw.Draw(wall)
    FOREGROUND = (255, 255, 255)
    # font = ImageFont.truetype("sans-serif.ttf", 16)
    font_path = "/usr/share/fonts/truetype/freefont/FreeSans.ttf"
    font = ImageFont.truetype(font_path, 20, encoding='unic')
    # measure the total height of the caption block first
    y_text = 0
    for line in text:
        width, height = font.getsize(line)
        y_text += height
    y_text = screen_height - y_text
    # draw each line horizontally centred, measuring every line individually
    for line in text:
        width, height = font.getsize(line)
        draw.text(((w - width) / 2, y_text), line, font=font, fill=FOREGROUND)
        y_text += height
    # draw.text((0, 0), text, (255, 255, 255))
    wall.save(imgname)
    imgpath = directory + "/" + imgname
    uri = "'file://%s'" % imgpath
    return uri

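# The quoted 'file://...' URI returned by get_image() looks intended for a desktop
# wallpaper setting. A minimal sketch of one plausible use; the gsettings call below
# is an assumption, not from the original script:
import subprocess

wallpaper_uri = get_image()
subprocess.call(["gsettings", "set", "org.gnome.desktop.background",
                 "picture-uri", wallpaper_uri])
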
def get_all_pages(self, url):
    while True:
        try:
            html = download(url)
        except:
            time.sleep(.5)
            print "Error getting all the pages"
            continue
        break
    # get the page numbers element
    pageNumbers = html.split("class=\"pageNumbers\">")[1]
    pageNumbers = pageNumbers.split("</div>")[0]
    # get a list of all the pages
    pages = pageNumbers.split("<")[2:]
    min_page = self.get_num(self.clean_href(pages[1]))
    max_page = self.get_num(self.clean_href(pages[len(pages) - 2]))
    url_parts = self.get_base_url(self.clean_href(pages[1]))
    urls = []
    for number in range(min_page, 90 + 30, 30):
        urls.append(url_parts[0] + str(number) + url_parts[1])
    return urls

def visit(self, link, source=None):
    print 'visited:', repr(link.url), 'from:', link.referrer
    html = download(link.url)
    blob = TextBlob(plaintext(html))
    for sentence in blob.sentences:
        print(sentence.sentiment)  # .polarity

def get_html(datafile, folder):
    "stores the html for all the urls in datafile in folder."
    from pattern.web import download
    urls = get_urls_from_datafile(datafile)
    for url in urls:
        filename = url[1] + ":" + url[2] + ".html"
        with open(folder + filename, 'w') as outfile:
            outfile.write(download(url[0], cached=False))

def scrap_preceeding(base_url):
    homepage_html_content = web.download(base_url)
    homepage_soup = bsoup(homepage_html_content)
    ul_content = homepage_soup.find_all('ul')
    a_content = bsoup(str(ul_content)).find_all('a')
    volume_page_links = []
    for raw_link in a_content:
        volume_page_links.append(join(base_url, raw_link.get('href')) + '/')
    os.chdir('/home/sorkhei/Desktop/LDA-Papers/JMLR/Preceedings/')
    for base_link in volume_page_links[32:]:
        folder_name = base_link.split('/')[-2]
        address = os.path.join(os.getcwd(), folder_name)
        if not os.path.exists(address):
            os.mkdir(folder_name)
        else:
            index = 1
            while os.path.exists(address):
                folder_name = base_link.split('/')[-2] + '-' + str(index)
                print folder_name
                address = os.path.join(os.getcwd(), folder_name)
                index += 1
            os.mkdir(folder_name)
        os.chdir(address)
        print '--------------'
        print 'downloading from ' + base_link
        volume_content_soup = bsoup(web.download(base_link)).find_all('div', {'id': 'content'})
        a_content = bsoup(str(volume_content_soup)).find_all('a')
        # print a_content
        pdf_links = [join(base_link, link.get('href'))
                     for link in a_content
                     if str(link.get('href')).endswith('pdf')]
        for download_link in pdf_links:
            if not download_link.endswith('supp.pdf'):
                try:
                    content = web.download(download_link)
                except:
                    print 'link : %s is obsolete' % download_link
                    continue
                f = open(download_link.split('/')[-1], 'wb')
                f.write(content)
                f.close()
        os.chdir('/home/sorkhei/Desktop/LDA-Papers/JMLR/Preceedings/')

def run(o):
    # http://www.clips.ua.ac.be/pages/pattern-web#mail
    # should be able to do some cool stuff with the pattern libs
    print "PATTERN TEST ---------------- STARTED"
    from pattern.web import download
    html = download('http://www.clips.ua.ac.be/', cached=False)
    print html
    print "PATTERN TEST ---------------- COMPLETE"

def extract_keywords(url, model, all_keywords):
    minimum_word_index_for_unigrams = 20000  # minimum rarity of word to be considered a keyword
    try:
        text = plaintext(download(url))
        words = [word for word in tokenize.word_tokenize(text)
                 if word.isalnum()
                 and word.lower() not in corpus.stopwords.words('english')
                 and word.lower() not in url]
        for collocation, frequency in get_collocations(words):
            word_index = get_index(collocation, model)
            if word_index:
                if collocation.count('_') == 0 and word_index < minimum_word_index_for_unigrams:
                    pass
                else:
                    all_keywords[collocation] = all_keywords.get(collocation, 0) + \
                        evaluate_keyword(frequency, word_index)
    except (URLError, URLTimeout, HTTPError, HTTP403Forbidden,
            SSLError, UnicodeEncodeError, ValueError) as e:
        pass

def get_url(soup):
    urls = []
    contador = 0
    for a in soup.findAll('a'):
        html = download(a['href'], unicode=True)
        soupEspaceNet = BeautifulSoup(html)
        urlDescription = soupEspaceNet.findAll('a', {"class": "publicationLinkClass"})
        for url in urlDescription:
            urlEspace = "http://worldwide.espacenet.com/" + str(url['href'])
            urlEspace = urlEspace.replace("biblio", "description")
            urls.append(urlEspace)
            contador = contador + 1
            if (contador > 9):
                return urls
    return urls

def search_regex(title, url, regex):
    print title
    try:
        html = download(url, unicode=True)
    except URLError as e:
        print "Something happened... Error: " + str(e) + "\n"
        exit(0)
    results = re.findall(regex, html)
    if len(results) == 0:
        print "There is no news...\n"
    else:
        print "Go to " + url + " to see some news.\n"

def get_attractions_from_page(self, url, city, client):
    while True:
        try:
            html = download(url)
        except:
            print "Error Getting Attractions from page"
            time.sleep(.5)
            continue
        break
    properties = html.split("property_title\">")
    for index in range(1, len(properties)):
        attraction = self.get_attraction(properties[index])
        attraction['categories'] = self.get_attraction_details(attraction["url"], attraction["name"])
        if (len(attraction['categories']) > 0):
            client.insert({"city": city,
                           "name": attraction["name"],
                           "_id": attraction["url"],
                           "review_count": attraction["review_count"],
                           "categories": attraction["categories"]})
        else:
            print "Error on: " + attraction["name"]

def get_attraction_details(self, attraction_url, attraction_name):
    while True:
        try:
            html = download(attraction_url)
        except:
            print "Error getting attraction details"
            time.sleep(.5)
            continue
        break
    # get the descriptions of the location in the ugliest line of code I've ever written
    try:
        detailsplit = html.split("div class=\"detail\">")
        split_loc = 1 if "Neighborhood:" not in detailsplit[1] or len(detailsplit) == 2 else 2
        details = detailsplit[split_loc].split("</div>")[0]
    except IndexError:
        return []
    # split up the details into categories
    list_details = []
    jank_details = details.split(", ")
    # I'm not really sure what I'm doing here but it works
    for jank in jank_details:
        split = jank.split("<")
        if (len(split) > 1):
            title = split[1]
            title_split = title.split(">")
            if (len(title_split) > 1):
                clean_title = title_split[1]
                list_details.append(clean_title)
    return list_details

def run(self):
    while True:
        try:
            logging.info("%s : Waiting for surname" % self.getName())
            surname = self.queue.get()
            row_data = []
            for j, town in enumerate(towns):
                logging.info("%s : %s \t #%s \t %s" % (self.getName(), surname, j, town))
                url = "http://yellowbook.intelius.com/results.php?ReportType=34&refer=2464&adword=RP&qar=off&qc=%s&qdma=off&qi=0&qk=6&qn=%s&qs=NJ" % (town, surname)
                try:
                    html = unicode(download(url), "latin-1").encode('ascii', 'replace')
                except KeyboardInterrupt:
                    sys.exit()
                except:
                    logging.info("Failed.")
                    continue
                dom = DOM(html)
                for result in dom('div.cobrand_wp_multiresult_record'):
                    name = plaintext(result('div.cobrand_wp_multiresult_record_name:first-child')[0].content)
                    addr = plaintext(result('div.cobrand_wp_multiresult_record_address')[0].content).replace("\n", " ")
                    phone = plaintext(result('div.cobrand_wp_multiresult_record_phone')[0].content)
                    row_data.append({"name": name, "addr": addr, "phone": phone})
            df = pd.DataFrame(row_data)
            df.to_csv("%s%s.csv" % (OUTPUT_FOLDER, surname), index=False)
            logging.info("%s : Processing surname %s complete" % (self.getName(), surname))
        except:
            pass

from pattern.en import quantify

print(quantify({'strawberry': 200, 'peach': 15}))
print(quantify('orange', amount=1200))

# ## Pattern Library Functions for Data Mining

# For macOS SSL issue when downloading file(s) from external sources
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

# ### Accessing Web Pages

from pattern.web import download

page_html = download('https://en.wikipedia.org/wiki/Artificial_intelligence', unicode=True)

from pattern.web import URL, extension

page_url = URL('https://upload.wikimedia.org/wikipedia/commons/f/f1/RougeOr_football.jpg')
file = open('football' + extension(page_url.page), 'wb')
file.write(page_url.download())
file.close()

# ### Finding URLs within Text

from pattern.web import find_urls

print(find_urls('To search anything, go to www.google.com', unique=True))

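# A possible follow-up step (not part of the original notebook): reduce the page
# downloaded above to readable text with pattern.web.plaintext. Minimal sketch,
# reusing the page_html variable from the cell above:
from pattern.web import plaintext

page_text = plaintext(page_html)
print(page_text[:500])  # first 500 characters of the article text
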
def visit(self, link, source=None):
    print 'visited:', repr(link.url), 'from:', link.referrer
    html = download(link.url)
    blob = TextBlob(plaintext(html))
    print blob.noun_phrases

# Extracting noun phrases from a local file
path = '/Users/sharattadimalla/github/vcu/assignment/'
file_name = 'President_Obama_Speech_2004.txt'
with open(path + file_name, 'rU') as file:
    content = file.readlines()
blob = TextBlob(str(content))
print "Static Text Noun Phrases"
print blob.noun_phrases

# Extracting noun phrases from the web
weblink = 'http://www.americanrhetoric.com/speeches/convention2004/barackobama2004dnc.htm'
html = download(weblink)
blob1 = TextBlob(plaintext(html))
print "Web Page Content Noun Phrases"
print blob1.noun_phrases

# Extracting noun phrases from a PDF document
url = URL('http://www.bellevue.edu/student-support/career-services/pdfs/resume-samples.pdf')
pdfcontent = PDF(url.download())
blob2 = TextBlob(pdfcontent)
print blob2.noun_phrases

def visit(self, link, source=None):
    print 'visited:', repr(link.url), 'from:', link.referrer
    print plaintext(download(link.url))

def get(self, route):
    parser = reqparse.RequestParser()
    parser.add_argument('direction', type=str, required=False, help='Direction of the route')
    parser.add_argument('stop', type=int, required=False, help='Stop number')
    parser.add_argument('latitude', type=float, required=False, help='Latitude to sort results by')
    parser.add_argument('longitude', type=float, required=False, help='Longitude to sort results by')
    args = parser.parse_args()
    url = "http://www.ltconline.ca/WebWatch/UpdateWebMap.aspx?u={}"
    try:
        html = download(url.format(route), timeout=60, cached=False)
    except (HTTP404NotFound, URLTimeout) as ex:
        msg = "LTC WebWatch service looks down ({})"
        return {
            "message": msg.format(repr(ex)),
            "status": 408,
        }, 408
    timestamp, main_stops, info_text, minor_stops = html.split("*")
    stops_lines = (main_stops + minor_stops).split(";")
    stops = []
    for stop_line in stops_lines:
        stop_line_split = stop_line.split("|")
        if len(stop_line_split) == 7:
            (latitude, longitude, name, direction,
             stop_number, times_text) = stop_line_split[:6]
            stop_number = int(stop_number.replace("Stop Number ", ""))
            times = []
            for time_text in times_text.split("<br>"):
                time_text_splits = time_text.split(" TO ")
                if len(time_text_splits) == 2:
                    time, destination = time_text_splits
                    destination = destination.strip()
                    route_time = unicode(route)
                    if destination.startswith(route_time):
                        destination_split = destination.split(" ", 1)
                        route_time, destination = destination_split
                    times.append({
                        "time": time,
                        "destination": destination.title(),
                        "route": route_time,
                    })
            direction = direction.lower()
            if ((not args["stop"] or args["stop"] == stop_number) and
                    (not args["direction"] or
                     (direction.startswith(args["direction"].lower()) or
                      args["direction"].lower() == direction))):
                stop = {
                    "latitude": float(latitude),
                    "longitude": float(longitude),
                    "name": name.title(),
                    "direction": direction.title(),
                    "number": stop_number,
                    "times": times,
                    "route": route,
                }
                if args["latitude"] and args["longitude"]:
                    stop_location = stop["latitude"], stop["longitude"]
                    request_location = args["latitude"], args["longitude"]
                    distance_obj = distance(stop_location, request_location)
                    stop.update({
                        "distance": {
                            "meters": distance_obj.m,
                            "miles": distance_obj.miles,
                        }
                    })
                stops.append(stop)
    if stops and args["latitude"] and args["longitude"]:
        stops.sort(key=lambda x: x["distance"]["meters"])
    return stops

def visit(self, link, source=None):
    print 'visited:', repr(link.url), 'from:', link.referrer
    html = download(link.url)
    blob = TextBlob(plaintext(html))
    for sentence in blob.sentences:
        print(sentence.sentiment.polarity)

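# The visit() methods above are written as overrides for pattern.web's Crawler.
# A minimal sketch of how such a crawler could be wired up; the start URL, delay
# and class name here are illustrative assumptions, not from the original code:
from pattern.web import Crawler, DEPTH, download, plaintext
from textblob import TextBlob

class SentimentCrawler(Crawler):
    def visit(self, link, source=None):
        html = download(link.url)
        blob = TextBlob(plaintext(html))
        for sentence in blob.sentences:
            print(sentence.sentiment.polarity)

crawler = SentimentCrawler(links=['http://www.clips.ua.ac.be/'], delay=3)
while not crawler.done:
    crawler.crawl(method=DEPTH)
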
            screenh = screenh.split("+")
            screenh = int(screenh[0])
            i += 1
        else:
            break
    return (screenw, screenh)

screen_width, screen_height = get_screensize()

# Set timestamp format
timestamp = time.strftime('%Y%m%d')

# set url for download and download current website, no cache
website = "http://apod.nasa.gov/apod/"
html = download(website + "astropix.html", cached=False, unicode=True)
soup = BeautifulSoup(html, "html.parser")

def get_explanation(soup):
    explanation = soup.find("b", string=" Explanation: ")
    explanation = str(explanation.parent)
    explanation2 = BeautifulSoup(explanation, "html.parser")
    explanation2 = explanation2.p.center
    explanation2 = str(explanation2)
    text = explanation.replace(explanation2, "")
    text = plaintext(text, linebreaks=2, indentation=True)
    text = str(text).replace("\n", "")

def get_dom(self):
    return web.DOM(web.download(self.url, cached=True))

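# A minimal usage sketch for a DOM built this way; the wrapper class, URL and CSS
# selector below are illustrative assumptions, not part of the original code:
from pattern import web

class Page(object):
    def __init__(self, url):
        self.url = url

    def get_dom(self):
        return web.DOM(web.download(self.url, cached=True))

dom = Page('http://www.clips.ua.ac.be/').get_dom()
for a in dom('a'):  # CSS-style selection of every link element
    print(a.attributes.get('href'))
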