from pattern import web
from pattern.web import URL, Element

html = URL('https://docs.google.com/spreadsheets/d/1J2I40hglES63YZHROcOL3oAjDPqiiKLRPE_ikAWsR-Q/pubhtml?gid=1267634591').read()
dom = Element(html)
dom = dom.by_tag('tbody')[0]

# Get the date from the header.
date = dom.by_class('s0')[1].content

# Read the place from each available class='s4' inside <td>.
places = []
for ix in dom.by_class('s4'):
    places.append(ix.content)

try:
    reading_row = [4, 10, 16, 22]
    pol_reading = []
    for row in reading_row:
        reading = dom.by_tag('tr')[row]
        reading = reading.by_tag('td')
        for i in reading:
            if len(i) >= 1:
                pol_reading.append(i.content)
    pol_updated_row = [5, 11, 17, 23]
    pol_updated = []
import os, sys
sys.path.insert(0, os.path.join("..", "..", ".."))

from pattern.web import URL, Document, plaintext
from pattern.web import NODE, TEXT, COMMENT, ELEMENT, DOCUMENT

# The web module has a number of convenient search engines,
# but often you will need to handle the HTML in web pages of your interest manually.
# The Document object can be used for this, similar to the JavaScript DOM.

# For example:
url = URL("http://www.reddit.com/top/")
dom = Document(url.download(cached=True))
for e in dom.get_elements_by_tagname("div.entry")[:5]:      # Top 5 reddit entries.
    for a in e.get_elements_by_tagname("a.title")[:1]:      # First <a class="title"> in entry.
        print plaintext(a.content)
        print a.attributes["href"]
        print

# The Document object is a tree of Element and Text objects.
# All objects inherit from Node; Document also inherits from Element.

# Node.type     => NODE, TEXT, COMMENT, ELEMENT, DOCUMENT
# Node.parent   => Parent Node object.
# Node.children => List of child Node objects.
# Node.next     => Next Node in Node.parent.children.
# Node.previous => Previous Node in Node.parent.children.

# Document.head => Element with tag name "head".
# Document.body => Element with tag name "body".
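# A minimal sketch of walking that tree recursively, using only the Node.type
# and Node.children attributes listed above:
from pattern.web import Document

def walk(node, depth=0):
    # Print an indented outline of the node types in the tree.
    print("  " * depth + str(node.type))
    for child in node.children:
        walk(child, depth + 1)

walk(Document("<html><body><p>Hello <b>world</b></p></body></html>"))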
finder.apply_word_filter(filter_ngrams)
finder.nbest(bigram_measures.pmi, 10)
print scored

## readability
import pprint
printp = pprint.PrettyPrinter()
printp.pprint(text)

## Pattern
from pattern.web import URL, plaintext, GET
url = URL(url, method=GET)
text = url.download()
simple_text = plaintext(text, indentation=True)

#### Scoring:
scored = finder.score_ngrams(bigram_measures.likelihood_ratio)

# Group bigrams by first word in bigram.
prefix_keys = collections.defaultdict(list)
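# A quick offline check of plaintext() as used above: it strips the tags and
# keeps the block-level text, no download required.
from pattern.web import plaintext

html = "<div><h1>Title</h1><p>Some <b>bold</b> text.</p></div>"
print(plaintext(html))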
def extract_tvseries(dom):
    # Declare the observed URL.
    url = URL(TARGET_URL)
    # Download the URL and parse the source code.
    # (Note: this shadows the dom argument with a fresh download.)
    dom = DOM(url.download(cached=True))
    # Initialize a list for the data we need.
    data = []
    # Iterate over the entries in the list of tv series.
    for e in dom('div[class="lister-item-content"]'):
        # Look for the title and accept all symbols.
        title = e('a')[0].content.encode("utf-8")
        # Look for the rating.
        rating = e('div[class="inline-block ratings-imdb-rating"] strong')[0].content
        # Initialize a list for genres.
        genre = []
        # Count the number of genres.
        alengte = len(e('span[class="genre"]'))
        # Iterate over all genres.
        for a in range(alengte):
            # Look for genres and accept all symbols.
            genres = e('span[class="genre"]')[a].content.encode("utf-8")
            # Add the new genre to the list of genres.
            genre.append(genres)
        # Remove extra whitespace and join the list with commas in between.
        stripped_genres = ",".join([s.strip() for s in genre])
        # Initialize a list for actors.
        actors = []
        # Count the number of links; the actors start at position 12.
        blengte = len(e('a'))
        for b in range(12, blengte):
            # Look for an actor and make sure all symbols are accepted.
            actor = e('a')[b].content.encode("utf-8")
            # Add the new actor to the list.
            actors.append(actor)
        # Join the actors with commas in between.
        stripped_actors = ", ".join(actors)
        # Look for the runtime.
        runtime = e('span[class="runtime"]')[0].content
        # Keep the digits, throw away 'min'.
        runtime_digits = ''.join([i for i in runtime if i.isdigit()])
        # Add the new data to the data list.
        data.append((title, rating, stripped_genres, stripped_actors, runtime_digits))
    # Return the collected data.
    return data
        col_len=2)
    xls_set_class.open_excel_and_process_block_data()
    xls_set_class.data_label_list

    if choice == 7:
        """ Use the price extract here to pull. """

    if choice == 8:
        """
        Get the short sell volume.
        Need to get the short sell vs the volume traded that day.
        The URL needs to change with the current date.
        """
        target_url = 'http://sgx.com/wps/wcm/connect/sgx_en/home/market_info/short_sale/short_sale_daily/DailyShortSell20150417.txt'
        url = URL(target_url)
        url_data = url.download(timeout=50)
        shortsell_list = pandas.io.html.read_html(url_data)
        shortsell_df = shortsell_list[1]
        # Need to remove the first rows.
        shortsell_df.rename(columns={
            0: 'Security',
            1: 'Short Sale Volume',
            2: 'Currency',
            3: 'Short Sale Value',
        }, inplace=True)
        shortsell_df = shortsell_df[1:-3]

    if choice == 9:
        """ Combine the short sell with the current price. """
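# The short-sell URL above embeds a date stamp; a sketch of generating it for
# today's date (the DailyShortSell<YYYYMMDD>.txt filename pattern is inferred
# from the single example above):
import datetime

datestamp = datetime.date.today().strftime('%Y%m%d')
target_url = ('http://sgx.com/wps/wcm/connect/sgx_en/home/market_info/'
              'short_sale/short_sale_daily/DailyShortSell%s.txt' % datestamp)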
except:
    csv = Datasheet()
    seen = set()

for (level, bias, label, name), url in sources.items():
    try:
        f = Newsfeed()
        f = f.search(url, cached=False)
    except:
        continue

    for r in f:
        # 1) Download source & parse the HTML tree:
        try:
            src = URL(r.url).download(cached=True)
            dom = DOM(src)
        except Exception as e:
            continue

        # 2) Find article text w/ CSS selectors:
        for selector in (
            "article[class*='node-article']",         # The Hill
            "span[itemprop='articleBody']",
            "div[itemprop='articleBody']",
            "div[id='rcs-articleContent'] .column1",  # Reuters
            "div[class='story-body']",
            "div[class='article-body']",
            "div[class='article-content']",
            "div[class^='tg-article-page']",
            "div[class^='newsArticle']",
r_url = redis.StrictRedis(host='localhost', port=6379, db=0)
r_text = redis.StrictRedis(host='localhost', port=6379, db=1)

# c = 0 # debug counter
# output = open('test_output_redis.txt', 'w') # deprecated, was for debug

while True:
    # Take a url and add it to the redis URL store WITH an expire time of EXPIRE_IN seconds.
    # If the result of the redis INCR command is > 1, the URL was already there (but we still updated its TTL);
    # in that case we should also resolve the url semantically (this should be another beanstalk tube, another job?).
    # Resolving the url means: fetch it in pattern and check the mimetype to ensure we only parse text-containing stuff,
    # then use pattern to get chunks and noun phrases and shove them in another redis store
    # (where the key is the phrase, and the value is just INCR?).
    job = beanstalk.reserve()  # This is blocking: waits till there's something on the stalk.
    url = URL(job.body)
    pipe = r_url.pipeline(transaction=True)
    redis_response = pipe.incr(url).expire(url, EXPIRE_IN).execute()
    # Should I be updating the TTL? Experience-design question more than anything.
    # print redis_response
    if redis_response[0] < 2:
        print 'new url, we think', url
        try:
            s = url.download(cached=True)
            print url.mimetype
            if (url.mimetype in MIMETYPE_WEBPAGE) or (url.mimetype in MIMETYPE_PLAINTEXT):
                s = plaintext(s)
                ''' parsetree(string,
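# A standalone sketch of the INCR + EXPIRE dedup trick described above:
# INCR returns the updated counter, so a value of 1 means the key is new;
# the TTL is refreshed on every sighting (assumes a local redis server,
# and the helper name is hypothetical).
import redis

def seen_before(r, key, expire_in=3600):
    count = r.incr(key)
    r.expire(key, expire_in)
    return count > 1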
#print books(1)
#print books(2)

# We can use Chrome's Developer Tools to inspect the HTML of the overview page.
# It turns out each link to each book is contained in a <div class="prod"> element.
# In Pattern, the DOM (Document Object Model) is a tree of nested HTML elements,
# along with useful methods to traverse and search the tree.
# http://www.clips.ua.ac.be/pages/pattern-web#DOM
# It is easy to fetch each <div class="prod">:

corpus = Datasheet()

for i in range(45):  # How many pages?
    url = books(i + 1)
    url = URL(url)
    html = url.download(cached=True)  # Cache the HTML source locally.
    for product in DOM(html).by_class("prod"):
        #print product.source
        # The link to each book page looks something like:
        # http://www.amazon.fr/dieux-voyagent-toujours-incognito/dp/2266219154/
        a = product.by_tag("a")[0]
        a = a.attributes["href"]
        #print a
        # After some searching with Chrome,
        # I found that there is a page with 10 reviews about this book:
        # http://www.amazon.fr/product-reviews/2266219154/
        # So we want to parse the book id from the first link and mine its reviews page:
        id = a.split("/")[-2]
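# A self-contained sketch of the by_class() / by_tag() / attributes calls used
# above, run on a static HTML string instead of a live Amazon page:
from pattern.web import DOM

html = '<div class="prod"><a href="http://www.amazon.fr/x/dp/2266219154/">book</a></div>'
for product in DOM(html).by_class("prod"):
    a = product.by_tag("a")[0]
    print(a.attributes["href"].split("/")[-2])  # The book id: 2266219154.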
import csv
from pattern.web import URL, DOM, plaintext
from collections import defaultdict

filterlist = ["1", "2", "3", "4", "5", "6", "7", "8", "9", "0", "/", "-"]
playerlist = list()
players = ["3975/stephen-curry", "9/ray-allen", "552/reggie-miller", "841/jason-terry",
           "662/paul-pierce", "429/jason-kidd", "136/vince-carter", "165/jamal-crawford",
           "63/chauncey-billups", "2011/kyle-korver", "469/rashard-lewis",
           "813/peja-stojakovic", "1007/joe-johnson", "110/kobe-bryant"]
htmllink = "http://espn.go.com/nba/player/stats/_/id/"
output_file = open('total3pointers.json', 'w')

for player in players:
    TARGET_URL = URL(htmllink + player)
    dom = DOM(TARGET_URL.download(cached=True))
    dataofyear = dict()
    print player
    total3pointers = 0
    for e in dom.by_tag("div.mod-container mod-table mod-player-stats"):
        for a in e.by_tag("div.mod-content")[1:2]:
            for tablehead in a.by_class("tablehead"):
                year = -1
                for oddrow in tablehead.by_class("oddrow"):
                    madeshots = oddrow[4].content[:3]
                    madeshots = int(madeshots.replace("-", ""))
                    total3pointers += int(madeshots)
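# Note: the [:3] slice above mis-parses a cell like "5-12" (yielding 51).
# A safer parse of an ESPN "made-attempted" stats cell might be this sketch
# (the helper name is hypothetical):
def made_shots(cell):
    made, _, attempted = cell.partition("-")
    return int(made)

print(made_shots("272-600"))  # 272
print(made_shots("5-12"))     # 5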
url = link
r = requests.get(url)
soup = BeautifulSoup(r.content)
gdata = soup.find_all('div', {'class': 'row'})

for g in gdata:
    baselink = 'http://www.txlottery.org'
    try:
        if re.findall('[0-9]{2} [A-Za-z]{3} [0-9]{2}', str(g)):
            date = re.findall('[0-9]{2} [A-Za-z]{3} [0-9]{2}', str(g))
    except:
        pass
    for link in g('a'):
        try:
            if re.search('pdf$', str(link.get('href')), flags=re.IGNORECASE):
                link = baselink + link.get('href')
                pdfurl = URL(link)
                pdfext = '/Users/macuser/Desktop/smithpdf' + link[link.rfind('/'):]
                pdfext = re.sub('[!@#\$%\^&*]', '', str(pdfext))
                f = open(pdfext, 'wb')
                f.write(pdfurl.download(cached=False))
                f.close()
                links.append(link)
                callThis = ('pdftotext ' + pdfext + ' ' +
                            '/Users/macuser/Desktop/smithtxt' + pdfext[pdfext.rfind('/'):-4] + '.txt')
                subprocess.call(callThis, shell=True)
        except:
            logging.exception('')
            pass

patht = '/Users/macuser/Desktop/smithtxt'
pathp = '/Users/macuser/Desktop/smithpdf'
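# The shell string above can break on paths containing spaces; an equivalent
# argument-list call (no shell) for the same pdftotext conversion, reusing
# the pdfext path built in the loop, would be:
txtpath = '/Users/macuser/Desktop/smithtxt' + pdfext[pdfext.rfind('/'):-4] + '.txt'
subprocess.call(['pdftotext', pdfext, txtpath])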
    self.load_stock_symbol_fr_file()


if __name__ == '__main__':
    print "start processing"
    choice = 2

    if choice == 1:
        """ Try the download format of YQL. """
        url_address = 'https://query.yahooapis.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.finance.keystats%20WHERE%20symbol%3D%27BN4.SI%27&format=json&diagnostics=true&env=store%3A%2F%2Fdatatables.org%2Falltableswithkeys&callback='
        savefile = r'c:\data\temptryyql.json'

        cache.clear()
        url = URL(url_address)
        f = open(savefile, 'wb')  # Save the raw JSON response.
        f.write(url.download())
        f.close()

    if choice == 2:
        """
        Handling the JSON file.
        To include the multiple keys per record --> use w['query']['results']['stats'].keys()
        """
        savefile = r'c:\data\temptryyql.json'
        w = json.load(open(r'c:\data\temptryyql.json', 'r'))
        com_data_stock_list = list()
        for indivdual_set in w['query']['results']['stats']:
            temp_dict_data = {}
#--- DOWNLOAD & UPDATE -------------------------------------------------------------------------------
# We include some useful pages (Penn Treebank tags, stop words) referenced in the documentation.

for p in ("-", "-web", "-db", "-en", "-es", "-de", "-nl", "-search", "-vector", "-graph",
          "-canvas", "-metrics", "-shell", "stop-words", "mbsp-tags", "-dev"):
    if p.startswith("-"):
        p = "pattern" + p.rstrip("-")
    title = p.replace("-", ".")
    if p == "stop-words":
        title = "Stop words"
    if p == "mbsp-tags":
        title = "Penn Treebank II tag set"
    # Download the online documentation pages.
    print "Retrieving", url + p
    html = URL(url + p).download(cached=False)
    # Parse the actual documentation; we don't need the website header, footer, navigation, search.
    html = Document(html)
    html = html.by_id("content-area")
    html = html.by_class("node-type-page")[0]
    html = html.source
    html = strip_javascript(html)
    html = strip_between('<div id="navbar">', '/#navbar -->', html)
    html = strip_between('<div id="sidebar-right">', '/#sidebar-right -->', html)
    html = strip_between('<div id="footer">', '/#footer -->', html)
    html = strip_between('<a href="http://twitter.com/share"', '</a>', html)
    # Link to local pages and images.
    # Link to online media.
    html = html.replace('href="/pages/MBSP"', 'href="%sMBSP"' % url)  # MBSP docs (online)
from pattern.web import URL, DOM, extension, MIMETYPE_IMAGE
from pattern.web import Element, download
import urllib.request
import datetime

# Libraries to check: urllib (legacy vs not), pattern, requests.

url = URL("http://www.dot.ca.gov/dist1/d1tmc/allcams.php")
dom = DOM(url.download(cached=True))

i = 0
try:
    for e in dom.by_tag('img'):
        if extension(e.attr['src']) == '.jpg':
            print(e.attr['src'])
            urllib.request.urlretrieve(e.attr['src'], "data/test/urllib{0}.jpg".format(i))
            #image = download(e.attr['src'], unicode=False, timeout=5)
            #f = open("data/test/pattern{0}.jpg".format(i), 'wb')
            #f.write(image)
            i += 1
except:
    print("error")

"""
image = "http://www1.dot.ca.gov/cwwp2/data/d1/cctv/image/us101northofcushingcreeklookingsouth/us101northofcushingcreeklookingsouth.jpg"
url = URL(image)
print(url.mimetype in MIMETYPE_IMAGE)
urllib.request.urlretrieve(image, 'data/test2.jpg')
"""
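# A small helper capturing the mimetype guard from the commented block above
# (URL and MIMETYPE_IMAGE as imported at the top; the helper name is
# hypothetical):
def save_if_image(src, path):
    u = URL(src)
    if u.mimetype in MIMETYPE_IMAGE:
        with open(path, 'wb') as f:
            f.write(u.download(timeout=5))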
# The DOM object can be used for this, similar to the JavaScript DOM.
# The DOM (Document Object Model) parses a string of HTML
# and returns a tree of nested Element objects.
# The DOM elements can then be searched by tag name, CSS id, CSS class, ...

# For example, top news entries on Reddit are coded as:
# <div class="_1poyrkZ7g36PawDueRza-J s1r3zmnv-7 bmeGah">
#     ...
#     <span class="y8HYJ-y_lTUHkQIc1mdCq yj3st6-1 kYJFRo">
#         ...
#         <a class="SQnoC3ObvgnGjWt90zD9Z " href="http://i.imgur.com/yDyPu8P.jpg">Bagel the bengal, destroyer of boxes</a>
#     ...
# </div>
#
# ... which - naturally - is a picture of a cat.
url = URL("http://www.reddit.com/top/")
dom = DOM(url.download(cached=True))
#print(dom.body.content)
for e in dom.by_tag("div._1poyrkZ7g36PawDueRza-J s1r3zmnv-7 bmeGah")[:5]:  # Top 5 reddit entries.
    for a in e.by_tag("a.SQnoC3ObvgnGjWt90zD9Z")[:1]:
        print(plaintext(a.content))
        print(a.attrs["href"])
        print("")

# The links in the HTML source code may be relative,
# e.g., "../img.jpg" instead of "www.domain.com/img.jpg".
# We can get the absolute URL by prepending the base URL.
# However, this can get messy with anchors, trailing slashes and redirected URL's.
# A good way to get absolute URL's is to use the module's abs() function:
from pattern.web import abs
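# A short sketch of abs() following the note above, assuming the URL and DOM
# imports at the top of this example: resolve each (possibly relative) href
# against the page's base URL.
url = URL("http://www.reddit.com/top/")
for link in DOM(url.download(cached=True)).by_tag("a")[:5]:
    href = link.attrs.get("href", "")
    print(abs(href, base=url.redirect or url.string))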
# The pattern.web module has a number of convenient search engines, as demonstrated.
# But often you will need to handle the HTML in web pages of your interest manually.
# The DOM object can be used for this, similar to the JavaScript DOM.
# The DOM (Document Object Model) parses a string of HTML
# and returns a tree of nested Element objects.
# The DOM elements can then be searched by tag name, CSS id, CSS class, ...

# For example, top news entries on Reddit are coded as:
# <div class="entry">
#     <p class="title">
#         <a class="title " href="http://i.imgur.com/yDyPu8P.jpg">Bagel the bengal, destroyer of boxes</a>
#     ...
# </div>
#
# ... which - naturally - is a picture of a cat.
url = URL("http://www.reddit.com/top/")
dom = DOM(url.download(cached=True))
#print dom.body.content
for e in dom.by_tag("div.entry")[:5]:       # Top 5 reddit entries.
    for a in e.by_tag("a.title")[:1]:       # First <a class="title"> in entry.
        print plaintext(a.content)
        print a.attrs["href"]

# The links in the HTML source code may be relative,
# e.g., "../img.jpg" instead of "www.domain.com/img.jpg".
# We can get the absolute URL by prepending the base URL.
# However, this can get messy with anchors, trailing slashes and redirected URL's.
# A good way to get absolute URL's is to use the module's abs() function:
from pattern.web import abs
url = URL("http://nodebox.net")
for link in DOM(url.download()).by_tag("a"):
    link = link.attrs.get("href", "")
    link = abs(link, base=url.redirect or url.string)
package_categories = {}
with open(sys.argv[1], 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        package_categories[row[0]] = row[1]

package_titles = [x.replace(".apk", "").replace("\n", "")
                  for x in open(sys.argv[2]).readlines()]

counter = 0
for package_title in package_titles:
    if package_title in package_categories:
        continue
    try:
        url = URL("https://play.google.com/store/apps/details?id=%s" % package_title)
        dom = DOM(url.download(cached=True))
        for e in dom('a.category'):
            category = e.href.split("/")[-1]
            package_categories[package_title] = category
    except:
        print "Request failed on %s" % package_title
        pass
    counter += 1
    if counter % 5 == 0:
        with open(sys.argv[1], 'wb') as f:
            writer = csv.writer(f)
            writer.writerows([(key, value) for key, value in package_categories.items()])
        print counter
import re

from pattern.web import Google, URL
from pattern.web import Document, plaintext

# An interesting experiment on how to use the Google API
# and http://amplicate.com for opinion mining.
# (let's hope we get a real Amplicate API soon!)

query = "smurf"

# An example result, containing all the information we need:
# URL:   http://amplicate.com/love/george-w-bush
# Title: <b>George</b> W <b>Bush</b> Hate - 64% People Agree (803 opinions)
for r in Google().search(query + " site:amplicate.com"):
    print r.title
    u = URL(r.url)
    if "love" in u.path \
    or "hate" in u.path:
        b = True
        p = u.page.lower().replace("-", "")
        for i, w in enumerate(query.lower().replace("-", " ").split()):
            if i == 0 and not p.startswith(w):
                b = False
                break
            if w not in p:
                b = False
                break
        if b:
            love = "love" in u.path
            f = int(re.search("- ([0-9]{1,3})%", r.title).group(1)) * 0.01
            n = int(re.search("\(([0-9]+) opinions", r.title).group(1))
            print r.title
            print r.url
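# Checking the two regular expressions against the example title quoted above
# (with the <b> tags stripped):
import re
title = "George W Bush Hate - 64% People Agree (803 opinions)"
print(int(re.search("- ([0-9]{1,3})%", title).group(1)) * 0.01)  # 0.64
print(int(re.search("\(([0-9]+) opinions", title).group(1)))     # 803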
        use_audio = 0
        if not use_audio:
            selected_video_obj = video.getbest('mp4')
            if selected_video_obj is None:
                selected_video_obj = video.getbest('flv')
            if selected_video_obj:
                download_fullpath = os.path.join(r'c:\data\temp\youtube_videos',
                                                 selected_video_obj.filename)
                if not os.path.isfile(download_fullpath):
                    selected_video_obj.download(download_fullpath, quiet=True)
        else:
            bestaudio = video.getbestaudio()
            download_fullpath = os.path.join(r'c:\data\temp\youtube_videos',
                                             bestaudio.filename)
            if not os.path.isfile(download_fullpath):
                bestaudio.download(download_fullpath, quiet=True)

    if choice == 5:
        """ Retrieve individual file. """
        url_target = 'https://www.youtube.com/results?search_query=ogt+cat'
        url = URL(url_target)
        dom_object = DOM(url.download(cached=True))
        w = dom_object('div[class="yt-lockup-content"] h3[class="yt-lockup-title"] a')
        ## ss.quick_set_symbol_and_param_type('S58.SI', 'analyst_opinion')
        ss.form_full_url()
        ## print
        ## print ss.get_list_of_param_selector_avaliable()
        ## print ss.full_url_str
        ## #ss.parse_company_desc()
        ## d = ss.parse_analyst_opinion()
        ## print ss.header_list, ss.value_list
        ss.parse_all_parameters()
        print ss.individual_stock_df
        ss.individual_stock_df.to_csv(r'c:\data\check.csv')

    if choice == 3:
        url_str = 'https://sg.finance.yahoo.com/q/ks?s=S24.SI'
        url = URL(url_str)
        dom_object = DOM(url.download(cached=True))

        # Get the year.
        w = dom_object('td[class="yfnc_tabledata1"]')
        w = dom_object('td[class="yfnc_tablehead1"]')  # Note: overwrites the line above.
        for n in range(len(w)):
            print n
            print w[n].content

    if choice == 2:
        yf = YFinanceDirectScrape()
        yf.create_dom_object()

        # The <b> tag inside the <td>, based on CSS selectors.
        for n in yf.tag_element_results(yf.dom_object, 'td[class="yfnc_modtitle1"] b'):
import pandas as pd
pd.core.common.is_list_like = pd.api.types.is_list_like
from pandas_datareader import data as pdr
import fix_yahoo_finance
from pattern.web import URL

tickers = pd.read_csv('static/wilshire5000.csv', delimiter=',')
tickers.head()

#pdr.get_data_yahoo(tickers['Ticker'][0], '2015-01-01', '2015-01-08')

for stock in tickers['Ticker']:
    webpage = "http://financials.morningstar.com/ajax/exportKR2CSV.html?t=%s&culture=en-CA&region=USA&order=asc&r=314562" % stock
    url = URL(webpage)
    f = open('%s_keyratios.csv' % stock, 'wb')
    f.write(url.download())
    f.close()
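# A follow-up sketch: read one of the files written above back into pandas.
# The ticker 'AAPL' and the skiprows value are assumptions here; Morningstar's
# export starts with a short preamble before the actual header row.
ratios = pd.read_csv('AAPL_keyratios.csv', skiprows=2)
print(ratios.head())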
from s3pipeline import S3Connector
import json
from pattern.web import URL

if __name__ == "__main__":
    okrMain = S3Connector()
    okrMain.accessBucket('sumit_okr')
    okrPdfBc = S3Connector()
    okrPdfBc.accessBucket('sumit_okr_pdf')

    for key in okrMain.bucket:
        if okrPdfBc.bucket.get_key(key):
            print 'key exists... skipping %s' % key
            continue
        else:
            print 'saving .pdf of %s' % key
            record = json.loads(okrMain.getStringContent(key))
            pdfUrl = record['pdfUrl']
            try:
                url = URL(pdfUrl)
                okrPdfBc.storeStringContent(key, url.download(cached=False))
            except:
                print '%s is not a valid URL' % pdfUrl
                okrPdfBc.storeStringContent(key, '<No pdf for this article>')
    def isPDF(self, param):
        url = URL(param)
        if "pdf" in extension(url.page):
            return 1
        else:
            return 0
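# A quick standalone check of the same logic with hypothetical example URLs;
# URL.page is the last path segment (e.g. "report.pdf") and extension()
# returns its suffix (e.g. ".pdf"):
from pattern.web import URL, extension

for u in ("http://example.com/files/report.pdf",
          "http://example.com/files/page.html"):
    print("pdf" in extension(URL(u).page))  # True, then False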
        print e
        print e.src
        print e.src.read()
        return {}, {}
    return bot_replies, user_replies


if __name__ == '__main__':
    import json
    from pattern.web import URL, Twitter

    # Tweet to post:
    tweet = "test tweet"

    url = URL("https://api.twitter.com/1.1/statuses/update.json",
              method="post",
              query={"status": tweet})
    twitter = Twitter(license=ccpattern)
    url = twitter._authenticate(url)

    try:
        # Send the post request.
        a = json.loads(url.open().read())
        reply_id = a["id"]
        print reply_id
    except Exception as e:
        print e
        print e.src
        print e.src.read()
def save_csv(f, tvseries):
    '''
    Output a CSV file containing the highest rated TV series.
    '''
    writer = csv.writer(f)
    writer.writerow(['Title', 'Runtime', 'Genre', 'Rating', 'Actors'])

    # WRITE THE TV-SERIES TO DISK
    movies = 10
    for i in range(movies):
        writer.writerow(tvseries[i][:])


if __name__ == '__main__':
    # Download the HTML file.
    url = URL(TARGET_URL)
    html = url.download()

    # Save a copy to disk in the current directory; this serves as a backup
    # of the original HTML and will be used in grading.
    with open(BACKUP_HTML, 'wb') as f:
        f.write(html)

    # Parse the HTML file into a DOM representation.
    dom = DOM(html)

    # Extract the tv series (using the function you implemented).
    tvseries = extract_tvseries(dom)

    # Write the CSV file to disk (including a header).
    with open(OUTPUT_CSV, 'wb') as output_file:
#!/usr/bin/env python
# Name: Zelda Zeegers
# Student number: 11397705
'''
This script scrapes IMDB and outputs a CSV file with the highest rated tv series.
'''

import csv
import re

from pattern.web import URL, DOM
import unicodedata

TARGET_URL = URL(
    "http://www.imdb.com/search/title?num_votes=5000,&sort=user_rating,desc&start=1&title_type=tv_series"
)
dom = DOM(TARGET_URL.download(cached=True))
BACKUP_HTML = 'tvseries.html'
OUTPUT_CSV = 'tvseries.csv'


def extract_tvseries(dom):
    '''
    Extract a list of highest rated TV series from DOM (of IMDB page).

    Each TV series entry should contain the following fields:
    - TV Title
    - Rating
    - Genres (comma separated if more than one)
    - Actors/actresses (comma separated if more than one)
    - Runtime (only a number!)
    '''