def format_tweets(data):
    '''Inspect a Twitter search result and load it into a DataFrame.'''
    p(data.keys())
    p(data['search_metadata'])
    p(data['statuses'][0].keys())
    tweet_fields = ['created_at', 'from_user', 'id', 'text']
    tweets = DataFrame(data['statuses'], columns=tweet_fields)
    p(tweets)
    p(tweets.ix[7])  # .ix is deprecated in newer pandas; use .loc/.iloc there
def main():
    '''HTML and Web APIs'''
    twitter_secret_path = study.ROOT_DIR + '/twitter_secret.json'
    with open(twitter_secret_path) as f:
        tw_secret = json.load(f)
    p(tw_secret)
    format_tweets(search_tweets(tw_secret))
    # search_and_post_status() formats its own results and returns None,
    # so it is not wrapped in format_tweets() here.
    search_and_post_status(tw_secret)
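# A hedged sketch of the twitter_secret.json layout assumed by main() above.
# Only 'api_key' and 'api_secret_key' are read by the functions below; the
# values shown are placeholders, not real credentials.
#
#   {
#       "api_key": "YOUR_CONSUMER_KEY",
#       "api_secret_key": "YOUR_CONSUMER_SECRET"
#   }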
def search_and_post_status(tw_secret):
    """OAuth 1.0a dance, tweet search and status update via the requests module."""
    CONSUMER_KEY = tw_secret['api_key']
    CONSUMER_SECRET = tw_secret['api_secret_key']

    # request token
    request_token_url = 'https://api.twitter.com/oauth/request_token'
    auth = OAuth1(CONSUMER_KEY, CONSUMER_SECRET, callback_uri=u'oob')
    res = requests.post(request_token_url, auth=auth)
    request_token = dict(urlparse.parse_qsl(res.text))
    p(request_token)

    # access token
    authorize_url = 'https://api.twitter.com/oauth/authorize'
    access_token_url = 'https://api.twitter.com/oauth/access_token'

    # Authorize
    print 'Auth link:'
    print '{0}?oauth_token={1}'.format(authorize_url, request_token['oauth_token'])
    print
    oauth_verifier = unicode(raw_input('What is the PIN? '))
    auth = OAuth1(CONSUMER_KEY, CONSUMER_SECRET,
                  request_token['oauth_token'],
                  request_token['oauth_token_secret'],
                  verifier=oauth_verifier)
    res = requests.post(access_token_url, auth=auth)
    access_token = dict(urlparse.parse_qsl(res.text))
    p(access_token)

    # search
    search_url = 'https://api.twitter.com/1.1/search/tweets.json'
    query = urllib.quote('python pandas')
    auth = OAuth1(CONSUMER_KEY, CONSUMER_SECRET,
                  access_token['oauth_token'],
                  access_token['oauth_token_secret'])
    res = requests.get(search_url + '?q=' + query, auth=auth)
    tweets = json.loads(res.text)
    format_tweets(tweets)

    # post status
    update_url = 'https://api.twitter.com/1.1/statuses/update.json'
    data = {
        'status': 'This status is posted by requests module.',
    }
    res = requests.post(update_url, data=data, auth=auth)
    p(res.text)
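# A minimal alternative sketch of the same three-legged OAuth flow using
# requests_oauthlib's OAuth1Session (not part of the original script, and it
# assumes that package is installed); it shows the same request-token /
# authorize / access-token sequence with less manual query-string parsing.
def search_with_oauth1_session(tw_secret):
    from requests_oauthlib import OAuth1Session

    oauth = OAuth1Session(tw_secret['api_key'],
                          client_secret=tw_secret['api_secret_key'],
                          callback_uri='oob')
    # step 1: request token
    request_token = oauth.fetch_request_token('https://api.twitter.com/oauth/request_token')
    # step 2: the user authorizes in the browser and enters the PIN
    print oauth.authorization_url('https://api.twitter.com/oauth/authorize')
    verifier = raw_input('What is the PIN? ')
    # step 3: exchange the request token for an access token
    oauth = OAuth1Session(tw_secret['api_key'],
                          client_secret=tw_secret['api_secret_key'],
                          resource_owner_key=request_token['oauth_token'],
                          resource_owner_secret=request_token['oauth_token_secret'],
                          verifier=verifier)
    oauth.fetch_access_token('https://api.twitter.com/oauth/access_token')
    # the session is now signed; the search endpoint can be called directly
    res = oauth.get('https://api.twitter.com/1.1/search/tweets.json',
                    params={'q': 'python pandas'})
    return res.json()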
def search_tweets(tw_secret):
    '''Search tweets using the twitter module.'''
    MY_TWITTER_CREDS = os.path.expanduser(study.ROOT_DIR + '/.my_app_credentials')
    CONSUMER_KEY = tw_secret['api_key']
    CONSUMER_SECRET = tw_secret['api_secret_key']
    if not os.path.exists(MY_TWITTER_CREDS):
        twitter.oauth_dance('My App Name', CONSUMER_KEY, CONSUMER_SECRET,
                            MY_TWITTER_CREDS)

    # oauth_token is the access token,
    # oauth_secret is the access token secret
    oauth_token, oauth_secret = twitter.read_token_file(MY_TWITTER_CREDS)
    p(oauth_token)
    p(oauth_secret)

    auth = twitter.OAuth(oauth_token, oauth_secret, CONSUMER_KEY, CONSUMER_SECRET)
    t = twitter.Twitter(auth=auth)
    q = urllib.quote('python pandas')
    p(q)
    return t.search.tweets(q=q)
def main():
    out_dir = os.path.dirname(__file__)

    ex1_path = study.DATA_DIR + '/ch06/ex1.csv'
    cat(ex1_path)
    df = pd.read_csv(ex1_path)
    p(df)
    p(pd.read_table(ex1_path, sep=','))

    p('header less---------------------')
    ex2_path = study.DATA_DIR + '/ch06/ex2.csv'
    cat(ex2_path)
    names = ['a', 'b', 'c', 'd', 'message']
    p(pd.read_csv(ex2_path, header=None))
    p(pd.read_csv(ex2_path, names=names))
    p(pd.read_csv(ex2_path, names=names, index_col='message'))

    p('hierarchy index---------------------')
    mindex_path = study.DATA_DIR + '/ch06/csv_mindex.csv'
    cat(mindex_path)
    p(pd.read_csv(mindex_path, index_col=['key1', 'key2']))

    p('separate by regex-------------')
    ex3_path = study.DATA_DIR + '/ch06/ex3.csv'
    cat(ex3_path)
    p(pd.read_csv(ex3_path, sep=r'\s+'))

    p('skip rows-----------')
    ex4_path = study.DATA_DIR + '/ch06/ex4.csv'
    cat(ex4_path)
    p(pd.read_csv(ex4_path, skiprows=[0, 2, 3]))

    p('N/A------------------')
    ex5_path = study.DATA_DIR + '/ch06/ex5.csv'
    cat(ex5_path)
    result = pd.read_csv(ex5_path)
    p(result)
    p(pd.isnull(result))
    result = pd.read_csv(ex5_path, na_values=['NULL', '12'])  # treat 12 as NA too
    p(result)

    p('N/A dict------------------')
    sentinels = {'message': ['foo', 'NA'], 'something': ['two']}
    p(sentinels)
    p(pd.read_csv(ex5_path, na_values=sentinels))

    p('6.1.1 read data chunk size---------------------')
    ex6_path = study.DATA_DIR + '/ch06/ex6.csv'
    p(pd.read_csv(ex6_path).count())
    p(pd.read_csv(ex6_path, nrows=5))
    chunker = pd.read_csv(ex6_path, chunksize=1000)
    p(chunker)
    tot = Series([])
    for piece in chunker:
        tot = tot.add(piece['key'].value_counts(), fill_value=0)
    tot = tot.order(ascending=False)  # Series.sort_values() in newer pandas
    p(tot[:10])

    p('6.1.2 write---------------------')
    data = pd.read_csv(ex5_path)
    p(data)
    ex5_out_path = out_dir + '/ex5_out.csv'
    data.to_csv(ex5_out_path)
    cat(ex5_path)
    data.to_csv(sys.stdout, index=False, header=False)
    print ''
    data.to_csv(sys.stdout, index=False, cols=list('abc'))  # columns= in newer pandas
    print ''

    p('Series--------------')
    tseries_out_path = out_dir + '/tseries_out.csv'
    dates = pd.date_range('1/1/2000', periods=7)
    ts = Series(np.arange(7), index=dates)
    ts.to_csv(tseries_out_path)
    cat(tseries_out_path)
    p(Series.from_csv(tseries_out_path, parse_dates=True))

    p('6.1.3 csv-------------------------')
    ex7_path = study.DATA_DIR + '/ch06/ex7.csv'
    cat(ex7_path)
    f = open(ex7_path)
    reader = csv.reader(f)
    for line in reader:
        print line
    lines = list(csv.reader(open(ex7_path)))
    header, values = lines[0], lines[1:]
    data_dict = {h: v for h, v in zip(header, zip(*values))}
    p(data_dict)

    my_data_out_path = out_dir + '/mydata.csv'
    with open(my_data_out_path, 'w') as fp:
        # my_dialect is assumed to be defined at module level
        # (see the sketch after this function)
        writer = csv.writer(fp, dialect=my_dialect)
        writer.writerow(('one', 'two', 'three'))
        writer.writerow(('1', '2', '3'))
        writer.writerow(('4', '5', '6'))
        writer.writerow(('7', '8', '9'))
    cat(my_data_out_path)

    p('6.1.4 JSON-------------------------')
    obj = """
    {"name": "Wes",
     "places_lived": ["United States", "Spain", "Germany"],
     "pet": null,
     "siblings": [{"name": "Scott", "age": 25, "pet": "Zuko"},
                  {"name": "Katie", "age": 33, "pet": "Cisco"}]
    }
    """
    result = json.loads(obj)
    p(result)
    asjson = json.dumps(result)
    p(asjson)
    siblings = DataFrame(result['siblings'], columns=['name', 'age'])
    p(siblings)

    p('6.1.4 XML/HTML Web Scraping-------------------------')
    url = ''  # 'http://finance.yahoo.com/q/op?s=AAPL+Options'
    if url != '':
        parsed = parse(urlopen(url))
        doc = parsed.getroot()
        p([lnk.get('href') for lnk in doc.findall('.//a')][-10:])
        tables = doc.findall('.//table')
        # parse_options_data is assumed to be defined at module level
        # (see the sketch after this function)
        p(parse_options_data(tables[9])[:5])
        p(parse_options_data(tables[13])[:5])

    p('6.1.5 Read XML-------------------------')
    xml_path = out_dir + '/Performance_MNR.xml'
    xml_content = """
    <INDICATOR>
      <INDICATOR_SEQ>373889</INDICATOR_SEQ>
      <PARENT_SEQ></PARENT_SEQ>
      <AGENCY_NAME>Metro-North Railroad</AGENCY_NAME>
      <INDICATOR_NAME>Escalator Availability</INDICATOR_NAME>
      <DESCRIPTION>Percent of the time that escalators are operational
      systemwide. The availability rate is based on physical observations
      performed the morning of regular business days only. This is a new
      indicator the agency began reporting in 2009.</DESCRIPTION>
      <PERIOD_YEAR>2011</PERIOD_YEAR>
      <PERIOD_MONTH>12</PERIOD_MONTH>
      <CATEGORY>Service Indicators</CATEGORY>
      <FREQUENCY>M</FREQUENCY>
      <DESIRED_CHANGE>U</DESIRED_CHANGE>
      <INDICATOR_UNIT>%</INDICATOR_UNIT>
      <DECIMAL_PLACES>1</DECIMAL_PLACES>
      <YTD_TARGET>97.00</YTD_TARGET>
      <YTD_ACTUAL></YTD_ACTUAL>
      <MONTHLY_TARGET>97.00</MONTHLY_TARGET>
      <MONTHLY_ACTUAL></MONTHLY_ACTUAL>
    </INDICATOR>
    """
    if not os.path.exists(xml_path):
        with open(xml_path, 'w') as f:
            f.write(xml_content)

    parsed = objectify.parse(open(xml_path))
    root = parsed.getroot()
    data = []
    skip_fields = ['PARENT_SEQ', 'INDICATOR_SEQ', 'DESIRED_SEQ', 'DECIMAL_PLACES']
    p(dir(root))
    for elt in root:  # the <INDICATOR> element(s)
        el_data = {}
        for child in elt.getchildren():
            if child.tag in skip_fields:
                continue
            el_data[child.tag] = child.pyval
        data.append(el_data)
    perf = DataFrame(data)
    p(perf)

    tag = '<a href="http://google.com">Google</a>'
    root = objectify.parse(StringIO.StringIO(tag)).getroot()
    p(root)
    p(root.get('href'))
    p(root.text)
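# Hedged sketches of two helpers referenced in main() above but not defined
# in this excerpt. They follow the pandas book's web-scraping and csv.Dialect
# examples; the exact definitions in the original project may differ.

class my_dialect(csv.Dialect):
    # custom CSV dialect used when writing mydata.csv above
    lineterminator = '\n'
    delimiter = ';'
    quotechar = '"'
    quoting = csv.QUOTE_MINIMAL


def _unpack(row, kind='td'):
    # pull the text out of every <td>/<th> cell of an lxml table row
    elts = row.findall('.//%s' % kind)
    return [val.text_content() for val in elts]


def parse_options_data(table):
    # turn one scraped <table> element into a DataFrame;
    # TextParser lives in pandas.io.parsers in the pandas version used here
    from pandas.io.parsers import TextParser
    rows = table.findall('.//tr')
    header = _unpack(rows[0], kind='th')
    data = [_unpack(r) for r in rows[1:]]
    return TextParser(data, names=header).get_chunk()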
def main(): """ Binary data format """ out_dir = os.path.dirname(__file__) ex1_path = study.DATA_DIR + '/ch06/ex1.csv' cat(ex1_path) frame = pd.read_csv(ex1_path) p(frame) out_pickle = out_dir + '/frame_pickle' # deprecated # frame.save(out_pickle) # pd.load(out_pickle) frame.to_pickle(out_pickle) p(pd.read_pickle(out_pickle)) p('6.2.1 Hierarchical Data Format(HDF)----------------') h5_path = out_dir + '/mydata.h5' store = pd.HDFStore(h5_path) store['obj1'] = frame store['obj_col1'] = frame['a'] p(store) p(store.obj1) p('6.2.2 Excel-------------------') xls_file = pd.ExcelFile(out_dir + '/data.xlsx') table = xls_file.parse('Sheet1') p(table)
def main(): """ Binary data format """ out_dir = os.path.dirname(__file__) query = """ CREATE TABLE test ( a VARCHAR(20) , b VARCHAR(20) , c REAL , INTEGER );""" con = sqlite3.connect(out_dir + '/ch06-sqlite.db') try: con.execute(query) data = [('Atlanta', 'Georgia', 1.25, 6), ('Tallahassee', 'Florida', 2.6, 3), ('Sacramento', 'California', 1.7, 5)] stmt = 'INSERT INTO test VALUES(?, ?, ?, ?)' con.executemany(stmt, data) con.commit() except sqlite3.OperationalError as e: print e.message print traceback.format_exc() finally: p('finally') cursor = con.execute('select * from test') rows = cursor.fetchall() p(rows) p(cursor.description) p(DataFrame(rows, columns=zip(*cursor.description)[0])) p(sql.read_sql('select a, b, c from test', con)) # deprecated # p(sql.read_frame('select a, b, c from test', con)) con.close() p('6.4.1 MongoDB------------------') con = pymongo.Connection('localhost', port=27017) tweets = con.db.twees columns = ['created_at', 'from_user', 'id', 'text'] date_combine = datetime.combine(date(2005, 7, 14), time(12, 30)) data = [ [date.today().isoformat(), 'a', 1, 'aa'], [str(date.today()), 'b', 2, 'bb'], [date.today().strftime('%Y-%m-%d %H:%M:%S'), 'c', 3, 'cc'], [datetime.now().strftime('%Y-%m-%d %H:%M:%S'), 'd', 4, 'dd'], [date_combine.strftime('%Y-%m-%d %H:%M:%S'), 'd', 4, 'dd'], ] for d in data: r = dict(zip(columns, d)) print r tweets.save(r) cursor = tweets.find({'from_user': '******'}) p(DataFrame(list(cursor), columns=columns))