def authenticate_tweepy():
    """Authenticate against the Twitter API and publish the client as the module-global `twitter`.

    Consumer and access credentials are fetched from the project's
    `credentials` helper; the resulting tweepy API client is stored in
    the module-level `twitter` variable for use by the rest of the module.
    """
    global twitter
    handler = tweepy.OAuthHandler(
        credentials.get_consumer_key(),
        credentials.get_consumer_secret(),
    )
    handler.set_access_token(
        credentials.get_access_token(),
        credentials.get_access_secret(),
    )
    twitter = tweepy.API(handler)
"""Start a local Spark context and wire AWS credentials into Hadoop's s3n connector.

Fix: `findspark.init()` was called before `findspark` was imported anywhere in
this script, which raises NameError; the import is now explicit and first.
Dead commented-out s3a configuration was removed (the s3a variant lives in the
companion script that sets PYSPARK_SUBMIT_ARGS).
"""
import configparser

import findspark

# Locate the Spark installation before pyspark can be imported.
findspark.init()

import pyspark
from pyspark.sql import SparkSession

import credentials
import util

config = configparser.ConfigParser()

# AWS credentials come from the project's credentials helper.
access_id = credentials.get_access_key()
access_key = credentials.get_access_secret()

sc = pyspark.SparkContext('local[*]')

# Configure the (legacy) native s3n filesystem for the eu-central-1 endpoint.
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem")
hadoop_conf.set("fs.s3n.awsAccessKeyId", access_id)
hadoop_conf.set("fs.s3n.awsSecretAccessKey", access_key)
hadoop_conf.set("fs.s3n.endpoint", "s3.eu-central-1.amazonaws.com")
"""Spark S3A smoke test: configure AWS credentials and read a JSON file from S3."""
import os

# The aws-java-sdk and hadoop-aws jars must be on the JVM classpath before
# the Spark context is created, hence the submit args are set first.
os.environ['PYSPARK_SUBMIT_ARGS'] = "--packages=com.amazonaws:aws-java-sdk:1.7.4,org.apache.hadoop:hadoop-aws:2.7.3 pyspark-shell"

import findspark

findspark.init()

import util
import credentials
import pyspark

AWS_ACCESS_KEY = credentials.get_access_key()
AWS_SECRET_KEY = credentials.get_access_secret()

# SparkConf.set() returns the conf itself, so sequential calls build the
# exact same configuration as one long chained expression. Both the
# new-style (fs.s3a.access.key) and old-style (fs.s3a.awsAccessKeyId)
# property names are set, mirroring the original configuration.
conf = pyspark.SparkConf()
conf.setAppName('test')
conf.setMaster('local[*]')
conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
conf.set('fs.s3a.access.key', AWS_ACCESS_KEY)
conf.set('fs.s3a.secret.key', AWS_SECRET_KEY)
conf.set("fs.s3a.awsAccessKeyId", AWS_ACCESS_KEY)
conf.set("fs.s3a.awsSecretAccessKey", AWS_SECRET_KEY)
conf.set('fs.s3a.endpoint', "s3.eu-central-1.amazonaws.com")
conf.set('com.amazonaws.services.s3a.enableV4', "true")
conf.set("fs.s3a.aws.credentials.provider", "org.apache.hadoop.fs.s3a.BasicAWSCredentialsProvider")

sc = pyspark.SparkContext(conf=conf)
s3File = sc.textFile("s3a://welcome12345/student.json")
} tweet_list.append(entry) print("...tweets fetched") utils.write_to_json(json_file_name, tweet_list) except tweepy.TweepError as e: raise HistoricTweetException(str(e)) if __name__ == "__main__": arg_keyword = "".join(sys.argv[1]) arg_num_of_tweets = int(sys.argv[2]) arg_json_file_name = sys.argv[3] # Get credentials try: consumer_key = credentials.get_consumer_key() consumer_secret = credentials.get_consumer_secret() access_token = credentials.get_access_token() access_token_secret = credentials.get_access_secret() except credentials.VaultException as error: raise HistoricTweetException("Vault Exception: " + str(error)) # Set Up Auth try: auth = OAuthHandler(consumer_key, consumer_secret) auth.set_access_token(access_token, access_token_secret) api = API(auth) except TweepError as err: raise TweepError("Authentication Failed: " + str(err)) _get_historic_tweets(api, arg_keyword, arg_json_file_name, arg_num_of_tweets)