# --- Example 1 ---
    def __init__(self):
        """Authenticate against the Twitter API and connect a Kafka producer.

        Reads all credentials and broker coordinates from ``config.json``,
        builds a rate-limit-aware tweepy client, snapshots the current rate
        limits, and opens a producer on the configured Kafka broker.
        """
        config = ConfigReader("config.json")

        # OAuth 1.0a credentials for the Twitter API.
        oauth = OAuthHandler(config.get_key("CONSUMER_KEY"),
                             config.get_key("CONSUMER_SECRET"))
        oauth.set_access_token(config.get_key("ACCESS_TOKEN_KEY"),
                               config.get_key("ACCESS_TOKEN_SECRET"))

        # Let tweepy sleep through rate-limit windows instead of erroring.
        self.api = tweepy.API(
            auth_handler=oauth,
            wait_on_rate_limit=True,
            wait_on_rate_limit_notify=True,
        )
        self.rate_limits = self.tweep_rate_limits_to_dictionary(
            self.api.rate_limit_status())
        self.scrapers = []

        kafka_url = "{:s}:{:s}".format(config.get_key("KAFKA_HOST"),
                                       config.get_key("KAFKA_PORT"))
        try:
            kafka_client = SimpleClient(kafka_url, timeout=60)
        except KafkaUnavailableError as e:
            logging.error("Could not connect to Kafka2")
            raise e

        self.producer = SimpleProducer(kafka_client)
# --- Example 2 ---
# Load the AFINN-96 sentiment lexicon: a tab-separated file of
# "word<TAB>score" rows, parsed into {word: int(score)}.
try:
    # `f` instead of `file` — avoids shadowing the builtin.
    with open("data/AFINN-96.txt") as f:
        # Build and convert in one pass; a malformed row (wrong field
        # count or non-integer score) raises ValueError, as before.
        sentiment_data = {word: int(score)
                          for word, score in csv.reader(f, delimiter='\t')}

except IOError as e:
    logging.error("Could not open sentiment data file " + str(e.args))
    # raise SystemExit directly: `exit()` is only guaranteed to exist
    # when the `site` module is loaded; the effect is identical.
    raise SystemExit(1)
except ValueError:
    logging.error("Sentiment data file not valid")
    raise SystemExit(1)

# Wire the Spark Streaming job to its Kafka input topic, using settings
# from config.json.
config = ConfigReader("config.json")

# Zookeeper quorum ("host:port") used by the receiver-based Kafka stream.
zookeeper_url = "{:s}:{:s}".format(config.get_key("ZOOKEEPER_HOST"), config.get_key("ZOOKEEPER_PORT"))
kafka_topic = config.get_key("KAFKA_TOPIC")
output_topic = config.get_key("KAFKA_OUTPUT_TOPIC")

# One streaming context with 10-second micro-batches; Spark's own logging
# is reduced to errors only.
sc = SparkContext(appName="PythonTweetCleaner")
sc.setLogLevel("ERROR")
ssc = StreamingContext(sc, 10)

# NOTE(review): "startingOffsets" is a Structured Streaming option; the
# receiver-based KafkaUtils.createStream expects Kafka consumer properties
# (e.g. "auto.offset.reset"), so this entry is most likely ignored — confirm.
kafka_params = {"startingOffsets": "earliest"}

# Consume `kafka_topic` with one receiver thread under consumer group
# 'spark-streaming'.
kafkaStream = KafkaUtils.createStream(ssc, zookeeper_url, 'spark-streaming',
                                      {kafka_topic: 1}, kafka_params)

# output stream
try: