    def __init__(self, config, consumer_mode, to_producer=True):
        ''' Initialize a consumer based on the mode selected in the config '''
        self.config = config
        self.config_section = consumer_mode
        self.to_producer = to_producer
        config_params = self.get_config_items()
        try:
            self.kafka_hosts = config_params['kafka_hosts']
            self.in_topic = config_params['in_topic']
            self.out_topic = config_params['out_topic']
            self.group = config_params['in_group']
            self.zk_hosts = config_params['zookeeper_hosts']
        except KeyError:
            raise
        uf.print_out("Trying to make connection to topic {}".format(self.in_topic))
        self.client = KafkaClient(hosts=self.kafka_hosts)  # Create a client
        self.topic = self.client.topics[self.in_topic]  # Creates the topic if it does not exist
        self.consumer = self.topic.get_balanced_consumer(  # Zookeeper dynamically assigns partitions
            consumer_group=self.group,
            auto_commit_enable=True,
            zookeeper_connect=self.zk_hosts)
        uf.print_out("Made connection")
        if self.to_producer:  # Write into a downstream producer topic
            try:
                self.out_group = config_params['out_group']
                self.out_topic = self.client.topics[config_params['out_topic']]
            except KeyError:
                raise
        else:
            self.output = uf.mkdir_if_not_exist()  # Write to /tmp/exstreamly_cheap
        uf.print_out("Created output file or producer stage")
        self.partitions = set()
        self.msg_cnt = 0  # Number of messages consumed by this instance
        self.init_time = datetime.now()
        self.start_time = self.init_time
        self.url_queue = Queue(maxsize=0)  # Infinitely sized
        self.semaphore = BoundedSemaphore()
    def _flush_temp_file(self, output_dir, cached_fullpath):
        ''' Push the current temp file to HDFS and roll over to a fresh one.
            (The def line for this fragment was missing; the method name and
            parameters are assumed from how the body is written.)
        '''
        os.system('hdfs dfs -put {} {}'.format(self.temp_file_path, cached_fullpath))

        # uf.print_out('Removing temporary file - {}'.format(os.path.basename(self.temp_file_path)))
        # os.remove(self.temp_file_path)

        timestamp = time.strftime('%Y%m%d%H%M%S')
        self.temp_file_path = '{}/kafka_{}_{}_{}.dat'.format(output_dir, self.out_topic, self.group, timestamp)
        self.temp_file = open(self.temp_file_path, 'w')
    
    def get_config_items(self):
        ''' Retrieve the config settings for the section that applies to this
            type of instance (group, in_topic, out_topic, etc. where available).
        '''
        try:
            return dict(self.config.items(self.config_section))
        except configparser.NoSectionError:
            # NoSectionError builds its own "No section: ..." message from the section name
            raise configparser.NoSectionError(self.config_section)
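
    # For reference, the config section selected by consumer_mode is expected to
    # provide at least the keys read in __init__ above. The values shown here are
    # illustrative placeholders only, not taken from the real general.conf:
    #
    #   kafka_hosts     = localhost:9092
    #   zookeeper_hosts = localhost:2181
    #   in_topic        = raw_deals
    #   in_group        = raw_deals_consumers
    #   out_topic       = clean_deals        (only used when to_producer=True)
    #   out_group       = clean_deals_group  (only used when to_producer=True)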


if __name__ == '__main__':
    tmp_out_dir = '/home/ubuntu/exstreamly_cheap_all_deals/ingestion/kafka_messages'
    tmp_out_dir = uf.mkdir_if_not_exist(tmp_out_dir)
    uf.print_out('Output directory: {}'.format(tmp_out_dir))
    config = configparser.SafeConfigParser()
    config.read('../../config/general.conf')
    uf.print_out('\nConsuming messages...')
    cons = ConsumerToHDFS(config, settings.CONSUMER_MODE_DATA)
    cons.consume_topic(tmp_out_dir)
def fetch_and_clean_up(index_name):
    """ Fetch Elastic data and clean it up """
    # Logstash and HDFS general info
    output_dir = uf.mkdir_if_not_exist("/tmp/exstreamly_cheap_files/elasticsearch_cleanup")
    # logstash_file = os.path.join(output_dir, 'clean_deals.json')

    # HDFS Related data
    group = "deals_data_hdfs"
    topic_id = "elastic_deals_data"
    timestamp = time.strftime("%Y%m%d%H%M%S")
    hadoop_file = os.path.join(output_dir, "hdfs_{}.dat".format(timestamp))
    hadoop_path = "/exstreamly_cheap_main_files/all_deals/history"
    cached_path = "/exstreamly_cheap_main_files/all_deals/cached"
    hadoop_fullpath = "{}/{}_{}_{}.dat".format(hadoop_path, group, topic_id, timestamp)
    cached_fullpath = "{}/{}_{}_{}.dat".format(cached_path, group, topic_id, timestamp)

    uf.print_out("Writing the logs to {}, which will be pushed to HDFS and S3".format(hadoop_file))

    block_cnt = 0
    client = make_client()
    cc = Search(using=client, index=index_name)
    gen = cc.scan()
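    # Search.scan() streams every matching document via the Elasticsearch scroll
    # API, so the whole index can be iterated without manual pagination.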

    config = configparser.SafeConfigParser()
    config.read("../../config/general.conf")
    config_params = uf.get_config_items(config, settings.PRODUCER_CLEAN_ES_DATA)
    try:
        kafka_hosts = config_params["kafka_hosts"]
        topic = config_params["topic"]
        kafka_group = config_params["group"]  # Kafka group; kept distinct from the HDFS naming group above
        zk_hosts = config_params["zookeeper_hosts"]
    except KeyError:
        raise

    kafka_client = KafkaClient(hosts=kafka_hosts)
    kafka_topic = kafka_client.topics[topic]  # Create if not exist
    uf.print_out("Producing messages to topic {}. Press Ctrl-C to terminate".format(kafka_topic.name))

    # Produce to kafka for distributed consumption
    hdp_output = open(hadoop_file, "w")
    with kafka_topic.get_producer() as producer:
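        # Using the producer as a context manager ensures it is stopped (and any
        # pending messages flushed) when the block exits.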
        for event in gen:
            # The message field holds a dict-literal string; eval it back into a
            # dict before cleaning.
            record = dict(eval(event.message.encode("utf-8")))
            msg = clean_data(record)

            # We can decide to have logstash read from file instead
            #        with open(logstash_file, 'a') as log_output:
            #            log_output.write(json.dumps(msg) + '\n')
            # Write to producer.
            producer.produce(json.dumps(msg))

            # Back up to file for HDFS and S3
            hdp_output.write(json.dumps(msg) + "\n")
            if hdp_output.tell() > 100000000:
                hdp_output.close()

                uf.print_out("Block {}: Flushing 100MB file to HDFS => {}".format(block_cnt, hadoop_fullpath))

                # Place the blocked messages into the history and cached folders on HDFS
                os.system("hdfs dfs -put {} {}".format(hadoop_file, hadoop_fullpath))
                os.system("hdfs dfs -put {} {}".format(hadoop_file, cached_fullpath))

                # Back up in S3
                uf.print_out("Syncing {} to S3 for backup".format(output_dir))
                os.system("aws s3 sync {} s3://emmanuel-awa/clean_data_from_elastic".format(output_dir))

                # Roll over to a fresh local file and fresh HDFS target paths for the next block
                block_cnt += 1
                timestamp = time.strftime("%Y%m%d%H%M%S")
                hadoop_fullpath = "{}/{}_{}_{}.dat".format(hadoop_path, group, topic_id, timestamp)
                cached_fullpath = "{}/{}_{}_{}.dat".format(cached_path, group, topic_id, timestamp)
                hadoop_file = os.path.join(output_dir, "hdfs_{}.dat".format(timestamp))
                hdp_output = open(hadoop_file, "w")

            uf.print_out("Flushed {} block(s) so far. Current file size: {}KB".format(block_cnt, hdp_output.tell() / 1000))
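

# A minimal usage sketch for fetch_and_clean_up. It assumes make_client() and
# clean_data() are defined elsewhere in this module, as referenced above; the
# index name below is an illustrative placeholder:
#
#   fetch_and_clean_up('all_deals')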
def fetch_sqoot_data(base_url):
    ''' Fetch Sqoot Data and save relevant information to file '''
    files_location = uf.mkdir_if_not_exist() # Folder in /tmp/exstreamly_cheap_files
    merchants_file = os.path.join(files_location, 'merchants.json')
    products_file = os.path.join(files_location, 'products.json')
    events_file = os.path.join(files_location, 'activities_events.json')
    food_nitelife_file = os.path.join(files_location, 'dining_nitelife.json')
    categories_map = map_categories(base_url)
    
    mvp_categories = [u'product', u'dining-nightlife', u'activities-events']
    focus_grp = reduce_categories_scope(categories_map, 
                                        mvp_categories)
    start_time = datetime.datetime.now()
    end_time = start_time + datetime.timedelta(hours=7)
    all_deals = []
    queue = Queue.Queue()
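    # Keep crawling until the seven-hour window measured from start_time elapses,
    # pausing roughly 30 minutes between passes (see the wait at the bottom of the loop).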
    while start_time < end_time:
        try:
            # The API is inconsistent, so re-crawl each pass to pick up the newest deals;
            # duplicates will be batch-processed later in Spark.
            # Flatten the JSON, keep the online merchant ID in the deals file,
            # and save each merchant in the merchants file.
            # first_100_deals = get_request(base_url, 'deals', 'per_page=100;radius=10000')
            # all_deals = all_deals + first_100_deals.json()['deals']

            uf.print_out('Crawling first 100 pages')
            threads = []
            for num in xrange(1, 101):
                uf.print_out('.' * num)
                thread_ = threading.Thread(target=get_request,
                                           name='Thread{}'.format(num),
                                           args=[base_url, 'deals',
                                                 'page={};per_page=100;radius=10000'.format(num),
                                                 queue])
                thread_.start()
                threads.append(thread_)

            # Wait for every page request to finish before draining the queue
            for thread_ in threads:
                thread_.join()

            while not queue.empty():
                all_deals = all_deals + queue.get()
                
            for idx, deal in enumerate(all_deals):
                uf.print_out('Processing deal: {}'.format(idx))
                # If deal category belongs to mvp, save
                category = category_in_mvp(focus_grp, deal['deal']['category_slug'])
                if category:
                    output = OrderedDict()
                    output['id'] = deal['deal']['id']
                    output['category'] = category
                    output['sub_category'] = deal['deal']['category_slug']
                    output['title'] = deal['deal']['short_title']
                    output['description'] = deal['deal']['description']
                    output['fine_print'] = deal['deal']['fine_print']
                    output['number_sold'] = deal['deal']['number_sold']
                    output['url'] = deal['deal']['untracked_url']
                    output['price'] = deal['deal']['price']
                    output['discount_percentage'] = deal['deal']['discount_percentage']
                    output['provider_name'] = deal['deal']['provider_name']
                    output['online'] = deal['deal']['online']
                    output['expires_at'] = deal['deal']['expires_at']
                    output['created_at'] = deal['deal']['created_at']
                    output['updated_at'] = deal['deal']['updated_at']
                    output['merchant_id'] = deal['deal']['merchant']['id']
                    
                    # Write deal to file
                    with open(os.path.join(files_location, str(category) + '.json'), 'a') as f:
                        f.write(json.dumps(output))
                        f.write('\n')
                    
                    # Write merchant info file
                    merchant_info = deal['deal']['merchant']
                    if not all(merchant_info.values()):
                        merchant_info = clean_merchant_info(merchant_info)        
                    with open(os.path.join(files_location, 'merchants.json'), 'a') as f:
                        f.write(json.dumps(merchant_info))
                        f.write('\n')
            start_time = datetime.datetime.now()
            uf.print_out("Time left: {} minute(s)".format((end_time - start_time).seconds / 60))
            uf.print_out("Waiting 30 minutes before crawling again")
            uf.spinning_cursor(1800)
        # ConnectTimeout subclasses ConnectionError in requests, so catch it first
        except rq.exceptions.ConnectTimeout:
            uf.print_out("[ConnectTimeout] ==> Server connection timing out.")
        except rq.exceptions.ConnectionError:
            uf.print_out("[ConnectionError] ==> Issue with API server.")
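

# get_request() is defined elsewhere in this module. Based on how it is called
# above (base_url, endpoint, a ';'-separated params string, and an optional
# queue), a compatible implementation might look roughly like the sketch below.
# This is illustrative only; the real URL construction and any auth parameters
# live in the actual implementation:
#
#   def get_request(base_url, endpoint, params, queue=None):
#       ''' GET an endpoint of the deals API and push its deals onto the queue '''
#       url = '{}/{}?{}'.format(base_url, endpoint, params.replace(';', '&'))
#       response = rq.get(url)
#       if queue is not None:
#           queue.put(response.json()['deals'])
#       return response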