Example #1
0
 def waitForDBChange(self, since=0):
     """Follow the continuous changes feed and report changes to the callback.

     A new ``ChangesStream`` is opened on ``self.db`` starting at the larger
     of ``since`` and ``self.getSeqNumber()``. For every row whose ``seq`` is
     newer than the stored sequence number, the stored number is advanced
     and, unless the row belongs to a ``_design`` document, the document is
     re-fetched and handed to ``self.addDoc`` (when not deleted) and
     ``self.changes_callback(obj_id, revision, deleted)`` is invoked.

     Returns ``None`` as soon as ``self.changes_callback`` is falsy. Any
     exception raised while consuming the stream is logged at info level
     and the outer loop then opens a fresh stream.
     """
     getLogger(self).debug("Watching for changes")
     while True:
         start_seq = max(self.getSeqNumber(), since)
         self.stream = ChangesStream(
             self.db, feed="continuous", since=start_seq, heartbeat=True)
         try:
             for row in self.stream:
                 if not self.changes_callback:
                     return
                 # Rows carrying a truthy 'last_seq' are skipped.
                 if row.get('last_seq', None):
                     continue
                 # Nothing to do for sequence numbers already recorded.
                 if row['seq'] <= self.getSeqNumber():
                     continue
                 self.setSeqNumber(row['seq'])
                 # Design documents advance the sequence but are not
                 # reported to the callback.
                 if row['id'].startswith('_design'):
                     continue
                 getLogger(self).debug("Changes from another instance")
                 deleted = bool(row.get('deleted', False))
                 revision = row.get("changes")[-1].get('rev')
                 obj_id = row.get('id')
                 if not deleted:
                     # update cache
                     self.addDoc(self.db.get(obj_id))
                 self.changes_callback(obj_id, revision, deleted)
         except Exception as e:
             logger = getLogger(self)
             logger.info("Some exception happened while waiting for changes")
             getLogger(self).info("  The exception was: %s" % e)
Example #2
0
def main(argv):
    """
    Main method.

    This method performs the following tasks:
    1. Parse command line arguments
    2. Retrieve credentials and connect to Cloudant and WebHDFS
    3. Connect to the Cloudant `_changes` feed for checkpointed document
       consumption
    4. Process each change individually.
    5. Upon exception throwing -- whether raised while processing a change
       or by the changes feed iterator itself -- print the traceback, store
       the latest checkpoint to local file and exit with status 1.

    ``argv`` is accepted but not read here; the option parser is invoked
    without explicit arguments.
    """

    #add options into the parser
    parser = configureOptions()
    (options, args) = parser.parse_args()
    checkRequiredArguments(options, parser)
    # Parenthesised form: prints the same under Python 2 and is also valid
    # Python 3 syntax.
    print(options)

    # configurations
    last_seq = options.last_seq

    #get credential
    # expanduser() resolves the home directory even when HOME is not set in
    # the environment, where os.environ['HOME'] would raise KeyError.
    perm_file = os.path.join(os.path.expanduser('~'), '.clou')
    creds = get_creds(perm_file)

    #connect to source database
    s = Server('https://%s:%s@%s' %
               (creds['cloudant_user'], creds['cloudant_pwd'], options.uri))
    db = s[options.dbname]

    #connect to target hdfs cluster
    hdfs = PyWebHdfsClient(host=options.hdfs_host,
                           port=options.hdfs_port,
                           user_name=creds['hdfs_user'])
    hdfs.make_dir(options.hdfs_path)

    #and here we consume the cloudant `_changes` feed
    counter = 0
    changestream = ChangesStream(db,
                                 include_docs=True,
                                 heartbeat=True,
                                 since=last_seq)
    # The try wraps the whole loop so that a failure inside the feed
    # iterator (not only inside processChange) still stores the checkpoint.
    try:
        for c in changestream:
            # Persist progress every 100 processed changes.
            if counter % 100 == 0:
                checkpoint(last_seq)
            seq = processChange(hdfs, c, options.hdfs_path)
            if seq:  # protect against the last line being blank
                last_seq = seq
                counter += 1
    except Exception:
        traceback.print_exc()
        checkpoint(last_seq)
        os._exit(1)

    # Feed ended normally: record the final position.
    checkpoint(last_seq)
Example #3
0
 def waitForDBChange(self, db_name, since=0, timeout=15000):
     """Long-poll ``db_name`` for changes and return them as change objects.

     The stream starts at the larger of ``since`` and the last sequence
     recorded for ``db_name``. Be warned: if the database already changed
     before this call, the call returns immediately with those changes.

     Every row newer than the recorded sequence advances it; rows for
     ``_design`` documents are otherwise ignored. Each remaining row is
     turned into an object via ``change_factory.create`` and collected.

     Returns the list of created change objects (possibly empty).
     """
     start_seq = max(self.getLastChangeSeq(db_name), since)
     database = self._getDb(db_name)
     collected = []
     with ChangesStream(database,
                        feed="longpoll",
                        since=start_seq,
                        timeout=timeout) as stream:
         for row in stream:
             if row['seq'] <= self.getLastChangeSeq(db_name):
                 continue
             self.setLastChangeSeq(db_name, row['seq'])
             if row['id'].startswith('_design'):
                 continue
             if row.get('deleted'):
                 # Fake doc used for deleted objects, which are not fetched.
                 # NOTE(review): '_deleted' holds the *string* 'False',
                 # which is truthy -- confirm this is what
                 # change_factory.create expects.
                 doc = {
                     'type': 'unknown',
                     '_deleted': 'False',
                     '_rev': [0]
                 }
             else:
                 doc = self.getDocument(db_name, row['id'])
             collected.append(change_factory.create(doc))
     if collected:
         getLogger(self).debug("Changes from another instance")
     return collected
Example #4
0
 def poll_once():
     # Generator closure: makes one pass over the changes stream, starting
     # at self._last_processed_seq, and yields each converted change with
     # its metadata populated.
     for row in ChangesStream(db=self._couch_db,
                              since=self._last_processed_seq,
                              include_docs=True,
                              **extra_args):
         converted = change_from_couch_row(
             row, document_store=self._document_store)
         populate_change_metadata(
             converted, SOURCE_COUCH, self._couch_db.dbname)
         yield converted
         # Recorded only once the consumer asks for the next change, i.e.
         # after it has finished with the one just yielded.
         self._last_processed_seq = row.get('seq')
Example #5
0
 def run_burst(self):
     """
     Run through the changes stream once, passing every truthy change to
     ``self.processor``. Use this for testing pillows.
     """
     stream = ChangesStream(
         db=self.couch_db,
         since=self.since,
         filter=self.couch_filter,
         include_docs=self.include_docs,
         **self.extra_args)
     for row in stream:
         # Falsy rows are skipped.
         if not row:
             continue
         self.processor(row)
Example #6
0
 def iter_changes(self, since, forever):
     """Yield converted changes from the couch changes feed.

     ``since`` is passed to the stream as its starting sequence. When
     ``forever`` is truthy the stream is opened with
     ``feed='continuous'``; otherwise no feed type is passed. Entries
     from ``self._extra_couch_view_params`` are applied on top and may
     override the feed type.
     """
     if forever:
         stream_options = {'feed': 'continuous'}
     else:
         stream_options = {}
     stream_options.update(self._extra_couch_view_params)
     for row in ChangesStream(db=self._couch_db,
                              heartbeat=True,
                              since=since,
                              filter=self._couch_filter,
                              include_docs=self._include_docs,
                              **stream_options):
         yield change_from_couch_row(
             row, document_store=self._document_store)
Example #7
0
    def waitForDBChange(self, since=0):
        """Listen to the continuous changes stream and process each change.

        The stream on ``self.db`` starts at the larger of ``since`` and
        ``self.getSeqNumber()``. Rows newer than the stored sequence number
        advance it; for rows that are not ``_design`` documents the document
        is re-fetched into the cache (unless deleted) and
        ``self.changes_callback(obj_id, revision, deleted)`` is called.

        Returns ``None`` when ``self.changes_callback`` is falsy. Returns
        ``False`` immediately if an exception occurs while listening; for
        ``ResourceNotFound``, ``self.no_workspace_callback()`` is called
        first.
        """

        # XXX: the enclosing ``while True`` shouldn't be necessary, since
        # the changes stream already keeps listening 'for ever'. A few
        # tests confirmed this, but with the current setup something may
        # have been missed. It works as it is, but it definitely needs
        # revision.

        getLogger(self).debug("Watching for changes")
        while True:
            start_seq = max(self.getSeqNumber(), since)
            self.stream = ChangesStream(
                self.db, feed="continuous", since=start_seq, heartbeat=True)
            try:
                for row in self.stream:
                    if not self.changes_callback:
                        return
                    # Rows carrying a truthy 'last_seq' are skipped.
                    if row.get('last_seq', None):
                        continue
                    # Ignore sequence numbers that were already recorded.
                    if row['seq'] <= self.getSeqNumber():
                        continue
                    self.setSeqNumber(row['seq'])
                    # Design documents advance the sequence but are not
                    # reported to the callback.
                    if row['id'].startswith('_design'):
                        continue
                    getLogger(self).debug("Changes from another instance")
                    deleted = bool(row.get('deleted', False))
                    revision = row.get("changes")[-1].get('rev')
                    obj_id = row.get('id')
                    if not deleted:
                        # update cache
                        self.addDoc(self.db.get(obj_id))
                    self.changes_callback(obj_id, revision, deleted)

            except ResourceNotFound:
                getLogger(self).info("The database couldn't be found")
                self.no_workspace_callback()
                return False

            except Exception as e:
                logger = getLogger(self)
                logger.info("Some exception happened while waiting for changes")
                getLogger(self).info("  The exception was: %s" % e)
                return False  # kill thread, it's failed... in reconnection
Example #8
0
 def waitForDBChange(self, db_name, since=0, timeout=15000):
     """Long-poll ``db_name`` and return the raw change rows that are new.

     The stream starts at the larger of ``since`` and the last sequence
     recorded for ``db_name``. Be warned: if the database already changed
     before this call, the call returns immediately with those changes.

     Rows whose ``seq`` is greater than the recorded sequence are
     collected; afterwards the recorded sequence is set to the highest
     ``seq`` among them (or left at its current value when none are new).

     Returns the list of collected rows (possibly empty).
     """
     start_seq = max(self.getLastChangeSeq(db_name), since)
     database = self.__getDb(db_name)
     with ChangesStream(database,
                        feed="longpoll",
                        since=start_seq,
                        timeout=timeout) as stream:
         fresh = [row for row in stream
                  if row['seq'] > self.getLastChangeSeq(db_name)]
         # Fold the collected rows down to the highest sequence seen,
         # seeded with the currently recorded one.
         newest = self.getLastChangeSeq(db_name)
         for row in fresh:
             newest = max(row['seq'], newest)
         self.setLastChangeSeq(db_name, newest)
     return fresh
Example #9
0
 def iter_changes(self, since, forever):
     """Yield converted changes, with metadata, from the couch changes feed.

     ``since`` is passed to the stream as its starting sequence and is
     stored in ``self._last_processed_seq`` up front. When ``forever`` is
     truthy the stream is opened with ``feed='continuous'``; entries from
     ``self._extra_couch_view_params`` are applied on top. After the
     consumer resumes from each yielded change,
     ``self._last_processed_seq`` is set to that row's ``seq`` (``None``
     when the row has none).
     """
     from corehq.apps.change_feed.data_sources import SOURCE_COUCH
     if forever:
         stream_options = {'feed': 'continuous'}
     else:
         stream_options = {}
     stream_options.update(self._extra_couch_view_params)
     self._last_processed_seq = since
     for row in ChangesStream(db=self._couch_db,
                              heartbeat=True,
                              since=since,
                              filter=self._couch_filter,
                              include_docs=True,
                              **stream_options):
         converted = change_from_couch_row(
             row, document_store=self._document_store)
         populate_change_metadata(
             converted, SOURCE_COUCH, self._couch_db.dbname)
         yield converted
         # Runs only once the consumer requests the next change.
         self._last_processed_seq = row.get('seq')
Example #10
0
def stream_changes(db, since, limit):
    """Yield a ``CouchChange`` for each row of a changes stream on ``db``.

    ``since`` and ``limit`` are passed straight through to the stream.
    Each yielded change carries the row's ``id``, the ``rev`` of the first
    entry in its ``changes`` list, its ``deleted`` flag (``False`` when
    absent) and its ``seq`` (``None`` when absent).
    """
    rows = ChangesStream(db=db, since=since, limit=limit)
    for row in rows:
        doc_id = row['id']
        first_rev = row['changes'][0]['rev']
        yield CouchChange(
            id=doc_id,
            rev=first_rev,
            deleted=row.get('deleted', False),
            seq=row.get('seq'))