Example #1
    def _prepare_word_index_wvspace(self, dim, initialize=False):
        v_dtype, o_dtype = self._make_dtype(dim)

        def J(x):
            return os.path.join(self.dirpath, x)

        def S(f):
            return os.stat(f).st_size

        v_path = J('vectors')
        o_path = J('occurrences')

        nvecs = noccurs = 0
        if not initialize:
            nvecs = S(v_path) / np.dtype(v_dtype).itemsize
            noccurs = S(o_path) / np.dtype(o_dtype).itemsize

        v_array = DiskArray(v_path,
                            shape=(int(nvecs), ),
                            dtype=v_dtype,
                            growby=self._growby,
                            log=self.log)
        o_array = DiskArray(o_path,
                            shape=(int(noccurs), ),
                            dtype=o_dtype,
                            growby=self._growby,
                            log=self.log)

        w_index = DiskDict(J('wordtoindex'))
        i_word = DiskDict(J('indextoword'))

        return v_array, o_array, w_index, i_word
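A minimal usage sketch of the two on-disk stores prepared above, assuming the diskarray and diskdict packages these examples rely on (import paths, file paths and values here are illustrative only):

import numpy as np
from diskarray import DiskArray   # assumed import path
from diskdict import DiskDict     # assumed import path

vectors = DiskArray('/tmp/wvspace/vectors', shape=(0, ), dtype=np.float32, growby=1000)
wordtoindex = DiskDict('/tmp/wvspace/wordtoindex')

vectors.append(np.float32(0.5))   # grow the flat on-disk array, as the converter below does
wordtoindex['apple'] = 0          # persist the word -> index mapping

vectors.flush()
wordtoindex.close()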
Example #2
    def _prepare_word_index_wvspace(self, dim, initialize=False, mode='r+'):
        def J(x): return os.path.join(self.dirpath, x)

        v_path = J('vectors')
        m_path = J('magnitudes')
        o_path = J('occurrences')

        # FIXME: Support taking memmap array from diskarray
        m_array = DiskArray(m_path, dtype='float32', mode=mode,
                            growby=self._growby, log=self.log)
        o_array = DiskArray(o_path, dtype='uint64', mode=mode,
                            growby=self._growby, log=self.log)

        if not initialize:
            v_array = DiskArray(v_path, dtype='float32', mode=mode,
                                growby=self._growby, log=self.log)
            vec_l = int(len(v_array)/dim)
            v_array = v_array[:].reshape(vec_l, dim)
            m_array = m_array[:]
            o_array = o_array[:]
        else:
            v_array = DiskArray(v_path, shape=(0, dim), dtype='float32', mode=mode,
                                growby=self._growby, log=self.log)

        wtoi = itow = None
        if not self.sharding:
            wtoi = DiskDict(J('wordtoindex'))
            itow = DiskDict(J('indextoword'))

        return v_array, o_array, m_array, wtoi, itow
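The reshape above relies on the flat on-disk buffer holding exactly nvecs * dim float32 values; the same arithmetic with plain NumPy (illustrative sizes only):

import numpy as np

dim = 4
flat = np.arange(12, dtype=np.float32)   # stands in for v_array[:], a flat float32 buffer
vec_l = int(len(flat) / dim)             # number of vectors, computed as above
vectors = flat.reshape(vec_l, dim)       # view the buffer as (nvecs, dim)
assert vectors.shape == (3, 4)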
Example #3
    def __init__(self,
                 in_dir,
                 outdir,
                 nvecs_per_shard=0,
                 shard_name="shard",
                 full_name="full"):

        self.in_dir = in_dir
        self.outdir = outdir
        self.nvecs_per_shard = nvecs_per_shard
        self.shard_name = shard_name

        self.do_sharding = bool(self.nvecs_per_shard)
        if self.do_sharding:
            self.full_fpath = self.J(self.outdir, full_name)
            os.makedirs(self.full_fpath)

            map_itow = self.J(self.full_fpath, "indextoword")
            self.itow = DiskDict(map_itow)

            map_wtoi = self.J(self.full_fpath, "wordtoindex")
            self.wtoi = DiskDict(map_wtoi)

            self.mags = DiskArray(
                self.J(self.full_fpath, "magnitudes"),
                shape=(0, ),
                dtype=np.float32,
                growby=1000000,
            )
            self.occurs = DiskArray(
                self.J(self.full_fpath, "occurrences"),
                shape=(0, ),
                dtype=np.uint64,
                growby=1000000,
            )
Example #4
    def __init__(
        self,
        auth_token=None,
        repos=None,
        status_path="/tmp/",
        targets=None,
        log=DUMMY_LOG,
    ):

        self.git = Github(auth_token)
        self.log = log
        self.repos = repos
        self.targets = targets
        self.store = None
        self.dd = DiskDict(status_path + "disk.dict")
        self._pool = ThreadPool()
Example #5
    def __init__(self):
        super(MineTriplet, self).__init__()

        self.inp_cluster_f = DD(self.args.manual_cluster_f)
        self.vspace = WordVecSpaceMem(self.args.wvspace_f)
        self.out_train_d = DA(self.args.hard_triplet_batch,
                              shape=(0, ),
                              dtype=self._get_dtype())
Example #6
    def __init__(self,
                 host,
                 port,
                 master,
                 data_dir,
                 logaggfs_dir,
                 log=utils.DUMMY):

        self.host = host
        self.port = port
        self.master = master

        # For storing state
        data_path = os.path.abspath(os.path.join(data_dir, "logagg-data"))
        self.data_path = utils.ensure_dir(data_path)

        self.log = log

        # For remembering the state of files
        self.state = DiskDict(self.data_path)

        # Initialize logaggfs paths
        self.logaggfs = self._init_logaggfs_paths(logaggfs_dir)

        # Log fpath to thread mapping
        self.log_reader_threads = {}
        # Handle name to formatter fn obj map
        self.formatters = {}
        self.queue = queue.Queue(maxsize=self.QUEUE_MAX_SIZE)

        # Add initial files i.e. serverstats to state
        if not self.state["fpaths"]:
            self.log.info("init_fpaths")
            self._init_fpaths()

        # register self to master
        self.register_to_master()

        # Create nsq_sender
        self.nsq_sender_logs, self.nsq_sender_heartbeat = self._init_nsq_sender()

        self._ensure_trackfiles_sync()
Example #7
    def _init_disk(self):
        def J(x): return os.path.join(self.dirpath, x)

        if not os.path.exists(self.dirpath):
            os.makedirs(self.dirpath)

        meta = DiskDict(J('meta'))
        meta['dim'] = self.dim

        return meta, self._prepare_word_index_wvspace(self.dim, initialize=True)
Example #8
    def __init__(
        self,
        cred_path=None,
        topic_name=None,
        query="",
        file_path=None,
        status_path="/tmp/",
        targets=None,
        log=DUMMY_LOG,
    ):

        self.log = log
        self.cred_path = cred_path
        self.query = query
        self.gmail = None
        self.topic = topic_name
        self.file_path = file_path
        self.dd = DiskDict(status_path + "disk.dict")
        self.targets = targets
        self._pool = ThreadPool()
Example #9
    def __init__(
        self,
        auth_token=None,
        connect_time=time.time(),
        dict_path="/tmp",
        file_path=None,
        targets=None,
        log=DUMMY_LOG,
    ):
        self.auth_token = auth_token
        self.slack = SlackClient(self.auth_token)
        self.connect_time = connect_time

        self.targets = targets
        self.file_path = file_path
        self.dd = DiskDict(dict_path + "/disk.dict")

        self.log = log
        self.username_cache = ExpiringDict(self.CACHE_LEN, self.CACHE_LIFE_TIME)
        self._pool = ThreadPool()
Example #10
    def __init__(self, data_dir, logaggfs_dir, master, log):

        # For storing state
        data_path = os.path.abspath(os.path.join(data_dir, 'logagg-data'))
        self.data_path = util.ensure_dir(data_path)

        # For log file that have been read
        archive_path = os.path.abspath(os.path.join(data_dir, 'logagg-archive'))
        self.archive_dir = util.ensure_dir(archive_path)

        self.master = master

        self.log = log

        # For remembering the state of files
        self.state = DiskDict(self.data_path)

        # Initialize logaggfs paths
        self.logaggfs = self._init_logaggfs_paths(logaggfs_dir)

        # Log fpath to thread mapping
        self.log_reader_threads = {}
        # Handle name to formatter fn obj map
        self.formatters = {}
        self.queue = queue.Queue(maxsize=self.QUEUE_MAX_SIZE)

        # Add initial files i.e. serverstats to state
        if not self.state['fpaths']:
            self.log.info('init_fpaths')
            self._init_fpaths()

        # Create nsq_sender
        self.log.info('init_nsq_sender')
        self._init_nsq_sender()
        #self.nsq_sender = util.DUMMY

        self._ensure_trackfiles_sync()
Example #11
def shelve2xlsx(opt):
    completed_tasks = DiskDict(opt.results_path)
    logger.info("found {} results".format(len(completed_tasks)))

    flattened_results = {}

    for named_id, result in completed_tasks.items():
        if result["rc"] == 0:
            test_stats = rename_flatten(result["details"]["test_stats"], "test_stats")
            dev_stats = rename_flatten(result["details"]["dev_stats"], "dev_stats")

            flattened_result = {
                **without_keys(result, ["details"]),
                **dev_stats,
                **test_stats,
            }
        else:
            flattened_result = {**without_keys(result, ["details"])}

        scalared_flattened_result = non_scalar_to_str(flattened_result)
        flattened_results[named_id] = scalared_flattened_result

    df = pd.DataFrame(data=flattened_results.values())
    df.to_excel(opt.results_path + ".xlsx")
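The helpers used above (rename_flatten, without_keys, non_scalar_to_str) are not shown in this listing; hypothetical stand-ins that match how they are called:

import numbers

def rename_flatten(stats, prefix):
    # e.g. {'acc': 0.9} with prefix 'test_stats' -> {'test_stats_acc': 0.9}
    return {'{}_{}'.format(prefix, k): v for k, v in stats.items()}

def without_keys(d, keys):
    # shallow copy of d with the given keys dropped
    return {k: v for k, v in d.items() if k not in keys}

def non_scalar_to_str(d):
    # stringify anything that is not a plain number or string so it fits in one spreadsheet cell
    return {k: v if isinstance(v, (numbers.Number, str)) else str(v) for k, v in d.items()}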
Example #12
class MineTriplet(BS):
    def __init__(self):
        super(MineTriplet, self).__init__()

        self.inp_cluster_f = DD(self.args.manual_cluster_f)
        self.vspace = WordVecSpaceMem(self.args.wvspace_f)
        self.out_train_d = DA(self.args.hard_triplet_batch,
                              shape=(0, ),
                              dtype=self._get_dtype())

    def _get_dtype(self):
        return [
            ('vec1', np.float32, 300),
            ('vec2', np.float32, 300),
            ('label', int, 1),  # np.int was removed in newer NumPy; builtin int is equivalent
        ]

    def run(self):
        batched_clusters = self.get_batched_clusters(self.args.batch_size)
        clusters_union = self.get_cluster_union(batched_clusters)
        distance_matrix = DistanceMatrix(clusters_union, self.vspace)

    def get_batched_clusters(self, batch_size):
        cluster_iter = 0
        positives = []

        for values in self.inp_cluster_f.values():
            if cluster_iter < batch_size:
                positives.append(values['positive'])
                cluster_iter += 1

        return positives

    def get_cluster_union(self, batched_clusters):
        clusters_union = set().union(*batched_clusters)

        return clusters_union

    def define_args(self, parser):
        parser.add_argument('manual_cluster_f', help='manual cluster file')
        parser.add_argument('wvspace_f', help='vector space file')
        parser.add_argument('--batch_size',
                            default=5,
                            type=int,
                            help='size to produce triplets')
        parser.add_argument('hard_triplet_batch',
                            help='batch of training triplets')
Example #13
def test():
    d = DiskArray(inp_f, dtype=[('vec', np.float32, 128)])
    mapping = DiskDict(dict_file)

    print('The given word is', mapping[str(index)])
    vectors = d['vec']
    vec = vectors[index].reshape(1, len(vectors[0]))
    vectors_t = vectors.T

    dists = np.dot(vec, vectors_t)
    k_near = np.argsort(dists)[0]

    words = []
    for i in k_near:
        words.append(mapping[str(i)])

    return words
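Assuming the stored vectors are unit-normalized (the converter below divides each vector by its magnitude), the dot products above are cosine similarities; a self-contained sketch of the same lookup on toy data, taking the most similar indices from the descending end of the sort:

import numpy as np

vectors = np.random.rand(10, 128).astype(np.float32)
vectors /= np.linalg.norm(vectors, axis=1, keepdims=True)   # unit-normalize (assumption)
mapping = {str(i): 'word%d' % i for i in range(10)}          # stands in for the DiskDict index -> word map

index = 3
dists = vectors @ vectors[index]        # cosine similarity against every vector
k_near = np.argsort(dists)[::-1][:5]    # argsort is ascending, so reverse for the most similar
words = [mapping[str(i)] for i in k_near]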
Example #14
 def load(self):
     self.cache = DiskDict(f'./generate/elmo.{self.dim}.cache')
Example #15
class SlackHistory(object):
    """
    FIXME: explain the responsibilities of this abstraction
    This module fetches all Slack history between the given timestamps
    """

    CACHE_LEN = 100
    CACHE_LIFE_TIME = 500
    CHUNK_SIZE = 1024

    def __init__(
        self,
        auth_token=None,
        connect_time=time.time(),
        dict_path="/tmp",
        file_path=None,
        targets=None,
        log=DUMMY_LOG,
    ):
        self.auth_token = auth_token
        self.slack = SlackClient(self.auth_token)
        self.connect_time = connect_time

        self.targets = targets
        self.file_path = file_path
        self.dd = DiskDict(dict_path + "/disk.dict")

        self.log = log
        self.username_cache = ExpiringDict(self.CACHE_LEN, self.CACHE_LIFE_TIME)
        self._pool = ThreadPool()

    def get_username(self, _id):
        """
        >>> from mock import Mock
        >>> ob = SlackHistory()
        >>> ob.slack.api_call = Mock(ob.slack.api_call, return_value={"user":{"name":"asdf"}})
        >>> ob.get_username(677)
        'asdf'
        """
        cache = self.username_cache

        if _id in cache:
            return cache[_id]

        user_name = self.slack.api_call("users.info", user=_id)
        cache[_id] = user_name["user"]["name"]

        return cache[_id]

    def get_permalink(self, channel_id, ts):
        """
        >>> from mock import Mock
        >>> ob = SlackHistory()
        >>> ob.slack.api_call = Mock(ob.slack.api_call,
        ...     return_value={"permalink":'https://justadummy.com'})
        >>> ob.get_permalink(677, 123545)
        'https://justadummy.com'
        >>> ob.slack.api_call = Mock(ob.slack.api_call,
        ...     return_value={"error":"No link found"})
        >>> ob.get_permalink(677, 123545)
        False
        """
        link = self.slack.api_call(
            "chat.getPermalink", channel=channel_id, message_ts=ts
        )
        if "permalink" in link:
            return link["permalink"]

        return False

    def get_file(self, url, filename):
        headers = {"Authorization": "Bearer " + self.auth_token}
        r = requests.get(url, headers=headers)
        with open(os.path.join(self.file_path, filename), "wb") as f:
            for chunk in r.iter_content(self.CHUNK_SIZE):
                if chunk:
                    f.write(chunk)

    def replace_text(self, text):
        """
        >>> from mock import Mock
        >>> ob = SlackHistory()
        >>> ob.get_username = Mock(ob.get_username, return_value="asdf")
        >>> ob.replace_text('<@677> hii')
        '@asdf hii'
        """
        self.log.debug("replacing text")
        text_list = text.split(" ")
        for counter in range(0, len(text_list)):
            if "<@" in text_list[counter]:
                user_id = text_list[counter].split("<@")[1].split(">")[0]
                if "|" in user_id:
                    user_id = user_id.split("|")[0]
                username = self.get_username(user_id)
                text_list[counter] = "@" + username

        return " ".join(text_list)

    def get_key(self, channel, timestamp):
        """
        >>> ob = SlackHistory()
        >>> ob.get_key("U8S2NSX6J", "1518582605.000239")
        '18a920561dec17877db2a4628c13734e2f63df1d'
        """
        return hashlib.sha1((channel + timestamp).encode("utf8")).hexdigest()

    def parse_dict(self, msg):
        """
        >>> from mock import Mock
        >>> ob = SlackHistory()
        >>> ob.get_permalink = Mock(ob.get_permalink, return_value="http://justadummy.com")
        >>> ob.replace_text = Mock(ob.replace_text, return_value="uploaded a file @asdf")
        >>> ob.get_username = Mock(ob.get_username, return_value="justaname")
        >>> ob.get_file = Mock()
        >>> ob.parse_dict(AttrDict({'channel': 'abcd', 'user': '******', 'text': 'uploaded a file <@123>', 'ts': '1518582605.000239'}))
        AttrDict({'permalink': 'http://justadummy.com', 'text': 'uploaded a file @asdf', 'ts': '1518582605.000239', 'user': '******', 'user_name': 'justaname', 'channel': 'abcd'})
        """
        p = self.get_permalink(msg.channel, msg.ts)
        if p:
            msg.permalink = p

        if "user" in msg:
            user_name = self.get_username(msg.user)
            msg.user_name = user_name

        if "text" in msg and "<@" in msg.text:
            msg.text = self.replace_text(msg.text)

        if "uploaded a file" in msg.text and self.file_path is not None:
            filename = str(msg.ts) + "_" + msg.file.name.replace(" ", "_")
            self.get_file(msg.file.url_private_download, filename)

        return msg

    def change_status(self, _id, ts):

        if _id + "_oldest" in self.dd.keys() and ts > self.dd[_id + "_oldest"]:
            self.dd[_id + "_latest"] = ts
        else:
            self.dd[_id + "_oldest"] = ts

    def _send_msgs_to_target(self, target, msg):
        while 1:
            try:
                msg["key"] = self.get_key(msg["channel"], msg["ts"])
                target.insert_msg(msg)
                self.change_status(msg["channel"], msg["ts"])
                break
            except (SystemExit, KeyboardInterrupt):
                raise
            except:
                self.log.exception("_send_msgs_to_target_failed", target=target)
                time.sleep(self.WAIT_TIME_TARGET_FAILURE)

    def _write_messages(self, msg):
        if self.targets:
            fn = self._send_msgs_to_target

            jobs = []
            for t in self.targets:
                jobs.append(self._pool.apply_async(fn, (t, deepcopy(msg))))

            for j in jobs:
                j.wait()

    def get_history(self, slack, _id, _name, end_ts, start_ts=0):
        """
        >>> from mock import Mock
        >>> ob = SlackHistory()
        >>> ob.slack.api_call = Mock(ob.slack.api_call)
        >>> ob.slack.api_call.side_effect= [{'messages' :[{'message': 'Dummy <@123>', 'ts': '123.234'}], 'has_more': True}, {'messages' :[{'message': 'Dummy <@123>', 'ts': '122.234'}], 'has_more': False}]
        >>> ob.parse_dict = Mock(ob.parse_dict, return_value={'message': 'Dummy @asdf', 'ts': '123.234'})
        >>> ob._write_messages = Mock()
        >>> ob.get_history('users.info', '1234', 'general', 12345)
        2
        """
        messages = []
        ts = end_ts
        num = 0
        while True:
            response = self.slack.api_call(
                slack, channel=_id, latest=ts, oldest=start_ts, count=1000
            )
            if "messages" not in response:
                return num

            messages.extend(response["messages"])
            messages = sorted(messages, key=itemgetter("ts"))
            for message in messages:
                msg = AttrDict(message)
                msg.channel = _id
                msg.channel_name = _name

                msg = self.parse_dict(msg)

                self._write_messages(dict(msg))

            num += len(messages)

            if response["has_more"]:
                ts = messages[-1]["ts"]
                messages = []
            else:
                return num

    def get_channel_status(self, _type, _id, name):

        if not _id + "_oldest" in self.dd.keys():
            last_ts = self.connect_time
        else:
            last_ts = self.dd[_id + "_oldest"]

        num = self.get_history(_type, _id, name, last_ts)

        if not _id + "_latest" in self.dd.keys():
            latest_ts = 0
        else:
            latest_ts = self.dd[_id + "_latest"]

        if latest_ts != self.connect_time:
            num = num + self.get_history(_type, _id, name, self.connect_time, latest_ts)

        self.log.info(
            "finished_inserting_channel_data",
            channel=_id,
            channel_name=name,
            num_msg=num,
            type="metric",
        )

        time.sleep(1)

    def get_public_channels_list(self):
        _type = "channels.history"
        public_channels = self.slack.api_call("channels.list")["channels"]
        for channel in public_channels:
            self.log.info("fetching_public_channel_message", channel=channel["name"])
            self.get_channel_status(_type, channel["id"], channel["name"])

    def get_private_channels_list(self):
        _type = "groups.history"
        private_channels = self.slack.api_call("groups.list")["groups"]
        for channel in private_channels:
            self.log.info("fetching_private_channel_message", channel=channel["name"])

            self.get_channel_status(_type, channel["id"], channel["name"])

    def get_direct_channels_list(self):
        _type = "im.history"
        direct_channels = self.slack.api_call("im.list")["ims"]
        for channel in direct_channels:
            self.log.info("fetching_direct_channel_message", channel=channel["user"])
            self.get_channel_status(_type, channel["id"], channel["user"])

    def start(self):
        self.get_public_channels_list()
        self.get_private_channels_list()
        self.get_direct_channels_list()
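The deduplication key used by get_key above is simply a SHA-1 digest of the channel id concatenated with the message timestamp; reproduced standalone:

import hashlib

def slack_msg_key(channel, timestamp):
    # same construction as SlackHistory.get_key
    return hashlib.sha1((channel + timestamp).encode("utf8")).hexdigest()

print(slack_msg_key("U8S2NSX6J", "1518582605.000239"))   # 40-character hex digest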
Example #16
class GithubHistory(object):

    """
    This is the main class you instantiate to access the Github API v3 and store all msgs in the db.

    """

    def __init__(
        self,
        auth_token=None,
        repos=None,
        status_path="/tmp/",
        targets=None,
        log=DUMMY_LOG,
    ):

        self.git = Github(auth_token)
        self.log = log
        self.repos = repos
        self.targets = targets
        self.store = None
        self.dd = DiskDict(status_path + "disk.dict")
        self._pool = ThreadPool()

    def get_repo_obj(self, repo_fullname):
        """
        :params repo_fullname : string
        :rtype: :class:`github.Repository.Repository`

        >>> obj = GithubHistory()
        >>> obj.get_repo_obj('org/reponame')
        Repository(full_name=None)

        """
        self.log.debug("fun : get repo obj")

        return self.git.get_repo(str(repo_fullname))

    def get_repos_list(self):
        """
        :rtype: :class:`github.PaginatedList.PaginatedList` of :class:`github.Repository.Repository`
        or
        :rtype: [class:`github.Repository.Repository`,..]

        >>> from mock import Mock
        >>> obj = GithubHistory()
        >>> m=Mock(obj.repos)
        >>> obj.repos=m.return_value='org/repo1,org/repo2'
        >>> obj.get_repos_list()
        [Repository(full_name=None), Repository(full_name=None)]

        """
        self.log.debug("fun : get repos list")

        if self.repos:
            return [self.get_repo_obj(repo) for repo in self.repos.split(",")]

        return self.git.get_user().get_repos()

    def get_raw_data(self, obj):
        """
        Gets raw json from the obj

        :param obj: class github
        :rtype: dict

        >>> obj = GithubHistory()
        >>> class test(object):
        ...    __dict__ = {'_rawData':{'id':'123456'}}
        ...
        >>> obj.get_raw_data(test())
        {'id': '123456'}

        """
        self.log.debug("fun : get raw data")

        for key, value in obj.__dict__.items():
            if key == "_rawData":
                return value

    def merge_dict(self, *args):
        """
        :params args: dict
        :rtype: dict

        >>> obj = GithubHistory()
        >>> obj.merge_dict({'a':1,'b':2},{'c':3,'d':4})
        {'a': 1, 'c': 3, 'b': 2, 'd': 4}

        """
        self.log.debug("fun : chain dict or merge dict")

        return dict(ChainMap(*args))

    def get_key(self, record):
        """
        :params record: dict
        :rtype: dict

        >>> obj = GithubHistory()
        >>> obj.get_key({'repository':{'updated_at':'21-04-14'},'issue':{'updated_at':'21-04-14'},'comment':{'updated_at':'21-04-14'}})
        {'id': '1c4882d4c922bcfdc070de97de03706c9276f8eb'}
        >>> obj.get_key({'repository':{'updated_at':'21-04-14'},'issue':{},'comment':{}})
        {'id': '8acfc9c43a5c9f64ee2070007591811f4048c907'}

        """
        self.log.debug("fun : get hash key")

        key = "%s%s%s" % (
            record.get("repository", {}).get("updated_at", 0),
            record.get("issue", {}).get("updated_at", 0),
            record.get("comment", {}).get("updated_at", 0),
        )

        # TODO: Need to add repo id
        return {"id": hashlib.sha1(key).hexdigest()}

    def send_msgs_to_target(self, target, msg):
        """
        :param target: db obj
        :param msg: dict

        """
        self.log.debug("send msgs to tatgets")

        target.insert_msg(msg)

    def write_message(self, msg):
        """
        :param msg: dict

        """
        self.log.debug("write msgs in db")

        if self.targets:
            fn = self.send_msgs_to_target

            jobs = []
            for t in self.targets:
                jobs.append(self._pool.apply_async(fn, (t, deepcopy(msg))))

            for j in jobs:
                j.wait()

    def store_record(self, repo, issue=None, comment=None):
        """
        :param repo:    class 'github.Repository.Repository'
        :param issue:   class 'github.Issue.Issue'
        :param comment: class 'github.IssueComment.IssueComment'

        >>> obj = GithubHistory()
        >>> class repo(object):
        ...      __dict__ = { '_rawData' : { 'id' : 1234 }}
        ...      class owner(object):
        ...          type = 'user'
        ...
        >>> class issue(object):
        ...      __dict__ = {'_rawData':{'id':5678}}
        ...
        >>> class comment(object):
        ...      __dict__ = {'_rawData':{'id':91011}}
        ...
        >>> obj.store_record(repo(), issue(), comment())
        {'comment': {'id': 91011}, 'issue': {'id': 5678}, 'id': '8aefb06c426e07a0a671a1e2488b4858d694a730', 'repository': {'id': 1234}}

        """
        self.log.debug("fun : store record")

        iss = cmnt = {}
        rp = self.get_repo_dict(repo)

        if issue:
            iss = self.get_issue_dict(issue)

        if issue and comment:
            cmnt = self.get_comment_dict(comment)

        record = self.merge_dict(rp, iss, cmnt)
        record.update(self.get_key(record))

        self.write_message(record)
        return record

    def get_repo_dict(self, repo):
        """
        :param repo: class 'github.Repository.Repository'
        :rtype: dict

        >>> from mock import Mock
        >>> class repo(object):
        ...     __dict__ = {'_rawData':{'id':'12345', 'name':'abcd'}}
        ...     class owner(object):
        ...         type = 'Organization'
        ...         login = '******'
        ...
        >>> class org(object):
        ...     __dict__ = {'_rawData':{'id':'12345'}}
        ...
        >>> obj=GithubHistory()
        >>> obj.git.get_organization = Mock(obj.git.get_organization,return_value=org())
        >>> obj.get_repo_dict(repo())
        {'organization': {'id': '12345'}, 'repository': {'id': '12345', 'name': 'abcd'}}

        """
        self.log.debug("fun : get repo dict")

        org_dict = {}

        if repo.owner.type == "Organization":
            org = self.git.get_organization(str(repo.owner.login))
            org_dict = {"organization": self.get_raw_data(org)}

        repo_dict = {"repository": self.get_raw_data(repo)}
        return self.merge_dict(org_dict, repo_dict)

    def get_issue_dict(self, issue):
        """
        :param issue: class 'github.Issue.Issue'
        :rtype: dict

        >>> from mock import Mock
        >>> obj = GithubHistory()
        >>> class issue(object):
        ...      __dict__ = {'_rawData':{'id':123456}}
        ...
        >>> obj.get_issue_dict(issue())
        {'issue': {'id': 123456}}

        """
        self.log.debug("fun : get issue dict")

        return {"issue": self.get_raw_data(issue)}

    def get_time(self, _time):
        """
        :param _time: string
        :rtype: datetime.datetime

        >>> obj = GithubHistory()
        >>> obj.get_time('2018-02-15T09:17:49Z')
        datetime.datetime(2018, 2, 15, 9, 17, 50)

        """
        self.log.debug("fun : get api time format")

        return datetime.strptime(_time, "%Y-%m-%dT%H:%M:%SZ") + timedelta(seconds=1)

    def get_comment_dict(self, cmnt):
        """
        :param cmnt:class 'github.IssueComment.IssueComment'
        :rtype: dict

        >>> from mock import Mock
        >>> obj = GithubHistory()
        >>> class comment(object):
        ...      __dict__ = {'_rawData':{'id':123456}}
        ...
        >>> obj.get_comment_dict(comment())
        {'comment': {'id': 123456}}

        """
        self.log.debug("fun : get comment dict")

        return {"comment": self.get_raw_data(cmnt)}

    def check_rate_limit(self):
        """
        Checks the number of API calls remaining before making further requests;
        if fewer than 100 calls remain, it waits until the rate limit resets.

        """
        self.log.debug("fun :check api rate limit")

        remaining, total = self.git.rate_limiting

        if remaining > 1 and remaining < 100:
            expiry = self.git.rate_limiting_resettime
            delay = (expiry - time.time()) + 60
            self.log.info("waiting for " + str(delay) + " sec")
            time.sleep(delay)

    def get_comments(self, repo, issue, changes=None):
        """
        Get comments related to the issue

        :param repo: class 'github.Repository.Repository'
        :param issue: class 'github.Issue.Issue'
        :param changes: string, eg: '2018-02-15T09:17:49Z'

        """
        self.log.debug("fun : get comments")

        self.check_rate_limit()

        # converting issue obj to raw dict as iss_dict
        iss_dict = self.get_raw_data(issue)

        # get the issue creation time as a datetime (last_time)
        last_time = self.get_time(iss_dict["created_at"])

        # in case there are changes in the issue, replace last_time
        if changes:
            last_time = self.get_time(changes)

        # get and store the comments since the issue was created (or since the last
        # comment was updated, when there are new comments)
        for comment in issue.get_comments(since=last_time):
            self.store_record(repo, issue, comment)

        self.store_record(repo, issue)

    def get_issues(self, repo):
        """
        Get issues related to the input Repository

        :param repo: class 'github.Repository.Repository'

        """
        self.log.debug("fun : get issues")

        self.check_rate_limit()
        for issue in repo.get_issues():

            # getting issue dict from issue obj as iss
            iss = self.get_issue_dict(issue)

            # passing the issue dict and checking the db for issue-related records and changes;
            # returns (count as 0 or 1), (changes as 0 or a time like '2018-02-15T09:17:49Z')
            count, changes = self.store.check_issue_in_db(iss)

            # if no records or changes related to the issue are found in the db, get all the comments
            if count == 0:
                self.get_comments(repo, issue)
                continue

            # if records are present in the db but the current issue's updated time does not match
            # the last comment's updated time in the db, fetch the msgs since that time.
            if changes:
                self.get_comments(repo, issue, changes)

    def get_history(self):
        """
        Get user's account repos and from that iterate over issues and comments.
        get_history
           -> repos -> issues -> comments

        """
        self.log.debug("fun : get history")

        for repo in self.get_repos_list():

            # if repo doesn't contain any issues just store the record
            if repo.open_issues == 0:
                self.store_record(repo)
                continue

            # get the issues related to the repo
            self.get_issues(repo)

            # store the status in disk dict
            self.dd["repository"] = repo.full_name

    def start(self):
        self.log.debug("fun : start")

        # create db obj at 0th index from target
        self.store = self.targets[0]

        if "repository" not in self.dd.keys():
            self.get_history()

        # recheck for new messages
        self.get_history()

        self.log.info("Messages stored successfully")
Example #17
class GW2VectoWordVecSpaceFile(object):
    """
    Abstraction that helps in converting word vector space data
    (vectors and vocabulary) from Google Word2Vec format to
    WordVecSpaceFile format.
    """
    def __init__(self,
                 in_dir,
                 outdir,
                 nvecs_per_shard=0,
                 shard_name="shard",
                 full_name="full"):

        self.in_dir = in_dir
        self.outdir = outdir
        self.nvecs_per_shard = nvecs_per_shard
        self.shard_name = shard_name

        self.do_sharding = bool(self.nvecs_per_shard)
        if self.do_sharding:
            self.full_fpath = self.J(self.outdir, full_name)
            os.makedirs(self.full_fpath)

            map_itow = self.J(self.full_fpath, "indextoword")
            self.itow = DiskDict(map_itow)

            map_wtoi = self.J(self.full_fpath, "wordtoindex")
            self.wtoi = DiskDict(map_wtoi)

            self.mags = DiskArray(
                self.J(self.full_fpath, "magnitudes"),
                shape=(0, ),
                dtype=np.float32,
                growby=1000000,
            )
            self.occurs = DiskArray(
                self.J(self.full_fpath, "occurrences"),
                shape=(0, ),
                dtype=np.uint64,
                growby=1000000,
            )

    def J(self, p1, p2):
        return os.path.join(p1, p2)

    def _iter_vecs(self, vfile, vocabfile):
        for word, vec in vfile.iter_vectors():
            vec = np.frombuffer(vec, dtype="float32")
            mag = np.linalg.norm(vec)
            vec = vec / mag
            _line = vocabfile.readline().split(" ")

            word, occur = _line[0], int(_line[1])
            yield vec, word, mag, occur

    def _build_writer(self, vidx, dim):
        if self.do_sharding:
            shard_num = int(vidx / self.nvecs_per_shard)
            shard_name = "{}{}".format(self.shard_name, shard_num)
            fpath = self.J(self.outdir, shard_name)
            return GWVecBinWriter(fpath, dim, sharding=True)
        else:
            return GWVecBinWriter(self.outdir, dim)

    def _create_manifest(
        self,
        out_fpath,
        nvecs,
        dim,
        N,
        t_occur,
        in_fpath,
        m_info={},
        full=False,
        num_vecs=None,
        nvps=None,
    ):
        if full:
            mfc = dict(
                num_shards=N,
                num_vectors=nvecs,
                dimension=dim,
                num_words=t_occur,
                dt_creation=datetime.utcnow().isoformat(),
                input_path=in_fpath,
                manifest_info=m_info,
                num_vecs_per_shard=self.nvecs_per_shard,
            )
        else:
            mfc = dict(
                num_shards=N,
                num_vecs_in_shard=nvecs,
                num_vecs=num_vecs,
                num_vecs_per_shard=nvps,
                dimension=dim,
                num_words=t_occur,
                dt_creation=datetime.utcnow().isoformat(),
                input_path=in_fpath,
                manifest_info=m_info,
            )

        with open(self.J(out_fpath, "manifest.json"), "w") as fp:
            fp.write(json.dumps(mfc))

    def _find_manifest_info(self, fpath):
        m_file = self.J(fpath, "manifest.json")
        c = {}
        if os.path.isfile(m_file):
            with open(m_file, "r") as fp:
                c = json.loads(fp.read())
        return c

    def start(self):
        inp_vec_f = open(self.J(self.in_dir, "vectors.bin"), "rb")
        inp_vecs = GWVecBinReader(inp_vec_f)
        dim = inp_vecs.dim
        nvecs = inp_vecs.nvecs

        vocab_file = open(self.J(self.in_dir, "vocab.txt"),
                          "r",
                          encoding="utf-8",
                          errors="ignore")
        m_info = self._find_manifest_info(self.in_dir)

        w = None
        vecs = self._iter_vecs(inp_vecs, vocab_file)
        N = self.nvecs_per_shard
        if N:
            num_shards = math.ceil(nvecs / N)
        else:
            num_shards = 1

        t_occur = 0
        count = -1
        for index, (vec, word, mag, occur) in enumerate(vecs):
            if self.do_sharding and index % N == 0:
                if w:
                    count += 1
                    t_occur += s_occur
                    self._create_manifest(
                        w.outdir,
                        (index - (count * N)),
                        dim,
                        num_shards,
                        s_occur,
                        self.in_dir,
                        m_info,
                        num_vecs=nvecs,
                        nvps=N,
                    )
                    w.close()
                    w = None

            if not w:
                s_occur = 0
                w = self._build_writer(index, dim)

            if self.do_sharding:
                self.wtoi[word] = index
                self.itow[index] = word

                self.mags.append(mag)
                self.occurs.append(occur)

                w.write(vec=vec, mag=mag, occur=occur)

            else:
                w.write(vec=vec, mag=mag, word=word, index=index, occur=occur)

            s_occur += occur

        if w:
            w.close()
            count += 1
            t_occur += s_occur
            self._create_manifest(
                w.outdir,
                (index - (count * N)),
                dim,
                num_shards,
                s_occur,
                self.in_dir,
                m_info,
                num_vecs=nvecs,
                nvps=N,
            )

        if self.do_sharding:
            self.wtoi.close()
            self.itow.close()

            self.mags.flush()
            self.mags.close()

            self.occurs.flush()
            self.occurs.close()

            self._create_manifest(
                self.full_fpath,
                nvecs,
                dim,
                num_shards,
                t_occur,
                self.in_dir,
                m_info,
                full=True,
            )
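A quick sanity check of the sharding arithmetic used above (_build_writer maps a vector index to a shard with int(vidx / nvecs_per_shard), and start() computes the shard count with math.ceil); numbers are illustrative only:

import math

nvecs, nvecs_per_shard = 1050000, 100000

num_shards = math.ceil(nvecs / nvecs_per_shard)    # 11 shards, the last one only partially filled
shard_of = lambda vidx: int(vidx / nvecs_per_shard)

assert num_shards == 11
assert shard_of(0) == 0 and shard_of(100000) == 1 and shard_of(1049999) == 10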
Example #18
class GmailHistory(object):
    """
    This is the main class you instantiate to access the Gmail API to get the messages from user's mailbox.
    :ref : https://developers.google.com/gmail/api/v1/reference/

    """

    MAX_RESULTS = 500  # gmail api max results
    LABELIDS = ["INBOX"]  # labels for which pub/sub updates are to be pushed
    GMAIL_CREATED_TS = "2004/01/01"  # year in which Gmail was introduced
    GMAIL_WATCH_DELAY = 86400  # time in sec between gmail api watch() requests
    SCOPES = "https://www.googleapis.com/auth/gmail.readonly"  # permission scope for the gmail api

    def __init__(
        self,
        cred_path=None,
        topic_name=None,
        query="",
        file_path=None,
        status_path="/tmp/",
        targets=None,
        log=DUMMY_LOG,
    ):

        self.log = log
        self.cred_path = cred_path
        self.query = query
        self.gmail = None
        self.topic = topic_name
        self.file_path = file_path
        self.dd = DiskDict(status_path + "disk.dict")
        self.targets = targets
        self._pool = ThreadPool()

    def authorize(self):
        """

        Gets valid user credentials from the user-specified cred path.
        If nothing has been stored, or if the stored credentials are invalid,
        the OAuth2 flow is run to obtain new credentials; this can raise an
        authentication-failed error or a file-not-found error.

        client_secret.json : This is the name of the secret file you download from
                             https://console.developers.google.com/iam-admin/projects

        credentials.json : This is the file that will be created when user has authenticated and
                           will mean you don't have to re-authenticate each time you connect to the API
        """
        self.log.debug("authorize")

        store = file.Storage("{}credentials.json".format(self.cred_path))
        creds = store.get()

        if not creds or creds.invalid:
            flow = client.flow_from_clientsecrets(
                "{}client_secret.json".format(self.cred_path), self.SCOPES)
            creds = tools.run_flow(flow, store)

        # build return gmail service object on authentication
        self.gmail = build("gmail",
                           "v1",
                           http=creds.authorize(Http()),
                           cache_discovery=False)

        return self.gmail

    def save_files(self, message):
        """
        This function stores gmail attachments from the given message.

        :calls : GET https://www.googleapis.com/gmail/v1/users/userId/messages/messageId/attachments/id
        :param message : dict

        """
        self.log.debug("save_file")

        for part in message["payload"].get("parts", ""):

            if not part["filename"]:
                continue

            file_id = part["body"]["attachmentId"]
            file_dic = (self.gmail.users().messages().attachments().get(
                userId="me", messageId=message["id"], id=file_id).execute())

            file_data = base64.urlsafe_b64decode(
                file_dic["data"].encode("UTF-8"))
            path = "".join([self.file_path, part["filename"]])

            with open(path, "w") as file_obj:
                file_obj.write(file_data)

            self.log.info("attachment saved to", path=path)

    def set_tmp_ts_to_last_msg(self):
        """

        This function resets last_msg_ts back to tmp_ts

        """
        self.log.debug("set_tmp_ts_to_last_msg")

        self.dd["last_msg_ts"] = self.dd["tmp_ts"]
        self.dd.close()

    def renew_mailbox_watch(self):
        """Renewing mailbox watch

        You must re-call watch() at least every 7 days or else you will stop receiving pub/sub updates.
        We recommend calling watch() once per day. The watch() response also has an
        expiration field with the timestamp for the watch expiration.

        :ref : https://developers.google.com/gmail/api/guides/push

        """
        while True:
            self.watch_gmail()
            time.sleep(self.GMAIL_WATCH_DELAY)

    def get_new_msg(self):
        """
        This function checks for changes to the user's mailbox and returns new msgs if any are available.
        Note : startHistoryId - returns Histories (drafts, mail deletions, new mails) after start_history_id.

        :calls : GET https://www.googleapis.com/gmail/v1/users/userId/history

        >>> from mock import Mock, MagicMock
        >>> obj = GmailHistory()
        >>> obj.store_msgs_in_db = Mock()
        >>> obj.set_tmp_ts_to_last_msg = Mock()
        >>> obj.gmail = Mock()
        >>> obj.dd = MagicMock()
        >>> sample_doc = {'history': [{'messagesAdded': [{'message': {'labelIds': ['UNREAD'], 'id': '163861dac0f17c61'}}]}]}
        >>> obj.gmail.users().history().list().execute = Mock(obj.gmail.users().history().list().execute,return_value=sample_doc)
        >>> obj.get_new_msg()
        [{'labelIds': ['UNREAD'], 'id': '163861dac0f17c61'}]

        """
        self.log.debug("get_new_msg")

        msg_list = []
        new_msg = (self.gmail.users().history().list(
            userId="me", startHistoryId=self.dd["historyId"]).execute())

        if "history" not in new_msg:
            return

        for record in new_msg.get("history"):

            if "messagesAdded" not in record:
                continue

            msg = record.get("messagesAdded")[0]["message"]

            if msg.get("labelIds")[0] == "DRAFT":
                continue

            msg_list.append(msg)

        self.store_msgs_in_db(msg_list)
        self.set_tmp_ts_to_last_msg()

        return msg_list

    def watch_gmail(self):
        """To recive Push Notifications

        In order to receive notifications from Cloud Pub/Sub topic,
        simply we can call watch() from google api client on the Gmail user mail box.
        :ref : https://developers.google.com/gmail/api/guides/push

        :calls : POST https://www.googleapis.com/gmail/v1/users/userId/watch

        >>> from mock import Mock
        >>> obj = GmailHistory()
        >>> obj.gmail = Mock()
        >>> api_doc = {'historyId':1234,'expiration':1526901631234}
        >>> obj.gmail.users().watch().execute = Mock(obj.gmail.users().watch().execute, return_value=api_doc)
        >>> obj.watch_gmail()
        {'expiration': 1526901631234, 'historyId': 1234}

        """
        self.log.debug("watch_gmail")

        request = {
            "labelIds": self.LABELIDS,
            "topicName": "{}".format(self.topic)
        }

        hstry_id = self.gmail.users().watch(userId="me",
                                            body=request).execute()

        self.log.info("Gmail_watch_id :", hstryid=hstry_id)

        return hstry_id

    def send_msgs_to_target(self, target, msg):
        """
        This function sends a msg to the target database and stores it.

        :param target : db_obj
        :param msg : dict

        """
        self.log.debug("send msgs to tatgets")

        target.insert_msg(msg)

    def write_message(self, msg):
        """
        This function helps to push msgs to the databases in an asynchronous manner if more than one db is specified.

        :param msg: dict

        """
        self.log.debug("write msgs in db")

        if self.targets:
            fn = self.send_msgs_to_target

            jobs = []
            for t in self.targets:
                jobs.append(self._pool.apply_async(fn, (t, deepcopy(msg))))

            for j in jobs:
                j.wait()

    def change_diskdict_state(self, message):
        """
        This function updates the state of the diskdict.

        :param message : dict

        """

        # for every msg, last_msg_ts is replaced with the new msg's internalDate
        self.dd["last_msg_ts"] = message["internalDate"]

        if "frst_msg_ts" not in self.dd.keys() or (self.dd["frst_msg_ts"] <=
                                                   message["internalDate"]):
            self.dd["frst_msg_ts"] = message["internalDate"]
            self.dd["historyId"] = message["historyId"]

    def store_msgs_in_db(self, msgs_list):
        """
        Gets msg ids from the list of messages, makes an api call for each msg id
        and stores the result in the db.

        :params msgs_list : list
        :calls : GET https://www.googleapis.com/gmail/v1/users/userId/messages/id

        """
        self.log.debug("store_msgs_in_db")

        for msg in msgs_list:

            message = (self.gmail.users().messages().get(
                userId="me", id=msg["id"]).execute())

            self.write_message(message)
            self.change_diskdict_state(message)

            if self.file_path:
                self.save_files(message)

    def get_default_ts(self):
        """
        This function returns tomorrow's date in Y/m/d format

        :rtype: str

        """
        self.log.debug("get_default_ts")

        return (datetime.now() + timedelta(days=1)).strftime("%Y/%m/%d")

    def get_history(self, before, after=GMAIL_CREATED_TS):
        """
        Get all the msgs from the user's mailbox with in given dates and store in the db
        Note : the Gmail api treats 'before' as an excluded date and 'after' as an included date
        Eg: before : 2017/02/01, after : 2017/01/01 gives msgs from 2017/01/01 - 2017/01/31

        :ref : https://developers.google.com/gmail/api/guides/filtering
        :calls : GET https://www.googleapis.com/gmail/v1/users/userId/messages

        :param before : string
        :param after : string
        :rtype : list

        >>> from mock import Mock
        >>> obj = GmailHistory()
        >>> obj.gmail = Mock()
        >>> api_doc = {'messages':[{'id':'163861dac0f17c61'},{'id':'1632163b6a84ab94'}]}
        >>> obj.gmail.users().messages().list().execute = Mock(obj.gmail.users().messages().list().execute, return_value=api_doc)
        >>> obj.store_msgs_in_db = Mock()
        >>> obj.get_history('2017/05/10')
        [{'id': '163861dac0f17c61'}, {'id': '1632163b6a84ab94'}]

        """
        self.log.debug("fun get history")

        query = "{} before:{} after:{}".format(self.query, before, after)
        response = (self.gmail.users().messages().list(
            userId="me", maxResults=self.MAX_RESULTS, q=query).execute())
        msgs = []
        response = AttrDict(response)

        if "messages" in response:
            msgs.extend(response.messages)
            self.store_msgs_in_db(response.messages)

        while "nextPageToken" in response:
            page_token = response.nextPageToken
            response = (self.gmail.users().messages().list(
                userId="me",
                maxResults=self.MAX_RESULTS,
                q=query,
                pageToken=page_token,
            ).execute())
            response = AttrDict(response)

            if response.resultSizeEstimate != 0:
                msgs.extend(response.messages)
                self.store_msgs_in_db(response.messages)

        return msgs

    def get_oldest_date(self, ts):
        """
        This function returns the next day's date for the given timestamp.

        :param ts: str (Unix time stamp)
        :rtype: str

        >>> obj=GmailHistory()
        >>> obj.get_oldest_date('1526901630000')
        '2018/05/22'

        """
        self.log.debug("get_oldest_date")

        return (datetime.fromtimestamp(int(ts[:10])) +
                timedelta(days=1)).strftime("%Y/%m/%d")

    def get_latest_date(self, ts):
        """
        This function returns the date for the given timestamp

        :param ts: str (Unix time stamp)
        :rtype: str

        >>> obj=GmailHistory()
        >>> obj.get_latest_date('1526901630000')
        '2018/05/21'

        """
        self.log.debug("get_latest_date")

        return (datetime.fromtimestamp(int(ts[:10]))).strftime("%Y/%m/%d")

    def start(self):
        self.log.debug("start")

        # Gets next day date from current date as before_ts in 'yr/m/d' format
        # and check for last_msg_ts key in diskdict file
        before_ts = self.get_default_ts()
        last_msg_ts = self.dd.get("last_msg_ts", 0)

        # If any messages are present in diskdict, get the last_msg_ts value and replace
        # before_ts with that timestamp in 'yr/m/d' format
        if last_msg_ts:
            before_ts = self.get_oldest_date(last_msg_ts)

        # Get and store the messages from the before_ts date back to the time Gmail was created
        self.get_history(before_ts)
        self.dd["tmp_ts"] = self.dd["last_msg_ts"]

        # Recheck for any new messages since the time execution started
        after = self.get_latest_date(self.dd["frst_msg_ts"])
        self.get_history(self.get_default_ts(), after)

        # reset last_msg_ts to tmp_ts
        self.set_tmp_ts_to_last_msg()
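get_oldest_date and get_latest_date above work on Gmail's internalDate, a millisecond epoch string, keeping only the first 10 digits (whole seconds) before converting; the same arithmetic standalone (the resulting dates depend on the local timezone):

from datetime import datetime, timedelta

ts = '1526901630000'   # millisecond epoch string, as returned by the Gmail api

latest = datetime.fromtimestamp(int(ts[:10])).strftime('%Y/%m/%d')
oldest = (datetime.fromtimestamp(int(ts[:10])) + timedelta(days=1)).strftime('%Y/%m/%d')
print(latest, oldest)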
Example #19
    def _read_from_disk(self):
        m = DiskDict(os.path.join(self.dirpath, 'meta'))

        return m, self._prepare_word_index_wvspace(m['dim'])
Example #20
    def run(self):
        global completed_tasks

        if not self.opt.results_path:
            results_path = "results/default_results_{}".format(self.datasetname)
        else:
            results_path = self.opt.results_path

        if not self.opt.continue_run:
            self.logger.info("not continuing")
            os.remove(results_path)
        else:
            self.logger.info("continuing previous run(s)")

        completed_tasks = DiskDict(results_path)
        self.logger.info("found {} previous results".format(len(completed_tasks)))

        self.logger.info("preparing experiment setups...")
        experiment_descs = defaultdict(list)
        previous_tasks = Counter()
        with tqdm(total=self.combination_count) as pbar:
            for i, named_combination in enumerate(self.named_combinations):
                _experiment_named_id = self._experiment_named_id_from_named_combination(
                    named_combination
                )

                pool_id = i % self.opt.num_workers

                if _experiment_named_id in completed_tasks:
                    task_desc = completed_tasks[_experiment_named_id]

                    if task_desc["rc"] != 0:
                        if self.opt.rerun_non_rc0:
                            self.logger.debug(
                                "task {} was already executed, but with rc={}. rerunning.".format(
                                    _experiment_named_id, task_desc["rc"]
                                )
                            )
                            (
                                cmd,
                                human_cmd,
                                experiment_path,
                                cuda_device_id,
                            ) = self.prepare_single_setup(named_combination, i)
                            # if cuda_device_id != -1:
                            #    pool_id = cuda_device_id

                            experiment_descs[pool_id].append(
                                (
                                    i,
                                    _experiment_named_id,
                                    named_combination,
                                    cmd,
                                    human_cmd,
                                    experiment_path,
                                )
                            )

                            previous_tasks["new"] += 1
                            del completed_tasks[_experiment_named_id]
                        else:
                            self.logger.debug(
                                "task {} was already executed, but with rc={}. not rerunning.".format(
                                    _experiment_named_id, task_desc["rc"]
                                )
                            )
                            previous_tasks["rcnon0"] += 1
                    else:
                        # rerun tasks where the rc != 0 (always rerun tasks that have not been executed at all, yet)
                        self.logger.debug(
                            "skipping experiment: {}".format(_experiment_named_id)
                        )
                        self.logger.debug(
                            "previous result: {}".format(
                                completed_tasks[_experiment_named_id]
                            )
                        )
                        previous_tasks["rc0"] += 1
                else:
                    (
                        cmd,
                        human_cmd,
                        experiment_path,
                        cuda_device_id,
                    ) = self.prepare_single_setup(named_combination, i)
                    # if cuda_device_id != -1:
                    #    pool_id = cuda_device_id

                    experiment_descs[pool_id].append(
                        (
                            i,
                            _experiment_named_id,
                            named_combination,
                            cmd,
                            human_cmd,
                            experiment_path,
                        )
                    )
                    previous_tasks["new"] += 1
                pbar.update(1)

        self.logger.info(
            "summary (new is also increased for tasks that were executed previously but yielded rc!=0 - if rerun_non_rc0==True"
        )
        self.logger.info("{}".format(previous_tasks))

        self.logger.info("starting {} experiments".format(self.combination_count))
        self.logger.info(
            "creating {} process pools with each 1 worker".format(self.opt.num_workers)
        )

        if self.cuda_devices and len(self.cuda_devices) != self.opt.num_workers:
            self.logger.warning(
                "number of cuda devices does not match number of workers: {} vs. {}".format(
                    len(self.cuda_devices), self.opt.num_workers
                )
            )

        manager = multiprocessing.Manager()
        running_processes = manager.dict()
        for pool_index in range(self.opt.num_workers):
            pool = multiprocessing.Pool(processes=1, maxtasksperchild=1)
            for desc in experiment_descs[pool_index]:
                proc_args = desc + (running_processes,)  # must be a tuple
                pool.apply_async(
                    start_worker,
                    args=proc_args,
                    callback=on_task_done,
                    error_callback=on_task_error,
                )
            self.logger.info(
                f"created pool with {len(experiment_descs[pool_index])} tasks, each processed by 1 worker"
            )

        self.logger.info("waiting for workers to complete all jobs...")
        prev_count_done = 0
        with tqdm(total=previous_tasks["new"], initial=prev_count_done) as pbar:
            while completed_tasks_in_this_run_count < previous_tasks["new"]:
                time.sleep(10)
                update_inc = completed_tasks_in_this_run_count - prev_count_done

                if update_inc > 0:
                    pbar.update(update_inc)
                    prev_count_done = completed_tasks_in_this_run_count
                    completed_tasks.sync_to_disk()

                best_dev_snem = self._get_best_dev_snem()
                sorted_running_procs = sorted(
                    self._get_running_processes(running_processes),
                    key=lambda running_device: running_device[1],
                )
                pbar.set_postfix_str(
                    f"dev-snem: {best_dev_snem:.4f}; prcs/devs: {sorted_running_procs}"
                )

        completed_tasks.sync_to_disk()
        self.logger.info(
            f"finished all tasks ({completed_tasks_in_this_run_count} of {previous_tasks['new']})"
        )

        # copy and close shelve to preserve its original state
        processed_results = dict(completed_tasks)
        completed_tasks.sync_to_disk()

        experiments_rc_overview = Counter()
        non_okay_experiment_ids = []
        for experiment_named_id, experiment_result in processed_results.items():
            rc = experiment_result["rc"]
            experiments_rc_overview[rc] += 1

            if rc != 0:
                non_okay_experiment_ids.append(experiment_result["experiment_id"])

        if non_okay_experiment_ids:
            self.logger.warning(
                f"{len(non_okay_experiment_ids)} experiments did not return 0: {sorted(non_okay_experiment_ids)}"
            )

        # snem-based performance sort
        sorted_results = list(processed_results.values())
        for result in sorted_results:
            if result["details"]:
                result["dev_snem"] = result["details"]["dev_stats"][self.snem]
                del result["details"]
            else:
                result["dev_snem"] = -1.0
        sorted_results.sort(key=lambda x: x["dev_snem"], reverse=True)
        headers = list(sorted_results[0].keys())
        rows = [x.values() for x in sorted_results]

        self.logger.info("all experiments finished. statistics:")
        self.logger.debug("snem-based performances:")
        self.logger.debug("\n" + tabulate(rows, headers))
        self.logger.info("return codes: {}".format(experiments_rc_overview))
Example #21
0
class LogCollector():
    DESC = 'Collects the log information and sends to NSQTopic'

    QUEUE_MAX_SIZE = 2000 # Maximum number of messages in in-mem queue
    MAX_NBYTES_TO_SEND = 4.5 * (1024**2) # Maximum number of bytes to push to nsq in one mpub call
    MIN_NBYTES_TO_SEND = 512 * 1024 # Minimum number of bytes to send to nsq in mpub
    MAX_SECONDS_TO_PUSH = 1 # Wait till this much time elapses before pushing
    LOG_FILE_POLL_INTERVAL = 0.25 # Wait time to pull log file for new lines added
    QUEUE_READ_TIMEOUT = 1 # Wait time when doing blocking read on the in-mem q
    PYGTAIL_ACK_WAIT_TIME = 0.05 # TODO: Document this
    SCAN_FPATTERNS_INTERVAL = 30 # How often to scan filesystem for files matching fpatterns
    HOST = socket.gethostname()
    HEARTBEAT_RESTART_INTERVAL = 30 # Wait time if heartbeat sending stops
    LOGAGGFS_FPATH_PATTERN = re.compile(r"[a-fA-F\d]{32}") # MD5 hash pattern
    SERVERSTATS_FPATH = '/var/log/serverstats/serverstats.log' # Path to serverstats log file
    DOCKER_FORMATTER = 'logagg.formatters.docker_file_log_driver' # Formatter name for docker_file_log_driver
    BASESCRIPT_FORMATTER = 'logagg.formatters.basescript' #FIXME: remove later

    LOG_STRUCTURE = {
        'id': basestring,
        'timestamp': basestring,
        'file' : basestring,
        'host': basestring,
        'formatter' : basestring,
        'raw' : basestring,
        'type' : basestring,
        'level' : basestring,
        'event' : basestring,
        'data' : dict,
        'error' : bool,
        'error_tb' : basestring,
    }


    def __init__(self, data_dir, logaggfs_dir, master, log):

        # For storing state
        data_path = os.path.abspath(os.path.join(data_dir, 'logagg-data'))
        self.data_path = util.ensure_dir(data_path)

        # For log file that have been read
        archive_path = os.path.abspath(os.path.join(data_dir, 'logagg-archive'))
        self.archive_dir = util.ensure_dir(archive_path)

        self.master = master

        self.log = log

        # For remembering the state of files
        self.state = DiskDict(self.data_path)

        # Initialize logaggfs paths
        self.logaggfs = self._init_logaggfs_paths(logaggfs_dir)

        # Log fpath to thread mapping
        self.log_reader_threads = {}
        # Handle name to formatter fn obj map
        self.formatters = {}
        self.queue = queue.Queue(maxsize=self.QUEUE_MAX_SIZE)

        # Add initial files i.e. serverstats to state
        if not self.state['fpaths']:
            self.log.info('init_fpaths')
            self._init_fpaths()

        # Create nsq_sender
        self.log.info('init_nsq_sender')
        self._init_nsq_sender()
        #self.nsq_sender = util.DUMMY

        self._ensure_trackfiles_sync()


    def _init_fpaths(self):
        '''
        Files to be collected by default
        '''
        self.state['fpaths'] = [{'fpath':self.SERVERSTATS_FPATH,
            'formatter':self.BASESCRIPT_FORMATTER}]
        self.state.flush()
        return self.state['fpaths']


    def _init_nsq_sender(self):
        '''
        Initialize nsq_sender on startup
        '''
        #FIXME: Take this from master
        self.nsq_sender = NSQSender('localhost:4151', 'logagg', self.log)
        return self.nsq_sender


    def _init_logaggfs_paths(self, logaggfs_dir):
        '''
        Logaggfs directories and file initialization
        '''
        logaggfs = AttrDict()
        logaggfs.logcache = logaggfs_dir
        logaggfs.logs_dir = os.path.abspath(os.path.join(logaggfs.logcache, 'logs'))
        logaggfs.trackfiles = os.path.abspath(os.path.join(logaggfs.logcache, 'trackfiles.txt'))
        return logaggfs


    def _ensure_trackfiles_sync(self):
        '''
        Make sure fpaths in logagg state-file are present in
        logaggfs trackfiles on start up
        '''
        # If all the files are present in trackfiles
        for f in self.state['fpaths']:
            if not self._fpath_in_trackfiles(f['fpath']):
                self.add_to_logaggfs_trackfile(f['fpath'])

    def _remove_redundancy(self, log):
        '''
        Removes duplicate data from 'data' inside log dict and brings it out

        >>> lc = LogCollector('file=/path/to/log_file.log:formatter=logagg.formatters.basescript', 30)

        >>> log = {'id' : 46846876, 'type' : 'log',
        ...         'data' : {'a' : 1, 'b' : 2, 'type' : 'metric'}}
        >>> lc._remove_redundancy(log)
        {'data': {'a': 1, 'b': 2}, 'type': 'metric', 'id': 46846876}
        '''
        for key in log:
            if key in log and key in log['data']:
                log[key] = log['data'].pop(key)
        return log


    def _validate_log_format(self, log):
        '''
        Assert if the formatted log is of the same structure as specified

        >>> lc = LogCollector('file=/path/to/file.log:formatter=logagg.formatters.basescript', 30)

        >>> incomplete_log = {'data' : {'x' : 1, 'y' : 2},
        ...                     'raw' : 'Not all keys present'}
        >>> lc._validate_log_format(incomplete_log)
        'failed'

        >>> redundant_log = {'one_invalid_key' : 'Extra information',
        ...  'data': {'x' : 1, 'y' : 2},
        ...  'error': False,
        ...  'error_tb': '',
        ...  'event': 'event',
        ...  'file': '/path/to/file.log',
        ...  'formatter': 'logagg.formatters.mongodb',
        ...  'host': 'deepcompute-ThinkPad-E470',
        ...  'id': '0112358',
        ...  'level': 'debug',
        ...  'raw': 'some log line here',
        ...  'timestamp': '2018-04-07T14:06:17.404818',
        ...  'type': 'log'}
        >>> lc._validate_log_format(redundant_log)
        'failed'

        >>> correct_log = {'data': {'x' : 1, 'y' : 2},
        ...  'error': False,
        ...  'error_tb': '',
        ...  'event': 'event',
        ...  'file': '/path/to/file.log',
        ...  'formatter': 'logagg.formatters.mongodb',
        ...  'host': 'deepcompute-ThinkPad-E470',
        ...  'id': '0112358',
        ...  'level': 'debug',
        ...  'raw': 'some log line here',
        ...  'timestamp': '2018-04-07T14:06:17.404818',
        ...  'type': 'log'}
        >>> lc._validate_log_format(correct_log)
        'passed'
        '''

        keys_in_log = set(log)
        keys_in_log_structure = set(self.LOG_STRUCTURE)

        # Check keys
        try:
            assert (keys_in_log == keys_in_log_structure)
        except AssertionError as e:
            self.log.warning('formatted_log_structure_rejected' ,
                                key_not_found = list(keys_in_log_structure-keys_in_log),
                                extra_keys_found = list(keys_in_log-keys_in_log_structure),
                                num_logs=1,
                                type='metric')
            return 'failed'

        # Check datatype of values
        for key in log:
            try:
                assert isinstance(log[key], self.LOG_STRUCTURE[key])
            except AssertionError as e:
                self.log.warning('formatted_log_structure_rejected' ,
                                    key_datatype_not_matched = key,
                                    datatype_expected = type(self.LOG_STRUCTURE[key]),
                                    datatype_got = type(log[key]),
                                    num_logs=1,
                                    type='metric')
                return 'failed'

        return 'passed'


    def _full_from_frags(self, frags):
        '''
        Join partial lines to full lines
        '''
        full_line = '\n'.join([l for l, _ in frags])
        line_info = frags[-1][-1]
        return full_line, line_info


    def _iter_logs(self, freader, fmtfn):
        '''
        Iterate over log lines and assemble full lines from partial ones
        '''
        # FIXME: does not handle partial lines at the start of a file properly

        frags = []

        for line_info in freader:
            # Remove new line char at the end
            line = line_info['line'][:-1]
            if not fmtfn.ispartial(line) and frags:
                yield self._full_from_frags(frags)
                frags = []

            frags.append((line, line_info))

        if frags:
            yield self._full_from_frags(frags)


    def _assign_default_log_values(self, fpath, line, formatter):
        '''
        >>> lc = LogCollector('file=/path/to/log_file.log:formatter=logagg.formatters.basescript', 30)
        >>> from pprint import pprint

        >>> formatter = 'logagg.formatters.mongodb'
        >>> fpath = '/var/log/mongodb/mongodb.log'
        >>> line = 'some log line here'

        >>> default_log = lc._assign_default_log_values(fpath, line, formatter)
        >>> pprint(default_log) #doctest: +ELLIPSIS
        {'data': {},
         'error': False,
         'error_tb': '',
         'event': 'event',
         'file': '/var/log/mongodb/mongodb.log',
         'formatter': 'logagg.formatters.mongodb',
         'host': '...',
         'id': None,
         'level': 'debug',
         'raw': 'some log line here',
         'timestamp': '...',
         'type': 'log'}
        '''
        return dict(
            id=None,
            file=fpath,
            host=self.HOST,
            formatter=formatter,
            event='event',
            data={},
            raw=line,
            timestamp=datetime.datetime.utcnow().isoformat(),
            type='log',
            level='debug',
            error= False,
            error_tb='',
          )


    def _archive_file(self, fpath):
        '''
        Move a log file out of the logaggfs 'logs' directory into the archive directory and remove its offset file
        '''
        shutil.move(fpath, self.archive_dir+'/'+fpath.split('/')[-1])
        os.remove(fpath+'.offset')


    @keeprunning(LOG_FILE_POLL_INTERVAL, on_error=util.log_exception)
    def _collect_log_files(self, log_files):
        '''
        Collect from log files in logaggfs 'logs' one by one
        '''

        L = log_files
        # Sorted list of all the files for one pattern
        fpaths = glob.glob(join(self.logaggfs.logs_dir, L['fpattern']))
        fpaths = sorted(fpaths)

        for f in fpaths:
            log_files.update({'fpath': f})
            # If last file in the list keep polling until next file arrives
            self._collect_log_lines(log_files)
            if not f == fpaths[-1]:
                self.log.debug('archiving_file', f=f)
                self._archive_file(f)
        time.sleep(1)


    def _collect_log_lines(self, log_file):
        '''
        Collects logs from logfiles, formats and puts in queue
        '''
        L = log_file
        fpath = L['fpath']
        fmtfn = L['formatter_fn']
        formatter = L['formatter']

        freader = Pygtail(fpath)
        for line, line_info in self._iter_logs(freader, fmtfn):
            log = self._assign_default_log_values(fpath, line, formatter)

            try:
                _log = fmtfn(line)
                # Identify logs inside a log
                # Like process logs inside docker logs
                if isinstance(_log, RawLog):
                    formatter, raw_log = _log['formatter'], _log['raw']
                    log.update(_log)
                    # Give them to actual formatters
                    _log = load_formatter_fn(formatter)(raw_log)

                log.update(_log)
            except (SystemExit, KeyboardInterrupt) as e: raise
            except:
                log['error'] = True
                log['error_tb'] = traceback.format_exc()
                self.log.exception('error_during_handling_log_line', log=log['raw'])

            if log['id'] is None:
                log['id'] = uuid.uuid1().hex

            log = self._remove_redundancy(log)
            if self._validate_log_format(log) == 'failed': continue

            self.queue.put(dict(log=json.dumps(log),
                                freader=freader, line_info=line_info))
            self.log.debug('tally:put_into_self.queue', size=self.queue.qsize())

        while not freader.is_fully_acknowledged():
            t = self.PYGTAIL_ACK_WAIT_TIME
            self.log.debug('waiting_for_pygtail_to_fully_ack', wait_time=t)
            time.sleep(t)


    def _get_msgs_from_queue(self, msgs, timeout):
        msgs_pending = []
        read_from_q = False
        ts = time.time()

        msgs_nbytes = sum(len(m['log']) for m in msgs)

        while 1:
            try:
                msg = self.queue.get(block=True, timeout=self.QUEUE_READ_TIMEOUT)
                read_from_q = True
                self.log.debug("tally:get_from_self.queue")

                _msgs_nbytes = msgs_nbytes + len(msg['log'])
                _msgs_nbytes += 1 # for newline char

                if _msgs_nbytes > self.MAX_NBYTES_TO_SEND:
                    msgs_pending.append(msg)
                    self.log.debug('msg_bytes_read_mem_queue_exceeded')
                    break

                msgs.append(msg)
                msgs_nbytes = _msgs_nbytes

                #FIXME condition never met
                if time.time() - ts >= timeout and msgs:
                    self.log.debug('msg_reading_timeout_from_mem_queue_got_exceeded')
                    break
                    # TODO: What if a single log message itself is bigger than max bytes limit?

            except queue.Empty:
                self.log.debug('queue_empty')
                time.sleep(self.QUEUE_READ_TIMEOUT)
                if not msgs:
                    continue
                else:
                    return msgs_pending, msgs_nbytes, read_from_q

        self.log.debug('got_msgs_from_mem_queue')
        return msgs_pending, msgs_nbytes, read_from_q


    @keeprunning(0, on_error=util.log_exception) # FIXME: what wait time var here?
    def _send_to_nsq(self, state):
        msgs = []
        should_push = False

        while not should_push:
            cur_ts = time.time()
            self.log.debug('should_push', should_push=should_push)
            time_since_last_push = cur_ts - state.last_push_ts

            msgs_pending, msgs_nbytes, read_from_q = self._get_msgs_from_queue(msgs,
                                                                        self.MAX_SECONDS_TO_PUSH)

            have_enough_msgs = msgs_nbytes >= self.MIN_NBYTES_TO_SEND
            is_max_time_elapsed = time_since_last_push >= self.MAX_SECONDS_TO_PUSH

            should_push = len(msgs) > 0 and (is_max_time_elapsed or have_enough_msgs)
            self.log.debug('deciding_to_push', should_push=should_push,
                            time_since_last_push=time_since_last_push,
                            msgs_nbytes=msgs_nbytes)

        try:
            if isinstance(self.nsq_sender, type(util.DUMMY)):
                for m in msgs:
                    self.log.info('final_log_format', log=m['log'])
            else:
                self.log.debug('trying_to_push_to_nsq', msgs_length=len(msgs))
                self.nsq_sender.handle_logs(msgs)
                self.log.debug('pushed_to_nsq', msgs_length=len(msgs))
            self._confirm_success(msgs)
            msgs = msgs_pending
            state.last_push_ts = time.time()
        except (SystemExit, KeyboardInterrupt): raise
        finally:
            if read_from_q: self.queue.task_done()


    def _confirm_success(self, msgs):
        ack_fnames = set()

        for msg in reversed(msgs):
            freader = msg['freader']
            fname = freader.filename

            if fname in ack_fnames:
                continue

            ack_fnames.add(fname)
            freader.update_offset_file(msg['line_info'])


    def _compute_md5_fpatterns(self, f):
        '''
        For a filepath in logaggfs logs directory compute 'md5*.log' pattern
        '''
        fpath = f.encode("utf-8")
        d = self.logaggfs.logs_dir
        dir_contents = [f for f in os.listdir(d) if bool(self.LOGAGGFS_FPATH_PATTERN.match(f)) and isfile(join(d, f))]
        dir_contents = set(dir_contents)
        for c in dir_contents:
            if md5(fpath).hexdigest() == c.split('.')[0]:
                return md5(fpath).hexdigest() + '*' + '.log'


    @keeprunning(SCAN_FPATTERNS_INTERVAL, on_error=util.log_exception)
    def _scan_fpatterns(self, state):
        '''
        For a list of given fpatterns or a logaggfs directory,
        this starts a thread collecting log lines from file

        >>> os.path.isfile = lambda path: path == '/path/to/log_file.log'
        >>> lc = LogCollector('file=/path/to/log_file.log:formatter=logagg.formatters.basescript', 30)

        >>> print(lc.fpaths)
        file=/path/to/log_file.log:formatter=logagg.formatters.basescript

        >>> print('formatters loaded:', lc.formatters)
        {}
        >>> print('log file reader threads started:', lc.log_reader_threads)
        {}
        >>> state = AttrDict(files_tracked=list())
        >>> print('files being tracked:', state.files_tracked)
        []


        >>> if not state.files_tracked:
        >>>     lc._scan_fpatterns(state)
        >>>     print('formatters loaded:', lc.formatters)
        >>>     print('log file reader threads started:', lc.log_reader_threads)
        >>>     print('files being tracked:', state.files_tracked)
        '''
        for f in self.state['fpaths']:

            # For supporting file patterns rather than file paths
            for fpath in glob.glob(f['fpath']):

                # Compute 'md5(filename)*.log' fpattern for fpath
                fpattern, formatter = self._compute_md5_fpatterns(fpath), f['formatter']
                # When no md5 pattern filenames are found for the fpath in logaggfs logs directory
                if fpattern is None: continue
                self.log.debug('_scan_fpatterns', fpattern=fpattern, formatter=formatter)
                try:
                    formatter_fn = self.formatters.get(formatter,
                                  load_formatter_fn(formatter))
                    self.log.debug('found_formatter_fn', fn=formatter)
                    self.formatters[formatter] = formatter_fn
                except (SystemExit, KeyboardInterrupt): raise
                except (ImportError, AttributeError):
                    self.log.exception('formatter_fn_not_found', fn=formatter)
                    sys.exit(-1)
                # Start a thread for every filepattern
                log_f = dict(fpattern=fpattern,
                                formatter=formatter, formatter_fn=formatter_fn)
                log_key = (f['fpath'], fpattern, formatter)
                if log_key not in self.log_reader_threads:

                    self.log.info('starting_collect_log_files_thread', log_key=log_key)
                    # There is no existing thread tracking this log file, start one.
                    log_reader_thread = util.start_daemon_thread(self._collect_log_files, (log_f,))
                    self.log_reader_threads[log_key] = log_reader_thread

        time.sleep(self.SCAN_FPATTERNS_INTERVAL)


    @keeprunning(HEARTBEAT_RESTART_INTERVAL, on_error=util.log_exception)
    def _send_heartbeat(self, state):

        # Sends continuous heartbeats to a separate topic in nsq
        if self.log_reader_threads:
            files_tracked = list(self.log_reader_threads.keys())
        else:
            files_tracked = ''

        heartbeat_payload = {'host': self.HOST,
                            'heartbeat_number': state.heartbeat_number,
                            'timestamp': time.time(),
                            'nsq_topic': self.nsq_sender.topic_name,
                            'files_tracked': files_tracked
                            }
        self.nsq_sender.handle_heartbeat(heartbeat_payload)
        state.heartbeat_number += 1
        time.sleep(self.HEARTBEAT_RESTART_INTERVAL)


    def collect(self):

        # start tracking files and put formatted log lines into queue
        state = AttrDict(files_tracked=list())
        util.start_daemon_thread(self._scan_fpatterns, (state,))

        # start extracting formatted logs from queue and send to nsq
        state = AttrDict(last_push_ts=time.time())
        util.start_daemon_thread(self._send_to_nsq, (state,))

        # start sending heartbeat to "Hearbeat" topic
        state = AttrDict(heartbeat_number=0)
        self.log.info('init_heartbeat')
        th_heartbeat = util.start_daemon_thread(self._send_heartbeat, (state,))


    def _fpath_in_trackfiles(self, fpath):
        '''
        Check whether fpath is present in logaggfs trackfiles.txt
        '''

        # List of files in trackfiles.txt
        with open(self.logaggfs.trackfiles, 'r') as f:
            tf = f.readlines()

        for path in tf:
            if path[:-1] == fpath: return True
        return False

    def add_to_logaggfs_trackfile(self, fpath):
        '''
        Given an fpath, add it to logaggfs trackfiles.txt by writing a temp file and moving it into place
        '''
        fd, tmpfile = tempfile.mkstemp()
        os.close(fd)

        with open(self.logaggfs.trackfiles, 'r') as f:
            old = f.read()
            new = fpath
            # Write previous files and add the new file
            if not self._fpath_in_trackfiles(new):
                with open(tmpfile, 'w') as t: t.write((old+new+'\n'))
                shutil.move(tmpfile, self.logaggfs.trackfiles)


    def remove_from_logaggfs_trackfile(self, fpath):

        # Given a fpath remove it from logaggfs trackfiles.txt by writing a temp file and moving it into place
        fd, tmpfile = tempfile.mkstemp()
        os.close(fd)

        with open(self.logaggfs.trackfiles, 'r') as f:
            paths = [line[:-1] for line in f.readlines()]

        # Write back every path except the one being removed
        with open(tmpfile, 'w') as t:
            for p in paths:
                if p != fpath:
                    t.write(p + '\n')

        shutil.move(tmpfile, self.logaggfs.trackfiles)
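
Both trackfile helpers above follow the same update pattern: write the new contents to a temporary file, then move that file over trackfiles.txt. Below is a minimal, generic sketch of that pattern only; the function name and paths are made up for illustration and are not part of logagg:

# Hypothetical helper illustrating the write-temp-then-move update used above.
import os
import shutil
import tempfile

def append_line_atomically(path, line):
    fd, tmpfile = tempfile.mkstemp()
    os.close(fd)  # mkstemp returns an open descriptor; close it before reusing the path

    with open(path) as f:
        old = f.read()
    with open(tmpfile, "w") as t:
        t.write(old + line + "\n")

    # Replace the target in one move; the rename is only truly atomic when the
    # temp file lives on the same filesystem as the target.
    shutil.move(tmpfile, path)

# append_line_atomically("trackfiles.txt", "/var/log/myapp/app.log")
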
Example #22
0
    def _read_from_disk(self):
        m = DiskDict(os.path.join(self.dirpath, "meta"))

        return m, self._prepare_word_index_wvspace(m["dim"], mode="r")
Example #23
0
class LogCollector:
    DESC = "Collects the log information and sends to NSQTopic"
    NAMESPACE = "collector"
    REGISTER_URL = "http://{master_address}/logagg/v1/register_component?namespace={namespace}&topic_name={topic_name}&host={host}&port={port}"
    GET_TOPIC_INFO_URL = (
        "http://{host}:{port}/logagg/v1/get_topic_info?topic_name={topic_name}"
    )

    QUEUE_MAX_SIZE = 2000  # Maximum number of messages in in-mem queue
    MAX_NBYTES_TO_SEND = 4.5 * (
        1024**2
    )  # Maximum number of bytes to push to nsq in one mpub call
    MIN_NBYTES_TO_SEND = 512 * 1024  # Minimum number of bytes to send to nsq in mpub
    MAX_SECONDS_TO_PUSH = 1  # Wait till this much time elapses before pushing
    LOG_FILE_POLL_INTERVAL = 0.25  # Wait time to pull log file for new lines added
    QUEUE_READ_TIMEOUT = 1  # Wait time when doing blocking read on the in-mem q
    PYGTAIL_ACK_WAIT_TIME = 0.05  # TODO: Document this
    SCAN_FPATTERNS_INTERVAL = (
        30)  # How often to scan filesystem for files matching fpatterns
    HEARTBEAT_RESTART_INTERVAL = 30  # Wait time if heartbeat sending stops
    LOGAGGFS_FPATH_PATTERN = re.compile("[a-fA-F\d]{32}")  # MD5 hash pattern
    SERVERSTATS_FPATH = ("/var/log/serverstats/serverstats.log"
                         )  # Path to serverstats log file
    DOCKER_FORMATTER = ("logagg_collector.formatters.docker_file_log_driver"
                        )  # Formatter name for docker_file_log_driver

    LOG_STRUCTURE = {
        "id": basestring,
        "timestamp": basestring,
        "file": basestring,
        "host": basestring,
        "formatter": basestring,
        "raw": basestring,
        "type": basestring,
        "level": basestring,
        "event": basestring,
        "data": dict,
        "error": bool,
        "error_tb": basestring,
    }

    def __init__(self,
                 host,
                 port,
                 master,
                 data_dir,
                 logaggfs_dir,
                 log=utils.DUMMY):

        self.host = host
        self.port = port
        self.master = master

        # For storing state
        data_path = os.path.abspath(os.path.join(data_dir, "logagg-data"))
        self.data_path = utils.ensure_dir(data_path)

        self.master = master

        self.log = log

        # For remembering the state of files
        self.state = DiskDict(self.data_path)

        # Initialize logaggfs paths
        self.logaggfs = self._init_logaggfs_paths(logaggfs_dir)

        # Log fpath to thread mapping
        self.log_reader_threads = {}
        # Handle name to formatter fn obj map
        self.formatters = {}
        self.queue = queue.Queue(maxsize=self.QUEUE_MAX_SIZE)

        # Add initial files i.e. serverstats to state
        if not self.state["fpaths"]:
            self.log.info("init_fpaths")
            self._init_fpaths()

        # register self to master
        self.register_to_master()

        # Create nsq_sender
        self.nsq_sender_logs, self.nsq_sender_heartbeat = self._init_nsq_sender(
        )

        self._ensure_trackfiles_sync()

    def register_to_master(self):
        """
        Request authentication with master details
        
        Sample url: http://localhost:1088/logagg/v1/register_component?namespace=master
                    &topic_name=logagg&host=localhost&port=1088
        """
        # TODO: test case
        master = self.master
        url = self.REGISTER_URL.format(
            master_address=master.host + ":" + master.port,
            namespace=self.NAMESPACE,
            topic_name=master.topic_name,
            host=self.host,
            port=self.port,
        )

        try:
            register = requests.get(url)
            register_result = json.loads(register.content.decode("utf-8"))
            if not register_result.get("success"):
                err_msg = register_result.get("message")
                raise Exception(err_msg)

            if register_result["result"].get("success"):
                return register_result
            else:
                err_msg = register_result.get["result"]
                raise Exception(err_msg)

        except requests.exceptions.ConnectionError:
            err_msg = "Could not reach master, url: {}".format(url)
            raise Exception(err_msg)

    def _init_fpaths(self):
        """
        Files to be collected by default

        >>> import tempfile
        >>> temp_dir = tempfile.TemporaryDirectory()
        >>> temp_dir_path = temp_dir.name
        >>> trackfile = open(temp_dir_path+'/trackfiles.txt', 'w+'); trackfile.close()
        >>> lc = collector = LogCollector('localhost', '1088', None, temp_dir_path, temp_dir_path)

        >>> lc._init_fpaths()
        [{'fpath': '/var/log/serverstats/serverstats.log', 'formatter': 'logagg_collector.formatters.docker_file_log_driver'}]

        >>> temp_dir.cleanup()
        """
        self.state["fpaths"] = [{
            "fpath": self.SERVERSTATS_FPATH,
            "formatter": self.DOCKER_FORMATTER
        }]
        self.state.flush()
        return self.state["fpaths"]

    def _init_nsq_sender(self):
        """
        Initialize nsq_sender on startup
        >>> import tempfile
        >>> temp_dir = tempfile.TemporaryDirectory()
        >>> temp_dir_path = temp_dir.name
        >>> trackfile = open(temp_dir_path+'/trackfiles.txt', 'w+'); trackfile.close()
        >>> lc = collector = LogCollector('localhost', '1088', None, temp_dir_path, temp_dir_path)

        >>> lc._init_nsq_sender() # doctest: +ELLIPSIS
        (<deeputil.misc.Dummy object at ...>, <deeputil.misc.Dummy object at ...>)

        >>> temp_dir.cleanup()
        """
        # TODO: test cases for master mode

        # Check if running on master mode or not
        if self.master is None:
            self.log.warn("nsq_not_set",
                          msg="will send formatted logs to stdout")
            return utils.DUMMY, utils.DUMMY

        # Prepare to request NSQ details from master
        url = self.GET_TOPIC_INFO_URL.format(
            host=self.master.host,
            port=self.master.port,
            topic_name=self.master.topic_name,
        )
        try:
            get_topic_info = requests.get(url)
            get_topic_info_result = json.loads(
                get_topic_info.content.decode("utf-8"))

        except requests.exceptions.ConnectionError:
            err_msg = "Could not reach master, url: {}".format(url)
            raise Exception(err_msg)

        if get_topic_info_result["result"].get("success"):
            nsqd_http_address = get_topic_info_result["result"]["topic_info"][
                "nsqd_http_address"]
            heartbeat_topic = get_topic_info_result["result"]["topic_info"][
                "heartbeat_topic"]
            logs_topic = get_topic_info_result["result"]["topic_info"][
                "logs_topic"]
            nsq_depth_limit = get_topic_info_result["result"]["topic_info"][
                "nsq_depth_limit"]

            # Create NSQSender object for sending logs and heartbeats
            nsq_sender_heartbeat = NSQSender(nsqd_http_address,
                                             heartbeat_topic, self.log)
            nsq_sender_logs = NSQSender(nsqd_http_address, logs_topic,
                                        self.log)
            return nsq_sender_logs, nsq_sender_heartbeat

        else:
            err_msg = get_topic_info_result["result"].get("details")
            raise Exception(err_msg)

    def _init_logaggfs_paths(self, logaggfs_dir):
        """
        Logaggfs directories and file initialization

        >>> test_dir = utils.ensure_dir('/tmp/xyz')
        >>> trackfile = open(test_dir+'/trackfiles.txt', 'w+'); trackfile.close()
        >>> lc = collector = LogCollector('localhost', '1088', None, test_dir, test_dir)

        >>> lc._init_logaggfs_paths(test_dir)
        AttrDict({'logcache': '/tmp/xyz', 'logs_dir': '/tmp/xyz/logs', 'trackfiles': '/tmp/xyz/trackfiles.txt'})

        >>> import shutil; shutil.rmtree(test_dir)
        """
        logaggfs = AttrDict()
        logaggfs.logcache = logaggfs_dir
        logaggfs.logs_dir = os.path.abspath(
            os.path.join(logaggfs.logcache, "logs"))
        logaggfs.trackfiles = os.path.abspath(
            os.path.join(logaggfs.logcache, "trackfiles.txt"))
        return logaggfs

    def _ensure_trackfiles_sync(self):
        """
        Make sure fpaths in logagg state-file are present in logaggfs trackfiles on start up

        >>> import tempfile
        >>> temp_dir = tempfile.TemporaryDirectory()
        >>> temp_dir_path = temp_dir.name
        >>> trackfile = open(temp_dir_path+'/trackfiles.txt', 'w+'); trackfile.close()
        >>> lc = collector = LogCollector('localhost', '1088', None, temp_dir_path, temp_dir_path)

        >>> with open(lc.logaggfs.trackfiles) as f: f.read()
        '/var/log/serverstats/serverstats.log\\n'
        >>> lc.state['fpaths'] = [{'fpath': '/var/log/some.log'}]; lc.state.flush()
        >>> lc._ensure_trackfiles_sync()
        >>> with open(lc.logaggfs.trackfiles) as f: f.read()
        '/var/log/serverstats/serverstats.log\\n/var/log/some.log\\n'

        >>> temp_dir.cleanup()
        """
        # If all the files are present in trackfiles
        for f in self.state["fpaths"]:
            if not self._fpath_in_trackfiles(f["fpath"]):
                self.add_to_logaggfs_trackfile(f["fpath"])

    def _remove_redundancy(self, log):
        """
        Removes duplicate data from 'data' inside log dictionary and brings it out

        >>> import tempfile
        >>> temp_dir = tempfile.TemporaryDirectory()
        >>> temp_dir_path = temp_dir.name
        >>> trackfile = open(temp_dir_path+'/trackfiles.txt', 'w+'); trackfile.close()
        >>> lc = collector = LogCollector('localhost', '1088', None, temp_dir_path, temp_dir_path)

        >>> log = {'id' : 46846876, 'type' : 'log',
        ...         'data' : {'a' : 1, 'b' : 2, 'type' : 'metric'}}
        >>> from pprint import pprint
        >>> pprint(lc._remove_redundancy(log))
        {'data': {'a': 1, 'b': 2}, 'id': 46846876, 'type': 'metric'}

        >>> temp_dir.cleanup()
        """
        for key in log:
            if key in log["data"]:
                log[key] = log["data"].pop(key)
        return log

    def _validate_log_format(self, log):
        """
        Assert if the formatted log is of the same structure as specified

        >>> import tempfile
        >>> temp_dir = tempfile.TemporaryDirectory()
        >>> temp_dir_path = temp_dir.name
        >>> trackfile = open(temp_dir_path+'/trackfiles.txt', 'w+'); trackfile.close()
        >>> lc = collector = LogCollector('localhost', '1088', None, temp_dir_path, temp_dir_path)

        >>> incomplete_log = {'data' : {'x' : 1, 'y' : 2},
        ...                     'raw' : 'Not all keys present'}
        >>> lc._validate_log_format(incomplete_log)
        'failed'

        >>> redundant_log = {'one_invalid_key' : 'Extra information',
        ...  'data': {'x' : 1, 'y' : 2},
        ...  'error': False,
        ...  'error_tb': '',
        ...  'event': 'event',
        ...  'file': '/path/to/file.log',
        ...  'formatter': 'logagg.formatters.mongodb',
        ...  'host': 'deepcompute-ThinkPad-E470',
        ...  'id': '0112358',
        ...  'level': 'debug',
        ...  'raw': 'some log line here',
        ...  'timestamp': '2018-04-07T14:06:17.404818',
        ...  'type': 'log'}
        >>> lc._validate_log_format(redundant_log)
        'failed'

        >>> correct_log = {'data': {'x' : 1, 'y' : 2},
        ...  'error': False,
        ...  'error_tb': '',
        ...  'event': 'event',
        ...  'file': '/path/to/file.log',
        ...  'formatter': 'logagg.formatters.mongodb',
        ...  'host': 'deepcompute-ThinkPad-E470',
        ...  'id': '0112358',
        ...  'level': 'debug',
        ...  'raw': 'some log line here',
        ...  'timestamp': '2018-04-07T14:06:17.404818',
        ...  'type': 'log'}
        >>> lc._validate_log_format(correct_log)
        'passed'

        >>> temp_dir.cleanup()
        """

        keys_in_log = set(log)
        keys_in_log_structure = set(self.LOG_STRUCTURE)

        # Check keys
        try:
            assert keys_in_log == keys_in_log_structure
        except AssertionError as e:
            self.log.warning(
                "formatted_log_structure_rejected",
                key_not_found=list(keys_in_log_structure - keys_in_log),
                extra_keys_found=list(keys_in_log - keys_in_log_structure),
                num_logs=1,
                type="metric",
            )
            return "failed"

        # Check datatype of values
        for key in log:
            try:
                assert isinstance(log[key], self.LOG_STRUCTURE[key])
            except AssertionError as e:
                self.log.warning(
                    "formatted_log_structure_rejected",
                    key_datatype_not_matched=key,
                    datatype_expected=type(self.LOG_STRUCTURE[key]),
                    datatype_got=type(log[key]),
                    num_logs=1,
                    type="metric",
                )
                return "failed"

        return "passed"

    def _full_from_frags(self, frags):
        """
        Join partial lines to full lines
        """
        full_line = "\n".join([l for l, _ in frags])
        line_info = frags[-1][-1]
        return full_line, line_info

    def _iter_logs(self, freader, fmtfn):
        """
        Iterate over log lines and assemble full lines from partial ones
        >>> import tempfile
        >>> temp_dir = tempfile.TemporaryDirectory()
        >>> temp_dir_path = temp_dir.name
        >>> trackfile = open(temp_dir_path+'/trackfiles.txt', 'w+'); trackfile.close()
        >>> lc = collector = LogCollector('localhost', '1088', None, temp_dir_path, temp_dir_path)

        >>> def fmtfn(line):
        ...     return line
        >>> def ispartial(line):
        ...     if line.startswith('--->'): return True
        ...     else: return False
        >>> fmtfn.ispartial = ispartial
        >>> log_dir = tempfile.TemporaryDirectory()

        >>> log_dir_path = log_dir.name
        >>> log_file_path = log_dir_path + '/log_file.log'
        >>> loglines = 'Traceback (most recent call last):\\n--->File "<stdin>", line 1, in <module>\\n--->NameError: name "spam" is not defined'
        >>> with open(log_file_path, 'w+') as logfile: w = logfile.write(loglines)
        >>> sample_freader = Pygtail(log_file_path)

        >>> for log in lc._iter_logs(sample_freader, fmtfn): print(log[0])
        Traceback (most recent call last):
        --->File "<stdin>", line 1, in <module>
        --->NameError: name "spam" is not define

        >>> temp_dir.cleanup()
        >>> log_dir.cleanup()
        """
        # FIXME: does not handle partial lines at the start of a file properly

        frags = []

        for line_info in freader:
            # Remove new line char at the end
            line = line_info["line"][:-1]
            if not fmtfn.ispartial(line) and frags:
                yield self._full_from_frags(frags)
                frags = []

            frags.append((line, line_info))

        if frags:
            yield self._full_from_frags(frags)

    def _assign_default_log_values(self, fpath, line, formatter):
        """
        Fills up default data into one log record
        >>> import tempfile
        >>> temp_dir = tempfile.TemporaryDirectory()
        >>> temp_dir_path = temp_dir.name
        >>> trackfile = open(temp_dir_path+'/trackfiles.txt', 'w+'); trackfile.close()
        >>> lc = collector = LogCollector('localhost', '1088', None, temp_dir_path, temp_dir_path)
        >>> from pprint import pprint

        >>> formatter = 'logagg.formatters.mongodb'
        >>> fpath = '/var/log/mongodb/mongodb.log'
        >>> line = 'some log line here'

        >>> default_log = lc._assign_default_log_values(fpath, line, formatter)
        >>> pprint(default_log) #doctest: +ELLIPSIS
        {'data': {},
         'error': False,
         'error_tb': '',
         'event': 'event',
         'file': '/var/log/mongodb/mongodb.log',
         'formatter': 'logagg.formatters.mongodb',
         'host': '...',
         'id': None,
         'level': 'debug',
         'raw': 'some log line here',
         'timestamp': '...',
         'type': 'log'}

        >>> temp_dir.cleanup()
        """
        return dict(
            id=None,
            file=fpath,
            host=self.host,
            formatter=formatter,
            event="event",
            data={},
            raw=line,
            timestamp=datetime.datetime.utcnow().isoformat(),
            type="log",
            level="debug",
            error=False,
            error_tb="",
        )

    def _delete_file(self, fpath):
        """
        Delete a log file and its Pygtail offset file from the logaggfs 'logs' directory
        >>> import tempfile
        >>> temp_dir = tempfile.TemporaryDirectory()
        >>> temp_dir_path = temp_dir.name
        >>> trackfile = open(temp_dir_path+'/trackfiles.txt', 'w+'); trackfile.close()
        >>> lc = collector = LogCollector('localhost', '1088', None, temp_dir_path, temp_dir_path)

        >>> log_file_dir = tempfile.TemporaryDirectory()
        >>> log_file_path = log_file_dir.name + '/log_file.log'
        >>> offset_file_path = log_file_path + '.offset'
        >>> log_file = open(log_file_path, 'w+'); log_file.close()
        >>> offset_file = open(offset_file_path, 'w+'); offset_file.close() 
        >>> import os
        >>> os.path.isfile(log_file_path)
        True
        >>> os.path.isfile(offset_file_path)
        True
        >>> lc._delete_file(log_file_path)
        >>> os.path.isfile(log_file_path)
        False
        >>> os.path.isfile(offset_file_path)
        False

        >>> temp_dir.cleanup()
        >>> log_file_dir.cleanup()
        """
        os.remove(fpath)
        os.remove(fpath + ".offset")

    @keeprunning(LOG_FILE_POLL_INTERVAL, on_error=utils.log_exception)
    def _collect_log_files(self, log_files):
        """
        Collect from log files in logaggfs 'logs' one by one
        """

        L = log_files
        # Sorted list of all the files for one pattern
        fpaths = glob.glob(join(self.logaggfs.logs_dir, L["fpattern"]))
        fpaths = sorted(fpaths)

        for fpath in fpaths:
            log_files.update({"fpath": fpath})
            # If last file in the list keep polling until next file arrives
            self._collect_log_lines(log_files)
            if not fpath == fpaths[-1]:
                self.log.debug("deleting_file", fpath=fpath)
                self._delete_file(fpath)
        time.sleep(1)

    def _collect_log_lines(self, log_file):
        """
        Collects logs from logfiles, formats and puts in queue
        """
        L = log_file
        fpath = L["fpath"]
        fmtfn = L["formatter_fn"]
        formatter = L["formatter"]

        freader = Pygtail(fpath)
        for line, line_info in self._iter_logs(freader, fmtfn):
            log = self._assign_default_log_values(fpath, line, formatter)

            try:
                _log = fmtfn(line)
                # Identify logs inside a log
                # Like process logs inside docker logs
                if isinstance(_log, RawLog):
                    formatter, raw_log = _log["formatter"], _log["raw"]
                    log.update(_log)
                    # Give them to actual formatters
                    _log = load_formatter_fn(formatter)(raw_log)

                log.update(_log)
            except (SystemExit, KeyboardInterrupt) as e:
                raise
            except:
                log["error"] = True
                log["error_tb"] = traceback.format_exc()
                self.log.exception("error_during_handling_log_line",
                                   log=log["raw"])

            if log["id"] == None:
                log["id"] = uuid.uuid1().hex

            log = self._remove_redundancy(log)
            if self._validate_log_format(log) == "failed":
                continue

            self.queue.put(
                dict(log=json.dumps(log), freader=freader,
                     line_info=line_info))
            self.log.debug("tally:put_into_self.queue",
                           size=self.queue.qsize())

        while not freader.is_fully_acknowledged():
            t = self.PYGTAIL_ACK_WAIT_TIME
            self.log.debug("waiting_for_pygtail_to_fully_ack", wait_time=t)
            time.sleep(t)

    def _get_msgs_from_queue(self, msgs, timeout):
        msgs_pending = []
        read_from_q = False
        ts = time.time()

        msgs_nbytes = sum(len(m["log"]) for m in msgs)

        while 1:
            try:
                msg = self.queue.get(block=True,
                                     timeout=self.QUEUE_READ_TIMEOUT)
                read_from_q = True
                self.log.debug("tally:get_from_self.queue")

                _msgs_nbytes = msgs_nbytes + len(msg["log"])
                _msgs_nbytes += 1  # for newline char

                if _msgs_nbytes > self.MAX_NBYTES_TO_SEND:
                    msgs_pending.append(msg)
                    self.log.debug("msg_bytes_read_mem_queue_exceeded")
                    break

                msgs.append(msg)
                msgs_nbytes = _msgs_nbytes

                # FIXME condition never met
                if time.time() - ts >= timeout and msgs:
                    self.log.debug(
                        "msg_reading_timeout_from_mem_queue_got_exceeded")
                    break
                    # TODO: What if a single log message itself is bigger than max bytes limit?

            except queue.Empty:
                self.log.debug("queue_empty")
                time.sleep(self.QUEUE_READ_TIMEOUT)
                if not msgs:
                    continue
                else:
                    return msgs_pending, msgs_nbytes, read_from_q

        self.log.debug("got_msgs_from_mem_queue")
        return msgs_pending, msgs_nbytes, read_from_q

    @keeprunning(0, on_error=utils.log_exception
                 )  # FIXME: what wait time var here?
    def _send_to_nsq(self, state):
        msgs = []
        should_push = False

        while not should_push:
            cur_ts = time.time()
            self.log.debug("should_push", should_push=should_push)
            time_since_last_push = cur_ts - state.last_push_ts

            msgs_pending, msgs_nbytes, read_from_q = self._get_msgs_from_queue(
                msgs, self.MAX_SECONDS_TO_PUSH)

            have_enough_msgs = msgs_nbytes >= self.MIN_NBYTES_TO_SEND
            is_max_time_elapsed = time_since_last_push >= self.MAX_SECONDS_TO_PUSH

            should_push = len(msgs) > 0 and (is_max_time_elapsed
                                             or have_enough_msgs)
            self.log.debug(
                "deciding_to_push",
                should_push=should_push,
                time_since_last_push=time_since_last_push,
                msgs_nbytes=msgs_nbytes,
            )

        try:
            if isinstance(self.nsq_sender_logs, type(utils.DUMMY)):
                for m in msgs:
                    self.log.info("final_log_format", log=m["log"])
            else:
                self.log.debug("trying_to_push_to_nsq", msgs_length=len(msgs))
                self.nsq_sender_logs.handle_logs(msgs)
                self.log.debug("pushed_to_nsq", msgs_length=len(msgs))
            self._confirm_success(msgs)
            msgs = msgs_pending
            state.last_push_ts = time.time()
        except (SystemExit, KeyboardInterrupt):
            raise
        finally:
            if read_from_q:
                self.queue.task_done()

    def _confirm_success(self, msgs):
        ack_fnames = set()

        for msg in reversed(msgs):
            freader = msg["freader"]
            fname = freader.filename

            if fname in ack_fnames:
                continue

            ack_fnames.add(fname)
            freader.update_offset_file(msg["line_info"])

    def _compute_md5_fpatterns(self, fpath):
        """
        For a filepath in logaggfs logs directory compute 'md5*.log' pattern
        """
        fpath = fpath.encode("utf-8")
        d = self.logaggfs.logs_dir
        dir_contents = [
            f for f in os.listdir(d) if
            bool(self.LOGAGGFS_FPATH_PATTERN.match(f)) and isfile(join(d, f))
        ]
        dir_contents = set(dir_contents)
        for c in dir_contents:
            if md5(fpath).hexdigest() == c.split(".")[0]:
                return md5(fpath).hexdigest() + "*" + ".log"

    @keeprunning(SCAN_FPATTERNS_INTERVAL, on_error=utils.log_exception)
    def _scan_fpatterns(self, state):
        """
        For a list of given fpatterns or a logaggfs directory,
        this starts a thread collecting log lines from file

        >>> os.path.isfile = lambda path: path == '/path/to/log_file.log'
        >>> lc = LogCollector('file=/path/to/log_file.log:formatter=logagg.formatters.basescript', 30)

        >>> print(lc.fpaths)
        file=/path/to/log_file.log:formatter=logagg.formatters.basescript

        >>> print('formatters loaded:', lc.formatters)
        {}
        >>> print('log file reader threads started:', lc.log_reader_threads)
        {}
        >>> state = AttrDict(files_tracked=list())
        >>> print('files being tracked:', state.files_tracked)
        []


        >>> if not state.files_tracked:
        >>>     lc._scan_fpatterns(state)
        >>>     print('formatters loaded:', lc.formatters)
        >>>     print('log file reader threads started:', lc.log_reader_threads)
        >>>     print('files being tracked:', state.files_tracked)
        """
        for f in self.state["fpaths"]:

            # For supporting file patterns rather than file paths
            for fpath in glob.glob(f["fpath"]):

                # Compute 'md5(filename)*.log' fpattern for fpath
                fpattern, formatter = self._compute_md5_fpatterns(
                    fpath), f["formatter"]
                # When no md5 pattern filenames are found for the fpath in logaggfs logs directory
                if fpattern is None:
                    continue
                self.log.debug("_scan_fpatterns",
                               fpattern=fpattern,
                               formatter=formatter)
                try:
                    formatter_fn = self.formatters.get(
                        formatter, load_formatter_fn(formatter))
                    self.log.debug("found_formatter_fn", fn=formatter)
                    self.formatters[formatter] = formatter_fn
                except (SystemExit, KeyboardInterrupt):
                    raise
                except (ImportError, AttributeError):
                    self.log.exception("formatter_fn_not_found", fn=formatter)
                    sys.exit(-1)
                # Start a thread for every filepattern
                log_f = dict(fpattern=fpattern,
                             formatter=formatter,
                             formatter_fn=formatter_fn)
                log_key = (f["fpath"], fpattern, formatter)
                if log_key not in self.log_reader_threads:
                    self.log.info("starting_collect_log_files_thread",
                                  log_key=log_key)
                    # There is no existing thread tracking this log file, start one.
                    log_reader_thread = utils.start_daemon_thread(
                        self._collect_log_files, (log_f, ))
                    self.log_reader_threads[log_key] = log_reader_thread

        time.sleep(self.SCAN_FPATTERNS_INTERVAL)

    @keeprunning(HEARTBEAT_RESTART_INTERVAL, on_error=utils.log_exception)
    def _send_heartbeat(self, state):

        # Sends continuous heartbeats to a separate topic in nsq
        if self.log_reader_threads:
            files_tracked = list(self.log_reader_threads.keys())
        else:
            files_tracked = ""

        heartbeat_payload = {
            "namespace": self.NAMESPACE,
            "host": self.host,
            "port": self.port,
            "topic_name": self.master.topic_name,
            "files_tracked": files_tracked,
            "heartbeat_number": state.heartbeat_number,
            "timestamp": time.time(),
        }
        self.nsq_sender_heartbeat.handle_heartbeat(heartbeat_payload)
        state.heartbeat_number += 1
        time.sleep(self.HEARTBEAT_RESTART_INTERVAL)

    def collect(self):

        # start tracking files and put formatted log lines into queue
        state = AttrDict(files_tracked=list())
        utils.start_daemon_thread(self._scan_fpatterns, (state, ))

        # start extracting formatted logs from queue and send to nsq
        state = AttrDict(last_push_ts=time.time())
        utils.start_daemon_thread(self._send_to_nsq, (state, ))

        # start sending heartbeat to "Hearbeat" topic
        state = AttrDict(heartbeat_number=0)
        self.log.info("init_heartbeat")
        th_heartbeat = utils.start_daemon_thread(self._send_heartbeat,
                                                 (state, ))

    def _fpath_in_trackfiles(self, fpath):
        """
        Check whether fpath is present in logaggfs trackfiles.txt
        """

        # List of files in trackfiles.txt
        with open(self.logaggfs.trackfiles, "r") as f:
            tf = f.readlines()

        for path in tf:
            if path[:-1] == fpath:
                return True
        return False

    def add_to_logaggfs_trackfile(self, fpath):
        """
        Given an fpath, add it to logaggfs trackfiles.txt by writing a temp file and moving it into place
        """
        fd, tmpfile = tempfile.mkstemp()
        os.close(fd)

        with open(self.logaggfs.trackfiles, "r") as f:
            old = f.read()
            new = fpath
            # Write previous files and add the new file
            if not self._fpath_in_trackfiles(new):
                with open(tmpfile, "w") as t:
                    t.write((old + new + "\n"))
                shutil.move(tmpfile, self.logaggfs.trackfiles)

    def remove_from_logaggfs_trackfile(self, fpath):
        """
        Given a fpath remove it from logaggfs trackfiles.txt via moving
        """
        fd, tmpfile = tempfile.mkstemp()

        with open(self.logaggfs.trackfiles, "r") as f:
            paths = [line[:-1] for line in f.readlines()]

        for p in paths:
            if p == fpath:
                pass
            else:
                with open(tmpfile, "w+") as t:
                    t.write((p + "\n"))

        shutil.move(tmpfile, self.logaggfs.trackfiles)
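
The batching decision inside _send_to_nsq above boils down to a small predicate: push once the batch is non-empty and either enough bytes have accumulated or enough time has passed since the last push. The following standalone sketch restates that policy only; it is not part of the library, and the threshold values simply mirror the class constants above:

# Hypothetical restatement of the push policy used by _send_to_nsq above.
import time

MIN_NBYTES_TO_SEND = 512 * 1024   # same value as the class constant
MAX_SECONDS_TO_PUSH = 1           # same value as the class constant

def should_push(num_msgs, msgs_nbytes, last_push_ts, now=None):
    now = time.time() if now is None else now
    have_enough_msgs = msgs_nbytes >= MIN_NBYTES_TO_SEND
    is_max_time_elapsed = (now - last_push_ts) >= MAX_SECONDS_TO_PUSH
    return num_msgs > 0 and (have_enough_msgs or is_max_time_elapsed)

# e.g. should_push(10, 600 * 1024, time.time()) -> True (byte threshold already reached)
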
Example #24
0
    def __init__(self):
        self.data_path = ensure_dir(expanduser('~/.logagg'))
        self.state = DiskDict(self.data_path)
        self._init_state()
Example #25
0
class LogaggCli():
    '''
    Command line interface for logagg
    '''

    MASTER_PING_URL = 'http://{host}:{port}/logagg/v1/ping?key={key}&secret={secret}'
    MASTER_ADD_NSQ_URL = 'http://{host}:{port}/logagg/v1/add_nsq?nsqd_tcp_address={nsqd_tcp_address}&nsqd_http_address={nsqd_http_address}&key={key}&secret={secret}'
    MASTER_GET_NSQ_URL = 'http://{host}:{port}/logagg/v1/get_nsq?key={key}&secret={secret}'
    GET_TOPIC_URL = 'http://{host}:{port}/logagg/v1/get_topics'
    GET_TOPIC_INFO_URL = 'http://{host}:{port}/logagg/v1/get_topic_info?topic_name={topic_name}'
    GET_COMPONENT_URL = 'http://{host}:{port}/logagg/v1/get_components?topic_name={topic_name}'
    TAIL_LOGS_URL = 'http://{host}:{port}/logagg/v1/tail_logs?topic_name={topic_name}'
    COLLECTOR_ADD_FILE_URL = 'http://{host}:{port}/logagg/v1/collector_add_file?topic_name={topic_name}&collector_host={collector_host}&collector_port={collector_port}&fpath="{fpath}"&formatter="{formatter}"'
    COLLECTOR_REMOVE_FILE_URL = 'http://{host}:{port}/logagg/v1/collector_remove_file?topic_name={topic_name}&collector_host={collector_host}&collector_port={collector_port}&fpath="{fpath}"'

    def __init__(self):
        self.data_path = ensure_dir(expanduser('~/.logagg'))
        self.state = DiskDict(self.data_path)
        self._init_state()

    def _init_state(self):
        '''
        Initialize default values for stored state
        '''
        if not self.state['master']:
            self.state['master'] = dict()
            self.state.flush()
        if not self.state['default_topic']:
            self.state['default_topic'] = dict()
            self.state.flush()

    def ensure_master(self):
        '''
        Check if Master details are present
        '''
        if not self.state['master']:
            err_msg = 'No master details stored locally'
            prRed(err_msg)
            sys.exit(1)
        else:
            return AttrDict(self.state['master'])

    def request_master_url(self, url):
        '''
        Request a master URL and return the parsed JSON response
        '''
        try:
            response = requests.get(url)
            response = json.loads(response.content.decode('utf-8'))
            return response

        except requests.exceptions.ConnectionError:
            err_msg = 'Could not reach master, url: {}'.format(url)
            prRed(err_msg)
            sys.exit(1)

    def clear(self):
        '''
        Delete all saved data
        '''
        self.state['master'] = dict()
        self.state['default_topic'] = dict()
        self.state.flush()

    def store_master(self, host, port, auth):
        '''
        Add master details to the state file
        '''
        ping_url = self.MASTER_PING_URL.format(host=host,
                                               port=port,
                                               key=auth.key,
                                               secret=auth.secret)
        ping_result = self.request_master_url(ping_url)

        success = ping_result.get('result', {}).get('success', {})
        details = ping_result.get('result', {}).get('details', {})

        if success:
            if details == 'Authentication passed':
                master_details = {
                    'host': host,
                    'port': port,
                    'key': auth.key,
                    'secret': auth.secret,
                    'admin': True
                }
                self.state['master'] = master_details
                self.state.flush()
                prGreen('Added master with admin permission')
            elif details == 'Authentication failed' and not auth.key and not auth.secret:
                master_details = {
                    'host': host,
                    'port': port,
                    'key': auth.key,
                    'secret': auth.secret,
                    'admin': False
                }
                self.state['master'] = master_details
                self.state.flush()
                prYellow('Added master with non-admin permission')
            else:
                prRed(details)
                sys.exit(1)
        else:
            # Ping did not succeed at all (e.g. malformed response from master)
            prRed(details or 'Could not ping master')
            sys.exit(1)

    def list_master(self):
        '''
        Show Master details
        '''
        master = self.ensure_master()
        headers = ['HOST', 'PORT', 'ADMIN']

        data = [[master.host, master.port, str(master.admin)]]
        print(tabulate(data, headers=headers))

    def add_nsq(self, nsqd_tcp_address, nsqd_http_address):
        '''
        Add nsq details to master
        '''
        master = self.ensure_master()

        if not master.admin:
            err_msg = 'Requires admin permissions to master'
            prRed(err_msg)
            sys.exit(1)

        add_nsq_url = self.MASTER_ADD_NSQ_URL.format(
            host=master.host,
            port=master.port,
            nsqd_tcp_address=nsqd_tcp_address,
            nsqd_http_address=nsqd_http_address,
            key=master.key,
            secret=master.secret)

        add_nsq_result = self.request_master_url(add_nsq_url)

        if add_nsq_result.get('result', {}).get('success', {}):
            prGreen(add_nsq_result.get('result', {}).get('details', {}))
        else:
            err_msg = add_nsq_result.get('result', {}).get('details', {})
            prRed(err_msg)
            sys.exit(1)

    def list_nsq(self):
        '''
        List nsq details of master
        '''
        master = self.ensure_master()

        if not master.admin:
            err_msg = 'Requires admin permissions to master'
            prRed(err_msg)
            sys.exit(1)

        get_nsq_url = self.MASTER_GET_NSQ_URL.format(host=master.host,
                                                     port=master.port,
                                                     key=master.key,
                                                     secret=master.secret)

        get_nsq_result = self.request_master_url(get_nsq_url)

        if get_nsq_result.get('result', {}).get('success', {}):
            nsq_details = get_nsq_result.get('result', {}).get('nsq_list', {})
            headers = [
                'Nsqd TCP address', 'Nsqd HTTP address', 'Nsq depth limit',
                'Nsq API address'
            ]
            data = list()
            for nsq in nsq_details:
                data.append(list(nsq.values()))
            print(tabulate(data, headers=headers))
        else:
            err_msg = get_nsq_result.get('result', {}).get('details', {})
            prRed(err_msg)
            sys.exit(1)

    def list_topic(self):
        '''
        List all the topics in master
        '''
        master = self.ensure_master()

        # Get list of all topics from master
        list_topic_url = self.GET_TOPIC_URL.format(host=master.host,
                                                   port=master.port)
        list_topic_result = self.request_master_url(list_topic_url)
        topic_list = list_topic_result.get('result', [])

        master_admin = self.state['master'].get('admin')

        for topic in topic_list:
            if topic['topic_name'] == self.state['default_topic'].get(
                    'topic_name'):
                topic['default_topic'] = True
            else:
                topic['default_topic'] = False
            if not master_admin:
                topic.pop('nsqd_tcp_address')
                topic.pop('nsqd_http_address')
                topic.pop('nsq_depth_limit')
                topic.pop('nsq_api_address')
                topic.pop('heartbeat_topic')
                topic.pop('logs_topic')

        headers = list()

        if not master_admin:
            headers = ['Topic-name', 'Default topic']
        else:
            headers = [
                'Topic-name', 'Nsqd TCP address', 'Nsqd HTTP address',
                'NSQ max depth', 'Nsq API address', 'Heartbeat topic',
                'Logs topic', 'Default topic'
            ]

        data = list()
        for c in topic_list:
            data.append(list(c.values()))
        print(tabulate(data, headers=headers))

    def ensure_topic_info(self, topic_name):
        '''
        Ensure topic info is saved locally
        '''
        master = self.ensure_master()

        # Get list of all topics from master
        list_topic_url = self.GET_TOPIC_URL.format(host=master.host,
                                                   port=master.port)
        list_topic_result = self.request_master_url(list_topic_url)
        topic_list = list_topic_result.get('result', [])

        for topic in topic_list:
            if topic['topic_name'] == topic_name:
                return topic
        err_msg = 'No topic found, topic-name: {topic_name}'.format(
            topic_name=topic_name)
        prRed(err_msg)
        sys.exit(1)

    def use_topic(self, topic_name):
        '''
        Make a topic usable by default
        '''
        topic = self.ensure_topic_info(topic_name)

        self.state['default_topic'] = topic
        self.state.flush()
        prGreen('Switched to default: {}'.format(topic_name))

    def list_collectors(self):
        '''
        List collectors in an existing topic
        '''
        master = self.ensure_master()

        if not self.state['default_topic']:
            err_msg = 'No default topic'
            prRed(err_msg)
            sys.exit(1)
        else:
            topic_name = self.state['default_topic']['topic_name']

            get_components_url = self.GET_COMPONENT_URL.format(
                host=master.host, port=master.port, topic_name=topic_name)

            get_components_result = self.request_master_url(get_components_url)

            if get_components_result.get('result', {}).get('success', {}):
                components_info = get_components_result.get(
                    'result', {}).get('components_info')

                headers = [
                    'Namespace',
                    'Host',
                    'Port',
                    'Topic name',
                    'Files tracked',
                    'Heartbeat number',
                    'Timestamp',
                ]

                data = list()
                for c in components_info:
                    if c.get('namespace') == 'collector':
                        data.append([
                            c.get('namespace'),
                            c.get('host'),
                            c.get('port'),
                            c.get('topic_name'),
                            c.get('files_tracked'),
                            c.get('heartbeat_number'),
                            c.get('timestamp')
                        ])
                print(tabulate(data, headers=headers))

            else:
                # Print result
                msg = get_components_result.get('result',
                                                {}).get('details', {})
                prRed(msg)
                sys.exit(1)

    def tail(self, pretty):
        '''
        Tail the logs of a topic
        '''
        master = self.ensure_master()
        if not self.state['default_topic']:
            err_msg = 'No default topic'
            prRed(err_msg)
            sys.exit(1)
        else:
            topic_name = self.state['default_topic']['topic_name']

            tail_logs_url = self.TAIL_LOGS_URL.format(host=master.host,
                                                      port=master.port,
                                                      topic_name=topic_name)

            resp = None
            try:
                session = requests.session()
                resp = session.get(tail_logs_url, stream=True)
                c = ConsoleRenderer()
                for line in resp.iter_lines():
                    log = dict()
                    try:
                        result = json.loads(line.decode('utf-8')).get('result')
                        if result:
                            log = json.loads(result)
                        else:
                            continue
                    except ValueError:
                        prRed('Could not parse log line: {}'.format(line))
                        continue
                    if pretty:
                        print(c(None, None, log))
                    else:
                        print(log)
            except requests.exceptions.ConnectionError:
                err_msg = 'Cannot request master'
                prRed(err_msg)
                sys.exit(1)
            except Exception:
                # Close the streaming response (if it was opened) before re-raising
                if resp:
                    resp.close()
                raise

    def collector_add_file(self, collector_host, collector_port, fpath,
                           formatter):
        '''
        Add file to collector
        '''
        master = self.ensure_master()

        if not self.state['default_topic']:
            err_msg = 'No default topic'
            prRed(err_msg)
            sys.exit(1)
        else:
            topic_name = self.state['default_topic']['topic_name']

            add_file_url = self.COLLECTOR_ADD_FILE_URL.format(
                host=master.host,
                port=master.port,
                topic_name=topic_name,
                collector_host=collector_host,
                collector_port=collector_port,
                fpath=fpath,
                formatter=formatter)

            add_file_result = self.request_master_url(add_file_url)

            if add_file_result.get('result', {}).get('success', {}):
                new_fpaths_list = list()
                for f in add_file_result.get('result', {})['fpaths']:
                    new_fpaths_list.append([f['fpath']])
                headers = ['File paths']
                # Print result
                print(tabulate(new_fpaths_list, headers=headers))

            else:
                # Print result
                msg = add_file_result.get('result', {}).get('details', {})
                prRed(msg)
                sys.exit(1)

    def collector_remove_file(self, collector_host, collector_port, fpath):
        '''
        Remove file-path from collector
        '''
        master = self.ensure_master()

        if not self.state['default_topic']:
            err_msg = 'No default topic'
            prRed(err_msg)
            sys.exit(1)
        else:
            topic_name = self.state['default_topic']['topic_name']

            remove_file_url = self.COLLECTOR_REMOVE_FILE_URL.format(
                host=master.host,
                port=master.port,
                topic_name=topic_name,
                collector_host=collector_host,
                collector_port=collector_port,
                fpath=fpath)

            remove_file_result = self.request_master_url(remove_file_url)

            if remove_file_result.get('result', {}).get('success', {}):
                new_fpaths_list = list()
                for f in remove_file_result.get('result', {})['fpaths']:
                    new_fpaths_list.append([f['fpath']])
                headers = ['File paths']
                # Print result
                print(tabulate(new_fpaths_list, headers=headers))

            else:
                # Print result
                msg = remove_file_result.get('result', {}).get('details', {})
                prRed(msg)
                sys.exit(1)
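
    # Hedged usage sketch of the flow above (all hosts, ports, topic names,
    # paths and the formatter dotted-path are placeholders; the auth argument
    # is assumed to be any object exposing .key and .secret, as store_master
    # expects). Kept as comments so it stays inert inside the class body:
    #
    #   from collections import namedtuple
    #   Auth = namedtuple('Auth', ['key', 'secret'])  # stand-in auth object
    #
    #   cli = LogaggCli()
    #   cli.store_master('master.example.com', '1088', Auth(key='', secret=''))
    #   cli.use_topic('my-logs-topic')
    #   cli.collector_add_file('collector.example.com', '1099',
    #                          '/var/log/nginx/access.log',
    #                          'logagg.formatters.nginx_access')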