Example No. 1
    def __call__(self, weights, update, gradients, staleness, epoch):
        """
        :param weights: Copy of the model weights
        :param update: Current update
        :param gradients: List of gradients
        :param staleness: Staleness of each gradient
        :param epoch: Current epoch
        """

        lrr = -self.learning_rate_func(epoch, update) * len(gradients)
        logger.debug("the learning rate is:", lrr)

        start_time = time.time()
        if not self.staleness_aware:
            gradient = gradients
        else:
            gradient = [np.divide(g, s) for g, s in zip(gradients, staleness)]
        grad = np.mean(gradient, axis=0)
        for i, g in enumerate(grad):
            if self.momentum != 0:
                buf = self.buf[i]
                # buf = momentum * buf + g
                np.add(np.multiply(self.momentum, buf, out=buf), g, out=buf)
                if self.nesterov:
                    np.add(np.multiply(self.momentum, buf), g, out=g)
                else:
                    g = buf
            # apply the (negative) learning rate; moved out of the momentum branch so
            # the weights are also updated when momentum == 0
            np.add(self.weight[i], np.multiply(lrr, g), out=self.weight[i])

        end_time = time.time()
        if self.flags.time_program:
            self.times.append(end_time - start_time)
        return self.weight
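
The staleness-aware branch above divides each incoming gradient by its staleness before averaging. A minimal, self-contained sketch of that step (the shapes and staleness values are made up, not taken from the original project):

import numpy as np

gradients = [np.ones((2, 3)), np.full((2, 3), 4.0)]  # hypothetical gradients from two learners
staleness = [1, 4]                                   # hypothetical staleness of each gradient

scaled = [np.divide(g, s) for g, s in zip(gradients, staleness)]  # down-weight stale gradients
grad = np.mean(scaled, axis=0)                                    # averaged update direction
print(grad)  # every entry is 1.0 in this toy case
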
Example No. 2
def create_weights(flags, agent_function, test_data, test_labels, batch_size, pipe):
    """
    Initializes the weights of the model
    :param flags: flags set by the user
    :param agent_function: Implementation of the abstract AgentFunction class
    :param test_data: data of the test set
    :param test_labels: labels of the test set
    :param batch_size: batch size to be used
    :param pipe: pipe used to send the weights back
    """
    if flags.load_save:
        logger.state("Loading saved weights at:", flags.saved_weights)
        with open(flags.saved_weights, "rb") as f:
            ret = pickle.load(f)
        weights = ret[0][0]
    else:
        logger.debug("Before calling the agent_function")
        af = agent_function(flags, test_data, test_labels, batch_size, 0, 1, True, None, 1)
        logger.debug("After calling the agent_function")
        weights = af.get_weight()
        af.close()
    if flags.eamsgd:
        weights.append(np.array([-0.05], np.float32))  # initial learning rate for eamsgd

    pipe.send(weights)
    pipe.close()
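
create_weights is meant to run in a child process and hand the initial weights back over a Pipe. A minimal sketch of that pattern (hypothetical weight shapes, not the project's real model):

import numpy as np
from multiprocessing import Pipe, Process

def build_weights(pipe):
    weights = [np.zeros((3, 3), np.float32), np.zeros(3, np.float32)]  # stand-in weights
    pipe.send(weights)
    pipe.close()

if __name__ == "__main__":
    parent_end, child_end = Pipe()
    p = Process(target=build_weights, args=(child_end,))
    p.daemon = True
    p.start()
    child_end.close()              # the parent only keeps its own end
    weights = parent_end.recv()    # blocks until the child has sent the weights
    p.join()
    parent_end.close()
    print([w.shape for w in weights])
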
Example No. 3
    def __call__(self, weights, update, gradients, staleness, epoch):
        """
        :param weights: Copy of the model weights
        :param update: Current update
        :param gradients: List of gradients
        :param staleness: Staleness of each gradient
        :param epoch: Current epoch
        """
        lrr = self.learning_rate_func(epoch, update) * len(gradients)
        #print(lrr, flush=True)

        start_time = time.time()
        if not self.staleness_aware:
            gradient = gradients
        else:
            logger.debug("optimizer staleness list", staleness)
            for i, k in enumerate(staleness):
                if k < self.agents:
                    staleness[i] = self.agents
            gradient = [np.divide(g, s) for g, s in zip(gradients, staleness)]
        grad = np.mean(gradient, axis=0)

        for l, w in enumerate(weights):
            np.subtract(w, lrr * grad[l], out=w)

    # print("release lock opt", flush=True)
        end_time = time.time()
        if self.flags.time_program:
            self.times.append(end_time - start_time)
        return weights
Example No. 4
 def learn(self, result, delta, learning_rate):
     super(BiasedNeuralLayer, self).learn(result, delta, learning_rate)
     Logger.debug("delta: " + str(delta))
     Logger.debug("biases: " + str(self.biases))
     tmp = -(learning_rate * delta)
     self.biases = tmp + np.atleast_2d(self.biases)
     self.biases[self.biases < 0] = 0
Example No. 5
    def __call__(self, epoch, update):
        count = 0
        ii = 1
        gradients_list = []
        metrics_list = []
        from_list = []
        step_list = []
        global_update_list = []


        if self.mode == 1:
            agnt_nr = self.holdback.pop()
            logger.state("poped", agnt_nr, flush=True)
            if len(self.holdback) == 0:
                self.mode = 0
            return None, [agnt_nr], None, None, None, 0, 1

        while True:
            i, p = next(self.gen)
            if p.poll():
                grads = []
                for i, fs in enumerate(self.float_sizes):
                    w = p.recv_bytes(fs * 4)
                    grads.append(np.ndarray(self.shapes[i], np.float32, w))

                last_update, step, agnt_nr, metrics = p.recv()

                sending = 2
                if self.mode == 0:
                    if self.updates_per_learner[agnt_nr-1] >= self.min + self.max_difference:
                        self.holdback.append(agnt_nr)
                        sending = 0
                        logger.debug("holding back", agnt_nr, flush=True)
                    else:
                        self.updates_per_learner[agnt_nr - 1] += 1
                        new_min = np.min(self.updates_per_learner)
                        if new_min != self.min:
                            self.min = new_min
                            if len(self.holdback) != 0:
                                self.mode = 1
                        logger.debug("reg send", agnt_nr, flush=True)
                    count += 1
                    gradients_list.append(grads)
                    metrics_list.append(metrics)
                    from_list.append(agnt_nr)
                    global_update_list.append(last_update)
                    step_list.append(1)
                else:
                    ii += 1
                if ii % self.learners == 0:
                    time.sleep(0.0001)

                if count == self.num:
                    if self.timing:
                        self.counter += 1
                    return gradients_list, from_list, global_update_list, step_list, metrics_list, 0, sending
Example No. 6
 def get_delta(self, out_data, last_delta, last_weights):
     """calculate delta for layer before"""
     #Logger.DEBUG = True
     Logger.debug("Get delta: ")
     Logger.debug("out: " + str(np.shape(out_data)))
     Logger.debug("last_delta: " + str(np.shape(last_delta)))
     Logger.debug("last_weights: " + str(np.shape(last_weights)))
     dot = np.dot(last_weights.T, last_delta)
     Logger.debug("dot shape: " + str(np.shape(dot)))
     delta = dot * self.activation_deriv(out_data)
     #Logger.debug("delta shape: " + str(np.shape(delta)))
     #Logger.DEBUG = False
     return delta
Example No. 7
    def get_weight(self):
        """
        :return: the current weight of the agent
        """
        weights = []
        counter = 0
        for p in self.resnet.parameters():
            weights.append(p.cpu().detach().numpy())
            counter += 1

        logger.debug("counter:", counter)
        return weights
Example No. 8
 def __init__(self, in_size, out_size, activation_fn=None, activation_fn_deriv=None):
     Logger.debug("create Neural Layer: \n" + \
                       "  in_size: " + str(in_size) + "\n" + \
                       "  out_size: " + str(out_size))
     # initialize weight matrix
     self.weights = np.random.uniform(-0.001, 0.001, (out_size, in_size)).astype(np.float64)
     # set biases if not already set (by child class for example)
     if not hasattr(self, 'biases'):
         self.biases = np.zeros(out_size).astype(np.float64)
     # set activation function and derivative
     self.activation = activation_fn or NeuralLayer.activation_linear
     self.activation_deriv = activation_fn_deriv or NeuralLayer.activation_linear_deriv
     # set size and input size
     self.size = out_size
     self.in_size = in_size
Example No. 9
 def __del__(self):
     logger.debug("del PS")
     if not self.orderly:
         for p in self.shards_list:
             if p.is_alive():
                 try:
                     p.kill()
                 except Exception:
                     raise
         for p in self.threads:
             if p.is_alive():
                 try:
                     p.kill()
                 except Exception:
                     raise
Example No. 10
    def test_evaluation(self):
        """
            Does the evaluation of the test set
        """
        logger.debug("starting evaluation")

        weights = self.test_weights_unified
        other_vars = self.test_averages_unified
        list1 = []
        for i in range(0, self.evaluation_steps):
            res = self.eval_obj.evaluate(weights, other_vars)
            list1.append(res)
        tmp1 = np.mean(list1, axis=0)

        return tmp1
Example No. 11
    def elastic(self, bins_updates):
        # based on https://arxiv.org/pdf/1412.6651.pdf
        logger.debug("elastic", bins_updates, flush=True)
        weights = self.flags.bin_weights
        bins_updates = [u * v for (u, v) in zip(bins_updates, weights)]
        total = np.sum(bins_updates)
        bins_updates = [i / total for i in bins_updates]
        logger.debug("elastic", bins_updates, flush=True)

        new_weight = [
            np.multiply(self.weights[i], m) for i, m in enumerate(bins_updates)
        ]
        new_weight = np.sum(new_weight, axis=0)
        for i, w in enumerate(new_weight):
            for wb in self.weights:
                wb[i][:] = w[:]
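
The method above blends the per-bin weight copies into one convex combination and writes it back into every copy. A hypothetical sketch of that blending step with one tiny array per bin (the bin weights and update counts are made up):

import numpy as np

bin_weights = [1.0, 0.5, 0.25]                       # assumed flags.bin_weights
bins_updates = [10, 4, 2]                            # how many updates landed in each bin
weight_copies = [np.full((2, 2), float(k)) for k in range(3)]  # one stand-in weight per bin

mix = [u * v for u, v in zip(bins_updates, bin_weights)]
total = float(np.sum(mix))
mix = [m / total for m in mix]                       # normalized mixing coefficients

blended = np.sum([np.multiply(weight_copies[i], m) for i, m in enumerate(mix)], axis=0)
for wb in weight_copies:
    wb[:] = blended[:]                               # every copy receives the blend
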
Example No. 12
    def __call__(self, epoch, update):
        """
            Implementation with the learners waiting n/learners others to arrive
        """
        ii = 1
        count = 0
        gradients_list = []
        metrics_list = []
        from_list = []
        step_list = []
        global_update_list = []
        while True:
            i, p = next(self.gen)
            if p.poll():
                grads = []
                for i, fs in enumerate(self.float_sizes):
                    w = p.recv_bytes(fs * 4)
                    grads.append(np.ndarray(self.shapes[i], np.float32, w))

                last_update, step, agnt_nr, metrics = p.recv()

                count += 1

                gradients_list.append(grads)
                metrics_list.append(metrics)
                from_list.append(agnt_nr)
                global_update_list.append(last_update)
                step_list.append(1)
                staleness = update - last_update
            else:
                ii += 1
            if ii % self.learners == 0:
                time.sleep(0.0001)
            if count == self.num:
                binning = 0
                for i in self.bins:
                    if staleness >= i:
                        binning += 1
                    else:
                        break
                self.bin_counts[binning] += 1
                logger.debug("staleness", staleness, "put in bin", binning, flush=True)
                return gradients_list, from_list, global_update_list, step_list, metrics_list, binning, 2
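
Gradients arrive over the pipes as raw float32 bytes and are rebuilt with np.ndarray(shape, np.float32, buffer). A stand-alone sketch of that byte-level round trip (assumed shape, single in-process pipe):

import numpy as np
from multiprocessing import Pipe

shape = (2, 3)                        # assumed gradient shape
float_size = int(np.prod(shape))      # number of float32 values per gradient

sender, receiver = Pipe()
grad = np.arange(6, dtype=np.float32).reshape(shape)
sender.send_bytes(grad.tobytes())               # raw buffer, no pickling of the array
buf = receiver.recv_bytes(float_size * 4)       # 4 bytes per float32
rebuilt = np.ndarray(shape, np.float32, buf)    # view over the received bytes
assert np.array_equal(grad, rebuilt)
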
Example No. 13
class FatDetector(object):
    def __init__(self):
        self.model = resnet50(num_classes=2, pretrained=False)
        self.model.cuda().eval()
        self.model.load_state_dict(torch.load('ckpt/fat/fat.pth'))
        self.image_size = [(672, 224), (448, 224), (224, 224)]
        self.logger = Logger()
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])
        self.transform = transforms.Compose([
            transforms.ScalePad(self.image_size),
            transforms.ToTensor(), normalize
        ])

    def preprocessing(self, image):
        return Variable(self.transform(image).unsqueeze(0),
                        volatile=True).cuda()

        # NOTE: the code below is unreachable (the method returns above); it duplicates
        # the normalization that self.transform already performs.
        image = image.resize(self.image_size)
        image = np.asarray(image, dtype=np.uint8)
        image = torch.ByteTensor(image)
        image = image.float().div(255)
        image[:, :, 0] = (image[:, :, 0] - 0.485) / 0.229
        image[:, :, 1] = (image[:, :, 1] - 0.456) / 0.224
        image[:, :, 2] = (image[:, :, 2] - 0.406) / 0.225
        image = image.unsqueeze(0).permute(0, 3, 1, 2).contiguous()
        return Variable(image, volatile=True).cuda()

    def detect(self, image):
        start = time.time()
        inputs = self.preprocessing(image)
        self.logger.debug('preprocessing: %s', time.time() - start)
        start = time.time()
        #bs, ncrops, c, h, w = inputs.size()
        #logits = self.model(inputs.view(-1, c, h, w))
        #logits = logits.view(bs, ncrops, -1).mean(dim=1)
        logits = self.model(inputs)
        self.logger.debug('forward: %s', time.time() - start)
        preds = F.softmax(logits)
        return preds.data.cpu().numpy()[0][1]
Example No. 14
def _get_browse_tag(url):

    # links to the next sub-category pages
    browse_url_list = []
    # links that search job postings for a specific category
    search_url_list = []

    soup = BeautifulSoup(_get_html(url), 'html.parser')
    main_content = soup.find("table", {"id": "main_content"})
    if main_content is None:
        main_content = soup.find("table", {"id": "browsejobs_main_content"})

    for a_tag in main_content.find_all("a"):
        # print(a_tag)
        if "browsejobs" in a_tag["href"]:
            browse_url_list.append(a_tag["href"])
            Logger.debug("browsejobs = " + a_tag.text)
        else:
            search_url_list.append(a_tag["href"])
            Logger.debug("search = " + a_tag.text)

    return browse_url_list, search_url_list
Example No. 15
def get_emp_key(url, index):
    # whether the page contains a "next" link
    # next_page_exist=True

    req_url = url + "&start=" + str(index * 50)
    Logger.debug("get_emp_key request URL = " + url)
    html = _get_html(url)

    soup = BeautifulSoup(html, 'html.parser')

    job_key_list = []

    # find the a_tags that correspond to job postings
    # -> this picks up paid ads as well as regular postings (10 regular postings + 6 paid ads)
    for a_tag in soup.find_all("a", {"data-tn-element": "jobTitle"}):
        split_id = a_tag.parent["id"].split("_")

        id_type = split_id[0]
        emp_id = split_id[1]

        job_key_list.append(emp_id)

    return job_key_list
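
The loop above relies on markup in which every job-posting anchor carries data-tn-element="jobTitle" and its parent's id ends with the job key. A small self-contained demo with made-up HTML:

from bs4 import BeautifulSoup

html = """
<div id="p_0123456789abcdef">
  <a data-tn-element="jobTitle" href="/viewjob?jk=0123456789abcdef">Data Engineer</a>
</div>
"""
soup = BeautifulSoup(html, "html.parser")
for a_tag in soup.find_all("a", {"data-tn-element": "jobTitle"}):
    id_type, emp_id = a_tag.parent["id"].split("_")
    print(id_type, emp_id)   # -> p 0123456789abcdef
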
Example No. 16
def write_search_url_list(flush_num, max_sleep):
    browse_url_list, _ = _get_browse_tag(INDEED_CATEGORY_URL)

    file_index = 0
    search_url_list = []

    Logger.info("browse url list length = " + str(len(browse_url_list)) +
                " search url list length = " + str(len(search_url_list)))

    # keep collecting search urls until no browse urls are left
    while len(browse_url_list) != 0:

        Logger.debug("checking browse url = " + browse_url_list[0])
        new_br_list, new_search_list = _get_browse_tag(INDEED_URL +
                                                       browse_url_list[0])
        browse_url_list = browse_url_list + new_br_list
        search_url_list = search_url_list + new_search_list
        del browse_url_list[0]
        # return search_url_list
        Logger.info("browse url list length = " + str(len(browse_url_list)) +
                    " search url list length = " + str(len(search_url_list)))
        time.sleep(uniform(0.1, max_sleep))

        if len(search_url_list) >= flush_num:
            file_path = config.CATEGORY_URL_FILE_PATH + "search_url_list_" + str(
                file_index) + ".txt"
            # with open(pkl_name,"wb") as f:
            #         pkl.dump(search_url_list,f)

            with open(file_path, "w") as f:
                f.write("\n".join(search_url_list))

            Logger.info("searurl " + str(len(search_url_list)) +
                        " is written to " + file_path + "\n")

            search_url_list = []
            file_index += 1
Example No. 17
 def learn(self, result, delta, learning_rate):
     #raw_input("press Enter")
     # apply learning rule
     #Logger.DEBUG = True
     Logger.debug("NeuralLayer:learn")
     Logger.debug("result: " + str(np.shape(result)))
     Logger.debug("delta: " + str(np.shape(delta)))# + "\nresult shape:" + str(np.shape(result)))
     delta_weights = learning_rate * np.outer(delta, result)
     #Logger.debug("delta weights shape:" + str(np.shape(delta_weights)))
     #Logger.log(str(delta_weights))
     self.weights += delta_weights
Example No. 18
 def feed(self, input_data):
     # calculate activation of layer for given inputs
     #Logger.DEBUG = True
     Logger.debug("NeuralLayer:feed")
     Logger.debug("input: " + str(np.shape(input_data)))
     dot = np.dot(self.weights, input_data)
     result = self.activation(dot + np.atleast_2d(self.biases).T)
     Logger.debug("output: " + str(np.shape(result)))
     #Logger.debug("weights: " + str(np.shape(self.weights)))
     #Logger.debug("dot shape: " + str(np.shape(dot)))
     #Logger.DEBUG = False
     return result
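
feed, get_delta and learn above together implement one plain back-propagation step: feed computes activation(W·x + b), get_delta pushes the next layer's delta back through its transposed weights, and learn applies the outer-product weight update. A minimal two-layer sketch of that flow (hypothetical sizes, tanh as the activation, biases omitted for brevity, delta defined as target minus output to match the sign convention of learn):

import numpy as np

act = np.tanh
act_deriv = lambda y: 1.0 - y ** 2        # tanh derivative expressed via the output

lr = 0.1
W1 = np.random.uniform(-0.001, 0.001, (5, 4))   # hidden layer: 4 -> 5
W2 = np.random.uniform(-0.001, 0.001, (2, 5))   # output layer: 5 -> 2

x = np.random.rand(4, 1)
h = act(np.dot(W1, x))                    # feed through layer 1
y = act(np.dot(W2, h))                    # feed through layer 2

target = np.zeros((2, 1))
delta2 = (target - y) * act_deriv(y)          # output-layer delta (error times derivative)
delta1 = np.dot(W2.T, delta2) * act_deriv(h)  # get_delta for the hidden layer

W2 += lr * np.outer(delta2, h)            # learn: outer-product update, as in the code above
W1 += lr * np.outer(delta1, x)
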
Example No. 19
 def close(self):
     """
     need to close the data loaders otherwise they keep running
     """
     logger.debug("close resnet torch", flush=True)
     if self.timess:
         logger.state("time is: ", np.mean(self.timess, axis=0), flush=True)
     a = getattr(self, "test_set_load", None)
     b = getattr(self, "train_set_load", None)
     if a is not None:
         logger.debug("shut them down",flush=True)
         # del(a)
         a._shutdown_workers()
     if b is not None:
         logger.debug("shut them down")
         b._shutdown_workers()
Example No. 20
    def work(self):
        """
            Trains for one batch using self.agent_function, sends the gradient back to
            the parameter shards, and receives up-to-date weights in return.
            It logs the time at which it finished each update
            in a file stored at self.flags.time_stamps_file_name.
        """
        # store the global update of each weight
        if self.flags.load_save:
            self.batch_size = self.flags.batch_size // self.num_agents
            # number of iteration for the total batch size (Batch size if there were one agent)
            self.iterations = self.flags.total_iterations
            # how many iterations there are per epoch
            if self.flags.drop_remainder:
                self.iterations_in_epoch = math.floor(
                    self.flags.train_set_size /
                    (self.num_agents * self.batch_size))
            else:
                self.iterations_in_epoch = math.ceil(
                    self.flags.train_set_size /
                    (self.num_agents * self.batch_size))
            self.iterations_in_epoch //= 1
            global_update = [
                self.flags.starting_epoch * self.iterations_in_epoch *
                self.num_agents
            ] * self.num_shards
        else:
            global_update = [0] * self.num_shards

        # list that stores the number of float32 values in each weight, needed for sending them
        float_sizes = []
        total_size = []
        shapes = []
        weights = []
        for w in self.weights:
            weights += w
            shap = []
            fs = []
            ts = 0
            for i in w:
                l = 1
                shap.append(i.shape)
                for r in i.shape:
                    l *= r
                fs.append(l)
                ts += l
            shapes.append(shap)
            float_sizes.append(fs)
            total_size.append(ts)
        print(total_size[0], flush=True)
        print(float_sizes[0], flush=True)
        print(shapes[0], flush=True)

        logger.debug("Agent nr",
                     self.agent_nr,
                     "pid is: ",
                     getpid(),
                     flush=True)
        if self.data is not None:
            logger.debug("Agent nr",
                         self.agent_nr,
                         self.data.shape,
                         self.labels.shape,
                         self.num_labels,
                         self.batch_size,
                         flush=True)
        agent = self.agent_function(self.flags, self.data, self.labels,
                                    self.batch_size, self.gpu_nr,
                                    self.fraction, False, weights,
                                    self.agent_nr)
        del self.data
        del self.labels

        itr = True
        step = 0
        while itr:
            start_time = time.time()
            weights = []
            for w in self.weights:
                weights += w

            grads_, metrics = agent.train(weights)

            agent_part_finished = time.time()
            if self.slow_down is not None:
                if type(self.slow_down) is tuple:
                    if self.flags.slow_down_type == "gauss":
                        r = self.random.gauss(self.slow_down[0],
                                              self.slow_down[1])
                        if r <= 0:
                            time.sleep(0)
                        else:
                            time.sleep(r)
                else:
                    if self.flags.slow_down_type == "ber":
                        r = self.random.uniform(0, 1)
                        if self.p >= r:
                            print(r)
                            time.sleep(self.slow_down)
                    elif self.flags.slow_down_type == "time":
                        time.sleep(self.slow_down)

            offset = 0
            for c, q in enumerate(self.pipes):
                new_offset = offset + len(self.weights[c])
                for bits in grads_[offset:new_offset]:
                    q.send_bytes(bits)
                q.send((global_update[c], step, self.agent_nr, metrics))
                offset = new_offset
                logger.debug("agent", self.agent_nr, "sent to", c)

            itr_inner = 0
            while itr_inner < self.num_shards:
                for c, p in enumerate(self.pipes):
                    if p.poll():
                        while True:
                            logger.debug("Agent", self.agent_nr, "is recv")
                            tmp = p.recv()
                            logger.debug("Agent", self.agent_nr, "has recv")
                            if tmp[0] == "update weights":
                                global_update[c] = p.recv()
                                weight = []
                                for fsi, fs in enumerate(float_sizes[c]):
                                    w = p.recv_bytes(fs * 4)
                                    weight.append(
                                        np.ndarray(shapes[c][fsi], np.float32,
                                                   w))
                                self.weights[c] = weight
                                itr_inner += 1
                                break
                            elif tmp[0] == "stop":
                                logger.debug("Agent nr",
                                             self.agent_nr,
                                             "got stoped",
                                             flush=True)
                                self.pipes[c].close()
                                del self.pipes[c]
                                self.shut_down()
                                itr = False
                                itr_inner = self.num_shards
                                break
                            elif tmp[0] == "globals":
                                logger.debug("agent is sending globals")
                                glob = agent.get_globals()
                                p.send(glob)
                else:
                    time.sleep(0.0001)

            self.log[step, 0] = time.time()
            # note: time.clock() was removed in Python 3.8; time.process_time() is the closest replacement
            self.log[step, 1] = time.clock()
            self.saver.save_1D(self.log[step, :])
            step += 1
            end_time = time.time()
            if self.timing:
                self.times.append(
                    [agent_part_finished - start_time, end_time - start_time])
        logger.debug("Agent nr",
                     self.agent_nr,
                     "loop actually finished",
                     flush=True)

        agent.close()
        for p in self.pipes:
            p.close()
        if self.timing:
            t = np.average(self.times, axis=0)
            logger.state("Agent nr",
                         self.agent_nr,
                         "running agent function:",
                         t[0],
                         "total time",
                         t[1],
                         flush=True)
Example No. 21
    def __call__(self, weights, update, gradients, staleness, epoch):
        """
        :param weights: Copy of the model weights
        :param update: Current update
        :param gradients: List of gradients
        :param staleness: Staleness of each gradient
        :param epoch: Current epoch
        """

        self.learning_rate_func(epoch, update)
        lr = torch.tensor(-self.learning_rate,
                          dtype=torch.float,
                          device=self.device)

        start_time = time.time()
        if not self.use_exp:
            gradient = gradients
        else:
            lr_list = []

            def exp(x):
                return self.equalizer * np.exp(-(x - self.mean)**2 /
                                               (2 * self.std**2))

            for i, s in enumerate(staleness):
                # print(s)
                pos = self.staleness_counter % self.ring_size
                self.staleness_ring[pos] = s
                lrexp = exp(s)
                self.lr_ring[pos] = lrexp
                lr_list.append(lrexp)
                self.staleness_counter += 1
                if pos == self.ring_size - 1:  # we have reached the end of the ring
                    std = np.std(self.staleness_ring)
                    self.std = self.decay * self.std + self.learn * std
                    if self.std == 0:  # not numerically correct
                        self.std = 0.2

                    mean = np.mean(self.staleness_ring)
                    self.mean = self.decay * self.mean + self.learn * mean
                    ring_sum = np.sum(
                        self.lr_ring
                    )  # want the sum to be about (self.ring_size / self.agents)
                    equalizer = self.equalizer * (
                        self.ring_size / self.agents
                    ) / ring_sum  # note: the equalizer and averages change while iterating through the stalenesses
                    self.equalizer = self.decay * self.equalizer + self.learn * equalizer
                    logger.debug("mean",
                                 self.mean,
                                 "std",
                                 self.std,
                                 "staleness ring",
                                 self.staleness_ring,
                                 "lr ring",
                                 self.lr_ring,
                                 "sum",
                                 ring_sum,
                                 "equalizer",
                                 self.equalizer,
                                 flush=True)
            if self.staleness_counter <= self.ring_size * 5:
                gradient = gradients
            else:
                gradient = [
                    np.multiply(g, s) for g, s in zip(gradients, lr_list)
                ]
        grad = np.mean(gradient, axis=0)
        grads = []
        for g in grad:
            grads.append(torch.tensor(g, device=self.device))

        # the following code snippet is based on https://github.com/pytorch/pytorch/blob/master/torch/optim/sgd.py
        for i, p in enumerate(self.weights):
            d_p = grads[i].data
            if self.momentum != 0:
                buf = self.buf[i]
                lr_grad = d_p.mul(lr)
                buf.mul_(self.momentum).add_(
                    lr_grad)  # corrected: accumulate the lr-scaled gradient
                # buf.mul_(self.momentum).add_(d_p)
                if self.nesterov:
                    d_p = buf.mul(self.momentum).add_(d_p.mul(lr))
                    #d_p = d_p.add(self.momentum, buf)
                else:
                    d_p = buf
            #p.data.add_(lr,d_p)
            p.data.add_(d_p)

        end_time = time.time()
        if self.flags.time_program:
            self.times.append(end_time - start_time)

        w = []
        for wg in self.weights:
            w.append(wg.data.cpu().numpy())
        return w
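
The exp helper above scales each gradient by equalizer * exp(-(s - mean)^2 / (2 * std^2)), so gradients whose staleness sits far from the running mean are shrunk. A stand-alone sketch with made-up running statistics:

import numpy as np

equalizer, mean, std = 1.0, 4.0, 2.0      # assumed running statistics

def staleness_multiplier(s):
    return equalizer * np.exp(-(s - mean) ** 2 / (2 * std ** 2))

for s in (2, 4, 8):
    print(s, staleness_multiplier(s))     # peaks at s == mean, decays for very stale gradients
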
Example No. 22
    def __init__(self, flags, data, labels, batch_size, gpu_nr, fraction, testing, weights=None, agent_nr=1):
        """
        :param flags: flags set by the user
        :param data: samples; if None, they have to be loaded by the actual implementation
        :param labels: labels; if None, they have to be loaded by the actual implementation
        :param batch_size: batch size to be used
        :param gpu_nr: Number of the GPU, the agent should run on
        :param fraction: fraction of the GPU memory the agent should maximally use
        :param testing: True if we run a test set, False for training
        """
        super().__init__(flags, data, labels, batch_size, gpu_nr, fraction, testing, weights, agent_nr)
        seed = (int(time.time()*100)*agent_nr) % (2**32 - 1)
        self.timess = [] 
        logger.debug("seed", seed)
        np.random.seed(seed)
        #random.seed(123)
        torch.manual_seed(seed)  # set the seed for everything torch randomizes (here: padding, image order)
        self.flags = flags
        self.resnet = ResNet_cifar(depth=44)  # num_classes = 100
        #self.resnet = ResNet_imagenet(block=BasicBlock,
         #                      layers=[2, 2, 2, 2],
          #                     expansion=1)
        self.loss = CrossEntropyLoss()
        self.batch_size = batch_size
        self.gpu_nr = gpu_nr
        logger.debug("number of gpus", torch.cuda.device_count())

        if 'cuda' in self.flags.device:
            self.device = torch.device(self.gpu_nr)
            logger.debug("using device ", torch.cuda.get_device_name(self.device),"am Agent ",agent_nr)
        else:
            self.device = torch.device('cpu')

        self.cpu = torch.device('cpu')

        self.resnet.to(self.device, dtype=torch.float)
        self.loss.to(self.device)
        path = os.environ['HOME']
        path = path + '/cifar10_data/'
        logger.debug("gpu_nr", gpu_nr)

        # copied and modified from
        # convNet.pytorch/utils/regularization   Regularizer class
        # and convNet.pytorch/resnet: weight_decay_config function
        self._named_parameters = list(
            FilterParameters(self.resnet, **{'parameter_name': lambda n: not n.endswith('bias'),
                                             'module': lambda m: not isinstance(m, nn.BatchNorm2d)}).named_parameters())

        # copied and modified from:
        # convNet.pytorch/main.py
        if not testing:
            self.train_set =  DataRegime(getattr(self.resnet, 'data_regime', None),
                       defaults={'datasets_path':path , 'name': 'cifar10', 'split': 'train',
                                 'augment': True,
                                 'input_size': None, 'batch_size': batch_size, 'shuffle': True,
                                 'num_workers': 1, 'pin_memory': True, 'drop_last': True,
                                 'distributed': False, 'duplicates': 1, #batch augmentation
                                 'cutout': {'holes': 1, 'length': 16} if True else None})
            self.train_epoch = 0
            self.train_iterator = self._train_generator()
        else:
            self.test_set = DataRegime(getattr(self.resnet, 'data_eval_regime', None),
                                  defaults={'datasets_path': path, 'name': 'cifar10', 'split': 'val',
                                            'augment': False,
                                            'input_size': None, 'batch_size': batch_size,
                                            'shuffle': False,
                                            'num_workers': 1, 'pin_memory': True, 'drop_last': False})
            self.test_epoch = 0
            self.test_iterator = self._test_generator()
        #end


        if flags.load_save:
            import pickle
            with open(flags.saved_weights, "rb") as f:
                ret = pickle.load(f)
            glob = ret[1][0]
            for i, p in enumerate(self.resnet.buffers()):
                if np.isscalar(glob[i]):
                    p.data = torch.tensor(glob[i])
                else:
                    p.data[:] = torch.tensor(glob[i], device=self.device)[:]




        if self.flags.eamsgd and not testing:
            weights = weights[0:-1]
        if (self.flags.correction or self.flags.eamsgd) and not testing:
            if weights is not None:
                self.velocity = []
                self.velocity_grad = []
                self.momentum = torch.tensor(0.9, dtype=torch.float, device=self.device)
                for w in weights:
                    self.velocity.append(torch.zeros(w.shape, dtype=torch.float, device=self.device))
Example No. 23
class MultiTaskDetector(object):

    def __init__(self):
        self.model = MultiTask(pretrained=False, backend='resnext101_32d')
        self.model.cuda().eval()
        self.model.load_state_dict(torch.load('ckpt/multi_task/multi_task.pth'))
        self.category_mapping = yaml.load(open('config/category_mapping.yml'))
        self.category_trans = yaml.load(open('config/category_trans.yml'))
        self.attr_mapping = yaml.load(open('config/attr_mapping.yml'))
        self.attr_trans = yaml.load(open('config/attr_trans.yml'))
        self.image_size = (224, 224)
        self.logger = Logger()
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        self.transform = transforms.Compose([
            #    transforms.Scale(self.image_size),
            #    transforms.ThreeCrop(self.image_size),
            #    transforms.Lambda(lambda crops: torch.stack([normalize(transforms.ToTensor()(crop)) for crop in crops]))
            transforms.Resize(self.image_size),
            transforms.ToTensor(),
            normalize,
        ])

    def preprocessing(self, image):
        return Variable(self.transform(image).unsqueeze(0), volatile=True).cuda()

        # NOTE: the code below is unreachable (the method returns above); it duplicates
        # the normalization that self.transform already performs.
        image = image.resize(self.image_size)
        image = np.asarray(image, dtype=np.uint8)
        image = torch.ByteTensor(image)
        image = image.float().div(255)
        image[:, :, 0] = (image[:, :, 0] - 0.485) / 0.229
        image[:, :, 1] = (image[:, :, 1] - 0.456) / 0.224
        image[:, :, 2] = (image[:, :, 2] - 0.406) / 0.225
        image = image.unsqueeze(0).permute(0, 3, 1, 2).contiguous()
        return Variable(image, volatile=True).cuda()

    def detect(self, image, cls_threshold=0.3, attr_threshold=0.3):
        start = time.time()
        inputs = self.preprocessing(image)
        self.logger.debug('preprocessing: %s', time.time() - start)
        start = time.time()
        #bs, ncrops, c, h, w = inputs.size()
        #logits = self.model(inputs.view(-1, c, h, w))
        #logits = logits.view(bs, ncrops, -1).mean(dim=1)
        cls_out, attr_out = self.model(inputs)
        self.logger.debug('forward: %s', time.time() - start)
        cls_preds = F.softmax(cls_out)
        cls_probs, cls_labels = cls_preds.topk(1, 1)
        cls_label = cls_labels.data.cpu().tolist()[0][0]
        cls_prob = cls_probs.data.cpu().tolist()[0][0]

        attr_preds = (F.sigmoid(attr_out)).view(1, -1).data.cpu().tolist()[0]
        attrs = [(self.attr_mapping.get(x[0], x[0]), x[1]) for x in enumerate(attr_preds)]
        attrs = [(self.attr_trans.get(x[0], x[0]), x[1]) for x in attrs]
        attrs = sorted(filter(lambda x: x[1] >= attr_threshold, attrs), key=lambda x: x[1], reverse=True)

        if cls_prob > cls_threshold:
            cls_name = self.category_mapping.get(cls_label)
            category = (self.category_trans.get(cls_name, cls_name), cls_prob)
            return category, attrs
        else:
            return None, attrs
Example No. 24
class MultAccount:
    def __init__(self, name, id_list):
        """
        :param id_list: string id list
        """
        self._name = name
        self._date_list = []
        self._account_list = []
        self._len = len(id_list)
        for id in id_list:
            self._account_list.append(Account(id))
        self._id_list = id_list

        self._nav_tail = -1
        self._pos_maxval = 0
        self._pos_dd = 0

        self._sum_investment = 0
        self._mult_account_weight_list = [1.0 / self._len] \
                                         * self._len
        self._market_value = 0
        self._pos_list = []
        self._cum_ret_list = []

        self._final_cum_ret = None

        self._invest_start = False
        self._logger = Logger()

    @property
    def name(self):
        return self._name

    @property
    def account_list(self):
        return self._account_list

    @property
    def account_num(self):
        return self._len

    def __iter__(self):
        for account in self._account_list:
            yield account

    def find_maxdown(self, pos_val, pos_maxval, mdd):
        down_val = pos_maxval - pos_val
        dd = down_val / (pos_maxval + 1e-10)
        if down_val < 0:
            pos_maxval = pos_val
        elif dd > mdd:
            mdd = dd
        return pos_maxval, mdd

    def update_datenav(self, date, nav_list):
        market_value_sum = 0
        # obtain all accounts' remaining weight
        market_value_list = []
        for account, nav in list(zip(self._account_list, nav_list)):
            account.update_nav(nav)
            market_value_sum += account.market_value
            market_value_list.append(account.market_value)

        if market_value_sum > 0:
            self._mult_account_weight_list = [
                v / market_value_sum for v in market_value_list
            ]
        self._market_value = market_value_sum

        self._logger.debug(
            module='account',
            file='multacct.py',
            content=" " * 4 +
            "multi accounts, market value now is %.5f." % market_value_sum)

        self._date_list.append(date)
        self._pos_list.append(market_value_sum)

        self._pos_maxval, self._pos_dd = \
            self.find_maxdown(market_value_sum,
                              self._pos_maxval,
                              self._pos_dd)

        self._cum_ret_list.append(self.cum_ret)

        self._logger.debug(module='account',
                           file='multacct.py',
                           content=" " * 4 +
                           "return rate %s." % str(self.cum_ret))

    def update_allocation(self, weight_list):
        if 0.995 < sum(weight_list) < 1.005:
            pass
        else:
            self._logger.debug(module='account',
                               file='multacct.py',
                               content="" * 4 +
                               "weights sum not equal to 1 is not allowed.")
            return
        practical_invest = 0
        market_value_sum = 0
        if self._invest_start:
            pos_tail = self._pos_list[-1]
            for account, weight in list(zip(self._account_list, weight_list)):
                assert (isinstance(account, Account))
                single_invest = weight * (pos_tail if pos_tail > 0 else 1)
                adjust_invest = single_invest - account.market_value

                if abs(adjust_invest) / (account.market_value + 1e-10) > 0.01:
                    if adjust_invest > 0:
                        practical_invest += adjust_invest
                        account.buy(adjust_invest)
                    elif adjust_invest < 0:
                        account.sell(abs(adjust_invest))
                        practical_invest -= account.surplus_value
                        account.extract_surplus()
                market_value_sum += account.market_value
        else:
            for account, weight in list(zip(self._account_list, weight_list)):
                practical_invest += weight
                account.buy(weight)
                market_value_sum += account.market_value
            self._invest_start = True

        self._sum_investment += practical_invest
        self._market_value = market_value_sum

        self._logger.debug(
            module='account',
            file='multacct.py',
            content=" " * 4 +
            "this transaction cost extra invest %.5f." % practical_invest)
        self._logger.debug(
            module='account',
            file='multacct.py',
            content=" " * 4 +
            "cumulative investment is %.5f." % self._sum_investment)
        self._logger.debug(
            module='account',
            file='multacct.py',
            content=" " * 4 +
            "multi accounts, market value after transaction is %.5f." %
            market_value_sum)

    def stop(self):
        return_value_sum = 0
        for account in self._account_list:
            account.sell(account.market_value)
            return_value_sum += account.extract_surplus()
        self._final_cum_ret = (return_value_sum - self._sum_investment) / \
                              self._sum_investment

    def logging(self):
        self._logger.info(module='account',
                          file='multacct.py',
                          content={
                              'backtest_time': str(self._date_list[-1]),
                              'account_name': self._name,
                              'content': 'return rate is %.4f' % self.cum_ret
                          })

    @property
    def cum_ret(self):
        if self._final_cum_ret is not None:
            return self._final_cum_ret
        else:
            return (self._market_value - self._sum_investment) / \
                   (self._sum_investment + 1e-10)

    @property
    def cum_ret_avg(self):
        return sum(self._cum_ret_list) / len(self._cum_ret_list)

    @property
    def cum_ret_list(self):
        return self._cum_ret_list

    @property
    def pos_list(self):
        return self._pos_list

    @property
    def mult_account_weight_list(self):
        return self._mult_account_weight_list

    @property
    def investment_sum(self):
        return self._sum_investment

    @property
    def return_value(self):
        return self.investment_sum * self.cum_ret

    @property
    def pos_dd(self):
        return self._pos_dd
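
find_maxdown above tracks a running maximum of the position value and the deepest relative fall from it. A worked sequence inlining the same logic (the position values are made up):

pos_maxval, mdd = 0.0, 0.0
for pos_val in [100.0, 120.0, 90.0, 130.0, 65.0]:
    down_val = pos_maxval - pos_val
    dd = down_val / (pos_maxval + 1e-10)
    if down_val < 0:
        pos_maxval = pos_val      # new running maximum
    elif dd > mdd:
        mdd = dd                  # deepest drawdown so far
print(pos_maxval, mdd)            # -> 130.0 and 0.5 (the fall from 130 to 65)
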
Example No. 25
class Account:
    def __init__(self, id, sub_discount=0.0015, red_discount=0.005):
        self._id = id
        self._nav = 1
        self._volume = 0
        self._market_value = 0
        self._surplus_value = 0
        self._sub_discount = sub_discount
        self._red_discount = red_discount
        self._logger = Logger()

    @property
    def volume(self):
        return self._volume

    @property
    def market_value(self):
        return self._market_value

    @property
    def surplus_value(self):
        return self._surplus_value

    def _update_market_value(self):
        self._market_value = self._volume * self._nav

    def update_nav(self, nav):
        self._logger.debug(module='account',
                           file='singleacct.py',
                           content="----" * 2 +
                           "account %s, org market value is %.5f" %
                           (self._id, self._market_value))
        self._nav = nav
        self._update_market_value()
        self._logger.debug(module='account',
                           file='singleacct.py',
                           content="----" * 3 +
                           "account %s, now market value is %.5f" %
                           (self._id, self._market_value))

    def buy(self, weight):
        self._logger.debug(module='account',
                           file='singleacct.py',
                           content="----" * 1 +
                           "account %s, buy weight %.5f, cut off %.5f" %
                           (self._id, weight, weight * self._sub_discount))
        weight *= 1 - self._sub_discount
        volume = weight / self._nav
        self._volume += volume
        self._update_market_value()

    def sell(self, weight):
        self._logger.debug(module='account',
                           file='singleacct.py',
                           content="----" * 1 +
                           "account %s, sell weight %.5f, cut off %.5f" %
                           (self._id, weight, weight * self._red_discount))
        volume = weight / self._nav
        self._volume -= volume
        self._update_market_value()
        self._surplus_value += \
            volume * self._nav * (1 - self._red_discount)

    def extract_surplus(self):
        surplus_value = self._surplus_value
        self._surplus_value = 0
        return surplus_value
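
Account applies the subscription discount when buying and the redemption discount when selling. Worked numbers for that fee arithmetic (standalone; the discounts are the class defaults, the nav is an assumption):

nav, sub_discount, red_discount = 1.25, 0.0015, 0.005

invest = 1000.0
volume = invest * (1 - sub_discount) / nav     # buy: 798.8 units for 1000 of cash
market_value = volume * nav                    # 998.5 after the subscription fee

cash_back = market_value * (1 - red_discount)  # sell everything: 993.5075 comes back
print(volume, market_value, cash_back)
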
Example No. 26
def write_emp_detail(url_list, max_sleep):

    crawling_data_num = 0
    start_time = time.time()

    # job keys whose job detail has been fetched and written to an xml file
    key_dict = {}

    # store the keys whose requests failed
    failed_key_list = []
    url_index = 0
    # write at most 1000 job details per URL
    for url in url_list:
        Logger.info(
            str(url_index + 1) + "/" + str(len(url_list)) +
            " job list request URL = " + url.replace("\n", ""))
        url_index += 1
        job_key_list = []

        # fetch the job keys from the job listing page,
        # 50 at a time, at most 20 times
        try:
            req_url = config.INDEED_URL + url + "&limit=50"
            prev_key_list = []
            for index in range(20):
                cur_key_list = get_emp_key(req_url, index)
                # once the page index runs past the last page, the same list repeats, so stop fetching job keys
                if prev_key_list == cur_key_list:
                    break
                else:
                    prev_key_list = cur_key_list
                job_key_list = job_key_list + cur_key_list
                rand_sleep(max_sleep)
        except Exception:
            continue

        Logger.info("job key list length = " + str(len(job_key_list)))

        not_exist_cnt = 0

        # fetch the job posting detail for each job key
        for key in job_key_list:
            # if this key has not been crawled yet,
            # crawl it and write the xml file
            if key not in key_dict:
                key_dict[key] = True
                try:
                    xml = get_job_detail(key)
                except Exception:
                    Logger.warn(key + " parsing failed")
                    failed_key_list.append(key)
                    rand_sleep(max_sleep)
                    continue

                file_path = config.JOB_DETAIL_FILE_PATH + key + ".xml"
                _write_xml(xml, file_path)
                Logger.debug(file_path + " is written")
                not_exist_cnt += 1
                crawling_data_num += 1
                rand_sleep(max_sleep)
            else:
                Logger.debug("key " + key + " already exist")

        # url
        Logger.info("total data num =" + str(crawling_data_num) +
                    " added data num =" + str(not_exist_cnt) +
                    " elapsed time = " + str(time.time() - start_time) + "\n")

        # save all keys fetched so far as a pkl file
        key_dict_path = "key_dict.pkl"
        # dump the dictionary of written job keys to a pickle file
        with open(key_dict_path, "wb") as f:
            pkl.dump(key_dict, f)

        Logger.info(key_dict_path + "has been written")
Example No. 27
    def __init__(self, flags, update_function, agent_function, data_loader, num_agents, slow_down, algorithm):
        """
        :param flags: Flags set by the user.
        :param update_function: Implementation of the UpdateFunction abstract class.
        :param agent_function: Implementation of the abstract AgentFunction class.
        :param data_loader: Function which returns ((train_data, train_labels),(test_data, test_labels),
                number of distinct labels (Categories)).
                If left None, then the data needs to be loaded by the agent_function.
        :param num_agents: The number of Agents to be simulated.
        :param slow_down: How much the agent gets slowed down, either a tuple (float a, float b),
                a single float b or None for no slowdown.
        :param algorithm:  Implementation of the AlgorithmFunction abstract class.
        """
        # flag used to figure out if the PS finished without an interruption
        self.orderly = False
        self.flags = flags
        self.batch_size = self.flags.batch_size // num_agents
        self.num_agents = num_agents
        self.slow_down = slow_down
        self.iterations = self.flags.total_iterations
        logger.state("Number of iterations per agent " + str(self.iterations))
        self.total_updates = self.iterations*self.num_agents
        self.algorithm = algorithm
        self.shards = self.flags.shards
        self.number_of_labels = self.flags.number_of_labels

        #if Dataset should get distributed
        self.distribute = self.flags.distribute

        self.update_rule_uninitiated = update_function
        self.agent_function = agent_function
        self.data_loader = data_loader

        # list that stores gradients of the agents
        self.gradients_list = []
        # pipes to communicate with the agent processes
        self.pipes = []
        # Each agent gets an process same order as self.agents
        self.threads = []

        # list of the weights
        self.weights_numpy = []

        # Calculate what fraction of the gpu memory each Agent and Shard gets to use
        if flags.gpu_number == 1:
            self.fraction = (1 - 0.01) / (self.num_agents + self.shards)
        if flags.gpu_number > 1:
            half = math.ceil((self.num_agents + self.shards) / flags.gpu_number)
            self.fraction = (1 - 0.01) / (half)

        if self.distribute:
            ((ts, tl), (es, el), number_of_labels) = self.data_loader()

            self.data = ts
            self.labels = tl
            self.test_data = es
            self.test_labels = el
            self.number_of_labels = number_of_labels
            #Distributing the data set such that each Agent gets the same number of distinct samples
            self.data_list, self.labels_list = self._distribute_data()
        else:
            self.data_list = [None for i in range(0, self.num_agents)]
            self.labels_list = [None for i in range(0, self.num_agents)]
            self.test_data = None
            self.test_labels = None

        # getting the initial weights
        # We use a separate process here because loading tensorflow in this process sometimes causes problems later on.
        (ps_end, agent_end) = Pipe()
        if self.distribute:
            p = Process(target=create_weights, args= (self.flags, self.agent_function, self.test_data, self.test_labels
                                                 , self.flags.batch_size,agent_end))
        else:
            p = Process(target=create_weights, args=(self.flags, self.agent_function, None,None
                                                          , self.flags.batch_size, agent_end))
        p.daemon = True
        p.start()
        agent_end.close()
        self.weights_numpy = ps_end.recv()
        p.join()
        ps_end.close()

        # list that stores the portion of the weight each shard gets
        self.weights_numpy_list = []
        # number of trainable weights each shard gets, the remainder gets distributed to the first few shards
        size = len(self.weights_numpy) // self.shards
        remainder = len(self.weights_numpy) % self.shards
        offset = 0
        for i in range(0, self.shards):
            if i < remainder:
                new_offset = offset + size + 1
            else:
                new_offset = offset + size
            self.weights_numpy_list.append(self.weights_numpy[offset:new_offset])
            offset = new_offset

        self.pipes_list = []
        # Creating the agent processes
        for i in range(0, self.num_agents):
            pipes_for_agent = []
            pipes_for_shards = []
            for j in range(0, self.shards):
                (ps_end, agent_end) = Pipe()
                pipes_for_agent.append(agent_end)
                pipes_for_shards.append(ps_end)
            self.pipes_list.append(pipes_for_shards)
            p = Process(target=Agent, args=(self.flags,  self.num_agents, i + 1, self.data_list[i],
                                            self.labels_list[i], self.number_of_labels, self.batch_size,
                                            self.weights_numpy_list, self.total_updates, pipes_for_agent,
                                            self.agent_function,
                                            self.fraction, self.slow_down[i], os.getpid()))
            # p.daemon = True cannot be set here because PyTorch uses processes of its own.
            p.start()
            agent_end.close()
            self.threads.append(p)

        self.shards_list = []
        self.shard_pipes = []

        # running the parameter server shards
        for i in range(0,self.shards):
            (ps_end, shard_end) = Pipe()
            self.shard_pipes.append(ps_end)
            pipes = []
            for j in range(0,self.num_agents):
                pipes.append(self.pipes_list[j][i])
            p = Process(target=Shard, args=(self.flags, pipes,self.weights_numpy_list[i] ,i, self.algorithm,
                                            self.update_rule_uninitiated, self.flags.train_set_size, shard_end,
                                            self.fraction, os.getpid()))

            p.start()
            shard_end.close()
            self.shards_list.append(p)

        for i in self.pipes_list:
           for p in i:
              p.close()

        # delete the references to the data so they don't use up memory; the data is not used here anymore
        self.data_list = []
        self.labels_list = []
        self.data = None

        self.csv_name = os.path.sep + self.flags.log_file_names + ".log"
        saver = Saver(self.flags.log_dir + self.csv_name)


        # Receiving the weights, other global variables and metrics from each shard

        # list that stores the weights at the end of each epoch for the test set evaluation
        self.test_weights = []
        # other globals variables needed for the test set evaluation
        self.test_globals = []
        print("getting the test weights", flush=True)
        batchsize = self.flags.evaluation_batchsize
        self.evaluation_steps = self.flags.test_set_size // batchsize
        self.eval_obj = self.agent_function(self.flags, self.test_data, self.test_labels, batchsize, 0, 1, True)
        log = []
        self.weights = []
        self.globals = []
        self.metrics = []
        itr = True
        if self.flags.load_save:
            self.counter = self.flags.starting_epoch
        else:
            self.counter = 0
        while itr:
            self.counter += 1
            self.test_globals = [[]] * self.shards
            self.test_weights = [[]] * self.shards
            c = 0
            while c < self.shards:
                for i,p in enumerate(self.shard_pipes):
                    if p.poll():
                        logger.debug("polled true",flush=True)
                        tmp = p.recv()
                        nr = i
                        c += 1
                        print("nr", nr, flush=True)
                        if tmp == 'end':
                            p.close()
                            del(self.shard_pipes[i])
                            self.shut_down_shards()
                            itr = False
                            break
                        weights_list, moving_avg_list = tmp
                        self.test_globals[nr] = moving_avg_list
                        self.test_weights[nr] = weights_list
                        break
                else:
                    #logger.debug("not polled any", flush=True)
                    time.sleep(0.05)
                if not itr:
                    break
            if itr:
                for p in self.shard_pipes:
                    p.send("beginning")
                self.test_weights_unified = self.test_weights[0]
                for i in range(1, self.shards):
                    self.test_weights_unified += self.test_weights[i]
                self.test_averages_unified = self.test_globals[0]  # could also take the mean of the list
                if self.flags.eval_at_end:
                    self.weights.append(self.test_weights_unified)
                    self.globals.append(self.test_averages_unified)

                else:
                    res = self.test_evaluation()

                # storing the weights, globals, and metrics just in case we need them at a later point again
                if self.flags.save_weights:
                    if self.counter > 5:
                        os.remove(self.flags.saves_dir + os.path.sep + self.flags.dump_file_name + '_' + str(
                            self.counter - 5) + ".pkl")
                    f = open(self.flags.saves_dir + os.path.sep + self.flags.dump_file_name + '_' + str(
                        self.counter) + ".pkl", 'wb')
                    pickle.dump([self.test_weights, self.test_globals], f)
                    f.close()

                for p in self.shard_pipes:
                    p.send("finished")
                for p in self.shard_pipes:
                    train = p.recv()
                if self.flags.eval_at_end:
                    self.metrics.append(train)
                else:
                    loglist = [self.counter]
                    for t in train:
                        loglist.append(t)
                    res = res.tolist()
                    for r in res:
                        loglist.append(r)
                    log.append(loglist)
                    saver.save_1D(loglist)
                    logger.results(*loglist)
        if self.flags.eval_at_end:
            for epoch, weight in enumerate(self.weights):
                self.test_weights_unified = weight
                self.test_averages_unified = self.globals[epoch]
                res = self.test_evaluation()
                loglist = [epoch]
                for t in self.metrics[epoch]:
                    loglist.append(t)
                res = res.tolist()
                for r in res:
                    loglist.append(r)
                log.append(loglist)
                saver.save_1D(loglist)
                logger.results(*loglist)

        for t in self.shards_list:
            t.join()
        for t in self.threads:
            t.join()

        self.eval_obj.close()
        saver.close()
        logger.debug("at the end", flush=True)
        self.orderly = True
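The epoch loop above is driven by a small handshake over the shard pipes: each shard pushes its weights and moving averages, waits for "beginning"/"finished" from the parameter server, and then reports its training metrics. A minimal, self-contained sketch of that exchange (placeholder payloads stand in for the real weights, globals, and metrics):

from multiprocessing import Pipe, Process

def shard_side(shard_end):
    # Shard: push weights + moving averages, wait for the PS evaluation handshake,
    # then report the epoch's training metrics and finally signal the end of training.
    shard_end.send([["weights"], ["moving_averages"]])   # placeholder payload
    assert shard_end.recv() == "beginning"               # PS started its evaluation/bookkeeping
    assert shard_end.recv() == "finished"                # PS is done with this epoch
    shard_end.send(["train_metrics"])                    # placeholder metrics
    shard_end.send("end")                                # no more epochs

def ps_side(ps_end):
    while True:
        msg = ps_end.recv()
        if msg == "end":
            break
        weights_list, moving_avg_list = msg
        ps_end.send("beginning")     # acknowledge receipt, shard may keep training
        # ... evaluate / save weights_list and moving_avg_list here ...
        ps_end.send("finished")
        train = ps_end.recv()        # per-epoch training metrics from the shard

if __name__ == "__main__":
    ps_end, shard_end = Pipe()
    p = Process(target=shard_side, args=(shard_end,))
    p.start()
    ps_side(ps_end)
    p.join()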
Ejemplo n.º 28
0
def get_job_detail(key):
    job_dict = {}
    req_url = config.INDEED_JOB_SEARCH_URL + key
    html = _get_html(req_url)
    soup = BeautifulSoup(html, 'html.parser')

    # Parse the job header contents
    job_header = soup.find("div", {"data-tn-component": "jobHeader"})
    Logger.debug("request URL =" + req_url)
    # print(job_header)

    job_title = job_header.find("b", {
        "class": "jobtitle"
    }).contents[0].get_text()
    company = job_header.find("span", {"class": "company"}).get_text()
    location = job_header.find("span", {"class": "location"}).get_text()
    cmp_description = ""

    soup.find("span", {"class": "company"})
    # some pages have no company description
    try:
        cmp_description = soup.find("div", {
            "class": "cmp_description"
        }).get_text()
    except AttributeError:
        Logger.debug("cmp_description does not exists")

    job_summary = soup.find("span", {"id": "job_summary"}).get_text()

    Logger.debug("jobtitle = " + job_title)
    Logger.debug("company = " + company)
    Logger.debug("location = " + location)
    Logger.debug("cmp_description = " + cmp_description)
    Logger.debug("job_summary = " + job_summary)

    xml = ET.Element("root")
    ET.SubElement(xml, "job_key").text = key
    ET.SubElement(xml, "jobtitle").text = job_title
    ET.SubElement(xml, "company").text = company
    ET.SubElement(xml, "location").text = location
    ET.SubElement(xml, "cmp_description").text = cmp_description
    ET.SubElement(xml, "job_summary").text = job_summary

    Logger.debug(ET.tostring(xml, encoding="utf-8"))
    _indent(xml)

    return xml
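For reference, the ElementTree calls at the end of get_job_detail() reduce to the following self-contained pattern (the field values here are placeholders, not real scrape results):

import xml.etree.ElementTree as ET

xml = ET.Element("root")
ET.SubElement(xml, "job_key").text = "0123456789abcdef"   # made-up job key
ET.SubElement(xml, "jobtitle").text = "Data Engineer"
ET.SubElement(xml, "company").text = "Example Corp"
print(ET.tostring(xml, encoding="unicode"))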
Ejemplo n.º 29
0
def rand_sleep(sec):
    sleep_len = random.uniform(0.1, sec)
    Logger.debug("sleep " + str(sleep_len) + "sec")
    time.sleep(sleep_len)
Ejemplo n.º 30
0
    def __init__(self, flags, pipes, weights, shard_nr, algorithm,
                 update_function, dataset_size, pipe, fraction, ps_pid):
        """
        Simulates a shard
        :param flags: Flags set by the user
        :param pipes: Pipes connecting agents and shards (one pipe per agent) len(pipes) == #agents
        :param weights: Initial weights for this shard
        :param shard_nr: Shard number (starting from 0)
        :param algorithm:  Implementation of the abstract AlgorithmFunction class.
        :param update_function: Uninitialized update rule used to apply the incoming gradients
        :param dataset_size: Number of samples in the data-set
        :param pipe: Pipe to communicate with the ParameterServer
        :param fraction: Fraction of the GPU allowed to use
        :param ps_pid: PID of the ParameterServer, used to shut down the program in case of an error.
        """

        # flag used to figure out if the shard finished without an interruption
        self.orderly = False
        os.sched_setaffinity(0, list(range(0, flags.threads)))
        logger.state("affinity shard", os.sched_getaffinity(0))
        self._ps_pid = ps_pid
        self.flags = flags
        self.pipes = pipes
        self.shard_nr = shard_nr
        self.num_agents = len(self.pipes)  # constraint to be imposed
        self.update_function_uninitiated = update_function
        self.times = []
        self.dataset_size = dataset_size
        self.pipe = pipe
        self.fraction = fraction

        self.timing = self.flags.time_program  # the time.time() calls themselves are not wrapped in an if: skipping
        # them saves almost no time when timing is off, and the check itself would cost time when timing is on
        self.printing = self.flags.printing
        self.print_interval = self.flags.print_interval

        #period after which bins get averaged
        self.bins_period = self.flags.bins_period

        #Variables used to calculate the staleness
        self.steps_for_learner = [0] * self.num_agents
        self.max_staleness = 0

        # Logs the training metrics, used to send them back at the end to the PS
        self.training_metrics_log = []
        # Logs the weights, used to send them back at the end to the PS
        self.evaluation_weights = []

        self.bins = self.flags.bins
        # Putting the weights into shared memory such that the update_function and Communicator can use the same copy
        self.weights = []
        for i in range(0, self.bins):
            self.weights.append(self.create_copy(weights))

        self.float_sizes = []
        self.total_size = 0
        self.shapes = []
        for i in self.weights[0]:
            self.shapes.append(i.shape)
            l = 1
            for r in i.shape:
                l *= r
            self.float_sizes.append(l)
            self.total_size += l

        self.algorithm = algorithm(self.pipes, self.float_sizes, self.shapes)

        self.batch_size = self.flags.batch_size // self.num_agents
        # number of iterations at the total batch size (i.e. the batch size if there were a single agent)
        self.iterations = self.flags.total_iterations
        # number of iterations done in total (taking the number of agents into account)
        self.total_updates = self.iterations * self.num_agents
        # how many iterations there are per epoch
        if self.flags.drop_remainder:
            self.iterations_in_epoch = math.floor(
                self.dataset_size / (self.num_agents * self.batch_size))
        else:
            self.iterations_in_epoch = math.ceil(
                self.dataset_size / (self.num_agents * self.batch_size))
        logger.state("Shard {0} is doing epochs:{1}".format(
            shard_nr, self.iterations // self.iterations_in_epoch))
        # here for testing
        self.iterations_in_epoch //= 1  # if changed here, it also has to be changed in the agent

        self.csv_name_staleness = os.path.sep + self.flags.staleness_file_name + "_shard" + str(
            self.shard_nr) + ".log"
        self.saver = Saver(self.flags.log_dir + self.csv_name_staleness)
        if not self.flags.load_save:
            self.saver.header(
                ["Agent", "staleness", "phase_staleness", "global_staleness"])
        logger.state("Shard {0} set up saving".format(shard_nr))

        logger.state("Shard {0} set up the communicator".format(shard_nr))
        logger.debug("affinity shard", os.sched_getaffinity(0))
        self._run_shard()
        self.shut_down()
        self.pipe.send("end")
        if self.bins > 1:
            logger.state("the bins were", self.algorithm.bin_counts)

        # self.queue.close()
        for p in self.pipes:
            p.close()
            logger.debug(p.closed)

        self.pipe.close()
        logger.debug(self.pipe.closed)
        self.orderly = True
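As a quick sanity check of the batch bookkeeping in this constructor (all numbers below are made up): with 50,000 samples, 4 agents, and a global batch size of 128, each agent works on per-agent batches of 32, and one epoch corresponds to ceil(50000 / (4 * 32)) = 391 iterations, or 390 when drop_remainder is set:

import math

# Made-up numbers, only to illustrate the bookkeeping done in Shard.__init__.
dataset_size = 50_000
num_agents = 4
global_batch_size = 128

per_agent_batch = global_batch_size // num_agents                                  # 32
iters_keep_remainder = math.ceil(dataset_size / (num_agents * per_agent_batch))    # 391
iters_drop_remainder = math.floor(dataset_size / (num_agents * per_agent_batch))   # 390
print(per_agent_batch, iters_keep_remainder, iters_drop_remainder)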
Ejemplo n.º 31
0
def stop():
    Logger.debug("Cancelling all future events for Phase One.")
    scheduler.stop()
Ejemplo n.º 32
0
    def _run_shard(self):

        # staleness log, stores the staleness of each update
        staleness_log = np.empty((self.iterations * self.num_agents, 4),
                                 np.int64)
        update_rule = []
        for i in range(0, self.bins):
            update_rule.append(
                self.update_function_uninitiated(
                    self.flags, self.num_agents,
                    (self.shard_nr) % self.flags.gpu_number, self.fraction,
                    self.weights[i]))
        if self.flags.load_save:
            grad_update = self.flags.starting_epoch * self.iterations_in_epoch * self.num_agents
            epoch = self.flags.starting_epoch + 1  # Start at one, for the update function
            validation_checker = 0
        else:
            grad_update = 0
            epoch = 1  # Start at one, for the update function
            # second counter that gets reset once we finished an epoch
            validation_checker = grad_update
        # used to run one-time setup while processing the first update
        first = True
        # how many updates were done for each bin
        bins_update = [0] * self.bins

        # flag set if we need to wait for PS
        wait_ps = False

        # how many metrics have been added up
        metrics_count = 0

        # number of updates in an epoch
        updates_until_validation = self.iterations_in_epoch * self.num_agents
        print_abs = updates_until_validation // self.print_interval
        if print_abs == 0:
            print_abs = 1
        start_time = time.time()
        while True:
            logger.debug("shard", self.shard_nr, " does next iter")
            before_weights_time = time.time()
            #calling the algorithm to get the gradients
            gradients_list, from_list, tag_list, step_list, metrics_list,\
                        binning, sending = self.algorithm(epoch, grad_update)
            logger.debug("shard", self.shard_nr, " after algorithm")
            if sending != 1:
                got_weights_time = time.time()
                if first:
                    # list for all the moving average variant of the metrics
                    moving_averages = [0] * len(metrics_list[0])
                    first = False

                logger.debug("shard", self.shard_nr, "before loop")

                staleness_list = []
                for j, anr in enumerate(from_list):
                    self.steps_for_learner[anr - 1] += 1
                    staleness_list.append(grad_update - tag_list[j] + 1)
                self.max_staleness = np.max(self.steps_for_learner)
                send_back = [agnr - 1 for agnr in from_list]
                updates = len(from_list)

                for j, s in enumerate(from_list):
                    staleness_log[
                        grad_update + j,
                        3] = self.max_staleness - self.steps_for_learner[s - 1]
                    staleness_log[grad_update + j, 2] = step_list[j]
                    staleness_log[grad_update + j, 1] = staleness_list[j] - 1
                    staleness_log[grad_update + j,
                                  0] = from_list[j]  # agent nr
                    self.saver.save_1D(staleness_log[grad_update + j, :])
                logger.debug("shard", self.shard_nr, " after loop")
                # Perform the update of the weights; note the weights are in shared memory and get updated in place
                ww = update_rule[binning](self.weights[binning], grad_update,
                                          gradients_list, staleness_list,
                                          epoch)
                if not self.flags.eamsgd:
                    self.weights[binning] = ww
                logger.debug("shard", self.shard_nr, " after update rule")
                bins_update[binning] += 1

                # updates stores how many gradients get used in the same update
                grad_update += updates
                validation_checker += updates

                # calculating moving average of the metrics
                for metrics in metrics_list:
                    for mi, mv in enumerate(metrics):
                        moving_averages[mi] += mv
                metrics_count += updates
                #printing the current average train metrics
                if self.printing and self.shard_nr == 0 and validation_checker % print_abs == 0:
                    logger.results(
                        str(grad_update) + "/" + str(updates_until_validation),
                        *np.divide(moving_averages, metrics_count),
                        staleness_list[-1])

                if self.flags.bins > 1 and grad_update % self.bins_period == 0:
                    self.elastic(bins_update)
                    bins_update = [0] * self.bins

                if validation_checker >= updates_until_validation:
                    weights_copy = self.weights[binning]

                    self.evaluation_weights.append(weights_copy)
                    self.training_metrics_log.append(
                        np.divide(moving_averages, metrics_count))

                    validation_checker -= updates_until_validation
                    logger.results(epoch,
                                   *np.divide(moving_averages, metrics_count))
                    for metrics in metrics_list:
                        for mi, mv in enumerate(metrics):
                            moving_averages[mi] = 0
                    metrics_count = 0
                    logger.state("Time/updates:",
                                 (time.time() - start_time) / grad_update,
                                 flush=True)
                    epoch += 1

                    i = send_back[0]
                    p = self.pipes[i]
                    logger.debug("shard", self.shard_nr,
                                 "before sending to Agent 'globals' to", i)
                    p.send(["globals", weights_copy])
                    logger.debug("shard", self.shard_nr, "after send")
                    glob = p.recv()
                    logger.debug("shard", self.shard_nr, "after recv")
                    self.pipe.send([weights_copy, glob])
                    logger.debug("shard", self.shard_nr, "sent to ps")
                    wait_ps = True

            before_comm = time.time()
            if sending != 0:
                send_back = [agnr - 1 for agnr in from_list]
                if self.flags.eamsgd:
                    logger.state("eamsgd shard", flush=True)
                    weights_copy = ww
                else:
                    weights_copy = self.weights[binning]

                self.send_updates(send_back, "update weights", grad_update,
                                  weights_copy)

            if sending != 1:
                end_time = time.time()
                if self.timing:
                    self.times.append([
                        end_time - before_weights_time,
                        end_time - got_weights_time, end_time - before_comm
                    ])

            if wait_ps:
                while True:
                    a = self.pipe.recv()
                    if a == "beginning":
                        logger.debug("received beginning")
                    if a == "finished":
                        logger.debug("received finished")
                        self.pipe.send(self.training_metrics_log[-1])
                        logger.debug("sent training metrics")
                        wait_ps = False
                        break

            if not (grad_update < self.total_updates):
                break

        for i in range(0, self.bins):
            update_rule[i].close()
        if self.timing:
            t = np.average(self.times, axis=0)
            logger.state("Shard, total time:", t[0],
                         "without waiting for gradients:", t[1], "comm time: ",
                         t[2])
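The staleness bookkeeping in _run_shard reduces to grad_update - tag + 1 for every incoming gradient, where tag is the shard's update counter at the time the agent last pulled weights. A small illustration with made-up values, including the kind of scaling a staleness-aware update rule could apply:

import numpy as np

# Made-up example of the staleness bookkeeping in _run_shard.
grad_update = 10          # global update counter on the shard
tag_list = [10, 8, 5]     # update count each incoming gradient was computed against
from_list = [1, 2, 3]     # agent numbers (1-based)

staleness_list = [grad_update - tag + 1 for tag in tag_list]   # [1, 3, 6]
print(staleness_list)

# A staleness-aware rule could then down-weight stale gradients,
# e.g. by dividing each gradient by its staleness before averaging.
gradients = [np.ones(3), np.ones(3), np.ones(3)]
scaled = [g / s for g, s in zip(gradients, staleness_list)]
print(np.mean(scaled, axis=0))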
Ejemplo n.º 33
0
    def __del__(self):
        logger.debug("del Agent")
        if not self.orderly:
            os.system('pkill -TERM -P ' + str(self._ps_pid))
Ejemplo n.º 34
0
class Mail(object):
    """model for the Mail."""
    
    id_is_valid = staticmethod(lambda num: 0 < int(num) <= 1L << 31)
    
    def __init__(self, env, id=None, db=None, messageid=None, row=None):
        self.env = env
        self.db = db
        self.log = Logger(env)
        
        if id is not None:
            self.resource = Resource('mailarchive', str(id), None)
            self._fetch_mail(id)
        elif messageid is not None:
            self._fetch_mail_by_messageid(messageid)
            self.resource = Resource('mailarchive', self.id, None)
        elif row is not None:
            self._fetch_mail_by_row(row)
            self.resource = Resource('mailarchive', self.id, None)
        else:
            self.messageid = ''
            self.subject = ''
            self.utcdate = 0
            self.localdate = ''
            self.zoneoffset = 0
            self.body = ''
        
    def __eq__(self, other):
        if isinstance(other, Mail):
            return self.messageid == other.messageid
        return NotImplemented
        
    def _get_db(self):
        if self.db:
            return self.db
        else:
            return self.env.get_db_cnx()

    def _get_db_for_write(self):
        if self.db:
            return (self.db, False)
        else:
            return (self.env.get_db_cnx(), True)
        
    def get_sanitized_fromaddr(self):
        return self.fromaddr.replace('@',
                                     self.env.config.get('mailarchive',
                                                         'replaceat', '@'))
        
    def get_fromtext(self):
        return get_author(self.fromname, self.fromaddr) 
        
    def get_category(self):
        yearmonth = time.strftime("%Y%m", time.gmtime(self.utcdate))
        category = self.mlid + yearmonth
        return category.encode('utf-8')
        
    def get_plain_body(self):
        return self._sanitize(self.env, self.body)
    
    def get_html_body(self, req):
        
        # for HTML Mail
        if self.body.lstrip().startswith('<'):
            return Markup(self.body)
        
        contentlines = self.body.splitlines()
        htmllines = ['',]
        
        #customize!
        #http://d.hatena.ne.jp/ohgui/20090604/1244114483
        wikimode = req.args.get('wikimode', 'on')
        for line in contentlines:
            if self.env.config.get('mailarchive', 'wikiview', 'enabled') == 'enabled' and wikimode == 'on':
                htmllines.append(wiki_to_oneliner(line, self.env, self.db, False, False, req))
            else:
                htmllines.append(Markup(Markup().escape(line).replace(' ','&nbsp;')))
            
        content = Markup('<br/>').join(htmllines)
        return content
        
    def _sanitize(self, env, text):
        return text.replace('@', env.config.get('mailarchive', 'replaceat','_at_') )
    
    def _fetch_mail(self, id):
        row = None
        if self.id_is_valid(id):
            db = self._get_db()
            cursor = db.cursor()
            cursor.execute(SELECT_FROM_MAILARC + " WHERE id=%s", (id,))

            row = cursor.fetchone()
        if not row:
            raise ResourceNotFound('Mail %s does not exist.' % id,
                                   'Invalid Mail Number')

        self._fetch_mail_by_row(row)
    
    def _fetch_mail_by_messageid(self, messageid):
        row = None

        db = self._get_db()
        cursor = db.cursor()
        cursor.execute(SELECT_FROM_MAILARC + " WHERE messageid=%s",
                        (messageid,))

        row = cursor.fetchone()
        if not row:
            raise ResourceNotFound('Mail messageid %s does not exist.' % messageid,
                                   'Invalid Mail messageid Number')

        self._fetch_mail_by_row(row)
        
    def _fetch_mail_by_row(self, row):
        self.id = row[0]
        self.messageid = row[1]
        self.utcdate = row[2]
        self.zoneoffset = row[3]
        self.subject = row[4]
        self.fromname = row[5]
        self.fromaddr = row[6]
        self.header = row[7]
        self.body = row[8]
        self.thread_root = row[9]
        self.thread_parent = row[10]
        
        self.zone = self._to_zone(self.zoneoffset)
        self.localdate = self._to_localdate(self.utcdate, self.zoneoffset)
        
    def _to_localdate(self, utcdate, zoneoffset):
        return time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(utcdate + zoneoffset))

    def _to_zone(self, zoneoffset):
        #zone and date
        zone = ''
        if zoneoffset == '':
            zoneoffset = 0
        if zoneoffset > 0:
            zone = ' +' + time.strftime('%H%M', time.gmtime(zoneoffset))
        elif zoneoffset < 0:
            zone = ' -' + time.strftime('%H%M', time.gmtime(-1 * zoneoffset))
        return zone
                
    def get_href(self, req):
        return req.href.mailarchive(self.id)
    
    def get_subject(self):
        if is_empty(self.subject):
            return '(no subject)'
        else:
            return self.subject
    
    def get_senddate(self):
        return self.localdate + self.zone
    
    def get_thread_root(self):
        if self.thread_root == '':
            return self
        try:
            root_mail = Mail(self.env, messageid=self.thread_root)
        except ResourceNotFound:
            return self
        
        # In the original version, self.thread_root ends up pointing at the parent mail rather than the thread root.
        # For backwards compatibility, if it is not the actual root, run the logic below to look the root up ourselves.
        if root_mail.thread_root == '':
            return root_mail
        else:
            if self.thread_parent != '':
                root_id = MailFinder.find_root_id(self.env, self.messageid)
                return Mail(self.env, messageid=root_id)
            return root_mail
    
    def get_thread_parent_id(self):
        if self.thread_parent != '':
            return self.thread_parent.split(' ')[0]
        return None
    
    def get_thread_parent(self):
        if self.thread_parent != '':
            return Mail(self.env, db=self.db, messageid=self.get_thread_parent_id())
        return self
    
    def get_children(self, desc=False, cached_mails=None):
        if cached_mails:
            self.log.debug("[%s] mail's threads is cached." % self.id)
            return [x for x in cached_mails if x.get_thread_parent_id() == self.messageid]
            
        db = self._get_db()
        cursor = db.cursor()
        sql = SELECT_FROM_MAILARC + " WHERE threadparent LIKE %s ORDER BY utcdate"
        
        if desc:
            sql += " DESC"
        
        cursor.execute(sql, ('%s %%' % self.messageid,))
        
        children = []
        
        for row in cursor:
            child_mail = Mail(self.env, row=row, db=self.db)
            children.append(child_mail)
        return children
    
    def get_thread_mails(self, desc=False):
        root = self.get_thread_root()
        
        db = self._get_db()
        cursor = db.cursor()
        sql = SELECT_FROM_MAILARC + " WHERE threadroot = %s ORDER BY utcdate"
        
        if desc:
            sql += " DESC"
        
        cursor.execute(sql, (root.messageid,))
        mails = []
        for row in cursor:
            mails.append(Mail(self.env, row=row, db=self.db))
        return mails
    
    def has_children(self, cached_mails=None):
        rtn = len(self.get_children(cached_mails=cached_mails)) > 0
        return rtn 

    def get_related_tickets(self, req):
        db = self._get_db()
        return get_related_tickets(self.env, req, db, self.id)
    
    def has_attachments(self, req):
        attachment = MailArchiveAttachment(self.env, self.id)
        return attachment.has_attachments(req)

    def populate(self, author, msg, mlid):
        """Populate the mail with 'suitable' values from a message"""
        
        if 'message-id' not in msg:
            raise ValueError('Illegal Format Mail!')
        
        self.is_new_mail = False
        self.mlid = mlid

        self._parse_messageid(msg)
        self._parse_date(msg)
        self._parse_subject(msg)
        
        if msg.is_multipart():
            self._parse_multipart(author, msg)
        else:
            self._parse_body(msg)

        ref_messageid = self._parse_reference(msg)
        self._make_thread(ref_messageid)
        
    def update_or_save(self):
        if self.messageid is None or self.messageid == '':
            raise ValueError("Can't save mail to database.")
        
        db, has_tran = self._get_db_for_write()
        cursor = db.cursor()

        yearmonth = time.strftime("%Y%m", time.gmtime(self.utcdate))
        category = self.mlid + yearmonth
        cursor.execute("SELECT category, mlid, yearmonth, count FROM mailarc_category WHERE category=%s",
                        (category.encode('utf-8'),))
        row = cursor.fetchone()
        count = 0
        if row:
            count = row[3]
            pass
        else:
            cursor.execute("INSERT INTO mailarc_category (category, mlid, yearmonth, count) VALUES(%s, %s, %s, %s)",
                            (category.encode('utf-8'),
                             self.mlid.encode('utf-8'),
                             yearmonth,
                             0))
        if self.is_new_mail:
            count = count + 1
        cursor.execute("UPDATE mailarc_category SET count=%s WHERE category=%s",
            (count, category.encode('utf-8')))

        # insert or update mailarc

        #self.log.debug(
        #    "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)" %(str(id),
        #    category.encode('utf-8'),
        #    messageid,
        #     utcdate,
        #      zoneoffset,
        #     subject.encode('utf-8'), fromname.encode('utf-8'),
        #     fromaddr.encode('utf-8'),'','',
        #     thread_root,thread_parent))
        cursor.execute("DELETE FROM mailarc where messageid=%s",
                       (self.messageid,))

        cursor.execute("INSERT INTO mailarc ("
            "id, category, messageid, utcdate, zoneoffset, subject,"
            "fromname, fromaddr, header, text, threadroot, threadparent) "
            "VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            (str(self.id),
            category.encode('utf-8'),
            self.messageid,
            self.utcdate,
            self.zoneoffset,
            self.subject.encode('utf-8'), self.fromname.encode('utf-8'),
            self.fromaddr.encode('utf-8'), '', self.body.encode('utf-8'),
            self.thread_root, self.thread_parent))

        if has_tran:
            db.commit()

    def _parse_messageid(self, msg):
        self.messageid = msg['message-id'].strip('<>')

        #check messageid is unique
        self.log.debug("Creating new mailarc '%s'" % 'mailarc')
        
        db = self._get_db()
        cursor = db.cursor()
        cursor.execute("SELECT id from mailarc WHERE messageid=%s", (self.messageid,))
        row = cursor.fetchone()
        id = None
        if row:
            id = row[0]
            
        if id is None or id == "":
            # why? get_last_id return 0 at first.
            #id = db.get_last_id(cursor, 'mailarc')
            self.is_new_mail = True
            cursor.execute("SELECT Max(id)+1 as id from mailarc")
            row = cursor.fetchone()
            if row and row[0] != None:
                id = row[0]
            else:
                id = 1
        self.id = int(id) # Because id might be 'n.0', int() is called.

    def _parse_date(self, msg):
        # default values, used when the mail carries no Date header
        utcdate = 0
        zoneoffset = 0
        zone = ''
        if 'date' in msg:
            datetuple_tz = email.Utils.parsedate_tz(msg['date'])
            localdate = calendar.timegm(datetuple_tz[:9]) #toDB
            zoneoffset = datetuple_tz[9] # toDB
            utcdate = localdate - zoneoffset # toDB
            #make zone ( +HHMM or -HHMM
            zone = ''
            if zoneoffset > 0:
                zone = '+' + time.strftime('%H%M', time.gmtime(zoneoffset))
            elif zoneoffset < 0:
                zone = '-' + time.strftime('%H%M', time.gmtime(-1 * zoneoffset))
            #self.log.debug( time.strftime("%y/%m/%d %H:%M:%S %z",datetuple_tz[:9]))
            
            self.log.debug(time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(utcdate)))
            self.log.debug(time.strftime("%Y/%m/%d %H:%M:%S", time.gmtime(localdate)))
            self.log.debug(zone)
        
        fromname, fromaddr = email.Utils.parseaddr(msg['from'])
        
        self.fromname = self._decode_to_unicode(fromname)
        self.fromaddr = self._decode_to_unicode(fromaddr)
        self.zone = zone
        self.utcdate = utcdate
        self.zoneoffset = zoneoffset
        self.localdate = self._to_localdate(utcdate, zoneoffset)
        
        self.log.info('  ' + self.localdate + ' ' + zone +' '+ fromaddr)
        
    def _parse_subject(self, msg):
        if 'subject' in msg:
            self.subject = self._decode_to_unicode(msg['subject'])
            
    def _parse_reference(self, msg):
        # build the thread information
        ref_messageid = ''
        if 'in-reply-to' in msg:
            ref_messageid = ref_messageid + msg['In-Reply-To'] + ' '
            self.log.debug('In-Reply-To:%s' % ref_messageid )

        if 'references' in msg:
            ref_messageid = ref_messageid + msg['References'] + ' '

        m = re.findall(r'<(.+?)>', ref_messageid)
        ref_messageid = ''
        for text in m:
            ref_messageid = ref_messageid + "'%s'," % text
            
        ref_messageid = ref_messageid.strip(',')
        
        self.log.debug('RefMessage-ID:%s' % ref_messageid)
        
        return ref_messageid

    def _parse_multipart(self, author, msg):
        body = ''
        # delete all attachement at message-id
        Attachment.delete_all(self.env, 'mailarchive', self.id, self.db)

        file_counter = 1
        for part in msg.walk():
            content_type = part.get_content_type()
            self.log.debug('Content-Type:' + content_type)

            if content_type == 'multipart/mixed':
                pass
            
            elif content_type == 'text/html' and self._is_file(part) == False:
                if body != '':
                    body += "\n------------------------------\n\n"

                current_body = part.get_payload(decode=True)
                charset = part.get_content_charset()

                self.log.debug('charset:' + str(charset))
                # Todo:need try
                if charset != None:
                    body += self._to_unicode(current_body, charset)
                else:
                    body += current_body
                
            elif content_type == 'text/plain' and self._is_file(part) == False:
                #body = part.get_payload(decode=True)
                if body != '':
                    body += "\n------------------------------\n\n"
                    
                current_body = part.get_payload(decode=True)
                charset = part.get_content_charset()
                
                self.log.debug('charset:' + str(charset))
                # Todo:need try
                if charset != None:
                    #body = self._to_unicode(body, charset)
                    body += self._to_unicode(current_body, charset)
                else:
                    body += current_body
                
            elif part.get_payload(decode=True) == None:
                pass
            
            # file attachment
            else:
                self.log.debug(part.get_content_type())
                # get filename
                # Applications should really sanitize the given filename so that an
                # email message can't be used to overwrite important files
                
                filename = self._get_filename(part)
                if not filename:
                    import mimetypes
                    
                    ext = mimetypes.guess_extension(part.get_content_type())
                    if not ext:
                        # Use a generic bag-of-bits extension
                        ext = '.bin'
                    filename = 'part-%03d%s' % (file_counter, ext)
                    file_counter += 1

                self.log.debug("filename:" + filename.encode(OUTPUT_ENCODING))

                # make attachment
                tmp = os.tmpfile()
                tempsize = len(part.get_payload(decode=True))
                tmp.write(part.get_payload(decode=True))

                tmp.flush()
                tmp.seek(0,0)

                attachment = Attachment(self.env, 'mailarchive', self.id)

                attachment.description = '' # req.args.get('description', '')
                attachment.author = author #req.args.get('author', '')
                attachment.ipnr = '127.0.0.1'

                try:
                    attachment.insert(filename,
                            tmp, tempsize, None, self.db)
                except Exception, e:
                    try:
                        ext = filename.split('.')[-1]
                        if ext == filename:
                            ext = '.bin'
                        else:
                            ext = '.' + ext
                        original_filename = filename
                        filename = 'part-%03d%s' % (file_counter, ext)
                        file_counter += 1
                        attachment.description += ', Original FileName: %s' % original_filename
                        attachment.insert(filename,
                                tmp, tempsize, None, self.db)
                        self.log.warn('As the name is too long, the attached file was renamed to: ' + filename)

                    except Exception, e:
                        self.log.error('Exception at attach file of Message-ID:' + self.messageid)
                        traceback.print_exc(e)

                tmp.close()
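The branching in _parse_multipart() is easier to see in isolation. A standalone Python 3 sketch of the same walk-and-classify pattern, independent of Trac and using only the standard email package (the message contents are made up):

from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
from email.mime.application import MIMEApplication

msg = MIMEMultipart()
msg.attach(MIMEText("hello body", "plain", "utf-8"))
att = MIMEApplication(b"\x00\x01", Name="data.bin")
att["Content-Disposition"] = 'attachment; filename="data.bin"'
msg.attach(att)

body = ""
for part in msg.walk():
    ctype = part.get_content_type()
    if ctype == "multipart/mixed":
        continue                                   # container, nothing to extract
    if ctype == "text/plain" and part.get_filename() is None:
        charset = part.get_content_charset() or "utf-8"
        body += part.get_payload(decode=True).decode(charset, "replace")
    elif part.get_payload(decode=True) is not None:
        print("attachment:", part.get_filename())  # would become an Attachment row
print(body)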
Ejemplo n.º 35
0
def stop():
    Logger.debug("Cancelling all future events for Phase Five.")
Ejemplo n.º 36
0
    def __del__(self):
        logger.debug("del Shard", self.shard_nr)
        if not self.orderly:
            os.system('pkill -TERM -P ' + str(self._ps_pid))