Example 1
def split_dataset(dataset, labelname='label', test_ratio=0.3):
    """
    将输入dataset分解为训练集、测试集
    :param dataset: (N,M) DataFrame
    :param labelname: String dataset中label列的名称
    :param test_ratio: 测试集占比
    :return: 训练集数据、训练集label、测试集数据、测试集label
    """
    logging('Dataset: ', dataset.shape)
    """ 调整标签列的位置,方便以后数据切割 """
    labels = dataset.pop(labelname)
    dataset.insert(dataset.shape[1], labelname, labels)

    pos_data = shuffle(dataset[dataset[labelname] == 1])
    neg_data = shuffle(dataset[dataset[labelname] == 0])

    cut = int(pos_data.shape[0] * test_ratio)
    """ 训练集使用与原始数据一致的正负样本比例 """
    n_neg_train = neg_data.shape[0] / pos_data.shape[0] * pos_data.iloc[cut:].shape[0]

    test_data = shuffle(pd.concat((pos_data.iloc[:cut], neg_data.iloc[:cut])))
    train_data = shuffle(pd.concat((pos_data.iloc[cut:], neg_data.iloc[-n_neg_train:])))

    # test_data.to_csv(fn_raw_test, index=False)
    # train_data.to_csv(fn_raw_train, index=False)

    logging('Train set: ', train_data.shape, 'Test set: ', test_data.shape)
    return train_data.iloc[:, :-1], train_data.iloc[:, -1], test_data.iloc[:, :-1], test_data.iloc[:, -1]
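A minimal usage sketch of split_dataset. The toy DataFrame and column values below are hypothetical, shuffle is assumed to be sklearn.utils.shuffle, and a stand-in is provided for the project's logging helper:

import pandas as pd
from sklearn.utils import shuffle  # assumed to be the shuffle() used above

logging = print  # stand-in for the project's logging helper called inside split_dataset

# Hypothetical toy frame: two feature columns plus a binary 'label' column.
df = pd.DataFrame({
    'f1': range(100),
    'f2': range(100, 200),
    'label': [1 if i % 4 == 0 else 0 for i in range(100)],
})
X_train, y_train, X_test, y_test = split_dataset(df, labelname='label', test_ratio=0.3)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)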
Example 2
def check_config(filename):
    """Check configuration file of backup2swift

    Argument:

        filename: config file path (default is ~/.bu2sw.conf)
    """
    try:
        conf = configparser.SafeConfigParser(allow_no_value=False)
    except TypeError as error:
        msg = "__init__() got an unexpected keyword argument 'allow_no_value'"
        if str(error) == msg:
            # for SafeConfigParser on Python 2.6, which lacks allow_no_value
            conf = configparser.SafeConfigParser()
        else:
            utils.logging(3, error)
    conf.read(filename)
    try:
        auth_url = conf.get('swift', 'auth_url')
        username = conf.get('swift', 'username')
        password = conf.get('swift', 'password')
        rotate_limit = conf.get('backup', 'rotate_limit')
    except (configparser.NoSectionError, configparser.NoOptionError) as error:
        # syslog.ERR is 3
        utils.logging(3, error)
    try:
        if conf.get('swift', 'ignore_verify_ssl_certification') == 'True':
            verify = False
        else:
            verify = True
    except (configparser.NoSectionError, configparser.NoOptionError):
        verify = True
    return auth_url, username, password, rotate_limit, verify
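A sketch of the configuration layout check_config reads. The section and option names come from the conf.get calls above; the values and the file path are placeholders:

# Hypothetical ~/.bu2sw.conf contents, written out here only to exercise check_config.
sample_conf = """
[swift]
auth_url = https://swift.example.org/auth/v1.0
username = backup_user
password = secret
ignore_verify_ssl_certification = False

[backup]
rotate_limit = 10
"""
with open('/tmp/bu2sw_test.conf', 'w') as f:
    f.write(sample_conf)
# auth_url, username, password, rotate_limit, verify = check_config('/tmp/bu2sw_test.conf')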
Example 3
def experiment(FLAGS, perfmeasure, deepmethod,
               setting_no):  # 5-fold cross validation + test
    # Input
    # XD: [drugs, features] sized array (features may also be similarities with other drugs)
    # XT: [targets, features] sized array (features may also be similarities with other targets)
    # Y: interaction values, can be real values or binary (+1, -1); insert float("nan") for unknown entries
    # perfmeasure: function that takes a list of correct and predicted outputs and returns a performance score;
    #   higher values should be better, so if using error measures pass e.g. the inverse -error(Y, P)
    # foldcount: number of cross-validation folds for settings 1-3; setting 4 always runs 3x3 cross-validation

    dataset = DataSet(
        fpath=FLAGS.dataset_path,  # TODO: also update this in the args
        setting_no=FLAGS.setting_no,  # TODO: add this to the args
        seqlen=FLAGS.max_seq_len,
        smilen=FLAGS.max_smi_len,
        need_shuffle=False)
    # set character set size
    FLAGS.charseqset_size = dataset.charseqset_size
    FLAGS.charsmiset_size = dataset.charsmiset_size

    data_6_folds = dataset.read_dataset(fpath=FLAGS.dataset_path,
                                        setting_no=setting_no)

    if not os.path.exists(FLAGS.fig_dir):
        os.makedirs(FLAGS.fig_dir)

    print(FLAGS.log_dir)
    S1_avgperf, S1_avgloss, S1_teststd = nfold_1_2_3_setting_sample(
        data_6_folds, perfmeasure, deepmethod, FLAGS, dataset)

    logging("Setting Negative Sample" + str(FLAGS.setting_no), FLAGS)
    logging(
        "avg_perf = %.5f,  avg_mse = %.5f, std = %.5f" %
        (S1_avgperf, S1_avgloss, S1_teststd), FLAGS)
Example 4
    def _segment(self, features, labels):
        """
        对所有特征的值域进行分段
        :param features: (N,M)
        :param labels: (N,1)
        :return: 经过排序的区间信息熵字典: key--区间, value--信息熵
        """
        assert isinstance(features, pd.core.series.Series), 'Features is not the instance of Series.'
        assert isinstance(labels, pd.core.series.Series), 'Labels is not the instance of Series.'
        assert features.shape[0] == labels.shape[0], 'The dimensions of features and label are unequal.'

        self.nfeature = features.shape[0]
        self.mincount = features.shape[0] * self.examplelimit  # minimum samples per interval: at least the fraction self.examplelimit of all samples
        self.pos_data = features[labels == 1].sort_values()
        self.neg_data = features[labels == 0].sort_values()
        sum_entropy = entropy(self.pos_data, self.neg_data)
        min_entropy = sum_entropy * self.entropylimit
        # interval-entropy dict: key -- interval, value -- entropy
        self.seg_ents = {(features.min(), features.max() + 1): sum_entropy}
        while len(self.seg_ents) < self.nbin:
            maxent = max(self.seg_ents.items(), key=lambda x: x[1])
            if maxent[1] < min_entropy: break
            interval = maxent[0]  # keep splitting the interval with the highest entropy
            pos_parts = self.pos_data[logical_and(self.pos_data >= interval[0], self.pos_data < interval[1])]
            neg_parts = self.neg_data[logical_and(self.neg_data >= interval[0], self.neg_data < interval[1])]
            # If either class has at most one distinct feature value in this interval, splitting it further is pointless.
            if pos_parts.unique().shape[0] <= 1 or neg_parts.unique().shape[0] <= 1:
                self.seg_ents[interval] -= 1.0  # lower its entropy so it is not picked again
            else:
                self._split_intervals(pos_parts, neg_parts, interval)
        logging(sorted(self.seg_ents.items(), key=lambda d: d[0]))

        return sorted(self.seg_ents.keys())
Example 5
def test():
    def truths_length(truths):
        for i in range(50):
            if truths[i][1] == 0:
                return i
        return 50

    model.eval()
    num_classes = model.num_classes
    total = 0.0
    proposals = 0.0
    correct = 0.0
    device = torch.device("cuda" if use_cuda else "cpu")

    if model.net_name() == 'region':  # region_layer
        shape = (0, 0)
    else:
        shape = (model.width, model.height)
    for data, target, org_w, org_h in test_loader:
        print("======")
        data = data.to(device)
        output = model(data)
        all_boxes = get_all_boxes(output,
                                  shape,
                                  conf_thresh,
                                  num_classes,
                                  use_cuda=use_cuda)

        for k in range(len(all_boxes)):
            boxes = all_boxes[k]
            correct_yolo_boxes(boxes, org_w[k], org_h[k], model.width,
                               model.height)
            boxes = np.array(nms(boxes, nms_thresh))
            truths = target[k].view(-1, 5)
            num_gts = truths_length(truths)
            total = total + num_gts
            num_pred = len(boxes)
            if num_pred == 0:
                continue

            proposals += int((boxes[:, 4] > conf_thresh).sum())
            for i in range(num_gts):
                gt_boxes = torch.FloatTensor([
                    truths[i][1], truths[i][2], truths[i][3], truths[i][4],
                    1.0, 1.0, truths[i][0]
                ])
                gt_boxes = gt_boxes.repeat(num_pred, 1).t()
                pred_boxes = torch.FloatTensor(boxes).t()
                best_iou, best_j = torch.max(
                    multi_bbox_ious(gt_boxes, pred_boxes, x1y1x2y2=False), 0)
                # pred_boxes and gt_boxes are transposed for torch.max
                if best_iou > iou_thresh and pred_boxes[6][best_j] == gt_boxes[
                        6][0]:
                    correct += 1

    precision = 1.0 * correct / (proposals + eps)
    recall = 1.0 * correct / (total + eps)
    fscore = 2.0 * precision * recall / (precision + recall + eps)
    logging("correct: %d, precision: %f, recall: %f, fscore: %f" %
            (correct, precision, recall, fscore))
Example 6
def main():
    try:
        args = parse_options()
        args.func(args)
    except RuntimeError as error:
        # syslog.ERR is 3
        utils.logging(3, error)
Example 8
def main():
    # Validation parameters
    conf_thresh = FLAGS.conf_threshold
    nms_thresh = FLAGS.nms_threshold
    iou_thresh = FLAGS.iou_threshold

    # output file
    out_path = FLAGS.out_path

    # Training settings
    datacfg = FLAGS.data
    cfgfile = FLAGS.config

    data_options = read_data_cfg(datacfg)
    file_list = data_options['valid']
    gpus = data_options['gpus']  # e.g. 0,1,2,3
    ngpus = len(gpus.split(','))

    num_workers = int(data_options['num_workers'])
    # batch size for testing is taken from the command-line flags
    batch_size = FLAGS.batch_size

    global use_cuda
    use_cuda = torch.cuda.is_available() and use_cuda

    ###############
    torch.manual_seed(seed)
    if use_cuda:
        os.environ['CUDA_VISIBLE_DEVICES'] = gpus
        torch.cuda.manual_seed(seed)

    global model
    model = Darknet(cfgfile)
    # model.print_network()

    init_width = model.width
    init_height = model.height

    kwargs = {'num_workers': num_workers, 'pin_memory': True} if use_cuda else {}

    val_loader = torch.utils.data.DataLoader(
        dataset.listDataset(file_list, shape=(init_width, init_height),
                            shuffle=False, jitter=False,
                            transform=transforms.Compose([
                                transforms.ToTensor(),
                            ]), validate=True),
        batch_size=batch_size, shuffle=False, **kwargs)

    if use_cuda:
        if ngpus > 1:
            model = torch.nn.DataParallel(model)
            model = model.module
    model = model.to(torch.device("cuda" if use_cuda else "cpu"))
    for w in FLAGS.weights:
        # model.load_weights(w)
        checkpoint = torch.load(w)
        model.load_state_dict(checkpoint['model_state_dict'])
        logging('evaluating ... %s' % (w))
        test(val_loader, conf_thresh, nms_thresh, iou_thresh, out_path, batch_size)
Example 9
 def askParameter(request, answer, paramName):
     '''ask for a parameter, check its correctness and log the message'''
     ch = False
     while not ch:
         par = raw_input(request)
         ch = checkInput[paramName](par)
     logging(answer + " " + par)
     return par
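A minimal sketch of the checkInput lookup this helper assumes: a dict mapping parameter names to validator callables. The parameter names and validators below are hypothetical stand-ins:

# Hypothetical validators for askParameter; the real ones live elsewhere in the tool.
checkInput = {
    'port': lambda s: s.isdigit() and 0 < int(s) < 65536,
    'host': lambda s: len(s.strip()) > 0,
}
# port = askParameter("Target web app port: ", "Port set to", 'port')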
Example 10
def call_bowtie(input_ref, out_pref, log):
    query = ["bowtie2-build", "-f", input_ref, out_pref]
    utils.logging("[INFO] bowtie2-build is called as: %s" % (" ".join(query)),
                  args)
    # build the index from the transformed genome, writing bowtie2-build's output to the log file
    f = open(log, 'w')
    p = subprocess.Popen(query, stdout=f)
    p.communicate()
    f.close()
Example 11
def main():
    # Training settings
    datacfg = FLAGS.data
    cfgfile = FLAGS.config

    data_options = read_data_cfg(datacfg)
    testlist = data_options['valid']
    gpus = data_options['gpus']  # e.g. 0,1,2,3
    ngpus = len(gpus.split(','))

    num_workers = int(data_options['num_workers'])
    # for testing, batch_size is set to 1 (one)
    batch_size = 1  # int(net_options['batch'])

    global use_cuda
    use_cuda = torch.cuda.is_available() and (True if use_cuda is None else
                                              use_cuda)

    ###############
    torch.manual_seed(seed)
    if use_cuda:
        os.environ['CUDA_VISIBLE_DEVICES'] = gpus
        torch.cuda.manual_seed(seed)

    global model
    model = Darknet(cfgfile)
    #model.print_network()

    init_width = model.width
    init_height = model.height

    kwargs = {
        'num_workers': num_workers,
        'pin_memory': True
    } if use_cuda else {}

    global test_loader
    test_loader = torch.utils.data.DataLoader(dataset.listDataset(
        testlist,
        shape=(init_width, init_height),
        shuffle=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
        ]),
        train=False),
                                              batch_size=batch_size,
                                              shuffle=False,
                                              **kwargs)

    if use_cuda:
        if ngpus > 1:
            model = torch.nn.DataParallel(model)
            model = model.module
    model = model.to(torch.device("cuda" if use_cuda else "cpu"))
    for w in FLAGS.weights:
        model.load_weights(w)
        logging('evaluating ... %s' % (w))
        test()
Example 12
def readData(config):
    file_path = config['raw_data']
    encoding = config['raw_encoding']
    with open(file_path, 'r', encoding=encoding) as f:
        raw_data = csv.reader(f, delimiter='\t')
        raw_data = list(raw_data)

    logging('Sample of raw data', raw_data[0:3])
    return raw_data
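A sketch of the config dict readData expects; the keys match the lookups above, while the file path and encoding values are hypothetical placeholders:

# Hypothetical configuration for readData.
config = {
    'raw_data': 'data/SMSSpamCollection.tsv',
    'raw_encoding': 'utf-8',
}
# raw_data = readData(config)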
Example 13
 def _post(self, url, data={}, headers=None, referer=u''):
     logging(u'<POST> "{}"'.format(url))
     if headers:
         self.session.headers.update(headers)
     if referer:
         self.session.headers.update({ u'Referer': referer })
     res = self.session.post(url, data=data)
     logging(u'<POST> "{}" - {}'.format(url, res.status_code))
     return res
Example 14
 def _get(self, url, params={}, headers=None, referer=u''):
     logging(u'<GET> "{}"'.format(url))
     if headers:
         self.session.headers.update(headers)
     if referer:
         self.session.headers.update({ u'Referer': referer })
     res = self.session.get(url, params=params)
     logging(u'<GET> "{}" - {}'.format(url, res.status_code))
     return res
Example 15
 def __init__(self):
     logging(u'Starting new manta session . . .')
     self.session = requests.Session()
     self.session.proxies = cfg.proxies
     self.session.headers = cfg.base_headers
     self._getMyIp()
     self._testSessionProxy()
     logging(u'Initializing request session . . .')
     self._buildCookiedSession(cfg.manta_base_url)
Example 16
def cleanData(raw_data):
    sms_text = []
    sms_label = []
    for line in raw_data:
        sms_text.append(" ".join(preprocess(line[1])))
        sms_label.append(line[0])
    logging('Sample of preprocessed text', sms_text[0:3])
    logging('Sample of preprocessed labels', sms_label[0:3])
    return sms_text, sms_label
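A usage sketch with two hypothetical (label, text) rows; preprocess is assumed to return a list of tokens, so a trivial whitespace tokenizer stands in for the project's version here:

# Stand-in for the real preprocess(); the project's version presumably does more cleaning.
def preprocess(text):
    return text.lower().split()

raw_data = [
    ('ham', 'Ok lar... Joking wif u oni'),
    ('spam', 'WINNER!! You have been selected to receive a prize'),
]
# sms_text, sms_label = cleanData(raw_data)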
Example 17
def align(sc, args):
    import utils as g_utils
    import align_utils as a_utils

    ## broadcast raw reference
    ref_file = os.path.join(args.tempbase, "ref.fa")
    g_utils.read_hdfs(os.path.join(args.ref, "raw.fa"), ref_file)
    ref_dict = {}
    for chrid, seq in g_utils.read_fasta(ref_file):
        ref_dict[chrid] = (seq, len(seq))

    g_utils.logging("[DEBUG] loading reference done", args)
    bc_refdict = sc.broadcast(ref_dict)

    ## read from hadoop
    readRDD = sc.textFile( args.input ) \
                .map( lambda x: g_utils.line2kv( x))

    if args.testmode == "balancing":
        readRDD = readRDD.partitionBy(args.nodes)

    readRDD = readRDD.cache()

    ## transform and get result of bowtie
    c2tTransRDD = readRDD.mapValues(lambda x: (x[0].translate(
        g_utils.make_trans_with("W", "C", "T")), x[1]))
    c2tMapRDD = c2tTransRDD.mapPartitionsWithIndex(
        lambda i, ptn: a_utils.mapping(i, "C2T", ["W_C2T", "C_C2T"], ptn, args
                                       ))

    g2aTransRDD = readRDD.mapValues(lambda x: (x[0].translate(
        g_utils.make_trans_with("W", "G", "A")), x[1]))
    g2aMapRDD = g2aTransRDD.mapPartitionsWithIndex(
        lambda i, ptn: a_utils.mapping(i, "G2A", ["W_G2A", "C_G2A"], ptn, args
                                       ))

    mergedRDD = sc.union([readRDD, c2tMapRDD, g2aMapRDD])
    combRDD = mergedRDD.combineByKey( lambda v: [v],\
                                      lambda lst, v: lst + [v],\
                                      lambda l1, l2: l1 + l2 )
    filteredRDD = combRDD.mapValues( lambda x: a_utils.select_and_find_uniq_alignment( x))\
                          .filter( lambda (k, v): v is not None )
    # .filter( lambda (k, v): not (v is None))

    if args.testmode == "balancing":
        filteredRDD = filteredRDD.partitionBy(args.nodes)


    methylRDD = filteredRDD.map( lambda x: a_utils.calc_methyl(x, bc_refdict.value, args.num_mm) )\
                            .filter( lambda x: x is not None )

    result_path = os.path.join(args.output, "alignment")
    methylRDD.map(lambda x: a_utils.res_to_string(x)).saveAsTextFile(
        result_path)

    return result_path
Example 18
 def _validate_domains(self, url):
     parsed = urlparse(url)
     domain = u'.'.join(parsed.netloc.split(u'.')[-2:])
     if any(
             map(lambda ad: domain.lower() == ad.lower(),
                 self.allowed_domains)):
         return True
     else:
         logging(u'"{}" is not an allowed domain.'.format(domain))
         return False
Example 19
def test():
    def truths_length(truths):
        for i in range(50):
            if truths[i][1] == 0:
                return i
        return 50

    model.eval()
    num_classes = model.num_classes
    print("num", num_classes)
    total = 0.0
    proposals = 0.0
    correct = 0.0
    device = torch.device("cuda" if use_cuda else "cpu")

    for _, (data, target) in enumerate(test_loader):
        data = data.to(device)
        output = model(data)
        all_boxes = get_all_boxes(output, conf_thresh, num_classes)

        for k in range(data.size(0)):
            boxes = all_boxes[k]
            boxes = np.array(nms(boxes, nms_thresh))
            truths = target[k].view(-1, 5)
            num_gts = truths_length(truths)
            total = total + num_gts
            num_pred = len(boxes)

            if num_pred == 0:
                continue

            proposals += int((boxes[:, 4] > conf_thresh).sum())
            for i in range(num_gts):
                gt_boxes = torch.FloatTensor([
                    truths[i][1], truths[i][2], truths[i][3], truths[i][4],
                    1.0, 1.0, truths[i][0]
                ])
                gt_boxes = gt_boxes.repeat(num_pred, 1).t()
                pred_boxes = torch.FloatTensor(boxes).t()
                best_iou, best_j = torch.max(
                    multi_bbox_ious(gt_boxes, pred_boxes, x1y1x2y2=False), 0)
                # pred_boxes and gt_boxes are transposed for torch.max
                if best_iou > iou_thresh and pred_boxes[6][best_j] == gt_boxes[
                        6][0]:
                    correct += 1

    precision = 1.0 * correct / (proposals + eps)
    recall = 1.0 * correct / (total + eps)
    fscore = 2.0 * precision * recall / (precision + recall + eps)
    logging("precision: %f, recall: %f, fscore: %f" %
            (precision, recall, fscore))
Example 20
def build_index(args):
    i_file = args.input

    utils.logging("[INFO] Start downloading reference file.", args)
    tempbase = utils.gen_file()
    utils.mkdir(tempbase)
    reffile = os.path.join(tempbase, "raw.fa")
    utils.read_hdfs(i_file, reffile)

    tempfiles = [
        open(os.path.join(tempbase, "%s.fa" % m), 'w') for m in conv_way
    ]

    utils.logging("[INFO] Start transforming reference file.", args)
    # read ref
    for chrid, seq in utils.read_fasta(reffile):
        for i, method in enumerate(conv_way):
            (strand, a_from, a_to) = (method[0], method[2], method[4])

            if strand == "W":
                tempfiles[i].write(
                    ">%s\n%s\n" %
                    (chrid,
                     seq.translate(utils.make_trans_with(strand, a_from,
                                                         a_to))))
            else:
                tempfiles[i].write(
                    ">%s\n%s\n" %
                    (chrid,
                     seq.translate(utils.make_trans_with(strand, a_from,
                                                         a_to))[::-1]))

    # close all files
    for i, method in enumerate(conv_way):
        tempfiles[i].close()

    utils.logging("[INFO] Start launching bowtie2-build.", args)
    # run bowtie jobs
    procs = []

    utils.mkdir(os.path.join(tempbase, "index"))
    for i, method in enumerate(conv_way):
        out_pref = os.path.join(tempbase, "index", method)
        build_log = out_pref + ".build.log"

        proc = Process(target=call_bowtie,
                       args=(
                           tempfiles[i].name,
                           out_pref,
                           build_log,
                       ))
        procs.append(proc)
        proc.start()

    for proc in procs:
        proc.join()

    utils.logging("[INFO] Start uploading index file.", args)
    # move to hdfs
    utils.copy_to_hdfs(tempbase, args.output, remove_original=True)
Example 21
def train_epoch(epoch, train_loader, config, writer=None):
    global processed_batches
    t0 = time.time()
    logging('epoch %d, processed %d samples, lr %f' %
            (epoch, epoch * len(train_loader.dataset), config.learning_rate))

    model.train()
    processed_batches = 0
    correct, total = 0, 0
    f = open(config.logFile, 'a')

    for batch_idx, (X_batch, Y_batch, NX_batch) in enumerate(train_loader):
        processed_batches = processed_batches + 1
        X_batch, Y_batch, NX_batch = X_batch.cuda().squeeze(
            0), Y_batch.cuda().squeeze(0), NX_batch.cuda().squeeze(0)

        optimizer.zero_grad()

        output = model.forward(X_batch, NX_batch)
        # if len(output.shape) == 3:
        #     output = output.reshape(config.batch_size*config.num_nodes, -1)
        #     Y_batch = Y_batch.reshape(config.batch_size * config.num_nodes)
        loss = nn.CrossEntropyLoss()(output, Y_batch)

        pred = torch.argmax(output, dim=1)
        correct += torch.sum(pred.eq(Y_batch))
        total += output.shape[0]
        acc = np.array(correct.cpu()) / total

        print('epoch: %d, processed_batches: %d, loss: %f' %
              (epoch, processed_batches, loss.item()))
        print('acc:', acc)

        f.write('%0.6f' % (loss.item()) + ' ' + '%0.6f' % (acc.item()) + '\n')
        loss.backward()
        optimizer.step()

    t1 = time.time()
    logging('training with %f samples/s' % (len(train_loader.dataset) /
                                            (t1 - t0)))
    f.close()
    if (epoch + 1) % config.save_interval == 0:
        torch.save(
            {
                'epoch': epoch,
                'seen': processed_batches,
                'state_dict': model.state_dict()
            }, '%s/%06d.pkl' % ('backup', int(epoch / 12)))

    print("done")
Example 22
class Session(object):

    _ip = None

    def __init__(self):
        logging(u'Starting new manta session . . .')
        self.session = requests.Session()
        self.session.proxies = cfg.proxies
        self.session.headers = cfg.base_headers
        self._getMyIp()
        self._testSessionProxy()
        logging(u'Initializing request session . . .')
        self._buildCookiedSession(cfg.manta_base_url)

    def _getMyIp(self):
        self._ip = json.load(urlopen(cfg.ip_url))[u'origin']
        logging(u'Current Public IP: {}'.format(self._ip))

    def _testSessionProxy(self):
        logging(u'Testing proxies . . .')
        try:
            r = self.session.get(cfg.ip_url, timeout=5)
            r.raise_for_status()
            r_json = json.loads(r.content)
            if self._ip in r_json[u'origin']:
                raise ValueError(u'Proxied IP same as original IP')
        except Exception, e:
            logging(u'Proxy test failed: {}'.format(unicode(e)))
            logging(u'Exiting process . . .')
            exit()
        logging(u'Proxy test passed! Current proxied IP: {}'.format(r_json[u'origin']))
Example 23
def train(model, network_input, network_output, X_train, X_test, y_train, y_test, results_dir):
    callbacks_list = utils.model_callbacks(results_dir)
    utils.logging('Loaded model callbacks')

    utils.save_model_to_json(model, results_dir)
    utils.logging('Model saved to file: {}/{}'.format(results_dir, 'model.json'))

    history = model.fit(network_input, network_output,
                        validation_data=(X_test, y_test),
                        validation_split=0.33,
                        epochs=200,
                        batch_size=64,
                        callbacks=callbacks_list,
                        verbose=1,
                        )

    utils.generate_final_plots(history, results_dir)
Example 24
def get_eval(lr=0.01, n_episodes=50, is_train=False, savefig=False):
    # mkdir
    print('qlearning_nn evaluating...')
    base_dir = './results/qlearning_nn'
    if not os.path.exists(base_dir):
        os.makedirs(base_dir)

    log_file = os.path.join(base_dir, 'qlearning_nn.log')
    logger = logging(log_file)
    results_file = os.path.join(base_dir, 'qlearning_nn.csv')
    if os.path.exists(results_file) and not is_train and not savefig:
        results = pd.read_csv(results_file)
        results = results.sort_values(by=['noisy', 'problem_id'])
        return results
    else:
        if os.path.exists(results_file):
            os.remove(results_file)
        if os.path.exists(log_file):
            os.remove(log_file)
        pkl_file = os.path.join(
            base_dir,
            'qlearning_nn_lr={}_episodes={}.pkl'.format(lr, n_episodes))
        if os.path.exists(pkl_file):
            q_learning_nn = pickle.load(open(pkl_file, 'rb'))
        else:
            q_learning_nn = train(lr=lr, n_episodes=n_episodes)
    # eval
    results = pd.DataFrame([],
                           columns=[
                               'problem_id', 'noisy', 'action',
                               'Total_rewards', 'avg_reward_per_action'
                           ])
    for problem_id, noisy, env in get_env():
        states, rewards, actions = implement(env,
                                             q_learning_nn,
                                             1,
                                             discount_factor=0.95)
        result = {
            'problem_id': problem_id,
            'noisy': noisy,
            'Total_rewards': sum(rewards),
            'avg_reward_per_action': sum(rewards) / len(actions)
        }
        results = results.append(pd.DataFrame(result, index=[0]),
                                 ignore_index=0)
        logger(' ' + str(result))
        logger(actions)
        if savefig:
            get_fig(states, rewards)
            pic_name = os.path.join(
                base_dir,
                'problem_id={} noisy={}.jpg'.format(problem_id, noisy))
            plt.savefig(dpi=300, fname=pic_name)
            plt.close()
        env.close()
    results = results.sort_values(by=['noisy', 'problem_id'])
    results.to_csv(results_file, index=0)
    return results
Example 25
 def stats(self):
     delta = self._endTime - self._startTime
     logging(u'Crawling finished.')
     logging(
         unicode('This Run:\n'
                 '{space}[Start At]\t{startTime}\n'
                 '{space}[Finish At]\t{endTime}\n'
                 '{space}[Total Runtime]\t{seconds} Seconds\n'
                 '{space}[Success]\t{success}\n'
                 '{space}[Failed]\t{failed}\n'
                 '{space}[NG Domain]\t{ngad}').
         format(space=u' ' * 26,
                success=self._success,
                failed=self._failed,
                ngad=self._ngad,
                startTime=self._startTime.strftime(u'%Y-%m-%d %H:%M:%S %Z'),
                endTime=self._endTime.strftime(u'%Y-%m-%d %H:%M:%S %Z'),
                seconds=delta.total_seconds()))
Example 26
def crawl(ids):
    browser = open_browser()
    for i in tqdm(ids):
        href = df.stock_href[i]
        if not isinstance(href, str):
            continue
        file_id = href.replace('?', '').replace('=', '')
        if  file_id in files:
            print(file_id, 'already downloaded \n')
            continue
        if 'quote' not in href:
            continue
        print(i, file_id)
        
        url = 'https://finance.yahoo.com' + href
        try:
            browser.get(url)
        except Exception as e:
            print(e)
            browser.quit()
            browser = open_browser()
            browser.get(url)
        while "无法访问" in browser.page_source or '未连接' in browser.page_source or '该网页无法正常运作' in browser.page_source:
            try:
                browser.quit()
                browser = open_browser()
                browser.get(url)
                e = 0
            except Exception as e:
                print(e)
                browser.quit()
                browser = open_browser()
                browser.get(url)
                
            
        #time.sleep(30+random.randint(0,10))
        html = browser.page_source.encode('utf-8')
        browser.execute_script("window.scrollTo(500,2000)")
        time.sleep(SLEEP+random.randint(0,10))
        
        filename = PATH + file_id + '.html'
        save_page(html, filename)
        logging(PATH, filename, '200')
        time.sleep(SLEEP+random.randint(0,10))
Example 27
    def _buildCookiedSession(self, base_url, referer=u''):
        parsed = urlparse(base_url)

        # First request
        logging(u'Build Cookies Request #1:')
        res = self._get(parsed.geturl(), headers={
            u'Accept': u'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            u'Upgrade-Insecure-Requests': u'1',
            u'Host': parsed.netloc,
        }, referer=referer)
        self.lastReferer = res.url
        # print res.content + '\n'

        # Second request to get js content
        JS = re.search(r'src\=\"\/(ser\-.*\.js)\"', res.content).group(1)
        url_2 = ParseResult(parsed.scheme, parsed.netloc, JS, u'', u'', u'').geturl()
        time.sleep(random.random())
        logging(u'Build Cookies Request #2:')
        res = self._get(url_2, referer=self.lastReferer)
        # print res.content + '\n'

        # Third request to post js to get cookies
        PID = re.search(r'FingerprintWrapper\(\{path\:\"\/.*?\?(PID\=.*?)\"\,', res.content).group(1)
        AJAX = re.search(r'FingerprintWrapper.*?ajax_header\:\"(.*?)\"\,interval', res.content).group(1)
        url_3 = ParseResult(parsed.scheme, parsed.netloc, JS, '', PID, '').geturl()
        time.sleep(random.random())
        logging(u'Build Cookies Request #3:')
        res = self._post(url_3, data={ u'p': proof(cfg.p) }, headers={ u'Accept': u'*/*', u'X-Distil-Ajax': AJAX }, referer=self.lastReferer)
        # print res.__dict__
        return res
Example 28
def eval(args):
    if args.is_log:
        file_name = os.path.basename(__file__)
        output_path = logging(file_name, verbose=2)
        
    user_embs, movie_embs, movie_cate_sim, Tr, Te = load_data(args)
    hidden_dim, num_movies = movie_embs.shape
    nor_embs = movie_embs.T.copy()
    for i in range(num_movies):
        nor_embs[i, :] = nor_embs[i, :] / np.linalg.norm(nor_embs[i, :])
    sim_mat = np.dot(movie_embs, movie_embs.T)
    lamda_list = [0.1]

    # write date to excel
    # file = xlwt.Workbook(encoding='ascii')
    # table = file.add_sheet('cucb')
    row0 = list(range(0, args.num_bandit_iter, 1))

    test_users = list(Te.keys())
    num_test_users = len(test_users)

    for i in range(len(lamda_list)):
        test_precision = np.zeros(args.num_bandit_iter)
        test_recall = np.zeros(args.num_bandit_iter)
        test_div = np.zeros(args.num_bandit_iter)
        test_cate_div = np.zeros(args.num_bandit_iter)
        test_reward = np.zeros(args.num_bandit_iter)
        args.lam_da = lamda_list[i]
        t1 = time.time()

        for user in test_users:
            prec, recall, div, cate_div, reward, s_inx0 = c2ucb(
                movie_embs.T, None, Te[user], args,
                num=args.num_bandit_iter,
                sim=sim_mat,
                cate_sim=movie_cate_sim,
                user_emb=None)
            print(user)
            print(prec)
            test_precision += prec
            test_recall += recall
            test_div += div
            test_cate_div += cate_div
            test_reward += reward

        test_precision = test_precision / num_test_users
        test_recall = test_recall / num_test_users
        test_div = test_div / num_test_users
        test_cate_div = test_cate_div / num_test_users
        test_reward = test_reward / num_test_users

        print("lambda:{0}\ntest_precision:{1}\ntest_recall:{2}\ntest_div:{3}\ntest_cate_div:{4}\ntest_reward:{5}".format(args.lam_da, test_precision, test_recall, test_div, test_cate_div, test_reward))
        print("time used:%s\n" % (time.clock() - t1))
Example 29
def get_eval(is_train=False, savefig=False):
    print('deterministic evaluating...')
    # mkdir
    # evaluate
    pic_dir = './results/Deterministic'
    if not os.path.exists(pic_dir):
        os.makedirs(pic_dir)

    log_file = os.path.join(pic_dir, 'Deterministic.log')
    logger = logging(log_file)
    results_file = os.path.join(pic_dir, 'Deterministic_results.csv')
    if os.path.exists(results_file) and not is_train and not savefig:
        results = pd.read_csv(results_file)
        results = results.sort_values(by=['noisy', 'problem_id'])
        return results
    else:
        if os.path.exists(results_file):
            os.remove(results_file)
        if os.path.exists(log_file):
            os.remove(log_file)
    results = pd.DataFrame([],
                           columns=[
                               'problem_id', 'noisy', 'action',
                               'Total_rewards', 'avg_reward_per_action'
                           ])
    for problem_id, noisy, env in get_env():
        for act in range(4):
            func = MyDeterministicPolicy(act)
            states, rewards, actions = exec_policy(env, func, verbose=False)
            result = {
                'problem_id': problem_id,
                'noisy': noisy,
                'action': act,
                'Total_rewards': sum(rewards),
                'avg_reward_per_action': sum(rewards) / len(actions)
            }
            results = results.append(pd.DataFrame(result, index=[0]),
                                     ignore_index=0)
            logger(' ' + str(result))
            logger(actions)
            if savefig:
                get_fig(states, rewards)
                pic_name = os.path.join(
                    pic_dir, 'problem_id={} noisy={} action={}.jpg'.format(
                        problem_id, noisy, str(act)))
                plt.savefig(dpi=300, fname=pic_name)
                plt.close()
            env.close()
        results = results.sort_values(by=['noisy', 'problem_id'])
        results.to_csv(results_file, index=0)
    return results
Example 30
def train():
    config = Config()
    agent = Agent(config)
    Dev_loader = loader("data/" + config.dev_file + ".npz", batch_size = config.batch_size, Train = False)
    num_dev_batches = Dev_loader.max_batch
    train_losses  = []
    dev_losses = []
    lowest_loss = 1e+10
    for i in range(config.n_epoch):
        train_loss = 0.0
        total_trained_batches = 0
        for j in range(num_training_volumes):
            Train_loader = loader("data/" + config.train_file + "_vol_" + str(j) + ".npz", batch_size = config.batch_size, Train = True)
            num_train_batches = Train_loader.max_batch
            total_trained_batches += num_train_batches
            for k in range(num_train_batches):
                train_dic = Train_loader.get_batch()
                train_loss+=agent.run_train_step(train_dic)
        avg_train_loss = train_loss/total_trained_batches
        dev_loss = 0.0
        dev_preds = []
        dev_ys = []
        for k in range(num_dev_batches):
            dev_dic = Dev_loader.get_batch()
            dev_batch_out, dev_batch_loss = agent.test(dev_dic)      
            dev_pred = out_to_predict(dev_batch_out, version=2)
            dev_y = dev_dic["labels"]
            dev_ys += list(dev_y)
            dev_preds += list(dev_pred)
            dev_loss += dev_batch_loss
        precision , recall, accuracy , f1  = evaluate(np.array(dev_preds) , np.array(dev_ys))
        avg_dev_loss = dev_loss/num_dev_batches
        logging(i+1, avg_train_loss , avg_dev_loss , precision , recall, accuracy , f1)
        if avg_dev_loss < lowest_loss:
            agent.save(i+1)
            lowest_loss = avg_dev_loss
        Train_loader.reset_loader()
        Dev_loader.reset_loader()
Example 31
 def start(self):
     self._success, self._failed, self._ngad = 0, 0, 0
     self._startTime = now()
     if len(self.crawl_urls):
         logging(u'Total {} urls waiting to be crawled.'.format(
             len(self.crawl_urls)))
         try:
             for url in self.crawl_urls:
                 if self._validate_domains(url):
                     time.sleep(self.delay)
                     response, html = self.request_proccess(url)
                     if response is not None:
                         self._success += 1
                         self.response_processor(response, html)
                     else:
                         self._failed += 1
                 else:
                     self._ngad += 1
         except Exception, e:
             self._failed += 1
             logging(u'Crawling has been interrupted by exception.')
         self._endTime = now()
         self.stats()
Example 32
def send_text():
    """Sends the txt message from data passed through POST."""
                                
    if request.headers['Content-Type'] == 'application/json':
        #converts json to python dict
        data = request.json
        #get list of carriers from carriers.json                      
        config = load_config()
        carriers = load_carriers() 
        
        #authenticate request
        if data['api_key'] == config['api_key']:
            
            if data['carrier'] in carriers:
                #prepare the message
                carrier_choice = data['carrier']
                carrier = carriers[carrier_choice]
                number = data['number']            
                msg = data['msg']
                to =  "{0}{1}".format(number, carrier)
                sender = config['from']
                #sends the actual message
                mail = smtplib.SMTP(config['smtp_address'])
                mail.starttls()
                mail.login(config['username'], config['password'])
                mail.sendmail(sender, to, msg)
                mail.quit()
                #prepare the json response.
                log = "Message: '{0}' was sent succesfuly sent to '{1}'.".format(msg, to)
                logging(log)
                resp = {"response" : log}
                response = Response(json.dumps(resp), status=200, mimetype='application/json')
                return response
        
            #if the carrier is not found in the carriers list.
            else:
                log = "Carrier not supported."
                #log to web2txt.log file
                logging(log)
                resp = {"response" : log}
                response = Response(json.dumps(resp), status=404, mimetype='application/json')
                return response
    
    #if the content type is not json
    else:
        log = "Wrong request content-type. API only support JSON"
        #log to web2txt.log file
        logging(log)
        resp = {"response" : log}
        response = Response(json.dumps(resp), status=415, mimetype='application/json')
        return response 
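A sketch of the kind of client request this endpoint expects, based on the fields read from data above; the URL, route, API key, and carrier name are hypothetical placeholders:

import requests

# Hypothetical client call; the field names mirror the data[...] lookups in send_text().
payload = {
    "api_key": "YOUR_API_KEY",
    "carrier": "verizon",
    "number": "5551234567",
    "msg": "Hello from web2txt",
}
r = requests.post("http://localhost:5000/send_text", json=payload)
print(r.status_code, r.json())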
Example 33
 def printPossibleOptions(self):
     '''log possible options'''
     logging("Options")
     logging("1-Set target host/IP (Current: " + str(self.victim) + ")")
     logging("2-Set web app port (Current: " + str(self.webPort) + ")" )
     logging("3-Set URI Path (Current: " + str(self.uri) + ")")
     logging("4-Set HTTP Request Method (1-GET/2-POST, current: "+str(self.httpMethod)+")")
     logging("5-Set my local Mongo/Shell IP (Current: " + str(self.myIP) + ")")
     logging("6-Set shell listener port (Current: " + str(self.myPort) + ")")
Example 34
 def close(self):
     self._connector.cursor().close()
     self._connector.close()
     logging("error", "DBM close!")