Code Example #1
File: spider_main.py  Project: KIMI-Z/python-spider
class SpiderMain:

    def __init__(self):
        """
        初始化方法,主要是将其他组件实例化
        """
        self.url_manager = UrlManager()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        self.data_storage = DataStorage()

    def start(self):
        """
        爬虫的主启动方法
        :return:
        """
        """ 页码 """
        title = set()
        for a in range(2, 10):
            html = self.html_downloader.download(
                'http://ggzy.foshan.gov.cn/jyxx/fss/zfcg_1108551/zbxx/index_'+str(a)+'.html?1')
            _title = self.html_parser.titleParer(html)
            for i in _title:
                title.add(i)
        for i in title:
            print(i)
            html = self.html_downloader.download(i)
            _product = self.html_parser.contextParer(html)
            self.data_storage.storage(_product)
Code Example #2
File: main.py  Project: tomcat2088/tools
def main(argv):
    address = '127.0.0.1'
    port = 9999
    queryDB = ""
    queryFields = None
    usage = str.format('{0} -i <ip address> -p <port> -f <filter> -e <complex filter> -b <select db> -q <query fields> -l <list all dbs>',sys.argv[0])
    try:
        opts, args = getopt.getopt(argv,"hi:p:f:e:b:q:l",["help","filter=","address","port","complexFilter","listdb","queryFields","listAllDB"])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)
    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt == "-i":
            address = arg
        elif opt == "-p":
            port = int(arg)
        elif opt in ("-f", "--filter"):
            packet_log_config.setFilter(arg)
        elif opt in ("-e"):
            packet_log_config.setFilter(parseComplexFilter(arg))
        elif opt in ("-b"):
            queryDB = arg
        elif opt in ("-q"):
            queryFields = parseQueryFields(arg)
        elif opt in ("-l"):
            DataStorage.listDBs()
            sys.exit()
    if type(queryFields) is not dict:
        proxy_instance = http_proxy.HttpProxy(address,port)
        proxy_instance.start()
    else:
        HttpPacket.query(queryFields,queryDB)
Code Example #3
File: spider_main.py  Project: KIMI-Z/python-spider
 def __init__(self):
     """
     初始化方法,主要是将其他组件实例化
     """
     self.url_manager = UrlManager()
     self.html_downloader = HtmlDownloader()
     self.html_parser = HtmlParser()
     self.data_storage = DataStorage()
Code Example #4
    def test_basic_types(self):
        ds = DataStorage.open('test_basic_types.h5', 'w')

        test_int = 124
        test_float = 0.1
        test_1d_array = np.arange(1000000)
        test_2d_array = np.random.rand(100, 100)
        test_list = [1, 2, 3, 4]
        test_dict = dict(int=123, float=111.1, list=test_list)

        ds['int'] = test_int
        ds['float'] = test_float
        ds['1d_array'] = test_1d_array
        ds['2d_array'] = test_2d_array
        ds['list'] = test_list
        ds['dict'] = test_dict

        # Test return types
        self.assertTrue(isinstance(ds['int'], int))
        self.assertTrue(isinstance(ds['float'], float))
        self.assertTrue(isinstance(ds['1d_array'], np.ndarray))
        self.assertTrue(isinstance(ds['2d_array'], np.ndarray))
        self.assertTrue(isinstance(ds['list'], collections.MutableSequence))
        self.assertTrue(isinstance(ds['dict'], collections.MutableMapping))

        # Test values of basic types
        self.assertEqual(ds['int'], test_int)
        self.assertEqual(ds['float'], test_float)
        self.assertTrue(np.all(ds['1d_array'] == test_1d_array))
        self.assertTrue(np.all(ds['2d_array'] == test_2d_array))
        self.assertEqual(ds['list'], test_list)
        self.assertEqual(ds['dict'], test_dict)

        ds.flush()

        # Test delete
        del (ds['int'])
        self.assertRaises(KeyError, self.getItem, ds, 'int')
        del (ds['float'])
        self.assertRaises(KeyError, self.getItem, ds, 'float')
        del (ds['1d_array'])
        self.assertRaises(KeyError, self.getItem, ds, '1d_array')
        del (ds['2d_array'])
        self.assertRaises(KeyError, self.getItem, ds, '2d_array')
        del (ds['list'])
        self.assertRaises(KeyError, self.getItem, ds, 'list')
        del (ds['dict'])
        self.assertRaises(KeyError, self.getItem, ds, 'dict')

        ds.close()

        # ds should be empty now
        ds = DataStorage.open('test_basic_types.h5', 'r')
        self.assertEqual(len(ds), 0)

        ds.close()
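
The test above exercises DataStorage's dict-like HDF5 interface end to end. A minimal usage sketch, assuming the same DataStorage wrapper and using only the calls that appear in the test (open, item assignment and lookup, deletion, flush, close, len); the file name and keys are made up for illustration:

import numpy as np

ds = DataStorage.open('example.h5', 'w')          # create or overwrite an HDF5-backed store
ds['params'] = dict(rate=0.1, steps=[1, 2, 3])    # nested dicts/lists are accepted
ds['signal'] = np.arange(10)                      # NumPy arrays round-trip as ndarrays
print(ds['params'], ds['signal'])

del ds['signal']                                  # removed keys raise KeyError on later lookups
ds.flush()                                        # push pending writes to disk
ds.close()

ds = DataStorage.open('example.h5', 'r')          # reopen read-only
print(len(ds))                                    # number of remaining top-level keys
ds.close()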
Code Example #5
    def test_empty_arr(self):
        arr = np.array([])

        ds = DataStorage.open('test_empty_arr.h5', 'w')
        ds['empty'] = arr
        ds.close()

        ds = DataStorage.open('test_empty_arr.h5', 'r')
        self.assertTrue(np.all(arr == ds['empty']))
        self.assertEqual(len(ds['empty']), 0)
Code Example #6
    def readCorpus(self, db_path, dev_path):
        ds = DataStorage()
        ds.load_db_from_file(db_path)
        ds.load_valid_datasets(dev_path)

        self.vec_prop_valid = ds.get_actual_input_features(
            merged_col_name=True)
        get_bert_tokens(self.vec_prop_valid, self._bert_tokenizer)
Code Example #7
def getBumpCurrentSlope(simLabel, threshold=0):
    fileName = 'bump_slope_data/bump_slope_detailed_noise_{0}.h5'.format(
        simLabel)
    msg = 'Loading bump vs. current slope from \'{0}\''.format(fileName)
    log_info('getBumpCurrentSlope (detailed)', msg)
    ds = DataStorage.open(fileName, 'r')
    slopes = ds['lineFitSlope'].flatten()
    ds.close()
    slopes[slopes < threshold] = np.nan
    return slopes
Code Example #8
    def test_lists(self):
        def appendListAndTest(ds, key, test_l, item):
            ds[key].append(item)
            test_l.append(item)
            self.assertEqual(test_l, ds[key])

        ds = DataStorage.open('test_lists.h5', 'w')

        d1 = {"hola": [10, 20, 30], "str": "This is a test string"}

        test_list = [1, 2, d1]
        ds['list'] = test_list
        ds.close()

        ds = DataStorage.open('test_lists.h5', 'r+')
        appendListAndTest(ds, 'list', test_list, 10)
        appendListAndTest(ds, 'list', test_list, 23.5)
        appendListAndTest(ds, 'list', test_list, [1, 2, 3])
        appendListAndTest(ds, 'list', test_list, dict(a=10, b=[1, 2, 3]))
        ds.close()
Code Example #9
class SpiderMain:
    def __init__(self):
        self.url_manager = UrlManager()
        self.html_downloader = HtmlDownloader()
        self.html_parser = HtmlParser()
        self.data_storage = DataStorage()

    def start(self):
        """
        爬虫的主启动方法
        :return:
        """
        self.url_manager.add_new_url(
            "http://127.0.0.1:8848/xiaomi-master/index.html")
        # Fetch a URL from the URL manager
        url = self.url_manager.get_new_url()
        # Download the fetched URL with the downloader
        html = self.html_downloader.download(url)
        # Parse the HTML
        res = self.html_parser.parser(html)
        # Store the data
        self.data_storage.storage(res)
Code Example #10
    def test_iterator(self):
        ds = DataStorage.open('test_iterator.h5', 'w')

        test_list = list(np.arange(100))
        ds['list'] = test_list

        it1 = 0
        for val1 in ds['list']:
            self.assertEqual(val1, test_list[it1])
            it1 += 1

            it2 = 0
            for val2 in ds['list']:
                self.assertEqual(val2, test_list[it2])
                it2 += 1

        ds.close()
Code Example #11
    def test_chained_setter(self):
        def testChain(ds, keyList, testValue):
            val = ds
            for key in keyList[0:-1]:
                val = val[key]
                self.assertTrue(isinstance(val, collections.MutableMapping))
            self.assertEqual(val[keyList[-1]], testValue)

        keyList = ['a', 'b', 'c', 'd']
        testValue = [1, 2, 3, 4]
        ds = DataStorage.open('test_chained_setter.h5', 'w')

        # Initial test
        ds.setItemChained(keyList, testValue)
        testChain(ds, keyList, testValue)

        # Over-write test
        newTestValue = 'Over-write test string'
        ds.setItemChained(keyList, newTestValue)
        testChain(ds, keyList, newTestValue)

        # Do not overwrite if overwriteLast is False
        noOverWriteTestValue = 'This should not be written'
        ds.setItemChained(keyList, noOverWriteTestValue, overwriteLast=False)
        testChain(ds, keyList, newTestValue)

        # A key chain that shares the 'a', 'b' prefix with keyList
        otherKeyList = ['a', 'b', 'another', 'xxx']
        otherTestValue = [1, 2, 3, 'ahoy']
        ds.setItemChained(otherKeyList, otherTestValue)
        testChain(ds, otherKeyList, otherTestValue)
        testChain(ds, keyList, newTestValue)  # Should not be overwritten

        # Single item in keyList
        singleList = ['single']
        ds.setItemChained(singleList, otherTestValue)
        testChain(ds, singleList, otherTestValue)

        # Test using getItemChained()
        self.assertEqual(ds.getItemChained(keyList), newTestValue)
        self.assertEqual(ds.getItemChained(otherKeyList), otherTestValue)
        self.assertEqual(ds.getItemChained(singleList), otherTestValue)
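
Semantically, setItemChained/getItemChained behave like building and walking nested mappings keyed by a list of keys. A rough plain-dict equivalent of what the test asserts (the helper names below are hypothetical and not part of DataStorage):

def set_item_chained(d, key_list, value, overwrite_last=True):
    # Walk/create the intermediate mappings, then set (or keep) the final key.
    node = d
    for key in key_list[:-1]:
        node = node.setdefault(key, {})
    if overwrite_last or key_list[-1] not in node:
        node[key_list[-1]] = value

def get_item_chained(d, key_list):
    node = d
    for key in key_list:
        node = node[key]
    return node

store = {}
set_item_chained(store, ['a', 'b', 'c', 'd'], [1, 2, 3, 4])
set_item_chained(store, ['a', 'b', 'c', 'd'], 'ignored', overwrite_last=False)  # existing value kept
print(get_item_chained(store, ['a', 'b', 'c', 'd']))  # [1, 2, 3, 4]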
Code Example #12
    def test_chained_getter(self):
        test_dict = dict(
            int=123,
            float=111.1,
            list=[1, 2, 3,
                  dict(a='blabla', b=10, c=np.random.rand(10))])

        ds = DataStorage.open('test_chained_getter.h5', 'w')
        ds['nested'] = test_dict

        ds_nested = ds['nested']
        self.assertEqual(ds_nested.getItemChained(('int', )), test_dict['int'])
        self.assertEqual(ds_nested.getItemChained(('list', 0)),
                         test_dict['list'][0])
        self.assertEqual(ds_nested.getItemChained(('list', 3, 'a')),
                         test_dict['list'][3]['a'])

        # The same test but with a list as an index
        self.assertEqual(ds_nested.getItemChained(['list', 3, 'a']),
                         test_dict['list'][3]['a'])

        ds.close()
Code Example #13
 def __init__(self, bert_tokenizer=None, bert_config=None):
     self.ds = DataStorage()
     self._bert_tokenizer = bert_tokenizer
     self._bert_config = bert_config
Code Example #14
class SQLTrainer:
    def __init__(self, bert_tokenizer=None, bert_config=None):
        self.ds = DataStorage()
        self._bert_tokenizer = bert_tokenizer
        self._bert_config = bert_config

    def readCorpus(self, db_path, vec_train_path, dev_path):
        self.ds.load_db_from_file(db_path)
        self.ds.load_train_datasets(vec_train_path, dev_path)

        # 1. Generate property-specific train / test data.
        self.vec_prop_train, self.vec_prop_valid = self.ds.get_prop_cla_features(
            merged_col_name=True)
        self.vec_real_valid = self.ds.get_actual_input_features(
            merged_col_name=True)

        get_bert_tokens(self.vec_prop_train + self.vec_prop_valid,
                        self._bert_tokenizer)
        train_bert_removed = 0
        valid_bert_removed = 0
        final_train = []
        final_valid = []
        for v_f in self.vec_prop_train:
            bert_len = 0
            bert_len += 2 + len(v_f[Q_BERT_TOK]) + 1 + len(v_f[PF_PATH])
            for vec_tbl_bert in v_f[C_BERT_TOK]:
                for vec_col_bert in vec_tbl_bert:
                    bert_len += 1 + len(vec_col_bert)
            bert_len += 2  # *
            if bert_len >= 512:
                train_bert_removed += 1
            else:
                final_train.append(v_f)
        for v_f in self.vec_prop_valid:
            bert_len = 0
            bert_len += 2 + len(v_f[Q_BERT_TOK]) + 1 + len(v_f[PF_PATH])
            for vec_tbl_bert in v_f[C_BERT_TOK]:
                for vec_col_bert in vec_tbl_bert:
                    bert_len += 1 + len(vec_col_bert)
            bert_len += 2  # *
            if bert_len >= 512:
                valid_bert_removed += 1
            else:
                final_valid.append(v_f)
        self.vec_prop_train = final_train
        self.vec_prop_valid = final_valid
        print("TOO LARGE FOR BERT (TRAIN): ", train_bert_removed)
        print("TOO LARGE FOR BERT (VALID): ", valid_bert_removed)

        get_bert_tokens(self.vec_real_valid, self._bert_tokenizer)

        self.sg = SQLGen(bert_config=self._bert_config)

        # Extract Path IDX.
        for v_f in self.vec_prop_train + self.vec_prop_valid:
            v_f[PF_PATHIDX] = [IDX_PATH[str_path] for str_path in v_f[PF_PATH]]

        # Extract Actually used tables.
        # Remove those required to JOIN, reducing the garbage inputs.
        train_reduce_num = 0
        valid_reduce_num = 0
        vec_filter_codes = [
            [OF_VU_COL1, OF_VU_OPERATOR, OF_VU_COL2],
            [SF_VU_COL1, SF_VU_OPERATOR, SF_VU_COL2],
            [WF_CU_VU_COL1, WF_CU_VU_OPERATOR, WF_CU_VU_COL2],
            [HV_CU_VU_COL1, HV_CU_VU_OPERATOR, HV_CU_VU_COL2],
        ]

        for v_f in self.vec_prop_train + self.vec_prop_valid:
            if 0 not in v_f[SF_VU_COL1]:  # Try to reduce. ( NO STAR )
                set_used_col_idx = set()
                for c1_code, op_code, c2_code in vec_filter_codes:
                    for col1, op, col2 in zip(v_f[c1_code], v_f[op_code],
                                              v_f[c2_code]):
                        set_used_col_idx.add(col1)
                        if op > 0:
                            set_used_col_idx.add(col2)

                db = v_f[META_DB]
                set_used_tbl_idx = set()
                for cidx in set_used_col_idx:
                    if db.vec_cols[cidx].table_belong is not None:
                        set_used_tbl_idx.add(
                            db.vec_cols[cidx].table_belong.tbl_idx)

                if set(v_f[TV_TABLES_USED_IDX]) != set_used_tbl_idx:
                    recover = self._generate_actual_tbls(
                        db, list(set_used_tbl_idx))
                    if set(recover) == set(v_f[TV_TABLES_USED_IDX]):
                        if v_f in self.vec_prop_train:
                            train_reduce_num += 1
                        else:
                            valid_reduce_num += 1

                        v_f[TV_TABLES_USED_IDX] = list(set_used_tbl_idx)
                        v_f[TV_TABLES_NUM] = len(v_f[TV_TABLES_USED_IDX])

        print("TRAIN REDUCE NUM: [%d]" % train_reduce_num)
        print("VALID REDUCE NUM: [%d]" % valid_reduce_num)

        print("PROPERTY TRAIN INSTANCE NUM: [%d]" % len(self.vec_prop_train))
        print("PROPERTY VALID INSTANCE NUM: [%d]" % len(self.vec_prop_valid))

    # Test on the features.
    def _do_test(self, sess, target_data, batch_size):
        batch_num = int(len(target_data) / batch_size)
        if len(target_data) % batch_size != 0:
            batch_num += 1

        ti = TestInfo()
        add_prop_test_info(ti, self.sg)
        add_tbl_test_info(ti, self.sg)

        for batch_idx in range(batch_num):
            vec_data = target_data[batch_idx * batch_size:min(
                (batch_idx + 1) * batch_size, len(target_data))]

            # 1. Classify Tbl.
            fdv_tbl = prepare_tbl_dict(vec_data,
                                       bert_tokenizer=self._bert_tokenizer)
            fdv_tbl[self.sg.is_train] = False
            fdv_tbl[self.sg.drop_rate] = 0.0
            tbl_results = ti.fetch_tbl_tensor_info(sess, fdv_tbl)

            # 2. Get the extracted tbl info.
            vec_tbl_idx_score = tbl_results[TV_TABLES_USED_IDX]
            vec_tbl_num = tbl_results[TV_TABLES_NUM]

            vec_mod_tbl_num = []
            vec_tbl_extracted = []
            for tbl_idx_score, tbl_num in zip(vec_tbl_idx_score, vec_tbl_num):
                vec_tbl_idx = np.array(tbl_idx_score).argsort()[-tbl_num:]
                vec_tbl_idx = sorted(vec_tbl_idx.tolist())
                vec_tbl_extracted.append(vec_tbl_idx)

            # 3. Classify Props.
            fdv_prop = prepare_prop_dict(vec_data,
                                         vec_tbl_extracted,
                                         bert_tokenizer=self._bert_tokenizer)
            fdv_prop[self.sg.is_train] = False
            fdv_prop[self.sg.drop_rate] = 0.0
            prop_results = ti.fetch_prop_tensor_info(sess, fdv_prop)

            # 4. Update Pointer results to the actual columns.
            vec_score_answers = [
                [GF_NUMCOL, [GF_COLLIST]],
                [OF_NUMVU, [OF_VU_COL1, OF_VU_COL2, OF_VU_AGG1, OF_VU_AGG2, OF_VU_DIST1, OF_VU_DIST2, OF_VU_OPERATOR]],
                [SF_NUM_VU, [SF_VU_AGGALL, SF_VU_COL1, SF_VU_COL2, SF_VU_AGG1, SF_VU_AGG2, SF_VU_DIST1, SF_VU_DIST2, SF_VU_OPERATOR]],
                [WF_NUM_CONDUNIT, [WF_CU_AGGREGATOR, WF_CU_IS_NOT, WF_CU_COND_OP,
                                   WF_CU_VAL1_TYPE, WF_CU_VAL1_SP, WF_CU_VAL1_EP, WF_CU_VAL1_LIKELY, WF_CU_VAL1_BOOLVAL,
                                   WF_CU_VAL2_TYPE, WF_CU_VAL2_SP, WF_CU_VAL2_EP, WF_CU_VAL2_LIKELY, WF_CU_VAL2_BOOLVAL,
                                   WF_CU_VU_OPERATOR, WF_CU_VU_AGG1, WF_CU_VU_COL1, WF_CU_VU_DIST1, WF_CU_VU_AGG2, WF_CU_VU_COL2, WF_CU_VU_DIST2]],
                [HV_NUM_CONDUNIT, [HV_CU_AGGREGATOR, HV_CU_IS_NOT, HV_CU_COND_OP,
                                   HV_CU_VAL1_TYPE, HV_CU_VAL1_SP, HV_CU_VAL1_EP, HV_CU_VAL1_LIKELY, HV_CU_VAL1_BOOLVAL,
                                   HV_CU_VAL2_TYPE, HV_CU_VAL2_SP, HV_CU_VAL2_EP, HV_CU_VAL2_LIKELY, HV_CU_VAL2_BOOLVAL,
                                   HV_CU_VU_OPERATOR, HV_CU_VU_AGG1, HV_CU_VU_COL1, HV_CU_VU_DIST1, HV_CU_VU_AGG2, HV_CU_VU_COL2, HV_CU_VU_DIST2]],
            ]

            for num_col_name, vec_col_name in vec_score_answers:
                for col_name in vec_col_name:
                    prop_results[col_name] = [
                        np.argmax(v, -1).tolist()[:l] for v, l in zip(
                            prop_results[col_name], prop_results[num_col_name])
                    ]

            # 5. Update the extracted column indexes
            vec_col_pointers = [
                GF_COLLIST, OF_VU_COL1, OF_VU_COL2, SF_VU_COL1, SF_VU_COL2,
                WF_CU_VU_COL1, WF_CU_VU_COL2, HV_CU_VU_COL1, HV_CU_VU_COL2
            ]
            for pidx, prop_data in enumerate(vec_data):
                db = prop_data[META_DB]

                # 1. Get the valid column span. ( Inclusive )
                vec_col_spans = [[0, 0]]  # Special column: *
                for t in vec_tbl_extracted[pidx]:
                    if t < len(db.vec_tbls):
                        tbl = db.vec_tbls[t]
                        vec_col_spans.append([
                            tbl.vec_cols[0].col_idx, tbl.vec_cols[-1].col_idx
                        ])
                    else:
                        print("ERROR: %d VS. %d" % (t, len(db.vec_tbls)))
                vec_col_spans = sorted(vec_col_spans, key=lambda x: x[0])

                # 2. Generate New Col Idx - Old Col Idx Map.
                new_idx = 0
                map_col_idx = dict()
                for sidx, eidx in vec_col_spans:
                    for col_idx in range(sidx, eidx + 1):
                        map_col_idx[new_idx] = col_idx
                        new_idx += 1

                # 3. Update the mappings.
                for col_name in vec_col_pointers:
                    prop_results[col_name][pidx] = [
                        map_col_idx[col_idx]
                        for col_idx in prop_results[col_name][pidx]
                        if col_idx in map_col_idx
                    ]

            # 6. Get merged results & evaluate.
            total_results = prop_results
            for k, v in tbl_results.items():
                total_results[k] = v

            ti.integrate_eval_result(total_results, vec_data)

        return ti

    def train(self, BERT_DIR, lf, save_path, batch_size):
        batch_num = int(len(self.vec_prop_train) / batch_size)
        if len(self.vec_prop_train) % batch_size != 0:
            batch_num += 1

        self.sg.constructGraph(batch_num,
                               lf,
                               init_bert=os.path.join(BERT_DIR,
                                                      "bert_model.ckpt"))

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        with tf.Session(graph=self.sg.graph, config=config) as sess:
            ckpt = tf.train.get_checkpoint_state(save_path)
            v2_path = ckpt.model_checkpoint_path + ".index" if ckpt else ""
            self.sg.init.run()

            tvars = tf.trainable_variables()
            initialized_variable_names = {}
            (assignment_map, initialized_variable_names
             ) = modeling.get_assignment_map_from_checkpoint(
                 tvars, os.path.join(BERT_DIR, "bert_model.ckpt"))

            tf.logging.info("**** Trainable Variables ****")
            for var in tvars:
                init_string = ""
                if var.name in initialized_variable_names:
                    init_string = ", *INIT_FROM_CKPT*"
                print("  name = %s, shape = %s%s" %
                      (var.name, var.shape, init_string))

            print('Num params: %d' % sum(v.get_shape().num_elements()
                                         for v in tf.trainable_variables()))

            step = 1
            valid_accept_cnt = 0
            global_step = 0
            prev_valid_acc = 0.0
            while True:
                self.vec_prop_train = corpus_util.shuffleData(
                    self.vec_prop_train)  # JUST FOR DEBUGGING
                print("STEP: %d" % step)
                average_train_loss = 0.0
                average_valid_loss = 0.0

                tm = TimeMeasurer()
                tm.start("train")

                for batch_idx in range(batch_num):
                    global_step += 1
                    vec_prop_data = self.vec_prop_train[
                        batch_idx *
                        batch_size:min((batch_idx + 1) *
                                       batch_size, len(self.vec_prop_train))]

                    # 1. Train the Table Selector Network.
                    feed_dict_train = prepare_tbl_dict(
                        vec_prop_data, bert_tokenizer=self._bert_tokenizer)
                    feed_dict_train[self.sg.global_step] = global_step
                    feed_dict_train[self.sg.is_train] = True
                    feed_dict_train[self.sg.drop_rate] = 0.1

                    _, loss_val = sess.run(
                        [self.sg.train_tbl_op, self.sg.tbl_cross_entropy],
                        feed_dict=feed_dict_train)
                    average_train_loss += loss_val * len(vec_prop_data)

                    if math.isnan(loss_val) or math.isinf(loss_val):
                        print("TBL NAN/INF ERROR")
                        sys.exit(0)

                    # 2. Train for Other property selection network.
                    #    Feed the used tables as its inputs.
                    vec_valid_tbl_idx = [
                        v_f[TV_TABLES_USED_IDX] for v_f in vec_prop_data
                    ]
                    feed_dict_prop_train = prepare_prop_dict(
                        vec_prop_data,
                        vec_valid_tbl_idx,
                        bert_tokenizer=self._bert_tokenizer)
                    feed_dict_prop_train[self.sg.global_step] = global_step
                    feed_dict_prop_train[self.sg.is_train] = True
                    feed_dict_prop_train[self.sg.drop_rate] = 0.1

                    _, loss_val = sess.run(
                        [self.sg.train_op, self.sg.cross_entropy],
                        feed_dict=feed_dict_prop_train)
                    average_train_loss += loss_val * len(vec_prop_data)
                    if math.isnan(loss_val) or math.isinf(loss_val):
                        print("PROP NAN/INF ERROR")
                        sys.exit(0)

                    if batch_idx % 100 == 0:
                        print("%d, Train Loss: %f" % (batch_idx, loss_val))
                print("%d, Train Loss: %f" % (batch_idx, loss_val))

                average_train_loss /= len(self.vec_prop_train)
                tm.end("train")

                print("Average Training Loss: " + str(average_train_loss))
                print("Elapsed Per Step: " + str(tm.getElapsed("train")))

                ti = self._do_test(sess, self.vec_prop_valid, batch_size)
                ti.print_eval_results()
                em_f1 = ti.get_overall_result()

                valid_acc = em_f1
                if valid_acc < prev_valid_acc:
                    if valid_accept_cnt >= 20:
                        print("Training Finished.")
                        return
                    else:
                        valid_accept_cnt += 1
                        print("TOLERATING CNT: %d" % valid_accept_cnt)
                else:
                    self._save_model(sess, save_path)
                    print("MODEL SAVED.")
                    prev_valid_acc = valid_acc
                    valid_accept_cnt = 0

                step += 1

    def test(self, load_path, batch_size):
        self.sg.constructGraph()

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        with tf.Session(graph=self.sg.graph, config=config) as sess:
            self.sg.init.run()

            self._load_model(sess, load_path)

            tm = TimeMeasurer()
            tm.start("test")
            ti = self._do_test(sess, self.vec_prop_valid, batch_size)
            tm.end("test")
            elapsed = tm.getElapsed("test")
            print("elased_inmsec = " + str(elapsed) + "( per sen: " +
                  str(float(elapsed) / len(self.vec_prop_valid)) + " )")
            ti.print_eval_results()

    def _save_model(self, sess, save_path):
        self.sg.saver.save(sess, "%s/best_chk" % save_path)

    def _load_model(self, sess, save_path):
        ckpt = tf.train.get_checkpoint_state(save_path)
        v2_path = ckpt.model_checkpoint_path + ".index" if ckpt else ""

        self.sg.saver.restore(sess, ckpt.model_checkpoint_path)

    def _generate_actual_tbls(self, db, vec_tbl_idx):
        if len(vec_tbl_idx) == 1:
            return vec_tbl_idx

        # 1. Find the "Join Path".
        joinable_tbls = dict()
        for f1, f2 in db.foreign_keys:
            t1 = f1.table_belong.tbl_idx
            t2 = f2.table_belong.tbl_idx
            if t1 not in joinable_tbls:
                joinable_tbls[t1] = set()
            if t2 not in joinable_tbls:
                joinable_tbls[t2] = set()
            joinable_tbls[t1].add(t2)
            joinable_tbls[t2].add(t1)

        cur_path_cand = [set([v]) for v in vec_tbl_idx]
        final_path = None
        while len(cur_path_cand) > 0:
            new_path_cand = []
            for path_cand in cur_path_cand:
                for t in path_cand:
                    if t not in joinable_tbls:
                        continue
                    for new_exp in joinable_tbls[t]:
                        if new_exp not in path_cand:
                            new_path = path_cand.copy()
                            new_path.add(new_exp)

                            if set(vec_tbl_idx) <= new_path:
                                final_path = new_path
                                break
                            if new_path not in new_path_cand:
                                new_path_cand.append(set(new_path))

                    if final_path is not None:
                        break
                if final_path is not None:
                    break
            if final_path is not None:
                break

            cur_path_cand = new_path_cand

        if final_path is None:
            return vec_tbl_idx

        return final_path
Code Example #15
File: main.py  Project: jzilic91/edge
def main():
    # DataStorage parses all failures from the LANL data file, which contains failure trace logs
    data_storage = DataStorage(
        "../data/LA-UR-05-7318-failure-data-1996-2005.csv",
        DatasetType.LANL_DATASET)

    number_of_samplings = 10000
    number_of_app_executions = 40

    ec_node_candidate_list = [(19, 1), (19, 11), (19, 4), (19, 8), (20, 41)]
    ed_node_candidate_list = [(1, 0), (5, 158), (5, 165), (5, 243), (5, 48),
                              (7, 1), (7, 154), (7, 242), (7, 32)]
    er_node_candidate_list = [(3, 0), (16, 80), (4, 55), (4, 1), (4, 3)]
    cd_node_candidate_list = [(22, 0)]
    # ec_node_candidate_list = [(19, 8)]
    # ed_node_candidate_list = [(5, 243)]
    # er_node_candidate_list = [(4, 1)]
    # cd_node_candidate_list = [(22, 0)]

    application_names = [
        'ANTIVIRUS', 'GPS_NAVIGATOR', 'CHESS', 'FACEBOOK', 'FACERECOGNIZER'
    ]
    mixed_mode = True
    sensitivity_analysis = False
    # NodeCategory.EC_DATA: [(19, 1)],
    # NodeCategory.ED_DATA: [(1, 0)],
    # NodeCategory.ER_DATA: [(3, 0)],
    # NodeCategory.CD_DATA: [(22, 0)]
    #}

    for i in range(5):
        # declare mobile device with offloading sites included in the simulation and data provider
        ec_node_candidate = ec_node_candidate_list[i]
        ed_node_candidate = ed_node_candidate_list[i]
        er_node_candidate = er_node_candidate_list[i]
        cd_node_candidate = cd_node_candidate_list[0]
        application = application_names[i]

        print('EC node candidate: ' + str(ec_node_candidate))
        print('ED node candidate: ' + str(ed_node_candidate))
        print('ER node candidate: ' + str(er_node_candidate))
        print('CD node candidate: ' + str(cd_node_candidate))

        edge_data_site = OffloadingSite(
            5000, 8, 300, OffloadingSiteCode.EDGE_DATABASE_SERVER,
            data_storage.get_ed_data_stats(), 'A', ed_node_candidate)
        edge_comp_site = OffloadingSite(
            8000, 8, 150,
            OffloadingSiteCode.EDGE_COMPUTATIONAL_INTENSIVE_SERVER,
            data_storage.get_ec_data_stats(), 'A', ec_node_candidate)
        edge_reg_site = OffloadingSite(5000, 8, 150,
                                       OffloadingSiteCode.EDGE_REGULAR_SERVER,
                                       data_storage.get_er_data_stats(), 'A',
                                       er_node_candidate)
        cloud_dc_site = OffloadingSite(12000, 128, 1000,
                                       OffloadingSiteCode.CLOUD_DATA_CENTER,
                                       data_storage.get_cd_data_stats(), 1,
                                       cd_node_candidate)

        mobile_device = MobileDevice(
            (edge_data_site, edge_comp_site, edge_reg_site), cloud_dc_site)

        mobile_device.deploy_network_model()

        if not mixed_mode:
            if application == 'ANTIVIRUS':
                mobile_device.deploy_antivirus_application()

            elif application == 'GPS_NAVIGATOR':
                mobile_device.deploy_gps_navigator_application()

            elif application == 'CHESS':
                mobile_device.deploy_chess_application()

            elif application == 'FACEBOOK':
                mobile_device.deploy_facebook_application()

            elif application == 'FACERECOGNIZER':
                mobile_device.deploy_facerecognizer_application()

        mobile_device.deploy_enhanced_efpo_ode()
        mobile_device.run(number_of_samplings, number_of_app_executions,
                          mixed_mode, sensitivity_analysis)

        mobile_device.deploy_efpo_ode()
        mobile_device.run(number_of_samplings, number_of_app_executions,
                          mixed_mode, sensitivity_analysis)

        mobile_device.deploy_energy_efficient_ode()
        mobile_device.run(number_of_samplings, number_of_app_executions,
                          mixed_mode, sensitivity_analysis)
Code Example #16
 def __init__(self,):
     super(DataFetcher, self).__init__()
     self.file_dir = "/files/"
     self.ds = DataStorage("sqlite:///"+os.path.dirname(os.path.abspath(__file__)) + self.file_dir+"cache.db")
     print "Current Cache size: " + str(self.cache_folder_size(
         os.path.dirname(os.path.abspath(__file__)) + self.file_dir)) + " MB"
Code Example #17
class DataFetcher(object):
    def __init__(self,):
        super(DataFetcher, self).__init__()
        self.file_dir = "/files/"
        self.ds = DataStorage("sqlite:///"+os.path.dirname(os.path.abspath(__file__)) + self.file_dir+"cache.db")
        print "Current Cache size: " + str(self.cache_folder_size(
            os.path.dirname(os.path.abspath(__file__)) + self.file_dir)) + " MB"

    def fetch_data(self, url_list):
        # Map each URL to its locally cached file path(s)
        results = {}
        for url in url_list:
            results[url] = self.get_file(url)
        return results

    def cache_folder_size(self, folder):
        folder_size = 0
        for (path, dirs, files) in os.walk(folder):
            for file in files:
                filename = os.path.join(path, file)
                folder_size += os.path.getsize(filename)
        return folder_size / 1000000

    def get_file(self, url):
        # Check whether the URL is already in the cache
        data_dict = self.ds.retrieve_dict(url)
        headers = self.check_if_valid_file_last_update(url)
        if data_dict is not None and headers is not None:
            # Compare the Last-Modified header with the cached timestamp
            if datetime_utils.parse_date(headers['Last-Modified']) > datetime_utils.parse_date(data_dict['time']):
                # Cached copy is stale: drop the old files and cache entry, then re-download
                for path in data_dict['path']:
                    os.remove(path)
                self.ds.remove_dict(url)
                return self.download(url)
            else:
                return data_dict['path']
        else:
            return self.download(url)

    def generate_file_path(self):
        return os.path.dirname(os.path.abspath(__file__)) + self.file_dir + str(uuid1())

    def unzip(self, path, dirname, url):
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        zfile = zipfile.ZipFile(path)
        for name in zfile.namelist():
            zfile.extract(name, dirname)
        zip_files = os.listdir(dirname)
        results = []
        for zip_file in zip_files:
            path = self.generate_file_path()
            os.rename(dirname + '/' + zip_file, path)
            results.append(path)
        return results


    def get_unzip_list(self, filename, type, url, data_dict):
        if type.find("zip") != -1:
            list_zips = self.unzip(filename, os.path.dirname(os.path.abspath(__file__)) + self.file_dir + 'zips', url)
            os.remove(data_dict['path'])
            self.ds.remove_dict(url)
            save_dict = dict(path=list_zips, time=data_dict['time'])
            self.ds.store_dict(url, save_dict)
            return list_zips
        else:
            return None


    def check_if_valid_file_last_update(self, url):
        try:
            tag_filter = ["image", "text/html"]
            response = urllib2.urlopen(HeadRequest(url)).info()
            found = False
            for tag in tag_filter:
                if response['Content-Type'].find(tag) != -1:
                    found = True
            if not found:
                return response
            else:
                return None
        except:
            return None


    def download(self, url):
        headers = self.check_if_valid_file_last_update(url)
        if headers is not None:
            path = self.generate_file_path()
            save_dict = dict(path=path, time=headers['Last-Modified'])
            urllib.urlretrieve(url, path)
            unzip_result = self.get_unzip_list(path, headers['Content-Type'], url, save_dict)
            if unzip_result is None:
                self.ds.store_dict(url, save_dict)
                return [path]
            else:
                return unzip_result
        else:
            return None


    def test(self):
        print self.fetch_data(
            ["http://www.pagepersonnel.it/index.html", "http://upload.wikimedia.org/wikipedia/it/3/30/Ls_xterm.png",
             "http://gis.csi.it/repertorio/sitad_wgs84/DBPR10_ELEMIDRI/elemidri.zip", "http://www.google.it"])
        print "second_dictionary", self.ds.retrieve_dict("hhh")
Code Example #18
 def __init__(self):
     self.url_manager = UrlManager()
     self.html_downloader = HtmlDownloader()
     self.html_parser = HtmlParser()
     self.data_storage = DataStorage()
Code Example #19
 def __init__(self, host, port, level):
     DataStorage.__init__(self, level)
     self._storage = StrictRedis(host=host, port=port)
Code Example #20
 def __init__(self, level):
     self._storage = {}
     DataStorage.__init__(self, level)
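
Code Examples #19 and #20 follow the same pattern: a shared DataStorage base class that records the storage level, while each subclass chooses its backing store (a Redis connection versus an in-memory dict). A hedged sketch of that pattern; the base-class body and the subclass names are assumptions, only the constructor bodies mirror the excerpts:

from redis import StrictRedis  # assumed dependency of the Redis-backed variant

class DataStorage(object):
    # Assumed minimal base class: it only records the level; subclasses set _storage.
    def __init__(self, level):
        self.level = level

class RedisDataStorage(DataStorage):  # hypothetical name
    def __init__(self, host, port, level):
        DataStorage.__init__(self, level)
        self._storage = StrictRedis(host=host, port=port)  # remote key-value backend

class MemoryDataStorage(DataStorage):  # hypothetical name
    def __init__(self, level):
        self._storage = {}  # plain in-process dict backend
        DataStorage.__init__(self, level)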