Example #1
    def test_model(self, words):
        """
        测试词向量模型,查找每个词语最相近的 10个词
        :param words: 待测试的词语列表
        :return:
        """
        if not os.path.exists(self.model_path):
            LOG.error('Model file does not exist; please train the model first')
            exit(1)

        if not isinstance(words, list):
            words = [words]

        LOG.info('Testing model with words %s' % str(words))

        # Load the word2vec model
        mdl = word2vec.Word2Vec.load(self.model_path)

        for word in words:
            try:
                # Find the 10 words most similar to word
                tops = mdl.wv.most_similar(word, topn=10)
                print('\nWords most related to "%s":' % word)
                cpprint(tops)
            # Catch and skip exceptions, e.g. the KeyError raised when word is not in the vocabulary
            except Exception as e:
                print('\nError:', repr(e))
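For context, the lookup above reduces to a few lines of gensim; a minimal standalone sketch (the model path here is hypothetical):

from gensim.models import word2vec

mdl = word2vec.Word2Vec.load('word2vec.model')  # hypothetical saved model
print(mdl.wv.most_similar('weather', topn=10))  # raises KeyError for out-of-vocabulary words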
Example #2
    def test_parser(self):
        lexer = Lexer('1 + 2 * (3 + 4)')
        parser = Parser(lexer, 10)
        ast = parser.expr()
        prettyprinter.cpprint(dataclasses.asdict(ast))
        # ast.walk()
        Visitor().visit(ast)
Example #3
    def handle(self, *args, **options):
        # Headless Chrome (renamed from `options` to avoid shadowing the **options kwarg)
        chrome_options = Options()
        chrome_options.add_argument('--headless')
        browser = webdriver.Chrome(chrome_options=chrome_options)
        try:
            logged_in = login(browser)
            if logged_in is False:
                # login() failed; quit the original browser, not the False return value
                print('Error: failed to login to RMS')
                browser.quit()
                sys.exit(1)
            browser = logged_in
            data = get_rmstop(browser)
            get_rms_detail(browser, data)
            printdata(data)
            cpprint(data)
            if adddata(data) is False:
                print('data {0} already exists.'.format(data['date']))
                sys.exit(1)
            browser.quit()
        except Exception:
            browser.quit()
            import traceback
            print('error.')
            traceback.print_exc()
            sys.exit(1)
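Note that the chrome_options keyword was removed in Selenium 4; a minimal headless setup against the current API might look like this (a sketch, independent of the example above):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

opts = Options()
opts.add_argument('--headless=new')        # plain '--headless' on older Chrome builds
browser = webdriver.Chrome(options=opts)   # Selenium 4 uses options=, not chrome_options=
browser.get('https://example.com')
browser.quit()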
Example #4
def test_pretty_json():
    with open('tests/sample_json.json') as f:
        data = json.load(f)

    print('native pprint')
    nativepprint(data)
    print('prettyprinter')
    cpprint(data)
Example #5
def main():
    try:
        user = get_user()
        cpprint(user)

    except APIUnreachableException as my_e:
        print("le serveur est injoignable", str(my_e), my_e.custom_message)
    except HttpNotFound:
        print("L'information n'existe pas")
Example #6
def main():
    try:
        user = get_user()
        cpprint(user)
        # print(f"{user.titre} {user.nom_complet}")
    except APIUnreachableException:
        print("L'API est injoignable.")
    except HttpNotFound:
        print("l'URL n'exite pas")
Example #7
    def __init__(self):
        super(MF, self).__init__()
        self.config = ConfigX()
        cpprint(self.config.__dict__)  # print the configuration

        # self.rg = RatingGetter()  # loading rating data
        # self.init_model()
        self.iter_rmse = []
        self.iter_mae = []
        pass
Example #8
    def colorize(self, s) -> str:
        assert isinstance(s, str), s
        _stream = io.StringIO()
        try:
            # If s parses as a Python literal, pretty-print the parsed value
            litval = literal_eval(s)
            cpprint(litval, stream=_stream)
        except (ValueError, SyntaxError):
            # Not a literal: pretty-print the raw string instead
            cpprint(s, stream=_stream)
        rv = _stream.getvalue()
        _stream.close()
        return rv
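Stripped of the class, the trick above is just cpprint writing into a StringIO; a minimal self-contained sketch:

import io
from ast import literal_eval
from prettyprinter import cpprint

buf = io.StringIO()
cpprint(literal_eval("{'a': [1, 2, 3]}"), stream=buf)  # colorized output captured as text
print(buf.getvalue())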
Example #9
def main():
    """The main function."""
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument('--config',
                    default=str(Path.home() / '.philipshue.ini'),
                    help='the config file location')
    args = ap.parse_args()

    while True:
        try:
            cp = configparser.ConfigParser()
            cp.read(args.config)
            cf = cp['DEFAULT']
            bridge_location = cf['bridge_location']
            bridge_username = cf['bridge_username']
        except KeyError:
            setup(args.config)
            continue
        break

    print(f'Connecting to {bridge_location}...')
    try:
        bridge = qhue.Bridge(bridge_location, bridge_username)
        num_lights = len(bridge.lights())
        print(f'Connected to {bridge_location}. {num_lights} lights found.')
    except requests.ConnectionError as err:
        print(f'{sgr(1, 31)}{err.__class__.__name__}{sgr(0)}: {err}')
        sys.exit(1)

    session = PromptSession(
        '> ',
        lexer=PygmentsLexer(Python3Lexer),
        style=style_from_pygments_cls(PYGMENTS_STYLE),
        auto_suggest=AutoSuggestFromHistory(),
        input_processors=[HighlightMatchingBracketProcessor('()[]{}')],
        history=FileHistory(Path.home() / '.philipshue.hist'))
    while True:
        try:
            cmd = session.prompt()
            start = time.perf_counter()
            out = exec_cmd(cmd, bridge=bridge)
            time_taken = time.perf_counter() - start
            prettyprinter.cpprint(out)
            print(f'Time taken: {sgr(1, 34)}{time_taken*1000:.3f} ms{sgr(0)}')
        except KeyboardInterrupt:
            pass
        except EOFError:
            break
        except requests.ConnectionError as err:
            print(f'{sgr(1, 31)}{err.__class__.__name__}{sgr(0)}: {err}')
            sys.exit(1)
        except Exception as err:
            print(f'{sgr(1, 31)}{err.__class__.__name__}{sgr(0)}: {err}')
Example #10
    def create_from_api(cls, response):
        first_resp = response.get('results')[0]
        cpprint(response)

        gender = first_resp.get('gender')
        title = first_resp.get('name').get('title')
        firstname = first_resp.get('name').get('first')
        lastname = first_resp.get('name').get('last')
        email = first_resp.get('email')
        username = first_resp.get('login').get('username')

        return User(gender, title, firstname, lastname, email, username)
Example #11
def main():
    signal(SIGINT, signal_handle)
    options_dict = args(usage)
    cpprint(options_dict)
    tc_handle(**options_dict)()
    gol._init()
    t = threading.Thread(target=exec, args=())
    t.daemon = True  # setDaemon() is deprecated; assign the attribute instead
    t.start()
    time.sleep(2)
    global win
    win = init_scr()
    win()
Example #12
def main():
    parser = configparser.ConfigParser()
    parser.read(['config.ini'])
    urls = set(parser.sections())
    urls.remove('Planet')

    result = asyncio.run(amain(urls=list(urls)))
    prettyprinter.cpprint(result)
    for section in result:
        parser.remove_section(section)

    with open('config.modified.ini', 'w') as fp:
        parser.write(fp)
Example #13
def amv_show_all_styles(data=None):
    from prettyprinter import cpprint
    from pygments import styles

    if data is None:
        data = _get_testdata()

    for style in styles.get_all_styles():
        print(f"{style}:")
        try:
            cpprint(data, style=styles.get_style_by_name(style))
        except Exception as exc:
            print(repr(exc))
        print()
Example #14
def setup(config):
    resp = requests.get('https://discovery.meethue.com')
    print('Detected Philips Hue Bridges:')
    prettyprinter.cpprint(resp.json())
    session = PromptSession()
    location = session.prompt('Enter the Bridge IP address: ')
    username = qhue.create_new_username(location)
    cp = configparser.ConfigParser()
    cp.read(config)
    cf = cp['DEFAULT']
    cf['bridge_location'] = location
    cf['bridge_username'] = username
    with open(config, 'w') as configfile:
        cp.write(configfile)
Example #15
def test_large_data_performance():
    data = [{'text': 'lorem ipsum dolor sit amet ' * 500}] * 200
    stream = StringIO()

    start = datetime.datetime.now()
    cpprint(data, stream=stream)
    stream.getvalue()

    end = datetime.datetime.now()
    took = end - start
    print('took', took)
    # The bottleneck is in string to doc conversion,
    # specifically escaping strings many times.
    # There's probably more we can do here
    assert took < datetime.timedelta(seconds=13)
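If only the formatted string is needed, prettyprinter's pformat returns it directly and makes the same measurement slightly simpler; a sketch over the same data:

import datetime
from prettyprinter import pformat

data = [{'text': 'lorem ipsum dolor sit amet ' * 500}] * 200
start = datetime.datetime.now()
text = pformat(data)  # returns the formatted string instead of writing to a stream
print('took', datetime.datetime.now() - start)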
Example #16
    def __init__(self, fixseed = True):
        super(MF, self).__init__()
        self.config = ConfigX()
        self.configc = ConfigCUC()
        cpprint(self.config.__dict__)  # print the configuration
        # Print statistics for the data files
        print_data_file_stats(self.config.rating_path)
        print_data_file_stats(self.config.trust_path)

        if fixseed:
            np.random.seed(seed=self.config.random_state)  # fix the random seed

        # self.rg = RatingGetter()  # loading rating data
        # self.init_model()
        self.iter_rmse = []
        self.iter_mae = []
        pass
Example #17
def dump(*args):
    """Dump variables using prettyprinter"""

    # Detect if running in pytest
    if "pytest" in sys.modules: level = None

    for arg in args:
        if isinstance(arg, str):
            # I don't want string printed with dump because it adds quotes to the string
            # which seems confusing at times.
            #prettyprinter.cpprint(arg, width=10000, ribbon_width=10000)
            print(arg)
        else:
            width = 120
            if uvicore.config:
                if uvicore.config.app.dump.width:
                    width = uvicore.config.app.dump.width
            prettyprinter.cpprint(arg, width=width, ribbon_width=width)
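Hypothetical calls to the dump() helper above: strings go through plain print(), everything else through cpprint at the configured width:

dump('checkpoint reached')           # printed verbatim, without quotes
dump({'id': 1, 'tags': ['a', 'b']})  # pretty-printed by prettyprinter
dump('label:', [1, 2, 3])            # mixed arguments are handled one by one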
Example #18
def eth2json(eth):
    if eth:
        print(eth)
        eth = eth.encode().decode('unicode_escape').encode(
            'raw_unicode_escape').decode()
        #a = b"\xe8\xaf\xad\xe6\x96\x87"
        #print(str(a, "utf-8"))
        try:
            eth = json.loads(eth, encoding="utf-8")
            return cpprint(eth)
        except Exception as e:
            print(e)
    return ''
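The encode/decode chain above repairs text whose UTF-8 bytes leaked through as literal \xNN escape sequences; a small self-contained demonstration:

garbled = '\\xe8\\xaf\\xad\\xe6\\x96\\x87'   # the UTF-8 bytes of '语文' as literal escapes
fixed = (garbled.encode()
         .decode('unicode_escape')           # '\xe8' escapes become U+00E8 characters
         .encode('raw_unicode_escape')       # those characters map back to raw bytes
         .decode())                          # decode the recovered bytes as UTF-8
print(fixed)  # 语文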
Example #19
def gen_list_of_sections_and_html_files(source_folder_path):
    toc = get_toc(source_folder_path)
    html_files_list = []  # list of dicts
    html_folder_path = os.path.join(source_folder_path, "_build", "html")

    # Add the root for the jupyterbook > 0.12 toc config
    if "root" in toc:
        html_file_path = os.path.join(html_folder_path,
                                      str(toc["root"]) + ".html")
        html_files_list.append({
            "section_name": "Introduction",
            "html_file_path": html_file_path
        })

    # TODO There are several jupyterbook _toc configurations that are possible
    if "parts" in toc:
        parts = toc["parts"]
    else:
        logger.warning(
            "Key 'parts' not present in _toc. Please convert your _toc to the new jupyterbook format with format jb-book."
        )
        exit(1)

    for item in parts:
        if ("chapters" in item.keys()
            ):  # will exclude intro file from the transfer to zendesk
            # section = item["part"]
            section = item["caption"]
            files = item["chapters"]
            for f in files:
                filename = f["file"]
                html_file_path = os.path.join(html_folder_path,
                                              str(filename) + ".html")
                html_files_list.append({
                    "section_name": section,
                    "html_file_path": html_file_path
                })
    # logger.info(f"Final List of html files to be sent to Zendesk: \n {html_files_list}")
    logger.debug(pformat(html_files_list))  # cpprint() returns None; prettyprinter.pformat returns the string to log
    return html_files_list
Example #20
    def prettyprinter_displayhook(value):
        if value is None:
            return

        builtins._ = None
        stream = StringIO()
        cpprint(value,
                width=get_terminal_width(default=79),
                stream=stream,
                end='')  # cpprint writes to the stream and returns None
        output = stream.getvalue()

        try:
            sys.stdout.write(output)
        except UnicodeEncodeError:
            encoded = output.encode(sys.stdout.encoding, 'backslashreplace')
            if hasattr(sys.stdout, 'buffer'):
                sys.stdout.buffer.write(encoded)
            else:
                text = encoded.decode(sys.stdout.encoding, 'strict')
                sys.stdout.write(text)

        sys.stdout.write('\n')
        builtins._ = value
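To activate a hook like this in a REPL, assign it to sys.displayhook (a sketch; how the surrounding module wires it up is not shown above):

import sys

sys.displayhook = prettyprinter_displayhook  # every expression result is now pretty-printed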
Example #21
def test_all_python_values(value):
    cpprint(value)
Example #22
    def __init__(self):
        super(GEMF, self).__init__()
        self.rg = RatingGetter()
        ex_file = 'yp_trust'
        self.explict_trust_path = '../data/net/' + ex_file + '.txt'

        weight = 0.5
        # file = '%s_weight_%s' % (self.config.dataset_name, weight)
        file = 'yp_CUnet_weight'
        self.implict_trust_path = '../data/net/' + file + '.txt'
        # file = '%s_CUnet_weight_nnn' % self.config.dataset_name
        # file = '%s_less_CUnet_weight' % self.config.dataset_name
        # self.implict_trust_path = '../data/' + file + '.txt'
        # self.implict_trust_path = '../data/yp_30_39_rating_im_net_new.txt'  # ft_3 & db_13 & ca_16 & yp_30_39 # & ca_23 & db_18

        ############## 1 ################
        # ex_file = '%s_filter_trust_new' % self.config.dataset_name
        # file = '%s_CUnet_weight_new' % self.config.dataset_name
        # self.implict_trust_path = '../data/' + file + '.txt'
        # self.explict_trust_path = '../data/' + ex_file + '.txt'
        ############## 2 ################
        # file = 'ft_3_rating_im_net'
        # file = 'ft_3_rating_im_net_new' # ft_3 & db_18 & ca_23 & yp_30_39 for new
        # self.implict_trust_path = '../data/' + file + '.txt'
        ############## 3 ################
        # weight = 0.3
        # file = '%s_two_net_with_weight_%s_rewrited' % (self.config.dataset_name, weight)
        # file = '%s_two_net_with_weight_%s_new_rewrited' % (self.config.dataset_name, weight)
        # self.implict_trust_path = '../data/%s_two_net/' % self.config.dataset_name + file + '.txt'
        ############## 4 ################
        # file = '%s_two_net_with_tanh_rewrited' % (self.config.dataset_name)
        # file = '%s_two_net_with_tanh_new_rewrited' % (self.config.dataset_name)
        # self.implict_trust_path = '../data/%s_two_net/' % self.config.dataset_name + file + '.txt'
        ############## 5 ################
        # file = '%s_inter_net' % self.config.dataset_name
        # file = '%s_union_net' % self.config.dataset_name
        # file = '%s_union_net_expanded' % self.config.dataset_name
        # file = '%s_inter_net_new' % self.config.dataset_name
        # file = '%s_union_net_new' % self.config.dataset_name
        # file = '%s_union_net_new_expanded' % self.config.dataset_name
        # self.implict_trust_path = '../data/%s_two_net/' % self.config.dataset_name + file + '.txt'

        # parameters for matrix factorization
        self.config.lr = 0.01
        self.config.lambdaP = 0.03  #0.03
        self.config.lambdaQ = 0.01  #0.01
        self.config.lambdaB = 0.01  #0.01
        self.config.temp1 = 0.01
        self.config.temp2 = 0.01
        self.config.alpha = self.config.temp1
        self.config.beta = self.config.temp2
        self.config.factor = 10
        self.config.isEarlyStopping = True
        self.config.k_fold_num = 5

        # parameters for netwalker
        self.config.random_state = 0
        self.config.number_walks = 30  # the times of random walk 5
        self.config.path_length = 20  # the length of random walk 10
        self.config.restart_pro = 0.1  # the probability of restarts.
        self.config.undirected = True
        self.config.ex_walk_result_path = '../data/ge/' + ex_file + '_social_corpus_filter.txt'
        self.config.im_walk_result_path = '../data/ge/' + file + '_social_corpus_implict.txt'
        # parameters for graph embedding
        self.config.lambdaW = 1
        self.config.ex_table_path = '../data/ge/' + ex_file + '_table_filter.pkl'
        self.config.ex_model_out_path = '../data/ge/' + ex_file + '_result_filter.txt'
        self.config.im_table_path = '../data/ge/' + file + '_table_implict.pkl'
        self.config.im_model_out_path = '../data/ge/' + file + '_result_implict.txt'
        self.config.cbow = 0
        self.config.neg = 5
        self.config.w2v_lr = 0.01  # 0.01-0.81
        self.config.win_size = 10
        self.config.min_count = 3
        self.config.binary = 0

        self.dataSet_u = defaultdict(dict)
        self.dataSet_i = defaultdict(dict)
        self.filteredRatings = defaultdict(list)
        self.CUNet = defaultdict(list)
        self.walks = []
        self.ex_walks = []
        self.im_walks = []
        # self.visited = defaultdict(dict)

        self.ex_pos_loss_total = 0
        self.ex_neg_loss_total = 0
        self.im_pos_loss_total = 0
        self.im_neg_loss_total = 0

        # cpprint('k is %s' % self.config.near_num)
        cpprint('implict_trust_path is %s' % self.implict_trust_path)
        cpprint('explict_trust_path is %s' % self.explict_trust_path)
        cpprint('lr is %s' % self.config.lr)
        cpprint('neg is %s' % self.config.neg)
        cpprint('w2v_lr is %s' % self.config.w2v_lr)
        cpprint('win_size is %s' % self.config.win_size)
        cpprint('alpha is %s' % self.config.alpha)
        cpprint('beta is %s' % self.config.beta)
        cpprint('lambdaP is %s' % self.config.lambdaP)
        cpprint('lambdaQ is %s' % self.config.lambdaQ)
        cpprint('number_walks is %s' % self.config.number_walks)
        cpprint('path_length is %s' % self.config.path_length)
        # cpprint('factor is %s' % self.config.factor)

        self.init_model()
Example #23
            pass
    """
    def only_rate_limit(self):
        self.ip.tc('add', 'tbf', self.nic, 0x100000, parent=0x10010, rate=self.rate+'kbit', burst=1024 * 2, latency='200ms')

    def only_no_rate_limit(self):
        self.ip.tc('add', 'netem', self.nic, 0x100000, parent=0x10010, loss=30)
    """
    def __call__(self):
        self.flush_instance()
        if not self.flush:
            self.ip.tc('add', 'htb', self.nic, 0x10000, default=0x200000)
            self.ip.tc('add-class', 'htb', self.nic, 0x10001, parent=0x10000, rate='1000mbit', prio=4)
            #print(self.rate)
            self.ip.tc('add-class', 'htb', self.nic, 0x10010, parent=0x10001, rate=self.rate+'kbit', prio=3)
            self.ip.tc('add-class', 'htb', self.nic, 0x10020, parent=0x10001, rate='700mbit', prio=2)
            if self.loss or self.delay:
                #print(self.delay)
                self.ip.tc('add', 'netem', self.nic, 0x100000, parent=0x10010, loss=self.loss, delay=self.delay)
            else:
                self.ip.tc('add', 'tbf', self.nic, 0x100000, parent=0x10010, rate=self.rate+'kbit', burst=1024 * 2, latency='200ms')
            self.ip.tc('add', 'sfq', self.nic, 0x200000, parent=0x10020, perturb=10)
            # pyroute2 has a bug: it mis-parses socket-family protocols (AF_INET should parse as IPv4 but comes out as ax25, and AF_AX25 comes out as "all"), so we go along with the bug and pass AF_AX25 here; the protocols module gives the same result
            self.ip.tc('add-filter', 'u32', self.nic, parent=0x10000, prio=1, protocol=socket.AF_AX25, target=0x10010, keys=self.keys)
    

if __name__ == "__main__":
    options_dict = args(usage)
    cpprint(options_dict)
    tc_handle(**options_dict)()
Example #24
def train(cfg):
    # hyperparameters
    global optimizer, criterion, scheduler
    SEED = cfg.values.seed
    seed_everything(SEED)
    MODEL_NAME = cfg.values.model_name
    USE_KFOLD = cfg.values.val_args.use_kfold
    TSVFILE = cfg.values.tsvfile
    log_interval = cfg.values.train_args.log_interval
    weight_decay = cfg.values.train_args.weight_decay
    tr_batch_size = cfg.values.train_args.train_batch_size
    val_batch_size = cfg.values.train_args.eval_batch_size
    epochs = cfg.values.train_args.num_epochs
    loss_type = cfg.values.train_args.loss_fn
    lr_decay_step = 1 #stepLR parameter
    steplr_gamma = cfg.values.train_args.steplr_gamma
    opti = cfg.values.train_args.optimizer
    scheduler_type = cfg.values.train_args.scheduler_type
    label_smoothing_factor = cfg.values.train_args.label_smoothing_factor

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print(f'device is "{device}"')
    print(MODEL_NAME)



    if 'koelectra' in MODEL_NAME:
        model_config = ElectraConfig.from_pretrained(MODEL_NAME)
    # elif 'roberta' in MODEL_NAME:
    #     model_config = RobertaConfig.from_pretrained(MODEL_NAME)
        # model_config.is_decoder = True
    else:
        model_config = AutoConfig.from_pretrained(MODEL_NAME)
    model_config.num_labels = 42

    train_dataset = load_data("../input/data/train/" + TSVFILE)
    train_label = train_dataset['label'].values

    if MODEL_NAME=='KoBertTokenizer':
        tokenizer = KoBertTokenizer.from_pretrained(MODEL_NAME)
    elif 'koelectra' in MODEL_NAME:
        tokenizer = ElectraTokenizer.from_pretrained(MODEL_NAME)
    # elif 'roberta' in MODEL_NAME:
    #     tokenizer = RobertaTokenizerFast.from_pretrained(MODEL_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)



    # compute the class weights
    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(train_label), y=train_label)
    weights = torch.tensor(class_weights, dtype=torch.float)
    weights = weights.to(device)


    if loss_type == 'custom':  # F1 + Cross_entropy
        criterion = CustomLoss()
    elif loss_type == 'labelsmooth':
        criterion = LabelSmoothingLoss(smoothing=label_smoothing_factor)
    elif loss_type == 'CEloss':
        criterion = nn.CrossEntropyLoss(weight=weights)
    elif loss_type == 'focal':
        criterion = FocalLoss()


    if USE_KFOLD:
        kfold = StratifiedKFold(n_splits=cfg.values.val_args.num_k)
        k=1
        save_dir = increment_output_dir(cfg.values.train_args.output_dir)
        for idx, (trind, valind) in enumerate(kfold.split(train_dataset, train_label)):
            if idx != 4:
                continue
            print('\n')
            cpprint('=' * 15 + f'{k}-Fold Cross Validation' + '=' * 15)
            tr_label = train_dataset['label'].iloc[trind].values
            val_label = train_dataset['label'].iloc[valind].values

            tr_dataset = train_dataset.iloc[trind]
            val_dataset = train_dataset.iloc[valind]
            # tokenizing dataset
            tokenized_train = tokenized_dataset(tr_dataset, tokenizer)
            tokenized_dev = tokenized_dataset(val_dataset, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train, tr_label)
            RE_dev_dataset = RE_Dataset(tokenized_dev, val_label)

            train_loader = DataLoader(RE_train_dataset, batch_size=tr_batch_size, shuffle=True)
            val_loader = DataLoader(RE_dev_dataset, batch_size=val_batch_size, shuffle=True)

            if 'koelectra' in MODEL_NAME:
                model = ElectraForSequenceClassification.from_pretrained(MODEL_NAME,
                                                                           config=model_config)
            else:
                model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME,
                                                                           config=model_config)
            model.to(device)
            model_dir = save_dir + f'/{k}fold'

            # OPTIMIZER

            if scheduler_type == 'CCAWR':
                opt_module = getattr(import_module("torch.optim"), opti)  # default: Adam
                optimizer = opt_module(
                    filter(lambda p: p.requires_grad,
                           model.parameters()),
                    lr=5e-7, #cfg.values.train_args.lr * 0.0001,  # 5e-6,
                    weight_decay=weight_decay
                )
                scheduler = CustomCosineAnnealingWarmUpRestarts(optimizer,
                                                          T_0=2,
                                                          T_mult=2,
                                                          eta_max=cfg.values.train_args.lr,
                                                          T_up=1,
                                                          gamma=0.8,
                                                          last_epoch=-1)
            elif scheduler_type == 'stepLr':
                opt_module = getattr(import_module("torch.optim"), opti)  # default: Adam
                optimizer = opt_module(
                    filter(lambda p: p.requires_grad,
                           model.parameters()),
                    lr=cfg.values.train_args.lr,  # 5e-6,
                    weight_decay=weight_decay
                )
                scheduler = StepLR(optimizer, lr_decay_step, gamma=steplr_gamma)  # gamma: 20 epochs => lr x 0.01

            elif scheduler_type == 'cycleLR':
                opt_module = getattr(import_module("torch.optim"), opti)  # default: Adam
                optimizer = opt_module(
                    filter(lambda p: p.requires_grad,
                           model.parameters()),
                    lr=cfg.values.train_args.lr,  # 5e-6,
                    weight_decay=weight_decay
                )
                scheduler = CyclicLR(optimizer,
                                     base_lr=0.000000001,
                                     max_lr=cfg.values.train_args.lr,
                                     step_size_up=1,
                                     step_size_down=4,
                                     mode='triangular',
                                     cycle_momentum=False)#triangular2
            logger = SummaryWriter(log_dir=model_dir)

            best_val_acc = 0
            best_val_loss = np.inf

            for epoch in range(epochs):
                model.train()
                loss_value = 0
                matches = 0
                for idx, batch in enumerate(train_loader):
                    optimizer.zero_grad()

                    input_ids = batch['input_ids'].to(device)
                    # token_type_ids = batch['token_type_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['labels'].to(device)
                    outputs = model(input_ids,
                                    # token_type_ids=token_type_ids,
                                    attention_mask=attention_mask,
                                    labels=labels)
                    loss = criterion(outputs[1], labels)
                    loss_value += loss.item()
                    preds = torch.argmax(F.log_softmax(outputs[1], dim=1), dim=-1)
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                    matches += (preds == labels).sum().item()
                    if (idx + 1) % log_interval == 0:
                        train_loss = loss_value / log_interval
                        train_acc = matches / tr_batch_size / log_interval
                        current_lr = get_lr(optimizer)
                        print(
                            f"Epoch[{epoch}/{epochs}]({idx + 1}/{len(train_loader)}) || "
                            f"training loss {train_loss:4.4} || training accuracy {train_acc:4.2%} || lr {current_lr}"
                        )
                        logger.add_scalar("Train/loss", train_loss, epoch * len(train_loader) + idx)
                        logger.add_scalar("Train/accuracy", train_acc, epoch * len(train_loader) + idx)
                        logger.add_scalar("Train/lr", current_lr, epoch * len(train_loader) + idx)
                        loss_value = 0
                        matches = 0
                optimizer.step()
                if scheduler is not None:
                    scheduler.step()

                model.eval()
                with torch.no_grad():
                    print("Calculating validation results...")

                    val_loss_items = []
                    val_acc_items = []
                    for idx, val_batch in enumerate(val_loader):
                        input_ids = val_batch['input_ids'].to(device)
                        # token_type_ids = val_batch['token_type_ids'].to(device)
                        attention_mask = val_batch['attention_mask'].to(device)
                        labels = val_batch['labels'].to(device)
                        outputs = model(input_ids,
                                        # token_type_ids=token_type_ids,
                                        attention_mask=attention_mask,
                                        labels=labels)
                        preds = torch.argmax(F.log_softmax(outputs[1], dim=1), dim=-1)
                        loss_item = outputs[0].item()
                        correct = preds.eq(labels)
                        acc_item = correct.sum().item()

                        val_loss_items.append(loss_item)
                        val_acc_items.append(acc_item)
                    val_loss = np.sum(val_loss_items) / len(val_loader)
                    val_acc = np.sum(val_acc_items) / len(val_label)
                    best_val_loss = min(best_val_loss, val_loss)

                    if val_acc > best_val_acc:
                        print(f"New best model for val accuracy : {val_acc:4.2%}! saving the best model..")
                        torch.save(model.state_dict(), f"./{model_dir}/best.pt")
                        best_val_acc = val_acc
                    torch.save(model.state_dict(), f"./{model_dir}/last.pt")
                    print(
                        f"[Val] acc : {val_acc:4.2%}, loss: {val_loss:4.2} || "
                        f"best acc : {best_val_acc:4.2%}, best loss: {best_val_loss:4.2}"
                    )
                    logger.add_scalar("Val/loss", val_loss, epoch)
                    logger.add_scalar("Val/accuracy", val_acc, epoch)
                    print()
            with open(f"./{model_dir}/config.yaml", 'w') as file:
                documents = yaml.dump(cfg.values, file)

            k += 1
            if cfg.values.val_args.fold_break:
                break
Example #25
client_do = dospacesboto3.digital_ocean_client_init()
print('DO client initialized')

# print("Here is a list of the files contained in the spaces")
list_of_files = dospacesboto3.get_list_of_files(client_do,
                                                s_spaces_name,
                                                folder_prefix='Pictures')
# TODO: print the file list better
# print(list_of_files)
# cpprint(list_of_files)

print('Get list of files using pagination')
list_of_files_pagination = dospacesboto3.get_list_of_files_using_pagination(
    client_do, s_spaces_name, folder_prefix='new-folder')
# print(list_of_files)
cpprint(list_of_files_pagination)

print("Uploading file")

dospacesboto3.upload_file(client_do, s_local_file_path, s_spaces_name,
                          'new-folder/polarbear_1920x1080.jpeg')

print("File uploaded to DigitalOcean")

print('Downloading file')

dospacesboto3.download_file(client_do, s_local_file_name, s_spaces_name,
                            'new-folder/polarbear_1920x1080.jpeg')

print('File downloaded')
Example #26
import yaml
import io
from prettyprinter import cpprint

with open("specs/0.1/api.yaml", 'r') as stream:
    model = yaml.safe_load(stream)

cpprint(model)

version = model.get('version')

kresources = list(model.get('resources').keys())
kmodels = list(model.get('models').keys())
kapis = list(model.get('apis').keys())
kendpoints = list(model.get('endpoints').keys())

cpprint(version)
cpprint('apis: ' + str(kapis))
cpprint('endpoints: ' + str(kendpoints))
cpprint('models: ' + str(kmodels))
cpprint('resources: ' + str(kresources))
Example #27
def train(cfg):
    SEED = cfg.values.seed
    MODEL_NAME = cfg.values.model_name
    USE_KFOLD = cfg.values.val_args.use_kfold
    TRAIN_ONLY = cfg.values.train_only

    seed_everything(SEED)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # model_config_module = getattr(import_module('transformers'), cfg.values.model_arc + 'Config')
    model_config = AutoConfig.from_pretrained(MODEL_NAME)
    model_config.num_labels = 42

    whole_df = load_data("/opt/ml/input/data/train/train.tsv")
    additional_df = load_data("/opt/ml/input/data/train/additional_train.tsv")

    whole_label = whole_df['label'].values
    # additional_label = additional_df['label'].values

    if cfg.values.tokenizer_arc:
        tokenizer_module = getattr(import_module('transformers'),
                                   cfg.values.tokenizer_arc)
        tokenizer = tokenizer_module.from_pretrained(MODEL_NAME)
    else:
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    early_stopping = EarlyStoppingCallback(early_stopping_patience=9999999,
                                           early_stopping_threshold=0.001)

    training_args = TrainingArguments(
        output_dir=cfg.values.train_args.output_dir,  # output directory
        save_total_limit=cfg.values.train_args.save_total_limit,  # number of total save model.
        save_steps=cfg.values.train_args.save_steps,  # model saving step.
        num_train_epochs=cfg.values.train_args.num_epochs,  # total number of training epochs
        learning_rate=cfg.values.train_args.lr,  # learning_rate
        per_device_train_batch_size=cfg.values.train_args.train_batch_size,  # batch size per device during training
        per_device_eval_batch_size=cfg.values.train_args.eval_batch_size,  # batch size for evaluation
        warmup_steps=cfg.values.train_args.warmup_steps,  # number of warmup steps for learning rate scheduler
        weight_decay=cfg.values.train_args.weight_decay,  # strength of weight decay
        max_grad_norm=cfg.values.train_args.max_grad_norm,
        logging_dir=cfg.values.train_args.logging_dir,  # directory for storing logs
        logging_steps=cfg.values.train_args.logging_steps,  # log saving step.
        evaluation_strategy=cfg.values.train_args.evaluation_strategy,  # evaluation strategy to adopt during training
        # `no`: No evaluation during training.
        # `steps`: Evaluate every `eval_steps`.
        # `epoch`: Evaluate every end of epoch.
        eval_steps=cfg.values.train_args.eval_steps,  # evaluation step.
        dataloader_num_workers=4,
        seed=SEED,
        label_smoothing_factor=cfg.values.train_args.label_smoothing_factor,
        load_best_model_at_end=True,
        # metric_for_best_model='accuracy'
    )

    if USE_KFOLD:
        kfold = StratifiedKFold(n_splits=cfg.values.val_args.num_k)

        k = 1
        for train_idx, val_idx in kfold.split(whole_df, whole_label):
            print('\n')
            cpprint('=' * 15 + f'{k}-Fold Cross Validation' + '=' * 15)
            train_df = whole_df.iloc[train_idx]
            # train_df = pd.concat((train_df, additional_df))
            val_df = whole_df.iloc[val_idx]

            if cfg.values.model_arc == 'Roberta':
                tokenized_train = roberta_tokenized_dataset(
                    train_df, tokenizer)
                tokenized_val = roberta_tokenized_dataset(val_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(train_df, tokenizer)
                tokenized_val = tokenized_dataset(val_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train,
                                          train_df['label'].values)
            RE_val_dataset = RE_Dataset(tokenized_val, val_df['label'].values)

            try:
                if cfg.values.model_name == 'Bert':
                    model = BertForSequenceClassification.from_pretrained(
                        MODEL_NAME, config=model_config)
                else:
                    model = AutoModelForSequenceClassification.from_pretrained(
                        MODEL_NAME, config=model_config)
            except:
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(
                    import_module('transformers'),
                    cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME,
                                                     config=model_config)

            model.to(device)

            training_args.output_dir = cfg.values.train_args.output_dir + f'/{k}fold'
            training_args.logging_dir = cfg.values.train_args.output_dir + f'/{k}fold'

            optimizer = MADGRAD(model.parameters(),
                                lr=training_args.learning_rate)
            total_step = len(RE_train_dataset) / training_args.per_device_train_batch_size * training_args.num_train_epochs
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,  # the instantiated 🤗 Transformers model to be trained
                args=training_args,  # training arguments, defined above
                train_dataset=RE_train_dataset,  # training dataset
                eval_dataset=RE_val_dataset,  # evaluation dataset
                compute_metrics=compute_metrics,  # define metrics function
                optimizers=optimizers,
                # callbacks=[early_stopping]
            )
            k += 1
            # train model
            trainer.train()

    else:
        cpprint('=' * 20 + 'START TRAINING' + '=' * 20)
        if not TRAIN_ONLY:
            train_df, val_df = train_test_split(
                whole_df,
                test_size=cfg.values.val_args.test_size,
                random_state=SEED)
            # train_df = pd.concat((train_df, additional_df))

            if cfg.values.model_arc == 'Roberta':
                tokenized_train = roberta_tokenized_dataset(
                    train_df, tokenizer)
                tokenized_val = roberta_tokenized_dataset(val_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(train_df, tokenizer)
                tokenized_val = tokenized_dataset(val_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train,
                                          train_df['label'].values)
            RE_val_dataset = RE_Dataset(tokenized_val, val_df['label'].values)

            try:
                if cfg.values.model_name == 'Bert':
                    model = BertForSequenceClassification.from_pretrained(
                        MODEL_NAME, config=model_config)
                else:
                    model = AutoModelForSequenceClassification.from_pretrained(
                        MODEL_NAME, config=model_config)
            except:
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(
                    import_module('transformers'),
                    cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME,
                                                     config=model_config)

            model.to(device)

            optimizer = transformers.AdamW(model.parameters(),
                                           lr=training_args.learning_rate)
            total_step = len(RE_train_dataset) / training_args.per_device_train_batch_size * training_args.num_train_epochs
            # scheduler = transformers.get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=training_args.warmup_steps, num_training_steps=total_step)
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,  # the instantiated 🤗 Transformers model to be trained
                args=training_args,  # training arguments, defined above
                train_dataset=RE_train_dataset,  # training dataset
                eval_dataset=RE_val_dataset,  # evaluation dataset
                compute_metrics=compute_metrics,  # define metrics function
                optimizers=optimizers,
                callbacks=[early_stopping])

            # train model
            trainer.train()

        else:
            training_args.evaluation_strategy = 'no'

            if cfg.values.model_arc == 'Roberta':
                print('Roberta')
                tokenized_train = roberta_tokenized_dataset(
                    whole_df, tokenizer)
            else:
                tokenized_train = tokenized_dataset(whole_df, tokenizer)

            RE_train_dataset = RE_Dataset(tokenized_train,
                                          whole_df['label'].values)

            try:
                model = AutoModelForSequenceClassification.from_pretrained(
                    MODEL_NAME, config=model_config)
            except:
                # model_module = getattr(import_module('transformers'), cfg.values.model_arc)
                model_module = getattr(
                    import_module('transformers'),
                    cfg.values.model_arc + 'ForSequenceClassification')
                model = model_module.from_pretrained(MODEL_NAME,
                                                     config=model_config)

            model.to(device)

            training_args.output_dir = cfg.values.train_args.output_dir + '/only_train'
            training_args.logging_dir = cfg.values.train_args.output_dir + '/only_train'

            optimizer = AdamP(model.parameters(),
                              lr=training_args.learning_rate)
            total_step = len(RE_train_dataset) / training_args.per_device_train_batch_size * training_args.num_train_epochs
            scheduler = transformers.get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,  # the instantiated 🤗 Transformers model to be trained
                args=training_args,  # training arguments, defined above
                train_dataset=RE_train_dataset,  # training dataset
                optimizers=optimizers,
                # callbacks=[early_stopping]
            )

            # train model
            trainer.train()
Example #28
                num_warmup_steps=training_args.warmup_steps,
                num_training_steps=total_step)
            optimizers = optimizer, scheduler

            trainer = Trainer(
                model=model,  # the instantiated 🤗 Transformers model to be trained
                args=training_args,  # training arguments, defined above
                train_dataset=RE_train_dataset,  # training dataset
                optimizers=optimizers,
                # callbacks=[early_stopping]
            )

            # train model
            trainer.train()


def main(cfg):
    train(cfg)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--config_file_path', type=str, default='./config.yml')
    parser.add_argument('--config', type=str, default='base')

    args = parser.parse_args()
    cfg = YamlConfigManager(args.config_file_path, args.config)
    cpprint(cfg.values, sort_dict_keys=False)
    print('\n')
    main(cfg)
Example #29
import requests
# pretty
from prettyprinter import cpprint
# Xinhua dictionary API
url = 'https://www.pwxcoo.com/dictionary'
# xiehouyu (two-part allegorical saying): riddle is the lead-in phrase
key = input('输入关键字:')
params = {'type': 'xiehouyu', 'riddle': key}
# idiom
# params = {'type': 'idiom', 'riddle': '兴高采烈'}
# pinyin abbreviation
# params = {'type': 'idiom', 'riddle': 'xgcl'}
# Chinese character
# params = {'type': 'word', 'riddle': '王'}
r = requests.get(url=url, params=params)
data = r.json()
cpprint(data)
Example #30
def log(msg) -> None:
    if __debug__:
        cpprint(msg)
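Since __debug__ is False when Python runs with -O, every call to this log() silently disappears in optimized runs; for example:

log({'stage': 'init'})  # pretty-printed with `python app.py`, a no-op with `python -O app.py`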