Code Example #1
File: preprocess.py  Project: shiqing1234/XNLG
def preprocess(voc_path, txt_path):

    assert os.path.isfile(voc_path)
    assert os.path.isfile(txt_path)

    logger = create_logger(None, 0)

    bin_path = txt_path + ".pth"

    dico = Dictionary.read_vocab(voc_path)
    logger.info("")

    data = Dictionary.index_data(txt_path, bin_path, dico)
    logger.info("%i words (%i unique) in %i sentences." %
                (len(data['sentences']) - len(data['positions']),
                 len(data['dico']), len(data['positions'])))
    if len(data['unk_words']) > 0:
        logger.info(
            "%i unknown words (%i unique), covering %.2f%% of the data." %
            (sum(data['unk_words'].values()), len(
                data['unk_words']), sum(data['unk_words'].values()) * 100. /
             (len(data['sentences']) - len(data['positions']))))
        if len(data['unk_words']) < 30:
            for w, c in sorted(data['unk_words'].items(),
                               key=lambda x: x[1])[::-1]:
                logger.info("%s: %i" % (w, c))
Code Example #2
def main(args):
    input_file = None
    output_file = None

    logger = create_logger(LOG_FILE, LOG_LEVEL)
    logger.info("Geofencing validator logger was created.")

    try:
        opts, args = getopt.getopt(sys.argv[1:], "i:o:", ["input=", "output="])
    except getopt.GetoptError as exc:
        logger.error(
            "Got an error: \"{}\", while trying to get options".format(exc))
        sys.exit(2)

    for opt, arg in opts:
        if opt in ("-i", "--input"):
            input_file = arg
        elif opt in ("-o", "--output"):
            output_file = arg
        else:
            logger.error("Incorrect parameter was set")
            sys.exit(usage)

    if input_file and output_file:
        validator = Validator(input_file, output_file)
        validator.process()
    else:
        logger.error("No input csv file or output file path in parameter")
        sys.exit(usage)
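A standalone sketch of the getopt pattern used above, with hypothetical argument values in place of sys.argv[1:]:

import getopt

# Hypothetical command line mirroring the "-i/--input" and "-o/--output" options.
argv = ["-i", "matches.csv", "--output", "report.csv"]
opts, rest = getopt.getopt(argv, "i:o:", ["input=", "output="])
print(opts)  # [('-i', 'matches.csv'), ('--output', 'report.csv')]
print(rest)  # []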
Code Example #3
File: utils.py  Project: facebookresearch/selavi
def initialize_exp(params, *args, dump_params=True):
    """
    Initialize the experiment:
    - dump parameters
    - create checkpoint repo
    - create a logger
    - create a pandas object to keep track of the training statistics
    """

    # dump parameters
    if dump_params:
        pickle.dump(params, open(os.path.join(params.dump_path, "params.pkl"), "wb"))

    # create repo to store checkpoints
    params.dump_checkpoints = os.path.join(params.dump_path, "checkpoints")
    if not params.rank and not os.path.isdir(params.dump_checkpoints):
        os.mkdir(params.dump_checkpoints)

    # create a pandas object to log loss and acc
    training_stats = PD_Stats(
        os.path.join(params.dump_path, "stats" + str(params.rank) + ".pkl"), args
    )

    # create a logger
    logger = create_logger(
        os.path.join(params.dump_path, "train.log"), rank=params.rank
    )
    logger.info("============ Initialized logger ============")
    logger.info(
        "\n".join("%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(params)).items()))
    )
    logger.info("The experiment will be stored in %s\n" % params.dump_path)
    logger.info("")
    return logger, training_stats
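A minimal sketch of driving this helper, assuming a params namespace that carries the dump_path and rank attributes read above; the extra positional arguments are forwarded to PD_Stats as the tracked statistic names. All values here are placeholders:

import argparse
import os

# Hypothetical params object; only the attributes initialize_exp reads are set.
params = argparse.Namespace(dump_path="./experiments/demo", rank=0)
os.makedirs(params.dump_path, exist_ok=True)

logger, training_stats = initialize_exp(params, "epoch", "loss")
logger.info("setup complete")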
Code Example #4
def create_app():
    app = Flask(__name__)
    app.config.from_object(config)
    create_logger()
    admin.init_app(app)
    admin.add_view(ModelView(Teams, session))

    @app.route("/" + config.BOT_API_TOKEN, methods=['POST'])
    def get_message():
        bot.process_new_updates([
            telebot.types.Update.de_json(request.stream.read().decode("utf-8"))
        ])
        return "!", 200

    @app.route("/")
    def webhook():
        bot.remove_webhook()
        bot.set_webhook(url=f'{config.APP_HOST}/{config.BOT_API_TOKEN}')
        return 'Hook was set!', 200

    return app
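A minimal sketch of serving the factory's app with Flask's built-in development server; host and port are placeholders, and a real deployment would sit behind a proper WSGI server:

# Hypothetical entry point; create_app() is the factory defined above.
if __name__ == "__main__":
    app = create_app()
    app.run(host="0.0.0.0", port=5000)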
Code Example #5
import argparse
import torch
from src.logger import create_logger
import os
from src.model import build_mt_model
from src.data.loader import load_data
import subprocess
import re

logger = create_logger('translate.log')
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

parser = argparse.ArgumentParser(description='Settings')
parser.add_argument("--train_data",
                    type=str,
                    default='data/120w.bin',
                    help="train data dir")
parser.add_argument("--max_len",
                    type=int,
                    default=50,
                    help="max length of sentences")
parser.add_argument("--reload_model",
                    type=str,
                    default='',
                    help="reload model")
parser.add_argument("--batch_size", type=int, default=64, help="batch size")
parser.add_argument("--batch_size_tokens",
                    type=int,
                    default=-1,
                    help="batch size tokens")
parser.add_argument("--src_n_words", type=int, default=0, help="data")
Code Example #6
# LICENSE file in the root directory of this source tree.
#
"""
Example: python preprocess.py data/vocab.txt data/train.txt
vocab.txt: one entry per line, 1st column = word, 2nd column = count
"""

import os
import sys

from src.logger import create_logger
from src.data.dictionary import Dictionary

if __name__ == "__main__":

    logger = create_logger(None, 0)

    voc_path = sys.argv[1]
    txt_path = sys.argv[2]
    bin_path = sys.argv[2] + ".pth"
    assert os.path.isfile(voc_path)
    assert os.path.isfile(txt_path)

    dico = Dictionary.read_vocab(voc_path)
    logger.info("")

    data = Dictionary.index_data(txt_path, bin_path, dico)
    logger.info("%i words (%i unique) in %i sentences." % (
        len(data["sentences"]) - len(data["positions"]),
        len(data["dico"]),
        len(data["positions"]),
Code Example #7
File: new_model.py  Project: xiamengzhou/NLPerf
    parser.add_argument("--verbose",
                        type=int,
                        default=2,
                        help="verbose level (2:debug, 1:info, 0:warning)")
    parser.add_argument("--log", default="log", type=str, help="the log file")
    parser.add_argument("--n",
                        type=int,
                        default=0,
                        help="How many should we add?")
    parser.add_argument("--portion",
                        type=float,
                        default=0.5,
                        help="Portion of data points for testing.")
    parser.add_argument("--test_id_options_num",
                        type=int,
                        default=10,
                        help="Number of train/test split.")
    parser.add_argument("--sample_options_num",
                        type=int,
                        default=10,
                        help="Number of samples for n data points.")

    params = parser.parse_args()

    logger = create_logger(params.log, vb=params.verbose)
    init_logging()
    task = params.task

    # n = 5
    n = params.n
    run_ex(task, n, "xgboost", params.portion)
Code Example #8
File: train.py  Project: nyu-dl/dl4chem-mgm
def setup_data_and_model(params, model):
    # Variables that may not otherwise be assigned
    writer = perturbation_loader = generator = training_smiles = None

    # setup random seeds
    if params.val_seed is None: params.val_seed = params.seed
    set_seed_if(params.seed)

    exp_path = os.path.join(params.dump_path, params.exp_name)
    # create exp path if it doesn't exist
    if not os.path.exists(exp_path):
        os.makedirs(exp_path)
    # create logger
    logger = create_logger(os.path.join(exp_path, 'train.log'), 0)
    pp = pprint.PrettyPrinter()
    logger.info("============ Initialized logger ============")
    logger.info("Random seed is {}".format(params.seed))
    if params.suppress_params is False:
        logger.info("\n".join("%s: %s" % (k, str(v))
                          for k, v in sorted(dict(vars(params)).items())))
        logger.info("Running command: %s" % 'python ' + ' '.join(sys.argv))
    logger.info("The experiment will be stored in %s\n" % exp_path)
    logger.info("")
    # load data
    train_data, val_dataset, train_loader, val_loader = load_graph_data(params)

    logger.info('train_loader len is {}'.format(len(train_loader)))
    logger.info('val_loader len is {}'.format(len(val_loader)))

    if params.num_binary_graph_properties > 0 and params.pretrained_property_embeddings_path:
        model.binary_graph_property_embedding_layer.weight.data = \
            torch.Tensor(np.load(params.pretrained_property_embeddings_path).T)
    if params.load_latest is True:
        load_prefix = 'latest'
    elif params.load_best is True:
        load_prefix = 'best'
    else:
        load_prefix = None

    if load_prefix is not None:
        if params.local_cpu is True:
            model.load_state_dict(torch.load(os.path.join(exp_path, '{}_model'.format(load_prefix)), map_location='cpu'))
        else:
            model.load_state_dict(torch.load(os.path.join(exp_path, '{}_model'.format(load_prefix))))
    if params.local_cpu is False:
        model = model.cuda()
    if params.gen_num_samples > 0:
        generator = GraphGenerator(train_data, model, params.gen_random_init, params.gen_num_iters, params.gen_predict_deterministically, params.local_cpu)
        with open(params.smiles_path) as f:
            smiles = f.read().split('\n')
            training_smiles = smiles[:int(params.smiles_train_split * len(smiles))]
            del smiles
    opt = get_optimizer(model.parameters(), params.optimizer)
    if load_prefix is not None:
        opt.load_state_dict(torch.load(os.path.join(exp_path, '{}_opt_sd'.format(load_prefix))))

    lr = opt.param_groups[0]['lr']
    lr_lambda = lambda iteration: lr_decay_multiplier(iteration, params.warm_up_iters, params.decay_start_iter,
                                                      params.lr_decay_amount, params.lr_decay_frac,
                                                      params.lr_decay_interval, params.min_lr, lr)
    scheduler = LambdaLR(opt, lr_lambda)
    index_method = get_index_method()

    best_loss = 9999
    if params.tensorboard:
        from tensorboardX import SummaryWriter
        writer = SummaryWriter(exp_path)

    total_iter, grad_accum_iters = params.first_iter, 0

    return params, model, opt, scheduler, train_data, train_loader, val_dataset, val_loader, perturbation_loader,\
           generator, index_method, exp_path, training_smiles, pp, logger, writer, best_loss, total_iter,\
           grad_accum_iters
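A standalone illustration of the LambdaLR pattern used for the scheduler above: the lambda returns a multiplier that is applied to the optimizer's base learning rate on every scheduler.step(). The linear warm-up below is a simplified stand-in, not the project's lr_decay_multiplier:

import torch
from torch.optim.lr_scheduler import LambdaLR

param = torch.nn.Parameter(torch.zeros(1))
opt = torch.optim.SGD([param], lr=0.1)

warm_up_iters = 5  # hypothetical warm-up length
scheduler = LambdaLR(opt, lambda it: min(1.0, (it + 1) / warm_up_iters))

for it in range(8):
    opt.step()
    scheduler.step()
    print(it, opt.param_groups[0]["lr"])  # ramps up to 0.1, then stays flat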
Code Example #9
File: main.py  Project: rgwt123/simple-fairseq
import argparse
from src.logger import create_logger
import os
import torch
logger = create_logger('train.log')

parser = argparse.ArgumentParser(description='Settings')
parser.add_argument("--train_data", type=str, default='data/cwmt.bin',
                    help="train data dir")
parser.add_argument("--max_len", type=int, default=100,
                    help="max length of sentences")
parser.add_argument("--reload_model", type=str, default='',
                    help="reload model")
parser.add_argument("--batch_size", type=int, default=80,
                    help="batch size sentences")
parser.add_argument("--batch_size_tokens", type=int, default=4000,
                    help="batch size tokens")
parser.add_argument("--src_n_words", type=int, default=0,
                    help="data")
parser.add_argument("--tgt_n_words", type=int, default=0,
                    help="data")
parser.add_argument("--dropout", type=float, default=0.1,
                    help="Dropout")
parser.add_argument("--label-smoothing", type=float, default=0.1,
                    help="Label smoothing")
parser.add_argument("--attention", type=bool, default=True,
                    help="Use an attention mechanism")
parser.add_argument("--transformer", type=bool, default=True,
                    help="Use Transformer")
parser.add_argument("--emb_dim", type=int, default=512,
                    help="Embedding layer size")
Code Example #10
File: train_ar.py  Project: nyu-dl/dl4chem-mgm
def main(params):
    # setup random seeds
    set_seed(params.seed)
    params.ar = True

    exp_path = os.path.join(params.dump_path, params.exp_name)
    # create exp path if it doesn't exist
    if not os.path.exists(exp_path):
        os.makedirs(exp_path)
    # create logger
    logger = create_logger(os.path.join(exp_path, 'train.log'), 0)
    logger.info("============ Initialized logger ============")
    logger.info("Random seed is {}".format(params.seed))
    logger.info("\n".join("%s: %s" % (k, str(v))
                          for k, v in sorted(dict(vars(params)).items())))
    logger.info("The experiment will be stored in %s\n" % exp_path)
    logger.info("Running command: %s" % 'python ' + ' '.join(sys.argv))
    logger.info("")
    # load data
    data, loader = load_smiles_data(params)
    if params.data_type == 'ChEMBL':
        all_smiles_mols = open(os.path.join(params.data_path, 'guacamol_v1_all.smiles'), 'r').readlines()
    else:
        all_smiles_mols = open(os.path.join(params.data_path, 'QM9_all.smiles'), 'r').readlines()
    train_data, val_data = data['train'], data['valid']
    dico = data['dico']
    logger.info('train_data len is {}'.format(len(train_data)))
    logger.info('val_data len is {}'.format(len(val_data)))

    # keep cycling through train_loader forever
    # stop when max iters is reached
    def rcycle(iterable):
        saved = []                 # In-memory cache
        for element in iterable:
            yield element
            saved.append(element)
        while saved:
            random.shuffle(saved)  # Shuffle every batch
            for element in saved:
                yield element
    train_loader = rcycle(train_data.get_iterator(shuffle=True, group_by_size=True, n_sentences=-1))

    # extra param names for transformermodel
    params.n_langs = 1
    # build Transformer model
    model = TransformerModel(params, is_encoder=False, with_output=True)

    if params.local_cpu is False:
        model = model.cuda()
    opt = get_optimizer(model.parameters(), params.optimizer)
    scores = {'ppl': float('inf'), 'acc': 0}

    if params.load_path:
        reloaded_iter, scores = load_model(params, model, opt, logger)

    for total_iter, train_batch in enumerate(train_loader):
        if params.load_path is not None:
            total_iter += reloaded_iter + 1

        epoch = total_iter // params.epoch_size
        if total_iter == params.max_steps:
            logger.info("============ Done training ... ============")
            break
        elif total_iter % params.epoch_size == 0:
            logger.info("============ Starting epoch %i ... ============" % epoch)
        model.train()
        opt.zero_grad()
        train_loss = calculate_loss(model, train_batch, params)
        train_loss.backward()
        if params.clip_grad_norm > 0:
            clip_grad_norm_(model.parameters(), params.clip_grad_norm)
        opt.step()
        if total_iter % params.print_after == 0:
            logger.info("Step {} ; Loss = {}".format(total_iter, train_loss))

        if total_iter > 0 and total_iter % params.epoch_size == (params.epoch_size - 1):
            # run eval step (calculate validation loss)
            model.eval()
            n_chars = 0
            xe_loss = 0
            n_valid = 0
            logger.info("============ Evaluating ... ============")
            val_loader = val_data.get_iterator(shuffle=True)
            for val_iter, val_batch in enumerate(val_loader):
                with torch.no_grad():
                    val_scores, val_loss, val_y = calculate_loss(model, val_batch, params, get_scores=True)
                # update stats
                n_chars += val_y.size(0)
                xe_loss += val_loss.item() * len(val_y)
                n_valid += (val_scores.max(1)[1] == val_y).sum().item()

            ppl = np.exp(xe_loss / n_chars)
            acc = 100. * n_valid / n_chars
            logger.info("Acc={}, PPL={}".format(acc, ppl))
            if acc > scores['acc']:
                scores['acc'] = acc
                scores['ppl'] = ppl
                save_model(params, data, model, opt, dico, logger, 'best_model', epoch, total_iter, scores)
                logger.info('Saving new best_model {}'.format(epoch))
                logger.info("Best Acc={}, PPL={}".format(scores['acc'], scores['ppl']))

            logger.info("============ Generating ... ============")
            number_samples = 100
            gen_smiles = generate_smiles(params, model, dico, number_samples)
            generator = ARMockGenerator(gen_smiles)

            try:
                benchmark = ValidityBenchmark(number_samples=number_samples)
                validity_score = benchmark.assess_model(generator).score
            except:
                validity_score = -1
            try:
                benchmark = UniquenessBenchmark(number_samples=number_samples)
                uniqueness_score = benchmark.assess_model(generator).score
            except:
                uniqueness_score = -1

            try:
                benchmark = KLDivBenchmark(number_samples=number_samples, training_set=all_smiles_mols)
                kldiv_score = benchmark.assess_model(generator).score
            except:
                kldiv_score = -1
            logger.info('Validity Score={}, Uniqueness Score={}, KlDiv Score={}'.format(validity_score, uniqueness_score, kldiv_score))
            save_model(params, data, model, opt, dico, logger, 'model', epoch, total_iter, {'ppl': ppl, 'acc': acc})
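The rcycle helper defined inside main above is worth isolating: the first pass yields batches in their original order while caching them, and every later pass reshuffles the cache, so the training loop can iterate indefinitely. A standalone sketch with a plain list standing in for the batch iterator:

import itertools
import random

def rcycle(iterable):
    saved = []
    for element in iterable:
        yield element          # first pass: original order
        saved.append(element)  # ...while building an in-memory cache
    while saved:
        random.shuffle(saved)  # later passes: reshuffled cache, forever
        for element in saved:
            yield element

print(list(itertools.islice(rcycle([1, 2, 3]), 9)))
# e.g. [1, 2, 3, 2, 3, 1, 1, 3, 2] -- ordered first pass, shuffled afterwards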
Code Example #11
File: worker.py  Project: CHuKeR/ggscore
from src.bot.messages import send_future_match, send_result_match, send_closest_match
from src.extensions import session
from src.logger import create_logger
from src.models import Series
from src.models import Teams
from src.models import Users
from src.parser.dota_series import DotaParser

logger = create_logger()


def send_future_matches_to_users():
    users = session.query(Users).all()
    for user in users:
        user_teams = user.get_user_teams()
        matches = Series.get_today_matches()
        for match in matches:
            if match.team1_name in user_teams or match.team2_name in user_teams:
                send_future_match(user.id, match)


def send_updated_matches_to_user(matches: list):
    users_for_update = get_users_to_updates(matches)
    for user in users_for_update:
        user_teams = user.get_user_teams()
        for match in matches:
            if match.team1_name in user_teams or match.team2_name in user_teams:
                send_future_match(user.id, match)


def send_closest_matches_to_user():
Code Example #12
File: test_read_data.py  Project: xiamengzhou/NLPerf
from src.read_data import read_data, K_Fold_Spliter, Random_Spliter, Specific_Spliter
from pathlib import Path
from src.logger import create_logger

tasks = [
    "wiki", "monomt", "tsfmt", "tsfparsing", "tsfpos", "tsfel", "bli", "ma",
    "ud"
]
data_folder = Path(__file__).parent / ".."

logger = create_logger("pytest.log", vb=1)


def test_load_data():
    logger.info("*" * 20)
    data = read_data(task="monomt",
                     folder=data_folder,
                     shuffle=True,
                     selected_feats=None,
                     combine_models=False)
    assert len(data["BLEU"]["feats"]) == 54
    assert len(data["BLEU"]["labels"]) == 54
    assert len(data["BLEU"]["langs"]) == 54
    assert list(data["BLEU"]["langs"].columns.values) == [
        "Source Language", "Target Language"
    ]

    # test_feature_selection
    logger.info("*" * 20)
    data = read_data(task="monomt",
                     folder=data_folder,
Code Example #13
from flask import Flask, request, Response
from nltk.tokenize import sent_tokenize
import json
from translate import translate_onesentence
from store import storeonesent2mysql, update_table_viamysql, update_mysql_viafile
from src.logger import create_logger
import datetime
logger = create_logger('access.log')

app = Flask(__name__)

# bilingual dictionary
# provided to update and translate function
TAGTABLE = {}
number = update_table_viamysql(TAGTABLE, 'pe', 'zh')
logger.info(f'loading {number} pairs from table')


@app.route('/api/split', methods=['POST'])
def split_sentence():
    if request.method == 'POST':
        now = datetime.datetime.now()
        year, month, day = now.year, now.month, now.day
        left_month = 13 - ((year - 2020) * 12 + (month - 3))
        if left_month < 0:
            result = {}
            result['errorCode'] = 1
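            # The message below reads: "The trial period has expired; please contact the administrator."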
            result['error'] = '试用已经过期,请联系管理员。'
            return Response(json.dumps(result),
                            mimetype='application/javascript')
        else: