Example #1
import collections
import json


def combine(val, res, mapping):
    # read_content and mesh_mapping are project-local helpers assumed to be
    # in scope here (e.g. imported from the project's utils module).
    data = read_content(val)
    mesh_map, mesh_rev_map = mesh_mapping(mapping)
    print len(mesh_map), len(mesh_rev_map)
    resdict = collections.defaultdict(list)
    res = read_content(res)
    for i in res['documents']:
        resdict[i['pmid']] = [mesh_rev_map[k] for k in i['labels']]

    for i in data['documents']:
        i['meshMajor'] = resdict[i['pmid']]

    with open(val.split('.json')[0] + '_res.json', 'w') as outfile:
        json.dump(data, outfile)
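The helper `read_content` itself is not shown in any of these examples, and each project appears to define its own version. For the `combine` snippet above, a minimal sketch consistent with its usage (a JSON file parsed into a dict with a 'documents' list) might look like this; the body is an assumption for illustration, not the project's actual code.

import json


def read_content(fpath):
    # Hypothetical sketch: parse a JSON file into a dict such as
    # {"documents": [{"pmid": ..., "labels": [...]}, ...]}.
    with open(fpath) as f:
        return json.load(f)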
Example #2
 def __read_impl(self, excluded, fpath):
     """
     Single read operation: if excluded is True, return an empty list,
     otherwise return the file's content.
     """
     if excluded:
         return list()
     return utils.read_content(fpath)
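The class that owns `__read_impl` is not shown in the example. A hypothetical sibling method, purely for illustration, might compute the excluded flag per file like this:

 def read_all(self, fpaths, excluded_paths):
     # Hypothetical sibling method, for illustration only: files on the
     # exclusion list come back as empty lists, everything else is read
     # from disk via __read_impl.
     return [self.__read_impl(fpath in set(excluded_paths), fpath)
             for fpath in fpaths]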
Example #3
def test_utils():
    # lemmatized = "f:/Corpus/lemmatized_trec_all.dat"
    lemmatized = "f:/Corpus/new4.dat"
    from utils import read_content
    count = 0
    for text, meta in read_content(lemmatized):
        count += 1
        # if count is 10:
        #     break
        print(text, meta)
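Here `read_content` is clearly a different helper from the JSON-style one sketched earlier: it is iterated directly and yields (text, meta) pairs from a corpus file. A minimal generator sketch under that assumption; the .dat layout is not specified in the example, so a tab-separated "text<TAB>meta" format is assumed purely for illustration.

def read_content(fpath):
    # Hypothetical generator sketch: yield one (text, meta) pair per line,
    # assuming a tab-separated layout (the real .dat format is not shown).
    with open(fpath) as f:
        for line in f:
            text, _, meta = line.rstrip('\n').partition('\t')
            yield text, meta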
Example #4
def test_from_data_file(r6):
    # r6 is a test fixture exposing the RAID-6 style object under test; the
    # surrounding test module is assumed to provide os, config, utils and
    # get_logger.
    import driver

    def _corrupt(fname, index, size):
        get_logger().warning("corrupting disk {}".format(index))
        error_fpath = r6.get_real_name(index, fname)
        error_content = os.urandom(size)
        utils.write_content(error_fpath, error_content)

    def _corrupt2(fname, indexes, size):
        for index in indexes:
            _corrupt(fname, index, size)

    data_fname = 'data3'
    SIZE = 32768
    driver.gen_rnd_file(data_fname, SIZE, 'text')
    fpath = os.path.join(config.root, 'data3')
    original_content = utils.read_content(fpath)
    r6.write(original_content, data_fname)
    r6.detect_corruption(data_fname)
    for error_index in [0, 3, r6.N - 2, r6.N - 1]:
        error_size = SIZE // 13  # integer byte count for os.urandom
        _corrupt(data_fname, error_index, error_size)
        found_error_index = r6.detect_corruption(data_fname)
        if found_error_index is not None:
            get_logger().warning("recover disk {}".format(error_index))
            assert found_error_index == error_index
            if found_error_index < r6.N - 1:
                r6.recover_d_or_p(data_fname, found_error_index)
            else:
                r6.recover_q(data_fname)
            r6.detect_corruption(data_fname)
    #####################################################
    get_logger().warning("testing recover_d_q")
    error_indexes = [4, r6.N - 1]
    size = SIZE // (r6.N - 4)
    _corrupt2(data_fname, error_indexes, size)
    r6.recover_d_q(data_fname, error_indexes[0])
    r6.detect_corruption(data_fname)
    #####################################################
    get_logger().warning("testing recover_2d")
    error_indexes = [0, 1]
    size = SIZE // (r6.N + 2)
    _corrupt2(data_fname, error_indexes, size)
    r6.recover_2d(data_fname, error_indexes[0], error_indexes[1])
    r6.detect_corruption(data_fname)
    #####################################################
    get_logger().warning("testing recover_d_p")
    error_indexes = [0, r6.N - 2]
    size = SIZE // (r6.N - 2)
    _corrupt2(data_fname, error_indexes, size)
    r6.recover_d_p(data_fname, error_indexes[0])
    r6.detect_corruption(data_fname)
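In this test, `utils.read_content` and `utils.write_content` appear to operate on raw bytes: the corrupting content comes from os.urandom and the whole original file is read back and rewritten. A minimal pair of sketches under that assumption, for illustration only:

def read_content(fpath):
    # Hypothetical sketch: read a file's raw bytes.
    with open(fpath, 'rb') as f:
        return f.read()


def write_content(fpath, content):
    # Hypothetical sketch: write raw bytes, truncating any existing file.
    with open(fpath, 'wb') as f:
        f.write(content)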
Example #5
def predict(te, vocab, label_dict, label_rev_dict, mesh_map, mesh_rev_map,
            prefix, buckets, model, nhidden, nlayer, dropout, nepoch,
            batch_size):
    # Prediction for testing data set
    batch_size = 1
    tins, tlabels, tpmids, t, tld, tlrd = load_data(read_content(te),
                                                    vocab,
                                                    label_dict,
                                                    label_rev_dict,
                                                    tr=False)
    print 'tins', len(tins)
    res = {}
    res["documents"] = []
    param_file = "./models/%s-%s" % (prefix, 30)
    #arg_param,aux_param=load_param(param_file)
    make_predict(res, tins, len(label_dict), tpmids, model, param_file,
                 buckets, nhidden, nlayer, vocab, dropout, label_rev_dict,
                 mesh_map, mesh_rev_map, nepoch, batch_size)
    return res
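The signature of `predict` lines up with the values returned by the `train` function in the next example, so the two were presumably wired together roughly as follows. This is a sketch only: every argument name is taken from the two signatures shown here, and the driver code itself is an assumption.

# Hypothetical driver sketch, not shown in the original examples.
vocab, label_dict, label_rev_dict, prefix, buckets, mesh_map, mesh_rev_map = \
    train(args, path, df, val, te, meshmap, nhidden, nembed, batch_size,
          nepoch, model, nlayer, eta, dropout, split, is_train=True)
res = predict(te, vocab, label_dict, label_rev_dict, mesh_map, mesh_rev_map,
              prefix, buckets, model, nhidden, nlayer, dropout, nepoch,
              batch_size)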
Example #6
def train(args, path, df, val, te, meshmap, nhidden, nembed, batch_size,
          nepoch, model, nlayer, eta, dropout, split, is_train):
    assert model in ['ffn', 'lstm', 'bilstm', 'gru']
    data = read_content_stream(os.path.join(path, df))
    nins, vocab, label_dict, label_rev_dict = load_data_statics(data)
    mesh_map, mesh_rev_map = mesh_mapping(meshmap)
    contexts = [mx.context.gpu(i) for i in xrange(1)]
    nwords = len(vocab)
    nlabels = len(label_dict)
    print '#ins', nins
    print '#labels', nlabels
    print '#words', nwords
    npart = 30
    pins = chunkl(nins, npart)
    buckets = [50, 100, 200, 300, 150, 1000]
    prefix = model + '_' + str(nlayer) + '_' + str(nhidden) + "_" + str(nembed)
    gen_data = read_content_stream(os.path.join(path, df))
    logging.basicConfig(level=logging.DEBUG)
    logging.info('start with arguments %s', args)
    if model == 'ffn':

        def ffn_gen(seq_len):
            sym = ffn.ffn(nlayer, seq_len, nwords, nhidden, nembed, nlabels,
                          dropout)
            data_names = ['data']
            label_names = ['label']
            return sym, data_names, label_names

        for pidx in xrange(len(pins)):
            print 'partition ', pidx
            data = {'articles': []}
            for _ in xrange(pins[pidx]):
                data['articles'].append(gen_data.next())
            if val is None:
                # load this partition's instances and labels before building
                # the train/validation split
                ins, labels, pmids, v, ld, lrd = load_data(
                    data, vocab, label_dict, label_rev_dict)
                tr_data, val_data = get_data_iter(ins, labels, nlabels,
                                                  batch_size, [], buckets,
                                                  split)
            else:
                ins, labels, pmids, v, ld, lrd = load_data(
                    data, vocab, label_dict, label_rev_dict)
                tr_data = BucketFlexIter(ins, labels, nlabels, batch_size, [],
                                         buckets)
                vins, vlabels, vpmids, v, ld, lrd = load_data(read_content(
                    os.path.join(path, val)),
                                                              vocab,
                                                              label_dict,
                                                              label_rev_dict,
                                                              tr=False)
                val_data = BucketFlexIter(vins, vlabels, nlabels, batch_size,
                                          [], buckets)
            if len(buckets) == 1:
                mod = mx.mod.Module(*ffn_gen(buckets[0]), context=contexts)
            else:
                mod = mx.mod.BucketingModule(
                    ffn_gen,
                    default_bucket_key=tr_data.default_bucket_key,
                    context=contexts)
            if is_train:
                if pidx:
                    sym, arg_params, aux_params = mx.model.load_checkpoint(
                        './models/%s-%s' % (prefix, pidx - 1), nepoch)
                    mod.bind(data_shapes=tr_data.provide_data,
                             label_shapes=tr_data.provide_label,
                             for_training=True)
                    mod.set_params(arg_params=arg_params,
                                   aux_params=aux_params)
                    mod.fit(tr_data,
                            eval_data=val_data,
                            num_epoch=nepoch,
                            epoch_end_callback=mx.callback.do_checkpoint(
                                './models/%s-%s' % (prefix, pidx),
                                period=nepoch),
                            eval_metric=['rmse', accuracy, ins_recall])
                else:
                    mod.fit(tr_data,
                            eval_data=val_data,
                            num_epoch=nepoch,
                            epoch_end_callback=mx.callback.do_checkpoint(
                                './models/%s-%s' % (prefix, pidx),
                                period=nepoch),
                            eval_metric=['rmse', accuracy, ins_recall],
                            batch_end_callback=mx.callback.Speedometer(
                                batch_size, 500),
                            initializer=mx.init.Xavier(factor_type="in",
                                                       magnitude=2.34),
                            optimizer='sgd',
                            optimizer_params={
                                'learning_rate': eta,
                                'momentum': 0.9,
                                'wd': 0.00001
                            })
    elif model == 'lstm':
        init_c = [('l%d_init_c' % l, (batch_size, nhidden))
                  for l in range(nlayer)]
        init_h = [('l%d_init_h' % l, (batch_size, nhidden))
                  for l in range(nlayer)]
        init_states = init_c + init_h
        state_names = [x[0] for x in init_states]

        def lstm_gen(seq_len):
            sym = lstm.lstm_unroll(nlayer, seq_len, nwords, nhidden, nembed,
                                   nlabels, dropout)
            data_names = ['data'] + state_names
            label_names = ['label']
            return sym, data_names, label_names

        for pidx in xrange(len(pins)):
            print 'partition ', pidx
            data = {'articles': []}
            for _ in xrange(pins[pidx]):
                data['articles'].append(gen_data.next())
            if val is None:
                # load this partition's instances and labels before building
                # the train/validation split
                ins, labels, pmids, v, ld, lrd = load_data(
                    data, vocab, label_dict, label_rev_dict)
                tr_data, val_data = get_data_iter(ins, labels, nlabels,
                                                  batch_size, [], buckets,
                                                  split)
            else:
                ins, labels, pmids, v, ld, lrd = load_data(
                    data, vocab, label_dict, label_rev_dict)
                tr_data = BucketFlexIter(ins, labels, nlabels, batch_size, [],
                                         buckets)
                vins, vlabels, vpmids, v, ld, lrd = load_data(read_content(
                    os.path.join(path, val)),
                                                              vocab,
                                                              label_dict,
                                                              label_rev_dict,
                                                              tr=False)
                val_data = BucketFlexIter(vins, vlabels, nlabels, batch_size,
                                          [], buckets)
            if len(buckets) == 1:
                mod = mx.mod.Module(*lstm_gen(buckets[0]), context=contexts)
            else:
                mod = mx.mod.BucketingModule(
                    lstm_gen,
                    default_bucket_key=tr_data.default_bucket_key,
                    context=contexts)
            if is_train:
                if pidx:
                    sym, arg_params, aux_params = mx.model.load_checkpoint(
                        './models/%s-%s' % (prefix, pidx - 1), nepoch)
                    mod.bind(data_shapes=tr_data.provide_data,
                             label_shapes=tr_data.provide_label,
                             for_training=True)
                    mod.set_params(arg_params=arg_params,
                                   aux_params=aux_params)
                    mod.fit(tr_data,
                            eval_data=val_data,
                            num_epoch=nepoch,
                            epoch_end_callback=mx.callback.do_checkpoint(
                                './models/%s-%s' % (prefix, pidx),
                                period=nepoch),
                            eval_metric=['rmse', accuracy, ins_recall])
                else:
                    mod.fit(tr_data,
                            eval_data=val_data,
                            num_epoch=nepoch,
                            epoch_end_callback=mx.callback.do_checkpoint(
                                './models/%s-%s' % (prefix, pidx),
                                period=nepoch),
                            eval_metric=['rmse', accuracy, ins_recall],
                            batch_end_callback=mx.callback.Speedometer(
                                batch_size, 500),
                            initializer=mx.init.Xavier(factor_type="in",
                                                       magnitude=2.34),
                            optimizer='sgd',
                            optimizer_params={
                                'learning_rate': eta,
                                'momentum': 0.9,
                                'wd': 0.00001
                            })

    elif model == 'gru':
        init_h = [('l%d_init_h' % l, (batch_size, nhidden))
                  for l in range(nlayer)]
        init_states = init_h
        state_names = [x[0] for x in init_states]

        def gru_gen(seq_len):
            sym = gru.my_GRU_unroll(nlayer, seq_len, nwords, nhidden, nembed,
                                    nlabels, dropout)
            data_names = ['data'] + state_names
            label_names = ['label']
            return sym, data_names, label_names

        for pidx in xrange(len(pins)):
            print 'partition ', pidx
            data = {'articles': []}
            for _ in xrange(pins[pidx]):
                data['articles'].append(gen_data.next())
            if val is None:
                # load this partition's instances and labels before building
                # the train/validation split
                ins, labels, pmids, v, ld, lrd = load_data(
                    data, vocab, label_dict, label_rev_dict)
                tr_data, val_data = get_data_iter(ins, labels, nlabels,
                                                  batch_size, [], buckets,
                                                  split)
            else:
                ins, labels, pmids, v, ld, lrd = load_data(
                    data, vocab, label_dict, label_rev_dict)
                tr_data = BucketFlexIter(ins, labels, nlabels, batch_size, [],
                                         buckets)
                vins, vlabels, vpmids, v, ld, lrd = load_data(read_content(
                    os.path.join(path, val)),
                                                              vocab,
                                                              label_dict,
                                                              label_rev_dict,
                                                              tr=False)
                val_data = BucketFlexIter(vins, vlabels, nlabels, batch_size,
                                          [], buckets)
            if len(buckets) == 1:
                mod = mx.mod.Module(*gru_gen(buckets[0]), context=contexts)
            else:
                mod = mx.mod.BucketingModule(
                    gru_gen,
                    default_bucket_key=tr_data.default_bucket_key,
                    context=contexts)
            if is_train:
                if pidx:
                    sym, arg_params, aux_params = mx.model.load_checkpoint(
                        './models/%s-%s' % (prefix, pidx - 1), nepoch)
                    mod.bind(data_shapes=tr_data.provide_data,
                             label_shapes=tr_data.provide_label,
                             for_training=True)
                    mod.set_params(arg_params=arg_params,
                                   aux_params=aux_params)
                    mod.fit(tr_data,
                            eval_data=val_data,
                            num_epoch=nepoch,
                            epoch_end_callback=mx.callback.do_checkpoint(
                                './models/%s-%s' % (prefix, pidx),
                                period=nepoch),
                            eval_metric=['rmse', accuracy, ins_recall])
                else:
                    mod.fit(tr_data,
                            eval_data=val_data,
                            num_epoch=nepoch,
                            epoch_end_callback=mx.callback.do_checkpoint(
                                './models/%s-%s' % (prefix, pidx),
                                period=nepoch),
                            eval_metric=['rmse', accuracy, ins_recall],
                            batch_end_callback=mx.callback.Speedometer(
                                batch_size, 500),
                            initializer=mx.init.Xavier(factor_type="in",
                                                       magnitude=2.34),
                            optimizer='sgd',
                            optimizer_params={
                                'learning_rate': eta,
                                'momentum': 0.9,
                                'wd': 0.00001
                            })

    elif model == 'bilstm':
        init_cf = [('lf%d_init_c' % l, (batch_size, nhidden))
                   for l in range(nlayer)]
        init_cb = [('lb%d_init_c' % l, (batch_size, nhidden))
                   for l in range(nlayer)]
        init_hf = [('lf%d_init_h' % l, (batch_size, nhidden))
                   for l in range(nlayer)]
        init_hb = [('lb%d_init_h' % l, (batch_size, nhidden))
                   for l in range(nlayer)]
        init_states = init_cf + init_hf + init_cb + init_hb
        state_names = [x[0] for x in init_states]

        def bilstm_gen(seq_len):
            data = mx.sym.Variable('data')
            embed_weight = mx.sym.Variable('embed_weight')
            concat_weight = mx.sym.Variable('concat_weight')
            hds = mx.sym.Embedding(data=data,
                                   weight=embed_weight,
                                   input_dim=nwords,
                                   output_dim=nembed,
                                   name='embed')
            w2v = mx.sym.SliceChannel(data=hds,
                                      num_outputs=seq_len,
                                      squeeze_axis=1)
            for layidx in xrange(nlayer):
                w2v = bi_lstm_unroll(w2v, concat_weight, seq_len, nwords,
                                     nhidden, nembed, nlabels, dropout, layidx)
            w2v = [mx.sym.expand_dims(x, axis=1) for x in w2v]
            hidden = mx.sym.Concat(*w2v, dim=1)
            hidden = mx.sym.sum_axis(hidden, axis=1) / seq_len
            cls_weight = mx.sym.Variable('cls_weight')
            cls_bias = mx.sym.Variable('cls_bias')
            hidden = mx.sym.FullyConnected(data=hidden,
                                           weight=cls_weight,
                                           bias=cls_bias,
                                           num_hidden=nlabels,
                                           name='fc_cls')
            loss = mx.sym.LinearRegressionOutput(
                data=hidden, label=mx.sym.Variable('label'))
            return loss, ['data'] + state_names, ['label']

        for pidx in xrange(len(pins)):
            print 'partition ', pidx
            data = {'articles': []}
            for _ in xrange(pins[pidx]):
                data['articles'].append(gen_data.next())
            if val is None:
                # load this partition's instances and labels before building
                # the train/validation split
                ins, labels, pmids, v, ld, lrd = load_data(
                    data, vocab, label_dict, label_rev_dict)
                tr_data, val_data = get_data_iter(ins, labels, nlabels,
                                                  batch_size, [], buckets,
                                                  split)
            else:
                ins, labels, pmids, v, ld, lrd = load_data(
                    data, vocab, label_dict, label_rev_dict)
                tr_data = BucketFlexIter(ins, labels, nlabels, batch_size, [],
                                         buckets)
                vins, vlabels, vpmids, v, ld, lrd = load_data(read_content(
                    os.path.join(path, val)),
                                                              vocab,
                                                              label_dict,
                                                              label_rev_dict,
                                                              tr=False)
                val_data = BucketFlexIter(vins, vlabels, nlabels, batch_size,
                                          [], buckets)
            if len(buckets) == 1:
                mod = mx.mod.Module(*bilstm_gen(buckets[0]), context=contexts)
            else:
                mod = mx.mod.BucketingModule(
                    bilstm_gen,
                    default_bucket_key=tr_data.default_bucket_key,
                    context=contexts)
            if is_train:
                if pidx:
                    sym, arg_params, aux_params = mx.model.load_checkpoint(
                        './models/%s-%s' % (prefix, pidx - 1), nepoch)
                    mod.bind(data_shapes=tr_data.provide_data,
                             label_shapes=tr_data.provide_label,
                             for_training=True)
                    mod.set_params(arg_params=arg_params,
                                   aux_params=aux_params)
                    mod.fit(tr_data,
                            eval_data=val_data,
                            num_epoch=nepoch,
                            epoch_end_callback=mx.callback.do_checkpoint(
                                './models/%s-%s' % (prefix, pidx),
                                period=nepoch),
                            eval_metric=['rmse', accuracy, ins_recall])
                else:
                    mod.fit(tr_data,
                            eval_data=val_data,
                            num_epoch=nepoch,
                            epoch_end_callback=mx.callback.do_checkpoint(
                                './models/%s-%s' % (prefix, pidx),
                                period=nepoch),
                            eval_metric=['rmse', accuracy, ins_recall],
                            batch_end_callback=mx.callback.Speedometer(
                                batch_size, 500),
                            initializer=mx.init.Xavier(factor_type="in",
                                                       magnitude=2.34),
                            optimizer='sgd',
                            optimizer_params={
                                'learning_rate': eta,
                                'momentum': 0.9,
                                'wd': 0.00001
                            })

    return vocab, label_dict, label_rev_dict, prefix, buckets, mesh_map, mesh_rev_map
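The training data is consumed in partitions: `chunkl(nins, npart)` yields one chunk size per partition and the streaming reader is advanced that many articles per iteration. The helper itself is not shown; a hypothetical sketch consistent with that usage, where the chunk sizes sum back to nins:

def chunkl(n, nparts):
    # Hypothetical sketch of the chunking helper used above: split n items
    # into nparts chunk sizes that differ by at most one.
    base, extra = divmod(n, nparts)
    return [base + 1 if i < extra else base for i in range(nparts)]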
Example #7
# This excerpt starts mid-file: regex1, regex2, regex3, regex_exclude and
# getUpdatedStr are defined earlier in the source and are not shown here;
# the excerpt also relies on re, os, utils and cfg being imported earlier.
regex4 = re.compile(r"SYRINGE[^$\W]")
# regex4 = re.compile(r"[\s]+DEPRESSION")
regex5 = re.compile(r"PTSD[^$\W]")

regex_last = re.compile(r'[-(]?[A-Z][a-z0-9]+[\W]?[a-z0-9:]*[\s]*')

# data = utils.readData("../"+cfg.PATH_INPUT, "../"+cfg.PATH_PREPROCESSED_TRAIN, 1)

outDir = cfg.PATH_TRAIN + "refactor/"

try:
    os.makedirs(outDir)
except OSError:
    pass

data = utils.read_content(cfg.PATH_TRAIN)

for idx, content in data.items():
    content_str = ""
    for word in content.split(' '):
        found = False

        new_str, found = getUpdatedStr([regex1], regex_exclude, found, 1, word)
        #         if not found:
        new_str, found = getUpdatedStr([regex_last], regex_exclude, found, 3,
                                       word)
        if not found:
            new_str, found = getUpdatedStr([regex2, regex3, regex4, regex5],
                                           regex_exclude, found, 2, word)
        if not found:
            new_str = word
Example #8
 def get(self, url, param, retry=3):
     logger.info('Crawl content url: %s, %s', url, str(param))
     if not url.startswith('http'):
         return utils.read_content(url)
     return utils.get_data(url, param, retry)
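The `get` method falls back to reading a local file whenever the argument is not an HTTP URL. A hypothetical call site, purely for illustration (the crawler object, URL and path are assumptions):

# Hypothetical usage sketch: an http(s) URL goes through utils.get_data with
# retries, anything else is treated as a local path and read from disk.
page = crawler.get('http://example.com/list', {'page': 1})
cached = crawler.get('/tmp/cached_page.html', {})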
Example #9
import random

from flask import Flask

# read_content is a project-local helper assumed to be imported here; a sketch
# of what it might look like follows this example.
app = Flask(__name__)

template_filename = 'lomake_2.html'

storyfiles = {'Ilmastonmuutos': 'ilmasto.txt', 'Töfö-pasta': 'pasta.txt'}

dirs = {'imgdir': 'static/img/', 'styledir': 'static/css/'}

possible_adjectives = [
    'cooli', 'epäcooli', 'tyhjä', 'pörröinen', 'höpö', 'suurenmoinen',
    'vituttava', 'vihreä', 'neliömäinen', 'pyöreä', 'juustomainen',
    'kukkainen', 'koiramainen', 'iso', 'kaunis', 'upee', 'harmaa', 'pimeä',
    'rasistinen'
]

stories = read_content(storyfiles)


def roll_article(s):
    o = random.choice(list(s.keys()))
    return o


def roll_adjective(possible_adjectives=possible_adjectives):
    a = random.choice(possible_adjectives)
    return a


def handle_post(data):
    a = []
    print(data)
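In this Flask app, `read_content` takes the storyfiles dict of titles to filenames and must return something whose keys `roll_article` can choose from, i.e. a dict keyed by story title. A minimal sketch under that assumption, for illustration only:

def read_content(storyfiles):
    # Hypothetical sketch: map each story title to the text of its file,
    # e.g. {'Ilmastonmuutos': '<contents of ilmasto.txt>', ...}.
    stories = {}
    for title, fname in storyfiles.items():
        with open(fname) as f:
            stories[title] = f.read()
    return stories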