Example #1
0
def get_summary_id_by_subject(cur, text):
    """Return the id of the first summary whose subject occurs in *text*.

    Iterates the summaries fetched via ``utils.get_summary`` and matches by
    simple substring containment; returns None when nothing matches.
    """
    for record in utils.get_summary(cur):
        if str(record['subject']) in text:
            return record['id']
    return None
Example #2
0
    def __init__(self, **kwargs):
        """Build the argparse parser for this (sub)command.

        Keyword arguments override existing class attributes; unknown names
        raise TypeError.
        """
        # FIX: use .items() instead of the Python-2-only .iteritems(), and
        # drop the backslash string continuation, which embedded a long run
        # of spaces inside the error message.
        for key, value in kwargs.items():
            if not hasattr(self, key):
                raise TypeError(
                    "__init__() got an unexpected keyword argument"
                    " '%s'" % key)
            setattr(self, key, value)

        parser_kw = {
            'prog': self.name,
            'usage': self.usage,
            'description': self.description,
            'formatter_class': argparse.RawDescriptionHelpFormatter,
            'add_help': False,
        }

        self.parser = argparse.ArgumentParser(**parser_kw)

        arggroup_name = '%s arguments' % self.name
        self.arggroup = self.parser.add_argument_group(arggroup_name)

        # Register only well-formed Argument instances; anything else in
        # self.arguments is silently skipped (preserved from the original).
        for argument in self.arguments:
            if isinstance(argument, Argument):
                self.arggroup.add_argument(*argument.args, **argument.kwargs)

        # Append a subcommand summary to the parser description when present.
        if self.subcommands:
            summary = get_summary(self.subcommands, title='subcommands:')
            description = [self.parser.description, summary]
            self.parser.description = '\n\n'.join(description)
def main(results_data_package, connection_string):
    """Load the output CSV tables of a results data package into a SQL DB.

    Parameters
    ----------
    results_data_package : str
        Path to the tar archive holding the results package.
    connection_string : str
        SQLAlchemy connection string for the target database.
    """
    # Create the target database on first run.
    if not sqlalchemy_utils.database_exists(connection_string):
        sqlalchemy_utils.create_database(connection_string)

    if not utils.check_results_package_file(results_data_package):
        print('Invalid results package file.')
        exit(1)

    engine = sqlalchemy.create_engine(connection_string)

    summary = utils.get_summary(results_data_package)

    with tarfile.open(results_data_package) as tar:
        for resource in summary['resources']:
            # Only paths shaped outputs/<perspective>/<summary_set>/<file>
            # are table resources.
            if resource['path'].startswith("outputs"):
                (_, perspective, summary_set,
                 output_file) = str.split(resource['path'], '/')
                output_type, ext = str.split(output_file, '.')
                if output_type != 'summary_info':
                    print('{}_{}_{}'.format(output_type, summary_set,
                                            perspective))
                    summary_info_file = "outputs/{}/{}/summary_info.csv".format(
                        perspective, summary_set)
                    csv_contents = tar.extractfile(summary_info_file).read()
                    summary_info_df = pd.read_csv(io.BytesIO(csv_contents),
                                                  encoding='utf8')
                    csv_contents = tar.extractfile(resource['path']).read()
                    df = pd.read_csv(io.BytesIO(csv_contents), encoding='utf8')
                    # BUG FIX: DataFrame.merge returns a new frame; the
                    # original discarded the result, so summary_info was never
                    # joined into the table written below.
                    df = df.merge(summary_info_df, on='summary_id')
                    df.to_sql(
                        '{}_{}_{}'.format(output_type, summary_set,
                                          perspective), engine)
def get_summary_id_by_subject(cur,text):
    """Return the id of the first summary whose subject occurs in *text*.

    Returns 0 when no subject matches.  NOTE(review): a sibling variant of
    this function returns None instead of 0 — confirm which sentinel callers
    expect.
    """
    rec = utils.get_summary(cur)
    for r in rec:
        #print("%s:%s" % (r['subject'],text))
        # Determine which summary this message is about by checking whether
        # the message text contains that summary's subject.
        if str(r['subject']) in text:
            return r['id']
    return 0
Example #5
0
  def to_view_data(self):
    """Build the template view data: one entry per year plus a summary.

    Each year entry carries the year's own view data and the first non-empty
    month image URL found for that year.
    """
    years = []
    # FIX: next(os.walk(...)) instead of the Python-2-only .next() method;
    # [1] is the list of immediate subdirectory names (one per year).
    for y in sorted(next(os.walk(config.photo_dir))[1], reverse=True):
        year = Year(y)
        month_images = [m.first_image_url() for m in year.months()]
        image_url = first([i for i in month_images if i != ''])

        years.append({'view_data': year.to_view_data(),
                      'image_url': image_url})

    return {'years': years,
            'summary': get_summary(config.photo_dir)}
Example #6
0
    def to_view_data(self):
        """Build the template view data: one entry per year plus a summary."""
        years = []
        # FIX: next(os.walk(...)) instead of the Python-2-only .next()
        # method; [1] lists the immediate subdirectories (one per year).
        for y in sorted(next(os.walk(config.photo_dir))[1], reverse=True):
            year = Year(y)
            month_images = [m.first_image_url() for m in year.months()]
            # First month image with a non-empty URL represents the year.
            image_url = first([i for i in month_images if i != ''])

            years.append({
                'view_data': year.to_view_data(),
                'image_url': image_url
            })

        return {'years': years, 'summary': get_summary(config.photo_dir)}
Example #7
0
    def to_view_data(self):
        """Build the view data for this year: its non-empty months plus a summary."""
        months_result = []
        for m in self.months():
            image = m.first_image_url()
            # Skip months with no images.
            if image == '':
                continue
            months_result.append({
                'month': m.name,
                'url': m.url_path,
                'summary': m.get_summary(),
                # Reuse the URL fetched above instead of calling
                # first_image_url() a second time (assumes the call is pure —
                # it is only a lookup in the sibling snippets here).
                'first_image_url': image
            })

        return {
            'months': months_result,
            'year': self.year,
            'summary': get_summary(self.year_dir)
        }
Example #8
0
    # Entropy bonus: subtracting entropy from the RL cost encourages
    # exploration (weighted by args.beta).
    tg_rl_cost -= args.beta * tg.seq_action_entropy

    rl_grads = tf.gradients(tg_rl_cost, tvars)
    # do not increase global step -- ml op increases it 
    rl_op = rl_opt_func.apply_gradients(zip(rl_grads, tvars))
    
    # Stash the hyperparameter in the graph so it can be recovered at load time.
    tf.add_to_collection('n_fast_action', args.n_fast_action)

    train_set, valid_set, test_set = utils.prepare_dataset(args)

    init_op = tf.global_variables_initializer()

    save_op, best_save_op = utils.init_savers(args)

    # Separate name scopes keep train/valid summaries distinct in TensorBoard.
    with tf.name_scope("tr_eval"):
        tr_summary = get_summary('ce rl cr image'.split())
    with tf.name_scope("val_eval"):
        val_summary = get_summary('ce rl cr fer image'.split())

    # Linear value function baseline (presumably for variance reduction —
    # TODO confirm against utils.LinearVF).
    vf = utils.LinearVF()

    with tf.Session() as sess:
        sess.run(init_op)
        summary_writer = tf.summary.FileWriter(args.logdir, sess.graph, flush_secs=5.0)
    
        # ce, accuracy, rl cost, action entropy, reward, compression ratio
        accu_list = [Accumulator() for i in range(6)]
        ce, ac, rl, ae, rw, cr = accu_list 

        # Best score so far; lower is better, so start at the int32 maximum.
        _best_score = np.iinfo(np.int32).max
Example #9
0
 def get_summary(self):
     # Delegate to the module-level get_summary() for this month's directory.
     return get_summary(self.month_dir)
Example #10
0
def train():
    """Run the seq2seq training loop driven by FLAGS.

    Loads parallel train (and optional validation) data, creates or restores
    the model, then trains for up to FLAGS.max_epochs, periodically logging,
    validating, and checkpointing.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu
    
    # Load parallel data to train
    print('Loading training data..')
    train_set = BiTextIterator(source=FLAGS.source_train_data,
                               target=FLAGS.target_train_data,
                               source_dict=FLAGS.source_vocabulary,
                               target_dict=FLAGS.target_vocabulary,
                               batch_size=FLAGS.batch_size,
                               max_length=FLAGS.max_seq_length,
                               n_words_source=FLAGS.num_encoder_symbols,
                               n_words_target=FLAGS.num_decoder_symbols,
                               sort_by_length=FLAGS.sort_by_length,
                               split_sign=FLAGS.split_sign
                               )
    
    if FLAGS.source_valid_data and FLAGS.target_valid_data:
        print('Loading validation data..')
        valid_set = BiTextIterator(source=FLAGS.source_valid_data,
                                   target=FLAGS.target_valid_data,
                                   source_dict=FLAGS.source_vocabulary,
                                   target_dict=FLAGS.target_vocabulary,
                                   batch_size=FLAGS.batch_size,
                                   max_length=FLAGS.max_seq_length,
                                   n_words_source=FLAGS.num_encoder_symbols,
                                   n_words_target=FLAGS.num_decoder_symbols,
                                   sort_by_length=FLAGS.sort_by_length,
                                   split_sign=FLAGS.split_sign
                                   )
    else:
        valid_set = None
    
    # Initiate TF session
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement,
                                          log_device_placement=FLAGS.log_device_placement,
                                          gpu_options=tf.GPUOptions(allow_growth=True))) as sess:
        
        
        # Create a new model or reload existing checkpoint
        model = create_model(sess, FLAGS)

        # Create a log writer object
        train_summary_writer = tf.summary.FileWriter(join(FLAGS.model_dir, 'train'), graph=sess.graph)
        valid_summary_writer = tf.summary.FileWriter(join(FLAGS.model_dir, 'valid'), graph=sess.graph)

        step_time, loss = 0.0, 0.0
        words_seen, sents_seen, processed_number = 0, 0, 0
        start_time = time.time()
        
        # Training loop
        print('Training..')
        
        for epoch_idx in range(FLAGS.max_epochs):
            if model.global_epoch_step.eval() >= FLAGS.max_epochs:
                print('Training is already complete.',
                      'current epoch:{}, max epoch:{}'.format(model.global_epoch_step.eval(), FLAGS.max_epochs))
                break
            
            # reset train set
            train_set.reset()
            
            with tqdm(total=train_set.length()) as pbar:
                
                for source_seq, target_seq in train_set.next():
                    # Get a batch from training parallel data
                    source, source_len, target, target_len = prepare_pair_batch(source_seq, target_seq,
                                                                                FLAGS.max_seq_length,
                                                                                FLAGS.max_seq_length)
                    
                    processed_number += len(source_seq)
                    
                    if source is None or target is None:
                        print('No samples under max_seq_length ', FLAGS.max_seq_length)
                        continue
                    
                    # Execute a single training step
                    step_loss, summary = model.train(sess, encoder_inputs=source, encoder_inputs_length=source_len,
                                                     decoder_inputs=target, decoder_inputs_length=target_len)
                    
                    loss += float(step_loss) / FLAGS.display_freq
                    words_seen += float(np.sum(source_len + target_len))
                    sents_seen += float(source.shape[0])  # batch_size
                    
                    if model.global_step.eval() % FLAGS.display_freq == 0:
                        avg_perplexity = math.exp(float(loss)) if loss < 300 else float("inf")
                        
                        time_elapsed = time.time() - start_time
                        step_time = time_elapsed / FLAGS.display_freq
                        
                        words_per_sec = words_seen / time_elapsed
                        sents_per_sec = sents_seen / time_elapsed
                        
                        print('Epoch:', model.global_epoch_step.eval(), 'Step:', model.global_step.eval(),
                              'Perplexity {0:.2f}:'.format(avg_perplexity), 'Loss:', loss, 'Step-time:', step_time,
                              '{0:.2f} sents/s'.format(sents_per_sec), '{0:.2f} words/s'.format(words_per_sec))

                        # Record training summary for the current batch
                        summary = get_summary('train_loss', loss)
                        train_summary_writer.add_summary(summary, model.global_step.eval())
                        print('Record Training Summary', model.global_step.eval())
                        train_summary_writer.flush()
                        
                        pbar.update(processed_number)
                        
                        # Reset the windowed statistics for the next interval.
                        loss = 0
                        words_seen = 0
                        sents_seen = 0
                        processed_number = 0
                        start_time = time.time()
                        
                    
                    # Execute a validation step
                    if valid_set and model.global_step.eval() % FLAGS.valid_freq == 0:
                        print('Validation step')
                        valid_loss = 0.0
                        valid_sents_seen = 0
                        
                        # reset valid set
                        valid_set.reset()
                        
                        for source_seq, target_seq in valid_set.next():
                            # Get a batch from validation parallel data
                            source, source_len, target, target_len = prepare_pair_batch(source_seq, target_seq,
                                                                                        FLAGS.max_seq_length,
                                                                                        FLAGS.max_seq_length)
                            
                            # Compute validation loss: average per word cross entropy loss
                            step_loss, summary = model.eval(sess, encoder_inputs=source,
                                                            encoder_inputs_length=source_len,
                                                            decoder_inputs=target, decoder_inputs_length=target_len)
                            batch_size = source.shape[0]
                            
                            valid_loss += step_loss * batch_size
                            valid_sents_seen += batch_size
                            print('{} samples seen,'.format(valid_sents_seen),
                                  'Step Loss: {0:.2f}'.format(step_loss))
                        
                        valid_loss = valid_loss / valid_sents_seen
                        print('Valid perplexity: {0:.2f}:'.format(math.exp(valid_loss)), 'Loss:', valid_loss)

                        # Record training summary for the current batch
                        summary = get_summary('valid_loss', valid_loss)
                        valid_summary_writer.add_summary(summary, model.global_step.eval())
                        print('Record Valid Summary', model.global_step.eval())
                        valid_summary_writer.flush()
                        
                    # Save the model checkpoint
                    if model.global_step.eval() % FLAGS.save_freq == 0:
                        print('Saving the model..')
                        checkpoint_path = os.path.join(FLAGS.model_dir, FLAGS.model_name)
                        model.save(sess, checkpoint_path, global_step=model.global_step)
                        # FIX: use a context manager so the config file handle
                        # is closed (the original passed a bare open() to
                        # json.dump and leaked it).
                        config_path = '%s-%d.json' % (checkpoint_path, model.global_step.eval())
                        with open(config_path, 'w', encoding='utf-8') as config_file:
                            json.dump(model.config, config_file, indent=2)
            
            # Increase the epoch index of the model
            model.global_epoch_step_op.eval()
            print('Epoch {0:} DONE'.format(model.global_epoch_step.eval()))
        
        print('Saving the last model..')
        checkpoint_path = os.path.join(FLAGS.model_dir, FLAGS.model_name)
        model.save(sess, checkpoint_path, global_step=model.global_step)
        # FIX: same context-manager treatment as the periodic save above; the
        # two dumps also now agree on encoding='utf-8'.
        config_path = '%s-%d.json' % (checkpoint_path, model.global_step.eval())
        with open(config_path, 'w', encoding='utf-8') as config_file:
            json.dump(model.config, config_file, indent=2)
    
    print('Training Terminated')
Example #11
0
    # Clip gradients to a global norm of 1.0 before applying them.
    ml_grads, _ = tf.clip_by_global_norm(tf.gradients(tg_ml_cost, tvars),
                                         clip_norm=1.0)
    ml_op = ml_opt_func.apply_gradients(zip(ml_grads, tvars),
                                        global_step=global_step)

    # Stash hyperparameters in the graph so they can be recovered at load time.
    tf.add_to_collection('n_skip', args.n_skip)
    tf.add_to_collection('n_hidden', args.n_hidden)

    train_set, valid_set, test_set = utils.prepare_dataset(args)

    init_op = tf.global_variables_initializer()

    save_op, best_save_op = utils.init_savers(args)

    # Separate name scopes keep train/valid summaries distinct in TensorBoard.
    with tf.name_scope("tr_eval"):
        tr_summary = utils.get_summary('ce cr image'.split())
    with tf.name_scope("val_eval"):
        val_summary = utils.get_summary('ce cr fer image'.split())

    with tf.Session() as sess:
        sess.run(init_op)
        summary_writer = tf.summary.FileWriter(args.logdir,
                                               sess.graph,
                                               flush_secs=5.0)

        # ce, accuracy, compression ratio
        accu_list = [Accumulator() for i in range(3)]
        ce, ac, cr = accu_list

        # Best score so far; lower is better, so start at the int32 maximum.
        _best_score = np.iinfo(np.int32).max
Example #12
0
    def summary(self):
        """Return this album's summary, computing and caching it on first use."""
        try:
            # Fast path: already computed.
            return self.cached_summary
        except AttributeError:
            self.cached_summary = get_summary(self.album_dir)
            return self.cached_summary
Example #13
0
# TODO: this part can add one incise the dna

# Thin each cleaned image down to its skeleton.
thinning_image = []
for i in clean_img:
    print('thinning.......')
    thinning_image.append(utils.thinning(i))

# Count the heads and collect per-image stats (head coordinates are unused here).
stat_list = []
for i in thinning_image:
    stat, heads = utils.head_and_len(i)
    stat_list.append(stat)

# Print/record the aggregate summary of the collected stats.
utils.get_summary(stat_list)

# Enlarge the image space before labelling.
large_image = [utils.transfe(i) for i in thinning_image]

# Label the complex images and flatten all single-object crops into one list.
all_single = []
for i in large_image:
    print('label .......')
    all_single.extend(utils.img_label(i))

# BUG FIX: the original never incremented pic_num, so every crop overwrote
# tmp/0.png; enumerate gives each crop its own file.
for pic_num, single in enumerate(all_single):
    cv2.imwrite('tmp/{}.png'.format(pic_num), single)
Example #14
0
 def get_summary(self):
   # Delegate to the module-level get_summary() for this month's directory.
   return get_summary(self.month_dir)
def main():
    """Train/evaluate a sequence classifier on MIMIC data.

    Parses CLI args, builds model/tokenizer (optionally extending the vocab
    with ICD-code tokens and pretrained KG embeddings), then runs training,
    evaluation, and/or testing as requested. Returns the metrics dict.
    """
    args = ArgParser().parse_args()
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)

    logger.warning(
        "device: %s, n_gpu: %s, 16-bits training: %s",
        args.device,
        args.n_gpu,
        args.fp16,
    )

    set_seed(args)

    processor = MimicProcessor()

    label_list = processor.get_labels()
    num_labels = len(label_list)

    config = AutoConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task,
        cache_dir=args.cache_dir)
    args.model_type = config.model_type
    config.codes_attention = args.codes_attention
    config.threshold = args.threshold
    logger.info("CODES ATTENTION IS {}".format(args.codes_attention))
    logger.info("INCLUDE CODES IS {}".format(args.include_codes))
    logger.info("PRETRAINED ICD IS {}".format(args.pretrained_icd))
    logger.info("ONLY CODES IS {}".format(args.only_codes))
    config.only_codes = args.only_codes
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name
        if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir)
    model_options = {
        'bert': BertForSequenceClassification,
        'electra': ElectraForSequenceClassification
    }
    model_class = model_options[args.model_type]

    model = model_class.from_pretrained(args.model_name_or_path,
                                        config=config,
                                        cache_dir=args.cache_dir)

    # Add ICD codes as new tokens to tokenizer
    # NOTE(review): hardcoded absolute paths below tie this script to one
    # machine — consider making them CLI arguments.
    if args.include_codes:
        icd_codes_mortality = pd.read_csv(
            '/home/dc925/project/data/graphmimic/mortality/icd_codes_mortality.txt',
            header=None)
        icd_codes_readmission = pd.read_csv(
            '/home/dc925/project/data/graphmimic/readmission/icd_codes_readmission.txt',
            header=None)
        icd_codes_mortality = icd_codes_mortality[0].tolist()
        icd_codes_readmission = icd_codes_readmission[0].tolist()
        icd_codes = set(icd_codes_mortality + icd_codes_readmission)
        icd_codes = sorted(icd_codes)
        icd_codes_tokens = ['ICD' + c for c in icd_codes]
        num_added_tokens = tokenizer.add_tokens(icd_codes_tokens)
        logger.info('we have added {} tokens'.format(num_added_tokens))
        model.resize_token_embeddings(len(tokenizer))

    if args.pretrained_icd:
        assert args.include_codes
        # read in kge and entities.tsv
        kge = np.load(
            '/home/dc925/project/graphmimic/ckpts/RotatE_ICD9_2/ICD9_RotatE_entity.npy'
        )
        entities = pd.read_csv(
            '/home/dc925/project/data/graphmimic/UMLS/ICD_KG/entities.tsv',
            sep='\t',
            header=None)
        entities.columns = ['ID', 'ICD']
        icd2id = pd.Series(entities['ID'].values,
                           index=entities['ICD']).to_dict()
        id2icd = {v: k for k, v in icd2id.items()}

        broad_idx = [icd2id[c] for c in icd_codes]
        broad_kge = kge[broad_idx]
        assert broad_kge.shape[1] == config.embedding_size

        # Overwrite the embeddings of the newly added ICD tokens (the last
        # num_added_tokens rows) with the pretrained KG embeddings.
        with torch.no_grad():
            embeddings = model.get_input_embeddings()
            embeddings.weight[-num_added_tokens:, :] = torch.tensor(broad_kge)

    model.to(args.device)
    logger.info("Training/evaluation parameters {}".format(args))
    get_summary(model)

    # BUG FIX: global_step is used below when formatting result keys; the
    # original only defined it inside the do_train branch, raising NameError
    # when evaluating/testing without training.
    global_step = 0

    if args.do_train:

        train_dataset = load_and_cache_examples(args, processor, tokenizer)
        global_step, tr_loss = train(args, train_dataset, model, processor,
                                     tokenizer)
        logger.info("global_step = {}, average loss = {}".format(
            global_step, tr_loss))

        logger.info("Saving model checkpoint to {}".format(args.output_dir))
        model.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        model = model_class.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    results = {}
    # NOTE(review): the "'_{}" key suffix embeds a literal apostrophe in every
    # metric name — looks like a typo, kept as-is because downstream consumers
    # may depend on the current key format.
    if args.do_eval:
        result = evaluate(args, model, processor, tokenizer)
        result = {k + "'_{}".format(global_step): v for k, v in result.items()}
        results.update(result)
    if args.do_test:
        result = evaluate(args, model, processor, tokenizer, mode='test')
        result = {k + "'_{}".format(global_step): v for k, v in result.items()}
        results.update(result)
    return results
Example #16
0
def analysis(fpath: str, extname, imgdir=None, do_drawings=False):
    """Extract text from a document and compute keyword/phrase/summary data.

    Parameters
    ----------
    fpath : str
        Path to the input document.
    extname : str
        File extension (with leading dot) that selects the reader.
    imgdir : str, optional
        Directory to extract embedded images into.
    do_drawings : bool
        Also process CAD/archive formats (.dxf/.dwg/.rar/.zip).

    Returns
    -------
    tuple
        (keywords, keyword:freq pairs, phrases, new words, summary,
        images, drawings) — the first four as comma-joined strings.
    """
    content = None
    images = []

    kw_arr = []
    freq_arr = []
    ph_arr = []
    nw_arr = []
    sum_arr = []

    # NOTE(review): 'if True' kept from the original (was 'if not
    # do_drawings'); the text formats are always attempted.
    if True:
        if extname == '.txt':
            content = readtxt.read(fpath)

        if extname == '.docx':
            content = readword.readtxt(fpath)
            images = readword.readimg(fpath, imgdir, str(uuid.uuid4()))
        if extname == '.doc':
            # .doc files are expected to have a converted .docx sibling.
            content = readword.readtxt(fpath + 'x')
            images = readword.readimg(fpath + 'x', imgdir, str(uuid.uuid4()))

        if extname == '.pptx':
            content = readppt.readtxt(fpath)
            images = readppt.readimg(fpath, imgdir, str(uuid.uuid4()))
        if extname == '.ppt':
            content = readppt.readtxt(fpath + 'x')
            images = readppt.readimg(fpath + 'x', imgdir, str(uuid.uuid4()))

        if extname == '.pdf':
            content = readpdf.readtext(fpath)

    drawings = None
    do_split_drawing = False
    if do_drawings:
        if extname == '.dxf':
            content = readdxf.readtxt(fpath)
            if do_split_drawing:
                drawings = readdxf.split_drawing_byblock(fpath)

        if extname == '.dwg':
            # Poll for the external .dwg -> .dxf conversion to finish.
            maxtry = 30
            transpath = fpath.replace('.dwg', '.dxf')
            for ii in range(maxtry):
                print(ii)
                time.sleep(3)
                if os.path.isfile(transpath):
                    content = readdxf.readtxt(transpath)
                    if do_split_drawing:
                        drawings = readdxf.split_drawing_byblock(fpath)
                    break

        if extname == '.rar':
            content = readrar.readrar(fpath, rm_prefix=True, maxnames=10)

        if extname == '.zip':
            content = readrar.readzip(fpath, rm_prefix=True, maxnames=10)

    # do analysis
    if content is not None:
        # Cap very long documents to roughly max_words words by truncating
        # whole paragraphs proportionally.
        total_words_count = len(' '.join(content))
        total_paragraph_count = len(content)
        max_words = 50000
        if total_words_count > max_words:
            paragraph_limit = math.ceil(max_words / total_words_count *
                                        total_paragraph_count)
            content = content[:paragraph_limit]
            print('limit paragraphs ' + str(paragraph_limit))
            print('limit words ' + str(len(' '.join(content))))

        # key words
        kw_arr = utils.get_keywords(content, config.kw_topk)
        # word frequency array
        # BUG FIX: the original produced int 0 for keywords missing from
        # freq, which made the x + ':' + y join below raise TypeError;
        # every entry is now a string.
        freq = utils.get_freq(content)
        freq_arr = [str(freq.get(x, 0)) for x in kw_arr]
        # key phrases
        ph_arr = utils.get_phrase(content, n=10)
        # new words
        if not extname == '.dwg':
            nw_arr = utils.get_newwords(content, n=20)
        # auto summary: archives already hold a name listing, use it directly
        if extname == '.rar' or extname == '.zip':
            sum_arr = content
        else:
            sum_arr = utils.get_summary(content, n=10)

    # give keywords to images
    # ['fname', 'keywords', 'relatedtxt']
    makeparam = {}
    if images:
        for cimg in images:
            makeparam[cimg['fname']] = cimg['relatedtxt']

        kwdic = utils.get_keywordsmany(makeparam, config.kw_topk_image)
        for cimg in images:
            cimg['keywords'] = ','.join(kwdic[cimg['fname']][0])
            cimg['newwords'] = ','.join(kwdic[cimg['fname']][1])
            cimg['docname'] = fpath

    return (
        ','.join(kw_arr),
        ','.join([x + ':' + y for x, y in zip(kw_arr, freq_arr)]),
        ','.join(ph_arr),
        ','.join(nw_arr),
        sum_arr,
        images,
        drawings)
Example #17
0
File: urls.py Project: zhu327/blog
def _get_summary(content):
    """Extract the summary of *content* and render it through ``md``."""
    summary_text = get_summary(content)
    return md.render(summary_text)
Example #18
0
  def summary(self):
    """Return the album summary, computing it once and caching the result."""
    if not hasattr(self, 'cached_summary'):
      self.cached_summary = get_summary(self.album_dir)
    return self.cached_summary
Example #19
0
                                        global_step=global_step)

    # Stash hyperparameters in the graph so they can be recovered at load time.
    tf.add_to_collection('n_skip', args.n_skip)
    tf.add_to_collection('n_hidden', args.n_hidden)
    tf.add_to_collection('n_step', args.n_step)
    tf.add_to_collection('n_layer', args.n_layer)
    tf.add_to_collection('n_class', args.n_class)

    train_set, valid_set, test_set = utils.prepare_dataset(args)

    init_op = tf.global_variables_initializer()

    save_op, best_save_op = utils.init_savers(args)

    # Separate name scopes keep train/valid summaries distinct in TensorBoard.
    with tf.name_scope("tr_eval"):
        tr_summary = utils.get_summary('ce cr'.split())
    with tf.name_scope("val_eval"):
        val_summary = utils.get_summary('ce cr fer'.split())

    with tf.Session() as sess:
        sess.run(init_op)
        summary_writer = tf.summary.FileWriter(args.logdir,
                                               sess.graph,
                                               flush_secs=5.0)

        # ce, accuracy, compression ratio
        accu_list = [Accumulator() for i in range(3)]
        ce, ac, cr = accu_list

        # Best score so far; lower is better, so start at the int32 maximum.
        _best_score = np.iinfo(np.int32).max