def get_summary_id_by_subject(cur, text):
    rec = utils.get_summary(cur)
    for r in rec:
        # print("%s:%s" % (r['subject'], text))
        if str(r['subject']) in text:
            return r['id']
    return None
def __init__(self, **kwargs):
    for key, value in kwargs.iteritems():
        if not hasattr(self, key):
            raise TypeError("__init__() got an unexpected keyword argument '%s'" % key)
        setattr(self, key, value)

    parser_kw = {
        'prog': self.name,
        'usage': self.usage,
        'description': self.description,
        'formatter_class': argparse.RawDescriptionHelpFormatter,
        'add_help': False,
    }
    self.parser = argparse.ArgumentParser(**parser_kw)

    arggroup_name = '%s arguments' % self.name
    self.arggroup = self.parser.add_argument_group(arggroup_name)
    for argument in self.arguments:
        if isinstance(argument, Argument):
            self.arggroup.add_argument(*argument.args, **argument.kwargs)

    if self.subcommands:
        summary = get_summary(self.subcommands, title='subcommands:')
        description = [self.parser.description, summary]
        self.parser.description = '\n\n'.join(description)
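# Hedged sketch, not from the original source: Argument is only used above via
# isinstance(), .args and .kwargs, so a minimal compatible definition would be
# a thin wrapper that defers everything to parser.add_argument().
class Argument(object):
    """Bundle positional and keyword arguments for a later add_argument() call."""

    def __init__(self, *args, **kwargs):
        self.args = args
        self.kwargs = kwargs

# A subclass of the command class above might then declare (hypothetical names):
#   name = 'greet'
#   arguments = [Argument('target'), Argument('--shout', action='store_true')]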
def main(results_data_package, connection_string):
    if not sqlalchemy_utils.database_exists(connection_string):
        sqlalchemy_utils.create_database(connection_string)

    if not utils.check_results_package_file(results_data_package):
        print('Invalid results package file.')
        exit(1)

    engine = sqlalchemy.create_engine(connection_string)
    summary = utils.get_summary(results_data_package)

    with tarfile.open(results_data_package) as tar:
        for resource in summary['resources']:
            if resource['path'].startswith("outputs"):
                (_, perspective, summary_set, output_file) = str.split(resource['path'], '/')
                output_type, ext = str.split(output_file, '.')
                if output_type != 'summary_info':
                    print('{}_{}_{}'.format(output_type, summary_set, perspective))
                    summary_info_file = "outputs/{}/{}/summary_info.csv".format(
                        perspective, summary_set)
                    csv_contents = tar.extractfile(summary_info_file).read()
                    summary_info_df = pd.read_csv(io.BytesIO(csv_contents), encoding='utf8')
                    csv_contents = tar.extractfile(resource['path']).read()
                    df = pd.read_csv(io.BytesIO(csv_contents), encoding='utf8')
                    # merge() returns a new DataFrame; keep the result
                    df = df.merge(summary_info_df, on='summary_id')
                    df.to_sql(
                        '{}_{}_{}'.format(output_type, summary_set, perspective),
                        engine)
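# Hedged usage sketch; the package filename and SQLite URL are hypothetical.
# main('analysis_results.tar.gz', 'sqlite:///results.db')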
def get_summary_id_by_subject(cur, text):
    rec = utils.get_summary(cur)
    for r in rec:
        # print("%s:%s" % (r['subject'], text))
        # Determine which summary this message refers to by checking whether
        # the message text contains that summary's subject.
        if str(r['subject']) in text:
            return r['id']
    return 0
def to_view_data(self):
    years = []
    for y in sorted(os.walk(config.photo_dir).next()[1], reverse=True):
        year = Year(y)
        month_images = [m.first_image_url() for m in year.months()]
        image_url = first([i for i in month_images if i != ''])
        years.append({'view_data': year.to_view_data(), 'image_url': image_url})
    return {'years': years, 'summary': get_summary(config.photo_dir)}
def to_view_data(self):
    months_result = []
    for m in self.months():
        image = m.first_image_url()
        if image == '':
            continue
        months_result.append({
            'month': m.name,
            'url': m.url_path,
            'summary': m.get_summary(),
            'first_image_url': m.first_image_url()
        })
    return {
        'months': months_result,
        'year': self.year,
        'summary': get_summary(self.year_dir)
    }
tg_rl_cost -= args.beta * tg.seq_action_entropy

rl_grads = tf.gradients(tg_rl_cost, tvars)
# do not increase global step -- ml op increases it
rl_op = rl_opt_func.apply_gradients(zip(rl_grads, tvars))

tf.add_to_collection('n_fast_action', args.n_fast_action)

train_set, valid_set, test_set = utils.prepare_dataset(args)

init_op = tf.global_variables_initializer()
save_op, best_save_op = utils.init_savers(args)

with tf.name_scope("tr_eval"):
    tr_summary = get_summary('ce rl cr image'.split())
with tf.name_scope("val_eval"):
    val_summary = get_summary('ce rl cr fer image'.split())

vf = utils.LinearVF()

with tf.Session() as sess:
    sess.run(init_op)
    summary_writer = tf.summary.FileWriter(args.logdir, sess.graph, flush_secs=5.0)

    # ce, accuracy, rl cost, action entropy, reward, compression ratio
    accu_list = [Accumulator() for i in range(6)]
    ce, ac, rl, ae, rw, cr = accu_list

    _best_score = np.iinfo(np.int32).max
def get_summary(self):
    return get_summary(self.month_dir)
def train():
    os.environ['CUDA_VISIBLE_DEVICES'] = FLAGS.gpu

    # Load parallel data to train
    print('Loading training data..')
    train_set = BiTextIterator(source=FLAGS.source_train_data,
                               target=FLAGS.target_train_data,
                               source_dict=FLAGS.source_vocabulary,
                               target_dict=FLAGS.target_vocabulary,
                               batch_size=FLAGS.batch_size,
                               max_length=FLAGS.max_seq_length,
                               n_words_source=FLAGS.num_encoder_symbols,
                               n_words_target=FLAGS.num_decoder_symbols,
                               sort_by_length=FLAGS.sort_by_length,
                               split_sign=FLAGS.split_sign)

    if FLAGS.source_valid_data and FLAGS.target_valid_data:
        print('Loading validation data..')
        valid_set = BiTextIterator(source=FLAGS.source_valid_data,
                                   target=FLAGS.target_valid_data,
                                   source_dict=FLAGS.source_vocabulary,
                                   target_dict=FLAGS.target_vocabulary,
                                   batch_size=FLAGS.batch_size,
                                   max_length=FLAGS.max_seq_length,
                                   n_words_source=FLAGS.num_encoder_symbols,
                                   n_words_target=FLAGS.num_decoder_symbols,
                                   sort_by_length=FLAGS.sort_by_length,
                                   split_sign=FLAGS.split_sign)
    else:
        valid_set = None

    # Initiate TF session
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=FLAGS.allow_soft_placement,
                                          log_device_placement=FLAGS.log_device_placement,
                                          gpu_options=tf.GPUOptions(allow_growth=True))) as sess:
        # Create a new model or reload existing checkpoint
        model = create_model(sess, FLAGS)

        # Create a log writer object
        train_summary_writer = tf.summary.FileWriter(join(FLAGS.model_dir, 'train'), graph=sess.graph)
        valid_summary_writer = tf.summary.FileWriter(join(FLAGS.model_dir, 'valid'), graph=sess.graph)

        step_time, loss = 0.0, 0.0
        words_seen, sents_seen, processed_number = 0, 0, 0
        start_time = time.time()

        # Training loop
        print('Training..')
        for epoch_idx in range(FLAGS.max_epochs):
            if model.global_epoch_step.eval() >= FLAGS.max_epochs:
                print('Training is already complete.',
                      'current epoch:{}, max epoch:{}'.format(model.global_epoch_step.eval(), FLAGS.max_epochs))
                break

            # reset train set
            train_set.reset()

            with tqdm(total=train_set.length()) as pbar:
                for source_seq, target_seq in train_set.next():
                    # Get a batch from training parallel data
                    source, source_len, target, target_len = prepare_pair_batch(
                        source_seq, target_seq,
                        FLAGS.max_seq_length, FLAGS.max_seq_length)
                    # print('Get Data', source.shape, target.shape, source_len.shape, target_len.shape)
                    # print('Data', list(source[0]), list(target[0]))
                    processed_number += len(source_seq)
                    if source is None or target is None:
                        print('No samples under max_seq_length ', FLAGS.max_seq_length)
                        continue

                    # Execute a single training step
                    step_loss, summary = model.train(sess,
                                                     encoder_inputs=source,
                                                     encoder_inputs_length=source_len,
                                                     decoder_inputs=target,
                                                     decoder_inputs_length=target_len)
                    loss += float(step_loss) / FLAGS.display_freq
                    words_seen += float(np.sum(source_len + target_len))
                    sents_seen += float(source.shape[0])  # batch_size

                    if model.global_step.eval() % FLAGS.display_freq == 0:
                        avg_perplexity = math.exp(float(loss)) if loss < 300 else float("inf")
                        time_elapsed = time.time() - start_time
                        step_time = time_elapsed / FLAGS.display_freq
                        words_per_sec = words_seen / time_elapsed
                        sents_per_sec = sents_seen / time_elapsed
                        print('Epoch:', model.global_epoch_step.eval(),
                              'Step:', model.global_step.eval(),
                              'Perplexity {0:.2f}:'.format(avg_perplexity),
                              'Loss:', loss,
                              'Step-time:', step_time,
                              '{0:.2f} sents/s'.format(sents_per_sec),
                              '{0:.2f} words/s'.format(words_per_sec))

                        # Record training summary for the current batch
                        summary = get_summary('train_loss', loss)
                        train_summary_writer.add_summary(summary, model.global_step.eval())
                        print('Record Training Summary', model.global_step.eval())
                        train_summary_writer.flush()

                        pbar.update(processed_number)
                        loss = 0
                        words_seen = 0
                        sents_seen = 0
                        processed_number = 0
                        start_time = time.time()

                    # Execute a validation step
                    if valid_set and model.global_step.eval() % FLAGS.valid_freq == 0:
                        print('Validation step')
                        valid_loss = 0.0
                        valid_sents_seen = 0

                        # reset valid set
                        valid_set.reset()

                        for source_seq, target_seq in valid_set.next():
                            # Get a batch from validation parallel data
                            source, source_len, target, target_len = prepare_pair_batch(
                                source_seq, target_seq,
                                FLAGS.max_seq_length, FLAGS.max_seq_length)

                            # Compute validation loss: average per word cross entropy loss
                            step_loss, summary = model.eval(sess,
                                                            encoder_inputs=source,
                                                            encoder_inputs_length=source_len,
                                                            decoder_inputs=target,
                                                            decoder_inputs_length=target_len)
                            batch_size = source.shape[0]
                            valid_loss += step_loss * batch_size
                            valid_sents_seen += batch_size
                            print('{} samples seen,'.format(valid_sents_seen),
                                  'Step Loss: {0:.2f}'.format(step_loss))

                        valid_loss = valid_loss / valid_sents_seen
                        print('Valid perplexity: {0:.2f}:'.format(math.exp(valid_loss)),
                              'Loss:', valid_loss)

                        # Record validation summary for the current batch
                        summary = get_summary('valid_loss', valid_loss)
                        valid_summary_writer.add_summary(summary, model.global_step.eval())
                        print('Record Valid Summary', model.global_step.eval())
                        valid_summary_writer.flush()

                    # Save the model checkpoint
                    if model.global_step.eval() % FLAGS.save_freq == 0:
                        print('Saving the model..')
                        checkpoint_path = os.path.join(FLAGS.model_dir, FLAGS.model_name)
                        model.save(sess, checkpoint_path, global_step=model.global_step)
                        json.dump(model.config,
                                  open('%s-%d.json' % (checkpoint_path, model.global_step.eval()), 'w'),
                                  indent=2)

            # Increase the epoch index of the model
            model.global_epoch_step_op.eval()
            print('Epoch {0:} DONE'.format(model.global_epoch_step.eval()))

        print('Saving the last model..')
        checkpoint_path = os.path.join(FLAGS.model_dir, FLAGS.model_name)
        model.save(sess, checkpoint_path, global_step=model.global_step)
        json.dump(model.config,
                  open('%s-%d.json' % (checkpoint_path, model.global_step.eval()), 'w', encoding='utf-8'),
                  indent=2)

    print('Training Terminated')
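# Hedged sketch (an assumption, not the project's actual helper): since
# get_summary('train_loss', loss) is passed straight to add_summary() above,
# it most likely wraps a scalar value in a TF1 Summary protobuf:
def get_summary(tag, value):
    # Build a single-value scalar summary accepted by FileWriter.add_summary().
    return tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=value)])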
ml_grads, _ = tf.clip_by_global_norm(tf.gradients(tg_ml_cost, tvars), clip_norm=1.0)
ml_op = ml_opt_func.apply_gradients(zip(ml_grads, tvars), global_step=global_step)

tf.add_to_collection('n_skip', args.n_skip)
tf.add_to_collection('n_hidden', args.n_hidden)

train_set, valid_set, test_set = utils.prepare_dataset(args)

init_op = tf.global_variables_initializer()
save_op, best_save_op = utils.init_savers(args)

with tf.name_scope("tr_eval"):
    tr_summary = utils.get_summary('ce cr image'.split())
with tf.name_scope("val_eval"):
    val_summary = utils.get_summary('ce cr fer image'.split())

with tf.Session() as sess:
    sess.run(init_op)
    summary_writer = tf.summary.FileWriter(args.logdir, sess.graph, flush_secs=5.0)

    # ce, accuracy, compression ratio
    accu_list = [Accumulator() for i in range(3)]
    ce, ac, cr = accu_list

    _best_score = np.iinfo(np.int32).max
def summary(self):
    if hasattr(self, 'cached_summary'):
        return self.cached_summary
    self.cached_summary = get_summary(self.album_dir)
    return self.cached_summary
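# Hedged alternative sketch (assumes Python 3.8+ and that get_summary is the
# module-level helper used above): the hasattr-based memoization can also be
# written with functools.cached_property.
from functools import cached_property

class Album(object):  # hypothetical host class exposing album_dir
    def __init__(self, album_dir):
        self.album_dir = album_dir

    @cached_property
    def summary(self):
        return get_summary(self.album_dir)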
# TODO: this part could also add a step that incises the DNA
# get the thinned images
thinning_image = []
for i in clean_img:
    print('thinning.......')
    thinning_image.append(utils.thinning(i))

# count the heads and collect the per-image stats
stat_list = []
for i in thinning_image:
    stat, heads = utils.head_and_len(i)
    stat_list.append(stat)

# get the summary
utils.get_summary(stat_list)

# enlarge the image space
large_image = []
for i in thinning_image:
    large_image.append(utils.transfe(i))

# label the complex ones
all_single = []
for i in large_image:
    print('label .......')
    all_single = all_single + utils.img_label(i)

pic_num = 0
for i in all_single:
    cv2.imwrite('tmp/{}.png'.format(pic_num), i)
    pic_num += 1  # increment so each image gets its own file
def main():
    args = ArgParser().parse_args()
    args.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.n_gpu = torch.cuda.device_count()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO)
    logger.warning(
        "device: %s, n_gpu: %s, 16-bits training: %s",
        args.device,
        args.n_gpu,
        args.fp16,
    )

    set_seed(args)

    processor = MimicProcessor()
    label_list = processor.get_labels()
    num_labels = len(label_list)

    config = AutoConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task,
        cache_dir=args.cache_dir)
    args.model_type = config.model_type
    config.codes_attention = args.codes_attention
    config.threshold = args.threshold
    logger.info("CODES ATTENTION IS {}".format(args.codes_attention))
    logger.info("INCLUDE CODES IS {}".format(args.include_codes))
    logger.info("PRETRAINED ICD IS {}".format(args.pretrained_icd))
    logger.info("ONLY CODES IS {}".format(args.only_codes))
    config.only_codes = args.only_codes

    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir)

    model_options = {
        'bert': BertForSequenceClassification,
        'electra': ElectraForSequenceClassification
    }
    model_class = model_options[args.model_type]
    model = model_class.from_pretrained(args.model_name_or_path,
                                        config=config,
                                        cache_dir=args.cache_dir)

    # Add ICD codes as new tokens to tokenizer
    if args.include_codes:
        icd_codes_mortality = pd.read_csv(
            '/home/dc925/project/data/graphmimic/mortality/icd_codes_mortality.txt',
            header=None)
        icd_codes_readmission = pd.read_csv(
            '/home/dc925/project/data/graphmimic/readmission/icd_codes_readmission.txt',
            header=None)
        icd_codes_mortality = icd_codes_mortality[0].tolist()
        icd_codes_readmission = icd_codes_readmission[0].tolist()
        icd_codes = set(icd_codes_mortality + icd_codes_readmission)
        icd_codes = sorted(icd_codes)
        icd_codes_tokens = ['ICD' + c for c in icd_codes]
        num_added_tokens = tokenizer.add_tokens(icd_codes_tokens)
        logger.info('we have added {} tokens'.format(num_added_tokens))
        model.resize_token_embeddings(len(tokenizer))

    if args.pretrained_icd:
        assert args.include_codes
        # read in kge and entities.tsv
        kge = np.load(
            '/home/dc925/project/graphmimic/ckpts/RotatE_ICD9_2/ICD9_RotatE_entity.npy')
        entities = pd.read_csv(
            '/home/dc925/project/data/graphmimic/UMLS/ICD_KG/entities.tsv',
            sep='\t',
            header=None)
        entities.columns = ['ID', 'ICD']
        icd2id = pd.Series(entities['ID'].values, index=entities['ICD']).to_dict()
        id2icd = {v: k for k, v in icd2id.items()}
        broad_idx = [icd2id[c] for c in icd_codes]
        broad_kge = kge[broad_idx]
        assert broad_kge.shape[1] == config.embedding_size
        with torch.no_grad():
            embeddings = model.get_input_embeddings()
            embeddings.weight[-num_added_tokens:, :] = torch.tensor(broad_kge)

    model.to(args.device)

    logger.info("Training/evaluation parameters {}".format(args))
    get_summary(model)

    if args.do_train:
        train_dataset = load_and_cache_examples(args, processor, tokenizer)
        global_step, tr_loss = train(args, train_dataset, model, processor, tokenizer)
        logger.info("global_step = {}, average loss = {}".format(global_step, tr_loss))

        logger.info("Saving model checkpoint to {}".format(args.output_dir))
        model.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        model = model_class.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    results = {}
    if args.do_eval:
        result = evaluate(args, model, processor, tokenizer)
        result = dict(
            (k + "'_{}".format(global_step), v) for k, v in result.items())
        results.update(result)

    if args.do_test:
        result = evaluate(args, model, processor, tokenizer, mode='test')
        result = dict(
            (k + "'_{}".format(global_step), v) for k, v in result.items())
        results.update(result)

    return results
def analysis(fpath: str, extname, imgdir=None, do_drawings=False):
    content = None
    images = []
    # drawings = []
    kw_arr = []
    freq_arr = []
    ph_arr = []
    nw_arr = []
    sum_arr = []

    # if not do_drawings:
    if True:
        if extname == '.txt':
            content = readtxt.read(fpath)
        if extname == '.docx':
            content = readword.readtxt(fpath)
            images = readword.readimg(fpath, imgdir, str(uuid.uuid4()))
        if extname == '.doc':
            content = readword.readtxt(fpath + 'x')
            images = readword.readimg(fpath + 'x', imgdir, str(uuid.uuid4()))
        if extname == '.pptx':
            content = readppt.readtxt(fpath)
            images = readppt.readimg(fpath, imgdir, str(uuid.uuid4()))
        if extname == '.ppt':
            content = readppt.readtxt(fpath + 'x')
            images = readppt.readimg(fpath + 'x', imgdir, str(uuid.uuid4()))
        if extname == '.pdf':
            content = readpdf.readtext(fpath)

    drawings = None
    do_split_drawing = False
    if do_drawings:
        if extname == '.dxf':
            content = readdxf.readtxt(fpath)
            if do_split_drawing:
                drawings = readdxf.split_drawing_byblock(fpath)
        if extname == '.dwg':
            maxtry = 30
            transpath = fpath.replace('.dwg', '.dxf')
            for ii in range(maxtry):
                print(ii)
                time.sleep(3)
                if os.path.isfile(transpath):
                    content = readdxf.readtxt(transpath)
                    if do_split_drawing:
                        drawings = readdxf.split_drawing_byblock(fpath)
                    break

    if extname == '.rar':
        content = readrar.readrar(fpath, rm_prefix=True, maxnames=10)
    if extname == '.zip':
        content = readrar.readzip(fpath, rm_prefix=True, maxnames=10)

    # do analysis
    if content is not None:
        # truncate documents that are too long
        total_words_count = len(' '.join(content))
        total_paragraph_count = len(content)
        max_words = 50000
        if total_words_count > max_words:
            paragraph_limit = math.ceil(max_words / total_words_count * total_paragraph_count)
            content = content[:paragraph_limit]
            print('limit paragraphs ' + str(paragraph_limit))
            print('limit words ' + str(len(' '.join(content))))

        # key words
        kw_arr = utils.get_keywords(content, config.kw_topk)
        # word frequency array (use '0' so the join below works on strings)
        freq = utils.get_freq(content)
        freq_arr = list(map(lambda x: str(freq[x]) if x in freq else '0', kw_arr))
        # key phrases
        ph_arr = utils.get_phrase(content, n=10)
        # new words
        if not extname == '.dwg':
            nw_arr = utils.get_newwords(content, n=20)
        # auto summary
        if extname == '.rar' or extname == '.zip':
            sum_arr = content
        else:
            sum_arr = utils.get_summary(content, n=10)

    # give keywords to images
    # ['fname', 'keywords', 'relatedtxt']
    makeparam = {}
    if images:
        for cimg in images:
            # cimg['keywords'] = ','.join(utils.get_keywords([cimg['relatedtxt']], config.kw_topk_image))
            makeparam[cimg['fname']] = cimg['relatedtxt']
        kwdic = utils.get_keywordsmany(makeparam, config.kw_topk_image)
        for cimg in images:
            cimg['keywords'] = ','.join(kwdic[cimg['fname']][0])
            cimg['newwords'] = ','.join(kwdic[cimg['fname']][1])
            cimg['docname'] = fpath

    return (
        ','.join(kw_arr),
        # ','.join(freq_arr),
        ','.join([x + ':' + y for x, y in zip(kw_arr, freq_arr)]),
        ','.join(ph_arr),
        ','.join(nw_arr),
        sum_arr,
        images,
        drawings)
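# Hedged usage sketch; the document path and image directory are hypothetical.
# kw, kw_freq, phrases, new_words, summary, images, drawings = analysis(
#     '/data/docs/report.docx', '.docx', imgdir='/data/docs/images')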
def _get_summary(content):
    return md.render(get_summary(content))
                                      global_step=global_step)

tf.add_to_collection('n_skip', args.n_skip)
tf.add_to_collection('n_hidden', args.n_hidden)
tf.add_to_collection('n_step', args.n_step)
tf.add_to_collection('n_layer', args.n_layer)
tf.add_to_collection('n_class', args.n_class)

train_set, valid_set, test_set = utils.prepare_dataset(args)

init_op = tf.global_variables_initializer()
save_op, best_save_op = utils.init_savers(args)

with tf.name_scope("tr_eval"):
    tr_summary = utils.get_summary('ce cr'.split())
with tf.name_scope("val_eval"):
    val_summary = utils.get_summary('ce cr fer'.split())

with tf.Session() as sess:
    sess.run(init_op)
    summary_writer = tf.summary.FileWriter(args.logdir, sess.graph, flush_secs=5.0)

    # ce, accuracy, compression ratio
    accu_list = [Accumulator() for i in range(3)]
    ce, ac, cr = accu_list

    _best_score = np.iinfo(np.int32).max
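# Hedged sketch (an assumption, not the project's utils module): the way
# utils.get_summary('ce cr'.split()) is called inside a name_scope suggests it
# builds one scalar summary per metric name; a minimal TF1 version might be:
def get_summary(names):
    placeholders, summaries = {}, []
    for name in names:
        # One feedable placeholder and one scalar summary per metric.
        ph = tf.placeholder(tf.float32, shape=[], name=name)
        placeholders[name] = ph
        summaries.append(tf.summary.scalar(name, ph))
    return tf.summary.merge(summaries), placeholders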