import biLSTM.encoder.encoder_model as biLSTM_encoder_model
import biLSTM.encoder.entailment_model as biLSTM_entailment_model
import biLSTM.encoder.bi_lstm_model as bi_lstm_model

MAX_SEQ_LEN_LABEL_DEF = 512  ## max length for a GO definition (could probably be smaller)

if args.w2v_emb is not None:  ## otherwise each node is treated as a plain vector, without its word description
    Vocab = load_vocab(args.vocab_list)  # all words found in PubMed and trained in w2v ... should be trimmed down
    ## label-feature reading lives in the @GCN folder; too lazy to port this function out.
    LabelDescLoader = GCN_data_loader.LabelProcessor()
    if args.tree:
        # @label_in_ontology gets every GO term in the whole ontology; needed if we use the tree method
        LabelSamples = LabelDescLoader.get_examples(args.data_dir, label_array=label_in_ontology)
        LabelSamples = GCN_data_loader.convert_labels_to_features(LabelSamples, MAX_SEQ_LEN_LABEL_DEF, Vocab, all_name_array=label_in_ontology, tokenize_style='space')
    else:
        ## only get vectors for the labels we want to test.
        LabelSamples = LabelDescLoader.get_examples(args.data_dir, label_array=label_to_test)
        LabelSamples = GCN_data_loader.convert_labels_to_features(LabelSamples, MAX_SEQ_LEN_LABEL_DEF, Vocab, all_name_array=label_to_test, tokenize_style='space')
    GO_loader_for_biLSTM, GO_name_for_biLSTM = GCN_data_loader.make_label_loader(LabelSamples, args.batch_size_bert, fp16=False)  ## if we fix (freeze) the encoder, batch size is not a concern; 32 or even 64 should be fine

## **** load protein data

if args.ontology is None:
    add_name = ""
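## --------------------------------------------------------------------
## Sketch only (not the repo's actual implementation): what the
## space-tokenized conversion above is assumed to do per GO definition.
## Each description is split on whitespace, mapped through the w2v
## vocab (token -> integer index), then truncated/padded to the max
## length. `_convert_one_label_sketch` and `pad_id=0` are hypothetical
## names, not part of GCN_data_loader.
def _convert_one_label_sketch(description, vocab, max_seq_length, pad_id=0):
    ids = [vocab[w] for w in description.split() if w in vocab]  # space tokenize + look up index
    ids = ids[:max_seq_length]                      # truncate long definitions
    ids += [pad_id] * (max_seq_length - len(ids))   # pad short definitions
    return ids

## Example: _convert_one_label_sketch('DNA binding', {'DNA': 5, 'binding': 9}, 4)
## -> [5, 9, 0, 0]
## --------------------------------------------------------------------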
if (args.w2v_emb is not None) and (args.word_mode != 'PretrainedGO'):
    ## otherwise each node is treated as a plain vector, without its word description;
    ## with PretrainedGO vectors we don't need the GO definitions
    Vocab = load_vocab(args.vocab_list)  # all words found in PubMed and trained in w2v ... should be trimmed down
    processor = data_loader.LabelProcessor()
    label_desc_examples = processor.get_examples(args.main_dir, all_name_array)  ## must get all labels
    label_desc_features = data_loader.convert_labels_to_features(label_desc_examples, max_seq_length=MAX_SEQ_LEN, tokenizer=Vocab, all_name_array=all_name_array, tokenize_style="space")
    if args.batch_size_label_desc == 0:
        args.batch_size_label_desc = args.num_label  # default: one batch holding every label
    label_desc_dataloader, label_desc_name = data_loader.make_label_loader(label_desc_features, batch_size=args.batch_size_label_desc, fp16=args.fp16)  # len(label_desc_examples)
    print('num of label desc to be transformed by gcn {}'.format(len(label_desc_examples)))
else:
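    ## (branch body continues below) this case covers w2v_emb is None or
    ## word_mode == 'PretrainedGO': per the note above, each GO term is
    ## represented by a pretrained vector, so its definition text does
    ## not need to be tokenized and loaded here.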