Esempio n. 1
0
import biLSTM.encoder.encoder_model as biLSTM_encoder_model
import biLSTM.encoder.entailment_model as biLSTM_entailment_model
import biLSTM.encoder.bi_lstm_model as bi_lstm_model

# Max sequence length used for GO definitions (could probably be smaller).
MAX_SEQ_LEN_LABEL_DEF = 512

# Vocabulary of all words found in PubMed and trained in w2v (should be trimmed
# down). Original note: without w2v embeddings each node can simply be treated
# as a vector with no word description.
# NOTE(review): Vocab is read unconditionally further down, so this script
# presumably always runs with args.w2v_emb set - confirm, otherwise Vocab is
# undefined at that point.
if args.w2v_emb is not None:
  Vocab = load_vocab(args.vocab_list)

# The label-description reader lives in the GCN data loader module; it has not
# been ported out of there.
LabelDescLoader = GCN_data_loader.LabelProcessor()

# With the tree method we need every GO term in the whole ontology
# (label_in_ontology); otherwise only the labels we actually want vectors for.
_label_names = label_in_ontology if args.tree else label_to_test

LabelSamples = LabelDescLoader.get_examples(
  args.data_dir,
  label_array=_label_names,
)
LabelSamples = GCN_data_loader.convert_labels_to_features(
  LabelSamples,
  MAX_SEQ_LEN_LABEL_DEF,
  Vocab,
  all_name_array=_label_names,
  tokenize_style='space',
)

# If the encoder is fixed, batch size is not a concern here; 32 or even 64
# should be manageable.
GO_loader_for_biLSTM, GO_name_for_biLSTM = GCN_data_loader.make_label_loader(
  LabelSamples,
  args.batch_size_bert,
  fp16=False,
)


## **** load protein data


# When no ontology is given, add_name is set to the empty string.
# NOTE(review): the excerpt ends right after this statement, so how add_name is
# used (and what happens when an ontology is given) is not visible - confirm.
if args.ontology is None:
  add_name = ""
Esempio n. 2
0
# Build the label-description loader only when w2v embeddings are supplied and
# the word mode is not 'PretrainedGO'; with pretrained GO vectors the GO
# definitions are not needed.
if args.w2v_emb is not None and args.word_mode != 'PretrainedGO':

    # All words found in PubMed and trained in w2v (should be trimmed down).
    Vocab = load_vocab(args.vocab_list)

    # Every label must be read here, not just a subset.
    processor = data_loader.LabelProcessor()
    label_desc_examples = processor.get_examples(args.main_dir, all_name_array)

    label_desc_features = data_loader.convert_labels_to_features(
        label_desc_examples,
        max_seq_length=MAX_SEQ_LEN,
        tokenizer=Vocab,
        all_name_array=all_name_array,
        tokenize_style="space",
    )

    # A batch size of 0 is replaced by the number of labels.
    if args.batch_size_label_desc == 0:
        args.batch_size_label_desc = args.num_label

    label_desc_dataloader, label_desc_name = data_loader.make_label_loader(
        label_desc_features,
        batch_size=args.batch_size_label_desc,
        fp16=args.fp16,
    )

    print('num of label desc to be transformed by gcn {}'.format(len(label_desc_examples)))

else: