Esempio n. 1
0
## Label descriptions are read with the processor that lives in the @GCN
## package; it is reused from there instead of being ported into this script.
LabelDescLoader = GCN_data_loader.LabelProcessor()

## The tree method needs every GO term of the ontology (@label_in_ontology);
## otherwise vectors are only built for the labels under test (@label_to_test).
_label_names = label_in_ontology if args.tree else label_to_test

## Same two steps for either choice: read the examples, then turn them into
## features using space tokenization.
LabelSamples = LabelDescLoader.get_examples(args.data_dir, label_array=_label_names)
LabelSamples = GCN_data_loader.convert_labels_to_features(
  LabelSamples,
  MAX_SEQ_LEN_LABEL_DEF,
  Vocab,
  all_name_array=_label_names,
  tokenize_style='space',
)

## Original author's note: with a fixed encoder the batch size is not a concern
## here; 32 or even 64 should be manageable.
GO_loader_for_biLSTM, GO_name_for_biLSTM = GCN_data_loader.make_label_loader(
  LabelSamples,
  args.batch_size_bert,
  fp16=False,
)


## **** load protein data

## The split files are named '<split>.tsv', or '<split>-<ontology>.tsv' when
## an ontology was given on the command line.
add_name = "" if args.ontology is None else '-' + args.ontology

## Both loaders take the same arguments except for the file name and the
## fifth positional value ('random' vs 'sequential' -- presumably the order in
## which samples are drawn; confirm against protSeqLoader.ProtLoader).
train_loader = protSeqLoader.ProtLoader(
  args.data_dir,
  'train' + add_name + '.tsv',
  all_name_array,
  MAX_SEQ_LEN,
  'random',
  args,
  args.do_kmer,
  label_to_test,
)

dev_loader = protSeqLoader.ProtLoader(
  args.data_dir,
  'dev' + add_name + '.tsv',
  all_name_array,
  MAX_SEQ_LEN,
  'sequential',
  args,
  args.do_kmer,
  label_to_test,
)

Esempio n. 2
0
    ## Body of a conditional whose header is outside this excerpt: build the
    ## data loader over the label descriptions.
    processor = data_loader.LabelProcessor()
    ## Examples are requested for every name in all_name_array (the original
    ## note insists that all labels must be loaded here).
    label_desc_examples = processor.get_examples(
        args.main_dir, all_name_array)  ## must get all labels

    ## Convert the examples to features, capped at MAX_SEQ_LEN and tokenized
    ## on spaces with Vocab.
    label_desc_features = data_loader.convert_labels_to_features(
        label_desc_examples,
        max_seq_length=MAX_SEQ_LEN,
        tokenizer=Vocab,
        all_name_array=all_name_array,
        tokenize_style="space")

    ## A batch size of 0 is replaced by args.num_label (mutating args in
    ## place) -- presumably so all labels go through in one batch; confirm.
    if args.batch_size_label_desc == 0:
        args.batch_size_label_desc = args.num_label

    ## NOTE(review): the trailing note below reads `len(label_desc_examples)`,
    ## possibly an alternative batch size that was considered -- confirm.
    label_desc_dataloader, label_desc_name = data_loader.make_label_loader(
        label_desc_features,
        batch_size=args.batch_size_label_desc,
        fp16=args.fp16)  # len(label_desc_examples)

    ## Report how many label descriptions were loaded.
    print('num of label desc to be transformed by gcn {}'.format(
        len(label_desc_examples)))

else:
    ## No label-description loader on this branch.
    ## NOTE(review): label_desc_name is not assigned here -- confirm that no
    ## later code reads it when this branch is taken.
    label_desc_dataloader = None

## **** From here on the "GO:" prefix is not used, according to the original
## note. The line that would strip it from all_name_array is currently
## disabled and kept only for reference:
# all_name_array = [ re.sub(r"GO:","",g) for g in all_name_array ]

## Read the GO-term entailment pairs used for training.
## `processor` is rebound here; the label-description branch above may have
## set it to a LabelProcessor.

processor = data_loader.QnliProcessor()
## Class labels as reported by the processor (per the original note: a
## no/yes entailment scheme).
label_list = processor.get_labels()  ## no/yes entailment style