def create_fields(
    header: List[str],
    to_lower: bool = False,
    sen_column: str = "sen",
    tokenize_columns: Optional[List[str]] = None,
    convert_numerical: bool = False,
    tokenizer: Optional[PreTrainedTokenizer] = None,
) -> List[Tuple[str, Field]]:
    tokenize_columns = tokenize_columns or [sen_column]

    pipeline = None
    if convert_numerical:

        def preprocess_field(s: Union[str, int]) -> Union[str, int]:
            return int(s) if (isinstance(s, str) and s.isdigit()) else s

        pipeline = Pipeline(convert_token=preprocess_field)

    fields = []
    for column in header:
        if column in tokenize_columns:
            field = Field(batch_first=True, include_lengths=True, lower=to_lower)
            if tokenizer is not None:
                attach_tokenizer(field, tokenizer)
        else:
            field = RawField(preprocessing=pipeline)
            field.is_target = False
        fields.append((column, field))

    return fields
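# A minimal usage sketch for create_fields (illustrative, not part of the
# original source): the header names and the corpus path below are assumptions.
# The returned (name, Field) pairs can be passed directly to a torchtext
# TabularDataset.
#
# fields = create_fields(header=["sen", "labels"], to_lower=True, convert_numerical=True)
# corpus = TabularDataset(path="corpus.tsv", format="tsv", fields=fields)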
def preprocess(self, x):
    """Load a single example using this field, tokenizing if necessary.

    If the input is a Python 2 `str`, it will be converted to Unicode
    first. If `sequential=True`, it will be tokenized. Then the input
    will be optionally lowercased and passed to the user-provided
    `preprocessing` Pipeline."""
    if (six.PY2 and isinstance(x, six.string_types)
            and not isinstance(x, six.text_type)):  # never
        x = Pipeline(lambda s: six.text_type(s, encoding='utf-8'))(x)
    if self.sequential and isinstance(x, six.text_type):  # never
        x = self.tokenize(x.rstrip('\n'))
    if self.lower:
        x = [Pipeline(six.text_type.lower)(xx) for xx in x]
    if self.preprocessing is not None:
        return self.preprocessing(x)
    else:
        return x
def preprocess(self, x):
    """Load a single example using this field, tokenizing if necessary.

    If the input is a Python 2 `str`, it will be converted to Unicode
    first. If `sequential=True`, it will be tokenized. Then the input
    will be optionally lowercased and passed to the user-provided
    `preprocessing` Pipeline."""
    if (six.PY2 and isinstance(x, six.string_types)
            and not isinstance(x, six.text_type)):
        x = Pipeline(lambda s: six.text_type(s, encoding='utf-8'))(x)
    if isinstance(x, six.text_type):
        x = self.tokenize(x.rstrip('\n'))
    if self.lower:
        x = Pipeline(six.text_type.lower)(x)
    # The preprocessing Pipeline is applied to examples using this field after
    # tokenizing but before numericalizing. Many Datasets replace this
    # attribute with a custom preprocessor. Default: None.
    if self.preprocessing is not None:
        return self.preprocessing(x)
    else:
        return x
def preprocess(self, x):
    """Load a single example using this field, tokenizing if necessary.

    If the input is a Python 2 `str`, it will be converted to Unicode
    first. If `sequential=True`, it will be tokenized. Then the input
    will be optionally lowercased and passed to the user-provided
    `preprocessing` Pipeline."""
    if (six.PY2 and isinstance(x, six.string_types)
            and not isinstance(x, six.text_type)):
        x = Pipeline(lambda s: six.text_type(s, encoding='utf-8'))(x)
    if self.lower:
        x = Pipeline(six.text_type.lower)(x)
    if self.sequential and isinstance(x, six.text_type):
        doc = []
        sents = x.strip().split(' <eos> ')
        for sent in sents:
            doc.append(sent.strip().split())
        return doc
    else:
        raise RuntimeError('text_type')
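# Worked example for the <eos>-splitting variant above (illustrative, not from
# the source): the raw document string is split into sentences on " <eos> " and
# each sentence is whitespace-tokenized, yielding a list of token lists.
#
#   "the cat sat <eos> on the mat"  ->  [['the', 'cat', 'sat'], ['on', 'the', 'mat']]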
def get_E2E_loaders(path, valid=0.1, batch_size=32):
    utterance = data.Field(tokenize=tokenizer, lower=True)
    label = data.Field(sequential=False,
                       postprocessing=Pipeline(convert_token=convert_token))
    id = data.Field(use_vocab=False, sequential=False)
    fields = [('id', id),
              ('turn1', utterance),
              ('turn2', utterance),
              ('turn3', utterance),
              ('label', label)]
    train = data.TabularDataset('{}/train.txt'.format(path),
                                format='tsv', fields=fields, skip_header=True)
    valid = data.TabularDataset('{}/valid.txt'.format(path),
                                format='tsv', fields=fields, skip_header=True)
    test = data.TabularDataset('{}/test.txt'.format(path),
                               format='tsv', fields=fields, skip_header=True)
    vectors = vocab.Vectors(name='emojiplusglove.txt',
                            cache='/media/backup/nlp-cic/DialogueRNN/')
    utterance.build_vocab(train, valid, test, vectors=vectors)
    # utterance.build_vocab(train, valid, test, vectors='glove.840B.300d')
    label.build_vocab(train)
    train_iter = BucketIterator(train, train=True, batch_size=batch_size,
                                sort_key=lambda x: len(x.turn3),
                                device=torch.device(0))
    valid_iter = BucketIterator(valid, batch_size=batch_size,
                                sort_key=lambda x: len(x.turn3),
                                device=torch.device(0))
    test_iter = BucketIterator(test, batch_size=batch_size,
                               sort_key=lambda x: len(x.turn3),
                               device=torch.device(0))
    return (train_iter, valid_iter, test_iter,
            utterance.vocab.vectors if not args.cuda else utterance.vocab.vectors.cuda(),
            label.vocab.itos)
def get_E2E_loaders(path, valid=0.1, batch_size=32):
    utterance = data.Field(tokenize=tokenizer, lower=True)
    label = data.Field(sequential=False,
                       postprocessing=Pipeline(convert_token=convert_token))
    id = data.Field(use_vocab=False, sequential=False)
    fields = [('id', id),
              ('turn1', utterance),
              ('turn2', utterance),
              ('turn3', utterance),
              ('label', label)]
    train, valid = data.TabularDataset('{}/train.txt'.format(path),
                                       format='tsv', fields=fields,
                                       skip_header=True).split(1 - valid)
    test = data.TabularDataset('{}/devwithoutlabels.txt'.format(path),
                               format='tsv', fields=fields[:-1],
                               skip_header=True)
    utterance.build_vocab(train, valid, test, vectors='glove.840B.300d')
    label.build_vocab(train)
    train_iter = BucketIterator(train, train=True, batch_size=batch_size,
                                sort_key=lambda x: len(x.turn3),
                                device=torch.device(0))
    valid_iter = BucketIterator(valid, batch_size=batch_size,
                                sort_key=lambda x: len(x.turn3),
                                device=torch.device(0))
    test_iter = BucketIterator(test, batch_size=batch_size,
                               sort_key=lambda x: len(x.turn3),
                               device=torch.device(0))
    return (train_iter, valid_iter, test_iter,
            utterance.vocab.vectors if not args.cuda else utterance.vocab.vectors.cuda(),
            label.vocab.itos)
nltk.download("punkt") # sys.path.append(os.path.join(os.path.dirname(__file__), '..')) TEMPLATE_DIR = os.path.abspath("./templates") STATIC_DIR = os.path.abspath("./static") app = Flask(__name__, template_folder=TEMPLATE_DIR, static_folder=STATIC_DIR) # original stuff: # app = Flask(__name__) # , static_url_path= '', static_folder= './static/vendor' # app._static_folder = './static/vendor' # bootstrap = Bootstrap(app) RESULT = None pre_pipeline = Pipeline(lemmatize) pre_pipeline.add_before(preprocessing) TEXT = Field( sequential=True, tokenize=word_tokenize, lower=True, stop_words=STOPWORDS, preprocessing=pre_pipeline, ) LABELS = ["Neutral", "Negative", "Positive"] VOCAB = {} with open("./models/vocab.pkl", "rb") as f: VOCAB = pickle.load(f) best_config = { "hidden_size": 302,
def import_corpus(
    path: str,
    header: Optional[List[str]] = None,
    header_from_first_line: bool = False,
    to_lower: bool = False,
    vocab_path: Optional[str] = None,
    vocab_from_corpus: bool = False,
    sen_column: str = "sen",
) -> TabularDataset:
    """Imports a corpus from a path.

    The corpus can either be a raw string or a pickled dictionary.
    Outputs a `Corpus` type that is used throughout the library.

    The raw sentence is assumed to be labeled `sen` or `sent`.
    Sentences can optionally be labeled, in which case the labels are
    assumed to be marked by a `labels` tag.

    Parameters
    ----------
    path : str
        Path to corpus file.
    header : List[str], optional
        Optional list of attribute names of each column. If not provided,
        all lines are considered to be sentences, with the attribute name
        "sen".
    header_from_first_line : bool, optional
        Use the first line of the corpus as the attribute names of the
        corpus.
    to_lower : bool, optional
        Transform the entire corpus to lower case, defaults to False.
    vocab_path : str, optional
        Path to the model vocabulary, which should be a file containing a
        vocab entry on each line.
    vocab_from_corpus : bool, optional
        Create a new vocabulary from the tokens of the corpus itself. If
        set to True, `vocab_path` does not need to be provided. Defaults
        to False.
    sen_column : str, optional
        Name of the corpus column containing the raw sentences. Defaults
        to `sen`.

    Returns
    -------
    corpus : TabularDataset
        A TabularDataset containing the parsed sentences and optional
        labels.
    """
    if header is None:
        if header_from_first_line:
            with open(path) as f:
                header = f.readline().strip().split("\t")
        else:
            header = ["sen"]

    assert sen_column in header, "`sen` should be part of corpus_header!"

    def preprocess(s: str) -> Union[str, int]:
        return int(s) if s.isdigit() else s

    pipeline = Pipeline(convert_token=preprocess)

    fields = {}
    for field in header:
        if field == sen_column:
            fields[field] = Field(
                batch_first=True, include_lengths=True, lower=to_lower
            )
        elif field == "labels":
            fields[field] = Field(
                use_vocab=False, tokenize=lambda s: list(map(int, s.split()))
            )
        else:
            fields[field] = RawField(preprocessing=pipeline)
            fields[field].is_target = False

    corpus = TabularDataset(
        fields=fields.items(),
        format="tsv",
        path=path,
        skip_header=header_from_first_line,
        csv_reader_params={"quotechar": None},
    )

    # The current torchtext Vocab does not allow a fixed vocab order
    if vocab_path is not None or vocab_from_corpus:
        attach_vocab(corpus, vocab_path or path, sen_column=sen_column)

    return corpus
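# A minimal usage sketch for import_corpus (the file name and column layout are
# assumptions, not from the original source): a tab-separated corpus whose
# first line names its columns, e.g. "sen<TAB>labels".
#
# corpus = import_corpus(
#     "corpus.tsv",
#     header_from_first_line=True,
#     to_lower=True,
#     vocab_from_corpus=True,
# )
# for example in corpus.examples:
#     print(example.sen, example.labels)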
# -
# Define all the variables to be read by the torchtext TabularDataset
TEXT1 = Field(sequential=True, tokenize=tokenize_protein,
              init_token=None, eos_token=None, pad_first=False)
TEXT2 = Field(sequential=True, tokenize=tokenize_drug,
              init_token=None, eos_token=None)
LABEL = Field(sequential=False, use_vocab=False, is_target=True,
              dtype=torch.float, preprocessing=Pipeline(lambda x: float(x)))
INDEX1 = Field(sequential=False, use_vocab=True)
INDEX2 = Field(sequential=False, use_vocab=True)

# Read the data and get Protein Sequence, Canonical Smiles and Pchembl_value
datafields = [('uniprot_accession', INDEX1),
              ("Sequence", TEXT1),
              ('standard_inchi_key', INDEX2),
              ("canonical_smiles", TEXT2),
              ("pchembl_value", LABEL)]

# Predict activity score for sars-cov-2 viral proteins.
# The full data is used only to collect the InChI key of all compounds in the
# train and test sets and the UniProt accession of all viral organisms.
full_data, data, test_data = TabularDataset.splits(
    path="../data/",
    train='all_compound_viral_interactions_for_supervised_learning.csv',
    validation=args.input1,
    # test='Test_Compound_Viral_interactions_for_Supervised_Learning.csv',
class TRECQA(CastorPairDataset):
    NAME = 'trecqa'
    NUM_CLASSES = 2
    ID_FIELD = Field(sequential=False, tensor_type=torch.FloatTensor,
                     use_vocab=False, batch_first=True)
    AID_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    # tokenizer is the identity since the text was already tokenized to compute
    # the external features
    TEXT_FIELD = Field(batch_first=True, tokenize=lambda x: x)
    EXT_FEATS_FIELD = Field(
        tensor_type=torch.FloatTensor,
        use_vocab=False,
        batch_first=True,
        tokenize=lambda x: x,
        postprocessing=Pipeline(lambda arr, _, train: [float(y) for y in arr]))
    LABEL_FIELD = Field(sequential=False, use_vocab=False, batch_first=True)
    VOCAB_SIZE = 0

    @staticmethod
    def sort_key(ex):
        return len(ex.sentence_1)

    def __init__(self, path):
        """
        Create a TRECQA dataset instance
        """
        super(TRECQA, self).__init__(path, load_ext_feats=True)

    @classmethod
    def splits(cls, path, train='train-all', validation='raw-dev',
               test='raw-test', **kwargs):
        return super(TRECQA, cls).splits(path, train=train,
                                         validation=validation, test=test,
                                         **kwargs)

    @classmethod
    def set_vectors(cls, field, vector_path):
        if os.path.isfile(vector_path):
            stoi, vectors, dim = torch.load(vector_path)
            field.vocab.vectors = torch.Tensor(len(field.vocab), dim)

            for i, token in enumerate(field.vocab.itos):
                wv_index = stoi.get(token, None)
                if wv_index is not None:
                    field.vocab.vectors[i] = vectors[wv_index]
                else:
                    # initialize <unk> with uniform_(-0.05, 0.05) vectors
                    field.vocab.vectors[i] = torch.FloatTensor(dim).uniform_(-0.05, 0.05)
        else:
            print("Error: Need word embedding pt file")
            exit(1)

        return field

    @classmethod
    def iters(cls, path, vectors_name, vectors_dir, batch_size=64,
              shuffle=True, device=0, pt_file=False, vectors=None,
              unk_init=torch.Tensor.zero_):
        """
        :param path: directory containing train, test, dev files
        :param vectors_name: name of word vectors file
        :param vectors_dir: directory containing word vectors file
        :param batch_size: batch size
        :param device: GPU device
        :param vectors: custom vectors - either predefined torchtext vectors or your own custom Vector classes
        :param unk_init: function used to generate vector for OOV words
        :return:
        """
        train, validation, test = cls.splits(path)

        if not pt_file:
            if vectors is None:
                vectors = Vectors(name=vectors_name, cache=vectors_dir,
                                  unk_init=unk_init)
            cls.TEXT_FIELD.build_vocab(train, validation, test, vectors=vectors)
        else:
            cls.TEXT_FIELD.build_vocab(train, validation, test)
            cls.TEXT_FIELD = cls.set_vectors(
                cls.TEXT_FIELD, os.path.join(vectors_dir, vectors_name))

        cls.LABEL_FIELD.build_vocab(train, validation, test)
        cls.VOCAB_SIZE = len(cls.TEXT_FIELD.vocab)

        return BucketIterator.splits((train, validation, test),
                                     batch_size=batch_size,
                                     repeat=False,
                                     shuffle=shuffle,
                                     device=device)
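# Hypothetical call to TRECQA.iters (the data path and embedding file name are
# assumptions, not from the source): builds vocabularies over all three splits
# and returns bucketed train/dev/test iterators.
#
# train_iter, dev_iter, test_iter = TRECQA.iters(
#     path="data/TrecQA",
#     vectors_name="glove.840B.300d.txt",
#     vectors_dir="embeddings",
#     batch_size=64,
#     device=0,
# )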
def parse_input_files(batch_size, embedding_dim, using_GPU,
                      filepath="./data/new_annot/trainsplit_holdtarg",
                      train_name="train.json", dev_name="dev.json",
                      test_name="test.json", has_holdtarg=False,
                      dev_batch_size=100):
    """
    Reads the file with name filename
    """
    if using_GPU:
        torch.cuda.device(0)
        print("Running on device " + str(torch.cuda.current_device()))

    print("creating fields")
    TEXT = data.Field(sequential=True, use_vocab=True, batch_first=True,
                      tokenize=dummy_tokenizer, include_lengths=True)
    LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True)
    POLARITY = data.Field(sequential=True, use_vocab=True, batch_first=True,
                          tokenize=dummy_tokenizer)
    DOCID = data.Field(sequential=False, use_vocab=True, batch_first=True,
                       tokenize=dummy_tokenizer)
    # may not need these two?
    # HOLDER = data.Field(sequential=True, use_vocab=True, batch_first=True, tokenize=tokenizer)
    # TARGET = data.Field(sequential=True, use_vocab=True, batch_first=True, tokenize=tokenizer)
    if has_holdtarg:
        HOLDER_TARGET = data.Field(sequential=True, use_vocab=True,
                                   batch_first=True, tokenize=dummy_tokenizer)
    H_IND = data.Field(sequential=True, use_vocab=False, batch_first=True,
                       postprocessing=Pipeline(custom_post_inds),
                       include_lengths=True)
    T_IND = data.Field(sequential=True, use_vocab=False, batch_first=True,
                       postprocessing=Pipeline(custom_post_inds),
                       include_lengths=True)
    # features
    CO_OCCURRENCES = data.Field(sequential=False, use_vocab=False, batch_first=True)
    HOLDER_RANK = data.Field(sequential=False, use_vocab=False, batch_first=True)
    TARGET_RANK = data.Field(sequential=False, use_vocab=False, batch_first=True)
    SENT_CLASSIFY = data.Field(sequential=False, use_vocab=False, batch_first=True)

    data_fields = {
        'token': ('text', TEXT),
        'label': ('label', LABEL),
        # 'holder': ('holder', HOLDER),
        # 'target': ('target', TARGET),  # commented out: TARGET is not defined above
        'polarity': ('polarity', POLARITY),
        'docid': ('docid', DOCID),
        'holder_index': ('holder_index', H_IND),
        'target_index': ('target_index', T_IND),
        'co_occurrences': ('co_occurrences', CO_OCCURRENCES),
        'holder_rank': ('holder_rank', HOLDER_RANK),
        'target_rank': ('target_rank', TARGET_RANK),
        'classify': ('sent_classify', SENT_CLASSIFY)
    }
    if has_holdtarg:
        data_fields['holder_target'] = ('holder_target', HOLDER_TARGET)

    print("parsing data from file")
    train, val, test = data.TabularDataset.splits(
        path=filepath, train=train_name, validation=dev_name, test=test_name,
        format='json', fields=data_fields)

    print("loading word embeddings")
    TEXT.build_vocab(train, vectors="glove.6B." + str(embedding_dim) + "d")
    POLARITY.build_vocab(train)
    print(POLARITY.vocab.stoi)
    if has_holdtarg:
        HOLDER_TARGET.build_vocab(train)
        print(HOLDER_TARGET.vocab.stoi)
    DOCID.build_vocab(train, val, test)

    print("Train length = " + str(len(train.examples)))
    print("Dev length = " + str(len(val.examples)))
    print("Test length = " + str(len(test.examples)))
    # print(val.examples[0].text)

    validation_batch = min(len(val.examples), 100)
    if dev_batch_size is not None:
        validation_batch = dev_batch_size
    test_batch = min(len(test.examples), 100)

    print("splitting & batching data")
    train_iter, val_iter, test_iter = data.Iterator.splits(
        (train, val, test), sort_key=lambda x: len(x.text), repeat=False,
        batch_sizes=(batch_size, validation_batch, test_batch),
        sort_within_batch=True)
    print("Repeat = " + str(train_iter.repeat))

    return train_iter, val_iter, test_iter, TEXT, DOCID, POLARITY