def stacking_embedding(df):
    glove_embedding = WordEmbeddings('glove')
    flair_embedding_news_forward = FlairEmbeddings('news-forward')
    flair_embedding_news_backward = FlairEmbeddings('news-backward')
    bert_embedding = BertEmbeddings()
    elmo_embedding = ELMoEmbeddings()
    stacked_embeddings = StackedEmbeddings([
        glove_embedding,
        flair_embedding_news_forward,
        flair_embedding_news_backward,
        bert_embedding,
        elmo_embedding
    ])
    for index, row in tqdm(df.iterrows(), total=len(df), desc='Embedding'):
        sentence = Sentence(row['name'])
        stacked_embeddings.embed(sentence)
        token_series = set()
        for token in sentence:
            token_series.add(token)
def _train_model(self):
    # type: () -> None
    corpus = ClassificationCorpus(
        Path(__path_to_base__),
        test_file=os.path.basename(self.path_to_test),
        dev_file=os.path.basename(self.path_to_dev),
        train_file=os.path.basename(self.path_to_train))
    word_embeddings = [
        ELMoEmbeddings('original'),
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast')
    ]
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256)
    classifier = TextClassifier(
        document_embeddings,
        label_dictionary=corpus.make_label_dictionary(),
        multi_label=False)
    trainer = ModelTrainer(classifier, corpus)
    trainer.train(__path_to_base__, max_epochs=10)
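# After _train_model() finishes, Flair writes 'final-model.pt' into the training
# output directory. A minimal usage sketch (assuming the __path_to_base__ used
# above) of loading the trained classifier back for prediction:
import os

from flair.data import Sentence
from flair.models import TextClassifier

classifier = TextClassifier.load(os.path.join(__path_to_base__, 'final-model.pt'))

sentence = Sentence('This is a sample document to classify.')
classifier.predict(sentence)
print(sentence.labels)  # predicted label(s) with confidence scores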
def hyper_opt(corpus):
    print("hyper_opt is started")
    # define your search space
    search_space = SearchSpace()
    search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
        StackedEmbeddings([
            WordEmbeddings('en'),
            WordEmbeddings('glove'),
            CharacterEmbeddings(),
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward'),
            ELMoEmbeddings()
        ])
    ])
    search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[256])
    # search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
    # search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
    search_space.add(Parameter.LEARNING_RATE, hp.choice, options=[0.01, 0.1])
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[32, 64])

    # create the parameter selector
    param_selector = SequenceTaggerParamSelector(
        corpus,
        'ner',
        # '/content/gdrive/My Drive/resume_ner_data/hyperparam_selection',
        model_path,
        max_epochs=50,
        training_runs=2,
        optimization_value=OptimizationValue.DEV_SCORE)

    # start the optimization
    param_selector.optimize(search_space, max_evals=100)
import re

# Create Flask application
app = Flask(__name__)
# Load blueprint for Flask
bp = Blueprint('note', __name__)

STOP_WORDS = set(ENGLISH_STOP_WORDS)
STOP_WORDS.remove('no')
STOP_WORDS.remove('not')

# Load lemmatizer for BLEU-score and the Stanford NLP pipeline
lemmatizer = WordNetLemmatizer()
nlp = stanfordnlp.Pipeline(processors="tokenize,mwt,pos,depparse")
embedding = ELMoEmbeddings('pubmed')

# Define upload folder location
# UPLOAD_FOLDER = '/home/TheLumino/UCSF_NLP_UI/Uploads'
UPLOAD_FOLDER = 'Uploads'
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER

# Load word2vec and random forest model
# currentmodel = Word2Vec.load('/home/TheLumino/UCSF_NLP_UI/flaskr/cosine_model/cosine_similarity_metric')
# rf_model = pickle.load(open('/home/TheLumino/UCSF_NLP_UI/flaskr/cosine_model/random_forest_confidence_score.sav', 'rb'))
currentmodel = Word2Vec.load('flaskr/cosine_model/bestmodel')
rf_model = pickle.load(open('flaskr/cosine_model/random_forest_confidence_score.sav', 'rb'))

# Set UMLS database links for quick access from the UI
rx_url = 'https://mor.nlm.nih.gov/RxNav/search?searchBy=RXCUI&searchTerm='
# ####
# Load CamemBERT with hidden states exposed
cbc = CamembertConfig.from_pretrained("camembert-base", output_hidden_states=True)
# from_pretrained is a classmethod: assign its return value, otherwise the pretrained weights are discarded
bert = CamembertModel.from_pretrained("camembert-base", config=cbc)
bert = bert.eval()
bert = bert.to(device)
bert_tok = CamembertTokenizer.from_pretrained("camembert-base")

app = Flask(__name__)

# embedder used by the /vectorize endpoint
SIZE_EMBED = -1
adaptive_pool = nn.AdaptiveAvgPool1d(SIZE_EMBED) if SIZE_EMBED > 0 else None
embedder = ELMoEmbeddings("small")
# bert_model_or_path="distilbert-base-uncased",
# pooling_operation="mean", use_scalar_mix=True)


@app.route('/vectorize', methods=['POST'])
def vectorize():
    tokens = request.json["tokens"]
    embeddings = []
    # print("Call")
    for m in tokens:
        mot = Sentence(m)
        embedder.embed(mot)
        embed = mot[0].embedding
sentence = Sentence('The grass is green .')
flair_embedding_forward.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# Load and run a BERT embedding
embedding = BertEmbeddings()
sentence = Sentence('The grass is green .')
embedding.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# Load and run an ELMo embedding
embedding = ELMoEmbeddings()
sentence = Sentence('The grass is green .')
embedding.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# Load and run stacked (mixed) embeddings
stacked_embeddings = StackedEmbeddings([
    WordEmbeddings('model/glove.gensim'),
    FlairEmbeddings('model/news-forward-0.4.1.pt')
])
sentence = Sentence('The grass is green .')
stacked_embeddings.embed(sentence)
for token in sentence:
    print(token)
    print(token.embedding)

# CharacterEmbeddings and BytePairEmbeddings: without unrestricted internet access,
# the download at run time will raise an error
from flair.models import SequenceTagger

# Set up the Corpus
columns = {0: 'text', 1: 'ner'}
data_folder = './data/IOBES'
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file="train.txt",
                              dev_file="dev.txt",
                              test_file="test.txt")
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# define search_space
search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
    StackedEmbeddings([
        ELMoEmbeddings('original')
    ]),
    StackedEmbeddings([
        ELMoEmbeddings('original'),
        CharacterEmbeddings()
    ])
])
search_space.add(Parameter.HIDDEN_SIZE, hp.randint, upper=400)
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0.01, high=0.25)
search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16, 32])
search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])

# initialise the parameter selector
param_selector = SequenceTaggerParamSelector(
    corpus,
test_data = all_annos[4500:6200]
dev_data = all_annos[6200:]

search_space = SearchSpace()

# Create our embedding stacks
# Flair recommends adding GloVe to their character-level embeddings
flair_normal = StackedEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('mix-forward'),
    FlairEmbeddings('mix-backward')
])
bert = BertEmbeddings()
elmo = ELMoEmbeddings('original')
flair_pooled = StackedEmbeddings([
    WordEmbeddings('glove'),
    PooledFlairEmbeddings('mix-forward'),
    PooledFlairEmbeddings('mix-backward')
])
search_space.add(Parameter.EMBEDDINGS, hp.choice,
                 options=[bert, elmo, flair_normal, flair_pooled])

# Other hyperparameters are kept fixed for this exercise.
# Add to the lists to widen the grid.
# Unfortunately, for small grids Flair picks random search instead of a true
# grid search.
def init_embeddings(corpus_name, embedding_type):
    """
    Initializes embeddings for a given corpus.

    Parameters:
        corpus_name (str): name of the corpus used to load the proper embeddings
        embedding_type (str): type of embeddings (e.g. flair, elmo, bert, word+char)

    Returns:
        tuple(StackedEmbeddings, bool): loaded embeddings and whether they are kept in memory
    """
    from typing import List
    from flair.embeddings import TokenEmbeddings, WordEmbeddings, StackedEmbeddings
    from flair.embeddings import FlairEmbeddings
    from flair.embeddings import BertEmbeddings, ELMoEmbeddings
    from flair.embeddings import CharacterEmbeddings

    embedding_types: List[TokenEmbeddings] = []

    if corpus_name in ['conll03_en']:
        if embedding_type == 'flair':
            embedding_types.append(WordEmbeddings('glove'))
            embedding_types.append(FlairEmbeddings('news-forward'))
            embedding_types.append(FlairEmbeddings('news-backward'))
            embeddings_in_memory = True
        elif embedding_type == 'bert':
            embedding_types.append(
                BertEmbeddings(bert_model_or_path='bert-base-cased'))
            # embedding_types.append(BertEmbeddings(bert_model_or_path='bert-large-cased'))
            embeddings_in_memory = True
        elif embedding_type == 'elmo':
            embedding_types.append(ELMoEmbeddings())
            embeddings_in_memory = True
        elif embedding_type == 'word+char':
            # similar to Lample et al. (2016)
            embedding_types.append(WordEmbeddings('glove'))
            embedding_types.append(CharacterEmbeddings())
            embeddings_in_memory = False  # because it contains a char model (problem with deepcopy)
        else:
            log.error(f"no settings for '{embedding_type}'!")
            exit(EXIT_FAILURE)
    elif corpus_name in ["conll03_de", "germeval"]:
        if embedding_type == 'flair':
            embedding_types.append(WordEmbeddings('de'))
            embedding_types.append(FlairEmbeddings('german-forward'))
            embedding_types.append(FlairEmbeddings('german-backward'))
            embeddings_in_memory = True
        elif embedding_type == 'word+char':
            # similar to Lample et al. (2016)
            embedding_types.append(WordEmbeddings('de'))
            embedding_types.append(CharacterEmbeddings())
            embeddings_in_memory = False  # because it contains a char model (problem with deepcopy)
        else:
            log.error(f"no settings for '{embedding_type}'!")
            exit(EXIT_FAILURE)
    else:
        log.error(f"unknown corpus or embeddings '{corpus_name}'!")
        exit(EXIT_FAILURE)

    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    log.info("'{}' function finished!".format(sys._getframe().f_code.co_name))
    return embeddings, embeddings_in_memory
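# A minimal usage sketch of init_embeddings(): the returned stack plugs straight
# into a SequenceTagger. The corpus, tag_dictionary and tag_type are assumed to
# come from the surrounding training script (as in the other snippets here).
from flair.models import SequenceTagger

embeddings, in_memory = init_embeddings('conll03_en', 'elmo')

tagger = SequenceTagger(hidden_size=256,
                        embeddings=embeddings,
                        tag_dictionary=tag_dictionary,
                        tag_type=tag_type,
                        use_crf=True)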
cuda_device = 0 if str(device) != 'cpu' else -1
model = allennlp.commands.elmo.ElmoEmbedder(
    options_file='path_to_pretrain_elmo_options.json',
    weight_file='path_to_pretrain_elmo_weights.hdf5',
    cuda_device=cuda_device)

###
embedding_types: List[TokenEmbeddings] = [
    FlairEmbeddings('multi-forward'),
    FlairEmbeddings('multi-backward'),
    FlairEmbeddings('path_to_pretrain_flair_forward.pt'),
    FlairEmbeddings('path_to_pretrain_flair_backward.pt'),
    WordEmbeddings('glove'),
    ELMoEmbeddings('medium'),
    ELMoEmbeddings('pubmed'),
    OwnELMoEmbeddings(model),
    UMLSEmbedding(),
    CCREmbedding()
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        tag_type=tag_type,
                                        use_crf=True)

trainer: ModelTrainer = ModelTrainer(tagger, corpus)
data['label'] = '__label__' + data['label'].astype(str)

# train-test split
data.iloc[0:int(len(data) * 0.8)].to_csv(PATH / 'flair/train.csv',
                                         sep='\t', index=False, header=False)
data.iloc[int(len(data) * 0.8):int(len(data) * 1)].to_csv(PATH / 'flair/test.csv',
                                                          sep='\t', index=False, header=False)

corpus = ClassificationCorpus(Path('/content/drive/My Drive/emnlp/flair/'),
                              test_file='test.csv',
                              dev_file='test.csv',
                              train_file='train.csv')
print(corpus.obtain_statistics())

# use any pretrained embedding from the Flair framework
# embedding = RoBERTaEmbeddings()
# embedding = BertEmbeddings('bert-base-uncased')
embedding = ELMoEmbeddings('small')

# stack it with other embeddings
word_embeddings = [
    embedding,
    # FlairEmbeddings('news-forward', use_cache=True),
    # FlairEmbeddings('news-backward', use_cache=True),
]

# apply a document LSTM to the stacked embeddings
document_embeddings = DocumentRNNEmbeddings(
    word_embeddings,
    # hidden_size=512,
    # reproject_words=True,
    # reproject_words_dimension=256,
)
use_flair = False
use_elmo = False
use_bert = False

mini_batch_size = 32
word_embeddings = []

if use_glove:
    word_embeddings.append(WordEmbeddings('glove'))
    word_embeddings.append(CharacterEmbeddings())
if use_cui2vec:
    word_embeddings.append(WordEmbeddings('./cui2vec_embed_vectors.bin'))
if use_flair:
    word_embeddings.append(FlairEmbeddings('./forward-lm.pt'))
    word_embeddings.append(FlairEmbeddings('./backward-lm.pt'))
if use_elmo:
    word_embeddings.append(ELMoEmbeddings('pubmed'))
if use_bert:
    word_embeddings.append(BertEmbeddings('./bert-base-clinical-cased'))
    mini_batch_size = 8

stacked_word_embeddings = StackedEmbeddings(word_embeddings)

from flair.embeddings import DocumentRNNEmbeddings

document_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                            rnn_type='LSTM',
                                            bidirectional=True,
                                            hidden_size=512,
                                            reproject_words=True,
                                            reproject_words_dimension=256)
parser.add_argument(
    '--model_name',
    default='large',
    action='store',
)
parser.add_argument(
    '--lm_emb_save_path',
    default='./wv/elmo.emb.pkl',
    action='store',
)
args = parser.parse_args()

embedding = ELMoEmbeddings(args.model_name)

flag = args.dataset
dataset = []
with open(f'./datasets/unified/train.{flag}.json') as f:
    dataset += json.load(f)
with open(f'./datasets/unified/valid.{flag}.json') as f:
    dataset += json.load(f)
with open(f'./datasets/unified/test.{flag}.json') as f:
    dataset += json.load(f)

bert_emb_dict = {}
for item in tqdm(dataset):
    tokens = tuple(item['tokens'])
    s = form_sentence(tokens)
    embedding.embed(s)
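# The snippet above stops right after embedding.embed(s). A plausible continuation
# (an assumption, not the original code) would cache the per-token ELMo vectors under
# each token tuple and pickle the dict to args.lm_emb_save_path:
import pickle

import torch

for item in tqdm(dataset):
    tokens = tuple(item['tokens'])
    s = form_sentence(tokens)
    embedding.embed(s)
    # one row per token, keyed by the token tuple
    bert_emb_dict[tokens] = torch.stack(
        [token.embedding.cpu() for token in s]).numpy()

with open(args.lm_emb_save_path, 'wb') as f:
    pickle.dump(bert_emb_dict, f)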
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file="train.txt",
                              dev_file="test.txt",
                              test_file="test.txt")
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# define search_space
search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
    StackedEmbeddings([ELMoEmbeddings('original')]),
    StackedEmbeddings(
        [ELMoEmbeddings('original'), CharacterEmbeddings()])
])
search_space.add(Parameter.HIDDEN_SIZE, hp.randint, upper=400)
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0.01, high=0.25)
search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])
search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16, 32])
search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])

# initialise the parameter selector
param_selector = SequenceTaggerParamSelector(
    test_file='test_.tsv',
    dev_file='dev.tsv',
    train_file='train.tsv')

# way to select a language model
model_selector = {
    "Glove": [WordEmbeddings('glove')],
    "FastText": [WordEmbeddings('en-news')],
    "BPE": [BytePairEmbeddings('en')],
    "FlairFast": [
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast')
    ],
    "FlairNews": [FlairEmbeddings('news-forward'),
                  FlairEmbeddings('news-backward')],
    "ElmoOriginal": [ELMoEmbeddings('original')],
    'Bert': [BertEmbeddings('large-uncased')],
    'BertLS': [
        BertEmbeddings(bert_model_or_path='bert-large-uncased',
                       layers="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,"
                              "15,16,17,18,19,20,21,22,23,24",
                       use_scalar_mix=True)
    ],
    "RoBERTa": [RoBERTaEmbeddings('roberta-base')],
    "RoBERTaL": [RoBERTaEmbeddings('roberta-large')],
    "RoBERTaLS": [
        RoBERTaEmbeddings(pretrained_model_name_or_path="roberta-large",
                          layers="0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,"
                                 "15,16,17,18,19,20,21,22,23,24",
                          use_scalar_mix=True)
    ],
)

# 4. initialize embeddings
embedding_types: List[TokenEmbeddings] = [
    # WordEmbeddings('glove'),

    # comment in this line to use character embeddings
    CharacterEmbeddings(
        path_to_char_dict=
        "/media/bubbles/fecf5b15-5a64-477b-8192-f8508a986ffe/ai/abs/flair-custom/custom_dict.pkl"
    ),

    # comment in these lines to use flair embeddings
    # FlairEmbeddings('news-forward'),
    # CharLMEmbeddings('news-forward', use_cache=True),
    ELMoEmbeddings('elmo-small'),
    # BertEmbeddings(),
    # FlairEmbeddings('news-backward-fast'),
]

embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. initialize sequence tagger
from flair.models import SequenceTagger

tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                        embeddings=embeddings,
                                        tag_dictionary=tag_dictionary,
                                        rnn_layers=2,
                                        tag_type=tag_type,
                                        use_crf=True)
def get_elmo(model_name):
    return ELMoEmbeddings(model_name)
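# Example use of get_elmo(); model names such as 'small', 'original' or 'pubmed'
# appear elsewhere in these snippets.
from flair.data import Sentence

elmo = get_elmo('small')

sentence = Sentence('The grass is green .')
elmo.embed(sentence)
print(sentence[0].embedding.shape)  # one contextual vector per token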
def trainer(file_path: Path, filenames: Tuple[str, str, str], checkpoint: str,
            stack: str, n_epochs: int) -> None:
    """Train sentiment model using the Flair NLP library:
    https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md

    To provide added context, we can stack GloVe, BERT or ELMo embeddings
    along with Flair embeddings.
    """
    # pip install flair allennlp
    from flair.datasets import ClassificationCorpus
    from flair.embeddings import FlairEmbeddings, DocumentRNNEmbeddings, DocumentPoolEmbeddings
    from flair.models import TextClassifier
    from flair.trainers import ModelTrainer
    from flair.training_utils import EvaluationMetric
    from flair.visual.training_curves import Plotter

    if stack == "glove":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('glove')
    elif stack == "fasttext":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('it')
    elif stack == "elmo":
        from flair.embeddings import ELMoEmbeddings
        stacked_embedding = ELMoEmbeddings('original')
    elif stack == "bert":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-uncased')
    elif stack == "bert-multi":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-multilingual-uncased')
    elif stack == 'bpe':
        from flair.embeddings import BytePairEmbeddings
        stacked_embedding = BytePairEmbeddings('it')
    else:
        stacked_embedding = None

    # Define and load corpus from the provided dataset
    train, dev, test = filenames
    corpus = ClassificationCorpus(
        file_path,
        train_file=train,
        dev_file=dev,
        test_file=test,
    )

    # Create label dictionary from the labels provided in the data
    label_dict = corpus.make_label_dictionary()

    # Stack Flair string embeddings with the optional embedding
    word_embeddings = list(
        filter(None, [
            stacked_embedding,
            FlairEmbeddings('it-forward'),
            FlairEmbeddings('it-backward'),
        ]))

    # Initialize document embedding by passing a list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=256,
        reproject_words=True,
        dropout=0.5,
        reproject_words_dimension=256,
    )
    # document_embeddings = DocumentPoolEmbeddings([
    #     stacked_embedding,
    #     FlairEmbeddings('it-forward'),
    #     FlairEmbeddings('it-backward')], pooling='mean')

    # Define the classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=True)

    if not checkpoint:
        trainer = ModelTrainer(classifier, corpus)
    else:
        # If a checkpoint file is defined, resume training
        # checkpoint = classifier.load_checkpoint(Path(checkpoint))
        trainer = ModelTrainer.load_checkpoint(checkpoint, corpus)

    # Begin training (enable checkpointing to continue training at a later time, if desired)
    trainer.train(
        file_path,
        max_epochs=n_epochs,
        checkpoint=True,
    )

    # Plot curves and store weights and losses
    # file_path is a Path, so join with / rather than string concatenation
    plotter = Plotter()
    plotter.plot_training_curves(file_path / 'loss.tsv')
    plotter.plot_weights(file_path / 'weights.txt')
    dev_file=None)
# corpus.downsample(0.1)

# 2. what tag do we want to predict?
tag_type = 'ner'

# 3. make the tag dictionary from the corpus
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
print(tag_dictionary)
print(corpus.train[0].to_tagged_string('ner'))

# 4. initialize embeddings
embedding_types: List[TokenEmbeddings] = []
if embedding == "ep":
    embedding_types = [WordEmbeddings('glove'), ELMoEmbeddings('pubmed')]
elif embedding == "fp":
    embedding_types = [
        WordEmbeddings('glove'),
        FlairEmbeddings('./forward-lm.pt'),
        FlairEmbeddings('./backward-lm.pt')
    ]
elif embedding == "fpd":
    embedding_types = [
        WordEmbeddings('glove'),
        FlairEmbeddings('pubmed-forward'),
        FlairEmbeddings('pubmed-backward')
    ]
elif embedding == "cb":
    embedding_types = [
        WordEmbeddings('glove'),
with timer('turian'):
    embedding = WordEmbeddings('turian')
    result = w2v_flair(train["Description"], embedding, name="turian")
    result.to_feather("../feature/turian_flair.feather")

with timer('twitter'):
    embedding = WordEmbeddings('twitter')
    result = w2v_flair(train["Description"], embedding, name="twitter")
    result.to_feather("../feature/twitter_flair.feather")

with timer('news'):
    embedding = FlairEmbeddings('news-forward')
    result = w2v_flair(train["Description"], embedding, name="news_flair")
    result.to_feather("../feature/news_flair.feather")

with timer('char'):
    embedding = CharacterEmbeddings()
    result = w2v_flair(train["Description"], embedding, name="char")
    result.to_feather("../feature/char_flair.feather")

with timer('byte_pair'):
    embedding = BytePairEmbeddings('en')
    result = w2v_flair(train["Description"], embedding, name="byte_pair")
    result.to_feather("../feature/byte_pair_flair.feather")

with timer('elmo'):
    embedding = ELMoEmbeddings('medium')
    result = w2v_flair(train["Description"], embedding, name="elmo")
    result.to_feather("../feature/elmo_flair.feather")
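# w2v_flair and timer are defined elsewhere in the original script. A minimal sketch
# of what a w2v_flair-style helper could look like (mean-pooling token embeddings per
# text into a prefixed DataFrame); the name and every detail below are assumptions.
import numpy as np
import pandas as pd
from flair.data import Sentence


def w2v_flair_sketch(texts, embedding, name):
    """Embed each text with a Flair token embedding and mean-pool into one vector per row."""
    vectors = []
    for text in texts.fillna("").astype(str):
        sentence = Sentence(text)
        embedding.embed(sentence)
        if len(sentence) == 0:
            # empty text: fall back to a zero vector of the embedding size
            vectors.append(np.zeros(embedding.embedding_length))
            continue
        vectors.append(
            np.mean([token.embedding.cpu().numpy() for token in sentence], axis=0))
    columns = [f"{name}_{i}" for i in range(len(vectors[0]))]
    return pd.DataFrame(vectors, columns=columns)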
premise_text = remove_punctuations(premise_text)
premise_sentence = Sentence(premise_text)
document_embedding.embed(premise_sentence)
embedded_premise = premise_sentence.get_embedding().detach().numpy().tolist()
embedded_premises[premise[2]] = embedded_premise
argument_uid = premise[0]
embedded_arguments[argument_uid] = [
    embedded_conclusion, embedded_premises
]

save_embedding(embedded_arguments, file_name)


if __name__ == '__main__':
    elmo_embedding = ELMoEmbeddings()
    compute_embedding(elmo_embedding,
                      remove_punctuation=True,
                      file_name="elmo_embeddings_without_punctuation.json")
    compute_embedding(elmo_embedding,
                      remove_punctuation=False,
                      file_name="elmo_embeddings_with_punctuation.json")

    bert_embedding = BertEmbeddings()
    compute_embedding(bert_embedding,
                      remove_punctuation=True,
                      file_name="bert_embeddings_without_punctuation.json")
    compute_embedding(bert_embedding,
                      remove_punctuation=False,
                      file_name="bert_embeddings_with_punctuation.json")
def call_test(skip_list=[], test_name="", langs=[], template="", params={},
              ANALOGIES_FILE="", ANALOGIES_DIR="", EMBEDDINGS_DIR=""):
    logger.debug('call_test : {0}'.format(test_name))
    if template == 'sick':
        for lang in langs:
            if test_already_done(test_name, lang):
                continue
            params["flair_model"] = ELMoEmbeddings('original')
            model = Embedding(**params)
            measure = get_measure(model, 'en', test_name, sick=True)
            yield measure
    elif template == 'bert':
        for lang in langs:
            if test_already_done(test_name, lang):
                continue
            params["bert_model"] = BertClient()
            model = Embedding(**params)
            measure = get_measure(model, lang, test_name)
            yield measure
    elif template == 'flair':
        for lang in langs:
            if test_already_done(test_name, lang):
                continue
            params["flair_model"] = ELMoEmbeddings('pt')
            model = Embedding(**params)
            measure = get_measure(model, lang, test_name)
            yield measure
    elif template == 'flair-custom-1' or template == 'flair-custom-2':
        for lang in langs:
            if test_already_done(test_name, lang):
                continue
            if template == 'flair-custom-1':
                params["flair_model"] = ELMoEmbeddings(
                    options_file="../embeddings/elmo/options.json",
                    weight_file="../embeddings/elmo/elmo_pt_weights.hdf5")
            elif template == 'flair-custom-2':
                params["flair_model"] = ELMoEmbeddings(
                    options_file="../embeddings/elmo/options_dgx1.json",
                    weight_file="../embeddings/elmo/elmo_pt_weights_dgx1.hdf5")
            model = Embedding(**params)
            measure = get_measure(model, lang, test_name)
            yield measure
    elif template in ('gensim', 'flair-gensim', 'custom-flair-gensim-1',
                      'custom-flair-gensim-2', 'flair-gensim-local'):
        assert EMBEDDINGS_DIR is not None
        logger.debug("Template: " + template)
        logger.debug("EMBEDDINGS_DIR: " + EMBEDDINGS_DIR)
        logger.debug("skip-list: " + str(skip_list))
        for fname in get_NILC(EMBEDDINGS_DIR):
            logger.debug("Embedding: " + fname)
            for item in skip_list:
                if item in fname:
                    logger.debug("SKIP - skip_list - {0}".format(fname))
                    break
            else:
                logger.debug("RUN - {0}".format(fname))
                t = test_name + '_' + fname
                for lang in langs:
                    if test_already_done(t, lang):
                        continue
                    emb = KeyedVectors.load(fname)
                    params["gensim_model"] = emb
                    if template == 'flair-gensim':
                        params["flair_model"] = ELMoEmbeddings('pt')
                    elif template == 'flair-gensim-local':
                        params["flair_model"] = ELMoEmbeddings(
                            options_file="../embeddings/allen_elmo/elmo_pt_options.json",
                            weight_file="../embeddings/allen_elmo/elmo_pt_weights.hdf5")
                    elif template == 'custom-flair-gensim-1':
                        params["flair_model"] = ELMoEmbeddings(
                            options_file="../embeddings/elmo/options.json",
                            weight_file="../embeddings/elmo/elmo_pt_weights.hdf5")
                    elif template == 'custom-flair-gensim-2':
                        params["flair_model"] = ELMoEmbeddings(
                            options_file="../embeddings/elmo/options_dgx1.json",
                            weight_file="../embeddings/elmo/elmo_pt_weights_dgx1.hdf5")
                    model = Embedding(**params)
                    measure = get_measure(model, lang, t)
                    yield measure
    elif template == "analogies":
        # create the stats file if it does not exist yet
        try:
            open(ANALOGIES_FILE, 'r').close()
        except:
            with open(ANALOGIES_FILE, 'w+') as f:
                json.dump({}, f)
        for path, path2, name, name2, dst, dst2 in get_analogies(
                ANALOGIES_DIR, EMBEDDINGS_DIR):
            key = name.rstrip('.txt') + '_' + path2.split(
                '/')[-1] + '_' + name2.rstrip('.model')
            with open(ANALOGIES_FILE, 'r') as f:
                stats = json.load(f)  # load the stats dict from the open file handle
            if key in stats:
                continue
            embedding = KeyedVectors.load(dst2)
            score = embedding.evaluate_word_analogies(dst)[0]
            stats[key] = score
            with open(ANALOGIES_FILE, 'w+') as f:
                json.dump(stats, f)
            message = {"key": key, "stats": stats[key]}
            yield message
def main():
    print("Instantiate embeddings")
    embeddings = DocumentPoolEmbeddings([
        FlairEmbeddings("pubmed-forward"),
        FlairEmbeddings("pubmed-backward"),
        ELMoEmbeddings("pubmed"),
        BertEmbeddings("bert-large-uncased"),
    ])

    print("Load pubmed_data")
    pubmed_data = pd.concat([
        pd.read_json(f"data/medline/{medline_file}")
        for medline_file in ["medline_2016.json", "medline_2017.json", "medline_2018.json"]
    ])

    print("pubmed_corpus")
    pubmed_corpus = [
        try_sentence(text)
        for text in pubmed_data.title.apply(preproc).head(10_000)
    ]
    pubmed_corpus = [text for text in pubmed_corpus if text]
    print(pubmed_corpus[0:5])

    print("query")
    query = [
        Sentence(text) for text in [
            "Searching for the causal effects of body mass index in over 300 000 participants in UK Biobank, using Mendelian randomization.",
            "Prioritizing putative influential genes in cardiovascular disease susceptibility by applying tissue-specific Mendelian randomization.",
            "Longitudinal analysis strategies for modelling epigenetic trajectories",
            "FATHMM-XF: accurate prediction of pathogenic point mutations via extended features",
            "PhenoSpD: an integrated toolkit for phenotypic correlation estimation and multiple testing correction using GWAS summary statistics.",
            "LD Hub: a centralized database and web interface to perform LD score regression that maximizes the potential of summary level GWAS data for SNP heritability and genetic correlation analysis.",
            "MELODI: Mining Enriched Literature Objects to Derive Intermediates",
            "The MR-Base platform supports systematic causal inference across the human phenome",
        ]
    ]

    print("Embed")
    for text in query + pubmed_corpus:
        embeddings.embed(text)

    print("Calculate scores")
    cos = torch.nn.CosineSimilarity(dim=0, eps=1e-5)
    cos_scores = []
    for query_id, query_text in enumerate(query):
        cos_res = [
            {
                "query_id": query_id,
                "target_id": target_id,
                "score": cos(query_text.embedding, target_text.embedding).item(),
            }
            for target_id, target_text in enumerate(pubmed_corpus)
        ]
        cos_scores.append(cos_res)
    cos_scores = pd.concat(pd.DataFrame(x) for x in cos_scores)
    print(cos_scores)

    n = 5
    for query_id, query_text in enumerate(query):
        print(f"# Query {query_id}")
        print(query_text)
        top_n = (
            cos_scores
            .query(f"query_id == {query_id}")
            .sort_values("score", ascending=False)
            .head(n)
        )
        for target_id, target_score in zip(top_n.target_id, top_n.score):
            print(f"  ## Candidate {target_id}, score {target_score}")
            print(f"  {pubmed_corpus[target_id]}\n")
        print("\n\n")
qdata['sentential_nodes_to_idx'] = sentential_nodes_to_idx
qdata['doc_neighbors'] = doc_neighbors
qdata['match_neighbors'] = match_neighbors
qdata['question_idx'] = set([
    v for k, v in qdata['sentential_nodes_to_idx'].items()
    if k[0] == 'question'
])
for ch in qdata['question']['choices']:
    qdata['choice_%s_idx' % ch['label']] = set([
        v for k, v in qdata['sentential_nodes_to_idx'].items()
        if k[0] == 'choice:%s' % ch['label']
    ])
del qdata['edges']

tokenizer = lambda x: nlp.word_tokenize(x)
embedder = ELMoEmbeddings(embedding_model)
neighbor_indices = list(
    set(n[0] for qdata in data for n in qdata['sentential_nodes_to_idx']
        if type(n[0]) == int))
outputs = emb.get_corpus_embeddings(corpus, neighbor_indices, embedder, tokenizer)
reverse_neighbor_idx_mapping = {
    v: i for i, v in enumerate(neighbor_indices)
}
for qdata in tqdm(data):
    qdata['node_embedding_matrix'] = emb.get_question_embedding_matrix(
        qdata, outputs, reverse_neighbor_idx_mapping, embedding_dim)
    store_final_data(output_file_name, qdata)
from preprocessing.load_data import download_and_load_sts_data, download_and_load_sick_dataset
from preprocessing.normalize import normalize
from utility.frequency_loader import load_frequencies, load_doc_frequencies
from utility.run_experiment import run_experiment
import os

if not os.path.exists(IMAGE_PATH):
    os.makedirs(IMAGE_PATH)

sick_all, sick_train, sick_test, sick_dev = download_and_load_sick_dataset()
print('Downloaded data')

frequency = load_frequencies("data/frequencies/frequencies.tsv")
doc_frequency = load_doc_frequencies("data/frequencies/doc_frequencies.tsv")
word2vec = load_word2vec(w2v_path)

elmo = ELMoEmbeddings('large')
bert = TransformerWordEmbeddings('bert-large-cased')
flair = StackedEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward')
])
elmo_bert = StackedEmbeddings([elmo, bert])
print("Loaded resources")

benchmarks = [("AVG-W2V",
               ft.partial(run_avg_benchmark, model=word2vec, use_stoplist=False)),
              ("AVG-ELMO",
args = parser.parse_args()
test_df = pd.read_csv(args.inputfile)

# Throw away unwanted rows
test_df['AveragePosition'] = test_df['Average.Position']
test_df.drop('Average.Position', axis=1, inplace=True)
test_df.dropna(inplace=True)
test_df.CTR = test_df.CTR.apply(lambda x: x[:-1]).astype('float64')
test_df = test_df[(test_df.CPC != 0.0) | (test_df.AveragePosition != 0.0) |
                  (test_df.Impressions != 0.0)]

# Compute embeddings
print('Loading ELMo model...', end='')
elmo_small = ELMoEmbeddings('small')
print('Done!')
document_embedding = DocumentPoolEmbeddings([elmo_small])


def compute_elmo_embedding(keyword):
    sentence = Sentence(keyword)
    document_embedding.embed(sentence)
    return sentence.get_embedding().detach().cpu().numpy()


vectors = []
print('\nNow computing embeddings for keywords...', end='')
for keyword in tqdm(test_df.Keyword.values, total=test_df.shape[0]):
    vectors.append(compute_elmo_embedding(keyword))
vectors = pd.DataFrame.from_records(np.array(vectors), index=test_df.index)
def __init__(self, args, agent_mode):
    # initializes environment variables and then reads sentences.
    print('Initializing the Environment...')
    self.domain = args.domain
    self.dis_dim = args.dis_dim            # 50
    self.tag_dim = args.tag_dim            # 50
    self.word_dim = args.word_dim          # 50
    self.num_words = args.num_words        # 500
    self.action_rate = args.action_rate    # 0.1
    self.use_act_rate = args.use_act_rate  # 1
    # self.use_act_att = args.use_act_att  # 0
    self.reward_base = args.reward_base    # 50.0
    self.ra = args.reward_assign           # [1, 2, 3]
    self.word2vec = args.word2vec
    self.terminal_flag = False
    self.train_epoch_end_flag = False
    self.valid_epoch_end_flag = False
    self.max_data_char_len = 0
    self.max_data_sent_len = 0
    self.agent_mode = agent_mode           # args.agent_mode
    self.context_len = args.context_len    # 100

    if not args.gui_mode2:
        self.stacked_embeddings = args.stacked_embeddings
    elif args.gui_mode2:
        # in GUI mode, set different embeddings for the different networks
        if agent_mode == 'act':
            self.word_dim = self.tag_dim = self.dis_dim = 3172
            self.stacked_embeddings = StackedEmbeddings([
                WordEmbeddings('glove'),
                BertEmbeddings('bert-base-uncased')
            ])
        elif agent_mode == 'arg':
            self.word_dim = self.tag_dim = self.dis_dim = 868
            self.stacked_embeddings = StackedEmbeddings(
                [WordEmbeddings('glove'), ELMoEmbeddings('small')])

    # read the sentences
    if not args.gui_mode:
        if self.agent_mode == 'arg':
            indata = load_pkl('data/refined_%s_data.pkl' % self.domain)[-1]
            arg_sents = []
            for i in tqdm(range(len(indata))):
                for j in range(len(indata[i])):
                    if len(indata[i][j]) == 0:
                        continue
                    # a -1 obj_ind refers to UNK
                    # words = indata[i][j]['last_sent'] + indata[i][j]['this_sent'] + ['UNK']
                    # we don't need an unknown token here.
                    words = indata[i][j]['last_sent'] + indata[i][j]['this_sent']
                    current_sent = indata[i][j]['this_sent']
                    sent_len = len(words)  # sent_len covers last_sent + this_sent
                    act_inds = [
                        a['act_idx'] for a in indata[i][j]['acts']
                        if a['act_idx'] < self.num_words
                    ]  # list of action indexes less than self.num_words
                    for k in range(len(indata[i][j]['acts'])):
                        act_ind = indata[i][j]['acts'][k]['act_idx']    # action index
                        obj_inds = indata[i][j]['acts'][k]['obj_idxs']  # object index list
                        arg_sent = {}

                        # set arg tags
                        arg_tags = np.ones(sent_len, dtype=np.int32)  # tags
                        if len(obj_inds[1]) == 0:
                            arg_tags[obj_inds[0]] = 2  # essential objects
                        else:
                            arg_tags[obj_inds[0]] = 4  # exclusive objects
                            arg_tags[obj_inds[1]] = 4  # exclusive objects

                        # set distance
                        position = np.zeros(sent_len, dtype=np.int32)
                        position.fill(act_ind)
                        distance = np.abs(np.arange(sent_len) - position)

                        arg_sent['tokens'] = words
                        arg_sent['tags'] = arg_tags
                        arg_sent['act_ind'] = act_ind
                        arg_sent['distance'] = distance
                        arg_sent['act_inds'] = act_inds
                        arg_sent['obj_inds'] = obj_inds
                        # ipdb.set_trace()

                        sent_vec = []
                        if args.stacked_embeddings == 'word2vec':
                            for w in arg_sent['tokens']:
                                if len(w) > self.max_data_char_len:
                                    self.max_data_char_len = len(w)
                                if w in self.word2vec.vocab:
                                    sent_vec.append(self.word2vec[w])
                                else:
                                    sent_vec.append(np.zeros(self.word_dim))
                        else:
                            # stacked embeddings
                            line = ' '.join(words)
                            sent = Sentence(line)
                            args.stacked_embeddings.embed(sent)
                            for token in sent:
                                sent_vec.append(token.embedding.numpy())
                            for w in arg_sent['tokens']:
                                if len(w) > self.max_data_char_len:
                                    self.max_data_char_len = len(w)

                        sent_vec = np.array(sent_vec)
                        pad_len = self.num_words - len(sent_vec)
                        if len(sent_vec) > self.max_data_sent_len:
                            self.max_data_sent_len = len(sent_vec)
                        distance = np.zeros([self.num_words, self.dis_dim])
                        act_vec = sent_vec[arg_sent['act_ind']]  # word vector of the input action
                        # TODO: Attention is not required for contextual word embeddings, so it is
                        # commented out to save time. Try it out if time permits.
                        # attention = np.sum(sent_vec * act_vec, axis=1)  # attention between the input action and its context
                        # attention = np.exp(attention)
                        # attention /= sum(attention)
                        if pad_len > 0:
                            # doc_vec = np.concatenate((doc_vec, np.zeros([pad_len, self.word_dim])))  # doc_vec.shape = [500, 50]
                            # act_text['tags'] = np.concatenate((np.array(act_text['tags']), np.ones(pad_len, dtype=np.int32)))  # [500]
                            sent_vec = np.concatenate(
                                (sent_vec, np.zeros([pad_len, self.word_dim])))
                            arg_sent['tags'] = np.concatenate(
                                (np.array(arg_sent['tags']), np.ones(pad_len, dtype=np.int32)))
                            # attention = np.concatenate((attention, np.zeros(pad_len)))
                            for d in range(len(arg_sent['distance'])):
                                distance[d] = arg_sent['distance'][d]
                        else:
                            sent_vec = sent_vec[:self.num_words]
                            arg_sent['tokens'] = arg_sent['tokens'][:self.num_words]
                            arg_sent['tags'] = np.array(arg_sent['tags'])[:self.num_words]
                            # attention = attention[:self.num_words]
                            for d in range(self.num_words):
                                distance[d] = arg_sent['distance'][d]
                        # TODO: Future work: use attention
                        # if self.use_act_att:  # apply attention to word embedding
                        #     sent_vec = attention.reshape(-1, 1) * sent_vec
                        sent_vec = np.concatenate((sent_vec, distance), axis=1)
                        arg_sent['sent_vec'] = sent_vec
                        arg_sent['tags'].shape = (self.num_words, 1)
                        # self.create_matrix(arg_sent, words)  # create_matrix function
                        arg_sents.append(arg_sent)
            '''
            Split into train and test first.
            Split train into train and val then.
            '''
            self.train_data, self.test_data = train_test_split(
                arg_sents, test_size=0.2, random_state=1)
            self.train_data, self.validation_data = train_test_split(
                self.train_data, test_size=0.2, random_state=1)
            self.train_steps = len(self.train_data) * self.num_words
            self.validation_steps = len(self.validation_data) * self.num_words
            self.test_steps = len(self.test_data) * self.num_words
            self.num_train = len(self.train_data)
            self.num_validation = len(self.validation_data)
            self.num_test = len(self.test_data)
            print('\n\ntraining texts: %d\tvalidation texts: %d'
                  % (len(self.train_data), len(self.validation_data)))
            print('max_data_sent_len: %d\tmax_data_char_len: %d'
                  % (self.max_data_sent_len, self.max_data_char_len))
            print('self.train_steps: %d\tself.valid_steps: %d\n\n'
                  % (self.train_steps, self.validation_steps))
            print('\n\ntest texts: %d\t self.test_steps:%d\n'
                  % (len(self.test_data), self.test_steps))
        else:  # actions
            # self.read_act_texts()  # read action texts into input_data
            input_data = load_pkl('data/%s_labeled_text_data.pkl' % self.domain)
            # unroll the contents and store them in a list called act_texts
            act_texts = []
            for i in range(len(input_data)):  # over all training examples (documents)
                if len(input_data[i]['words']) == 0:  # skip documents without words
                    continue
                # act_text is a dictionary that stores the document's info
                act_text = {}
                act_text['tokens'] = input_data[i]['words']  # tokens = individual words
                act_text['sents'] = input_data[i]['sents']   # sents = sentences, e.g. [['a ', 'cat ', 'runs.'], [...], ...]
                act_text['acts'] = input_data[i]['acts']     # acts = [{}, {}, ...] where each {} has keys: act_idx, obj_idxs, act_type, related_acts
                act_text['sent_acts'] = input_data[i]['sent_acts']  # list of acts per sentence
                act_text['word2sent'] = input_data[i]['word2sent']  # e.g. {0: 0, 1: 0, 2: 0, ..., 38: 2, ...}, mapping word_index -> sentence_index
                act_text['tags'] = np.ones(
                    len(input_data[i]['words']), dtype=np.int32)  # same length as the number of words in the document
                act_text['act2related'] = {}  # related actions
                # for all action tuples
                for acts in input_data[i]['acts']:
                    act_text['act2related'][acts['act_idx']] = acts['related_acts']  # {act_idx: [related action indexes]}
                    act_text['tags'][acts['act_idx']] = acts['act_type'] + 1  # TODO: 2, 3, 4? - why? act_text['tags'] = [2,3,4,...] indexed by action_index
                # self.create_matrix(act_text)  # creating matrix
                doc_vec = []
                if args.stacked_embeddings != 'word2vec':  # using Flair embeddings
                    for sent in tqdm(act_text['sents']):
                        line = ' '.join(sent)
                        sentence = Sentence(line)
                        args.stacked_embeddings.embed(sentence)
                        for token in sentence:
                            # print(token.embedding.shape)  # 4196
                            doc_vec.append(token.embedding.numpy())
                    # track the longest word (word2vec lookups are commented out here)
                    for word in act_text['tokens']:
                        if len(word) > self.max_data_char_len:
                            self.max_data_char_len = len(word)  # max_data_char_len is the longest word
                        # if word in self.word2vec.vocab:
                        #     doc_vec.append(self.word2vec[word])
                        # else:
                        #     doc_vec.append(np.zeros(self.word_dim))
                elif args.stacked_embeddings == 'word2vec':
                    # initialize with word2vec vectors or zeroes
                    for word in act_text['tokens']:
                        if len(word) > self.max_data_char_len:
                            self.max_data_char_len = len(word)  # max_data_char_len is the longest word
                        if word in self.word2vec.vocab:
                            doc_vec.append(self.word2vec[word])
                        else:
                            doc_vec.append(np.zeros(self.word_dim))

                doc_vec = np.array(doc_vec)
                pad_len = self.num_words - len(doc_vec)
                if len(doc_vec) > self.max_data_sent_len:
                    self.max_data_sent_len = len(doc_vec)  # max_data_sent_len is the longest document vector
                # print(doc_vec.shape)
                if pad_len > 0:  # pad up to num_words
                    doc_vec = np.concatenate(
                        (doc_vec, np.zeros([pad_len, self.word_dim])))  # doc_vec.shape = [500, 50]
                    act_text['tags'] = np.concatenate(
                        (np.array(act_text['tags']), np.ones(pad_len, dtype=np.int32)))  # [500]
                else:  # pad_len is negative
                    doc_vec = doc_vec[:self.num_words]                         # pick the first 500
                    act_text['tokens'] = act_text['tokens'][:self.num_words]   # also the first 500 tokens
                    act_text['tags'] = np.array(act_text['tags'])[:self.num_words]  # also the first 500 tags
                act_text['sent_vec'] = doc_vec               # set sentence vec to the (500, 50) doc_vec
                act_text['tags'].shape = (self.num_words, 1)  # reshape to (500, 1)
                act_texts.append(act_text)  # keep collecting documents in act_texts
            '''
            Split into train and test first.
            Split train into train and val then.
            '''
            # the seed makes sure the dataset is always split the same way
            self.train_data, self.test_data = train_test_split(
                act_texts, test_size=0.2, random_state=1)
            self.train_data, self.validation_data = train_test_split(
                self.train_data, test_size=0.2, random_state=1)
            self.train_steps = len(self.train_data) * self.num_words  # length of train data * 500
            self.validation_steps = len(self.validation_data) * self.num_words  # each example contributes num_words steps
            self.test_steps = len(self.test_data) * self.num_words
            self.num_train = len(self.train_data)
            self.num_validation = len(self.validation_data)
            self.num_test = len(self.test_data)
            print('\n\ntraining texts: %d\tvalidation texts: %d'
                  % (len(self.train_data), len(self.validation_data)))
            print('max_data_sent_len: %d\tmax_data_char_len: %d'
                  % (self.max_data_sent_len, self.max_data_char_len))  # sent len here means doc len
            print('self.train_steps: %d\tself.valid_steps: %d\n\n'
                  % (self.train_steps, self.validation_steps))
            print('\n\ntest texts: %d\t self.test_steps:%d\n'
                  % (len(self.test_data), self.test_steps))

        args.train_steps = self.train_steps
        args.valid_steps = self.validation_steps  # validation steps
        args.test_steps = self.test_steps
def load_elmo_embeddings(ename):
    return DocumentPoolEmbeddings([ELMoEmbeddings(ename)])
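# Example use of load_elmo_embeddings(): pool one ELMo vector per sentence.
# Model names such as 'original', 'small' or 'pubmed' appear elsewhere in these snippets.
from flair.data import Sentence

doc_embedding = load_elmo_embeddings('small')

sentence = Sentence('Mendelian randomization of body mass index')
doc_embedding.embed(sentence)
vector = sentence.get_embedding()  # one pooled ELMo vector for the whole sentence
print(vector.shape)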
corpus: Corpus = ColumnCorpus(data_folder, columns,
                              train_file="train.txt",
                              dev_file="test.txt")
tag_type = 'ner'
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# define search_space
search_space = SearchSpace()
search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[
    StackedEmbeddings([
        ELMoEmbeddings('original'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
        BertEmbeddings('bert-large-cased')
    ]),
    StackedEmbeddings([
        ELMoEmbeddings('original'),
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),
        BertEmbeddings('bert-large-cased'),
        CharacterEmbeddings()
    ])
])
search_space.add(Parameter.HIDDEN_SIZE, hp.randint, upper=400)
search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
search_space.add(Parameter.DROPOUT, hp.uniform, low=0.0, high=0.5)
class Database(object):
    def __init__(self, docs):
        # self.documents_orig = np.loadtxt(docs, delimiter='\n', dtype=str)  # only 9999 documents
        self.documents_orig = []
        with open(docs, 'r') as f:  # getting 10k docs this way
            self.documents_orig = f.readlines()
        self.documents = []
        self.elmo = ELMoEmbeddings()
        # self.embedding = DocumentPoolEmbeddings([self.elmo])
        self.debug = True

    def knn(self, query, query_txt, k):
        # cos_sim = torch.mm(self.documents, query) / (torch.norm(query) * torch.norm(self.documents))
        cos_sim = torch.nn.functional.cosine_similarity(self.documents, query)
        topk, topk_indices = torch.topk(cos_sim, k, 0, True)
        topk_indices = topk_indices.numpy().astype('int')
        topk = topk.numpy().astype('float')
        top_combined = np.vstack((topk, topk_indices)).T
        if self.debug:
            print("\n")
            print("Query: ", query_txt, " | index: ", topk_indices.T)
            [
                print(self.documents_orig[int(i[1])], " --- ", i[0])
                for i in top_combined
            ]
        return list(zip(topk, topk_indices))  # used to return tuples

    def load_documents_into_embedding(self):
        print("Embedding ", len(self.documents_orig), " documents")
        # self.documents_orig = self.documents_orig[0:50]
        self.documents = [
            self.elmo.embed(Sentence(elem)) for elem in self.documents_orig
        ]
        self.documents = torch.stack([
            torch.cat([token.embedding.unsqueeze(0) for token in elem[0]],
                      dim=0)[0] for elem in self.documents
        ])
        np.save("./documents_embedded.npy", self.documents)

    def run_query(self, query, k=None):
        """Run a query on the given documents based on word embeddings.

        Arguments:
            query {str} -- Query string.

        Keyword Arguments:
            k {int} -- The number of top documents to return (default: 10)

        Returns:
            list[tuple[float, int]] -- Sorted list of tuples, which contain the score and the document id.
            Made-up example to show the formatting with k=5:
                [(0.89316645860672, 1567),
                 (0.6174346804618835, 125),
                 (0.5975501537321234, 1181),
                 (0.5779426293373108, 3979),
                 (0.5110726475715637, 7155)]
        """
        if k is None:
            k = 10
        sentence = Sentence(query)
        # self.embedding.embed(sentence)
        self.elmo.embed(sentence)
        sentence = [token.embedding.unsqueeze(0) for token in sentence][0]
        # print(sentence)
        return self.knn(sentence, query, k=k)

    def run_query_txt(self, text):
        self.queries = np.loadtxt(text, delimiter='\n', dtype=str)
        results = []
        for query in self.queries:
            out = self.run_query(query)
            results.append(out)
        # saving results
        with open("results.txt", 'w') as file:
            for elem in results:
                out = ""
                for res in elem:
                    out += str(res[0]) + "," + str(res[1]) + ";"
                out = out[:-1]
                out += '\n'
                file.write(out)
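# A quick usage sketch of the Database class above; the document and query file
# names are placeholders, not part of the original code.
db = Database('documents.txt')
db.load_documents_into_embedding()

# single query: returns a list of (score, document_id) tuples
results = db.run_query("how do word embeddings capture similarity", k=5)

# batch of queries read from a text file, written to results.txt
db.run_query_txt('queries.txt')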