def compute_similarity_using_word2vec_model(query_word,
                                            steam_tokens=None,
                                            model=None,
                                            enforce_training=False):
    if steam_tokens is None:
        steam_tokens = load_tokens()

    if model is None:
        try:
            print('Loading Word2Vec model.')
            model = Word2Vec.load(get_word_model_file_name())
            if enforce_training:
                model = train_word_model_on_steam_tokens(model=model,
                                                         steam_tokens=steam_tokens)
        except FileNotFoundError:
            print('Training Word2Vec model from scratch.')
            model = train_word_model_on_steam_tokens(model=None,
                                                     steam_tokens=steam_tokens)

    if query_word in get_word_model_vocabulary(model):
        similar_words = test_word(model, query_word)
    else:
        print('The word {} is not part of the word model vocabulary.'.format(query_word))
        similar_words = None

    return similar_words
def compute_similarity_with_candidate_sentences_using_wmd(query_app_id,
                                                          steam_tokens=None,
                                                          model=None,
                                                          candidates=None):
    if steam_tokens is None:
        steam_tokens = load_tokens()

    if model is None:
        model = Word2Vec.load(get_word_model_file_name())

    constrain_search = (candidates is not None)

    query = steam_tokens[query_app_id]

    if constrain_search:
        documents = list(steam_tokens[i] for i in candidates)
    else:
        # Caveat: the Word Mover's Distance algorithm is painfully slow!
        # Please consider constraining the search to a few candidates!
        documents = list(steam_tokens.values())

    instance = WmdSimilarity(documents, model.wv, num_best=10)

    similarity_scores_as_tuples = instance[query]
    similarity_scores = reformat_similarity_scores_for_wmd(similarity_scores_as_tuples,
                                                           candidates)
    print_most_similar_sentences(similarity_scores)

    return similarity_scores
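# Hedged usage sketch for the WMD helper above: given the slowness caveat, the
# search is constrained to a short candidate list. The appIDs below are
# hypothetical and only serve as an illustration.
if __name__ == '__main__':
    example_steam_tokens = load_tokens()
    example_model = Word2Vec.load(get_word_model_file_name())
    compute_similarity_with_candidate_sentences_using_wmd(
        query_app_id='620',                # hypothetical query appID
        steam_tokens=example_steam_tokens,
        model=example_model,
        candidates=['570', '440', '730'],  # hypothetical candidate appIDs
    )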
def load_input():
    game_names, _ = load_game_names(include_genres=False, include_categories=False)

    steam_tokens = load_tokens()

    app_ids = list(int(app_id) for app_id in steam_tokens.keys())

    return game_names, steam_tokens, app_ids
def compute_similarity_with_all_other_steam_sentences(query_app_id,
                                                      steam_tokens=None,
                                                      model=None,
                                                      game_names=None,
                                                      filter_out_words_out_of_vocabulary=True):
    if steam_tokens is None:
        steam_tokens = load_tokens()

    if model is None:
        model = Word2Vec.load(get_word_model_file_name())

    if game_names is None:
        game_names, _ = load_game_names()

    index2word_set = get_word_model_vocabulary(model)

    query_sentence = steam_tokens[query_app_id]
    if filter_out_words_out_of_vocabulary:
        query_sentence = filter_out_words_not_in_vocabulary(query_sentence, index2word_set)

    similarity_scores = {}

    counter = 0
    num_games = len(steam_tokens)

    for app_id in steam_tokens:
        counter += 1
        if (counter % 1000) == 0:
            print('[{}/{}] appID = {} ({})'.format(counter, num_games, app_id, game_names[app_id]))

        reference_sentence = steam_tokens[app_id]
        if filter_out_words_out_of_vocabulary:
            reference_sentence = filter_out_words_not_in_vocabulary(reference_sentence, index2word_set)

        try:
            similarity_scores[app_id] = model.wv.n_similarity(query_sentence, reference_sentence)
        except ZeroDivisionError:
            # n_similarity() divides by the norm of the mean word vector, which is
            # zero when a sentence has no word left in the vocabulary.
            similarity_scores[app_id] = 0

    return similarity_scores
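# Toy sketch of the call at the heart of the loop above: n_similarity()
# returns the cosine similarity between the mean vectors of two lists of
# in-vocabulary words, and raises ZeroDivisionError when one list ends up
# empty, hence the except clause. The corpus below is made up for illustration.
_toy_corpus = [['space', 'shooter', 'fast'], ['farm', 'relax', 'slow']] * 50
_toy_model = Word2Vec(_toy_corpus, min_count=1)
print(_toy_model.wv.n_similarity(['space', 'shooter'], ['farm', 'relax']))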
def train_doc_model_on_steam_tokens(model=None, steam_tokens=None, num_epochs=10):
    # You do not want to perform training this way, because training already happened
    # when initializing the model with Doc2Vec(documents). Moreover, calling train()
    # several times messes with the decay of the learning rate alpha!

    if steam_tokens is None:
        steam_tokens = load_tokens()

    documents = list(read_corpus(steam_tokens))

    if model is None:
        model = doc2vec.Doc2Vec(documents)  # training happens here, with 5 epochs (the default)

    start = time()
    model.train(documents, total_examples=len(documents), epochs=num_epochs)
    print('Elapsed time: {:.2f}'.format(time() - start))

    model.save(get_doc_model_file_name())

    return model
def train_word_model_on_steam_tokens(model=None, steam_tokens=None, num_epochs=10):
    # Warning: training will happen several times, which might be detrimental to your model!

    if steam_tokens is None:
        steam_tokens = load_tokens()

    documents = list(steam_tokens.values())

    if model is None:
        # Training already happens here, due to the 'documents' argument!
        model = Word2Vec(documents)

    model.train(documents, total_examples=len(documents), epochs=num_epochs)

    model.save(get_word_model_file_name())

    return model
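# A possible single-pass alternative (a sketch, not the function above): pass
# the corpus and the epoch count to the constructor, so that training happens
# exactly once and the learning-rate decay is handled internally.
def train_word_model_once(steam_tokens=None, num_epochs=10):
    if steam_tokens is None:
        steam_tokens = load_tokens()
    documents = list(steam_tokens.values())
    model = Word2Vec(documents, iter=num_epochs)  # the keyword is 'epochs' in gensim >= 4.0
    model.save(get_word_model_file_name())
    return model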
def compute_similarity_using_doc2vec_model(query_app_id,
                                           steam_tokens=None,
                                           model=None,
                                           verbose=False,
                                           enforce_training=False,
                                           avoid_inference=False,
                                           num_items_displayed=10):
    if steam_tokens is None:
        steam_tokens = load_tokens()

    if model is None:
        try:
            print('Loading Doc2Vec model.')
            model = doc2vec.Doc2Vec.load(get_doc_model_file_name())
            if enforce_training:
                model = train_doc_model_on_steam_tokens(model=model, steam_tokens=steam_tokens)
        except FileNotFoundError:
            print('Training Doc2Vec model from scratch.')
            model = train_doc_model_on_steam_tokens(model=None, steam_tokens=steam_tokens)

    if avoid_inference:
        if verbose:
            print('Finding most similar documents based on the query appID.')
        # For games which are part of the training corpus, we do not need to call model.infer_vector()
        similarity_scores_as_tuples = model.docvecs.most_similar(
            positive=get_tag_prefix() + str(query_app_id),
            topn=num_items_displayed)
    else:
        if verbose:
            print('Finding most similar documents based on an inferred vector, which represents the query document.')
        query = steam_tokens[query_app_id]
        # Caveat: « Subsequent calls to this function may infer different representations for the same document. »
        # Reference: https://radimrehurek.com/gensim/models/doc2vec.html#gensim.models.doc2vec.Doc2Vec.infer_vector
        inferred_vector = model.infer_vector(query)
        similarity_scores_as_tuples = model.docvecs.most_similar([inferred_vector])

    similarity_scores = reformat_similarity_scores_for_doc2vec(similarity_scores_as_tuples)
    print_most_similar_sentences(similarity_scores, num_items_displayed=num_items_displayed)

    return similarity_scores
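# Sketch of one way to mitigate the infer_vector() caveat quoted above:
# average several inferred vectors for the same document. This helper is an
# assumption, not part of the original pipeline.
import numpy as np

def infer_stable_vector(model, doc_words, num_trials=10):
    inferred_vectors = [model.infer_vector(doc_words) for _ in range(num_trials)]
    return np.mean(inferred_vectors, axis=0)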
if __name__ == '__main__':
    steam_tokens = load_tokens()
    model = Word2Vec.load(get_word_model_file_name())

    for query_word in ['anime', 'fun', 'violent']:
        compute_similarity_using_word2vec_model(query_word, steam_tokens, model)
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import math as m
import progressbar as pb
import tensorflow as tf
import utils as u
from datetime import datetime as dt

# --- script settings start ---
document = '../data/marktwain.txt'
yml = './simple_rnn.yml'
# --- script settings end ---

FLAGS = u.load_flags(yml)
tokens = u.load_tokens(document)
unique_tokens = set(tokens)
unique_tokens = dict((v, i) for i, v in enumerate(unique_tokens))
samples = u.make_samples(tokens, FLAGS['sample_length'])

# build a simple model
# words -> one-hot -> rnn -> dense -> output
model = tf.keras.Sequential([
    tf.keras.layers.SimpleRNN(units=FLAGS['units'],
                              input_shape=(FLAGS['sample_length'] - 1, len(unique_tokens))),
    tf.keras.layers.Dense(len(unique_tokens)),
    tf.keras.layers.Activation('softmax')
])
optimizer = tf.keras.optimizers.Nadam()
model.compile(optimizer=optimizer, loss='categorical_crossentropy')
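# Hedged sketch of the training step that would follow. The exact structure of
# 'samples' depends on utils.make_samples (not shown); here each sample is
# assumed to be a list of FLAGS['sample_length'] tokens whose last token is
# the prediction target, matching the input_shape declared above.
import numpy as np

num_tokens = len(unique_tokens)
x = np.zeros((len(samples), FLAGS['sample_length'] - 1, num_tokens), dtype='float32')
y = np.zeros((len(samples), num_tokens), dtype='float32')
for i, sample in enumerate(samples):
    for t, token in enumerate(sample[:-1]):
        x[i, t, unique_tokens[token]] = 1.0  # one-hot encode the context window
    y[i, unique_tokens[sample[-1]]] = 1.0    # one-hot encode the next-token target
model.fit(x, y, epochs=10, batch_size=128)   # illustrative hyper-parameters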
import asyncore

from server.tcptunnel import TCPServer
from server.udptunnel import UDPServer
from utils import load_tokens

if __name__ == '__main__':
    tokens = load_tokens()
    tcp_server = TCPServer('0.0.0.0', 6666, tokens)
    # udp_server = UDPServer('0.0.0.0', 7778)
    asyncore.loop()
# -*- coding: utf-8 -*-
from keras.models import load_model

import utils

tokenizer = utils.load_tokens()

review = ("Good but the charger is not the same size as an Apple charger "
          "and that makes it a little difficult for charging with cases on.")  # 2

input_data = utils.convert_review(review, tokenizer)

model_name = 'model/weights-improvement-20-0.9417.hdf5'
model = load_model(model_name)

predictions = model.predict(input_data)
utils.display_rating(predictions)
def retrieve_similar_store_descriptions(compute_from_scratch=True,
                                        use_unit_vectors=False,
                                        alpha=1e-3,  # in the SIF weighting scheme, a parameter in the range [3e-5, 3e-3]
                                        num_removed_components_for_sentence_vectors=0,  # in the SIF weighting scheme
                                        pre_process_word_vectors=False,
                                        num_removed_components_for_word_vectors=0,
                                        count_words_out_of_vocabulary=True,
                                        use_idf_weights=True,
                                        shuffle_corpus=True,
                                        use_glove_with_spacy=True,
                                        use_cosine_similarity=True,
                                        num_neighbors=10,
                                        no_below=5,  # only relevant with Word2Vec
                                        no_above=0.5,  # only relevant with Word2Vec
                                        only_print_banners=True):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    game_names, _ = load_game_names(include_genres=False, include_categories=False)

    steam_tokens = load_tokens()

    documents = list(steam_tokens.values())

    if shuffle_corpus:
        # Useful for Doc2Vec in 'doc2vec_model.py'. It might be useful for other methods.
        random.shuffle(documents)

    if compute_from_scratch:

        if not use_glove_with_spacy:
            # Use self-trained Word2Vec vectors

            dct = Dictionary(documents)
            print('Dictionary size (before trimming): {}'.format(len(dct)))

            dct.filter_extremes(no_below=no_below, no_above=no_above)
            print('Dictionary size (after trimming): {}'.format(len(dct)))

            model = Word2Vec(documents, workers=multiprocessing.cpu_count())

            wv = model.wv
        else:
            # Use pre-trained GloVe vectors loaded from spaCy
            # Reference: https://spacy.io/models/en#en_vectors_web_lg
            spacy_model_name = 'en_vectors_web_lg'  # either 'en_core_web_lg' or 'en_vectors_web_lg'
            nlp = spacy.load(spacy_model_name)

            wv = nlp.vocab

        if pre_process_word_vectors:
            # Jiaqi Mu, Pramod Viswanath, All-but-the-Top: Simple and Effective Postprocessing for Word
            # Representations, in: ICLR 2018 conference.
            # Reference: https://openreview.net/forum?id=HkuGJ3kCb

            if use_glove_with_spacy:
                wv.vectors.data -= np.array(wv.vectors.data).mean(axis=0)

                if num_removed_components_for_word_vectors > 0:
                    wv.vectors.data = remove_pc(wv.vectors.data,
                                                npc=num_removed_components_for_word_vectors)
            else:
                wv.vectors -= np.array(wv.vectors).mean(axis=0)

                if num_removed_components_for_word_vectors > 0:
                    wv.vectors = remove_pc(wv.vectors,
                                           npc=num_removed_components_for_word_vectors)

                wv.init_sims()

        if use_unit_vectors and not use_glove_with_spacy:
            # Pre-computations of unit word vectors, which replace the unnormalized word vectors. A priori not
            # required here, because another part of the code takes care of it. A fortiori not required when
            # using spaCy.
            wv.init_sims(replace=True)  # TODO IMPORTANT choose whether to normalize vectors

        if not use_glove_with_spacy:
            index2word_set = set(wv.index2word)
        else:
            index2word_set = None

        num_games = len(steam_tokens)

        word_counter = {}
        document_per_word_counter = {}

        counter = 0
        for app_id in steam_tokens:
            counter += 1
            if (counter % 1000) == 0:
                print('[{}/{}] appID = {} ({})'.format(counter, num_games, app_id, game_names[app_id]))

            reference_sentence = steam_tokens[app_id]

            if not count_words_out_of_vocabulary:
                # This has an impact on the value of 'total_counter'.
                reference_sentence = filter_out_words_not_in_vocabulary(reference_sentence,
                                                                        index2word_set, wv)

            for word in reference_sentence:
                try:
                    word_counter[word] += 1
                except KeyError:
                    word_counter[word] = 1

            for word in set(reference_sentence):
                try:
                    document_per_word_counter[word] += 1
                except KeyError:
                    document_per_word_counter[word] = 1

        total_counter = sum(word_counter.values())

        # Inverse Document Frequency (IDF)
        idf = {}
        for word in document_per_word_counter:
            idf[word] = math.log((1 + num_games) / (1 + document_per_word_counter[word]))

        # Word frequency. Caveat: computed over the whole corpus!
        word_frequency = dict()
        for word in word_counter:
            word_frequency[word] = word_counter[word] / total_counter

        sentence_vector = {}

        if not use_glove_with_spacy:
            word_vector_length = wv.vector_size
        else:
            word_vector_length = wv.vectors_length

        X = np.zeros([num_games, word_vector_length])

        counter = 0
        for (i, app_id) in enumerate(steam_tokens.keys()):
            counter += 1
            if (counter % 1000) == 0:
                print('[{}/{}] appID = {} ({})'.format(counter, num_games, app_id, game_names[app_id]))

            reference_sentence = steam_tokens[app_id]
            num_words_in_reference_sentence = len(reference_sentence)

            reference_sentence = filter_out_words_not_in_vocabulary(reference_sentence,
                                                                    index2word_set, wv)

            if not count_words_out_of_vocabulary:
                # NB: Out-of-vocabulary words are not counted in https://stackoverflow.com/a/35092200
                num_words_in_reference_sentence = len(reference_sentence)

            weighted_vector = np.zeros(word_vector_length)

            for word in reference_sentence:
                if use_idf_weights:
                    weight = idf[word]
                else:
                    weight = (alpha / (alpha + word_frequency[word]))

                # TODO IMPORTANT Why use the normalized word vectors instead of the raw word vectors?
                if not use_glove_with_spacy:
                    if use_unit_vectors:
                        # Reference: https://github.com/RaRe-Technologies/movie-plots-by-genre
                        word_vector = wv.vectors_norm[wv.vocab[word].index]
                    else:
                        word_vector = wv.vectors[wv.vocab[word].index]
                else:
                    word_vector = wv.get_vector(word)
                    if use_unit_vectors:
                        word_vector_norm = wv[word].vector_norm
                        if word_vector_norm > 0:
                            word_vector = word_vector / word_vector_norm

                weighted_vector += weight * word_vector

            if len(reference_sentence) > 0:
                sentence_vector[app_id] = weighted_vector / num_words_in_reference_sentence
            else:
                sentence_vector[app_id] = weighted_vector

            X[i, :] = sentence_vector[app_id]

        # Reference: https://stackoverflow.com/a/11620982
        X = np.where(np.isfinite(X), X, 0)

        print('Saving the sentence embedding.')
        np.save('data/X.npy', X)
    else:
        print('Loading the sentence embedding.')
        X = np.load('data/X.npy', mmap_mode='r')

    if num_removed_components_for_sentence_vectors > 0:
        X = remove_pc(X, npc=num_removed_components_for_sentence_vectors)

    app_ids = list(int(app_id) for app_id in steam_tokens.keys())

    query_app_ids = load_benchmarked_app_ids(append_hard_coded_app_ids=True)

    matches_as_app_ids = perform_knn_search_with_app_ids_as_input(query_app_ids,
                                                                  label_database=X,
                                                                  app_ids=app_ids,
                                                                  use_cosine_similarity=use_cosine_similarity,
                                                                  num_neighbors=num_neighbors)

    print_ranking(query_app_ids,
                  matches_as_app_ids,
                  num_elements_displayed=num_neighbors,
                  only_print_banners=only_print_banners)

    retrieval_score = compute_retrieval_score(query_app_ids,
                                              matches_as_app_ids,
                                              num_elements_displayed=num_neighbors,
                                              verbose=False)

    retrieval_score_by_genre = compute_retrieval_score_based_on_sharing_genres(
        query_app_ids,
        matches_as_app_ids,
        num_elements_displayed=num_neighbors,
        verbose=False)

    retrieval_score_by_tag = compute_retrieval_score_based_on_sharing_tags(
        query_app_ids,
        matches_as_app_ids,
        num_elements_displayed=num_neighbors,
        verbose=False)

    return retrieval_score, retrieval_score_by_genre, retrieval_score_by_tag
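# Worked example of the two weighting schemes used above. With alpha = 1e-3,
# the SIF weight a / (a + p(w)) is ~0.09 for a frequent word (p(w) = 1e-2) and
# ~0.99 for a rare word (p(w) = 1e-5), so rare words dominate the weighted
# average; IDF behaves similarly via log((1 + N) / (1 + df(w))).
sif_alpha = 1e-3
for p_w in (1e-2, 1e-5):
    print('p(w) = {:.0e} -> SIF weight = {:.3f}'.format(p_w, sif_alpha / (sif_alpha + p_w)))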
"=====================================================================" #load train set description train_descriptions = UT.load_clean_desc('description.txt', train) print("Descriptions Train: ", len(train_descriptions)) "======================================================================" train_features = UT.load_photo_features('features.pkl', train) print("train features:", len(train_features)) "=======================================================================" # Get tokens tokens = UT.load_tokens(train_descriptions) vocab = len(tokens.word_index) + 1 print('Vocab Size:', vocab) max_length = UT.max_length(train_descriptions) print('Description Length:', max_length) # prepare sequences X1train, X2train, ytrain = UT.create_sequence(tokens, max_length, features, train_descriptions, vocab) print('Size of sequence', len(X2train)) # TIME FOR LOAD VALIDATION DATASET print("[INFO] Load Val data.......") test = UT.load_identifiers(args['devPath']) print('Dataset: %d' % len(test)) # descriptions
def main(chosen_model_no=0,
         num_items_displayed=10,
         use_spacy=False,
         use_soft_cosine_similarity=False,
         num_topics=None,
         no_below=5,
         no_above=0.5,
         normalize_vectors=False):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    if num_topics is None:
        num_topics = 100

    possible_model_names = [
        'tf_idf',  # 0
        'lsi_bow', 'lsi_tf_idf',  # 1, 2
        'rp_bow', 'rp_tf_idf',  # 3, 4
        'lda_bow', 'lda_tf_idf',  # 5, 6
        'hdp_bow', 'hdp_tf_idf',  # 7, 8
        'word2vec',  # 9
    ]
    chosen_model_name = possible_model_names[chosen_model_no]
    print(chosen_model_name)

    game_names, _ = load_game_names(include_genres=False, include_categories=False)

    steam_tokens = load_tokens()

    nlp = spacy.load('en_core_web_lg')

    documents = list(steam_tokens.values())

    dct = Dictionary(documents)
    print(len(dct))
    dct.filter_extremes(no_below=no_below, no_above=no_above)
    print(len(dct))

    corpus = [dct.doc2bow(doc) for doc in documents]

    # Pre-processing

    pre_process_corpus_with_tf_idf = chosen_model_name.endswith('_tf_idf')
    # Caveat: the leading underscore is important. Do not use this pre-processing if the chosen model is Tf-Idf!

    tfidf_model = TfidfModel(corpus, id2word=dct, normalize=normalize_vectors)

    if pre_process_corpus_with_tf_idf:
        print('Corpus as Tf-Idf')
        pre_processed_corpus = tfidf_model[corpus]
    else:
        print('Corpus as Bag-of-Words')
        pre_processed_corpus = corpus

    # Model

    model = None
    wv = None
    index2word_set = None

    if chosen_model_name == 'tf_idf':
        print('Term Frequency * Inverse Document Frequency (Tf-Idf)')
        model = tfidf_model

    elif chosen_model_name.startswith('lsi'):
        print('Latent Semantic Indexing (LSI/LSA)')
        model = LsiModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('rp'):
        print('Random Projections (RP)')
        model = RpModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('lda'):
        print('Latent Dirichlet Allocation (LDA)')
        model = LdaModel(pre_processed_corpus, id2word=dct, num_topics=num_topics)

    elif chosen_model_name.startswith('hdp'):
        print('Hierarchical Dirichlet Process (HDP)')
        model = HdpModel(pre_processed_corpus, id2word=dct)

    elif chosen_model_name == 'word2vec':
        use_a_lot_of_ram = False

        if use_a_lot_of_ram:
            model = None

            print('Loading Word2Vec based on Google News')
            # Warning: this takes a lot of time and uses a ton of RAM!
            wv = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz',
                                                   binary=True)
        else:
            if use_spacy:
                print('Using Word2Vec with spaCy')
            else:
                print('Training Word2Vec')

            model = Word2Vec(documents)
            wv = model.wv

        if not use_spacy:
            wv.init_sims(replace=normalize_vectors)
            index2word_set = set(wv.index2word)
    else:
        print('No model specified.')
        model = None

    if chosen_model_name != 'word2vec':
        if not use_soft_cosine_similarity:
            index = MatrixSimilarity(model[pre_processed_corpus], num_best=10, num_features=len(dct))
        else:
            w2v_model = Word2Vec(documents)
            similarity_index = WordEmbeddingSimilarityIndex(w2v_model.wv)
            similarity_matrix = SparseTermSimilarityMatrix(similarity_index, dct, tfidf_model,
                                                           nonzero_limit=100)
            index = SoftCosineSimilarity(model[pre_processed_corpus], similarity_matrix)
    else:
        index = None

    query_app_ids = load_benchmarked_app_ids(append_hard_coded_app_ids=True)

    app_ids = list(int(app_id) for app_id in steam_tokens.keys())

    matches_as_app_ids = []

    for query_count, query_app_id in enumerate(query_app_ids):
        print('[{}/{}] Query appID: {} ({})'.format(query_count + 1, len(query_app_ids),
                                                    query_app_id,
                                                    get_app_name(query_app_id, game_names)))

        query = steam_tokens[str(query_app_id)]

        if use_spacy:
            spacy_query = Doc(nlp.vocab, query)
        else:
            spacy_query = None

        if chosen_model_name != 'word2vec':
            vec_bow = dct.doc2bow(query)
            if pre_process_corpus_with_tf_idf:
                pre_processed_vec = tfidf_model[vec_bow]
            else:
                pre_processed_vec = vec_bow
            vec_lsi = model[pre_processed_vec]
            sims = index[vec_lsi]

            if use_soft_cosine_similarity:
                sims = enumerate(sims)

            similarity_scores_as_tuples = [(str(app_ids[i]), sim) for (i, sim) in sims]
            similarity_scores = reformat_similarity_scores_for_doc2vec(similarity_scores_as_tuples)
        else:
            if use_spacy:
                similarity_scores = {}
                for app_id in steam_tokens:
                    reference_sentence = steam_tokens[app_id]
                    spacy_reference = Doc(nlp.vocab, reference_sentence)
                    similarity_scores[app_id] = spacy_query.similarity(spacy_reference)
            else:
                query_sentence = filter_out_words_not_in_vocabulary(query, index2word_set)

                similarity_scores = {}
                counter = 0
                num_games = len(steam_tokens)
                for app_id in steam_tokens:
                    counter += 1
                    if (counter % 1000) == 0:
                        print('[{}/{}] appID = {} ({})'.format(counter, num_games, app_id,
                                                               game_names[app_id]))

                    reference_sentence = steam_tokens[app_id]
                    reference_sentence = filter_out_words_not_in_vocabulary(reference_sentence,
                                                                            index2word_set)

                    try:
                        similarity_scores[app_id] = wv.n_similarity(query_sentence, reference_sentence)
                    except ZeroDivisionError:
                        similarity_scores[app_id] = 0

        similar_app_ids = print_most_similar_sentences(similarity_scores,
                                                       num_items_displayed=num_items_displayed,
                                                       verbose=False)
        matches_as_app_ids.append(similar_app_ids)

    print_ranking(query_app_ids, matches_as_app_ids, only_print_banners=True)

    return
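# Minimal self-contained sketch of the soft cosine branch above, on a made-up
# three-document corpus, to show how the pieces fit together (it reuses the
# gensim classes already imported in this module).
def soft_cosine_toy_example():
    toy_documents = [['space', 'shooter'], ['galaxy', 'shooter'], ['farm', 'simulator']]
    toy_dct = Dictionary(toy_documents)
    toy_bow = [toy_dct.doc2bow(doc) for doc in toy_documents]
    toy_tfidf = TfidfModel(toy_bow, id2word=toy_dct)
    toy_w2v = Word2Vec(toy_documents, min_count=1)
    toy_index = SoftCosineSimilarity(
        toy_tfidf[toy_bow],
        SparseTermSimilarityMatrix(WordEmbeddingSimilarityIndex(toy_w2v.wv),
                                   toy_dct, toy_tfidf, nonzero_limit=100))
    query = toy_tfidf[toy_dct.doc2bow(['space', 'shooter'])]
    print(toy_index[query])  # similarity of the query to each of the three documents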
def apply_pipeline(train_from_scratch=True,
                   avoid_inference=False,
                   shuffle_corpus=True,
                   include_genres=False,
                   include_categories=True,
                   include_app_ids=True,
                   verbose=False):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    game_names, game_tags = load_game_names(include_genres, include_categories)

    steam_tokens = load_tokens()

    documents = list(read_corpus(steam_tokens, game_tags, include_app_ids))

    if shuffle_corpus:
        # « Only if the training data has some existing clumping – like all the examples with certain words/topics
        # are stuck together at the top or bottom of the ordering – is native ordering likely to cause training
        # problems. And in that case, a single shuffle, before any training, should be enough to remove the
        # clumping. »
        # Reference: https://stackoverflow.com/a/48080869
        random.shuffle(documents)

    if train_from_scratch:
        print('Creating a new Doc2Vec model from scratch.')

        model = doc2vec.Doc2Vec(documents,
                                vector_size=100,
                                window=5,
                                min_count=5,
                                epochs=20,
                                workers=multiprocessing.cpu_count())

        # NB: Do not follow the piece of advice given in https://rare-technologies.com/doc2vec-tutorial/
        # « I have obtained better results by iterating over the data several times and either:
        #   1. randomizing the order of input sentences, or
        #   2. manually controlling the learning rate over the course of several iterations. »
        # Indeed, in my experience, this leads to buggy results. Moreover, this approach is not recommended
        # according to https://stackoverflow.com/a/48080869

        model.save(get_doc_model_file_name())
    else:
        print('Loading previous Doc2Vec model.')
        model = doc2vec.Doc2Vec.load(get_doc_model_file_name())

    # Test doc2vec
    if verbose:
        try:
            # Spelunky + (Slay the Spire) - (Dream Quest)
            check_analogy(model, pos=['239350', '646570'], neg=['557410'])
        except TypeError:
            pass

        try:
            # Half-Life + (Witcher 2) - (Witcher)
            check_analogy(model, pos=['70', '20920'], neg=['20900'])
        except TypeError:
            pass

    query_app_ids = ['620', '364470', '504230', '583950', '646570', '863550', '794600']

    for query_app_id in query_app_ids:
        print('Query appID: {} ({})'.format(query_app_id, game_names[query_app_id]))
        compute_similarity_using_doc2vec_model(query_app_id,
                                               steam_tokens,
                                               model,
                                               avoid_inference=avoid_inference,
                                               num_items_displayed=10)

    # Check the relevance of the corresponding word2vec
    for query_word in ['anime', 'fun', 'violent']:
        compute_similarity_using_word2vec_model(query_word, steam_tokens, model)

    entity = get_doc_model_entity(model)
    tag_entity = set(tag for tag in entity if 'appID_' not in tag)

    print(tag_entity)

    query_tags = ['In-App Purchases', 'Free to Play', 'Violent', 'Early Access']

    for query_tag in tag_entity.intersection(query_tags):
        for query_app_id in query_app_ids:
            try:
                sim = model.docvecs.similarity(get_tag_prefix() + query_app_id, query_tag)
                print('Similarity = {:.0%} for tag {} vs. appID {} ({})'.format(
                    sim, query_tag, query_app_id, game_names[query_app_id]))
            except KeyError:
                pass

    num_items_displayed = 3
    for query_tag in tag_entity:
        print('\nTag: {}'.format(query_tag))
        similarity_scores_as_tuples = model.docvecs.most_similar(positive=query_tag,
                                                                 topn=num_items_displayed)
        similarity_scores = reformat_similarity_scores_for_doc2vec(similarity_scores_as_tuples)
        print_most_similar_sentences(similarity_scores, num_items_displayed=num_items_displayed)

    # Top 100
    query_app_ids = load_benchmarked_app_ids(append_hard_coded_app_ids=True)

    num_neighbors = 10
    only_print_banners = True
    use_cosine_similarity = True

    label_database = np.array(model.docvecs.vectors_docs)
    doc_tags = list(model.docvecs.doctags.keys())

    init_indices = np.array(range(len(doc_tags)))
    bool_indices_to_remove = list(map(lambda x: not x.startswith(get_tag_prefix()), doc_tags))
    indices_to_remove = init_indices[bool_indices_to_remove]
    label_database = np.delete(label_database, indices_to_remove, axis=0)

    app_ids = [int(doc_tag[len(get_tag_prefix()):]) for doc_tag in doc_tags
               if doc_tag.startswith(get_tag_prefix())]

    knn = prepare_knn_search(label_database, use_cosine_similarity=use_cosine_similarity)

    query_des = None

    for query_app_id in query_app_ids:
        if avoid_inference:
            inferred_vector = label_database[app_ids.index(query_app_id)]
        else:
            # From query appID to query feature vector
            query = steam_tokens[str(query_app_id)]
            # Caveat: « Subsequent calls to this function may infer different representations for the same document. »
            # Reference: https://radimrehurek.com/gensim/models/doc2vec.html#gensim.models.doc2vec.Doc2Vec.infer_vector
            inferred_vector = model.infer_vector(query)

        if query_des is None:
            query_des = inferred_vector
        else:
            query_des = np.vstack((query_des, inferred_vector))

    # Matching of feature vectors
    matches = perform_knn_search_with_vectors_as_input(query_des, knn, num_neighbors)

    # From feature matches to appID matches
    matches_as_app_ids = transform_matches_to_app_ids(matches, app_ids)

    print_ranking(query_app_ids,
                  matches_as_app_ids,
                  num_elements_displayed=num_neighbors,
                  only_print_banners=only_print_banners)

    return
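# A sketch of what the analogy checks above presumably reduce to
# (check_analogy itself is defined elsewhere in the repository): Doc2Vec
# supports the same positive/negative vector arithmetic as Word2Vec, applied
# to document tags.
def check_analogy_sketch(model, pos, neg, topn=5):
    return model.docvecs.most_similar(
        positive=[get_tag_prefix() + app_id for app_id in pos],
        negative=[get_tag_prefix() + app_id for app_id in neg],
        topn=topn)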
def train(args, states=None):
    config_obj = Config(args.config_file)
    config = config_obj.elements

    # make training runs deterministic
    set_seed(seed_value=config['random_seed'])

    logging.info("Loading datasets...")
    dataset, labels = load_tokens(
        input_id_path=config['input_id'],
        token_type_id_path=config['token_type_id'],
        attention_mask_path=config['attention_mask'],
        label_path=config['labels'],
    )

    train_loader, val_loader, test_loader = create_dataloaders(
        dataset,
        labels,
        batch_size=config['batch_size'],
        random_seed=config['random_seed'],
        balance=config['correct_imbalance'],
    )

    model = BertForSequenceClassification.from_pretrained(
        "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
        num_labels=2,
        output_attentions=False,
        output_hidden_states=False,
    )
    if torch.cuda.is_available():
        model.cuda()

    loss_function = nn.CrossEntropyLoss()
    # optimizer = AdamW(model.parameters(), lr=config['lr'])
    optimizer = torch.optim.SGD(model.parameters(), lr=config['lr'])

    total_train_steps = config['num_epochs'] * len(train_loader)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_train_steps,
    )

    best_metric = 0

    # loop over the dataset multiple times
    for epoch in range(1, config['num_epochs'] + 1):
        logging.info(f"==================== Epoch: {epoch} ====================")
        running_losses = []
        for i, data in enumerate(train_loader, 0):
            # get the inputs; data is a list of [inputs, labels]
            input_ids, token_type_ids, attention_mask, labels = data
            if torch.cuda.is_available():
                input_ids = input_ids.cuda()
                token_type_ids = token_type_ids.cuda()
                attention_mask = attention_mask.cuda()
                labels = labels.cuda()

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward
            _, logits = model(
                input_ids=input_ids,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask,
                labels=labels,
            )
            # probs = F.softmax(logits, dim=1)

            # backprop
            loss = loss_function(logits, labels)
            loss.backward()

            # clip gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # update/optimize
            optimizer.step()
            # update learning rate
            scheduler.step()

            # Log summary
            running_losses.append(loss.item())
            if i % args.log_interval == 0:
                interval_loss = sum(running_losses) / len(running_losses)
                logging.info(f"step = {i}, loss = {interval_loss}")
                running_losses = []

            if i % args.test_interval == 0:
                dev_metric = eval(
                    val_loader,
                    model,
                    loss_function,
                    args.eval_metric,
                )
                if dev_metric > best_metric:
                    best_metric = dev_metric
                    states = {
                        "epoch": epoch,
                        "step": i,
                        "model": model.state_dict(),
                        "optimizer": optimizer.state_dict(),
                    }
                    save_model_state(save_dir=args.model_dir, step=i, states=states)

    print(f"Finished Training, best {args.eval_metric}: {best_metric}")
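# Illustration of the schedule used above: with num_warmup_steps=0,
# get_linear_schedule_with_warmup() simply decays the learning rate linearly
# from its initial value down to zero over total_train_steps. The values
# below are illustrative, not taken from any config.
lr0, total_train_steps = 2e-5, 1000
for step in (0, 250, 500, 1000):
    print(step, lr0 * max(0.0, (total_train_steps - step) / total_train_steps))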
    'py36': 'python:3.6-slim',
    'py35': 'python:3.5-slim',
    'py27': 'python:2.7-slim'
}

BASE64_REGEX = re.compile(r'^[a-zA-Z0-9+/]+={0,2}$')
FILENAME_REGEX = re.compile(r'^[\w,\s-]+\.[A-Za-z]{1,4}$')

MAX_REQUEST_SIZE = 1 * 1024 * 1024  # 1 MB
MAX_EXECUTION_LIMIT = 60  # in seconds
MAX_OUTPUT_FILESIZE = 4 * 1024 * 1024  # 4 MB

CONTAINER_WORKING_DIR = '/usr/src/app'
FORMATS = ['text', 'base64_encoded_binary', 'json']

DIR_PATH = os.path.dirname(os.path.realpath(__file__))
CODE_PATH = os.path.dirname(os.path.realpath(__file__))
TOKENS_FILE = os.path.join(DIR_PATH, 'tokens.txt')
TOKENS = utils.load_tokens(TOKENS_FILE)


def limit_content_length(max_length: int) -> Callable:
    """Limits a request to max_length bytes at max."""
    def decorator(f):
        @wraps(f)
        def wrapper(*args, **kwargs):
            cl = request.content_length
            if cl is not None and cl > max_length:
                abort(413)
            return f(*args, **kwargs)
        return wrapper
    return decorator


def die(reason: str = "Fatal Error. Please contact the service's operators and consult for support.",
        returncode: int = 400) -> Tuple[str, int]:
    """Returns a json of an error."""
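# Hypothetical usage of the decorator above on a Flask route; the app object
# and the route are assumptions, since the snippet only shows the decorator
# and the constants.
from flask import Flask

app = Flask(__name__)

@app.route('/run', methods=['POST'])
@limit_content_length(MAX_REQUEST_SIZE)
def run_code():
    # Reaching this point guarantees the request body is at most 1 MB.
    return 'ok', 200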