def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))

    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg (Xavier uniform distribution).
    # See https://zhuanlan.zhihu.com/p/74274453 (weight initialization, Xavier uniform).
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    return model
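# Usage sketch (illustrative, assuming the classes above are importable):
# build a small two-layer model for a toy task. The vocab sizes and N=2
# below are made-up values, not defaults from the original code.
# tmp_model = make_model(src_vocab=11, tgt_vocab=11, N=2)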
def one_classifier(text, lang, embedding_name, model_path, model_file):
    #--------------------------------------------------------------------------
    #--- LOAD MODEL AND EMBEDDING
    #--------------------------------------------------------------------------
    print(model_file)
    with open(model_path + model_file, 'rb') as f:
        cls = pickle.load(f)
    embedding = Embeddings(embedding_name)

    #--------------------------------------------------------------------------
    #--- PROCESSING
    #--------------------------------------------------------------------------
    processed_text = preprocess(text)
    no_stpw_text = remove_stopwords(processed_text, lang)
    # Vectorize once and reuse the result instead of calling the helper twice.
    vectorized_tokens = to_vector_single_nonzeros(no_stpw_text, embedding,
                                                  len(no_stpw_text))
    if len(vectorized_tokens) > 0:
        vectorized_text = np.mean(vectorized_tokens, axis=0)
        vectorized_text2 = np.reshape(vectorized_text, (1, -1))
        prob = cls.predict_proba(vectorized_text2)[:, 1]
    else:
        # No known tokens: fall back to a zero probability. (The original
        # assigned the plain int 0, which would break the iteration below.)
        prob = np.zeros(1)
    #print(cls.classes_)  # check that the class at the second position is L1
    return prob[0]
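# Example call (illustrative): the embedding name, path, and model file below
# are hypothetical placeholders for whatever artifacts exist on disk.
# p = one_classifier("some input text", "en", "embedding-EN",
#                    "./data/probability/insikt/", "u1_c1_topic_classifier.model")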
def _make_model(self, num_tgt_chars, N, d_model, d_ff, h, dropout):
    """
    :param num_tgt_chars: size of the output (character) space
    :param N: number of decoder and encoder layers
    :param d_model: model dimensionality
    :param d_ff: hidden size of the feed-forward neural network
    :param h: number of attention heads
    :param dropout: dropout rate
    :return: model
    """
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)

    if self.config.USE_RESNET:
        feature_extractor = ResNet(block=BasicBlock,
                                   layers=self.config.RESNET_LAYERS,
                                   d_model=self.config.D_MODEL)
    else:
        feature_extractor = FeatureExtractionNetwork(d_model=self.config.D_MODEL)

    direction_embed = Embeddings(d_model, 2)
    model = EncoderDecoder(
        encoder=Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        decoder=Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        tgt_embed=nn.Sequential(Embeddings(d_model, num_tgt_chars), c(position)),
        generator=PredictionLayer(d_model, num_tgt_chars),
        feature_extractor=feature_extractor,
        prediction_layer=PredictionLayer(d_model, len(Dataset.CHAR_ID_MAP)),
        bidirectional_decoding=self.config.BIDIRECTIONAL_DECODING,
        direction_embed=direction_embed,
        device=self.device)

    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_normal_(p)
    logging.info("Model created")
    return model
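# Illustrative config sketch: these are the fields _make_model reads from
# self.config; the concrete values below are assumptions, not project defaults.
# config.USE_RESNET = True
# config.RESNET_LAYERS = [1, 1, 1, 1]
# config.D_MODEL = 256
# config.BIDIRECTIONAL_DECODING = False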
def make_vectorize():
    # Load the request data
    try:
        data = request.get_json()
    except Exception as e:
        raise e
    if data == {}:
        return bad_request()

    # Get the text and the language
    try:
        lang = data['lang']
    except KeyError:
        try:
            lang = detect_language(data['text'])
            print(lang)
        except Exception:
            return jsonify("Error in vectorize: language field is missing")
    try:
        text = data['text']
    except KeyError:
        return jsonify("Error in vectorize: text is missing")
    if lang not in ['en', 'es', 'ar', 'ro', 'fr']:
        return jsonify(
            "Language not available. Language must be in ['en','es','ar','ro','fr']")

    # Preprocess and vectorize the text
    print("Vectorize...")
    embeddings = Embeddings(emb_dict[lang])
    processed_text = preprocess(text)
    no_stpw_text = remove_stopwords(processed_text, lang)
    vectorized_tokens = to_vector_single_nonzeros(no_stpw_text, embeddings,
                                                  len(no_stpw_text))
    if len(vectorized_tokens) > 0:
        vectorized_text = np.mean(vectorized_tokens, axis=0)
    else:
        vectorized_text = np.zeros(300)
    print(vectorized_text)

    # Send the response
    responses = jsonify(vector=vectorized_text.tolist())
    responses.status_code = 200
    return responses
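# Example request (illustrative): the route and port are assumptions, since the
# Flask decorator is not shown in this excerpt.
# curl -X POST http://localhost:5000/vectorize \
#      -H "Content-Type: application/json" \
#      -d '{"text": "some text to vectorize", "lang": "en"}'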
def make_model(src_vocab, tgt_vocab, N=6, d_model=512, d_ff=2048, h=8, dropout=0.1):
    "Helper: Construct a model from hyperparameters."
    c = copy.deepcopy
    attn = MultiHeadedAttention(h, d_model)
    # Pass d_ff through; the original dropped it despite accepting it as a parameter.
    ff = PositionWiseFeedForward(d_model, d_ff, dropout)
    position = PositionalEncoding(d_model, dropout)
    model = EncoderDecoder(
        Encoder(EncoderLayer(d_model, c(attn), c(ff), dropout), N),
        # The decoder layer needs separate self-attention and source-attention copies.
        Decoder(DecoderLayer(d_model, c(attn), c(attn), c(ff), dropout), N),
        nn.Sequential(Embeddings(d_model, src_vocab), c(position)),
        # The positional encoding belongs in the Sequential, not inside Embeddings.
        nn.Sequential(Embeddings(d_model, tgt_vocab), c(position)),
        Generator(d_model, tgt_vocab))
    for p in model.parameters():
        if p.dim() > 1:
            # xavier_uniform is deprecated; use the in-place variant.
            nn.init.xavier_uniform_(p)
    return model
def get_topics(text, lang, topics_path):
    # Initialization
    embeddings = Embeddings(emb_dict[lang])

    # Get the topics dictionary for this language from the path
    topics_dicts = load_data(topics_path)
    topics_dict = topics_dicts[lang]
    topics = list(topics_dict.keys())

    # Distance threshold below which a topic counts as "close"
    # (0.7 was tried for English at one point; 0.5 is used for all languages).
    cl = 0.5

    # Vectorize the topics
    vect_dict_topics = [(w,
                         np.mean(to_vector_single_nonzeros(topics_dict[w],
                                                           embeddings,
                                                           len(topics_dict[w])),
                                 axis=0)) for w in topics]
    #print(vect_dict_topics)

    # Vectorize the text and measure its distance to every topic
    assigned_topics = []
    dists = []
    vectorized_tokens = to_vector_single_nonzeros(text, embeddings, len(text))
    if len(vectorized_tokens) > 0:
        vectorized_text = np.mean(vectorized_tokens, axis=0)
    else:
        vectorized_text = np.zeros(300)
    for v in vect_dict_topics:
        dists.append(spatial.distance.cosine(vectorized_text, v[1]))

    # Choose the close topics; fall back to 'OTHER' if none qualify
    good_topics = [topics[i].upper() for i in range(len(topics)) if dists[i] < cl]
    if not good_topics:
        good_topics.append('OTHER')
    assigned_topics.append(good_topics)
    return assigned_topics
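# Illustrative call: get_topics vectorizes `text` directly (no preprocessing
# here), so it is assumed to receive already-tokenized text. The path and the
# sample output are hypothetical.
# topics = get_topics(["attack", "weapons"], "en", "./data/topics/")
# -> e.g. [['TERRORISM']], or [['OTHER']] if no topic is close enough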
def test():
    emb = Embeddings(128, 1000, 0)
    model = TransformerEncoder(num_layers=2,
                               d_model=128,
                               heads=16,
                               d_ff=8,
                               dropout=0.5,
                               embeddings=emb,
                               max_relative_positions=100)
    #print(model)
    # Batch of 16 sequences, 10 tokens each, one feature column of word ids.
    ipt = torch.empty((16, 10, 1), dtype=torch.long).random_(1000)
    embs, outs, lengths = model(ipt)
    print("embs")
    print(embs.size())
    print("outs")
    print(outs.size())
    print("lengths")
    print(lengths)
def two_classifier(text, lang, embedding_name, model_path, model_file_JIH, model_file_EXR):
    #--------------------------------------------------------------------------
    #--- LOAD MODELS AND EMBEDDING
    #--------------------------------------------------------------------------
    with open(model_path + model_file_JIH, 'rb') as f:
        cls_JIH = pickle.load(f)
    with open(model_path + model_file_EXR, 'rb') as f:
        cls_EXR = pickle.load(f)
    embedding = Embeddings(embedding_name)

    #--------------------------------------------------------------------------
    #--- PROCESSING
    #--------------------------------------------------------------------------
    processed_text = preprocess(text)
    no_stpw_text = remove_stopwords(processed_text, lang)
    # Vectorize once and reuse the result instead of calling the helper twice.
    vectorized_tokens = to_vector_single_nonzeros(no_stpw_text, embedding,
                                                  len(no_stpw_text))
    if len(vectorized_tokens) > 0:
        vectorized_text = np.mean(vectorized_tokens, axis=0)
        vectorized_text2 = np.reshape(vectorized_text, (1, -1))
        prob_JIH = cls_JIH.predict_proba(vectorized_text2)[:, 1]
        prob_EXR = cls_EXR.predict_proba(vectorized_text2)[:, 1]
    else:
        # No known tokens: fall back to zero probabilities. (The original
        # assigned plain ints, which would break the iteration below.)
        prob_JIH = np.zeros(1)
        prob_EXR = np.zeros(1)

    # Return the larger of the two class probabilities.
    prob = prob_JIH if prob_JIH > prob_EXR else prob_EXR
    return prob[0]
def classifier(annotated_data, lang, user_id, case_id, clas_name):
    #--------------------------------------------------------------------------
    #--- DEFINE FILES AND LANGUAGE
    #--------------------------------------------------------------------------
    model_path = './data/probability/insikt/'
    model_file = user_id + '_' + case_id + '_' + clas_name + '_classifier.model'
    if lang == 'en':
        embedding_name = 'embedding-EN'
    elif lang == 'ar':
        embedding_name = 'embedding-AR'
    elif lang == 'es':
        embedding_name = 'embedding-ES'
    elif lang == 'ro':
        embedding_name = 'embedding-RO'
    elif lang == 'fr':
        embedding_name = 'embedding-FR'
    embedding = Embeddings(embedding_name)

    #--------------------------------------------------------------------------
    #--- GENERAL SCRIPT
    #--------------------------------------------------------------------------
    # Tokenize, remove stopwords, and vectorize the training texts
    x_train = [i[0] for i in annotated_data]
    y_train = [i[1] for i in annotated_data]  # replace N0 with L0...!!!
    x_train_DL = []
    print('Data training with ' + str(len(x_train)) + ' texts')
    for text in x_train:
        processed_text = preprocess(text)
        no_stpw_text = remove_stopwords(processed_text, lang)
        vectorized_tokens = to_vector_single_nonzeros(no_stpw_text, embedding,
                                                      len(no_stpw_text))
        if len(vectorized_tokens) > 0:
            vectorized_text = np.mean(vectorized_tokens, axis=0)
        else:
            vectorized_text = np.zeros(300)
        x_train_DL.append(vectorized_text)

    # Build and test the classifier with 10-fold cross-validation
    skf = StratifiedKFold(n_splits=10, shuffle=True)
    # Stochastic Gradient Descent with logistic loss
    cls = SGDClassifier(loss="log", penalty="l2",
                        max_iter=500).fit(x_train_DL, y_train)
    scores = cross_val_score(cls, x_train_DL, y_train, cv=skf,
                             scoring='accuracy')
    print("Accuracy C-10V EN: %2.1f (+/- %2.1f)" %
          (100 * scores.mean(), scores.std() * 200))
    print(cls.classes_)  # check that the class at the second position is 'Yes'
    accuracy = round(100 * scores.mean(), 2)

    # Save the model
    pickle.dump(cls, open(model_path + model_file, 'wb'))
    return accuracy
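# Illustrative call: annotated_data is expected as (text, label) pairs, as the
# x_train/y_train unpacking above implies. The values below are made up.
# acc = classifier([("first example text", "Yes"), ("second example", "No")],
#                  "en", "user1", "case1", "topic")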
def main(morphFileFolder, outputFolder, embeddings_file, positive_words,
         negative_words):
    # Ported from Python 2: print statements, backtick repr, and xrange
    # replaced with their Python 3 equivalents.
    print("loading embedding vectors...")
    e = Embeddings(embeddings_file)
    print("done!")
    positive_df = pd.read_csv(positive_words)
    negative_df = pd.read_csv(negative_words)
    positive_df["vector"] = positive_df["word"].apply(lambda x: e.get(x))
    negative_df["vector"] = negative_df["word"].apply(lambda x: e.get(x))

    files = [f for f in listdir(morphFileFolder)
             if isfile(join(morphFileFolder, f))]
    recordings = []
    for f in files:
        recordings.append(Recording.Recording(join(morphFileFolder, f)))

    # Merge the i-th question of every recording into one summary per question.
    questionSummaries = []
    for r in recordings:
        for i in range(len(r.questions)):
            if len(questionSummaries) < (i + 1):
                questionSummaries.append(Question.Question([]))
            questionSummaries[i].mergeWith(r.questions[i])

    # Specific metrics compared across all questions
    nouns = {}
    verbs = {}
    adjectives = {}
    adverbs = {}
    content = {}
    person_1 = {}
    person_2 = {}
    person_3 = {}
    for i, q in enumerate(questionSummaries):
        norm_pos = counter2normDictionary(q.pos, q.word_count)
        norm_per = counter2normDictionary(q.person, q.word_count)
        nouns[i + 1] = norm_pos["noun"]
        verbs[i + 1] = norm_pos["verb"]
        adjectives[i + 1] = norm_pos["adjective"]
        adverbs[i + 1] = norm_pos["adverb"]
        content[i + 1] = (norm_pos["noun"] + norm_pos["verb"] +
                          norm_pos["adjective"] + norm_pos["adverb"])
        person_1[i + 1] = norm_per["1"]
        person_2[i + 1] = norm_per["2"]
        person_3[i + 1] = norm_per["3"]
        print("Question %d, avg word count: %s" %
              (i + 1, q.word_count / len(questionSummaries)))

    counter2hist(nouns, 'Nouns', outputFolder)
    counter2hist(verbs, 'Verbs', outputFolder)
    counter2hist(adjectives, 'Adjectives', outputFolder)
    counter2hist(adverbs, 'Adverbs', outputFolder)
    counter2hist(content, 'Content words', outputFolder)
    counter2hist(person_1, '1st person', outputFolder)
    counter2hist(person_2, '2nd person', outputFolder)
    counter2hist(person_3, '3rd person', outputFolder)

    # Raw metrics for each question
    sentiment_scores = {}
    for i, q in enumerate(questionSummaries):
        positive_score = calculate_sentiment_score(q.words, positive_df, e)
        negative_score = calculate_sentiment_score(q.words, negative_df, e)
        print("Question %d, Positive: %s, Negative: %s, Overall: %s" %
              (i + 1, positive_score, negative_score,
               positive_score / negative_score))
        sentiment_scores[i + 1] = positive_score / negative_score
        buildWordCloud(q.contentWords, True,
                       'Question %d Content Word Cloud' % (i + 1), outputFolder)
        counter2hist(counter2normDictionary(q.pos, q.word_count),
                     'Question %d POS' % (i + 1), outputFolder)
        counter2hist(counter2normDictionary(q.person, q.word_count),
                     'Question %d Person' % (i + 1), outputFolder)
    counter2hist(sentiment_scores, 'Sentiment scores', outputFolder)
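# Example invocation (illustrative): all paths are hypothetical. The word-list
# CSVs are expected to have a "word" column, as the lookups above assume.
# main("./morph_files/", "./output/", "./embeddings.vec",
#      "./positive_words.csv", "./negative_words.csv")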