def main():
    st.title("Tweet Screener")
    st.subheader("*Guaranteeing 2020 proof tweets to the masses*")
    st.sidebar.image("image_resources/DSI-logo.jpg", use_column_width=True)
    st.sidebar.subheader("Africa DSI NLP Project by Team 2")
    st.sidebar.write("Catherine, Fanamby, Malcolm, and Martin")

    section = st.sidebar.radio('Sections to Visit',
                               ('Swear Word Analyser', 'Sentiment Analyser', 'Topic Identifier'))
    publish = st.sidebar.button(label="Publish Tweet!")

    if section == 'Swear Word Analyser':
        st.subheader('Swear Word Analyser')
        blacklist = load_screener()
        sentence = st.text_area('Input your message/tweet here:')
        if sentence:
            # Pre-process tweet
            answer = PS.profanityscreen(sentence, blacklist, True)
            st.subheader("Swear Analysis Results:")
            # Show predictions
            st.write('Swear Words Found:')
            st.dataframe(pd.DataFrame(answer[1], columns=["Swear Words"]))
            st.write('Your Censored Tweet:')
            st.write(answer[0])

    if section == "Sentiment Analyser":
        st.subheader('Sentiment Analyser')
        sentSentence = st.text_area('Input your message/tweet here:')
        if sentSentence:
            sentimentTweet = Sentence(preprocess(sentSentence))
            emoteTweet = Sentence(preprocess(sentSentence))

            # Sentiment dictionaries
            sentiment_dict = {'0': 'Negative', '4': 'Positive'}
            emote_dict = {'0': 'Anger', '1': 'Fear', '2': 'Joy',
                          '3': 'Love', '4': 'Sadness', '5': 'Surprise'}
            emoji_dict = {'0': ':rage:', '1': ':fearful:', '2': ':joy:',
                          '3': ':heart_eyes:', '4': ':cry:', '5': ':astonished:'}
            basic_emo_dict = {"0": ':rage:', "4": ":smile:"}

            with st.spinner("Weeeeeee......."):
                SentClassifier = TextClassifier.load('twitter_sentiment/model-saves/final-model.pt')
                EmoteClassifier = TextClassifier.load('twitter_sentiment/model-saves/emotion-model.pt')
                SentClassifier.predict(sentimentTweet)
                EmoteClassifier.predict(emoteTweet)

            st.subheader("Sentiment Analysis Results:")
            predSent = sentimentTweet.labels[0]
            predSText = sentiment_dict[predSent.value[0]]
            st.markdown('Your sentence is ' + str(predSText) + " "
                        + basic_emo_dict[predSent.value[0]] + ' with '
                        + "{:.2f}".format(predSent.score * 100) + '% confidence')

            predEmote = emoteTweet.labels[0]
            predEText = emote_dict[predEmote.value[0]]
            st.markdown('Your sentence is predicted to portray ' + predEText + " "
                        + emoji_dict[predEmote.value[0]] + ' with '
                        + "{:.2f}".format(predEmote.score * 100) + '% confidence')

    if section == "Topic Identifier":
        st.subheader("Topic Identifier")
        topicSentence = st.text_area('Input your message/tweet here:')
        st.subheader("Sensitivity Analysis Results:")
        if topicSentence:
            topicTweet = preprocess_test(topicSentence)
            topic_dict = {0: "obscenity", 1: "violence", 2: "verbal abuse",
                          3: "identity hate speech", 4: "hate speech",
                          5: "offense", 6: "neither"}
            policies_dict = {0: "https://help.twitter.com/en/safety-and-security/offensive-tweets-and-content",
                             1: "https://help.twitter.com/en/rules-and-policies/violent-threats-glorification",
                             2: "https://help.twitter.com/en/rules-and-policies/abusive-behavior",
                             3: "https://help.twitter.com/en/rules-and-policies/hateful-conduct-policy",
                             4: "https://help.twitter.com/en/rules-and-policies/hateful-conduct-policy",
                             5: "https://help.twitter.com/en/safety-and-security/offensive-tweets-and-content"}
            twitter_rules = "https://help.twitter.com/en/rules-and-policies#general-policies"

            with st.spinner("Predicting..."):
                TopicClassifier = tf.keras.models.load_model('Topic Identifier/model_saves/topic_identifier_model.h5')
                topic_pred = TopicClassifier.predict(topicTweet)

            topTopic = topic_pred.argmax(1)[0]
            topTopicText = topic_dict[topTopic]
            graph_pred = pd.DataFrame(topic_pred,
                                      columns=["obscenity", "violence", "verbal abuse",
                                               "identity hate crime", "hate crime", "offense", "neither"])
            columns = ["obscenity", "violence", "verbal abuse",
                       "identity hate speech", "hate speech", "offense", "neither"]

            if topTopic != 6:
                st.write("Your tweet may contain sentences that promote " + topTopicText
                         + " with " + str(topic_pred[0][topTopic] * 100) + " % confidence")
                st.write("Please review Twitter Rules and policies: " + twitter_rules)
                st.write("And Twitter's " + topTopicText + " policy: " + policies_dict[topTopic])
            else:
                st.write("Your tweet is fine in terms of policy.")

            plt.bar(height=topic_pred.flatten(), x=columns, width=1)
            plt.title('Policy Breaking Likelihood')
            plt.xticks(rotation=45)
            plt.xlabel('Twitter Policies (Topics)')
            plt.ylabel('Probability of Violation')
            st.pyplot()
            st.write(topic_pred)

    if publish:
        if sentence:
            sentiment = "None"
            publish_tweet(sentiment, sentence)
        elif sentSentence:
            sentiment = predEText
            sentence = sentSentence
            publish_tweet(sentiment, sentence)
        elif topicSentence:
            sentiment = predEText
            sentence = topicSentence
            publish_tweet(sentiment, sentence)
        else:
            st.sidebar.write("You haven't written or analysed your tweet yet.")

    st.sidebar.markdown("This application helps determine how problematic your tweet is before publishing it."
                        + " We utilise three main tools to achieve this."
                        + " A swear word analyser that checks your tweet for profanity and delivers a censored tweet."
                        + " A sentiment analyser that predicts the emotion in your tweet, to check if you were really being positive."
                        + " Finally, a topic identifier which determines if you broke one of Twitter's policies without knowing it!"
                        + " Once you have thoroughly scrubbed your tweet, you may store your results for further analyses.")
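# The Streamlit app above calls several helpers that are defined elsewhere in the
# project (load_screener, preprocess, preprocess_test, publish_tweet). As a rough
# illustration only -- not the project's actual implementation -- publish_tweet
# could append the analysed tweet and its predicted sentiment to a local CSV so
# the stored results can be revisited later:
import os
import pandas as pd

def publish_tweet_sketch(sentiment, tweet, store_path="published_tweets.csv"):
    # append one row per published tweet; write the header only on first use
    row = pd.DataFrame([{"tweet": tweet, "sentiment": sentiment}])
    row.to_csv(store_path, mode="a", header=not os.path.exists(store_path), index=False)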
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings, Sentence
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import BertEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path

glove_embedding = WordEmbeddings('glove')
bert_embedding = BertEmbeddings('bert-base-uncased')

corpus = NLPTaskDataFetcher.load_classification_corpus(Path('./'),
                                                       test_file='test.csv',
                                                       dev_file='dev.csv',
                                                       train_file='train.csv')

document_embeddings = DocumentPoolEmbeddings([bert_embedding, glove_embedding])

classifier = TextClassifier(document_embeddings,
                            label_dictionary=corpus.make_label_dictionary(),
                            multi_label=True)

trainer = ModelTrainer(classifier, corpus)
trainer.train('./', max_epochs=10)
)

stats = corpus.obtain_statistics()
print(stats)

# create the label dictionary
label_dict = corpus.make_label_dictionary()
print(label_dict)

# make a list of word embeddings
embeddings = [OneHotEmbeddings(corpus=corpus)]

# initialize document embedding by passing list of word embeddings
# Can choose between many RNN types (GRU by default, to change use rnn_type parameter)
document_embeddings = DocumentRNNEmbeddings(embeddings, bidirectional=True, hidden_size=256)

# create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

# initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)

# start the training
trainer.train('resources/',
              learning_rate=0.1,
              mini_batch_size=128,
              anneal_factor=0.5,
              patience=5,
              max_epochs=20)
# from bson.objectid import ObjectId
from flask_cors import CORS
from flair.models import TextClassifier
from flair.data import Sentence
from flask import Flask, jsonify, session

app = Flask(__name__)
app.secret_key = "super_secret_key"

# # app.config['SECRET_KEY'] = 'oh_so_secret'
# app.config['MONGO_DBNAME'] = 'exposeModel'
# app.config['MONGO_URI'] = 'mongodb://*****:*****


@app.route('/', methods=['GET'])
def index():
    return jsonify("welcome to Arafa API")


@app.route('/api/tasks', methods=['GET'])
def get_result():
    result = []
    try:
        data_result = session['my_result']
        result.append({
            'title': data_result['title'],
            'tag': data_result['tag']
def upload_file():
    uploaded_file = request.files['file']
    if uploaded_file.filename != '':
        ## uploaded_file is passed as a parameter to pandas
        ## all the logic for building the positive and negative score dicts lives here
        df = load_dataset_from(uploaded_file)
        # To verify if data loaded correctly:
        # print(df.head(10))
        print(df.head(10))

        classifier = TextClassifier.load('sentiment')
        tmp_negatives = {}
        tmp_positives = {}

        print(f'Number of players: {len(df["Player"].unique())}')
        for player_name in df["Player"].unique():
            tmp_negatives[player_name] = list()
            tmp_positives[player_name] = list()

        for dp in df.values:
            l = dp.tolist()
            sentence = Sentence(l[-1])
            classifier.predict(sentence)
            if sentence.labels[0].value == "NEGATIVE":
                tmp_negatives[l[-2]].append(sentence.labels[0].score)
            elif sentence.labels[0].value == "POSITIVE":
                tmp_positives[l[-2]].append(sentence.labels[0].score)

        negative = {}
        positive = {}
        for player in tmp_negatives.keys():
            if len(tmp_negatives[player]) > 10:
                negative[player] = sum(tmp_negatives[player]) / len(tmp_negatives[player])
        for player in tmp_positives.keys():
            if len(tmp_positives[player]) > 10:
                positive[player] = sum(tmp_positives[player]) / len(tmp_positives[player])

        print(negative)
        print(positive)

        # for player in tmp.keys():
        #     tmp[player] = Sentence(tmp[player])
        #     classifier.predict(tmp[player])
        #     if tmp[player].labels[0].value == "NEGATIVE":
        #         negative[player] = tmp[player].labels[0].score
        #     elif tmp[player].labels[0].value == "POSITIVE":
        #         positive[player] = tmp[player].labels[0].score

        positive_json = json.dumps(positive, indent=4)
        negative_json = json.dumps(negative, indent=4)

        results = {}
        results[0] = positive_json
        results[1] = negative_json
        return results
    'Buenos dias, vamos a hacer algunos recados y a empezar el dia con energia!!',  # 3
    '@mireiaescribano justo cuando se terminan las fiestas de verano, me viene genial',  # 3
    'No sabes cuantas horas, minutos y segundos espero para volver a ver esa sonrisa que tanto me gusta ver salir de ti',  # 0
    '@cinthiaszc jajajaja me vas a decir a mi mi abuela cocina tan rico que mando al tacho la dieta :v',  # 0
    'te adoroVen a Perú pls'  # 3
]

label_dictionary = ['0', '1', '2', '3']

fasttext_path = '../fasttext/models/{}_{}'.format('intertass', FT_MODEL_NAME)
fasttext_model = fasttext.load_model(path=fasttext_path)
dev_fasttext_probabilities, dev_fasttext_predictions = fasttext_embedding.predict_with_fasttext_model(
    fasttext_model, tweets_to_test, label_dictionary)
print(dev_fasttext_probabilities)
print(dev_fasttext_predictions)

print("BERT MODEL")
bert_path = '{}/{}/{}/best-model.pt'.format(BERT_MODEL_PATH, BERT_MODEL_NAME, 'intertass')
bert_model = TextClassifier.load(bert_path)
dev_bert_probabilities, dev_bert_predictions = bert_embeddings.predict_with_bert_model(
    bert_model, tweets_to_test, label_dictionary)
print(dev_bert_probabilities)
print(dev_bert_predictions)

# sentences = ['horrible']
#
# analyzer = vaderSentiment.vaderSentiment.SentimentIntensityAnalyzer()
# for sentence in sentences:
#     vs = analyzer.polarity_scores(sentence)
#     print("{:-<65} {}".format(sentence, str(vs)))
def train_model(self,
                model_name="text_classification_model",
                custom_word_embeddings=None,
                rnn_type="GRU",
                use_pool_embedding=False,
                hidden_size=16,
                reproject_words=True,
                reproject_words_dimension=128,
                learning_rate=1e-3,
                batch_size=8,
                anneal_factor=0.5,
                patience=2,
                max_epochs=30,
                **kwargs):
    """
    Train flair model and save it in your data folder

    Parameters
    ----------
    model_name: str
        Name of your model
    custom_word_embeddings: list<embedding>
        Use custom flair embeddings.
        See more in the flair documentation: https://github.com/zalandoresearch/flair/tree/master/resources/docs

    Return
    -------
    None
    """
    self.model_name = model_name
    corpus = CSVClassificationCorpus(self.data_folder, self.column_name_map, skip_header=True)
    label_dict = corpus.make_label_dictionary()

    # Word embedding selection
    if custom_word_embeddings is None:
        word_embeddings = [WordEmbeddings('fr')]
    else:
        word_embeddings = custom_word_embeddings

    # initialize document embedding by passing list of word embeddings and parameters
    if use_pool_embedding:
        document_embeddings = DocumentPoolEmbeddings(word_embeddings,
                                                     pooling='max',
                                                     fine_tune_mode='nonlinear')
    else:
        document_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                                    hidden_size=hidden_size,
                                                    reproject_words=reproject_words,
                                                    reproject_words_dimension=reproject_words_dimension,
                                                    rnn_type=rnn_type)

    # create the text classifier and initialize trainer
    classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)
    trainer = ModelTrainer(classifier, corpus, optimizer=Adam)

    # let's train!
    num_workers = cpu_count()
    trainer.train("{0}\\{1}".format(self.data_folder, self.model_name),
                  learning_rate=learning_rate,
                  num_workers=num_workers,
                  mini_batch_size=batch_size,
                  anneal_factor=anneal_factor,
                  patience=patience,
                  max_epochs=max_epochs,
                  **kwargs)
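# A minimal usage sketch for train_model above. The enclosing class is not shown
# in this snippet, so the wrapper name and its constructor arguments below
# (data_folder, column_name_map) are assumptions made for illustration only.
# model_wrapper = FlairTextClassifierWrapper(data_folder="data",
#                                            column_name_map={0: "text", 1: "label"})
# model_wrapper.train_model(model_name="french_sentiment",
#                           rnn_type="LSTM",
#                           hidden_size=32,
#                           batch_size=16,
#                           max_epochs=10)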
def train(self, X, y):
    X_text = X[:, self.args.TEXT_COL]
    y = y.flatten()

    # corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03)
    train: List[Sentence] = []
    for tweet, label in zip(X_text, y):
        if tweet == '':
            tweet = 'dummy'
        s: Sentence = Sentence(tweet)
        s.add_label(str(label))
        train.append(s)

    corpus: TaggedCorpus = TaggedCorpus(train, train, train)

    # 2. create the label dictionary
    label_dict = corpus.make_label_dictionary()

    # 3. make a list of word embeddings
    word_embeddings = [
        glove_embeddings,
        # twitter_embeddings,
        # comment in this line to use character embeddings
        # CharacterEmbeddings(),
        # comment in flair embeddings for state-of-the-art results
        # FlairEmbeddings('news-forward'),
        fflair,
        # FlairEmbeddings('news-backward'),
        bflair
    ]

    # 4. initialize document embedding by passing list of word embeddings
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    # 5. create the text classifier
    classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)

    # 6. initialize the text classifier trainer
    trainer = ModelTrainer(classifier, corpus)
    self.model = trainer.model
    self.model.save = self.save
    self.model.save_checkpoint = self.save_checkpoint

    # 7. start the training
    trainer.train('../data/ecuador_earthquake_2016/models',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  anneal_factor=0.5,
                  patience=5,
                  max_epochs=5)

    self.clf = classifier
def test():
    classifier = TextClassifier.load(model_file)
    sentence = Sentence('Awesome stuff!')
    classifier.predict(sentence)
    print(sentence.labels)
# coding utf-8
# docker run --name flask -p 5000:5000 -v G:\workspace:/flask -it python bash
# export FLASK_APP="server.py"
# flask run --host=0.0.0.0
# or python -m forever.run -t 1000000 python server.py &
from flask import Flask, jsonify
import feedparser
from flair.data import build_japanese_tokenizer, Sentence
from flair.models import TextClassifier

app = Flask(__name__)
classifier = TextClassifier.load('resources/best-model.pt')
japanese_tokenizer = build_japanese_tokenizer()


def get_score(text):
    # create example sentence
    sentence = Sentence(text, use_tokenizer=japanese_tokenizer)
    # predict class and print
    classifier.predict(sentence)
    label_dict = sentence.to_dict()["labels"][0]
    return label_dict["confidence"] if label_dict["value"] == "__label__O" else 0


def get_feed():
    RSS_URL = "https://www.lifehacker.jp/feed/index.xml"
    feed = feedparser.parse(RSS_URL)
#############################################################################################
#############################################################################################
print("#####################################")
print("########## EMOTIONS ##############")
print("#####################################")

from flair.models import TextClassifier

# load sentiment model
classifier = TextClassifier.load('en-sentiment')

# example sentence
sentence = Sentence('Porto wins. I am happy')

# predict the sentiment label
classifier.predict(sentence)

# print sentence with predicted labels
print(sentence.labels)

print("#####################################")
print("########## EMOTIONS ##############")
print("#####################################")
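# A small follow-up sketch (not part of the original script) showing how the
# predicted label can be read back from the Sentence object instead of printing
# the whole label list: each Flair Label carries a string value and a confidence
# score.
top_label = sentence.labels[0]
print(f"value: {top_label.value}, confidence: {top_label.score:.4f}")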
def train(self, intent_fst) -> None:
    from flair.data import Sentence, Token
    from flair.models import SequenceTagger, TextClassifier
    from flair.embeddings import (
        FlairEmbeddings,
        StackedEmbeddings,
        DocumentRNNEmbeddings,
    )
    from flair.data import TaggedCorpus
    from flair.trainers import ModelTrainer

    # Directory to look for downloaded embeddings
    cache_dir = self.profile.read_path(
        self.profile.get("intent.flair.cache_dir", "flair/cache")
    )
    os.makedirs(cache_dir, exist_ok=True)

    # Directory to store generated models
    data_dir = self.profile.write_path(
        self.profile.get("intent.flair.data_dir", "flair/data")
    )
    if os.path.exists(data_dir):
        shutil.rmtree(data_dir)

    self.embeddings = self.profile.get("intent.flair.embeddings", [])
    assert len(self.embeddings) > 0, "No word embeddings"

    # Create directories to write training data to
    class_data_dir = os.path.join(data_dir, "classification")
    ner_data_dir = os.path.join(data_dir, "ner")
    os.makedirs(class_data_dir, exist_ok=True)
    os.makedirs(ner_data_dir, exist_ok=True)

    # Convert FST to training data
    class_data_path = os.path.join(class_data_dir, "train.txt")
    ner_data_path = os.path.join(ner_data_dir, "train.txt")

    # { intent: [ { 'text': ..., 'entities': { ... } }, ... ] }
    sentences_by_intent: Dict[str, Any] = {}

    # Get sentences for training
    do_sampling = self.profile.get("intent.flair.do_sampling", True)
    start_time = time.time()

    if do_sampling:
        # Sample from each intent FST
        num_samples = int(self.profile.get("intent.flair.num_samples", 10000))
        intent_map_path = self.profile.read_path(
            self.profile.get("training.intent.intent_map", "intent_map.json")
        )
        with open(intent_map_path, "r") as intent_map_file:
            intent_map = json.load(intent_map_file)

        # Gather FSTs for all known intents
        fsts_dir = self.profile.write_dir(
            self.profile.get("speech_to_text.fsts_dir")
        )
        intent_fst_paths = {
            intent_id: os.path.join(fsts_dir, f"{intent_id}.fst")
            for intent_id in intent_map.keys()
        }

        # Generate samples
        self._logger.debug(
            f"Generating {num_samples} sample(s) from {len(intent_fst_paths)} intent(s)"
        )
        sentences_by_intent = sample_sentences_by_intent(
            intent_fst_paths, num_samples
        )
    else:
        # Exhaustively generate all sentences
        self._logger.debug(
            "Generating all possible sentences (may take a long time)"
        )
        sentences_by_intent = make_sentences_by_intent(intent_fst)

    sentence_time = time.time() - start_time
    self._logger.debug(f"Generated sentences in {sentence_time} second(s)")

    # Get least common multiple in order to balance sentences by intent
    lcm_sentences = lcm(*(len(sents) for sents in sentences_by_intent.values()))

    # Generate examples
    class_sentences = []
    ner_sentences: Dict[str, List[Sentence]] = defaultdict(list)
    for intent_name, intent_sents in sentences_by_intent.items():
        num_repeats = max(1, lcm_sentences // len(intent_sents))
        for intent_sent in intent_sents:
            # Only train an intent classifier if there's more than one intent
            if len(sentences_by_intent) > 1:
                # Add balanced copies
                for i in range(num_repeats):
                    class_sent = Sentence(labels=[intent_name])
                    for word in intent_sent["tokens"]:
                        class_sent.add_token(Token(word))
                    class_sentences.append(class_sent)

            if len(intent_sent["entities"]) == 0:
                continue  # no entities, no sequence tagger

            # Named entity recognition (NER) example
            token_idx = 0
            entity_start = {ev["start"]: ev for ev in intent_sent["entities"]}
            entity_end = {ev["end"]: ev for ev in intent_sent["entities"]}
            entity = None

            word_tags = []
            for word in intent_sent["tokens"]:
                # Determine tag label
                tag = "O" if not entity else f"I-{entity}"
                if token_idx in entity_start:
                    entity = entity_start[token_idx]["entity"]
                    tag = f"B-{entity}"

                word_tags.append((word, tag))  # word ner
                token_idx += len(word) + 1

                if (token_idx - 1) in entity_end:
                    entity = None

            # Add balanced copies
            for i in range(num_repeats):
                ner_sent = Sentence()
                for word, tag in word_tags:
                    token = Token(word)
                    token.add_tag("ner", tag)
                    ner_sent.add_token(token)

                ner_sentences[intent_name].append(ner_sent)

    # Start training
    max_epochs = int(self.profile.get("intent.flair.max_epochs", 100))

    # Load word embeddings
    self._logger.debug(f"Loading word embeddings from {cache_dir}")
    word_embeddings = [
        FlairEmbeddings(os.path.join(cache_dir, "embeddings", e))
        for e in self.embeddings
    ]

    if len(class_sentences) > 0:
        self._logger.debug("Training intent classifier")

        # Random 80/10/10 split
        class_train, class_dev, class_test = self._split_data(class_sentences)
        class_corpus = TaggedCorpus(class_train, class_dev, class_test)

        # Intent classification
        doc_embeddings = DocumentRNNEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256,
        )

        classifier = TextClassifier(
            doc_embeddings,
            label_dictionary=class_corpus.make_label_dictionary(),
            multi_label=False,
        )

        self._logger.debug(
            f"Intent classifier has {len(class_sentences)} example(s)"
        )
        trainer = ModelTrainer(classifier, class_corpus)
        trainer.train(class_data_dir, max_epochs=max_epochs)
    else:
        self._logger.info("Skipping intent classifier training")

    if len(ner_sentences) > 0:
        self._logger.debug(f"Training {len(ner_sentences)} NER sequence tagger(s)")

        # Named entity recognition
        stacked_embeddings = StackedEmbeddings(word_embeddings)

        for intent_name, intent_ner_sents in ner_sentences.items():
            ner_train, ner_dev, ner_test = self._split_data(intent_ner_sents)
            ner_corpus = TaggedCorpus(ner_train, ner_dev, ner_test)

            tagger = SequenceTagger(
                hidden_size=256,
                embeddings=stacked_embeddings,
                tag_dictionary=ner_corpus.make_tag_dictionary(tag_type="ner"),
                tag_type="ner",
                use_crf=True,
            )

            ner_intent_dir = os.path.join(ner_data_dir, intent_name)
            os.makedirs(ner_intent_dir, exist_ok=True)

            self._logger.debug(
                f"NER tagger for {intent_name} has {len(intent_ner_sents)} example(s)"
            )
            trainer = ModelTrainer(tagger, ner_corpus)
            trainer.train(ner_intent_dir, max_epochs=max_epochs)
    else:
        self._logger.info("Skipping NER sequence tagger training")
    elif (val.value == 'b'):
        row.pred_neg = 0
        row.pred_pos = 1
    elif (val.value == 'a'):
        row.pred_neg = 0
        row.pred_pos = 0
    elif (val.value == 'd'):
        row.pred_neg = 1
        row.pred_pos = 0
    else:
        print("incorrectVal {}".format(val))
    return row


if __name__ == "__main__":
    classifier = TextClassifier.load('./data/POLARITY_MULTI/best-model.pt')
    data_to_pred = pd.read_csv('./data/test_gold.csv', sep='\t')
    data_to_pred['pred_pos'] = 0
    data_to_pred['pred_neg'] = 0
    data_to_pred = data_to_pred.apply(lambda row: addPredictionForPositive(row), axis=1)
    data_to_pred.head()

    data_to_save = data_to_pred[['idtwitter', 'subj', 'pred_pos', 'pred_neg', 'iro', 'lpos', 'lneg', 'top']]
    data_to_save.idtwitter = data_to_save.idtwitter.astype(str)
    data_to_save.subj = data_to_save.subj.astype(str)
    data_to_save.pred_pos = data_to_save.pred_pos.astype(str)  # was mistakenly cast from pred_neg
    data_to_save.pred_neg = data_to_save.pred_neg.astype(str)
    data_to_save.iro = data_to_save.iro.astype(str)
    data_to_save.lpos = data_to_save.lpos.astype(str)
    data_to_save.lneg = data_to_save.lneg.astype(str)
    data_to_save.top = data_to_save.top.astype(str)
print(label_dictionary)

flat_labels = [item for sublist in labels for item in sublist]
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=np.unique(flat_labels),
                                     y=flat_labels)
unique_labels = np.unique(flat_labels)
weights = {}
for i in range(len(unique_labels)):
    weights[unique_labels[i]] = class_weights[i]

document_embeddings = TransformerDocumentEmbeddings(params['version_model'], fine_tune=True)

classifier = TextClassifier(document_embeddings,
                            label_dictionary=label_dictionary,
                            loss_weights=weights,
                            multi_label=False)

trainer = ModelTrainer(classifier, corpus)
trainer.train(params['model_dir'],
              learning_rate=params['learning_rate'],
              mini_batch_size=params['batch_size'],
              anneal_factor=params['anneal_factor'],
              patience=params['patience'],
              max_epochs=params['epochs'],
              embeddings_storage_mode=params['embeddings_storage_mode'])

# print_predictions(trainer, tokens_test, params['results'] + 'gloveSentence')
test_run = args.test_run
nrows = args.nrows
model_type = args.model

# read data
df = datatable.fread(f"./data/from_USNavy_for_flair.csv").to_pandas()
print(len(df))
df = df.head(nrows)

data = df.copy()
data = data[['sentence', "row"]]

# load classifier
from flair.models import TextClassifier
from flair.data import Sentence

model = TextClassifier.load(
    f'./data/model_{model_type}_{fold}/_{test_run}_best-model.pt')
document_embeddings = model.document_embeddings

# prepare df for output to csv

# batch size for embedding tweet instances
bs = 32
tweets_to_embed = data['sentence'].copy()

print("beginning embedding")

# prepare mini batches
low_limits = list()
for x in range(0, len(tweets_to_embed), bs):
    low_limits.append(x)
up_limits = [x + bs for x in low_limits[:-1]]
up_limits.append(len(tweets_to_embed))
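# A plausible continuation of the batching code above (an assumption, not part of
# the original snippet): embed each mini batch with the classifier's document
# embeddings and collect one vector per tweet for later export.
embedded_vectors = []
for low, up in zip(low_limits, up_limits):
    batch_sentences = [Sentence(str(text)) for text in tweets_to_embed[low:up]]
    document_embeddings.embed(batch_sentences)
    for sent in batch_sentences:
        # detach to CPU/numpy so the vectors can be written to csv afterwards
        embedded_vectors.append(sent.get_embedding().detach().cpu().numpy())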
    # FlairEmbeddings('news-forward'),
    # FlairEmbeddings('news-backward'),
]

# 4. initialize document embedding by passing list of word embeddings
# Can choose between many RNN types (GRU by default, to change use rnn_type parameter)
document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

# 5. create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)

# 6. initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)

# 7. start the training
trainer.train('resources/taggers/ag_news',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=5,
              max_epochs=150)

# 8. plot training curves (optional)
from flair.visual.training_curves import Plotter
import numpy as np
from nlppreprocess import NLP

app = Flask(__name__)
app.secret_key = "super_secret_key"

APP_ROOT = os.path.dirname(os.path.abspath(__file__))
UPLOAD_FOLDER = os.path.join(APP_ROOT, 'uploads')
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER

# # app.config['SECRET_KEY'] = 'oh_so_secret'
# app.config['MONGO_DBNAME'] = 'exposeModel'
# app.config['MONGO_URI'] = 'mongodb://*****:*****


@app.route('/', methods=['GET'])
def index():
    return jsonify("welcome to Sadeeq API")


@app.route('/api/tasks', methods=['GET'])
def get_result():
    result = []
    try:
        # result = session['my_result']
        result = modelResult
def trainer(file_path: Path, filenames: Tuple[str, str, str], checkpoint: str,
            stack: str, n_epochs: int) -> None:
    """Train sentiment model using the Flair NLP library:
    https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md

    To help provide added context, we can stack Glove, Bert or ELMo embeddings along with Flair embeddings.
    """
    # pip install flair allennlp
    from flair.datasets import ClassificationCorpus
    from flair.embeddings import FlairEmbeddings, DocumentRNNEmbeddings
    from flair.models import TextClassifier
    from flair.trainers import ModelTrainer
    from flair.training_utils import EvaluationMetric
    from flair.visual.training_curves import Plotter

    if stack == "glove":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('glove')
    elif stack == "elmo":
        from flair.embeddings import ELMoEmbeddings
        stacked_embedding = ELMoEmbeddings('original')
    elif stack == "bert":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-cased')
    else:
        stacked_embedding = None

    # Define and load corpus from the provided dataset
    train, dev, test = filenames
    corpus = ClassificationCorpus(
        file_path,
        train_file=train,
        dev_file=dev,
        test_file=test,
    )

    # Create label dictionary from provided labels in data
    label_dict = corpus.make_label_dictionary()

    # Stack Flair string-embeddings with optional embeddings
    word_embeddings = list(
        filter(None, [
            stacked_embedding,
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward'),
        ]))

    # Initialize document embedding by passing list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )

    # Define classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    if not checkpoint:
        trainer = ModelTrainer(classifier, corpus)
    else:
        # If checkpoint file is defined, resume training
        checkpoint = classifier.load_checkpoint(Path(checkpoint))
        trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)

    # Begin training (enable checkpointing to continue training at a later time, if desired)
    trainer.train(file_path,
                  EvaluationMetric.MACRO_F1_SCORE,
                  max_epochs=n_epochs,
                  checkpoint=True)

    # Plot curves and store weights and losses
    plotter = Plotter()
    plotter.plot_training_curves(file_path / 'loss.tsv')
    plotter.plot_weights(file_path / 'weights.txt')
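# A minimal usage sketch for trainer() above. The directory and file names below
# are placeholders for illustration; any folder containing files in Flair's
# ClassificationCorpus format ("__label__<label> <text>" per line) should work.
from pathlib import Path

trainer(file_path=Path("data/sentiment"),
        filenames=("train.txt", "dev.txt", "test.txt"),
        checkpoint=None,
        stack="glove",
        n_epochs=10)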
    WordEmbeddings('glove'),
    # FlairEmbeddings('news-forward'),
    # FlairEmbeddings('news-backward'),
]

# initialize document embedding by passing list of word embeddings
# Can choose between many RNN types (GRU by default, to change use rnn_type parameter)
document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256)

# create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)

# initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)
trainer.train('resources/clf',
              learning_rate=0.1,
              mini_batch_size=64,
              anneal_factor=0.5,
              patience=1,
              max_epochs=60)

classifier = TextClassifier.load('resources/clf/final-model.pt')

# create example sentence
f1_list = []
mcc_list = []
f1sub_list = []
f1perf_list = []
f1rec_list = []
f1tec_list = []
f1time_list = []
f1ll_list = []
bas_list = []

# main loop
for fold in range(k_folds):

    # load pre-trained classifier
    try:
        subject_classifier = TextClassifier.load(
            './data/{}/model_subject_category_{}/{}_best-model.pt'.format(
                dataset, fold, model_type))
    except FileNotFoundError:
        print("----- Does such test run exist? Try another name. -----")
        quit()

    # placeholder for results of predictions
    flair_subject = []

    # add timestamp before all classification takes place
    time_schedule.append(time.perf_counter())

    # Main Analysis Loop: start analysis
    for i in range(len(data)):

        # get the sentence to be analyzed
        sent = [str(data.iloc[i, 2])]
class TARSClassifier(FewshotClassifier):
    """
    TARS model for text classification. In the backend, the model uses a BERT based binary text classifier
    which, given a <label, text> pair, predicts the probability of two classes: "True" and "False".
    The input data is a usual Sentence object which is inflated by the model internally before pushing it
    through the transformer stack of BERT.
    """

    static_label_type = "tars_label"

    def __init__(
            self,
            task_name: str,
            label_dictionary: Dictionary,
            label_type: str,
            embeddings: str = 'bert-base-uncased',
            num_negative_labels_to_sample: int = 2,
            prefix: bool = True,
            **tagger_args,
    ):
        """
        Initializes a TextClassifier
        :param task_name: a string depicting the name of the task
        :param label_dictionary: dictionary of labels you want to predict
        :param embeddings: name of the pre-trained transformer model, e.g. 'bert-base-uncased'
        :param num_negative_labels_to_sample: number of negative labels to sample for each positive label
        against a sentence during training. Defaults to 2 negative labels for each positive label.
        The model samples all negative labels if None is passed, which slows down training considerably.
        :param multi_label: auto-detected by default, but you can set this to True to force multi-label
        prediction or False to force single-label prediction
        :param multi_label_threshold: If multi-label, you can set the threshold to make predictions
        :param beta: Parameter for F-beta score for evaluation and training annealing
        """
        super(TARSClassifier, self).__init__()

        from flair.embeddings import TransformerDocumentEmbeddings

        if not isinstance(embeddings, TransformerDocumentEmbeddings):
            embeddings = TransformerDocumentEmbeddings(model=embeddings,
                                                       fine_tune=True,
                                                       layers='-1',
                                                       layer_mean=False,
                                                       )

        # prepare TARS dictionary
        tars_dictionary = Dictionary(add_unk=False)
        tars_dictionary.add_item('False')
        tars_dictionary.add_item('True')

        # initialize a bare-bones sequence tagger
        self.tars_model = TextClassifier(document_embeddings=embeddings,
                                         label_dictionary=tars_dictionary,
                                         label_type=self.static_label_type,
                                         **tagger_args,
                                         )

        # transformer separator
        self.separator = str(self.tars_embeddings.tokenizer.sep_token)
        if self.tars_embeddings.tokenizer._bos_token:
            self.separator += str(self.tars_embeddings.tokenizer.bos_token)

        self.prefix = prefix
        self.num_negative_labels_to_sample = num_negative_labels_to_sample

        # Store task specific labels since TARS can handle multiple tasks
        self.add_and_switch_to_new_task(task_name, label_dictionary, label_type)

    def _get_tars_formatted_sentence(self, label, sentence):
        original_text = sentence.to_tokenized_string()

        label_text_pair = f"{label} {self.separator} {original_text}" if self.prefix \
            else f"{original_text} {self.separator} {label}"

        sentence_labels = [label.value for label in sentence.get_labels(self.get_current_label_type())]

        tars_label = "True" if label in sentence_labels else "False"

        tars_sentence = Sentence(label_text_pair, use_tokenizer=False).add_label(self.static_label_type, tars_label)

        return tars_sentence

    def _get_state_dict(self):
        model_state = {
            "state_dict": self.state_dict(),
            "current_task": self._current_task,
            "label_type": self.get_current_label_type(),
            "label_dictionary": self.get_current_label_dictionary(),
            "tars_model": self.tars_model,
            "num_negative_labels_to_sample": self.num_negative_labels_to_sample,
            "task_specific_attributes": self._task_specific_attributes,
        }
        return model_state

    @staticmethod
    def _init_model_with_state_dict(state):
        print("init TARS")

        # init new TARS classifier
        label_dictionary = state["label_dictionary"]

        model: TARSClassifier = TARSClassifier(
            task_name=state["current_task"],
            label_dictionary=label_dictionary,
            label_type=state["label_type"],
            embeddings=state["tars_model"].document_embeddings,
            num_negative_labels_to_sample=state["num_negative_labels_to_sample"],
        )

        # set all task information
        model.task_specific_attributes = state["task_specific_attributes"]

        # linear layers of internal classifier
        model.load_state_dict(state["state_dict"])

        return model

    @staticmethod
    def _fetch_model(model_name) -> str:
        model_map = {}
        hu_path: str = "https://nlp.informatik.hu-berlin.de/resources/models"

        model_map["tars-base"] = "/".join([hu_path, "tars-base", "tars-base-v8.pt"])

        cache_dir = Path("models")
        if model_name in model_map:
            model_name = cached_path(model_map[model_name], cache_dir=cache_dir)

        return model_name

    @property
    def tars_embeddings(self):
        return self.tars_model.document_embeddings

    def predict(
            self,
            sentences: Union[List[Sentence], Sentence],
            mini_batch_size=32,
            verbose: bool = False,
            label_name: Optional[str] = None,
            return_loss=False,
            embedding_storage_mode="none",
    ):
        # return
        """
        Predict sequence tags for Named Entity Recognition task
        :param sentences: a Sentence or a List of Sentence
        :param mini_batch_size: size of the minibatch; usually bigger is faster but consumes more memory,
        up to a point where it has no more effect.
        :param all_tag_prob: True to compute the score for each tag on each token,
        otherwise only the score of the best tag is returned
        :param verbose: set to True to display a progress bar
        :param return_loss: set to True to return loss
        :param label_name: set this to change the name of the label type that is predicted
        :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if
        you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.
        'gpu' to store embeddings in GPU memory.
        """
        if label_name is None:
            label_name = self.get_current_label_type()

        # with torch.no_grad():
        if not sentences:
            return sentences

        if isinstance(sentences, Sentence):
            sentences = [sentences]

        # set context if not set already
        previous_sentence = None
        for sentence in sentences:
            if sentence.is_context_set():
                continue
            sentence._previous_sentence = previous_sentence
            sentence._next_sentence = None
            if previous_sentence:
                previous_sentence._next_sentence = sentence
            previous_sentence = sentence

        # reverse sort all sequences by their length
        rev_order_len_index = sorted(range(len(sentences)), key=lambda k: len(sentences[k]), reverse=True)

        reordered_sentences: List[Union[Sentence, str]] = [sentences[index] for index in rev_order_len_index]

        dataloader = DataLoader(dataset=SentenceDataset(reordered_sentences), batch_size=mini_batch_size)

        # progress bar for verbosity
        if verbose:
            dataloader = tqdm(dataloader)

        overall_loss = 0
        overall_count = 0
        batch_no = 0
        with torch.no_grad():
            for batch in dataloader:

                batch_no += 1

                if verbose:
                    dataloader.set_description(f"Inferencing on batch {batch_no}")

                batch = self._filter_empty_sentences(batch)
                # stop if all sentences are empty
                if not batch:
                    continue

                # go through each sentence in the batch
                for sentence in batch:

                    # always remove tags first
                    sentence.remove_labels(label_name)

                    all_labels = [label.decode("utf-8") for label in self.get_current_label_dictionary().idx2item]

                    all_detected = {}
                    for label in all_labels:
                        tars_sentence = self._get_tars_formatted_sentence(label, sentence)

                        loss_and_count = self.tars_model.predict(tars_sentence,
                                                                 label_name=label_name,
                                                                 return_loss=True)
                        overall_loss += loss_and_count[0].item()
                        overall_count += loss_and_count[1]

                        predicted_tars_label = tars_sentence.get_labels(label_name)[0]
                        if predicted_tars_label.value == "True":
                            sentence.add_label(label_name, label, predicted_tars_label.score)

                # clearing token embeddings to save memory
                store_embeddings(batch, storage_mode=embedding_storage_mode)

        if return_loss:
            return overall_loss, overall_count
#!/usr/bin/env python
# coding: utf-8

# In[1]:

from flair.models import TextClassifier
from flair.data import Sentence

classifier = TextClassifier.load('en-sentiment')
sentence = Sentence('Flair is pretty neat!')
classifier.predict(sentence)

# print sentence with predicted labels
print('Sentence above is: ', sentence.labels)


# In[1]:

import pandas as pd

data = pd.read_csv("./spam.csv", encoding='latin-1').sample(frac=1).drop_duplicates()

data = data[['v1', 'v2']].rename(columns={"v1": "label", "v2": "text"})

data['label'] = '__label__' + data['label'].astype(str)

data.iloc[0:int(len(data) * 0.8)].to_csv('train.csv', sep='\t', index=False, header=False)
data.iloc[int(len(data) * 0.8):int(len(data) * 0.9)].to_csv('test.csv', sep='\t', index=False, header=False)
data.iloc[int(len(data) * 0.9):].to_csv('dev.csv', sep='\t', index=False, header=False)


# In[2]:
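# The notebook's next cell is not included here. A plausible continuation (an
# assumption, not the original code) would load the three FastText-formatted
# files written above into a Flair ClassificationCorpus before training:
from flair.datasets import ClassificationCorpus

corpus = ClassificationCorpus('./',
                              train_file='train.csv',
                              dev_file='dev.csv',
                              test_file='test.csv')
print(corpus)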
    except FileNotFoundError:
        print(
            "File couldn't be found. Verify if '{f_path}' is a correct file path!"
            .format(f_path=csv_file))
        exit(1)


if __name__ == '__main__':
    print(">>> STARTING PROGRAM")
    df = load_dataset_from(DATASET_FILE)
    # To verify if data loaded correctly:
    # print(df.head(10))

    classifier = TextClassifier.load('sentiment')
    tmp_negatives = {}
    tmp_positives = {}

    print(f'Number of players: {len(df["Player"].unique())}')
    for player_name in df["Player"].unique():
        tmp_negatives[player_name] = list()
        tmp_positives[player_name] = list()

    for dp in df.values:
        l = dp.tolist()
        sentence = Sentence(l[-1])
        classifier.predict(sentence)
        if sentence.labels[0].value == "NEGATIVE":
def __init__(self, model_name_or_path: str):
    self.classifier = TextClassifier.load(model_name_or_path)
def __init__(self, c_classifier_file, p_classifier_file):
    self.classifier_c = TextClassifier.load_from_file(c_classifier_file)
    self.classifier_p = TextClassifier.load_from_file(p_classifier_file)
# if not universal sentence encoder
if not _use:

    # load Flair
    import torch
    import flair
    from flair.models import TextClassifier

    # load various embeddings
    from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings, RoBERTaEmbeddings
    from flair.data import Sentence

    # if trained embeddings
    if not pool:
        # embeddings trained to the "downstream task"
        model = TextClassifier.load(
            f'./data/model_sentiment_{fold}/{test_run}_best-model.pt')
        document_embeddings = model.document_embeddings

    # if simple pooled embeddings
    else:
        if test_run == "fasttext":
            document_embeddings = DocumentPoolEmbeddings(
                [WordEmbeddings('en-twitter')])
        elif test_run == "roberta":
            document_embeddings = DocumentPoolEmbeddings([
                RoBERTaEmbeddings(
                    pretrained_model_name_or_path="roberta-large",
                    layers="21,22,23,24",
                    pooling_operation="first",
                    use_scalar_mix=True)
            ])
def initialize_training(text_column_index,
                        label_column_index,
                        delimiter=';',
                        model_type=None,
                        model=None,
                        max_epochs=10,
                        patience=3,
                        use_amp=0,
                        calc_class_weights=0):
    """
    Create a text classification model using FLAIR, SentenceTransformers and Huggingface Transformers.

    Params:
    data_folder_path: Folder path with each file titled appropriately i.e. train.csv test.csv dev.csv.
                      Will create an 80/10/10 split if only train is supplied.
    output_folder_path: Folder path for storing the best model & checkpoints.
    text_column_index: In which index (starting from 0) the input column is located.
    label_column_index: In which index (starting from 0) the label column is located.
    delimiter: type of delimiter used in the .csv file.
    model_type: SentenceTransformerDocumentEmbeddings or TransformerDocumentEmbeddings
    model: Which model to use.
    max_epochs: Number of epochs to train the model for.
    patience: Number of epochs without improvement before adjusting the learning rate.
    use_amp: Whether to enable automatic mixed precision (AMP).
    calc_class_weights: Whether to create a dictionary with class weights to deal with imbalanced datasets.

    Output:
    best-model.pt
    final-model.pt
    training.log
    """
    # 1. Column format indicating which columns hold the text and label(s)
    column_name_map = {
        text_column_index: "text",
        label_column_index: "label_topic"
    }

    # 2. Load corpus containing training, test and dev data.
    corpus: Corpus = CSVClassificationCorpus("/root/text-classification/data/",
                                             column_name_map,
                                             skip_header=True,
                                             delimiter=delimiter)

    # Print statistics about the corpus.
    training_data_statistics = corpus.obtain_statistics()
    print(training_data_statistics)

    # 3A. Create a label dictionary.
    label_dict = corpus.make_label_dictionary()

    # 3B. Calculate class weights.
    if bool(calc_class_weights):
        weight_dict = create_weight_dict(delimiter=delimiter, label_index=label_column_index)
    else:
        weight_dict = None

    # 4. Initialize the document embeddings for the chosen model type.
    if model_type == "SentenceTransformerDocumentEmbeddings":
        document_embeddings = SentenceTransformerDocumentEmbeddings(model)
    elif model_type == "TransformerDocumentEmbeddings":
        document_embeddings = TransformerDocumentEmbeddings(model, fine_tune=True)
    elif model_type == "WordEmbeddings":
        word_embeddings = [WordEmbeddings(model)]
        document_embeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=256)
    elif model_type == "StackedEmbeddings":
        document_embeddings = DocumentRNNEmbeddings([
            WordEmbeddings('glove'),
            FlairEmbeddings(model + '-backward'),
            FlairEmbeddings(model + '-forward')
        ])
    else:
        raise Exception(
            "Pick SentenceTransformerDocumentEmbeddings, StackedEmbeddings, WordEmbeddings or TransformerDocumentEmbeddings."
        )

    # 5. create the text classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                loss_weights=weight_dict)

    # 6. initialize the text classifier trainer with Adam optimizer
    trainer = ModelTrainer(classifier, corpus, optimizer=Adam, use_tensorboard=False)

    # 7. start the training
    trainer.train("/root/text-classification/checkpoint/",
                  learning_rate=3e-5,
                  max_epochs=max_epochs,
                  patience=patience,
                  use_amp=bool(use_amp),
                  checkpoint=True,
                  mini_batch_size=16,
                  mini_batch_chunk_size=4)
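# A minimal usage sketch for initialize_training above. The training data is read
# from the hard-coded /root/text-classification/data/ folder, so train.csv (and
# optionally dev.csv / test.csv) must already exist there; the model name below is
# just an example Hugging Face checkpoint, not a recommendation from the original code.
initialize_training(text_column_index=1,
                    label_column_index=0,
                    delimiter=';',
                    model_type="TransformerDocumentEmbeddings",
                    model="distilbert-base-uncased",
                    max_epochs=5,
                    calc_class_weights=0)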
# In[125]:

# initialize document embedding by passing list of word embeddings
document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(stacked_embeddings,
                                                                   hidden_size=512,
                                                                   reproject_words=True,
                                                                   reproject_words_dimension=256,
                                                                   )


# In[126]:

# create text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)


# In[127]:

# initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)


# In[ ]:

trainer.train('FLAIR_data\\BERT',
              learning_rate=0.1,
              mini_batch_size=16,
def main(data_folder, benchmark_classifier_folder, new_data_folder, finetuned_classifier_folder):
    from flair.embeddings import FlairEmbeddings, DocumentLSTMEmbeddings, BertEmbeddings, DocumentRNNEmbeddings, TransformerDocumentEmbeddings
    from flair.models import TextClassifier
    from flair.trainers import ModelTrainer
    from flair.datasets import CSVClassificationCorpus
    from flair.data import Corpus
    import pandas as pd
    import os

    ### First Stage (Train on benchmark dataset)
    benchmark = pd.read_csv(data_folder + "combined_benchmark.csv")
    benchmark = benchmark[['label', 'text']]

    #### Create train, dev and test set
    # benchmark = benchmark.sample(frac=1)  # without a fixed random state, every run gives a different training result
    benchmark = benchmark.sample(frac=1, random_state=42)
    benchmark.iloc[0:int(len(benchmark) * 0.8)].to_csv(data_folder + 'train.csv',
                                                       sep='\t', index=False, header=False)
    benchmark.iloc[int(len(benchmark) * 0.8):int(len(benchmark) * 0.9)].to_csv(
        data_folder + 'test.csv', sep='\t', index=False, header=False)
    benchmark.iloc[int(len(benchmark) * 0.9):].to_csv(data_folder + 'dev.csv',
                                                      sep='\t', index=False, header=False)

    #### Build corpus
    column_name_map = {1: "text", 0: "label_topic"}
    corpus: Corpus = CSVClassificationCorpus(
        data_folder,
        column_name_map,
        skip_header=False,  # no header in kaggle data
        delimiter='\t',  # tab separated rows
        # train_file='train.csv',  # pass in file names manually when they can't be auto-detected
        # dev_file='dev.csv',
        # test_file='test.csv'
    )

    #### Create word embeddings
    word_embeddings = [
        BertEmbeddings(),
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast')
    ]
    ## caveat: deprecation issue. BertEmbeddings and DocumentLSTMEmbeddings existed in version 0.4.5,
    ## and became legacy embeddings (still available) in version 0.5

    #### First Stage Fine-tuning
    document_embeddings = DocumentLSTMEmbeddings(word_embeddings,
                                                 hidden_size=512,
                                                 reproject_words=True,
                                                 reproject_words_dimension=256)
    classifier = TextClassifier(
        document_embeddings,
        label_dictionary=corpus.make_label_dictionary(),
        multi_label=False)
    trainer = ModelTrainer(classifier, corpus)
    # trainer.train(benchmark_classifier_folder, max_epochs=1)  # for offline tests use epoch=1
    trainer.train(benchmark_classifier_folder, max_epochs=10)
    ### every fine-tuning run results in slightly different scores
    ### accuracy at phase-1 fine-tuning does not matter too much; phase-2 scores are more important
    ### in biasing the models towards learning indicator-specific keywords

    ### Second Stage (train on hand-annotated datasets)
    #### Build corpus
    ### this column_name_map must be updated to reflect which column stores the X (text features)
    ### and y (gold labels) used for training
    ### in the csv file contained in new_data_folder, the 2nd column is 'title_desc',
    ### the 4th column is 'title_desc_sent_1' (where the agreed sentiment annotations are stored)
    new_column_name_map = {1: "text", 3: "label_topic"}
    print(new_column_name_map)
    corpus: Corpus = CSVClassificationCorpus(
        new_data_folder,
        new_column_name_map,
        skip_header=True,
        delimiter=','  # comma separated rows
    )

    #### Second Stage fine-tuning
    benchmark_classifier = TextClassifier.load(
        os.path.join(benchmark_classifier_folder, 'best-model.pt'))
    trainer = ModelTrainer(benchmark_classifier, corpus)
    # trainer.train(finetuned_classifier_folder, max_epochs=1)  # for offline tests
    trainer.train(finetuned_classifier_folder, max_epochs=10)
def spamcleaner(text):
    classifier = TextClassifier.load('./utils/model/best-model.pt')
    sentence = Sentence(text)
    classifier.predict(sentence)
    print(sentence.labels)
    return str(sentence.labels)