def main():
    

    st.title("Tweet Screener")
    st.subheader("*Guaranteeing 2020 proof tweets to the masses*")
    st.sidebar.image("image_resources/DSI-logo.jpg", use_column_width = True)
    
    st.sidebar.subheader("Africa DSI NLP Project by Team 2")
    st.sidebar.write("Catherine, Fanamby, Malcolm, and Martin")
    section = st.sidebar.radio('Sections to Visit',('Swear Word Analyser', 'Sentiment Analyser', 'Topic Identifier'))

    
    publish = st.sidebar.button(label="Publish Tweet!")

    # Initialise the per-section inputs so the publish handler at the bottom of
    # main() does not hit a NameError when a section's text area was never rendered.
    sentence = sentSentence = topicSentence = ""

    if section == 'Swear Word Analyser':
        st.subheader('Swear Word Analyser')
        blacklist = load_screener()


        sentence = st.text_area('Input your message/tweet here:')
       

        if sentence:
            # Pre-process tweet
            answer = PS.profanityscreen(sentence, blacklist, True)
            st.subheader("Swear Analysis Results:")
            
            # Show predictions

            st.write('Swear Words Found:')
            st.dataframe(pd.DataFrame(answer[1], columns=["Swear Words"]))

            st.write('Your Censored Tweet:')
            st.write(answer[0])

            
                
    if section == "Sentiment Analyser":
        st.subheader('Sentiment Analyser')
        sentSentence = st.text_area('Input your message/tweet here:')
        
        if sentSentence:
            sentimentTweet = Sentence(preprocess(sentSentence))
            emoteTweet = Sentence(preprocess(sentSentence))
            
            #Sentiment Dictionaries
            sentiment_dict = {'0': 'Negative', '4': 'Positive'}
            emote_dict = {'0': 'Anger', '1': 'Fear', '2': 'Joy', '3': 'Love', '4': 'Sadness', '5': 'Surprise'}
            emoji_dict = {'0': ':rage:', '1': ':fearful:', '2': ':joy:', '3': ':heart_eyes:', '4': ':cry:', '5': ':astonished:'}
            basic_emo_dict = {"0": ':rage:', "4": ":smile:"}
            with st.spinner("Weeeeeee......."):
                SentClassifier = TextClassifier.load('twitter_sentiment/model-saves/final-model.pt')
                EmoteClassifier = TextClassifier.load('twitter_sentiment/model-saves/emotion-model.pt')
                SentClassifier.predict(sentimentTweet)
                EmoteClassifier.predict(emoteTweet)
                
            st.subheader("Sentiment Analysis Results:")

            predSent = sentimentTweet.labels[0]
            predSText = sentiment_dict[predSent.value[0]]
            
            st.markdown(f"Your sentence is {predSText} {basic_emo_dict[predSent.value[0]]} "
                        f"with {predSent.score*100:.2f}% confidence")

            predEmote = emoteTweet.labels[0]
            predEText = emote_dict[predEmote.value[0]]
            st.markdown(f"Your sentence is predicted to portray {predEText} {emoji_dict[predEmote.value[0]]} "
                        f"with {predEmote.score*100:.2f}% confidence")
            
    
    if section == "Topic Identifier":
        st.subheader("Topic Identifier")
        topicSentence = st.text_area('Input your message/tweet here:')

        if topicSentence:
            st.subheader("Sensitivity Analysis Results:")
            topicTweet = preprocess_test(topicSentence)
            
            topic_dict = {0: "obscenity", 1: "violence", 2: "verbal abuse", 3: "identity hate speech", 4: "hate speech", 5: "offense", 6: "neither"}
            
            policies_dict = {0 : "https://help.twitter.com/en/safety-and-security/offensive-tweets-and-content", 
                 1 : "https://help.twitter.com/en/rules-and-policies/violent-threats-glorification",
                 2 : "https://help.twitter.com/en/rules-and-policies/abusive-behavior",
                 3 : "https://help.twitter.com/en/rules-and-policies/hateful-conduct-policy",
                 4 : "https://help.twitter.com/en/rules-and-policies/hateful-conduct-policy",
                 5 : "https://help.twitter.com/en/safety-and-security/offensive-tweets-and-content"}

            twitter_rules = "https://help.twitter.com/en/rules-and-policies#general-policies"

            with st.spinner("Predicting..."):
                TopicClassifier =  tf.keras.models.load_model('Topic Identifier/model_saves/topic_identifier_model.h5')
                topic_pred = TopicClassifier.predict(topicTweet)
            
            
            topTopic = topic_pred.argmax(1)[0]
            topTopicText = topic_dict[topic_pred.argmax(1)[0]]
            
            columns = ["obscenity", "violence", "verbal abuse", "identity hate speech", "hate speech", "offense", "neither"]
            graph_pred = pd.DataFrame(topic_pred, columns=columns)
            if topTopic != 6:
                st.write(f"Your tweet may contain sentences that promote {topTopicText} "
                         f"with {topic_pred[0][topTopic]*100:.2f}% confidence")
                st.write("Please review Twitter's Rules and policies: " + twitter_rules)
                st.write("And Twitter's " + topTopicText + " policy: " + policies_dict[topTopic])
            else:
                st.write("Your tweet is fine in terms of policy.")
        
            plt.bar(x=columns, height=topic_pred.flatten(), width=1)
            plt.title('Policy Breaking Likelihood')
            plt.xticks(rotation=45)
            plt.xlabel('Twitter Policies (Topics)')
            plt.ylabel('Probability of Violation')
            st.pyplot()
            st.write(topic_pred)
              
    if publish:

        if sentence:
            # Swear-word section: no sentiment label is produced here.
            sentiment = "None"
            publish_tweet(sentiment, sentence)

        elif sentSentence:
            # Sentiment section: store the predicted emotion label.
            sentiment = predEText
            sentence = sentSentence
            publish_tweet(sentiment, sentence)

        elif topicSentence:
            # Topic section: no sentiment prediction is made here either.
            sentiment = "None"
            sentence = topicSentence
            publish_tweet(sentiment, sentence)

        else:
            st.sidebar.write("You haven't written or analysed your tweet yet.")
        

    st.sidebar.markdown("This   application helps determine how problematic your tweet is before publishing it."
                    + " We utilise three main tools to achieve this."
                    +" A swear word analyser that checks your tweet for profanity and delivers a censored tweet."
                    +" A sentiment analyser that predicts the emotion in your tweet, to check if you were really being positive."
                    +" Finally a topic identifier which determines if you broke one of Twitter's policies with out knowing it!"
                    +" Once you have thouroughly scrubbed you tweet you may store your results for further analyses.")
Example #2
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings, Sentence
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import BertEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path
glove_embedding = WordEmbeddings('glove')
bert_embedding = BertEmbeddings('bert-base-uncased')
corpus = NLPTaskDataFetcher.load_classification_corpus(Path('./'),
                                                       test_file='test.csv',
                                                       dev_file='dev.csv',
                                                       train_file='train.csv')
document_embeddings = DocumentPoolEmbeddings([bert_embedding, glove_embedding])
classifier = TextClassifier(document_embeddings,
                            label_dictionary=corpus.make_label_dictionary(),
                            multi_label=True)
trainer = ModelTrainer(classifier, corpus)
trainer.train('./', max_epochs=10)
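After training, flair writes best-model.pt and final-model.pt into the output directory passed to trainer.train ('./' above). A minimal sketch of loading the result and classifying a new sentence:

from flair.data import Sentence
from flair.models import TextClassifier

# load the model produced by trainer.train('./', ...) above
classifier = TextClassifier.load('./final-model.pt')

sentence = Sentence('This phone is surprisingly good value.')
classifier.predict(sentence)
print(sentence.labels)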
Example #3
)

stats = corpus.obtain_statistics()
print(stats)

# create the label dictionary
label_dict = corpus.make_label_dictionary()
print(label_dict)

# make a list of word embeddings
embeddings = [OneHotEmbeddings(corpus=corpus)]

# initialize document embedding by passing list of word embeddings
# Can choose between many RNN types (GRU by default, to change use rnn_type parameter)
document_embeddings = DocumentRNNEmbeddings(embeddings,
                                            bidirectional=True,
                                            hidden_size=256)

# create the text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)

# initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)

# start the training
trainer.train('resources/',
              learning_rate=0.1,
              mini_batch_size=128,
              anneal_factor=0.5,
              patience=5,
              max_epochs=20)
Example #4
#from bson.objectid import ObjectId
from flask_cors import CORS
from flair.models import TextClassifier
from flair.data import Sentence
from flask import session

app = Flask(__name__)
app.secret_key = "super_secret_key"
# # app.config['SECRET_KEY'] = 'oh_so_secret'

# app.config['MONGO_DBNAME'] = 'exposeModel'
# app.config['MONGO_URI'] = 'mongodb://*****:*****'


@app.route('/', methods=['GET'])
def index():
    return jsonify("welcome to Arafa API")


@app.route('/api/tasks', methods=['GET'])
def get_result():
    result = []
    try:
        data_result = session['my_result']
        result.append({
            'title': data_result['title'],
            'tag': data_result['tag']
Example #5
def upload_file():
    uploaded_file = request.files['file']
    if uploaded_file.filename != '':
        ## use uploaded_file as the pandas input
        ## all the logic for building the positive and negative dictionaries goes here
        df = load_dataset_from(uploaded_file)

        # To verify if data loaded correctly:
        # print(df.head(10))
        print(df.head(10))
        classifier = TextClassifier.load('sentiment')

        tmp_negatives = {}
        tmp_positives = {}

        print(f'Number of players: {len(df["Player"].unique())}')

        for player_name in df["Player"].unique():
            tmp_negatives[player_name] = list()
            tmp_positives[player_name] = list()

        for dp in df.values:
            l = dp.tolist()
            sentence = Sentence(l[-1])
            classifier.predict(sentence)
            if sentence.labels[0].value == "NEGATIVE":
                tmp_negatives[l[-2]].append(sentence.labels[0].score)
            elif sentence.labels[0].value == "POSITIVE":
                tmp_positives[l[-2]].append(sentence.labels[0].score)

        negative = {}
        positive = {}

        for player in tmp_negatives.keys():
            if len(tmp_negatives[player]) > 10:
                negative[player] = sum(tmp_negatives[player]) / len(
                    tmp_negatives[player])

        for player in tmp_positives.keys():
            if len(tmp_positives[player]) > 10:
                positive[player] = sum(tmp_positives[player]) / len(
                    tmp_positives[player])

        print(negative)
        print(positive)

        # for player in tmp.keys():
        #     tmp[player] = Sentence(tmp[player])
        #     classifier.predict(tmp[player])
        #     if tmp[player].labels[0].value == "NEGATIVE":
        #         negative[player] = tmp[player].labels[0].score
        #     elif tmp[player].labels[0].value == "POSITIVE":
        #         positive[player] = tmp[player].labels[0].score
        #

        positive_json = json.dumps(positive, indent=4)
        negative_json = json.dumps(negative, indent=4)
        # avoid shadowing the built-in list
        results = {}
        results[0] = positive_json
        results[1] = negative_json
        return results
Example #6
                  'Buenos dias, vamos a hacer algunos recados y a empezar el dia con energia!!',  # 3
                  '@mireiaescribano justo cuando se terminan las fiestas de verano, me viene genial',  # 3
                  'No sabes cuantas horas, minutos y segundos espero para volver a ver esa sonrisa que tanto me gusta ver salir de ti',  # 0
                  '@cinthiaszc jajajaja me vas a decir a mi mi abuela cocina tan rico que mando al tacho la dieta :v',  # 0
                  'te adoroVen a Perú pls'  # 3
                  ]
label_dictionary = ['0', '1', '2', '3']

fasttext_path = '../fasttext/models/{}_{}'.format('intertass', FT_MODEL_NAME)
fasttext_model = fasttext.load_model(path=fasttext_path)
dev_fasttext_probabilities, dev_fasttext_predictions = fasttext_embedding.predict_with_fasttext_model(
    fasttext_model, tweets_to_test, label_dictionary)
print(dev_fasttext_probabilities)
print(dev_fasttext_predictions)

print("BERT MODEL")
bert_path = '{}/{}/{}/best-model.pt'.format(BERT_MODEL_PATH, BERT_MODEL_NAME, 'intertass')
bert_model = TextClassifier.load(bert_path)
dev_bert_probabilities, dev_bert_predictions = bert_embeddings.predict_with_bert_model(
    bert_model, tweets_to_test, label_dictionary)
print(dev_bert_probabilities)
print(dev_bert_predictions)

# sentences = ['horrible']
#
# analyzer = vaderSentiment.vaderSentiment.SentimentIntensityAnalyzer()
# for sentence in sentences:
#     vs = analyzer.polarity_scores(sentence)
#     print("{:-<65} {}".format(sentence, str(vs)))

Example #7
    def train_model(self,
                    model_name="text_classification_model",
                    custom_word_embeddings=None,
                    rnn_type="GRU",
                    use_pool_embedding=False,
                    hidden_size=16,
                    reproject_words=True,
                    reproject_words_dimension=128,
                    learning_rate=1e-3,
                    batch_size=8,
                    anneal_factor=0.5,
                    patience=2,
                    max_epochs=30,
                    **kwargs):
        """
        Train flair model and save it in your data folder

        Parameters
        ----------
        model_name: str
            Name of your model
        custom_word_embeddings: list<embedding>
            Use custom flair embedding

        See more in flair documentation: https://github.com/zalandoresearch/flair/tree/master/resources/docs

        Return
        -------
        None
        """
        self.model_name = model_name
        corpus = CSVClassificationCorpus(self.data_folder,
                                         self.column_name_map,
                                         skip_header=True)
        label_dict = corpus.make_label_dictionary()

        # Word embedding selection
        if custom_word_embeddings is None:
            word_embeddings = [WordEmbeddings('fr')]
        else:
            word_embeddings = custom_word_embeddings

        # initialize document embedding by passing list of word embeddings and parameters
        if use_pool_embedding:
            document_embeddings = DocumentPoolEmbeddings(
                word_embeddings, pooling='max', fine_tune_mode='nonlinear')
        else:
            document_embeddings = DocumentRNNEmbeddings(
                word_embeddings,
                hidden_size=hidden_size,
                reproject_words=reproject_words,
                reproject_words_dimension=reproject_words_dimension,
                rnn_type=rnn_type)

        # create the text classifier and initialize trainer
        classifier = TextClassifier(document_embeddings,
                                    label_dictionary=label_dict)
        trainer = ModelTrainer(classifier, corpus, optimizer=Adam)

        # let's train !
        num_workers = cpu_count()
        trainer.train("{0}\\{1}".format(self.data_folder, self.model_name),
                      learning_rate=learning_rate,
                      num_workers=num_workers,
                      mini_batch_size=batch_size,
                      anneal_factor=anneal_factor,
                      patience=patience,
                      max_epochs=max_epochs,
                      **kwargs)
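The enclosing class is not shown in this snippet; it is assumed to set self.data_folder and self.column_name_map before train_model is called. A purely hypothetical call to illustrate the documented parameters:

# Hypothetical usage; the wrapper class name and constructor arguments are assumptions,
# since only the train_model method is shown above.
wrapper = FlairTextClassifierWrapper(data_folder="data/",
                                     column_name_map={0: "text", 1: "label"})
wrapper.train_model(model_name="review_classifier",
                    rnn_type="LSTM",
                    use_pool_embedding=False,
                    hidden_size=32,
                    batch_size=16,
                    max_epochs=10)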
Example #8
    def train(self, X, y):

        X_text = X[:, self.args.TEXT_COL]
        y = y.flatten()
        #corpus: TaggedCorpus = NLPTaskDataFetcher.load_corpus(NLPTask.CONLL_03)

        train: List[Sentence] = []

        for tweet, label in zip(X_text, y):
            if tweet == '':
                tweet = 'dummy'
            s: Sentence = Sentence(tweet)
            s.add_label(str(label))
            train.append(s)

        corpus: TaggedCorpus = TaggedCorpus(train, train, train)

        # 2. create the label dictionary
        label_dict = corpus.make_label_dictionary()

        # 3. make a list of word embeddings
        word_embeddings = [
            glove_embeddings,
            #twitter_embeddings,
            # comment in this line to use character embeddings
            #CharacterEmbeddings(),
            # comment in flair embeddings for state-of-the-art results
            # FlairEmbeddings('news-forward'),
            fflair,
            # FlairEmbeddings('news-backward'),
            bflair
        ]

        # 4. initialize document embedding by passing list of word embeddings
        document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256,
        )
        # 5. create the text classifier
        classifier = TextClassifier(document_embeddings,
                                    label_dictionary=label_dict,
                                    multi_label=False)

        # 6. initialize the text classifier trainer
        trainer = ModelTrainer(classifier, corpus)

        self.model = trainer.model
        self.model.save = self.save
        self.model.save_checkpoint = self.save_checkpoint

        # 7. start the training
        trainer.train('../data/ecuador_earthquake_2016/models',
                      learning_rate=0.1,
                      mini_batch_size=32,
                      anneal_factor=0.5,
                      patience=5,
                      max_epochs=5)

        self.clf = classifier
Example #9
def test():
    classifier = TextClassifier.load(model_file)
    sentence = Sentence('Awesome stuff!')
    classifier.predict(sentence)
    print(sentence.labels)
Example #10
# coding: utf-8
# docker run --name flask -p 5000:5000 -v G:\workspace:/flask -it python bash
# export FLASK_APP="server.py"
# flask run --host=0.0.0.0
# or python -m forever.run -t 1000000 python server.py &
from flask import Flask, jsonify
import feedparser

from flair.data import build_japanese_tokenizer, Sentence
from flair.models import TextClassifier

app = Flask(__name__)
classifier = TextClassifier.load('resources/best-model.pt')
japanese_tokenizer = build_japanese_tokenizer()


def get_score(text):
    # create example sentence
    sentence = Sentence(text, use_tokenizer=japanese_tokenizer)
    # predict class and print
    classifier.predict(sentence)

    label_dict = sentence.to_dict()["labels"][0]

    return label_dict["confidence"] if label_dict[
        "value"] == "__label__O" else 0


def get_feed():
    RSS_URL = "https://www.lifehacker.jp/feed/index.xml"
    feed = feedparser.parse(RSS_URL)
Example #11
#############################################################################################
#############################################################################################






print("#####################################")
print("##########  EMOTIONS   ##############")
print("#####################################")

from flair.models import TextClassifier
from flair.data import Sentence

# load sentiment model
classifier = TextClassifier.load('en-sentiment')

# example sentence
sentence = Sentence('Porto wins. I am happy')

# predict sentiment label
classifier.predict(sentence)

# print sentence with predicted labels
print(sentence.labels)



print("#####################################")
print("##########  EMOTIONS   ##############")
print("#####################################")
Example #12
    def train(self, intent_fst) -> None:
        from flair.data import Sentence, Token
        from flair.models import SequenceTagger, TextClassifier
        from flair.embeddings import (
            FlairEmbeddings,
            StackedEmbeddings,
            DocumentRNNEmbeddings,
        )
        from flair.data import TaggedCorpus
        from flair.trainers import ModelTrainer

        # Directory to look for downloaded embeddings
        cache_dir = self.profile.read_path(
            self.profile.get("intent.flair.cache_dir", "flair/cache")
        )

        os.makedirs(cache_dir, exist_ok=True)

        # Directory to store generated models
        data_dir = self.profile.write_path(
            self.profile.get("intent.flair.data_dir", "flair/data")
        )

        if os.path.exists(data_dir):
            shutil.rmtree(data_dir)

        self.embeddings = self.profile.get("intent.flair.embeddings", [])
        assert len(self.embeddings) > 0, "No word embeddings"

        # Create directories to write training data to
        class_data_dir = os.path.join(data_dir, "classification")
        ner_data_dir = os.path.join(data_dir, "ner")
        os.makedirs(class_data_dir, exist_ok=True)
        os.makedirs(ner_data_dir, exist_ok=True)

        # Convert FST to training data
        class_data_path = os.path.join(class_data_dir, "train.txt")
        ner_data_path = os.path.join(ner_data_dir, "train.txt")

        # { intent: [ { 'text': ..., 'entities': { ... } }, ... ] }
        sentences_by_intent: Dict[str, Any] = {}

        # Get sentences for training
        do_sampling = self.profile.get("intent.flair.do_sampling", True)
        start_time = time.time()

        if do_sampling:
            # Sample from each intent FST
            num_samples = int(self.profile.get("intent.flair.num_samples", 10000))
            intent_map_path = self.profile.read_path(
                self.profile.get("training.intent.intent_map", "intent_map.json")
            )

            with open(intent_map_path, "r") as intent_map_file:
                intent_map = json.load(intent_map_file)

            # Gather FSTs for all known intents
            fsts_dir = self.profile.write_dir(
                self.profile.get("speech_to_text.fsts_dir")
            )

            intent_fst_paths = {
                intent_id: os.path.join(fsts_dir, f"{intent_id}.fst")
                for intent_id in intent_map.keys()
            }

            # Generate samples
            self._logger.debug(
                f"Generating {num_samples} sample(s) from {len(intent_fst_paths)} intent(s)"
            )

            sentences_by_intent = sample_sentences_by_intent(
                intent_fst_paths, num_samples
            )
        else:
            # Exhaustively generate all sentences
            self._logger.debug(
                "Generating all possible sentences (may take a long time)"
            )
            sentences_by_intent = make_sentences_by_intent(intent_fst)

        sentence_time = time.time() - start_time
        self._logger.debug(f"Generated sentences in {sentence_time} second(s)")

        # Get least common multiple in order to balance sentences by intent
        lcm_sentences = lcm(*(len(sents) for sents in sentences_by_intent.values()))

        # Generate examples
        class_sentences = []
        ner_sentences: Dict[str, List[Sentence]] = defaultdict(list)
        for intent_name, intent_sents in sentences_by_intent.items():
            num_repeats = max(1, lcm_sentences // len(intent_sents))
            for intent_sent in intent_sents:
                # Only train an intent classifier if there's more than one intent
                if len(sentences_by_intent) > 1:
                    # Add balanced copies
                    for i in range(num_repeats):
                        class_sent = Sentence(labels=[intent_name])
                        for word in intent_sent["tokens"]:
                            class_sent.add_token(Token(word))

                        class_sentences.append(class_sent)

                if len(intent_sent["entities"]) == 0:
                    continue  # no entities, no sequence tagger

                # Named entity recognition (NER) example
                token_idx = 0
                entity_start = {ev["start"]: ev for ev in intent_sent["entities"]}
                entity_end = {ev["end"]: ev for ev in intent_sent["entities"]}
                entity = None

                word_tags = []
                for word in intent_sent["tokens"]:
                    # Determine tag label
                    tag = "O" if not entity else f"I-{entity}"
                    if token_idx in entity_start:
                        entity = entity_start[token_idx]["entity"]
                        tag = f"B-{entity}"

                    word_tags.append((word, tag))

                    # word ner
                    token_idx += len(word) + 1

                    if (token_idx - 1) in entity_end:
                        entity = None

                # Add balanced copies
                for i in range(num_repeats):
                    ner_sent = Sentence()
                    for word, tag in word_tags:
                        token = Token(word)
                        token.add_tag("ner", tag)
                        ner_sent.add_token(token)

                    ner_sentences[intent_name].append(ner_sent)

        # Start training
        max_epochs = int(self.profile.get("intent.flair.max_epochs", 100))

        # Load word embeddings
        self._logger.debug(f"Loading word embeddings from {cache_dir}")
        word_embeddings = [
            FlairEmbeddings(os.path.join(cache_dir, "embeddings", e))
            for e in self.embeddings
        ]

        if len(class_sentences) > 0:
            self._logger.debug("Training intent classifier")

            # Random 80/10/10 split
            class_train, class_dev, class_test = self._split_data(class_sentences)
            class_corpus = TaggedCorpus(class_train, class_dev, class_test)

            # Intent classification
            doc_embeddings = DocumentRNNEmbeddings(
                word_embeddings,
                hidden_size=512,
                reproject_words=True,
                reproject_words_dimension=256,
            )

            classifier = TextClassifier(
                doc_embeddings,
                label_dictionary=class_corpus.make_label_dictionary(),
                multi_label=False,
            )

            self._logger.debug(
                f"Intent classifier has {len(class_sentences)} example(s)"
            )
            trainer = ModelTrainer(classifier, class_corpus)
            trainer.train(class_data_dir, max_epochs=max_epochs)
        else:
            self._logger.info("Skipping intent classifier training")

        if len(ner_sentences) > 0:
            self._logger.debug(f"Training {len(ner_sentences)} NER sequence tagger(s)")

            # Named entity recognition
            stacked_embeddings = StackedEmbeddings(word_embeddings)

            for intent_name, intent_ner_sents in ner_sentences.items():
                ner_train, ner_dev, ner_test = self._split_data(intent_ner_sents)
                ner_corpus = TaggedCorpus(ner_train, ner_dev, ner_test)

                tagger = SequenceTagger(
                    hidden_size=256,
                    embeddings=stacked_embeddings,
                    tag_dictionary=ner_corpus.make_tag_dictionary(tag_type="ner"),
                    tag_type="ner",
                    use_crf=True,
                )

                ner_intent_dir = os.path.join(ner_data_dir, intent_name)
                os.makedirs(ner_intent_dir, exist_ok=True)

                self._logger.debug(
                    f"NER tagger for {intent_name} has {len(intent_ner_sents)} example(s)"
                )
                trainer = ModelTrainer(tagger, ner_corpus)
                trainer.train(ner_intent_dir, max_epochs=max_epochs)
        else:
            self._logger.info("Skipping NER sequence tagger training")
Example #13
  	elif (val.value == 'b'):
  		row.pred_neg = 0
  		row.pred_pos = 1
  	elif (val.value == 'a'):
  		row.pred_neg = 0
  		row.pred_pos = 0
  	elif (val.value == 'd'):
  		row.pred_neg = 1
  		row.pred_pos = 0
  	else:
  		print("incorrectVal {}".format(val))

  return row

if __name__ == "__main__":
	classifier = TextClassifier.load('./data/POLARITY_MULTI/best-model.pt')

	data_to_pred = pd.read_csv('./data/test_gold.csv',sep='\t')
	data_to_pred['pred_pos'] = 0
	data_to_pred['pred_neg'] = 0
	data_to_pred = data_to_pred.apply(lambda row: addPredictionForPositive(row), axis=1)
	data_to_pred.head()
	data_to_save = data_to_pred[['idtwitter','subj','pred_pos','pred_neg','iro','lpos','lneg','top']]
	data_to_save.idtwitter = data_to_save.idtwitter.astype(str)
	data_to_save.subj = data_to_save.subj.astype(str)
	data_to_save.pred_pos = data_to_save.pred_pos.astype(str)
	data_to_save.pred_neg = data_to_save.pred_neg.astype(str)
	data_to_save.iro = data_to_save.iro.astype(str)
	data_to_save.lpos = data_to_save.lpos.astype(str)
	data_to_save.lneg = data_to_save.lneg.astype(str)
	data_to_save.top = data_to_save.top.astype(str)
Example #14
    print(label_dictionary)

    flat_labels = [item for sublist in labels for item in sublist]
    class_weights = compute_class_weight('balanced', np.unique(flat_labels),
                                         flat_labels)
    unique_labels = np.unique(flat_labels)
    weights = {}
    for i in range(len(unique_labels)):
        weights[unique_labels[i]] = class_weights[i]

    document_embeddings = TransformerDocumentEmbeddings(
        params['version_model'], fine_tune=True)

    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dictionary,
                                loss_weights=weights,
                                multi_label=False)

    trainer = ModelTrainer(classifier, corpus)

    trainer.train(params['model_dir'],
                  learning_rate=params['learning_rate'],
                  mini_batch_size=params['batch_size'],
                  anneal_factor=params['anneal_factor'],
                  patience=params['patience'],
                  max_epochs=params['epochs'],
                  embeddings_storage_mode=params['embeddings_storage_mode'])

    # print_predictions(trainer, tokens_test, params['results']+'gloveSentence')
Example #15
test_run = args.test_run
nrows = args.nrows
model_type = args.model

# read data
df = datatable.fread(f"./data/from_USNavy_for_flair.csv").to_pandas()
print(len(df))
df = df.head(nrows)
data = df.copy()

data = data[['sentence', "row"]]

# load classifier
from flair.models import TextClassifier
from flair.data import Sentence
model = TextClassifier.load(
    f'./data/model_{model_type}_{fold}/_{test_run}_best-model.pt')
document_embeddings = model.document_embeddings

# prepare df for output to csv
# batch size for embedding tweet instances
bs = 32
tweets_to_embed = data['sentence'].copy()
print("beginning embedding")

# prepare mini batches
low_limits = list()
for x in range(0, len(tweets_to_embed), bs):
    low_limits.append(x)
up_limits = [x + bs for x in low_limits[:-1]]
up_limits.append(len(tweets_to_embed))
Example #16
    # FlairEmbeddings('news-forward'),
    # FlairEmbeddings('news-backward'),
]

# 4. initialize document embedding by passing list of word embeddings
# Can choose between many RNN types (GRU by default, to change use rnn_type parameter)
document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

# 5. create the text classifier
classifier = TextClassifier(document_embeddings,
                            label_dictionary=label_dict,
                            multi_label=False)

# 6. initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)

# 7. start the training
trainer.train('resources/taggers/ag_news',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=5,
              max_epochs=150)

# 8. plot training curves (optional)
from flair.visual.training_curves import Plotter
Example #17
import numpy as np
from nlppreprocess import NLP

app = Flask(__name__)
app.secret_key = "super_secret_key"
APP_ROOT = os.path.dirname(os.path.abspath(__file__))
UPLOAD_FOLDER = os.path.join(APP_ROOT, 'uploads')
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
# # app.config['SECRET_KEY'] = 'oh_so_secret'

# app.config['MONGO_DBNAME'] = 'exposeModel'
# app.config['MONGO_URI'] = 'mongodb://*****:*****'


@app.route('/', methods=['GET'])
def index():
    return jsonify("welcome to Sadeeq API")


@app.route('/api/tasks', methods=['GET'])
def get_result():
    result = []
    try:
        # result = session['my_result']
        result = modelResult
Example #18
def trainer(file_path: Path, filenames: Tuple[str, str, str], checkpoint: str,
            stack: str, n_epochs: int) -> None:
    """Train sentiment model using Flair NLP library:
    https://github.com/zalandoresearch/flair/blob/master/resources/docs/TUTORIAL_7_TRAINING_A_MODEL.md

    To help provide added context, we can stack Glove, Bert or ELMo embeddings along with Flair embeddings.
    """
    # pip install flair allennlp
    from flair.datasets import ClassificationCorpus
    from flair.embeddings import FlairEmbeddings, DocumentRNNEmbeddings
    from flair.models import TextClassifier
    from flair.trainers import ModelTrainer
    from flair.training_utils import EvaluationMetric
    from flair.visual.training_curves import Plotter

    if stack == "glove":
        from flair.embeddings import WordEmbeddings
        stacked_embedding = WordEmbeddings('glove')
    elif stack == "elmo":
        from flair.embeddings import ELMoEmbeddings
        stacked_embedding = ELMoEmbeddings('original')
    elif stack == "bert":
        from flair.embeddings import BertEmbeddings
        stacked_embedding = BertEmbeddings('bert-base-cased')
    else:
        stacked_embedding = None

    # Define and Load corpus from the provided dataset
    train, dev, test = filenames
    corpus = ClassificationCorpus(
        file_path,
        train_file=train,
        dev_file=dev,
        test_file=test,
    )
    # Create label dictionary from provided labels in data
    label_dict = corpus.make_label_dictionary()

    # Stack Flair string-embeddings with optional embeddings
    word_embeddings = list(
        filter(None, [
            stacked_embedding,
            FlairEmbeddings('news-forward'),
            FlairEmbeddings('news-backward'),
        ]))
    # Initialize document embedding by passing list of word embeddings
    document_embeddings = DocumentRNNEmbeddings(
        word_embeddings,
        hidden_size=512,
        reproject_words=True,
        reproject_words_dimension=256,
    )
    # Define classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                multi_label=False)

    if not checkpoint:
        trainer = ModelTrainer(classifier, corpus)
    else:
        # If checkpoint file is defined, resume training
        checkpoint = classifier.load_checkpoint(Path(checkpoint))
        trainer = ModelTrainer.load_from_checkpoint(checkpoint, corpus)

    # Begin training (enable checkpointing to continue training at a later time, if desired)
    trainer.train(file_path,
                  EvaluationMetric.MACRO_F1_SCORE,
                  max_epochs=n_epochs,
                  checkpoint=True)

    # Plot curves and store weights and losses
    plotter = Plotter()
    plotter.plot_training_curves(file_path / 'loss.tsv')
    plotter.plot_weights(file_path / 'weights.txt')
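A hypothetical invocation of the trainer() function above; the data directory and file names are assumptions:

from pathlib import Path

# Hypothetical usage of trainer(); the paths and file names are assumptions.
trainer(file_path=Path("data/sentiment"),
        filenames=("train.txt", "dev.txt", "test.txt"),
        checkpoint="",        # empty string: start training from scratch
        stack="glove",
        n_epochs=10)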
Example #19
    WordEmbeddings('glove'),
    #FlairEmbeddings('news-forward'),
    #FlairEmbeddings('news-backward'),
]

# initialize document embedding by passing list of word embeddings
# Can choose between many RNN types (GRU by default, to change use rnn_type parameter)
document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256)

# create the text classifier
classifier = TextClassifier(document_embeddings,
                            label_dictionary=label_dict,
                            multi_label=False)

# initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)

trainer.train('resources/clf',
              learning_rate=0.1,
              mini_batch_size=64,
              anneal_factor=0.5,
              patience=1,
              max_epochs=60)

classifier = TextClassifier.load('resources/clf/final-model.pt')

# create example sentence
f1_list = []
mcc_list = []
f1sub_list = []
f1perf_list = []
f1rec_list = []
f1tec_list = []
f1time_list = []
f1ll_list = []
bas_list = []

# main loop
for fold in range(k_folds):
    # load pre-trained classifier
    try:
        subject_classifier = TextClassifier.load(
            './data/{}/model_subject_category_{}/{}_best-model.pt'.format(
                dataset, fold, model_type))
    except FileNotFoundError:
        print("----- Does such test run exist? Try another name. -----")
        quit()
    # placeholder for results of predictions
    flair_subject = []

    # add timestamp before all classification takes place
    time_schedule.append(time.perf_counter())

    # Main Analysis Loop:    start analysis
    for i in range(len(data)):
        # get the sentence to be analyzed
        sent = [str(data.iloc[i, 2])]
Example #21
class TARSClassifier(FewshotClassifier):
    """
    TARS model for text classification. In the backend, the model uses a BERT based binary
    text classifier which given a <label, text> pair predicts the probability of two classes
    "True", and "False". The input data is a usual Sentence object which is inflated
    by the model internally before pushing it through the transformer stack of BERT.
    """

    static_label_type = "tars_label"

    def __init__(
            self,
            task_name: str,
            label_dictionary: Dictionary,
            label_type: str,
            embeddings: str = 'bert-base-uncased',
            num_negative_labels_to_sample: int = 2,
            prefix: bool = True,
            **tagger_args,
    ):
        """
        Initializes a TextClassifier
        :param task_name: a string depicting the name of the task
        :param label_dictionary: dictionary of labels you want to predict
        :param embeddings: name of the pre-trained transformer model e.g.,
        'bert-base-uncased' etc
        :param num_negative_labels_to_sample: number of negative labels to sample for each
        positive labels against a sentence during training. Defaults to 2 negative
        labels for each positive label. The model would sample all the negative labels
        if None is passed. That slows down the training considerably.
        :param multi_label: auto-detected by default, but you can set this to True
        to force multi-label prediction or False to force single-label prediction
        :param multi_label_threshold: If multi-label you can set the threshold to make predictions
        :param beta: Parameter for F-beta score for evaluation and training annealing
        """
        super(TARSClassifier, self).__init__()

        from flair.embeddings import TransformerDocumentEmbeddings

        if not isinstance(embeddings, TransformerDocumentEmbeddings):
            embeddings = TransformerDocumentEmbeddings(model=embeddings,
                                                       fine_tune=True,
                                                       layers='-1',
                                                       layer_mean=False,
                                                       )

        # prepare TARS dictionary
        tars_dictionary = Dictionary(add_unk=False)
        tars_dictionary.add_item('False')
        tars_dictionary.add_item('True')

        # initialize a bare-bones sequence tagger
        self.tars_model = TextClassifier(document_embeddings=embeddings,
                                         label_dictionary=tars_dictionary,
                                         label_type=self.static_label_type,
                                         **tagger_args,
                                         )

        # transformer separator
        self.separator = str(self.tars_embeddings.tokenizer.sep_token)
        if self.tars_embeddings.tokenizer._bos_token:
            self.separator += str(self.tars_embeddings.tokenizer.bos_token)

        self.prefix = prefix
        self.num_negative_labels_to_sample = num_negative_labels_to_sample

        # Store task specific labels since TARS can handle multiple tasks
        self.add_and_switch_to_new_task(task_name, label_dictionary, label_type)

    def _get_tars_formatted_sentence(self, label, sentence):

        original_text = sentence.to_tokenized_string()

        label_text_pair = f"{label} {self.separator} {original_text}" if self.prefix \
            else f"{original_text} {self.separator} {label}"

        sentence_labels = [label.value for label in sentence.get_labels(self.get_current_label_type())]

        tars_label = "True" if label in sentence_labels else "False"

        tars_sentence = Sentence(label_text_pair, use_tokenizer=False).add_label(self.static_label_type, tars_label)

        return tars_sentence

    def _get_state_dict(self):
        model_state = {
            "state_dict": self.state_dict(),

            "current_task": self._current_task,
            "label_type": self.get_current_label_type(),
            "label_dictionary": self.get_current_label_dictionary(),
            "tars_model": self.tars_model,
            "num_negative_labels_to_sample": self.num_negative_labels_to_sample,

            "task_specific_attributes": self._task_specific_attributes,
        }
        return model_state

    @staticmethod
    def _init_model_with_state_dict(state):
        print("init TARS")

        # init new TARS classifier
        label_dictionary = state["label_dictionary"]

        model: TARSClassifier = TARSClassifier(
            task_name=state["current_task"],
            label_dictionary=label_dictionary,
            label_type=state["label_type"],
            embeddings=state["tars_model"].document_embeddings,
            num_negative_labels_to_sample=state["num_negative_labels_to_sample"],
        )

        # set all task information
        model.task_specific_attributes = state["task_specific_attributes"]
        # linear layers of internal classifier
        model.load_state_dict(state["state_dict"])
        return model

    @staticmethod
    def _fetch_model(model_name) -> str:

        model_map = {}
        hu_path: str = "https://nlp.informatik.hu-berlin.de/resources/models"

        model_map["tars-base"] = "/".join([hu_path, "tars-base", "tars-base-v8.pt"])

        cache_dir = Path("models")
        if model_name in model_map:
            model_name = cached_path(model_map[model_name], cache_dir=cache_dir)

        return model_name

    @property
    def tars_embeddings(self):
        return self.tars_model.document_embeddings

    def predict(
            self,
            sentences: Union[List[Sentence], Sentence],
            mini_batch_size=32,
            verbose: bool = False,
            label_name: Optional[str] = None,
            return_loss=False,
            embedding_storage_mode="none",
    ):
        # return
        """
        Predict class labels for a Sentence or a list of Sentences.
        :param sentences: a Sentence or a List of Sentence
        :param mini_batch_size: size of the minibatch; usually bigger is more rapid but consumes more memory,
        up to a point where it has no more effect.
        :param verbose: set to True to display a progress bar
        :param return_loss: set to True to return loss
        :param label_name: set this to change the name of the label type that is predicted
        :param embedding_storage_mode: default is 'none' which is always best. Only set to 'cpu' or 'gpu' if
        you wish to not only predict, but also keep the generated embeddings in CPU or GPU memory respectively.
        'gpu' to store embeddings in GPU memory.
        """
        if label_name is None:
            label_name = self.get_current_label_type()

        # with torch.no_grad():
        if not sentences:
            return sentences

        if isinstance(sentences, Sentence):
            sentences = [sentences]

        # set context if not set already
        previous_sentence = None
        for sentence in sentences:
            if sentence.is_context_set(): continue
            sentence._previous_sentence = previous_sentence
            sentence._next_sentence = None
            if previous_sentence: previous_sentence._next_sentence = sentence
            previous_sentence = sentence

        # reverse sort all sequences by their length
        rev_order_len_index = sorted(range(len(sentences)), key=lambda k: len(sentences[k]), reverse=True)

        reordered_sentences: List[Union[Sentence, str]] = [sentences[index] for index in rev_order_len_index]

        dataloader = DataLoader(dataset=SentenceDataset(reordered_sentences), batch_size=mini_batch_size)

        # progress bar for verbosity
        if verbose:
            dataloader = tqdm(dataloader)

        overall_loss = 0
        overall_count = 0
        batch_no = 0
        with torch.no_grad():
            for batch in dataloader:

                batch_no += 1

                if verbose:
                    dataloader.set_description(f"Inferencing on batch {batch_no}")

                batch = self._filter_empty_sentences(batch)
                # stop if all sentences are empty
                if not batch:
                    continue

                # go through each sentence in the batch
                for sentence in batch:

                    # always remove tags first
                    sentence.remove_labels(label_name)

                    all_labels = [label.decode("utf-8") for label in self.get_current_label_dictionary().idx2item]

                    all_detected = {}
                    for label in all_labels:
                        tars_sentence = self._get_tars_formatted_sentence(label, sentence)

                        loss_and_count = self.tars_model.predict(tars_sentence,
                                                                 label_name=label_name,
                                                                 return_loss=True)

                        overall_loss += loss_and_count[0].item()
                        overall_count += loss_and_count[1]

                        predicted_tars_label = tars_sentence.get_labels(label_name)[0]
                        if predicted_tars_label.value == "True":
                            sentence.add_label(label_name, label, predicted_tars_label.score)

                # clearing token embeddings to save memory
                store_embeddings(batch, storage_mode=embedding_storage_mode)

        if return_loss:
            return overall_loss, overall_count
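A minimal usage sketch for the TARSClassifier defined above; the task name and label set are made up for illustration:

from flair.data import Dictionary, Sentence

# Hypothetical task and labels, purely to illustrate the constructor and predict().
label_dict = Dictionary(add_unk=False)
for label in ["sports", "politics", "technology"]:
    label_dict.add_item(label)

tars = TARSClassifier(task_name="news_topics",
                      label_dictionary=label_dict,
                      label_type="topic")

sentence = Sentence("The midfielder scored twice in the cup final.")
tars.predict(sentence)
print(sentence.get_labels("topic"))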
Example #22
#!/usr/bin/env python
# coding: utf-8

# In[1]:


from flair.models import TextClassifier
from flair.data import Sentence
classifier = TextClassifier.load('en-sentiment')
sentence = Sentence('Flair is pretty neat!')
classifier.predict(sentence)
# print sentence with predicted labels
print('Sentence above is: ', sentence.labels)


# In[1]:


import pandas as pd
data = pd.read_csv("./spam.csv", encoding='latin-1').sample(frac=1).drop_duplicates()
data = data[['v1', 'v2']].rename(columns={"v1":"label", "v2":"text"})
 
data['label'] = '__label__' + data['label'].astype(str)
data.iloc[0:int(len(data)*0.8)].to_csv('train.csv', sep='\t', index = False, header = False)
data.iloc[int(len(data)*0.8):int(len(data)*0.9)].to_csv('test.csv', sep='\t', index = False, header = False)
data.iloc[int(len(data)*0.9):].to_csv('dev.csv', sep='\t', index = False, header = False)


# In[2]:

Example #23
    except FileNotFoundError:
        print(
            "File couldn't be found. Verify if '{f_path}' is a correct file path!"
            .format(f_path=csv_file))
        exit(1)


if __name__ == '__main__':
    print(">>> STARTING PROGRAM")

    df = load_dataset_from(DATASET_FILE)

    # To verify if data loaded correctly:
    # print(df.head(10))

    classifier = TextClassifier.load('sentiment')

    tmp_negatives = {}
    tmp_positives = {}

    print(f'Number of players: {len(df["Player"].unique())}')

    for player_name in df["Player"].unique():
        tmp_negatives[player_name] = list()
        tmp_positives[player_name] = list()

    for dp in df.values:
        l = dp.tolist()
        sentence = Sentence(l[-1])
        classifier.predict(sentence)
        if sentence.labels[0].value == "NEGATIVE":
Example #24
 def __init__(self, model_name_or_path: str):
     self.classifier = TextClassifier.load(model_name_or_path)
Example #25
 def __init__(self, c_classifier_file, p_classifier_file):
     self.classifier_c = TextClassifier.load_from_file(c_classifier_file)
     self.classifier_p = TextClassifier.load_from_file(p_classifier_file)
Example #26
# if not universal sentence encoder
if not _use:
    # load Flair
    import torch
    import flair
    from flair.models import TextClassifier

    # load various embeddings
    from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings, RoBERTaEmbeddings
    from flair.data import Sentence

    # if trained embeddings
    if not pool:
        # embeddings trained to the "downstream task"
        model = TextClassifier.load(
            f'./data/model_sentiment_{fold}/{test_run}_best-model.pt')
        document_embeddings = model.document_embeddings

    # if simple pooled embeddings
    else:
        if test_run == "fasttext":
            document_embeddings = DocumentPoolEmbeddings(
                [WordEmbeddings('en-twitter')])
        elif test_run == "roberta":
            document_embeddings = DocumentPoolEmbeddings([
                RoBERTaEmbeddings(
                    pretrained_model_name_or_path="roberta-large",
                    layers="21,22,23,24",
                    pooling_operation="first",
                    use_scalar_mix=True)
            ])
Example #27
def initialize_training(text_column_index,
                        label_column_index,
                        delimiter=';',
                        model_type=None,
                        model=None,
                        max_epochs=10,
                        patience=3,
                        use_amp=0,
                        calc_class_weights=0):
    """
    Create a text classification model using FLAIR, SentenceTransformers and
    Huggingface Transformers.
    Params:
    data_folder_path: Folder path with each file titled appropriately i.e.
                      train.csv test.csv dev.csv.
                      Will create a 80/10/10 split if only train is supplied.
    output_folder_path: Folder path for storing the best model & checkpoints.
    text_column_index: In which index (starting from 0) the input column is located.
    label_column_index: In which index (starting from 0) the label column is located.
    delimiter: type of delimiter used in the .csv file.
    model_type: SentenceTransformerDocumentEmbeddings or TransformerDocumentEmbeddings
    model: Which model to use.
    max_epochs: Number of epochs to train the model for.
    patience: Number of epochs without improvement before adjusting learning rate.
    use_amp: Whether to enable automatic mixed precisions (AMP).
    calc_class_weights: Whether to create a dictionary with class weights to deal
                        with imbalanced datasets.
    Output:
        best-model.pt
        final-model.pt
        training.log
    """

    # 1. Column format indicating which columns hold the text and label(s)
    column_name_map = {
        text_column_index: "text",
        label_column_index: "label_topic"
    }

    # 2. Load corpus containing training, test and dev data.
    corpus: Corpus = CSVClassificationCorpus("/root/text-classification/data/",
                                             column_name_map,
                                             skip_header=True,
                                             delimiter=delimiter)

    # Print statistics about the corpus.
    training_data_statistics = corpus.obtain_statistics()
    print(training_data_statistics)

    # 3A. Create a label dictionary.
    label_dict = corpus.make_label_dictionary()

    # 3B. Calculate class weights.
    if bool(calc_class_weights):
        weight_dict = create_weight_dict(delimiter=delimiter,
                                         label_index=label_column_index)
    else:
        weight_dict = None

    # 4. Initialize the sentence_transformers model.
    if model_type == "SentenceTransformerDocumentEmbeddings":
        document_embeddings = SentenceTransformerDocumentEmbeddings(model)
    elif model_type == "TransformerDocumentEmbeddings":
        document_embeddings = TransformerDocumentEmbeddings(model,
                                                            fine_tune=True)
    elif model_type == "WordEmbeddings":
        word_embeddings = [WordEmbeddings(model)]
        document_embeddings = DocumentRNNEmbeddings(word_embeddings,
                                                    hidden_size=256)
    elif model_type == "StackedEmbeddings":
        document_embeddings = DocumentRNNEmbeddings([
            WordEmbeddings('glove'),
            FlairEmbeddings(model + '-backward'),
            FlairEmbeddings(model + '-forward')
        ])
    else:
        raise Exception(
            "Pick SentenceTransformerDocumentEmbeddings, StackedEmbeddings, WordEmbeddings or TransformerDocumentEmbeddings."
        )

    # 5. create the text classifier
    classifier = TextClassifier(document_embeddings,
                                label_dictionary=label_dict,
                                loss_weights=weight_dict)

    # 6. initialize the text classifier trainer with Adam optimizer
    trainer = ModelTrainer(classifier,
                           corpus,
                           optimizer=Adam,
                           use_tensorboard=False)

    # 7. start the training
    trainer.train("/root/text-classification/checkpoint/",
                  learning_rate=3e-5,
                  max_epochs=max_epochs,
                  patience=patience,
                  use_amp=bool(use_amp),
                  checkpoint=True,
                  mini_batch_size=16,
                  mini_batch_chunk_size=4)
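A hypothetical call illustrating the parameters documented above; the column layout and the Hugging Face model choice are assumptions:

# Hypothetical usage of initialize_training(); the column indices and model are assumptions.
initialize_training(text_column_index=0,
                    label_column_index=1,
                    delimiter=';',
                    model_type="TransformerDocumentEmbeddings",
                    model="distilbert-base-uncased",
                    max_epochs=5,
                    patience=2,
                    use_amp=0,
                    calc_class_weights=1)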
Example #28
# In[125]:


# initialize document embedding by passing list of word embeddings
document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(stacked_embeddings,
                                                                     hidden_size=512,
                                                                     reproject_words=True,
                                                                     reproject_words_dimension=256,
                                                                     )


# In[126]:


# create text classifier
classifier = TextClassifier(document_embeddings, label_dictionary=label_dict, multi_label=False)


# In[127]:


# initialize the text classifier trainer
trainer = ModelTrainer(classifier, corpus)


# In[ ]:


trainer.train('FLAIR_data\\BERT',
              learning_rate=0.1,
              mini_batch_size=16,
Example #29
def main(data_folder, benchmark_classifier_folder, new_data_folder,
         finetuned_classifier_folder):
    from flair.embeddings import FlairEmbeddings, DocumentLSTMEmbeddings, BertEmbeddings, DocumentRNNEmbeddings, TransformerDocumentEmbeddings
    from flair.models import TextClassifier
    from flair.trainers import ModelTrainer
    from flair.datasets import CSVClassificationCorpus
    from flair.data import Corpus
    import pandas as pd
    import os

    ### First Stage (Train on benchmark dataset)
    benchmark = pd.read_csv(data_folder + "combined_benchmark.csv")
    benchmark = benchmark[['label', 'text']]

    #### Create train, dev and test set
    #benchmark = benchmark.sample(frac=1)  # without a fixed random_state, every run produces a different training split
    benchmark = benchmark.sample(frac=1, random_state=42)
    benchmark.iloc[0:int(len(benchmark) * 0.8)].to_csv(data_folder +
                                                       'train.csv',
                                                       sep='\t',
                                                       index=False,
                                                       header=False)
    benchmark.iloc[int(len(benchmark) * 0.8):int(len(benchmark) * 0.9)].to_csv(
        data_folder + 'test.csv', sep='\t', index=False, header=False)
    benchmark.iloc[int(len(benchmark) * 0.9):].to_csv(data_folder + 'dev.csv',
                                                      sep='\t',
                                                      index=False,
                                                      header=False)

    #### Build corpus
    column_name_map = {1: "text", 0: "label_topic"}

    corpus: Corpus = CSVClassificationCorpus(
        data_folder,
        column_name_map,
        skip_header=False,  #no header in kaggle data
        delimiter='\t',  # tab-separated rows
        #train_file='train.csv', ## passing in file names manually when it can't auto detect
        #dev_file='dev.csv',
        #test_file='test.csv'
    )

    #### Create word embeddings
    word_embeddings = [
        BertEmbeddings(),
        FlairEmbeddings('news-forward-fast'),
        FlairEmbeddings('news-backward-fast')
    ]
    ## caveat: issue of deprecation. BertEmbeddings and DocumentLSTMEmbeddings existed in version 0.4.5, and became legacy embeddings(still available) in version 0.5

    #### First Stage Fine-tuning
    document_embeddings = DocumentLSTMEmbeddings(word_embeddings,
                                                 hidden_size=512,
                                                 reproject_words=True,
                                                 reproject_words_dimension=256)
    classifier = TextClassifier(
        document_embeddings,
        label_dictionary=corpus.make_label_dictionary(),
        multi_label=False)
    trainer = ModelTrainer(classifier, corpus)
    #trainer.train(benchmark_classifier_folder, max_epochs=1) #offline test use epoch=1
    trainer.train(benchmark_classifier_folder, max_epochs=10)

    ### every fine-tuning run results in different scores
    ### accuracy at the phase-1 fine-tuning does not matter too much; the phase-2 scores matter more for biasing the model towards indicator-specific keywords

    ### Second Stage (train on hand annotated datasets)
    #### Build corpus

    ### this column_name_map must be updated to reflect which column stores the X(text features) and y(golden labels) for training use
    ### in the csv file contained in new_data_folder, 2nd column is 'title_desc',
    ### 4th column is 'title_desc_sent_1' (where we stored agreed sentiment annotations)
    new_column_name_map = {1: "text", 3: "label_topic"}
    print(new_column_name_map)

    corpus: Corpus = CSVClassificationCorpus(
        new_data_folder,
        new_column_name_map,
        skip_header=True,
        delimiter=','  # comma separated rows
    )

    #### Second Stage fine-tuning

    benchmark_classifier = TextClassifier.load(
        os.path.join(benchmark_classifier_folder, 'best-model.pt'))
    trainer = ModelTrainer(benchmark_classifier, corpus)
    #trainer.train(finetuned_classifier_folder, max_epochs=1) #offline test use
    trainer.train(finetuned_classifier_folder, max_epochs=10)
Example #30
def spamcleaner(text):
    classifier = TextClassifier.load('./utils/model/best-model.pt')
    sentence = Sentence(text)
    classifier.predict(sentence)
    print(sentence.labels)
    return (str(sentence.labels))