import re

import pandas as pd

import prep

# Module-level spaCy pipeline (assumed to be loaded here, mirroring the
# main script's en_core_web_sm.load())
import en_core_web_sm
nlp = en_core_web_sm.load()


def bare_command(doc):
    """
    Count sentences that open with a bare command: the first word is a
    verb (VB) and is NOT one of the exempt keywords below (those words
    are captured by other features such as politeness markers).
    Output: count of matches
    """
    keywords = {' be ', ' do ', ' please ', ' have ', ' thank ', ' hang ',
                ' let '}

    # First word of every sentence, normalised and space-padded, along
    # with the corresponding POS tag
    first_words = [
        ' ' + prep.prep_simple(str(sent[0])) + ' ' for sent in doc.sents
    ]
    POS_fw = [sent[0].tag_ for sent in doc.sents]

    # Keep first words that are verbs and not in the exempt keyword set
    bc = [
        b for a, b in zip(POS_fw, first_words)
        if a == 'VB' and b not in keywords
    ]
    return len(bc)
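# Illustrative usage (a sketch; exact counts depend on the spaCy model's
# POS tags and on prep.prep_simple):
#
#   doc = nlp('Stop sending these emails. Please hold on.')
#   bare_command(doc)
#   # 'Stop' would be expected to count (imperative verb, not exempt);
#   # 'Please' would not ('please' is in the exempt set)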
def sentence_split(doc):
    """Split a spaCy doc into normalised, space-padded sentence strings."""
    sentences = [str(sent) for sent in doc.sents]
    sentences = [' ' + prep.prep_simple(str(s)) + ' ' for s in sentences]
    return sentences
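# Example (sketch): sentence_split(nlp('Hi there. Thanks!')) returns
# space-padded, normalised sentence strings, e.g. [' hi there ', ' thanks '],
# with the exact output depending on prep.prep_simple.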
def word_start(keywords, doc):
    """
    Count sentence-initial words such as conjunctions and affirmations.
    Input: dict mapping feature names to lists of space-padded key words
    Output: DataFrame of feature names and counts
    """
    key_res = []
    phrase2_count = []

    # First word of every sentence, normalised and space-padded
    # (computed once, outside the keyword loop)
    first_words = [
        ' ' + prep.prep_simple(str(sent[0])) + ' ' for sent in doc.sents
    ]

    for key in keywords:
        cs = [w for w in first_words if w in keywords[key]]
        phrase2_count.append(len(cs))
        key_res.append(key)

    res = pd.DataFrame([key_res, phrase2_count],
                       index=['Features', 'Counts']).T
    return res
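# Illustrative call (a sketch; the real keyword lists come from the pickled
# 'word_start' data, so these entries are hypothetical):
#
#   kw_start = {'Conjunction_Start': [' but ', ' and ', ' so '],
#               'Affirmation': [' yes ', ' great ', ' good ']}
#   word_start(kw_start, nlp('But I disagree. Yes, you are right.'))
#   # -> Features/Counts DataFrame: Conjunction_Start = 1, Affirmation = 1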
import time

import pandas as pd
import spacy
import en_core_web_sm

import prep
import feature_extraction as fe
#from sentiment import Sentiment
#from negspacy.negation import Negex

nlp = en_core_web_sm.load()

# time.clock() was removed in Python 3.8; use perf_counter() instead
start_time = time.perf_counter()

text = ('I don\'t understand what you mean, but for me please could you '
        'let me know how you came to this way of thinking? Would you mind?')
clean_text = prep.prep_simple(text)
doc = nlp(clean_text)

PATH = '../Data/'
UPLOAD_FOLDER = '../Data/In/'
DOWNLOAD_FOLDER = '../Data/Out/'
FOLDERS_IN = ['word_matches', 'spacy_pos', 'spacy_neg', 'word_start']
READ_TYPE = ['single', 'multiple', 'multiple', 'single']

#prep.commit_data(path, folders, words_in_line)
kw = prep.load_saved_data(UPLOAD_FOLDER, FOLDERS_IN)

# count_matches expects a spaCy Doc (see feat_counts in feature_extraction)
sc1 = fe.count_matches(kw['word_matches'], doc)

# Includes negation handling
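# Sketch of how the script would plausibly continue (feat_counts is the
# entry point defined in feature_extraction; the timing print is an
# assumption based on start_time above):
#
#   scores = fe.feat_counts(text, kw)
#   print(scores)
#   print('Elapsed:', time.perf_counter() - start_time)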
    key_res = [
        'highPolarity_score', 'lowPolarity_score', 'highPolarity_count',
        'lowPolarity_count', 'highSubjectivity_score',
        'lowSubjectivity_score', 'highSubjectivity_count',
        'lowSubjectivity_count'
    ]
    sentiment_res = [
        pol_scores[0], pol_scores[1], pol_counts[0], pol_counts[1],
        sub_scores[0], sub_scores[1], sub_counts[0], sub_counts[1]
    ]
    res = pd.DataFrame(sentiment_res, index=key_res)
    return res


if __name__ == '__main__':
    text = ('I\'m not quite sure I understand for me, please could you let '
            'me know how you came to this way of thinking? Would you mind?')
    clean_text = prep.sentenciser(text)
    clean_text = [prep.prep_simple(t) for t in clean_text]
    se = Sentiment(clean_text)
    sent = se.sentiment()
    print(sent)
def feat_counts(text, kw):
    """
    Main function for extracting features from a text input. Calls other
    functions to load the saved data, clean the text, count features, and
    remove negation phrases.
    Input: text string; saved keyword and dependency-pair data loaded
        from pickle files
    Output: DataFrame of feature counts
    """
    # Ensure punctuation is surrounded by single spaces so tokens match
    # the space-padded keyword lists
    text = re.sub('(?<! )(?=[.,!?()])|(?<=[.,!?()])(?! )', r' ', text)
    text = text.lstrip()

    clean_text = prep.prep_simple(text)
    doc_text = nlp(text)
    doc_clean_text = nlp(clean_text)

    # Count key words and dependency pairs, with negation handling
    kw_matches = count_matches(kw['word_matches'], doc_text)
    dep_pairs, negations = get_dep_pairs(doc_clean_text)
    dep_pair_matches = count_spacy_matches(kw['spacy_pos'], dep_pairs)

    dep_pairs_noneg = get_dep_pairs_noneg(doc_clean_text)
    disagreement = count_spacy_matches(kw['spacy_noneg'], dep_pairs_noneg)

    neg_dp = {' ' + i[1] + ' ' for i in negations}
    neg_only = count_spacy_matches(kw['spacy_neg_only'], neg_dp)

    # Count sentence-start matches such as conjunctions and affirmations
    start_matches = word_start(kw['word_start'], doc_text)

    scores = pd.concat(
        [kw_matches, dep_pair_matches, disagreement, start_matches, neg_only])
    scores = scores.groupby('Features').sum().sort_values(by='Counts',
                                                          ascending=False)
    scores = scores.reset_index()

    # Add remaining features
    bc = bare_command(doc_text)
    scores.loc[len(scores)] = ['Bare_Command', bc]

    ynq, whq = Question(doc_text)
    scores.loc[len(scores)] = ['YesNo_Questions', ynq]
    scores.loc[len(scores)] = ['WH_Questions', whq]

    adl = adverb_limiter(kw['spacy_tokentag'], doc_text)
    scores.loc[len(scores)] = ['Adverb_Limiter', adl]

    scores = scores.sort_values(by='Counts', ascending=False)

    tokens = token_count(doc_text)
    scores.loc[len(scores)] = ['Token_count', tokens]

    return scores
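# End-to-end sketch, mirroring the main script: load the pickled keyword
# data with prep.load_saved_data, then count features. Note that
# feat_counts also expects 'spacy_noneg', 'spacy_neg_only' and
# 'spacy_tokentag' keys in kw, beyond the FOLDERS_IN list shown in the
# main script.
#
#   kw = prep.load_saved_data('../Data/In/',
#                             ['word_matches', 'spacy_pos',
#                              'spacy_neg', 'word_start'])
#   scores = feat_counts('Please stop. Why would you say that?', kw)
#   print(scores.head())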