Code example #1
def bare_command(doc):
    """
    Count sentences that open with a bare command: the first word is a
    base-form verb (VB) that is NOT in the set of exempt keywords below.

    Output: count of matches
    """

    keywords = {' be ', ' do ', ' please ', ' have ', ' thank ', ' hang ', ' let '}

    # Expects a pre-parsed spaCy Doc with sentence boundaries set
    # (e.g. nlp.enable_pipe("senter"); doc = nlp(text)).

    # First word of every sentence, normalized and padded with spaces
    # so it can be matched against the padded keywords
    first_words = [
        ' ' + prep.prep_simple(str(sent[0])) + ' ' for sent in doc.sents
    ]

    # POS tag of the first word of every sentence
    POS_fw = [sent[0].tag_ for sent in doc.sents]

    # Keep first words that are base-form verbs and not exempt keywords
    bc = [
        b for a, b in zip(POS_fw, first_words)
        if a == 'VB' and b not in keywords
    ]

    return len(bc)
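A quick check of the function above (assumes spaCy and the repo's prep module are importable; the expected count presumes prep.prep_simple simply lowercases the word):

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('Stop talking. Please be patient. We can wait.')
print(bare_command(doc))  # only 'Stop' is a non-exempt VB opener -> 1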
Code example #2
def sentence_split(doc):
    """
    Split a parsed spaCy Doc into sentences, each normalized with
    prep.prep_simple and padded with spaces for whole-word matching.
    """

    sentences = [str(sent) for sent in doc.sents]
    sentences = [' ' + prep.prep_simple(s) + ' ' for s in sentences]

    return sentences
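A usage sketch, reusing the nlp pipeline from the sketch above (the exact output depends on what prep.prep_simple strips):

doc = nlp('Thanks for the update. Could you send the file?')
print(sentence_split(doc))
# roughly: [' thanks for the update ', ' could you send the file ']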
Code example #3
def word_start(keywords, doc):
    """
    Count sentence-initial words that match each keyword category,
    such as conjunctions and affirmations. Returns a DataFrame with
    one row per category: Features, Counts.
    """

    key_res = []
    phrase2_count = []

    # First word of every sentence, normalized and padded once;
    # the list does not change between keyword categories
    first_words = [
        ' ' + prep.prep_simple(str(sent[0])) + ' ' for sent in doc.sents
    ]

    for key in keywords:
        cs = [w for w in first_words if w in keywords[key]]

        phrase2_count.append(len(cs))
        key_res.append(key)

    res = pd.DataFrame([key_res, phrase2_count],
                       index=['Features', 'Counts']).T
    return res
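The keywords argument maps feature names to collections of padded first words; a minimal illustration (the category names and word lists here are hypothetical):

import pandas as pd

kw_start = {
    'Affirmation': {' yes ', ' yeah ', ' sure '},
    'Conjunction_Start': {' but ', ' and ', ' so '},
}
doc = nlp('Yes, that works. But we should check the numbers first.')
print(word_start(kw_start, doc))
# roughly:
#             Features Counts
# 0        Affirmation      1
# 1  Conjunction_Start      1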
Code example #4
import time
import prep
import pandas as pd
#from sentiment import Sentiment
import feature_extraction as fe
import spacy
import en_core_web_sm
#from negspacy.negation import Negex

nlp = en_core_web_sm.load()

start_time = time.perf_counter()  # time.clock() was removed in Python 3.8

text = 'I don\'t understand what you mean, but for me please could you let me know how you came to this way of thinking? Would you mind?'
clean_text = prep.prep_simple(text)
doc = nlp(clean_text)

PATH = '../Data/'
UPLOAD_FOLDER = '../Data/In/'
DOWNLOAD_FOLDER = '../Data/Out/'
FOLDERS_IN = ['word_matches', 'spacy_pos', 'spacy_neg', 'word_start']
READ_TYPE = ['single', 'multiple', 'multiple', 'single']

#prep.commit_data(path, folders, words_in_line)
kw = prep.load_saved_data(UPLOAD_FOLDER, FOLDERS_IN)

#print(kw['word_matches'])
sc1 = fe.count_matches(kw['word_matches'], doc)  # count_matches expects a parsed Doc (cf. feat_counts in code example #6)
#print(count_matches)

# Includes negation handling
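The comment above introduces a negation-handling step whose code is cut off in this excerpt. Judging from feat_counts in code example #6, it plausibly continued along these lines (a reconstruction, not the original):

dep_pairs, negations = fe.get_dep_pairs(doc)
sc2 = fe.count_spacy_matches(kw['spacy_pos'], dep_pairs)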
Code example #5
        # (Excerpt: this appears to be the tail of Sentiment.sentiment();
        # pol_scores, pol_counts, sub_scores and sub_counts are computed
        # in code cut from this snippet.)
        key_res = [
            'highPolarity_score', 'lowPolarity_score', 'highPolarity_count',
            'lowPolarity_count', 'highSubjectivity_score',
            'lowSubjectivity_score', 'highSubjectivity_count',
            'lowSubjectivity_count'
        ]

        sentiment_res = [
            pol_scores[0], pol_scores[1], pol_counts[0], pol_counts[1],
            sub_scores[0], sub_scores[1], sub_counts[0], sub_counts[1]
        ]

        res = pd.DataFrame(sentiment_res, index=key_res)

        return res


if __name__ == '__main__':

    text = 'I\'m not quite sure I understand for me, please could you let me know how you came to this way of thinking? Would you mind?'

    clean_text = prep.sentenciser(text)
    #clean_text = prep.phrase_split(text)
    clean_text = [prep.prep_simple(t) for t in clean_text]

    se = Sentiment(clean_text)
    sent = se.sentiment()

    #df = pd.concat(sent, axis = 0)
    print(sent)
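For reference, TextBlob is one common way to produce polarity/subjectivity buckets like those above; this is an illustration only, not necessarily what the Sentiment class does:

from textblob import TextBlob

def polarity_buckets(sentences, threshold=0.0):
    # Sum and count sentence polarities above/below a threshold
    pols = [TextBlob(s).sentiment.polarity for s in sentences]
    high = [p for p in pols if p > threshold]
    low = [p for p in pols if p <= threshold]
    return (sum(high), sum(low)), (len(high), len(low))

pol_scores, pol_counts = polarity_buckets([' i like this ', ' i hate this '])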
Code example #6
def feat_counts(text, kw):
    """
	Main function for getting the features from text input.
	Calls other functions to load dataset, clean text, counts features,
	removes negation phrases.

	Input:
		Text string
		Saved data of keywords and dependency pairs from pickle files

	Output:
		Feature counts
	"""

    # Insert a space before/after . , ! ? ( ) where one is missing, so
    # punctuation is separated from adjacent words
    text = re.sub('(?<! )(?=[.,!?()])|(?<=[.,!?()])(?! )', r' ', text)
    text = text.lstrip()
    #print(text)
    clean_text = prep.prep_simple(text)
    #print(clean_text)
    doc_text = nlp(text)
    doc_clean_text = nlp(clean_text)

    # quick test to check what's being counted in Positive_Emotion
    # t1 = [token for token in doc_clean_text]
    # print(t1)
    # for t in t1:
    # 	if ' ' + str(t) + ' ' in kw['word_matches']['Positive_Emotion']:
    # 		print(t)

    # Count key words and dependency pairs with negation
    kw_matches = count_matches(kw['word_matches'], doc_text)

    dep_pairs, negations = get_dep_pairs(doc_clean_text)
    dep_pair_matches = count_spacy_matches(kw['spacy_pos'], dep_pairs)

    dep_pairs_noneg = get_dep_pairs_noneg(doc_clean_text)
    disagreement = count_spacy_matches(kw['spacy_noneg'], dep_pairs_noneg)

    # Negated dependency targets, padded for whole-word matching
    neg_dp = {' ' + i[1] + ' ' for i in negations}
    neg_only = count_spacy_matches(kw['spacy_neg_only'], neg_dp)

    # count start word matches like conjunctions and affirmations
    start_matches = word_start(kw['word_start'], doc_text)

    scores = pd.concat(
        [kw_matches, dep_pair_matches, disagreement, start_matches, neg_only])
    scores = scores.groupby('Features').sum().sort_values(by='Counts',
                                                          ascending=False)
    scores = scores.reset_index()

    # add remaining features
    bc = bare_command(doc_text)
    scores.loc[len(scores)] = ['Bare_Command', bc]

    ynq, whq = Question(doc_text)

    scores.loc[len(scores)] = ['YesNo_Questions', ynq]
    scores.loc[len(scores)] = ['WH_Questions', whq]

    adl = adverb_limiter(kw['spacy_tokentag'], doc_text)
    scores.loc[len(scores)] = ['Adverb_Limiter', adl]

    scores = scores.sort_values(by='Counts', ascending=False)

    tokens = token_count(doc_text)
    scores.loc[len(scores)] = ['Token_count', tokens]

    return scores
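A usage sketch tying feat_counts together (the folder list is inferred from the keys the function reads and may not match the actual data layout):

FOLDERS = ['word_matches', 'spacy_pos', 'spacy_noneg',
           'spacy_neg_only', 'word_start', 'spacy_tokentag']
kw = prep.load_saved_data('../Data/In/', FOLDERS)

scores = feat_counts('Could you let me know how you reached this conclusion?', kw)
print(scores.head(10))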