Example #1
def prep_text(txt):
    '''
    Process a text string into a list of lowercased tokens that are not stopwords
    (intended for lookup in Google's word2vec vocabulary).
    :param txt: text string
    :return: list of non-stopword tokens
    '''
    # keep only tokens that are not stopwords
    return [word for word in Sentence(txt.lower()).words if word not in STOPWORDS]
def WM_prep_text(txt):
    '''
    Process a text string into a lemmatized, stopword-free Sentence.
    :param txt: text string
    :return: Sentence built from the non-stopword, lemmatized words
    '''
    word_list = []
    postags = Sentence(txt.lower()).pos_tags
    for word, pos in postags:
        if word in STOPWORDS:
            continue

        # lemmatize with the WordNet POS mapped from the Penn tag's first letter
        if pos[0] in POSMAP:
            word_list.append(Word(word).lemmatize(POSMAP[pos[0]]))
        else:
            word_list.append(word)
    return Sentence(' '.join(word_list))
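A minimal usage sketch for the two helpers above, assuming STOPWORDS is a set of English stopwords and POSMAP maps the first letter of a Penn Treebank tag to a WordNet POS code (both names are assumptions; neither is defined in this example):

from textblob import Sentence, Word
from nltk.corpus import stopwords

# Assumed module-level constants (not part of the original example).
STOPWORDS = set(stopwords.words('english'))
POSMAP = {'N': 'n', 'V': 'v', 'J': 'a', 'R': 'r'}

print(prep_text('The quick brown fox jumps over the lazy dog'))
# -> roughly ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']
print(WM_prep_text('dogs are running fast'))
# -> roughly Sentence("dog run fast")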
def get_features(features, operation='train'):
    row = features.shape[0]
    phrase_vectors1 = translate(features[:, 0].astype(str), table=translator)
    phrase_vectors2 = translate(features[:, 1].astype(str), table=translator)

    filename = os.path.join(dir_path, 'data','sentiment_vectors_'+operation)
    if not os.path.exists(filename):
        sentiment_vector1 = np.array([Sentence(each).polarity for each in phrase_vectors1]).reshape(row, 1)
        sentiment_vector2 = np.array([Sentence(each).polarity for each in phrase_vectors2]).reshape(row, 1)
        with open(filename, 'wb') as f:
            pickle.dump(sentiment_vector1, f)
            pickle.dump(sentiment_vector2, f)
    else:
        with open(filename, 'rb') as f:
            sentiment_vector1 = pickle.load(f)
            sentiment_vector2 = pickle.load(f)

    filename = os.path.join(dir_path, 'data', 'raw_phrase_vectors_'+operation)
    if not os.path.exists(filename):
        phrase_vectors1 = np.vectorize(get_phrase_vector_obj)(phrase_vectors1)
        phrase_vectors2 = np.vectorize(get_phrase_vector_obj)(phrase_vectors2)
        with open(filename, 'wb') as f:
            pickle.dump(phrase_vectors1, f)
            pickle.dump(phrase_vectors2, f)
    else:
        with open(filename, 'rb') as f:
            phrase_vectors1 = pickle.load(f)
            phrase_vectors2 = pickle.load(f)

    filename = os.path.join(dir_path, 'data', 'processed_phrase_vectors_'+operation)
    if not os.path.exists(filename):
        phrase_vectors1 = get_phrase_vector(phrase_vectors1).reshape(row, 300)
        phrase_vectors2 = get_phrase_vector(phrase_vectors2).reshape(row, 300)
        with open(filename, 'wb') as f:
            pickle.dump(phrase_vectors1, f)
            pickle.dump(phrase_vectors2, f)
    else:
        with open(filename, 'rb') as f:
            phrase_vectors1 = pickle.load(f)
            phrase_vectors2 = pickle.load(f)

    features = np.concatenate((sentiment_vector1, sentiment_vector2, phrase_vectors1, phrase_vectors2), axis=1)
    return features
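get_features repeats a compute-once, cache-to-pickle pattern for each feature block. A small hedged helper (not part of the original code) captures the idea:

import os
import pickle

def load_or_compute(path, compute):
    # Load a pickled value if the cache file exists; otherwise compute and cache it.
    if os.path.exists(path):
        with open(path, 'rb') as f:
            return pickle.load(f)
    value = compute()
    with open(path, 'wb') as f:
        pickle.dump(value, f)
    return value

# Hypothetical usage with a list of phrases:
# sentiments = load_or_compute(filename,
#     lambda: np.array([Sentence(p).polarity for p in phrases]).reshape(-1, 1))

Note that the original function pickles two arrays into one file and relies on reading them back in the same order.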
Example #4
def get_random_lorem_ipsum_sentance() -> str:
    """get lorem ipsum sentence"""
    lorem_sentence = lorem.sentence()
    if decision(lorem_ipsum_fuck_probability):
        lorem_sentence = fix_punctuation_spacing(
            TreebankWordDetokenizer().detokenize(
                recumpile_sentence(Sentence(lorem_sentence))
            )
        )
    return lorem_sentence
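decision() is not defined in this example; a plausible sketch (an assumption, not the project's actual helper) is a simple Bernoulli draw:

import random

def decision(probability: float) -> bool:
    # Return True with the given probability.
    return random.random() < probability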
Example #5
def generate_wordlist_emb(txt):
    '''
    Process a text string into a list of word embeddings.
    'I love you' -> ['I', 'love', 'you'] -> [emb('I'), emb('love'), emb('you')]
    :param txt: text string
    :return: list of word embedding vectors
    '''
    word_seq = Sentence(txt.lower()).words
    wordlist = []
    for word in word_seq:
        if word in WORD_EMB.vocab:
            wordlist.append(WORD_EMB[word])
        else:
            # shape depends on the embedding dimension
            # use [1., 1., 1., ...] to represent an unknown word
            wordlist.append(
                np.full(shape=[300], fill_value=1.0, dtype=np.float32))
    return wordlist
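generate_wordlist_emb assumes a 300-dimensional embedding model bound to WORD_EMB; the .vocab check implies a gensim 3.x KeyedVectors object (gensim 4 renamed it to key_to_index). A setup sketch under those assumptions (the model path is hypothetical):

import numpy as np
from gensim.models import KeyedVectors

# Hypothetical path to a pretrained 300-d word2vec model.
WORD_EMB = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin', binary=True)

vectors = generate_wordlist_emb('I love you')
print(len(vectors), vectors[0].shape)  # e.g. 3 (300,)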
def get_sentiment(df_headlines):
    ls_titles = df_headlines['title'].tolist()
    project_dir = str(Path.cwd()) + "/finbert/finBERT"
    cl_path = project_dir + "/models/classifier_model/finbert-sentiment"
    model = BertForSequenceClassification.from_pretrained(cl_path,
                                                          cache_dir=None,
                                                          num_labels=3)

    ldf_bert_title_sentiments = [predict(title, model) for title in ls_titles]
    df_bert_title_sentiments = pd.concat(ldf_bert_title_sentiments)
    df_bert_title_sentiments.reset_index(inplace=True, drop=True)
    # use TextBlob to separate the string into sentences, then evaluate their sentiment using finbert
    blob = TextBlob(ls_titles[0])
    for title in ls_titles[1:]:
        blob.sentences.append(Sentence(title))
    ss_titles = pd.Series([sentence.raw for sentence in blob.sentences])
    sf_title_sentiment = pd.Series(
        [sentence.sentiment.polarity for sentence in blob.sentences])
    df_textblob_title_sentiments = pd.DataFrame()
    df_textblob_title_sentiments['title'] = ss_titles
    df_textblob_title_sentiments[
        'textblob_sentiment_prediction'] = sf_title_sentiment
    i_temp_len = len(df_bert_title_sentiments)

    df_bert_title_sentiments['title'] = df_bert_title_sentiments['sentence']
    del df_bert_title_sentiments['sentence']

    df_bert_title_sentiments = df_bert_title_sentiments.merge(
        df_textblob_title_sentiments, on='title', how='inner')
    print(
        str((len(df_bert_title_sentiments) / i_temp_len) * 100) +
        "% of BERT/TextBlob sentiments merged!")
    print("Tuned BERT model complete! " + str(len(df_bert_title_sentiments)) +
          " financial headlines have been processed successfully!")
    print('Average headline sentiment is %.2f.' %
          df_bert_title_sentiments.sentiment_score.mean())

    return df_bert_title_sentiments
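A usage sketch for get_sentiment, assuming the finBERT checkpoint exists at the path built inside the function and that predict comes from the finBERT project (both are external to this snippet):

import pandas as pd

# Hypothetical input: a DataFrame with a 'title' column of headlines.
df_headlines = pd.DataFrame({'title': [
    'Shares rally after strong quarterly earnings.',
    'Regulator opens probe into accounting practices.',
]})
df_scores = get_sentiment(df_headlines)
print(df_scores[['title', 'sentiment_score',
                 'textblob_sentiment_prediction']].head())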
Example #7
def create_pandas_dataframe_from_text_par(texts_dic, selectedTerms, ndici,
                                          titlename):
    dfst = pd.DataFrame(
        columns=["%s selected terms" % titlename, "Frequencies"])
    dflines = pd.DataFrame(columns=[
        "start", 'end', 'sentence_length', 'sentence', 'narrator',
        'protagonists', '#_of_protagonists', 'polarity', 'subjectivity'
    ])
    u = 1

    selectedTermsDic = {}

    selectedTermsDics = Counter()
    occurlist = []
    coccurlist = []
    occurdict = Counter()
    # all_sents=blobbook.sentences
    sec_prot = nx.MultiGraph()
    uu = 0
    # for sen in all_sents:
    #     dd=sen.dict
    # walk every narrator's sentences, recording polarity/subjectivity and
    # which selected terms (protagonists) co-occur with each narrator
    for actor in texts_dic:
        for countr, sent in texts_dic[actor].items():
            sen = Sentence(sent)
            dd = sen.dict
            ssdd = [i for i in dd['noun_phrases'] if i in selectedTerms]
            # for ssdi in issdd:
            # print ssdi,actor,issdd
            # ssdd=[actor,ssdi]
            # ssdd.append(actor)
            nssdd = list(set([ndici[i] for i in ssdd]))
            narrat = ndici[actor]
            selectedTermsDics[narrat] += 1
            # print dd['start_index']+countr,dd['end_index']+countr,dd['end_index']-dd['start_index'],dd['raw'],nssdd,len(nssdd),dd['polarity'],dd['subjectivity']
            # print uu

            dflines.loc[uu] = [
                dd['start_index'] + countr, dd['end_index'] + countr,
                dd['end_index'] - dd['start_index'], dd['raw'], narrat, nssdd,
                len(nssdd), dd['polarity'], dd['subjectivity']
            ]

            if len(nssdd) > 0:
                for j in nssdd:
                    selectedTermsDics[j] += 1
                    coccurlist.append([[narrat, j], dd['polarity'],
                                       dd['subjectivity']])
                    occurlist.append([(narrat, j), dd['polarity'],
                                      dd['subjectivity']])
                    sec_prot.add_edge(uu, j)
                sec_prot.add_node(uu,
                                  polarity=dd['polarity'],
                                  subjectivity=dd['subjectivity'])

            # if len(ssdd)==2:
            #     coccurlist.append([[ndici[ssdd[0]],ndici[ssdd[1]]],dd['polarity'],dd['subjectivity']])
            #     occurlist.append([tuple(sorted([ndici[ssdd[0]],ndici[ssdd[1]]])),dd['polarity'],dd['subjectivity']])
            #     for jk in nssdd:
            #         sec_prot.add_edge(uu,jk)
            #     sec_prot.add_node(uu,polarity=dd['polarity'],subjectivity=dd['subjectivity'])

            # elif len(ssdd)>2:
            #     for jj in it.combinations(ssdd,2):
            #         occurlist.append([tuple(sorted([ndici[jj[0]],ndici[jj[1]]])),dd['polarity'],dd['subjectivity']])
            #         coccurlist.append([[ndici[jj[0]],ndici[jj[1]]],dd['polarity'],dd['subjectivity']])
            #     for jk in nssdd:
            #         sec_prot.add_edge(uu,jk)
            #     sec_prot.add_node(uu,polarity=dd['polarity'],subjectivity=dd['subjectivity'])
            uu += 1

    for i in occurlist:
        occurdict[i[0]] += 1

    u = 0
    for l, v in selectedTermsDics.items():
        dfst.loc[u] = [l, v]
        u += 1
    return dfst, sec_prot, coccurlist, occurlist, dflines
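The expected input format is not documented in the example; the sketch below shows one plausible shape for texts_dic (per-narrator sentences keyed by character offset), selectedTerms (noun phrases of interest), and ndici (a name-normalisation map covering both narrators and terms). All sample values are hypothetical:

# Hypothetical inputs for create_pandas_dataframe_from_text_par.
texts_dic = {
    'alice': {0: 'Alice met the white rabbit near the garden.'},
    'rabbit': {44: 'The white rabbit worried about the queen.'},
}
selectedTerms = ['white rabbit', 'queen']
ndici = {'alice': 'Alice', 'rabbit': 'Rabbit',
         'white rabbit': 'Rabbit', 'queen': 'Queen'}

dfst, graph, coccurlist, occurlist, dflines = create_pandas_dataframe_from_text_par(
    texts_dic, selectedTerms, ndici, titlename='Wonderland')
print(dflines[['sentence', 'narrator', 'protagonists', 'polarity']])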
Example #8
def recumpile_sentence(sentence: Sentence) -> List[str]:
    new_tokens = []
    # TODO: determine mood classifier for sentence and add respective emoji
    sentiment_emoji = None
    if decision(0.89):
        sentiment_emoji = get_sentiment_emoji(sentence)

    for token in sentence.tokenize(TweetWordTokenizer()):
        # TODO: this is only for Discord so we don't break tokenization
        if re.match(
                r"@everyone|@here|<:[^:\s]+:[0-9]+>|<a:[^:\s]+:[0-9]+>|<(?:@!?\d+|:[A-Za-z0-9]+:)\w+>",
                token,
        ):
            new_tokens.append(token)
            continue

        emoji = None
        alias_emoji = get_cheap_emoji_alias(token)

        # TODO: refactor into its own mutator
        if decision(0.9) and (re.match("among", token, flags=re.IGNORECASE) or
                              re.match("amogus", token, flags=re.IGNORECASE) or
                              re.match(r"su+s", token, flags=re.IGNORECASE)):
            emoji = "ඞ"

        emoticon = get_emoticon(token)

        if alias_emoji:
            if decision(0.1) or (len(str(token)) == 1 and decision(0.9)):
                new_tokens.append(alias_emoji)
                continue
            else:
                if decision(0.5):
                    new_tokens.append(alias_emoji)

        if decision(0.5):
            emoji = get_emoji_from_data(token)
        if decision(0.3):
            emoji = get_gloveword_emoji(token)
        if emoji:
            if decision(0.5):
                new_tokens.append(emoji)

        if decision(random_synonym_probability):
            token = replace_with_random_synonym(token)
        if decision(0.5) and profanity.contains_profanity(token):
            token = token.upper()
        if decision(censor_profanity_probability
                    ) and profanity.contains_profanity(token):
            if decision(0.1):
                token = custom_censoring(token, 1)
            else:
                token = custom_censoring(token, censor_profanity_percent)
        elif decision(random_censor_probability):
            token = custom_censoring(token, random_censor_percent)

        if re.match("musk", token, flags=re.IGNORECASE):
            add_husky = True
        else:
            add_husky = False

        # processing
        recumpiled_token = recumpile_token(token)

        # post processing
        new_tokens.append(recumpiled_token)

        if emoji:
            if decision(0.8):
                new_tokens.append(emoji)
        if alias_emoji:
            if decision(0.8):
                new_tokens.append(alias_emoji)
        if emoticon:
            if decision(0.8):
                new_tokens.append(emoticon)

        if add_husky:
            new_tokens.append(recumpile_token("husky"))

        if add_random_garbage and decision(add_random_garbage_probability):
            new_tokens.append(recumpile_token(add_random_garbage_token()))
        if add_randomly_text_face_emoji and decision(
                add_randomly_text_face_emoji_probability):
            new_tokens.append(get_random_text_face_emojis())
        if add_random_simple_text_emoji and decision(
                # TODO: use textblob to determine mood of text and insert faces
                #  accordingly likely need to do this after reconstruction of the
                #  text blob and go through this sentence by sentence rather than
                #  word by word.
                add_random_simple_text_emoji_probability):
            new_tokens.append(get_random_simple_text_emojis())
        if add_random_rp_action and decision(
                add_random_rp_mid_sentence_action_probability):
            new_tokens.append(get_random_rp_action_sentence())
    if add_random_rp_action and decision(
            add_random_rp_end_sentence_action_probability):
        new_tokens.append(get_random_rp_action_sentence())

    if sentiment_emoji:
        new_tokens.append(sentiment_emoji)
        if decision(0.4):
            for i in range(5):
                if decision(0.3):
                    new_tokens.append(sentiment_emoji)
                else:
                    break

    return new_tokens
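recumpile_sentence depends on a number of mutator helpers from the same project (decision, recumpile_token, the emoji lookups, and the add_random_* settings); given those, it would typically be driven sentence by sentence from a TextBlob:

from textblob import TextBlob

blob = TextBlob("This crossover is kind of sus, honestly. Musk agreed.")
for sentence in blob.sentences:
    # Each call returns a list of mangled tokens for one sentence.
    print(" ".join(recumpile_sentence(sentence)))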
Example #9
def get_features(features, operation='train'):
    row = features.shape[0]
    operation = operation + '.den'
    phrase_vectors1 = translate(features[:, 0].astype(str), table=translator)
    phrase_vectors2 = translate(features[:, 1].astype(str), table=translator)

    filename = os.path.join(dir_path, 'data', 'sentiment_vectors_' + operation)
    if not os.path.exists(filename):
        sentiment_vector1 = np.array([
            Sentence(each).polarity for each in phrase_vectors1
        ]).reshape(row, 1)
        sentiment_vector2 = np.array([
            Sentence(each).polarity for each in phrase_vectors2
        ]).reshape(row, 1)
        with open(filename, 'wb') as f:
            pickle.dump(sentiment_vector1, f)
            pickle.dump(sentiment_vector2, f)
    else:
        with open(filename, 'rb') as f:
            sentiment_vector1 = pickle.load(f)
            sentiment_vector2 = pickle.load(f)

    filename = os.path.join(dir_path, 'data',
                            'subjective_vectors_' + operation)
    if not os.path.exists(filename):
        subjective_vectors1 = np.array([
            Sentence(each).subjectivity for each in phrase_vectors1
        ]).reshape(row, 1)
        subjective_vectors2 = np.array([
            Sentence(each).subjectivity for each in phrase_vectors2
        ]).reshape(row, 1)
        with open(filename, 'wb') as f:
            pickle.dump(subjective_vectors1, f)
            pickle.dump(subjective_vectors2, f)
    else:
        with open(filename, 'rb') as f:
            subjective_vectors1 = pickle.load(f)
            subjective_vectors2 = pickle.load(f)

    filename = os.path.join(dir_path, 'data', 'shared_tokens_' + operation)
    if not os.path.exists(filename):
        shared_token_vector = [0] * len(phrase_vectors1)
        english_stopwords = set(stopwords.words('english'))
        current_index = 0
        for each_question1, each_question2 in zip(phrase_vectors1,
                                                  phrase_vectors2):
            shared_tokens = 0
            for each_word1 in each_question1.split(' '):
                if each_word1 not in english_stopwords:
                    for each_word2 in each_question2.split(' '):
                        try:
                            wv1 = word_model[each_word1].reshape(1, -1)
                            wv2 = word_model[each_word2].reshape(1, -1)
                            if each_word2 not in english_stopwords \
                                    and cosine_similarity(wv1, wv2)[0][0] >= 0.6:
                                shared_tokens += 1
                        except Exception:
                            # word missing from the embedding vocabulary:
                            # fall back to an exact string match
                            if each_word1 == each_word2:
                                shared_tokens += 1
            shared_token_vector[current_index] = shared_tokens
            current_index += 1

        with open(filename, 'wb') as f:
            pickle.dump(shared_token_vector, f)
    else:
        with open(filename, 'rb') as f:
            shared_token_vector = pickle.load(f)

    filename = os.path.join(dir_path, 'data',
                            'fuzzy_wuzzy_partial_ratio_' + operation)
    if not os.path.exists(filename):
        partial_ratio_vector1 = get_fuzzy_partial_vector(
            phrase_vectors1).reshape(row, 1)
        partial_ratio_vector2 = get_fuzzy_partial_vector(
            phrase_vectors2).reshape(row, 1)
        with open(filename, 'wb') as f:
            pickle.dump(partial_ratio_vector1, f)
            pickle.dump(partial_ratio_vector2, f)
    else:
        with open(filename, 'rb') as f:
            partial_ratio_vector1 = pickle.load(f)
            partial_ratio_vector2 = pickle.load(f)

    filename = os.path.join(dir_path, 'data',
                            'raw_phrase_vectors_' + operation)
    if not os.path.exists(filename):
        phrase_vectors1 = np.vectorize(get_phrase_vector_obj)(phrase_vectors1)
        phrase_vectors2 = np.vectorize(get_phrase_vector_obj)(phrase_vectors2)
        with open(filename, 'wb') as f:
            pickle.dump(phrase_vectors1, f)
            pickle.dump(phrase_vectors2, f)
    else:
        with open(filename, 'rb') as f:
            phrase_vectors1 = pickle.load(f)
            phrase_vectors2 = pickle.load(f)

    filename = os.path.join(dir_path, 'data',
                            'cosine_similarity_vector_' + operation)
    if not os.path.exists(filename):
        cosine_similarity_vector = get_cosine_similarity_vector(
            phrase_vectors1, phrase_vectors2).reshape(row, 1)
        with open(filename, 'wb') as f:
            pickle.dump(cosine_similarity_vector, f)
    else:
        with open(filename, 'rb') as f:
            cosine_similarity_vector = pickle.load(f)

    filename = os.path.join(dir_path, 'data',
                            'processed_phrase_vectors_' + operation)
    if not os.path.exists(filename):
        phrase_vectors1 = get_phrase_vector(phrase_vectors1).reshape(row, 300)
        phrase_vectors2 = get_phrase_vector(phrase_vectors2).reshape(row, 300)
        with open(filename, 'wb') as f:
            pickle.dump(phrase_vectors1, f)
            pickle.dump(phrase_vectors2, f)
    else:
        with open(filename, 'rb') as f:
            phrase_vectors1 = pickle.load(f)
            phrase_vectors2 = pickle.load(f)

    features = np.concatenate(
        (np.array(shared_token_vector).reshape(row, 1),
         cosine_similarity_vector, partial_ratio_vector1,
         partial_ratio_vector2, subjective_vectors1, subjective_vectors2,
         sentiment_vector1, sentiment_vector2, phrase_vectors1,
         phrase_vectors2),
        axis=1)
    return features
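With the blocks above, the returned matrix stacks 8 scalar columns (shared tokens, cosine similarity, two partial ratios, two subjectivity scores, two polarity scores) plus two 300-d phrase vectors per question pair. A quick shape check, assuming the module's translator, word_model, and data directory are already set up:

import numpy as np

pairs = np.array([
    ['How do I learn Python?', 'What is the best way to learn Python?'],
    ['Is the earth flat?', 'Why is the sky blue?'],
])
X = get_features(pairs, operation='toy')
print(X.shape)  # expected (2, 608): 8 scalar features + 2 * 300-d vectors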
Example #10
    def text(self, text):

        try:
            # check for text, transform to unicode if necessary
            if text is not None:
                if not (type(text) is unicode or type(text) is str):
                    raise TypeError(
                        "supplied text object of type that is not str or unicode"
                    )
                else:
                    if not type(text) is unicode:
                        text = text.decode('utf8')

                    # unicode?  unicode.
                    blob = TextBlob(text)
                    blob_nonalpha_thresh = self.nonalpha_thresh(blob)
            else:
                raise ValueError("no input text supplied")

            # replace all instances of sentences broken by newline
            # to ensure that we're dealing with contiguous text

            s1_text = re.sub(r'([a-z\,]+)[\n\r]+?([^A-Z0-9]+?)', r'\1 \2',
                             text)
            s1_list = list()

            # try to remove header-type section labels through the use of some convoluted
            # rule bs.  really not elegant, but it works.

            stopwords = sw.words("english")
            for sentence in s1_text.split('\n'):
                # line begins or ends with a number - maybe ToC or heading
                if re.match('(?:^[0-9]+?|[0-9]+?[\n\r]+?$)', sentence):
                    continue
                words = sentence.split()
                if len(words) <= 3:
                    continue
                for word in words:
                    # boring word
                    if word.lower() in stopwords:
                        continue
                    # not a word - unicode bullets or other nonsense
                    if not re.match(r'\w+?', word):
                        continue
                    # links
                    if re.match(r'^[a-zA-Z]+\:\/\/', word):
                        continue
                    # no title case headings
                    if not re.match('^[0-9A-Z]{1}', word):
                        s1_list.append(sentence)
                        break

            # let's clear out anything with a nonalpha token ratio higher than the threshold

            s2_list = [
                s for s in s1_list
                if self.nonalpha_pct(Sentence(s)) < (len(s) / 4)
            ]

            # now that we've got a semi-clean set of data, we can do some statistical analysis
            # to determine if we've got a lot of repeat data like headers/footers/copyrights
            # that can skew our keyword stats

            sentence_counts = Counter(s2_list)
            sc_series = [v for (k, v) in sentence_counts.iteritems()]
            sc_std = np.std(sc_series)
            sc_median = np.median(sc_series)

            # if we have repeating text, rebuilt it minus that noise, or anything
            # specified in the global blacklist

            if sc_median >= 1:
                final_list = []

                # some edge cases force us to break outlier "sentences" into smaller units
                # for comparison later
                #
                # once the list is built, we have to check a few different ways to ensure
                # we are removing all the noise we can

                sentence_outliers = [
                    k.strip().lower()
                    for (k, v) in sentence_counts.iteritems()
                    if v >= (sc_median + (sc_std * 2)) > 1
                ]
                self.global_filterlist += sentence_outliers
                for s in s2_list:
                    if s.lower() in self.global_filterlist:
                        continue
                    for o in sentence_outliers:
                        if distance(o, s.lower()) < float(len(s) * .35):
                            break
                        elif o in s.lower():
                            break
                    else:
                        final_list.append(s)

            # text had no repeats or noise to filter (rare)
            else:
                final_list = s2_list

            # we out
            return " ".join(final_list)

        except Exception as e:
            raise e
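The header/footer filter above flags a sentence as noise when its repeat count reaches median + 2·std of all sentence counts; a small standalone illustration of that threshold (Python 3, independent of the class):

import numpy as np
from collections import Counter

sentences = ['Page header'] * 12 + ['Body sentence one.', 'Body sentence two.']
counts = Counter(sentences)
series = list(counts.values())
threshold = np.median(series) + 2 * np.std(series)
outliers = [s for s, c in counts.items() if c >= threshold > 1]
print(outliers)  # ['Page header'] -- repeated boilerplate flagged as noise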
Example #11

num_sentences = 0
num_documents = 0

sentence_text_dict = {}

all_sentences = []
sentence_to_docid = {}

# Read the data and count words / sentences
for inputTextFile in listOfFiles:
    print(inputTextFile)
    with open(inputTextFile, 'r', encoding='utf-8') as content_file:
        csvReader = csv.reader(content_file)
        sentences = [Sentence(sentenceText) for row in csvReader for sentenceText in row]
        all_sentences += sentences
        for sentence in sentences:
            sentence_text_dict[num_sentences] = sentence
            sentence_to_docid[num_sentences] = num_documents
            num_sentences += 1
            for word in sentence.words:
                string = word.encode("utf-8")
                get_word_id(string)
        num_documents += 1


print(all_sentences)
# Generate tf.idf scores and fill in the sparse matrix
tfidfMatrix = scipy.sparse.dok_matrix((num_sentences, nextWordId))
for sentence_id in sentence_text_dict:
Example #12
blob.sentiment.polarity

blob.sentiment.subjectivity

# Getting the Sentiment of a Sentence 
for sentence in blob.sentences:
    print(sentence.sentiment)


# Section 12.2.5 Self Check snippets

# Exercise 1
from textblob import Sentence

Sentence('The food is not good.').sentiment

Sentence('The movie was not bad.').sentiment

Sentence('The movie was excellent!').sentiment


# Section 12.2.6 snippets
from textblob.sentiments import NaiveBayesAnalyzer

blob = TextBlob(text, analyzer=NaiveBayesAnalyzer())

blob

blob.sentiment
def sentim_sent(stri):
    try:
        tt = Sentence(stri).sentiment
    except Exception:
        tt = (None, None)
    return tt
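A quick usage sketch for sentim_sent (exact polarity/subjectivity values depend on the TextBlob version, so the expected outputs below are approximate):

print(sentim_sent('The movie was excellent!'))  # roughly Sentiment(polarity=1.0, subjectivity=1.0)
print(sentim_sent(None))                        # (None, None) -- the fallback branch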