Example no. 1
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def stopwords_round2(text):
    # ENGLISH_STOP_WORDS must come from sklearn, not from the `text` argument (which shadows the module)
    stop_words = ENGLISH_STOP_WORDS.union(add_stop_words)

    tweet_token_list = [word for word in text.split(' ')
                        if word not in stop_words]  # remove stopwords
    tweet = ' '.join(tweet_token_list)
    return tweet
def combine_words(text, dictionary):
    '''
    Takes in text that has already been lowercased but not stemmed or lemmatized.
    Also takes in a custom dictionary for the texts.

    Combines words that should be analyzed together, e.g. 'national monuments',
    and rejoins the combined token to the token list so it can be vectorized.

    Returns a list of tokens.
    '''
    temp_list = []
    text_list = text.split()
    text_list = [word.replace('diversity', 'diverse') for word in text_list]
    for e, word in enumerate(text_list[:-1]):  # pair every word with the one that follows it
        next_word = text_list[e + 1]
        try:
            for value in dictionary[word]:
                if value in next_word:
                    text_list.append(word + '_' + next_word)
                    temp_list.append(word)
                    temp_list.append(next_word)
        except KeyError:
            pass
    for w in temp_list:
        text_list.remove(w)
    return text_list
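A small usage sketch for combine_words; the phrase dictionary below is hypothetical and maps a leading word to fragments of the word that may follow it:

# Hypothetical phrase dictionary: key = first word, values = fragments of an allowed next word.
phrase_dict = {'national': ['monument', 'park']}
tokens = combine_words('protect the national monuments and parks', phrase_dict)
print(tokens)  # the pair becomes one token, e.g. ['protect', 'the', 'and', 'parks', 'national_monuments']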
Example no. 3
def importance_scores(text, classifier_func, oov="OUT_OF_VOCABULARY"):
  # First, establish baseline prediction on original text.
  probs = classifier_func(text)
  print(probs)
  # Do note that if there are multiple max probabilities, argmax returns the first one
  max_class = np.argmax(probs)
  # Then, iterate over each word in the document, replacing it with oov and comparing the probability
  words = text.split()
  rvals = list()
  for word_id, word in enumerate(words):
    # Replace word with oov, then flatten list back into a string separated by spaces
    new_text = ' '.join(words[:word_id] + [oov] + words[(word_id+1):])
    new_probs = classifier_func(new_text)
    print(new_probs)
    # Compare probs together
    importance = 0
    new_max = np.argmax(new_probs)
    if max_class == new_max:
      importance = (probs[max_class] - new_probs[max_class])
      print("Same max class, difference in importance;",importance)
    else:
      importance = (probs[max_class] - new_probs[max_class]) + (new_probs[new_max] - probs[new_max])
      print("Different max class, sum of the importance differences;",importance)
    rvals.append(importance)
  return rvals
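A minimal sketch of driving importance_scores with a stand-in classifier (hypothetical, not a real model); it also supplies the numpy-as-np import the function itself relies on:

import numpy as np

def toy_classifier(text):
    # Stand-in two-class "model": class 1 probability grows with occurrences of "good".
    p = min(text.lower().split().count("good") / 2.0, 1.0)
    return np.array([1.0 - p, p])

scores = importance_scores("a good and good movie", toy_classifier)
# One value per word: how much the winning class's probability drops when
# that word is replaced by "OUT_OF_VOCABULARY".
print(scores)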
def transform_review_text(text):
    text = text.lower()
    text = re.sub('[^a-z ]', '', text)
    text_array = [
        stemmer.stem(word) for word in text.split() if word not in stopwords
    ]
    return ' '.join(text_array)
Example no. 5
def cloak_transposition(text, delta=1.0):
    # This startup process could probably be generalized, using a function handle instead of the length check.

    # Split the message into words.
    words = text.split()
    # We can only transpose letters in words with at least four letters, to avoid changing the start and end.
    # Find all those words.
    longword_indexes = list()
    for i in range(len(words)):
        if len(words[i]) >= 4:
            longword_indexes.append(i)
    # Determine how many of these eligible words we're supposed to modify.
    num_replace = math.ceil(delta * len(longword_indexes))
    # Pick that many words from our eligible words.
    replace = random.sample(longword_indexes, num_replace)
    # Iterate over our chosen words, and fiddle with them.
    for index in replace:
        word = words[index]
        # Adjust the word.
        letters = list(word)
        # For now, pick a random letter that isn't at the ends, and switch it with an adjacent letter
        ind = random.randint(1,len(letters)-3)
        temp = letters[ind]
        letters[ind] = letters[ind+1]
        letters[ind+1] = temp       
        new_word = ''.join(letters)
        # Put the word back.
        words[index] = new_word
    # simplistically we can just merge back with any whitespace, but ideally we would keep the whitespace
    return ' '.join(words)
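A quick sketch of calling cloak_transposition; it supplies the math and random imports the function depends on and seeds the RNG so the demo is repeatable:

import math
import random

random.seed(0)  # make the letter swaps repeatable for the demo
print(cloak_transposition("please keep this message mostly readable", delta=0.5))
# Half of the eligible words (length >= 4) get one pair of inner letters
# swapped, e.g. "message" -> "messgae".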
Example no. 6
 def build_vocab(self, texts):
     print('building vocab...')
     wordcnt = {}
     for text in tqdm(texts):
         unigrams = text.split()
         unigram_num = len(unigrams)
         for word in unigrams:
             if word not in stopwords:
                 if word in wordcnt.keys():
                     wordcnt[word] += 1
                 else:
                     wordcnt[word] = 1
         for n in [2, 3]:
             for i in range(unigram_num):
                 if unigram_num <= i + n - 1:
                     break
                 ngram = unigrams[i:i + n]
                 if not filter_ngram(ngram):
                     ngram = " ".join(ngram)
                     if ngram in wordcnt.keys():
                         wordcnt[ngram] += 1
                     else:
                         wordcnt[ngram] = 1
     vocab = {'[UNK]': 0}
     i = 1
     for word, cnt in wordcnt.items():
         if cnt >= filter_freq:
             vocab[word] = i
             i += 1
     self.vocab = vocab
     self.vocab_size = i
     print('vocab size:', self.vocab_size)
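build_vocab (and encode below, Example no. 23) lean on three module-level names that the snippets do not show: stopwords, filter_freq and filter_ngram. A minimal sketch of what they might look like; these are assumptions, not the original project's definitions:

# Hypothetical supporting globals for build_vocab / encode.
stopwords = {"the", "a", "an", "of", "and", "or", "to", "in"}
filter_freq = 2  # keep only tokens/ngrams seen at least this many times

def filter_ngram(ngram):
    # True means "filter this n-gram out": here, when every word in it is a stopword.
    return all(word in stopwords for word in ngram)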
Example no. 7
def remove_extra_whitespace(text):
    '''
    Input: "aslsj       alksdla    asdmda    askldalk"
    Output: "aslsj alksdla asdmda askldalk"
    '''
    #return re.sub(' +', ' ', text)
    return ' '.join([ele for ele in text.split(' ') if len(ele) > 0])
Example no. 8
 def clean_text(self):
     text = re.sub(r'[^a-zA-Z]', ' ', str(self.doc))  # A-Z, not A-z: the latter also matches chars like [ ] ^ _
     text = re.sub(r'\[.*?\]', ' ', text)
     text = re.sub(r'\d', ' ', text)
     text = " ".join(text.split())
     text = text.lower()
     return text
Example no. 9
def wordnet_lemmetize_tokenize(text):
    '''
    Custom tokenizer object that applies WordNetLemmatizer
    Intended to be passed into CountVectorizer as a tokenizer object
    '''
    lemmatizer = WordNetLemmatizer()
    words = text.split()

    # additional lemmatization terms
    additional_lemmatize_dict = {
        "cancelled": "cancel",
        "cancellation": "cancel",
        "cancellations": "cancel",
        "delays": "delay",
        "delayed": "delay",
        "baggage": "bag",
        "bags": "bag",
        "luggage": "bag",
        "dms": "dm",
        "thanks": "thank"
    }
    
    tokens = []
    for word in words:
        if word not in sw:
            if word in additional_lemmatize_dict:
                clean_word = additional_lemmatize_dict[word]
            else:
                clean_word = lemmatizer.lemmatize(word)
            tokens.append(clean_word)
    return tokens
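As the docstring says, this tokenizer is meant to be handed to CountVectorizer. A minimal sketch, assuming sw is NLTK's English stop-word set (the original's definition of sw is not shown):

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

# `sw` is the stop-word set the tokenizer expects; NLTK's English list is assumed here
# (requires the 'stopwords' and 'wordnet' NLTK corpora to be downloaded).
sw = set(stopwords.words('english'))

vectorizer = CountVectorizer(tokenizer=wordnet_lemmetize_tokenize)
X = vectorizer.fit_transform(["my flight was delayed and my bags were lost",
                              "thanks for the quick cancellation refund"])
print(sorted(vectorizer.vocabulary_))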
Example no. 10
def simple_stemmer(text):
    '''
    Input: "My system keeps crashing his crashed yesterday, ours crashes daily"
    Output: "My system keep crash hi crash yesterday, our crash daili"
    '''
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text
Example no. 11
def sent_list(docs, splitStr='__label__'):
    sent_analysis = []
    for i in range(1, len(docs)):
        text = str(docs[i])  # read from the docs argument rather than an undefined global
        splitText = text.split(splitStr)
        #print(i)
        secHalf = splitText[1]
        text = secHalf[2:len(secHalf) - 1]
        sentiment = secHalf[0]
        sent_analysis.append([text, sentiment])
    return sent_analysis
Example no. 12
        def checker(
            text,
        ):
            spell_checked_text_words = [
                self.spelchek.correct(
                    word=word,
                )
                for word in text.split()
            ]

            return ' '.join(spell_checked_text_words)
Example no. 13
def tokenize(text):
    """
    先进行 stemming 然后 tokenize
    params:
        text: 一个句子

    return:
        tokens 列表
    """
    text = ' '.join([stemmer.stem(word) for word in text.split(' ')])
    tokens = tokenizer.tokenize(text)

    return tokens
Example no. 14
def getDocumentSentimentList(docs, splitStr='__label__'):
    docSentimentList = []
    for i in range(len(docs)):
        #print('Processing doc ',i,' of ',len(docs))
        text = str(docs[i])  # read from the docs argument rather than an undefined global
        #print(text)
        splitText = text.split(splitStr)
        secHalf = splitText[1]
        text = secHalf[2:len(secHalf) - 1]
        sentiment = secHalf[0]
        #print('First half:',secHalf[0],'\nsecond half:',secHalf[2:len(secHalf)-1])
        docSentimentList.append([text, sentiment])
    print('Done!!')
    return docSentimentList
def sentence_score(score_dict, text):
    sent_dict = defaultdict(int)
    text_list = text.split('.')
    for e, s in enumerate(text_list):
        score = 0
        temp_list = s.split(' ')
        for w in temp_list:
            w = w.lower()
            try:
                score += score_dict[w][0]
            except KeyError:
                continue
        sent_dict['sent{}'.format(e)] = score
    return sent_dict
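A short sketch of calling sentence_score; it supplies the collections.defaultdict import the function needs and uses a tiny hypothetical lexicon whose values are tuples with the score in position 0:

from collections import defaultdict  # required by sentence_score

lexicon = {'good': (2,), 'bad': (-2,), 'great': (3,)}  # hypothetical word scores
print(sentence_score(lexicon, "The food was good. The service was bad. Great view."))
# per-sentence totals: sent0=2, sent1=-2, sent2=3 (and sent3=0 for the empty
# trailing split after the last period)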
Example no. 16
def extract_statements(nlp, company, text):
    """
  Extracting ESG statements from raw text by removing junk, URLs, etc.
  We group consecutive lines into paragraphs and use spacy to parse sentences.
  """
    lines = []
    sentences = []
    # remove non ASCII characters
    text = remove_non_ascii(text)

    prev = ""
    for line in text.split('\n'):
        # aggregate consecutive lines where text may be broken down
        # only if next line starts with a space or previous does not end with dot.
        if (line.startswith(' ') or not prev.endswith('.')):
            prev = prev + ' ' + line
        else:
            # new paragraph
            lines.append(prev)
            prev = line

    # don't forget left-over paragraph
    lines.append(prev)

    # clean paragraphs from extra space, unwanted characters, urls, etc.
    # best effort clean up, consider a more versatile cleaner

    for line in lines:

        # removing header number
        line = re.sub(r'^\s?\d+(.*)$', r'\1', line)
        # removing trailing spaces
        line = line.strip()
        # words may be split between lines, ensure we link them back together
        line = re.sub(r'\s?-\s?', '-', line)
        # remove space prior to punctuation
        line = re.sub(r'\s?([,:;\.])', r'\1', line)
        # ESG contains a lot of figures that are not relevant to grammatical structure
        line = re.sub(r'\d{5,}', r' ', line)
        # remove mentions of URLs
        line = re.sub(
            r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*',
            r' ', line)
        # remove multiple spaces
        line = re.sub(r'\s+', ' ', line)
        # split paragraphs into well defined sentences using spacy
        for part in list(nlp(line).sents):
            sentences.append([company, str(part).strip()])

    return sentences
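A usage sketch for extract_statements; it assumes the spaCy model en_core_web_sm is installed and uses a trivial stand-in for the remove_non_ascii helper that the original defines elsewhere:

import re
import spacy

def remove_non_ascii(text):
    # Stand-in for the original helper: drop anything outside the ASCII range.
    return text.encode("ascii", "ignore").decode("ascii")

nlp = spacy.load("en_core_web_sm")  # requires: python -m spacy download en_core_web_sm
raw = "1 Our company reduced emissions\nby 12% in 2021. We invest in renewable energy."
for company, sentence in extract_statements(nlp, "ACME Corp", raw):
    print(company, "|", sentence)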
def get_hashtags_and_user_mentions(special_characters,
                                   text,
                                   wanted_characters=['#', '@']):
    # Identify hashtags, user mentions and remove urls
    results = {}
    for character in special_characters:
        text = re.sub('(' + re.escape(character) + ')+', ' ' + character, text)
        count_character = text.count(character)
        if count_character > 0:
            while count_character > 0:
                start = text.find(character)
                # the token ends at the nearer of the next space / newline;
                # str.find returns -1 when not found, which must not win the comparison
                ends = [pos for pos in (text.find(" ", start), text.find("\n", start)) if pos != -1]
                end = min(ends) if ends else len(text)
                text_to_remove = text[start:end]
                if len(text_to_remove) > 2:
                    if character in wanted_characters:
                        if character in results.keys():
                            results[character].append(text_to_remove)
                        else:
                            results[character] = [text_to_remove]
                text = text.replace(text_to_remove, "")
                text = ' '.join(text.split())
                count_character = text.count(character)
    for wanted_character in wanted_characters:
        if wanted_character not in results.keys():
            results[wanted_character] = []
    text = text.strip(' ')
    text = ' '.join(text.split())
    results['clean_text'] = text
    return results
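A short usage sketch; special_characters lists every marker to strip from the text, and the hashtags/mentions among them are returned under their own keys:

tweet = "Loving the new release #python #opensource thanks @devteam"
res = get_hashtags_and_user_mentions(['#', '@'], tweet)
print(res['#'])           # e.g. ['#python', '#opensource']
print(res['@'])           # e.g. ['@devteam']
print(res['clean_text'])  # the text with hashtags and mentions removed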
Example no. 18
        def segment_text(
            text,
        ):
            segmented_words = [
                wordsegment.segment(
                    text=word,
                )
                for word in text.split()
            ]
            separated_words = [
                word
                for segments in segmented_words
                for word in segments
            ]

            segmented_text = ' '.join(separated_words)

            return segmented_text
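segment_text relies on the wordsegment package, whose corpus must be loaded once before segment() is used. A minimal sketch:

import wordsegment

wordsegment.load()  # loads the unigram/bigram data; required before segment()
print(wordsegment.segment("thisisatest"))  # -> ['this', 'is', 'a', 'test']
# Inside the class above, segment_text("thisisatest alsothis") would then
# return roughly 'this is a test also this'.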
def corpus_specific_text_cleaning(text):
    """
    For performing corpus specific cleaning. Added to this file, since it needs to be adapted to the corpus and therefore a kind of configuration
    """
    text = text.replace('"full_text" : ', "").strip().replace('"', '').replace(
        '\\n*', ' ').replace('\\', ' ').replace('&amp',
                                                ' ').replace("'ve", ' have')
    text = text.replace("don't",
                        'do not').replace("doesn't", 'does not').replace(
                            "Don't", 'Do not').replace("Doesn't", 'Does not')
    text = text.replace("_NEWLINE_", " ").replace(
        "_CITATION_PREVIOUS_POST_PARAGRAPH",
        " ").replace("_CITATION_PREVIOUS_POST_", " ").replace("_POSTER_", " ")
    no_links = []
    for word in text.split(" "):
        if "//" not in word and "http" not in word and "@" not in word:
            no_links.append(word)
    cleaned_text = " ".join(no_links)
    return cleaned_text
Example no. 20
def clean_text(text, remove_stop_words=False):
    text = text.lower()
    replace_punctuation = str.maketrans(string.punctuation,
                                        ' ' * len(string.punctuation))
    text = text.translate(replace_punctuation)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub('[\n\r]', '', text)

    if remove_stop_words:
        text = text.split()
        new_text = []
        stemmer = PorterStemmer()

        for word in text:
            if word not in STOPWORDS:
                new_text.append(stemmer.stem(word))

        text = ' '.join(new_text)

    return text
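A usage sketch for this clean_text; it assumes STOPWORDS is a set of English stop words (NLTK's list is used here) and that PorterStemmer comes from nltk.stem:

import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

STOPWORDS = set(stopwords.words('english'))  # the global the function expects (assumed)

print(clean_text("Hello, World!\nThis is GREAT..."))
# -> 'hello world this is great '  (punctuation replaced by spaces, lowercased)
print(clean_text("Hello, World!\nThis is GREAT...", remove_stop_words=True))
# -> 'hello world great'  (stop words dropped, remaining words stemmed)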
Example no. 21
def sent_list(docs, splitStr='__label__'):
    sent_analysis = []
    for i in range(1, len(docs)):
        text = str(docs[i])  # read from the docs argument rather than an undefined global
        splitText = text.split(splitStr)
        secHalf = splitText[1]
        sentiment = secHalf[0]
        text = secHalf[2:len(secHalf) - 1].lower()

        # str.translate returns a new string, so assign the result
        table = str.maketrans(' ', ' ', string.punctuation)
        text = text.translate(table)

        if 'www.' in text or 'http:' in text or 'https:' in text or '.com' in text:
            text = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", text)

        text = re.sub(r'\d+', '', text)



        sent_analysis.append([text,sentiment])
    return sent_analysis
Example no. 22
def cloak_replacement(text, select_func, replace_func, delta=1.0):
    # First, identify which words we're going to replace.
    words = text.split()
    replace = list()
    for i in range(len(words)):
        if select_func(words[i]):
            replace.append(i)
    # Then, replace them
    for index in replace:
        if delta < 1.0:  # if we're only doing some elements, check whether we skip this one
            if random.random() >= delta:  # random.rand does not exist in the stdlib random module
                continue
        # We are replacing this word; use the function provided to do it
        word = words[index]
        new_word = replace_func(word)
        # Put the new word back.
        words[index] = new_word
    # Simplistically join back the words. Ideally, we would rejoin using the original whitespace.
    # TODO: reuse original whitespace somehow
    return ' '.join(words)
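A sketch of driving cloak_replacement with simple callables; select_func decides which words are eligible and replace_func produces the substitute (both hypothetical here):

import random

masked = cloak_replacement(
    "the password is hunter2 ok",
    select_func=lambda w: any(ch.isdigit() for ch in w),  # target words containing digits
    replace_func=lambda w: "*" * len(w),                  # mask them with asterisks
    delta=1.0,                                            # replace every eligible word
)
print(masked)  # -> 'the password is ******* ok'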
Example no. 23
 def encode(self, text):
     feature_vec = [0] * self.vocab_size
     valid_ids = []
     unigrams = text.split()
     unigram_num = len(unigrams)
     for word in unigrams:
         if word not in stopwords:
             pos = self.vocab.get(word, 0)
             feature_vec[pos] = 1
             if pos not in valid_ids:
                 valid_ids.append(pos)
     for n in [2, 3]:
         for i in range(unigram_num):
             if unigram_num <= i + n - 1:
                 break
             ngram = unigrams[i:i + n]
             if not filter_ngram(ngram):
                 ngram = " ".join(ngram)
                 pos = self.vocab.get(ngram, 0)
                 feature_vec[pos] = 1
                 if pos not in valid_ids:
                     valid_ids.append(pos)
     return feature_vec, valid_ids
    def clean_text(self, text):
        """
        # Arguments
            text: text body to be preprocessed and cleaned

        # Return
            cleaned text
        """
        # handle non-ascii/special characters: work on the repr of the utf-8 bytes,
        # strip \x../\u.. escape sequences, then remove only the b'...' literal wrapper
        text = str(text.encode("utf-8"))
        text = re.sub(r"\\[ux][a-z0-9]+", " ", text)
        if text.startswith("b'"):
            text = text[2:]  # drop the bytes-literal prefix, not every letter 'b'
        text = text.strip("'").lower()
        text = re.sub(r'[\:\-\(\)\%\d\.\\\/\_\[\]\+\,\#\"]+', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        word_list = text.split(' ')  # tokenization w.r.t space characters
        rel_words = [
            word for word in word_list
            if word not in self.stop and len(word) >= self.min_word_len
        ]  # relevant words
        rel_words_lemm = [
            self.lemmatizer.lemmatize(word, pos='v') for word in rel_words
        ]
        return " ".join(rel_words_lemm)
def tokenizer(text):
    return text.split()
def tokenizer_porter(text):
    return [port.PorterStemmer().stem(word) for word in text.split()]
def tokenizer(text):
    return text.split()
Example no. 28
plt.hist(training_set.target, bins=bins, alpha=0.7)
plt.xlabel('Target output class label')
plt.ylabel('Count of documents')
plt.title('Histogram of documents in each category')
plt.show()

# Cleaning the texts
from sklearn.feature_extraction import text
stop_words = text.ENGLISH_STOP_WORDS
import re
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(len(dataset)):
    text = re.sub('[^a-zA-Z]', ' ', dataset['text'][i])
    text = text.lower()
    text = text.split()
    ps = PorterStemmer()
    text = [ps.stem(word) for word in text if word not in stop_words]
    text = ' '.join(text)
    corpus.append(text)
train_corpus = corpus[:len(train_data)]
test_corpus = corpus[len(train_data):]

# Creating the Bag of Words model (min_df=2)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(min_df=2)
count_matrix_train = cv.fit_transform(train_corpus).toarray()
count_matrix_test = cv.transform(test_corpus).toarray()

from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
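The snippet stops right after constructing the TfidfTransformer; a minimal sketch of the usual next step, reweighting the count matrices before fitting a classifier:

# Fit the IDF weights on the training counts only, then apply them to both splits.
tfidf_matrix_train = tfidf.fit_transform(count_matrix_train).toarray()
tfidf_matrix_test = tfidf.transform(count_matrix_test).toarray()
print(tfidf_matrix_train.shape, tfidf_matrix_test.shape)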
def __split_keyword(text):
    """
    キーワードを区切り、stopwordsを除外する
    """
    keywords = text.split(" ")
    return [keyword for keyword in keywords if __check_stop_word(keyword)]
 def remove_stopword(text):
     return [word for word in text.split() if word not in stop_words]
Example no. 31
def strips(dataset):
    for id_, (query, text) in dataset.items():
        query = query.split(" ")
        text = text.split(" ")
        id_ = id_
        yield id_, query, text
def tokenizer_porter(text):
    return [port.PorterStemmer().stem(word) for word in text.split()]