def contractions_fun(self):
    """
    Expands contractions in the document. With the 'mapping' method, each word is
    looked up in a contraction dictionary and, if present, replaced by its value
    from the dictionary; the 'word2vec' and 'glove' methods expand contractions
    with pycontractions using a pretrained embedding model.
    """
    if self.contraction_method == 'mapping':
        self.doc = self.mapping_decontraction(str(self.doc))
    elif self.contraction_method == 'word2vec':
        model = pretrained_model
        cont = Contractions(model)
        cont.load_models()
        self.doc = list(cont.expand_texts([str(self.doc)], precise=True))[0]
    elif self.contraction_method == 'glove':
        model = api.load("glove-twitter-25")
        cont = Contractions(kv_model=model)
        cont.load_models()
        self.doc = list(cont.expand_texts([str(self.doc)], precise=True))[0]
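# The 'mapping' branch above relies on a mapping_decontraction helper that is not
# shown in this snippet. The following is only a minimal sketch of what such a
# dictionary-based expander could look like; CONTRACTION_MAP and the regex approach
# are assumptions, not the original implementation.
import re

CONTRACTION_MAP = {"can't": "cannot", "won't": "will not", "it's": "it is"}

def mapping_decontraction(text):
    # Replace each contraction found in the map with its expanded value
    pattern = re.compile(r'\b(' + '|'.join(map(re.escape, CONTRACTION_MAP)) + r')\b',
                         re.IGNORECASE)
    return pattern.sub(lambda m: CONTRACTION_MAP.get(m.group(0).lower(), m.group(0)), text)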
class ContractionsExpander(TransformerMixin):
    def __init__(self, kv_model=None, api_key='glove-twitter-100', precise=False):
        self.kv_model = kv_model
        self.api_key = api_key
        self.precise = precise
        if api_key:
            self.contractions = Contractions(api_key=api_key)
        else:
            self.contractions = Contractions(kv_model=kv_model)
        self.contractions.load_models()

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return self.contractions.expand_texts(X)
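# A minimal usage sketch for the transformer above, assuming scikit-learn and a
# downstream TF-IDF step; the pipeline layout and sample texts are illustrative only,
# and the glove-twitter-100 embedding is downloaded on first use.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

pipeline = Pipeline([
    ('expand', ContractionsExpander(api_key='glove-twitter-100')),
    ('tfidf', TfidfVectorizer()),
])
features = pipeline.fit_transform(["I can't go", "they're late"])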
def preprocessing_text(df, series):
    # removing patterns (usernames & links to websites)
    pattern_list = ['@[\w]*', r'http\S+']
    remove_patterns(df, series, pattern_list)

    # remove hashtag from clean text
    df[series] = remove_hashtags(df[series])

    # de-emojizing
    df[series] = df[series].apply(lambda x: emoji.demojize(x))

    # remove characters repeated more than 2 times
    df[series] = df[series].apply(lambda x: ReplaceThreeOrMore(x))

    # remove html characters
    char_list = ['&', '\n', 'á', '<', '>']
    remove_chars(df, series, char_list)

    # handle contractions
    cont = Contractions(api_key="glove-twitter-100")
    df[series] = df[series].apply(lambda x: list(cont.expand_texts([x])))
    df[series] = df[series].apply(lambda x: str(x))

    # removing numbers
    df[series] = df[series].apply(
        lambda x: ''.join([i for i in x if not i.isdigit()]))

    # remove punctuation
    df[series] = df[series].str.replace('[^\w\s]', ' ')

    # set to lowercase
    df[series] = df[series].apply(
        lambda x: " ".join(x.lower() for x in x.split()))

    # lemmatization
    df[series] = df[series].apply(lambda x: str(x))
    df[series] = df[series].apply(
        lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

    return df
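# preprocessing_text relies on remove_patterns, remove_hashtags and remove_chars
# helpers defined elsewhere in the project. The sketch below is an assumption about
# what they likely do (regex-based cleaning of the pandas text column), not the
# original implementation.
import re

def remove_patterns(df, series, pattern_list):
    # Strip every regex pattern (e.g. usernames, URLs) from the text column
    for pattern in pattern_list:
        df[series] = df[series].str.replace(pattern, '', regex=True)

def remove_chars(df, series, char_list):
    # Remove literal characters / HTML remnants such as '&' or newlines
    for char in char_list:
        df[series] = df[series].str.replace(char, ' ', regex=False)

def remove_hashtags(series):
    # Drop the leading '#' but keep the hashtag word itself
    return series.str.replace(r'#(\w+)', r'\1', regex=True)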
class ContractionExpander(TextProcessingBaseClass):
    '''
    Removes contractions from the text and uses the full version instead (unification).

    Example: I'll walk down the road --> I will walk down the road
    '''

    model_contraction_expander = None

    def __init__(self, model=None):
        '''
        :param model: Pretrained word embedding model.
        '''
        super().__init__()
        if model is None:
            # If no model is given, use the default one and store it as a static
            # class variable to avoid multiple loadings
            if ContractionExpander.model_contraction_expander is None:
                model_path = os.path.join(NLP_MODELS_PATH, 'pretrained', 'word_embeddings',
                                          'pubmed2018_w2v_400D', 'pubmed2018_w2v_400D.bin')
                ContractionExpander.model_contraction_expander = \
                    gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)
            model = ContractionExpander.model_contraction_expander

        self.cont = Contractions(kv_model=model)

    def level(self) -> str:
        return "text"

    def _process_internal(self, text: str) -> str:
        '''
        :param text: Input string.
        :return: The string without contractions.
        '''
        return list(self.cont.expand_texts([text], precise=True))[0]
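# Illustrative only: constructing the expander with a different gensim KeyedVectors
# model instead of the default PubMed vectors. The gensim downloader call and the
# direct use of _process_internal are assumptions for demonstration, not part of the
# original pipeline.
import gensim.downloader as api

glove = api.load("glove-twitter-25")
expander = ContractionExpander(model=glove)
print(expander._process_internal("I'll walk down the road"))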
def train_model() -> None:
    train_data = fetch_data.fetch_imdb_train_data()

    cont = Contractions(constants.CONTRACTIONS_BIN_FILE)
    cont.load_models()

    for index, row in train_data.iterrows():
        # Strip HTML and expand contractions, writing the result back to the frame
        # (assigning to row.review alone would not modify train_data).
        review = BeautifulSoup(row.review, features="html.parser").get_text()
        train_data.at[index, 'review'] = list(cont.expand_texts([review], precise=True))[0]

    train_data.review = clean_reviews(train_data.review)

    reviews = list(tokenize_sentences(train_data.review))
    labels = list(train_data.sentiment)

    tokenizer = Tokenizer(num_words=constants.MAX_NB_WORDS)
    tokenizer.fit_on_texts(train_data.review)

    data = np.zeros((len(train_data.review), constants.MAX_SENTS, constants.MAX_SENT_LENGTH),
                    dtype='float32')
    words = list()

    for i, sentences in enumerate(reviews):
        for j, sent in enumerate(sentences):
            if j < constants.MAX_SENTS:
                wordTokens = text_to_word_sequence(sent)
                k = 0
                for _, word in enumerate(wordTokens):
                    if k < constants.MAX_SENT_LENGTH and tokenizer.word_index[word] < constants.MAX_NB_WORDS:
                        data[i, j, k] = tokenizer.word_index[word]
                        k = k + 1
                words.append(wordTokens)

    word_index = tokenizer.word_index
    print('Total %s unique tokens.' % len(word_index))

    labels = to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    wordSkipGramModel = gensim.models.Word2Vec(words, min_count=5, size=constants.EMBEDDING_DIM,
                                               window=4, sg=1)

    word_embedding_matrix = np.random.random((len(word_index) + 1, constants.EMBEDDING_DIM))
    for word, i in word_index.items():
        try:
            word_embedding_vector = wordSkipGramModel.wv.get_vector(word)
        except KeyError:
            # words not found in the skip-gram vocabulary keep their random initialization
            continue
        if word_embedding_vector is not None:
            word_embedding_matrix[i] = word_embedding_vector

    embedding_layer = Embedding(len(word_index) + 1, constants.EMBEDDING_DIM,
                                weights=[word_embedding_matrix],
                                input_length=constants.MAX_SENT_LENGTH, trainable=True)

    # Sentence-level encoder with attention
    sentence_input = Input(shape=(constants.MAX_SENT_LENGTH,), dtype='float32')
    embedded_sequences = embedding_layer(sentence_input)
    sentence_lstm = Bidirectional(LSTM(200, return_sequences=True))(embedded_sequences)
    l_dropout = Dropout(0.5)(sentence_lstm)
    l_dense = TimeDistributed(Dense(400))(l_dropout)
    l_att = attention_layer.AttLayer()(l_dense)
    l_dropout_1 = Dropout(0.4)(l_att)
    sentEncoder = Model(sentence_input, l_dropout_1)

    # Review-level encoder with attention
    review_input = Input(shape=(constants.MAX_SENTS, constants.MAX_SENT_LENGTH), dtype='float64')
    review_encoder = TimeDistributed(sentEncoder)(review_input)
    review_dropout = Dropout(0.3)(review_encoder)
    l_lstm_review = Bidirectional(LSTM(100, return_sequences=True))(review_dropout)
    l_att_dropout_review = Dropout(0.2)(l_lstm_review)
    l_dense_review = TimeDistributed(Dense(200))(l_att_dropout_review)
    l_dropout_review = Dropout(0.2)(l_dense_review)
    l_att_review = attention_layer.AttLayer()(l_dropout_review)
    preds = Dense(2, activation='softmax')(l_att_review)

    model = Model(review_input, preds)
    adam = Adam(lr=0.0001)
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
    model.fit(data, labels, validation_split=0.2, epochs=10, batch_size=50,
              shuffle=False, verbose=1)

    model.save('deeplearn_sentiment_model.h5')

    # Save Tokenizer i.e. Vocabulary
    with open('reviews_tokenizer.pkl', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('abbreviations.mapper', 'r') as file:
    content = file.read()
abbreviations_map = literal_eval(content)

paragraph_separator = '\n\n'
sentence_separator = ' '
token_separator = ' '
unnecessary_identifier_regex = '[0-9\[\]%/,()–\'<>^~`@|#$+:;’]'
unnecessary_space = ' '
unnecessary_unresolved_pron = '-PRON-'
unnecessary_apostrophe = ' \''
unnecessary_space_period = ' \.'
period_regex = '\.'
valid_eos_token = '[!?]'

# Time taking step
expander = Contractions(api_key='glove-wiki-gigaword-50')
assert list(expander.expand_texts(['loader_demo_text']))[0] == 'loader_demo_text'

# Time taking step
spacy_tool = spacy.load('en_md')
neuralcoref.add_to_pipe(spacy_tool)

logging.basicConfig(filename='summarizer.log',
                    filemode='w',
                    format='%(name)s - %(levelname)s - %(message)s',
                    level=logging.DEBUG)
# Takes about ~40 seconds to start-up
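# Assumption: the regexes defined above are applied later in the pipeline roughly
# like this (the sample sentence is illustrative only).
import re

sample = "Fig. 3 [12] shows a 45% increase"
cleaned = re.sub(unnecessary_identifier_regex, '', sample)
cleaned = re.sub(unnecessary_space + '+', token_separator, cleaned).strip()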
class Cleaner:
    def __init__(self,
                 embedding_for_smart_contraction="GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin",
                 spell_dictonarypath="frequency_dictionary_en_82_765.txt"):
        self.embedding_for_smart_contraction = embedding_for_smart_contraction
        self.spell_dictonarypath = spell_dictonarypath
        self.initialized = False

    def initialize(self):
        print("Initializing Text Cleaner..")

        print("Initializing Smart Contractions Module..")
        self.cont = Contractions(self.embedding_for_smart_contraction)
        self.cont.load_models()

        print("Initializing Stopwords Module..")
        self.stop_words = set(stopwords.words('english'))
        stop_words_without_negation = copy.deepcopy(self.stop_words)
        stop_words_without_negation.remove('no')
        stop_words_without_negation.remove('nor')
        stop_words_without_negation.remove('not')
        self.stop_words_without_negation = stop_words_without_negation
        self.pos_tags_set_1 = {'NNP'}

        print("Initializing Wordnet Lemmatizer Module..")
        self.wnl = WordNetLemmatizer()

        print("Initializing Spellcheck Module..")
        max_edit_distance_dictionary = 2
        prefix_length = 7
        self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        dictionary_path = os.path.abspath('') + "\\" + self.spell_dictonarypath
        self.sym_spell.load_dictionary(dictionary_path, 0, 1)

        print("Initialization complete!")

    def expand_contractions(self, text):
        try:
            text = list(self.cont.expand_texts([text], precise=False))[0]
        except Exception as e:
            return text
        return text

    def apostrophe_correction(self, text):
        text = re.sub("’", "'", text)
        return text

    def try_decode(self, text):
        try:
            text = unidecode.unidecode(codecs.decode(text, 'unicode_escape'))
        except:
            text = unidecode.unidecode(text)
        return text

    def tokenize_and_keep_only_words(self, text):
        text = re.findall(r"[a-zA-Z]+", text.lower())
        return text

    def remove_stop_words(self, text):
        text = [word for word in text
                if (word not in self.stop_words_without_negation and len(word) > 2)]
        return text

    def lemmatize(self, text):
        text = [self.wnl.lemmatize(word) for word in text]
        return text

    def spell_check(self, text, max_edit_distance_lookup=2):
        # tokenize each word
        text = word_tokenize(text)
        # apply pos to each word
        text = pos_tag(text)
        correct_text = []
        # for each word in the sentence
        for word in text:
            # if word is not a noun
            if word[1] not in self.pos_tags_set_1:
                # check if we can correct it, then correct it
                suggestions = self.sym_spell.lookup(word[0], Verbosity.CLOSEST,
                                                    max_edit_distance_lookup)
                for suggestion in suggestions:
                    # take the first correction
                    correct_text.append(suggestion.term)
                    break
            else:
                correct_text.append(word[0])
        text = ' '.join([word for word in correct_text])
        return text

    def full_clean(self, text, debug=False):
        if not self.initialized:
            self.initialize()
            self.initialized = True
        if debug:
            print("pre-clean: ", text)
        text = self.try_decode(text)
        text = self.apostrophe_correction(text)
        text = self.spell_check(text)
        text = self.expand_contractions(text)
        text = self.tokenize_and_keep_only_words(text)
        text = self.remove_stop_words(text)
        text = self.lemmatize(text)
        text = ' '.join(text)
        if debug:
            print("post-clean: ", text)
        return text
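# Minimal usage sketch for the Cleaner above. It assumes the GoogleNews word2vec
# binary and the SymSpell frequency dictionary exist at the default paths; the
# sample sentence is illustrative only.
cleaner = Cleaner()
print(cleaner.full_clean("I cann't beleive it’s already Friday!", debug=True))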
def transform(self, x):
    if self.verbose > 0:
        print(colored("Called Description Transformer Transform",
                      color="blue", attrs=['bold', 'underline']))
        print("Processing description text")

    # Copy the data and find the name of the description column
    self.data = x.copy()
    self.column_name = self.data.columns.values[0]

    # Load spaCy language processor
    nlp = spacy.load("en_core_web_sm")

    # Load pre-trained word embedding if using contractions
    contraction = Contractions(api_key="glove-twitter-25") if self.contractions else None

    # Process text by iterating over each sample's index and description
    for idx, sample in zip(self.data.index.values, self.data.values):
        # Change accented characters, e.g à -> a
        sample = self.remove_accents(str(sample))
        if contraction:
            # Expand contractions, e.g "hasn't" -> "has not"
            sample = list(contraction.expand_texts([sample], precise=True))
            sample = ''.join(sample)

        # Input sample text into spaCy language processor
        doc = nlp(sample)

        # Split sample text into sentences
        sentences = list(doc.sents)

        for word_idx in range(len(sentences)):
            # Remove punctuation tokens, e.g. ! , .
            sentences[word_idx] = [token for token in sentences[word_idx] if not token.is_punct]

            # Remove stop words
            if self.stop_words:
                sentences[word_idx] = [token for token in sentences[word_idx]
                                       if token.text.lower() not in self.stop_words]

            # Apply lemmatization
            if self.transformation[0].lower() == "l":
                # Resolve words to their dictionary form using PoS tags
                sentences[word_idx] = [token.lemma_.lower() for token in sentences[word_idx]]

            # Apply stemming (only if lemmatization not applied)
            elif self.transformation[0].lower() == "s":
                # Stem tokens
                for char_idx in range(len(sentences[word_idx])):
                    # Apply stemmer to each word
                    stemmed = self.stemmer_algorithm.stem(sentences[word_idx][char_idx].text)
                    # Convert back to type Token and update word in sentence
                    sentences[word_idx][char_idx] = nlp(stemmed)[0]

            # Remove remaining punctuation within tokens, e.g. "(years)" -> "years", not including -
            sentences[word_idx] = [
                token.translate(str.maketrans('', '', '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'))
                for token in sentences[word_idx]
            ]

        # Split words containing dash or spaces caused by lemmatization, e.g. "16-year" -> "16" + "year"
        for k in range(len(sentences)):
            new_sentence = []
            for token in sentences[k]:
                split_token = re.split(' |-', token)
                for word in split_token:
                    # Check word not empty
                    if word:
                        new_sentence.append(word)
            # Replace words in sentence
            sentences[k] = new_sentence

        # Remove empty lists from list of sentences
        sentences = [sent for sent in sentences if sent != []]

        # Then join the sentences and update the descriptions dataframe
        word_list = [word for sent in sentences for word in sent]
        self.data.loc[idx, self.column_name] = ' '.join([str(elem) for elem in word_list])

    # if self.verbose > 1:
    #     display(self.data)

    if self.verbose > 0:
        print(colored("Finished processing all descriptions\n",
                      color="blue", attrs=['bold', 'underline']))

    return self.data
class TextProcessing:
    """
    Class to clean text
    """

    def __init__(self, nlp=spacy.load("en_core_web_sm")):
        self.nlp = nlp
        contextualSpellCheck.add_to_pipe(self.nlp)
        model = api.load(cfg['embeddings']['embedding_file'])
        self.cont = Contractions(kv_model=model)
        self.cont.load_models()
        dirname = os.path.dirname(__file__)
        with open(os.path.join(dirname, 'acronym.json')) as f:
            self.acronyms = json.load(f)

    def process_text(self, text):
        """
        Processes text as follows:
        1. decode to unicode
        2. remove extra repeated special characters
        3. put space around the special characters
        4. remove extra whitespaces
        5. replace acronyms
        6. expand contractions of english words like ain't
        7. correct spelling mistakes
        8. replace NE in the text
        9. lower case the string

        Args:
            text: text to be processed
        """
        text = self.unidecode(text)
        text = self.remove_repeated_chars(text)
        text = self.put_space_around_special_chars(text)
        text = self.remove_extra_whitespaces(text)
        text = self.replace_acronyms(text)
        text = self.expand_contractions(text)
        text = self.correct_spellings(text)
        text = self.replace_named_entity(text)
        text = self.lower_case(text)
        return text

    def remove_repeated_chars(self, text):
        """
        Removes repeated instances of consecutive special chars

        Args:
            text: text to be processed
        """
        text = re.sub(r'([!@#$%^&*,./?\'";:\\])\1+', r'\1', text)
        return text

    def put_space_around_special_chars(self, text):
        """
        Puts space around special chars like '[({$&*#@!'

        Args:
            text: text to be processed
        """
        chars = [
            '$', '?', '%', '@', '!', '#', '^', '*', '&', '"', ':', ';', '/',
            '\\', ',', '+', '(', ')', '[', ']', '{', '}', '<', '>'
        ]
        for char in chars:
            text = text.replace(char, ' ' + char + ' ')
        return text

    def remove_extra_whitespaces(self, text):
        """
        Removes extra whitespaces from the text

        Args:
            text: text to be processed
        """
        return text.strip()

    def unidecode(self, text):
        """
        unidecodes the text

        Args:
            text: text to be processed
        """
        return unidecode.unidecode(text.lower())

    def lower_case(self, text):
        """
        lower cases the text

        Args:
            text: text to be processed
        """
        return text.lower()

    def expand_contractions(self, text):
        """
        expands contractions for example, "ain't" expands to "am not"

        Args:
            text: text to be processed
        """
        return list(self.cont.expand_texts([text.lower()], precise=True))[0]

    def correct_spellings(self, text):
        """
        corrects spellings from text

        Args:
            text: text to be processed
        """
        doc = self.nlp(text)
        if doc._.performed_spellCheck:
            text = doc._.outcome_spellCheck
        return text

    def replace_acronyms(self, text):
        """
        Replaces acronyms found in English
        For example: ttyl -> talk to you later

        Args:
            text: text to be processed
        """
        for acronym, expansion in self.acronyms.items():
            text = text.replace(' ' + acronym.lower() + ' ',
                                ' ' + expansion.lower() + ' ')
        return text

    def replace_named_entity(self, text):
        """
        Replaces named entity in the text
        For example: $5bn loss estimated in the coming year ->
        MONEY loss estimated in the coming year

        Args:
            text: text to be processed
        """
        doc = list(
            self.nlp.pipe([text],
                          disable=["tagger", "parser", "contextual spellchecker"]))[0]
        for ent in doc.ents:
            text = text.replace(ent.text, ent.label_)
        return text

    def token_list(self, text):
        doc = self.nlp(text)
        tokens = []
        for token in doc:
            tokens += [token.text]
        return tokens
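# Minimal usage sketch for TextProcessing. It assumes cfg['embeddings']['embedding_file']
# names a gensim keyed-vectors model and that acronym.json sits next to the module;
# the sample strings are illustrative only.
tp = TextProcessing()
print(tp.process_text("idk what he's talking about!!"))
print(tp.token_list("New York is expensive"))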
# %%

# %%
# nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def stemming(sentence):
    stemmer = PorterStemmer()
    sentence = sentence.split()
    sentence = ' '.join(stemmer.stem(word) for word in sentence)  # if word not in stop_words
    return sentence


# %%
cont = Contractions(api_key="glove-twitter-100")

# %%
data['question1'] = list(cont.expand_texts(data['question1']))
data['question2'] = list(cont.expand_texts(data['question2']))

data['question1'] = data['question1'].fillna('').apply(lambda x: BeautifulSoup(x, "lxml").text)
data['question2'] = data['question2'].fillna('').apply(lambda x: BeautifulSoup(x, "lxml").text)

data['question1'] = data['question1'].fillna('').apply(punctutions)
data['question2'] = data['question2'].fillna('').apply(punctutions)

data['question1'] = data['question1'].fillna('').apply(stemming)
data['question2'] = data['question2'].fillna('').apply(stemming)

# %%
data['fuzz_ratio'] = data.apply(lambda x: fuzz.ratio(x['question1'], x['question2']), axis=1)
data['fuzz_partial_ratio'] = data.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['token_sort_ratio'] = data.apply(lambda x: fuzz.token_sort_ratio(x['question1'], x['question2']), axis=1)
data['token_set_ratio'] = data.apply(lambda x: fuzz.token_set_ratio(x['question1'], x['question2']), axis=1)

# %%
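# %%
# The punctutions helper applied above is defined elsewhere in the notebook
# (the misspelled name is kept as-is). A plausible sketch, assumed rather than
# taken from the original, would strip punctuation from each question:
import string

def punctutions(text):
    # Remove all punctuation characters from the question text
    return text.translate(str.maketrans('', '', string.punctuation))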
class Cleaner:
    def __init__(
        self,
        expand_contractions=True,
        strip_text_in_brackets=False,
        combine_concatenations=False,
        w2v_path=None,
        api_key="word2vec-google-news-300",
    ):
        self.opt_expand_contractions = expand_contractions
        self.opt_strip_text_in_brackets = strip_text_in_brackets
        self.opt_combine_concatenations = combine_concatenations

        if expand_contractions:
            print("Loading contractions dataset (this will take a while the first time)")
            # Load your favorite word2vec model
            self.cont = Contractions(w2v_path=w2v_path, api_key=api_key)
            print("Contractions dataset downloaded")

            print("Training contractions model (this will take a while)")
            # prevents loading on first expand_texts call
            self.cont.load_models()
            print("Contraction model successfully trained")

    def expand_contractions(self, text):
        text = text.replace("’", "'")  # need to put in the correct apostrophe
        expanded_text = list(self.cont.expand_texts([text], precise=True))
        return expanded_text[0]

    def strip_brackets(self, text):
        # Remove strings in brackets
        # E.g. "This is a sentence (extra info) description."
        # becomes "This is a sentence description."
        """
        Remove brackets from text
        Matches (), [], {}

        Converts:
        'hello (there) you (my[best] friend) lets {dine } }'
        -> 'hello you lets }'
        """
        brace_open_type = ""
        brace_pair = {'(': ')', '[': ']', '{': '}'}
        open_brace_list = list(brace_pair.keys())
        res = ""
        for c in text:
            if len(brace_open_type) == 0:
                # not opened
                if c in open_brace_list:
                    brace_open_type = c
                else:
                    res += c
            else:
                # opened
                if brace_pair[brace_open_type] == c:
                    brace_open_type = ""
        return res

    def combine_concatenations(self, sentence):
        """
        Receives string sentence "This is a sentence"
        """
        # convert concatenated words into separate words
        # georgetown-louisville becomes georgetown louisville
        # Pd matches all types of dashes
        # https://www.compart.com/en/unicode/category/Pd
        if self.opt_combine_concatenations:
            def _refu(sent):
                return regex.sub(r'\p{Pd}+', '', sent)
        else:
            def _refu(sent):
                return regex.sub(r'\p{Pd}+', ' ', sent)
        return _refu(sentence)

    def remove_non_english(self, tokens):
        """
        Removes non-english words and all punctuation and numbers
        Removes extra white space

        Receives list of tokens comprising a single sentence:
        ['this', 'is', 'a', 'sentence']
        """
        # remove all punctuation (removes non-english words too)
        # stripped = re.sub('[^a-zA-Z\s]*', '', stripped)
        # removes extra white spaces
        # stripped = re.sub('[ ]{2,}',' ', stripped)
        cleaned_tokens = []
        for token in tokens:
            cleaned = re.sub('[ ]{2,}', ' ', re.sub('[^a-zA-Z\s]*', '', token)).strip()
            if len(cleaned) != 0:
                cleaned_tokens.append(cleaned)
        return cleaned_tokens

    def lemmatize_sentences(self, tokenized_sentences):
        """
        Receives
        Args:
            tokenized_sentences is of form
            [['this', 'is', 'sentence'],
             ['this', 'is', 'another'],
             ['this', 'is', 'another']]
        Returns:
            lemmatized 2d list of same form
            [['this', 'is', 'sentenc'],
             ['this', 'is', 'anoth'],
             ['this', 'is', 'anoth']]
        """
        lemmatized_sentences = []
        for sentence in tokenized_sentences:
            lemmatized_sentences.append(lemmatize(sentence))
        # lemmatized_sentences = [lemmatize(sentence) for sentence in tokenized_sentences]
        return lemmatized_sentences

    def clean(self, text):
        if self.opt_expand_contractions:
            # Expands it's -> it is
            text = self.expand_contractions(text)

        # text is lowercased after contractions are expanded:
        # the contractions will be capitalized after they are expanded,
        # e.g. (i'm -> [I, am]), therefore the lowercasing is done afterwards
        text = text.lower()

        if self.opt_strip_text_in_brackets:
            text = self.strip_brackets(text)

        sentences = sent_tokenize(text)
        sentences = [self.combine_concatenations(sentence) for sentence in sentences]
        tokens_per_sentence = [word_tokenize(sent) for sent in sentences]
        lemmatized_tokens_per_sent = self.lemmatize_sentences(tokens_per_sentence)
        cleaned_tokens_per_sent = [
            self.remove_non_english(sent) for sent in lemmatized_tokens_per_sent
        ]

        return cleaned_tokens_per_sent
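# Usage sketch for this Cleaner: clean() returns one list of cleaned tokens per
# sentence. The sample text is illustrative, and the default api_key triggers a
# large word2vec download on first use.
cleaner = Cleaner(expand_contractions=True, strip_text_in_brackets=True)
tokens_per_sentence = cleaner.clean("I'm sure (mostly) that we'll arrive on-time.")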
class NLP():
    nlp = None
    doc = None
    model = None

    def __init__(self, spacy_model='en_core_web_sm', gensim_model='glove-twitter-25'):
        self.nlp = spacy.load(spacy_model)
        self.model = api.load(gensim_model)
        self.cont = Contractions(kv_model=self.model)

    def remove_html(self, text):
        """Strip HTML tags from text"""
        soup = BeautifulSoup(text, 'html.parser')
        return soup.get_text(separator=" ")

    def remove_accents(self, text):
        """Remove accented characters from text for non-english words"""
        return unidecode.unidecode(text)

    def expand_contractions(self, text):
        """Convert contractions into whole words. e.g. can't -> can not"""
        return list(self.cont.expand_texts([text], precise=True))[0]

    def preprocess(self, text, remove_numbers=False, remove_stopwords=False,
                   excluded_sw=None, toke=False):
        """Preprocess using standard protocols.
        @param remove_numbers converts words to digits and removes
        @param remove_stopwords removes stop words
        @param excluded_sw is any stopwords to exclude
        @param toke if true, return tokens, default return text
        """
        text = self.remove_html(text)
        text = self.remove_accents(text)
        text = self.expand_contractions(text)
        if toke or remove_numbers or remove_stopwords:
            if excluded_sw is not None:
                for w in excluded_sw:
                    self.nlp.vocab[w].is_stop = False
            doc = self.nlp(text)
            tokens = []
            for token in doc:
                if token.pos_ == 'NUM' and not remove_numbers:
                    tokens.append(w2n.word_to_num(token.text))
                elif not token.is_stop:
                    tokens.append(token.text)
            if toke:
                return tokens
            text = " ".join(tokens)
        return text

    def lemmatize(self, tokens, toke=False):
        lookups = Lookups()
        lookups.add_table('lemma_index', lemma_index)
        lookups.add_table('lemma_exc', lemma_exc)
        lookups.add_table('lemma_rules', lemma_rules)
        lemmatizer = Lemmatizer(lookups)
        lemmas = []
        for t in tokens:
            lemmas.append(lemmatizer(t.text, t.tag_))
        if toke:
            return lemmas
        return " ".join(lemmas)

    def get_syllables(self, word):
        count = 0
        vowels = ("a", "e", "i", "o", "u", "y")
        prev = False
        for c in word:
            vowel = c in vowels
            if vowel and not prev:
                count += 1
            prev = vowel
        return count

    def get_lexical_density(self, tokens):
        c_words = t_words = 0
        cont_pos = ['PROPN', 'NOUN', 'VERB', 'ADJ', 'ADV']
        for t in tokens:
            if t.pos_ in cont_pos:
                c_words += 1
                t_words += 1
            elif t.pos_ != 'PUNCT':
                t_words += 1
        return round((c_words / t_words), 4)

    def get_coherence(self, text):
        doc = self.nlp(text)
        sentences = [sent for sent in doc.sents if len(sent) >= 2]
        frequency = defaultdict(int)
        token_sents = []
        for s in sentences:
            tmp = []
            for t in self.preprocess(s.text, remove_stopwords=True,
                                     excluded_sw=['no', 'not'], toke=True):
                tmp.append(t)
                frequency[t] += 1
            token_sents.append(tmp)
        vocab = [[word for word in sent if frequency[word] > 1] for sent in token_sents]
        dictionary = corpora.Dictionary(vocab)
        corpus = [dictionary.doc2bow(word) for word in vocab]
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=20)
        corpus_lsi = lsi[corpus_tfidf]

        sums = {}
        topic_count = max([len(line) for line in corpus_lsi])
        for line in corpus_lsi:
            for topic in line:
                t_num = topic[0]
                if t_num not in sums:
                    sums[t_num] = abs(topic[1])
                else:
                    sums[t_num] += abs(topic[1])
        best_topic = max(zip(sums.values(), sums.keys()))[1]

        ordered = []
        i = 0
        for line in corpus_lsi:
            ordered.append((i, line[best_topic][1]))
            i += 1
        ordered = sorted(ordered, key=lambda x: x[1], reverse=True)
        threshold = ordered[0][1] - (0.90 * (ordered[0][1] - ordered[-1][1]))
        problem_sentences = [s for s in ordered if s[1] < threshold]

        output = {}
        for p in problem_sentences:
            output[p[0]] = (p[1], str(sentences[p[0]]))
        return output

    def get_readability(self, text):
        scores = {}
        doc = self.nlp(text)
        sentence_count = len(list(doc.sents))
        words = self.preprocess(text, toke=True)
        characters = 0
        for word in words:
            characters += len(word)
        word_count = len(words)
        syllable_count = 0
        complex_words = 0
        for word in words:
            c = self.get_syllables(word)
            syllable_count += c
            if c >= 3 and not word[0].isupper():
                complex_words += 1
        avgwps = word_count / sentence_count

        # Automated Readability Index
        ari = 0.0
        ari_grade = 0
        if word_count > 0:
            ari = 4.71 * (characters / word_count) + 0.5 * \
                (word_count / sentence_count) - 21.43
        if ari < 2:
            ari_grade = 0
        elif ari > 12:
            ari_grade = 13
        else:
            ari_grade = ari
        scores["ari"] = (ari, ari_grade)

        # Flesch Reading Ease
        flesch_reading_ease = 101
        fre_grade = 0
        if word_count > 0 and sentence_count > 0:
            flesch_reading_ease = 206.835 - \
                1.015 * (word_count / sentence_count) - \
                84.6 * (syllable_count / word_count)
        if flesch_reading_ease > 100:
            fre_grade = 4
        elif flesch_reading_ease > 90.0:
            fre_grade = 5
        elif flesch_reading_ease > 80.0:
            fre_grade = 6
        elif flesch_reading_ease > 70.0:
            fre_grade = 7
        elif flesch_reading_ease > 60.0:
            fre_grade = 9
        elif flesch_reading_ease > 50:
            fre_grade = 12
        else:
            fre_grade = 13
        scores["flesch_reading_ease"] = (flesch_reading_ease, fre_grade)

        # Flesch-Kincaid Grade Level
        fkg = 0.0
        if word_count > 0 and sentence_count > 0:
            fkg = 0.39 * (word_count / sentence_count) + \
                11.8 * (syllable_count / word_count) - 15.59
        scores["flesch_kinkaid_grade_level"] = (fkg, int(fkg))

        # Gunning Fog Index
        gfi = 0.0
        gfi_grade = 0
        if sentence_count > 0 and word_count > 0:
            gfi = 0.4 * ((word_count / sentence_count) +
                         100 * (complex_words / word_count))
        if gfi < 6:
            gfi_grade = 5
        elif gfi <= 12:
            gfi_grade = int(gfi)
        else:
            gfi_grade = 13
        scores["gunning_fog_index"] = (gfi, gfi_grade)

        # SMOG Readability
        smog = 0.0
        smog_grade = 0
        if sentence_count > 0:
            smog = 1.0430 * math.sqrt(complex_words * (30 / sentence_count)) + 3.1291
        if smog >= 13:
            smog_grade = 13
        else:
            smog_grade = int(smog)
        scores["smog_readability"] = (smog, smog_grade)

        # ColemanLiauIndex
        coleman = 0.0
        coleman_grade = 0
        if word_count > 0:
            coleman = (5.89 * (characters / word_count)) - \
                (30 * (sentence_count / word_count)) - 15.8
        if coleman >= 13:
            coleman_grade = 13
        else:
            coleman_grade = int(coleman)
        scores["coleman_liau"] = (coleman, coleman_grade)

        # LIX & RIX
        lix = 0.0
        rix = 0.0
        lix_grade = 0
        rix_grade = 0
        if sentence_count > 0 and word_count > 0:
            long_words = 0
            for word in words:
                if len(word) >= 7:
                    long_words += 1
            lix = word_count / sentence_count + ((100. * long_words) / word_count)
            rix = long_words / sentence_count
        if lix >= 13:
            lix_grade = 13
        else:
            lix_grade = int(lix)
        if rix >= 13:
            rix_grade = 13
        else:
            rix_grade = int(rix)
        scores["LIX"] = (lix, lix_grade)
        scores["RIX"] = (rix, rix_grade)

        count = 0
        avg = 0.0
        for k, v in scores.items():
            avg += v[1]
            count += 1
        scores["AVERAGE_GRADE"] = (avg / count, int(avg / count))
        return scores
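# Minimal usage sketch for the NLP helper class. The spaCy and glove-twitter-25
# models download on first run; the sample texts are illustrative only.
nlp_helper = NLP()
clean = nlp_helper.preprocess("<p>He's got 2 dogs &amp; a cat.</p>", remove_numbers=True)
readability = nlp_helper.get_readability("This is a short sample text. It has two sentences.")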
class TextCleaner:
    word_re = re.compile('[a-zA-Z]+')
    number_re = re.compile('[0-9]+$')
    spell_checker = SpellChecker()
    lemmatizer = WordNetLemmatizer()
    all_words = set(words.words())

    def __init__(self, save_path, word2vec_model_path, previously_processed=[]):
        self.contractions = Contractions(word2vec_model_path)
        self.previously_processed = previously_processed
        self.save_path = save_path

    def _get_all_comments(self, subreddit):
        comments = []
        for submission in subreddit["submissions"]:
            for comment in submission["comments"]:
                comments.extend(sent_tokenize(comment["body"]))
        return comments

    def _remove_urls(self, text):
        url_pattern = r'(((https?|ftp)://)?(([a-zA-Z])+\.)?([a-zA-Z])+\.([a-zA-Z])+/?.*)|http'
        new_sentences = []
        for word in text.split():
            if re.compile(url_pattern).search(word):
                new_sentences.append(re.sub(url_pattern, "__isurl__", word))
            else:
                new_sentences.append(word)
        return " ".join(new_sentences)

    def _invalid_characters(self, string):
        string = re.sub("(\s|-|_|\.\.\.)+", " ", string)
        return re.sub("!|#|&|\(|\)|–|\[|{|}|\]|:|;|\?|\*", "", string)

    def _expand_sentences(self, texts):
        return list(
            self.contractions.expand_texts(
                [x.replace("’", "'") for x in texts], precise=True))

    def _replace(self, sentence, is_spell_check=True):
        words = []
        for word in word_tokenize(sentence):
            word = word.strip()
            if "/" in word or "\\" in word:
                words.append("__isslashinword__")
            elif self.word_re.match(word):
                if is_spell_check and word not in self.all_words:
                    words.append(self.spell_checker.correction(word))
                else:
                    words.append(word)
            elif self.number_re.match(word):
                words.append("__isnumber__")
            elif "__isurl__" in word:
                words.append("__isurl__")
            else:
                words.append("__isinvalidword__")
        return words

    def _words_and_tags(self, words):
        lemmas = []
        pos_tags = []
        for word, pos_tag in nltk.pos_tag(words):
            pos_tags.append(pos_tag)
            if self._get_wordnet_pos(pos_tag):
                lemmas.append(
                    self.lemmatizer.lemmatize(word, pos=self._get_wordnet_pos(pos_tag)))
            else:
                lemmas.append(self.lemmatizer.lemmatize(word))
        return (" ".join(lemmas), pos_tags)

    ## there are others but this is sufficient, e.g. one more wordnet pos tag
    ## (adjective satellite) and many more nltk pos tags
    def _get_wordnet_pos(self, treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return ''

    def process_subreddits(self, subreddits, save=True, check_previous=True):
        for subreddit in subreddits:
            print(subreddit["display_name"])
            pathlib.Path(self.save_path).mkdir(exist_ok=True)

            all_raw_comments = self._get_all_comments(subreddit)
            raw_comments = all_raw_comments

            comment_no_urls = []
            comment_removed_chars = []
            comment_expandeds = []
            comment_replaced_spell_corrections = []
            comment_processed_spell_corrections = []
            pos_tag_sent_spell_corrections = []
            comment_replaced_no_spell_corrections = []
            comment_processed_no_spell_corrections = []
            pos_tag_no_sent_spell_corrections = []

            count = 0
            total = len(raw_comments)
            for comment in raw_comments:
                print(comment)
                comment_no_url = self._remove_urls(comment)
                comment_removed_char = self._invalid_characters(comment_no_url)
                comment_expanded = self._expand_sentences([comment_removed_char])[0]

                comment_replaced_spell_correction = self._replace(
                    comment_expanded.lower(), is_spell_check=True)
                comment_processed_spell_correction, pos_tag_sent_spell_correction = self._words_and_tags(
                    comment_replaced_spell_correction)

                comment_replaced_no_spell_correction = self._replace(
                    comment_expanded.lower(), is_spell_check=False)
                comment_processed_no_spell_correction, pos_tag_no_sent_spell_correction = self._words_and_tags(
                    comment_replaced_no_spell_correction)

                count += 1
                print("count:", count, "total:", total, subreddit["display_name"])

                # Appending
                comment_no_urls.append(comment_no_url)
                comment_removed_chars.append(comment_removed_char)
                comment_expandeds.append(comment_expanded)
                comment_replaced_spell_corrections.append(comment_replaced_spell_correction)
                comment_processed_spell_corrections.append(comment_processed_spell_correction)
                pos_tag_sent_spell_corrections.append(pos_tag_sent_spell_correction)
                comment_replaced_no_spell_corrections.append(comment_replaced_no_spell_correction)
                comment_processed_no_spell_corrections.append(comment_processed_no_spell_correction)
                pos_tag_no_sent_spell_corrections.append(pos_tag_no_sent_spell_correction)

            data = {
                "raw": raw_comments,
                "comment_no_urls": comment_no_urls,
                "comment_removed_chars": comment_removed_chars,
                "comment_expandeds": comment_expandeds,
                "comment_replaced_spell_corrections": comment_replaced_spell_corrections,
                "comment_processed_spell_corrections": comment_processed_spell_corrections,
                "pos_tag_sent_spell_corrections": pos_tag_sent_spell_corrections,
                "comment_replaced_no_spell_corrections": comment_replaced_no_spell_corrections,
                "comment_processed_no_spell_corrections": comment_processed_no_spell_corrections,
                "pos_tag_no_sent_spell_corrections": pos_tag_no_sent_spell_corrections,
            }

            if save:
                subreddit_path = self.save_path + "TEST" + subreddit["display_name"] + ".json"
                with open(subreddit_path, 'w') as fp:
                    json.dump(data, fp)
            else:
                return data
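# Usage sketch for TextCleaner. The word2vec path and the subreddit dictionary
# structure shown here are assumptions based on how the class accesses them; the
# sample comment is illustrative only.
cleaner = TextCleaner(save_path="processed/",
                      word2vec_model_path="GoogleNews-vectors-negative300.bin")
subreddits = [{"display_name": "learnpython",
               "submissions": [{"comments": [{"body": "I dont know, it's fine."}]}]}]
data = cleaner.process_subreddits(subreddits, save=False)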