def contraction_removal():
    from pycontractions import Contractions
    import gensim.downloader as api

    # model = api.load("glove-twitter-25")
    # model = api.load("glove-twitter-100")
    model = api.load("word2vec-google-news-300")
    cont = Contractions(kv_model=model)
    cont.load_models()
class ContractionsExpander(TransformerMixin):
    def __init__(self, kv_model=None, api_key='glove-twitter-100', precise=False):
        self.kv_model = kv_model
        self.api_key = api_key
        self.precise = precise
        if api_key:
            self.contractions = Contractions(api_key=api_key)
        else:
            self.contractions = Contractions(kv_model=kv_model)
        self.contractions.load_models()

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return self.contractions.expand_texts(X)
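The transformer above drops straight into a scikit-learn pipeline. A minimal usage sketch, assuming scikit-learn is installed, the ContractionsExpander class above is in scope, and the glove-twitter-100 vectors can be fetched by pycontractions; the sample documents are illustrative.

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Hypothetical pipeline: expand contractions, then vectorize.
# expand_texts returns a generator, which TfidfVectorizer consumes in one pass.
pipe = Pipeline([
    ("expand", ContractionsExpander(api_key="glove-twitter-100")),
    ("tfidf", TfidfVectorizer()),
])

docs = ["I can't go", "They've already left."]
X = pipe.fit_transform(docs)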
def preprocessing_text(df, series):
    # removing patterns (usernames & links to websites)
    pattern_list = ['@[\w]*', r'http\S+']
    remove_patterns(df, series, pattern_list)

    # remove hashtag from clean text
    df[series] = remove_hashtags(df[series])

    # de-emojizing
    df[series] = df[series].apply(lambda x: emoji.demojize(x))

    # remove characters repeated more than 2 times
    df[series] = df[series].apply(lambda x: ReplaceThreeOrMore(x))

    # remove html characters
    char_list = ['&', '\n', 'á', '<', '>']
    remove_chars(df, series, char_list)

    # handle contractions
    cont = Contractions(api_key="glove-twitter-100")
    df[series] = df[series].apply(lambda x: list(cont.expand_texts([x])))
    df[series] = df[series].apply(lambda x: str(x))

    # removing numbers
    df[series] = df[series].apply(
        lambda x: ''.join([i for i in x if not i.isdigit()]))

    # remove punctuation
    df[series] = df[series].str.replace('[^\w\s]', ' ')

    # set to lowercase
    df[series] = df[series].apply(
        lambda x: " ".join(x.lower() for x in x.split()))

    # lemmatization
    df[series] = df[series].apply(lambda x: str(x))
    df[series] = df[series].apply(
        lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

    return df
def contractions_fun(self):
    """
    Replace contractions in the document. With the 'mapping' method, each word
    is looked up in a contraction dictionary and, if present, replaced with its
    value from the dictionary; with the 'word2vec' and 'glove' methods,
    pycontractions expands contractions using a word-embedding model.
    """
    if self.contraction_method == 'mapping':
        self.doc = self.mapping_decontraction(str(self.doc))
    elif self.contraction_method == 'word2vec':
        model = pretrained_model
        cont = Contractions(model)
        cont.load_models()
        self.doc = list(cont.expand_texts([str(self.doc)], precise=True))[0]
    elif self.contraction_method == 'glove':
        model = api.load("glove-twitter-25")
        cont = Contractions(kv_model=model)
        cont.load_models()
        self.doc = list(cont.expand_texts([str(self.doc)], precise=True))[0]
class ContractionExpander(TextProcessingBaseClass):
    '''
    Removes contractions from the text and uses the full version instead (unification).

    Example: I'll walk down the road --> I will walk down the road
    '''

    model_contraction_expander = None

    def __init__(self, model=None):
        '''
        :param model: Pretrained word embedding model.
        '''
        super().__init__()
        if model is None:
            # If no model is given, use the default one and store it as a static
            # class variable to avoid multiple loadings
            if ContractionExpander.model_contraction_expander is None:
                model_path = os.path.join(NLP_MODELS_PATH, 'pretrained', 'word_embeddings',
                                          'pubmed2018_w2v_400D', 'pubmed2018_w2v_400D.bin')
                ContractionExpander.model_contraction_expander = \
                    gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)
            model = ContractionExpander.model_contraction_expander
        self.cont = Contractions(kv_model=model)

    def level(self) -> str:
        return "text"

    def _process_internal(self, text: str) -> str:
        '''
        :param text: Input string.
        :return: The string without contractions.
        '''
        return list(self.cont.expand_texts([text], precise=True))[0]
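A usage sketch for the class above, assuming a small gensim KeyedVectors model downloaded via gensim.downloader instead of the project's pubmed2018 default, and calling the internal _process_internal method directly for illustration (the public entry point presumably comes from TextProcessingBaseClass).

import gensim.downloader as api

glove = api.load("glove-twitter-25")         # any KeyedVectors model should work here
expander = ContractionExpander(model=glove)  # skips the pubmed2018 default download
print(expander._process_internal("I'll walk down the road"))
# expected output along the lines of: "I will walk down the road"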
class Cleaner:
    def __init__(self,
                 embedding_for_smart_contraction="GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin",
                 spell_dictonarypath="frequency_dictionary_en_82_765.txt"):
        self.embedding_for_smart_contraction = embedding_for_smart_contraction
        self.spell_dictonarypath = spell_dictonarypath
        self.initialized = False

    def initialize(self):
        print("Initializing Text Cleaner..")

        print("Initializing Smart Contractions Module..")
        self.cont = Contractions(self.embedding_for_smart_contraction)
        self.cont.load_models()

        print("Initializing Stopwords Module..")
        self.stop_words = set(stopwords.words('english'))
        stop_words_without_negation = copy.deepcopy(self.stop_words)
        stop_words_without_negation.remove('no')
        stop_words_without_negation.remove('nor')
        stop_words_without_negation.remove('not')
        self.stop_words_without_negation = stop_words_without_negation
        self.pos_tags_set_1 = {'NNP'}

        print("Initializing Wordnet Lemmatizer Module..")
        self.wnl = WordNetLemmatizer()

        print("Initializing Spellcheck Module..")
        max_edit_distance_dictionary = 2
        prefix_length = 7
        self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        dictionary_path = os.path.abspath('') + "\\" + self.spell_dictonarypath
        self.sym_spell.load_dictionary(dictionary_path, 0, 1)
        print("Initialization complete!")

    def expand_contractions(self, text):
        try:
            text = list(self.cont.expand_texts([text], precise=False))[0]
        except Exception:
            return text
        return text

    def apostrophe_correction(self, text):
        text = re.sub("’", "'", text)
        return text

    def try_decode(self, text):
        try:
            text = unidecode.unidecode(codecs.decode(text, 'unicode_escape'))
        except Exception:
            text = unidecode.unidecode(text)
        return text

    def tokenize_and_keep_only_words(self, text):
        text = re.findall(r"[a-zA-Z]+", text.lower())
        return text

    def remove_stop_words(self, text):
        text = [word for word in text
                if word not in self.stop_words_without_negation and len(word) > 2]
        return text

    def lemmatize(self, text):
        text = [self.wnl.lemmatize(word) for word in text]
        return text

    def spell_check(self, text, max_edit_distance_lookup=2):
        # tokenize each word
        text = word_tokenize(text)
        # apply pos to each word
        text = pos_tag(text)
        correct_text = []
        # for each word in the sentence
        for word in text:
            # if word is not a noun
            if word[1] not in self.pos_tags_set_1:
                # check if we can correct it, then correct it
                suggestions = self.sym_spell.lookup(word[0], Verbosity.CLOSEST,
                                                    max_edit_distance_lookup)
                for suggestion in suggestions:
                    # take the first correction
                    correct_text.append(suggestion.term)
                    break
            else:
                correct_text.append(word[0])
        text = ' '.join([word for word in correct_text])
        return text

    def full_clean(self, text, debug=False):
        if not self.initialized:
            self.initialize()
            self.initialized = True
        if debug:
            print("pre-clean: ", text)
        text = self.try_decode(text)
        text = self.apostrophe_correction(text)
        text = self.spell_check(text)
        text = self.expand_contractions(text)
        text = self.tokenize_and_keep_only_words(text)
        text = self.remove_stop_words(text)
        text = self.lemmatize(text)
        text = ' '.join(text)
        if debug:
            print("post-clean: ", text)
        return text
def transform(self, x):
    if self.verbose > 0:
        print(colored("Called Description Transformer Transform",
                      color="blue", attrs=['bold', 'underline']))
        print("Processing description text")

    # Copy the data and find the name of the description column
    self.data = x.copy()
    self.column_name = self.data.columns.values[0]

    # Load spaCy language processor
    nlp = spacy.load("en_core_web_sm")

    # Load pre-trained word embedding if using contractions
    contraction = Contractions(api_key="glove-twitter-25") if self.contractions else None

    # Process text by iterating over each sample's index and description
    for idx, sample in zip(self.data.index.values, self.data.values):
        # Change accented characters, e.g à -> a
        sample = self.remove_accents(str(sample))

        if contraction:
            # Expand contractions, e.g. "hasn't" -> "has not"
            sample = list(contraction.expand_texts([sample], precise=True))
            sample = ''.join(sample)

        # Input sample text into spaCy language processor
        doc = nlp(sample)

        # Split sample text into sentences
        sentences = list(doc.sents)

        for word_idx in range(len(sentences)):
            # Remove punctuation tokens, e.g. ! , .
            sentences[word_idx] = [token for token in sentences[word_idx] if not token.is_punct]

            # Remove stop words
            if self.stop_words:
                sentences[word_idx] = [
                    token for token in sentences[word_idx]
                    if token.text.lower() not in self.stop_words
                ]

            # Apply lemmatization
            if self.transformation[0].lower() == "l":
                # Resolve words to their dictionary form using PoS tags
                sentences[word_idx] = [token.lemma_.lower() for token in sentences[word_idx]]

            # Apply stemming (only if lemmatization not applied)
            elif self.transformation[0].lower() == "s":
                # Stem tokens
                for char_idx in range(len(sentences[word_idx])):
                    # Apply stemmer to each word
                    stemmed = self.stemmer_algorithm.stem(sentences[word_idx][char_idx].text)
                    # Convert back to type Token and update word in sentence
                    sentences[word_idx][char_idx] = nlp(stemmed)[0]

            # Remove remaining punctuation within tokens, e.g. "(years)" -> "years", not including -
            sentences[word_idx] = [
                token.translate(str.maketrans('', '', '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'))
                for token in sentences[word_idx]
            ]

        # Split words containing dash or spaces caused by lemmatization, e.g. "16-year" -> "16" + "year"
        for k in range(len(sentences)):
            new_sentence = []
            for token in sentences[k]:
                split_token = re.split(' |-', token)
                for word in split_token:
                    # Check word not empty
                    if word:
                        new_sentence.append(word)
            # Replace words in sentence
            sentences[k] = new_sentence

        # Remove empty lists from list of sentences
        sentences = [sent for sent in sentences if sent != []]

        # Then join the sentences and update the descriptions dataframe
        word_list = [word for sent in sentences for word in sent]
        self.data.loc[idx, self.column_name] = ' '.join([str(elem) for elem in word_list])

    # if self.verbose > 1:
    #     display(self.data)

    if self.verbose > 0:
        print(colored("Finished processing all descriptions\n",
                      color="blue", attrs=['bold', 'underline']))

    return self.data
def main():
    #~~~~~~~~~~~~~~~~~~~~~~~ INITIALIZATIONS ~~~~~~~~~~~~~~~~~~~~~~~
    nlp = spacy.load('en')
    cont = Contractions('../GoogleNews-vectors-negative300.bin.gz')
    cont.load_models()

    #stopwords = spacy.lang.en.STOP_WORDS
    #spacy.lang.en.STOP_WORDS.add("e.g.")
    #nlp.vocab['the'].is_stop
    nlp.Defaults.stop_words |= {
        "(a)", "(b)", "(c)", "etc", "etc.", "etc.)", "w/e", "(e.g.", "no?", "s",
        "film", "movie", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", "10",
        "e", "f", "k", "n", "q", "de", "oh", "ones", "miike", "http", "imdb",
    }
    stopwords = list(nlp.Defaults.stop_words)
    tokenizer = Tokenizer(nlp.vocab)
    punctuations = string.punctuation
    LabeledSentence1 = gensim.models.doc2vec.TaggedDocument

    ##################### REVIEWS OF 100 BEST MOVIES on IMDB #####################
    # open the base URL webpage
    html_page = urlopen("https://www.imdb.com/list/ls059633855/")
    # instantiate beautiful soup object of the html page
    soup = BeautifulSoup(html_page, 'lxml')

    review_text = get_movie_reviews(soup)
    all_good_movie_reviews = get_tagged_documents(review_text, cont, stopwords,
                                                  tokenizer, punctuations,
                                                  LabeledSentence1)
    print('\n')
    d2v_model_best = doc2vec_similarity(all_good_movie_reviews, 'Best')

    #~~~~~~~~~~~~~~~~~~~~~~~ K-MEANS CLUSTERING ~~~~~~~~~~~~~~~~~~~~~~~
    all_rev_text = [rev for rev, ttl in all_good_movie_reviews]
    all_rev_ttls = [ttl[0] for rev, ttl in all_good_movie_reviews]

    numclust = 4
    kmeans_model = KMeans(n_clusters=numclust, init='k-means++', max_iter=300,
                          random_state=111)
    X = kmeans_model.fit(d2v_model_best.docvecs.vectors_docs)
    kmeans_clust_labels = kmeans_model.labels_.tolist()

    output_list = list(zip(kmeans_clust_labels, all_rev_ttls))
    print('The Groupings assigned by K-Means Clustering are: \n\n'.format(numclust))
    for i in sorted(output_list):
        print(i)
    print('\n')

    ttls_and_labels_kmeans_clust = []
    i = 0
    for rev, ttl in all_good_movie_reviews:
        ttls_and_labels_kmeans_clust.append((ttl, kmeans_clust_labels[i]))
        i += 1

    kmeans_clust_centroids = np.array(kmeans_model.cluster_centers_)
    get_docs_closest_to_centroids(
        data_length=100,
        n_clust=4,
        clust_labels=kmeans_clust_labels,
        centroid_input=kmeans_clust_centroids,
        doc_vecs=d2v_model_best.docvecs.vectors_docs,
        all_reviews=all_good_movie_reviews,
        ttls_with_labels=ttls_and_labels_kmeans_clust)

    #pca = PCA(n_components=2).fit(d2v_model_best.docvecs.vectors_docs)
    #datapoint = pca.transform(d2v_model_best.docvecs.vectors_docs)
    #plt.figure
    #label1 = ["#FFFF00", "#008000", "#0000FF", "#800080", "#ff0000"]
    #color = [label1[i] for i in kmeans_clust_labels]
    #plt.scatter(datapoint[:, 0], datapoint[:, 1], c=color)
    #centroids = kmeans_model.cluster_centers_
    #centroidpoint = pca.transform(centroids)
    #plt.scatter(centroidpoint[:, 0], centroidpoint[:, 1], marker='^', s=150, c='#000000')
    #plt.show()
    #kmeans_3D(d2v_model_best)

    #~~~~~~~~~~~~~~~ AGGLOMERATIVE HIERARCHICAL CLUSTERING ~~~~~~~~~~~~~~~
    tfidf_input = [' '.join(t) for t in all_rev_text]

    # define vectorizer parameters
    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, min_df=0.2, use_idf=True,
                                       ngram_range=(1, 3))
    tfidf_matrix = tfidf_vectorizer.fit_transform(tfidf_input)
    dist = 1 - cosine_similarity(tfidf_matrix)
    linkage_matrix = ward(dist)
    cl = linkage_matrix
    numclust = 5
    hier_clust_labels = fcluster(cl, numclust, criterion='maxclust')
    hier_clust_labels = hier_clust_labels - 1

    output_list = list(zip(hier_clust_labels, all_rev_ttls))
    print('The Levels assigned by a {}-Tiered Hierarchical Clustering are: \n\n'.format(numclust))
    for i in sorted(output_list):
        print(i)
    print('\n')

    ttls_and_labels_hier_clust = []
    i = 0
    for rev, ttl in all_good_movie_reviews:
        ttls_and_labels_hier_clust.append((ttl, hier_clust_labels[i]))
        i += 1

    hier_clust_codebook = []
    for i in range(hier_clust_labels.min(), hier_clust_labels.max() + 1):
        hier_clust_codebook.append(
            d2v_model_best.docvecs.vectors_docs[hier_clust_labels == i].mean(0))
    hier_clust_centroids = np.vstack(hier_clust_codebook)
    get_docs_closest_to_centroids(
        data_length=100,
        n_clust=5,
        clust_labels=hier_clust_labels,
        centroid_input=hier_clust_centroids,
        doc_vecs=d2v_model_best.docvecs.vectors_docs,
        all_reviews=all_good_movie_reviews,
        ttls_with_labels=ttls_and_labels_hier_clust)

    #plot_spectral_embed_agglom_clusters(dist, 5)
    #dendro_static(linkage_matrix, all_rev_ttls)
    #dendro_interactive(dist, all_rev_ttls)
    #dendro_heatmap(dist, all_rev_ttls)

    ##################### REVIEWS OF 100 WORST MOVIES on IMDB #####################
    # open the base URL webpage
    html_page_2 = urlopen("https://www.imdb.com/list/ls061324742/")
    # instantiate beautiful soup object of the html page
    soup_2 = BeautifulSoup(html_page_2, 'lxml')

    review_text_2 = get_movie_reviews(soup_2)
    all_bad_movie_reviews = get_tagged_documents(review_text_2, cont, stopwords,
                                                 tokenizer, punctuations,
                                                 LabeledSentence1)
    print('\n')
    d2v_model_worst = doc2vec_similarity(all_bad_movie_reviews, 'Worst')

    #~~~~~~~~~~~~~~~~~~~~~~~ K-MEANS CLUSTERING ~~~~~~~~~~~~~~~~~~~~~~~
    all_rev_text_bad = [rev for rev, ttl in all_bad_movie_reviews]
    all_rev_ttls_bad = [ttl[0] for rev, ttl in all_bad_movie_reviews]

    numclust = 4
    kmeans_model_worst = KMeans(n_clusters=numclust, init='k-means++',
                                max_iter=300, random_state=111)
    X_worst = kmeans_model_worst.fit(d2v_model_worst.docvecs.vectors_docs)
    kmeans_clust_labels_worst = kmeans_model_worst.labels_.tolist()

    output_list = list(zip(kmeans_clust_labels_worst, all_rev_ttls_bad))
    print('The Groupings assigned by K-Means Clustering are: \n\n'.format(numclust))
    for i in sorted(output_list):
        print(i)
    print('\n')

    ttls_and_labels_kmeans_clust_bad = []
    i = 0
    for rev, ttl in all_bad_movie_reviews:
        ttls_and_labels_kmeans_clust_bad.append((ttl, kmeans_clust_labels_worst[i]))
        i += 1

    kmeans_clust_centroids_worst = np.array(kmeans_model_worst.cluster_centers_)
    get_docs_closest_to_centroids(
        data_length=100,
        n_clust=4,
        clust_labels=kmeans_clust_labels_worst,
        centroid_input=kmeans_clust_centroids_worst,
        doc_vecs=d2v_model_worst.docvecs.vectors_docs,
        all_reviews=all_bad_movie_reviews,
        ttls_with_labels=ttls_and_labels_kmeans_clust_bad)

    #pca_worst = PCA(n_components=2).fit(d2v_model_worst.docvecs.vectors_docs)
    #datapoint_worst = pca_worst.transform(d2v_model_worst.docvecs.vectors_docs)
    #plt.figure
    #label1 = ["#FFFF00", "#008000", "#0000FF", "#800080", "#ff0000"]
    #color = [label1[i] for i in kmeans_clust_labels_worst]
    #plt.scatter(datapoint_worst[:, 0], datapoint_worst[:, 1], c=color)
    #centroids = kmeans_model_worst.cluster_centers_
    #centroidpoint = pca_worst.transform(centroids)
    #plt.scatter(centroidpoint[:, 0], centroidpoint[:, 1], marker='^', s=150, c='#000000')
    #plt.show()
    #kmeans_3D(d2v_model_worst)

    #~~~~~~~~~~~~~~~ AGGLOMERATIVE HIERARCHICAL CLUSTERING ~~~~~~~~~~~~~~~
    tfidf_input_bad = [' '.join(t) for t in all_rev_text_bad]

    # define vectorizer parameters
    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, min_df=0.2, use_idf=True,
                                       ngram_range=(1, 3))
    tfidf_matrix_bad = tfidf_vectorizer.fit_transform(tfidf_input_bad)
    dist_bad = 1 - cosine_similarity(tfidf_matrix_bad)
    linkage_matrix_bad = ward(dist_bad)
    cl = linkage_matrix_bad
    numclust = 5
    hier_clust_labels_bad = fcluster(cl, numclust, criterion='maxclust')
    hier_clust_labels_bad = hier_clust_labels_bad - 1

    output_list = list(zip(hier_clust_labels_bad, all_rev_ttls_bad))
    print('The Levels assigned by a {}-Tiered Hierarchical Clustering are: \n\n'.format(numclust))
    for i in sorted(output_list):
        print(i)
    print('\n')

    ttls_and_labels_hier_clust_bad = []
    i = 0
    for rev, ttl in all_bad_movie_reviews:
        ttls_and_labels_hier_clust_bad.append((ttl, hier_clust_labels_bad[i]))
        i += 1

    hier_clust_codebook_bad = []
    for i in range(hier_clust_labels_bad.min(), hier_clust_labels_bad.max() + 1):
        hier_clust_codebook_bad.append(
            d2v_model_worst.docvecs.vectors_docs[hier_clust_labels_bad == i].mean(0))
    hier_clust_centroids_bad = np.vstack(hier_clust_codebook_bad)
    get_docs_closest_to_centroids(
        data_length=100,
        n_clust=5,
        clust_labels=hier_clust_labels_bad,
        centroid_input=hier_clust_centroids_bad,
        doc_vecs=d2v_model_worst.docvecs.vectors_docs,
        all_reviews=all_bad_movie_reviews,
        ttls_with_labels=ttls_and_labels_hier_clust_bad)

    #plot_spectral_embed_agglom_clusters(dist_bad, n_clust=5)
    #dendro_static(linkage_matrix_bad, all_rev_ttls_bad)
    #dendro_interactive(dist_bad, all_rev_ttls_bad)
    #dendro_heatmap(dist_bad, all_rev_ttls_bad)

    return None
from pycontractions import Contractions

"""
Contractions are words that we write with an apostrophe.
Examples of contractions are words like "ain't" or "aren't".
To standardize text, it is better to replace them.
"""

# See description in README.md
cont = Contractions('./data/GoogleNews-vectors-negative300.bin')
cont.load_models()

"""
Expands contractions in a list of texts.
Better to use on big batches of texts; one-by-one processing will be slow.
expand_texts produces a generator of texts.
"""


def replace_contractions(batch_of_texts):
    try:
        return list(cont.expand_texts(batch_of_texts, precise=True))
    # can fail on some words, like `he's`
    except Exception:
        return batch_of_texts
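A short usage sketch for replace_contractions, assuming the GoogleNews vectors exist at the path above; the sample sentences are illustrative. Passing the whole batch in one call avoids re-running the expansion machinery per text.

batch = ["I can't believe it's here", "They aren't ready yet"]
expanded = replace_contractions(batch)
print(expanded)
# e.g. ["I can not believe it is here", "They are not ready yet"]
# (exact expansions depend on the loaded model)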
class TextProcessing:
    """
    Class to clean text
    """

    def __init__(self, nlp=spacy.load("en_core_web_sm")):
        self.nlp = nlp
        contextualSpellCheck.add_to_pipe(self.nlp)
        model = api.load(cfg['embeddings']['embedding_file'])
        self.cont = Contractions(kv_model=model)
        self.cont.load_models()
        dirname = os.path.dirname(__file__)
        with open(os.path.join(dirname, 'acronym.json')) as f:
            self.acronyms = json.load(f)

    def process_text(self, text):
        """
        Processes text as follows:
        1. decode to unicode
        2. remove extra repeated special characters
        3. put space around the special characters
        4. remove extra whitespaces
        5. replace acronyms
        6. expand contractions of english words like ain't
        7. correct spelling mistakes
        8. replace NE in the text
        9. lower case the string

        Args:
            text: text to be processed
        """
        text = self.unidecode(text)
        text = self.remove_repeated_chars(text)
        text = self.put_space_around_special_chars(text)
        text = self.remove_extra_whitespaces(text)
        text = self.replace_acronyms(text)
        text = self.expand_contractions(text)
        text = self.correct_spellings(text)
        text = self.replace_named_entity(text)
        text = self.lower_case(text)
        return text

    def remove_repeated_chars(self, text):
        """
        Removes repeated instances of consecutive special chars

        Args:
            text: text to be processed
        """
        text = re.sub(r'([!@#$%^&*,./?\'";:\\])\1+', r'\1', text)
        return text

    def put_space_around_special_chars(self, text):
        """
        Puts space around special chars like '[({$&*#@!'

        Args:
            text: text to be processed
        """
        chars = [
            '$', '?', '%', '@', '!', '#', '^', '*', '&', '"', ':', ';', '/',
            '\\', ',', '+', '(', ')', '[', ']', '{', '}', '<', '>'
        ]
        for char in chars:
            text = text.replace(char, ' ' + char + ' ')
        return text

    def remove_extra_whitespaces(self, text):
        """
        Removes extra whitespaces from the text

        Args:
            text: text to be processed
        """
        return text.strip()

    def unidecode(self, text):
        """
        unidecodes the text

        Args:
            text: text to be processed
        """
        return unidecode.unidecode(text.lower())

    def lower_case(self, text):
        """
        lower cases the text

        Args:
            text: text to be processed
        """
        return text.lower()

    def expand_contractions(self, text):
        """
        expands contractions, for example "ain't" expands to "am not"

        Args:
            text: text to be processed
        """
        return list(self.cont.expand_texts([text.lower()], precise=True))[0]

    def correct_spellings(self, text):
        """
        corrects spellings in the text

        Args:
            text: text to be processed
        """
        doc = self.nlp(text)
        if doc._.performed_spellCheck:
            text = doc._.outcome_spellCheck
        return text

    def replace_acronyms(self, text):
        """
        Replaces acronyms found in English
        For example: ttyl -> talk to you later

        Args:
            text: text to be processed
        """
        for acronym, expansion in self.acronyms.items():
            text = text.replace(' ' + acronym.lower() + ' ',
                                ' ' + expansion.lower() + ' ')
        return text

    def replace_named_entity(self, text):
        """
        Replaces named entities in the text
        For example: $5bn loss estimated in the coming year ->
                     MONEY loss estimated in the coming year

        Args:
            text: text to be processed
        """
        doc = list(
            self.nlp.pipe([text],
                          disable=["tagger", "parser",
                                   "contextual spellchecker"]))[0]
        for ent in doc.ents:
            text = text.replace(ent.text, ent.label_)
        return text

    def token_list(self, text):
        doc = self.nlp(text)
        tokens = []
        for token in doc:
            tokens += [token.text]
        return tokens
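A usage sketch for the nine-step pipeline above, assuming the project's cfg points at a downloadable gensim embedding and that acronym.json sits next to the module; the input string is illustrative.

tp = TextProcessing()
print(tp.process_text("OMG!!! John ain't payin $5bn , ttyl"))
# repeated '!' collapsed, acronyms expanded from acronym.json, "ain't" expanded,
# spelling corrected, named entities replaced by their labels, result lower-cased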
class NLP():
    nlp = None
    doc = None
    model = None

    def __init__(self, spacy_model='en_core_web_sm', gensim_model='glove-twitter-25'):
        self.nlp = spacy.load(spacy_model)
        self.model = api.load(gensim_model)
        self.cont = Contractions(kv_model=self.model)

    def remove_html(self, text):
        """Strip HTML tags from text"""
        soup = BeautifulSoup(text, 'html.parser')
        return soup.get_text(separator=" ")

    def remove_accents(self, text):
        """Remove accented characters from text for non-english words"""
        return unidecode.unidecode(text)

    def expand_contractions(self, text):
        """Convert contractions into whole words. e.g. can't -> can not"""
        return list(self.cont.expand_texts([text], precise=True))[0]

    def preprocess(self, text, remove_numbers=False, remove_stopwords=False,
                   excluded_sw=None, toke=False):
        """Preprocess using standard protocols.
        @param remove_numbers converts words to digits and removes
        @param remove_stopwords removes stop words
        @param excluded_sw is any stopwords to exclude
        @param toke if true, return tokens, default return text
        """
        text = self.remove_html(text)
        text = self.remove_accents(text)
        text = self.expand_contractions(text)
        if toke or remove_numbers or remove_stopwords:
            if excluded_sw is not None:
                for w in excluded_sw:
                    self.nlp.vocab[w].is_stop = False
            doc = self.nlp(text)
            tokens = []
            for token in doc:
                if token.pos_ == 'NUM' and not remove_numbers:
                    tokens.append(w2n.word_to_num(token.text))
                elif not token.is_stop:
                    tokens.append(token.text)
            if toke:
                return tokens
            text = " ".join(tokens)
        return text

    def lemmatize(self, tokens, toke=False):
        lookups = Lookups()
        lookups.add_table('lemma_index', lemma_index)
        lookups.add_table('lemma_exc', lemma_exc)
        lookups.add_table('lemma_rules', lemma_rules)
        lemmatizer = Lemmatizer(lookups)
        lemmas = []
        for t in tokens:
            lemmas.append(lemmatizer(t.text, t.tag_))
        if toke:
            return lemmas
        return " ".join(lemmas)

    def get_syllables(self, word):
        count = 0
        vowels = ("a", "e", "i", "o", "u", "y")
        prev = False
        for c in word:
            vowel = c in vowels
            if vowel and not prev:
                count += 1
            prev = vowel
        return count

    def get_lexical_density(self, tokens):
        c_words = t_words = 0
        cont_pos = ['PROPN', 'NOUN', 'VERB', 'ADJ', 'ADV']
        for t in tokens:
            if t.pos_ in cont_pos:
                c_words += 1
                t_words += 1
            elif t.pos_ != 'PUNCT':
                t_words += 1
        return round((c_words / t_words), 4)

    def get_coherence(self, text):
        doc = self.nlp(text)
        sentences = [sent for sent in doc.sents if len(sent) >= 2]
        frequency = defaultdict(int)
        token_sents = []
        for s in sentences:
            tmp = []
            for t in self.preprocess(s.text, remove_stopwords=True,
                                     excluded_sw=['no', 'not'], toke=True):
                tmp.append(t)
                frequency[t] += 1
            token_sents.append(tmp)

        vocab = [[word for word in sent if frequency[word] > 1]
                 for sent in token_sents]
        dictionary = corpora.Dictionary(vocab)
        corpus = [dictionary.doc2bow(word) for word in vocab]
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=20)
        corpus_lsi = lsi[corpus_tfidf]

        sums = {}
        topic_count = max([len(line) for line in corpus_lsi])
        for line in corpus_lsi:
            for topic in line:
                t_num = topic[0]
                if t_num not in sums:
                    sums[t_num] = abs(topic[1])
                else:
                    sums[t_num] += abs(topic[1])
        best_topic = max(zip(sums.values(), sums.keys()))[1]

        ordered = []
        i = 0
        for line in corpus_lsi:
            ordered.append((i, line[best_topic][1]))
            i += 1
        ordered = sorted(ordered, key=lambda x: x[1], reverse=True)
        threshold = ordered[0][1] - (0.90 * (ordered[0][1] - ordered[-1][1]))

        problem_sentences = []
        for s in ordered:
            if s[1] < threshold:
                problem_sentences.append((s[1], s))
        problem_sentences = [s for s in ordered if s[1] < threshold]

        output = {}
        for p in problem_sentences:
            output[p[0]] = (p[1], str(sentences[p[0]]))
        return output

    def get_readability(self, text):
        scores = {}
        doc = self.nlp(text)
        sentence_count = len(list(doc.sents))
        words = self.preprocess(text, toke=True)
        characters = 0
        for word in words:
            characters += len(word)
        word_count = len(words)
        syllable_count = 0
        complex_words = 0
        for word in words:
            c = self.get_syllables(word)
            syllable_count += c
            if c >= 3 and not word[0].isupper():
                complex_words += 1
        avgwps = word_count / sentence_count

        # Automated Readability Index
        ari = 0.0
        ari_grade = 0
        if word_count > 0:
            ari = 4.71 * (characters / word_count) + \
                0.5 * (word_count / sentence_count) - 21.43
        if ari < 2:
            ari_grade = 0
        elif ari > 12:
            ari_grade = 13
        else:
            ari_grade = ari
        scores["ari"] = (ari, ari_grade)

        # Flesch Reading Ease
        flesch_reading_ease = 101
        fre_grade = 0
        if word_count > 0 and sentence_count > 0:
            flesch_reading_ease = 206.835 - \
                1.015 * (word_count / sentence_count) - \
                84.6 * (syllable_count / word_count)
        if flesch_reading_ease > 100:
            fre_grade = 4
        elif flesch_reading_ease > 90.0:
            fre_grade = 5
        elif flesch_reading_ease > 80.0:
            fre_grade = 6
        elif flesch_reading_ease > 70.0:
            fre_grade = 7
        elif flesch_reading_ease > 60.0:
            fre_grade = 9
        elif flesch_reading_ease > 50:
            fre_grade = 12
        else:
            fre_grade = 13
        scores["flesch_reading_ease"] = (flesch_reading_ease, fre_grade)

        # Flesch-Kincaid Grade Level
        fkg = 0.0
        if word_count > 0 and sentence_count > 0:
            fkg = 0.39 * (word_count / sentence_count) + \
                11.8 * (syllable_count / word_count) - 15.59
        scores["flesch_kinkaid_grade_level"] = (fkg, int(fkg))

        # Gunning Fog Index
        gfi = 0.0
        gfi_grade = 0
        if sentence_count > 0 and word_count > 0:
            gfi = 0.4 * ((word_count / sentence_count) +
                         100 * (complex_words / word_count))
        if gfi < 6:
            gfi_grade = 5
        elif gfi <= 12:
            gfi_grade = int(gfi)
        else:
            gfi_grade = 13
        scores["gunning_fog_index"] = (gfi, gfi_grade)

        # SMOG Readability
        smog = 0.0
        smog_grade = 0
        if sentence_count > 0:
            smog = 1.0430 * math.sqrt(complex_words * (30 / sentence_count)) + 3.1291
        if smog >= 13:
            smog_grade = 13
        else:
            smog_grade = int(smog)
        scores["smog_readability"] = (smog, smog_grade)

        # ColemanLiauIndex
        coleman = 0.0
        coleman_grade = 0
        if word_count > 0:
            coleman = (5.89 * (characters / word_count)) - \
                (30 * (sentence_count / word_count)) - 15.8
        if coleman >= 13:
            coleman_grade = 13
        else:
            coleman_grade = int(coleman)
        scores["coleman_liau"] = (coleman, coleman_grade)

        # LIX & RIX
        lix = 0.0
        rix = 0.0
        lix_grade = 0
        rix_grade = 0
        if sentence_count > 0 and word_count > 0:
            long_words = 0
            for word in words:
                if len(word) >= 7:
                    long_words += 1
            lix = word_count / sentence_count + ((100. * long_words) / word_count)
            rix = long_words / sentence_count
        if lix >= 13:
            lix_grade = 13
        else:
            lix_grade = int(lix)
        if rix >= 13:
            rix_grade = 13
        else:
            rix_grade = int(rix)
        scores["LIX"] = (lix, lix_grade)
        scores["RIX"] = (rix, rix_grade)

        count = 0
        avg = 0.0
        for k, v in scores.items():
            avg += v[1]
            count += 1
        scores["AVERAGE_GRADE"] = (avg / count, int(avg / count))
        return scores
def train_model() -> None:
    train_data = fetch_data.fetch_imdb_train_data()

    cont = Contractions(constants.CONTRACTIONS_BIN_FILE)
    cont.load_models()

    for index, row in train_data.iterrows():
        row.review = BeautifulSoup(row.review, features="html.parser").get_text()
        row.review = cont.expand_texts(row.review, precise=True)

    train_data.review = clean_reviews(train_data.review)

    reviews = list(tokenize_sentences(train_data.review))
    labels = list(train_data.sentiment)

    tokenizer = Tokenizer(num_words=constants.MAX_NB_WORDS)
    tokenizer.fit_on_texts(train_data.review)

    data = np.zeros((len(train_data.review), constants.MAX_SENTS,
                     constants.MAX_SENT_LENGTH), dtype='float32')
    words = list()

    for i, sentences in enumerate(reviews):
        for j, sent in enumerate(sentences):
            if j < constants.MAX_SENTS:
                wordTokens = text_to_word_sequence(sent)
                k = 0
                for _, word in enumerate(wordTokens):
                    if k < constants.MAX_SENT_LENGTH and \
                            tokenizer.word_index[word] < constants.MAX_NB_WORDS:
                        data[i, j, k] = tokenizer.word_index[word]
                        k = k + 1
                words.append(wordTokens)

    word_index = tokenizer.word_index
    print('Total %s unique tokens.' % len(word_index))

    labels = to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    wordSkipGramModel = gensim.models.Word2Vec(words, min_count=5,
                                               size=constants.EMBEDDING_DIM,
                                               window=4, sg=1)

    word_embedding_matrix = np.random.random((len(word_index) + 1, constants.EMBEDDING_DIM))
    for word, i in word_index.items():
        try:
            word_embedding_vector = wordSkipGramModel.wv.get_vector(word)
        except KeyError:
            # words not found in the embedding model keep their random initialization
            continue
        if word_embedding_vector is not None:
            word_embedding_matrix[i] = word_embedding_vector

    embedding_layer = Embedding(len(word_index) + 1,
                                constants.EMBEDDING_DIM,
                                weights=[word_embedding_matrix],
                                input_length=constants.MAX_SENT_LENGTH,
                                trainable=True)

    sentence_input = Input(shape=(constants.MAX_SENT_LENGTH,), dtype='float32')
    embedded_sequences = embedding_layer(sentence_input)
    sentence_lstm = Bidirectional(LSTM(200, return_sequences=True))(embedded_sequences)
    l_dropout = Dropout(0.5)(sentence_lstm)
    l_dense = TimeDistributed(Dense(400))(l_dropout)
    l_att = attention_layer.AttLayer()(l_dense)
    l_dropout_1 = Dropout(0.4)(l_att)
    sentEncoder = Model(sentence_input, l_dropout_1)

    review_input = Input(shape=(constants.MAX_SENTS, constants.MAX_SENT_LENGTH), dtype='float64')
    review_encoder = TimeDistributed(sentEncoder)(review_input)
    review_dropout = Dropout(0.3)(review_encoder)
    l_lstm_review = Bidirectional(LSTM(100, return_sequences=True))(review_dropout)
    l_att_dropout_review = Dropout(0.2)(l_lstm_review)
    l_dense_review = TimeDistributed(Dense(200))(l_att_dropout_review)
    l_dropout_review = Dropout(0.2)(l_dense_review)
    l_att_review = attention_layer.AttLayer()(l_dropout_review)
    preds = Dense(2, activation='softmax')(l_att_review)
    model = Model(review_input, preds)

    adam = Adam(lr=0.0001)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(data, labels, validation_split=0.2, epochs=10, batch_size=50,
              shuffle=False, verbose=1)

    model.save('deeplearn_sentiment_model.h5')

    # Save Tokenizer i.e. Vocabulary
    with open('reviews_tokenizer.pkl', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    # Store the preprocessed tweets in a file
    if exportPreprocessDataToFile == 1:
        dfExport = pd.concat([tweetTextPreprocessed, y], axis=1)
        dfExport.to_csv(fileToExport, index=False, encoding="utf-8")

    return tweetTextPreprocessed, y


#--------------------------Main--------------------------#
if __name__ == "Modules.Preprocess":
    # Setting up the dataset
    colNames = ['Target', 'Id', 'Date', 'Flag', 'User', 'Text']  # Get the name of the columns
    encoding = "ISO-8859-1"  # utf8 cannot read some special characters, so ISO-8859-1 is used
    fileName = 'tweet_dataset.csv'

    print("Loading language model. This may take several seconds.")
    nlp = en_core_web_md.load()  # Load the English medium spaCy language model ('en_core_web_md')

    print("Loading GloveTwitter model. This may take up to 1 minute.")
    # Choose the model. Others such as "word2vec-google-news-300" are available too.
    # Use "glove-twitter-100" (<1GB) or "glove-twitter-200" (1GB) for final results.
    # "glove-twitter-25" (200MB) is just for fast checks.
    cont = Contractions(api_key="glove-twitter-100")
    cont.load_models()  # Get the contractions for English and prevent loading on first expand_texts call

    # Exclude some words with potential value for sentiment analysis from the stop-word list
    deselect_stop_words = ["no", "not", "n't", "less", "enough", "never"]
    for w in deselect_stop_words:
        nlp.vocab[w].is_stop = False
        lst, query, num, Api):
    # will append "num" tweets based on "query" to list "lst" using API "Api".
    # It will also remove retweets. These tweets will usually contain hashtags.
    sample_tweets = tweepy.Cursor(Api.search,
                                  q=("# " + query + " -filter:retweets"),
                                  lang="en",
                                  tweet_mode='extended').items(num)
    for tweet in sample_tweets:
        lst.append(tweet)


# Code does not process links/URLs in tweets properly.
# Code will remove newline characters in tweets during processing (done as part of Task #2).
# Code will not process non-english names correctly (it would be useful to add them to relevant lexicons).
# Code does not process the '£' sign properly.

cont = Contractions(api_key="glove-twitter-25")  # this will be used in tweet_Processor to expand contractions
#cont.load_models()

d = eht.Dict("en_GB")  # this will be used in tweet_Processor to detect words outside the British English Dictionary
sc = SpellChecker()
parser = GingerIt()  # for grammar correction

# Task 2 related stuff covered here. It was decided that using .json files is not efficient


def tweet_Processor(tweet):
    sentences = sent_tokenize(str(tweet.full_text))
    for s in range(len(sentences)):
        sentences[s] = pytypo.correct_sentence(
# Helper method for pre-processing
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import glob
import nltk
from pycontractions import Contractions
import spacy


# method to remove contractions
def removeconcat(line, cont):
    return list(cont.expand_texts([line]))


# Note: this method must be run after the SpellCheck method provided in the C# program
stopWords = set(stopwords.words())
path = 'C:\\Temp\\conversations\\*.json'
cont = Contractions(api_key="glove-twitter-100")
cont.load_models()
files = glob.glob(path)
nlp = spacy.load('en')

for file in files:
    f = open(file, 'r')
    lines = f.readlines()
    alllines = removeconcat(line, cont)
    for line in lines:
        doc = nlp(line)
        for ent in doc.ents:
            print(ent, ent.lemma_, ent.label_)
    f.close()
class Cleaner:
    def __init__(
        self,
        expand_contractions=True,
        strip_text_in_brackets=False,
        combine_concatenations=False,
        w2v_path=None,
        api_key="word2vec-google-news-300",
    ):
        self.opt_expand_contractions = expand_contractions
        self.opt_strip_text_in_brackets = strip_text_in_brackets
        self.opt_combine_concatenations = combine_concatenations

        if expand_contractions:
            print("Loading contractions dataset (this will take a while the first time)")
            # Load your favorite word2vec model
            self.cont = Contractions(w2v_path=w2v_path, api_key=api_key)
            print("Contractions dataset downloaded")
            print("Training contractions model (this will take a while)")
            # prevents loading on first expand_texts call
            self.cont.load_models()
            print("Contraction model successfully trained")

    def expand_contractions(self, text):
        text = text.replace("’", "'")  # need to put in the correct apostrophe
        expanded_text = list(self.cont.expand_texts([text], precise=True))
        return expanded_text[0]

    def strip_brackets(self, text):
        # Remove strings in brackets
        # E.g. "This is a sentence (extra info) description."
        # becomes "This is a sentence description."
        """
        Remove brackets from text

        Matches (), [], {}

        Converts:
        'hello (there) you (my[best] friend) lets {dine } }'
        -> 'hello you lets }'
        """
        brace_open_type = ""
        brace_pair = {'(': ')', '[': ']', '{': '}'}
        open_brace_list = list(brace_pair.keys())
        res = ""
        for c in text:
            if len(brace_open_type) == 0:
                # not opened
                if c in open_brace_list:
                    brace_open_type = c
                else:
                    res += c
            else:
                # opened
                if brace_pair[brace_open_type] == c:
                    brace_open_type = ""
        return res

    def combine_concatenations(self, sentence):
        """
        Receives string sentence "This is a sentence"
        """
        # convert concatenated words into separate words
        # georgetown-louisville becomes georgetown louisville
        # Pd matches all types of dashes
        # https://www.compart.com/en/unicode/category/Pd
        if self.opt_combine_concatenations:
            def _refu(sent):
                return regex.sub(r'\p{Pd}+', '', sent)
        else:
            def _refu(sent):
                return regex.sub(r'\p{Pd}+', ' ', sent)
        return _refu(sentence)

    def remove_non_english(self, tokens):
        """
        Removes non-english words and all punctuation and numbers
        Removes extra white space

        Receives list of tokens comprising a single sentence:
        ['this', 'is', 'a', 'sentence']
        """
        # remove all punctuation (removes non-english words too)
        # stripped = re.sub('[^a-zA-Z\s]*', '', stripped)
        # removes extra white spaces
        # stripped = re.sub('[ ]{2,}',' ', stripped)
        cleaned_tokens = []
        for token in tokens:
            cleaned = re.sub('[ ]{2,}', ' ', re.sub('[^a-zA-Z\s]*', '', token)).strip()
            if len(cleaned) != 0:
                cleaned_tokens.append(cleaned)
        return cleaned_tokens

    def lemmatize_sentences(self, tokenized_sentences):
        """
        Receives
        Args:
            tokenized_sentences is of form
            [['this', 'is', 'sentence'],
             ['this', 'is', 'another'],
             ['this', 'is', 'another']]
        Returns:
            lemmatized 2d list of same form
            [['this', 'is', 'sentenc'],
             ['this', 'is', 'anoth'],
             ['this', 'is', 'anoth']]
        """
        lemmatized_sentences = []
        for sentence in tokenized_sentences:
            lemmatized_sentences.append(lemmatize(sentence))
        # lemmatized_sentences = [lemmatize(sentence) for sentence in tokenized_sentences]
        return lemmatized_sentences

    def clean(self, text):
        if self.opt_expand_contractions:
            # Expands it's -> it is
            text = self.expand_contractions(text)

        # text is lowercased after contractions are expanded, because the
        # contractions will be capitalized after they are expanded,
        # e.g. (i'm -> [I, am]). Therefore, the lowercasing is done afterwards.
        text = text.lower()

        if self.opt_strip_text_in_brackets:
            text = self.strip_brackets(text)

        sentences = sent_tokenize(text)
        sentences = [self.combine_concatenations(sentence) for sentence in sentences]
        tokens_per_sentence = [word_tokenize(sent) for sent in sentences]
        lemmatized_tokens_per_sent = self.lemmatize_sentences(tokens_per_sentence)
        cleaned_tokens_per_sent = [
            self.remove_non_english(sent) for sent in lemmatized_tokens_per_sent
        ]

        return cleaned_tokens_per_sent
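A usage sketch for the Cleaner above, assuming the word2vec-google-news-300 vectors can be fetched by pycontractions (a large first-time download) and that the lemmatize helper used by lemmatize_sentences is importable; the input string is illustrative.

cleaner = Cleaner(expand_contractions=True, strip_text_in_brackets=True)
tokens_per_sentence = cleaner.clean("I'm tired (really). Georgetown-Louisville was fun!")
print(tokens_per_sentence)
# a 2-D list of lemmatized tokens, one inner list per sentence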
def remove_url(text):
    # Remove any web url starting with http or www
    return re.sub(r'(www|http)\S+', '', text)


def remove_email_address(text):
    # Remove any email address
    return re.sub(r'\S+@\S+', '', text)


# we should not use the very large precompiled word2vec model in the container;
# it would be slow and the container size would be big
model = KeyedVectors.load('/app/lib/gensim/GoogleNews-vectors-negative300', mmap='r')
cont = Contractions(kv_model=model)
cont.load_models()


def expand_contractions(text):
    """expand shortened words, e.g. don't to do not"""
    text = list(cont.expand_texts([text], precise=True))[0]
    return text


@app.route("/v1/preprocess", methods=["GET", "POST"])
def preprocess():
    data = {"success": False}

    # get the request parameter
    params = flask.request.json
    if (params == None):
from paragraph_segmentation_dcnn import make_cnn as ParagraphSegmentationNet, paragraph_segmentation_transform
from word_segmentation import SSD as WordSegmentationNet, predict_bounding_boxes
from handwriting_line_recognition import Network as HandwritingRecognitionNet, handwriting_recognition_transform
from handwriting_line_recognition import decode as decoder_handwriting
import cv2
import nltk
from pyScan import pyScan
import os

ctx = mx.gpu(0)

alphabet_encoding = r' !"#&\'()*+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
ls = LexiconSearch()
contractions = Contractions(
    '/home/jrmo/git/HandwrittenTextRecognition_MXNet/models/GoogleNews-vectors-negative300.bin.gz'
)
lm_model, vocab = gluonnlp.model.get_model('awd_lstm_lm_1150', pretrained=True, ctx=ctx)
'''Thanks to Thomas Delteil for creating this model'''

predictions = []


# Allows to pass initialized nets
def predict(image,
            psn=None,
            wsn=None,
            hlrn=None,
            min_c=0.01,
class TextCleaner:
    word_re = re.compile('[a-zA-Z]+')
    number_re = re.compile('[0-9]+$')
    spell_checker = SpellChecker()
    lemmatizer = WordNetLemmatizer()
    all_words = set(words.words())

    def __init__(self, save_path, word2vec_model_path, previously_processed=[]):
        self.contractions = Contractions(word2vec_model_path)
        self.previously_processed = previously_processed
        self.save_path = save_path

    def _get_all_comments(self, subreddit):
        comments = []
        for submission in subreddit["submissions"]:
            for comment in submission["comments"]:
                comments.extend(sent_tokenize(comment["body"]))
        return comments

    def _remove_urls(self, text):
        url_pattern = r'(((https?|ftp)://)?(([a-zA-Z])+\.)?([a-zA-Z])+\.([a-zA-Z])+/?.*)|http'
        new_sentences = []
        for word in text.split():
            if re.compile(url_pattern).search(word):
                new_sentences.append(re.sub(url_pattern, "__isurl__", word))
            else:
                new_sentences.append(word)
        return " ".join(new_sentences)

    def _invalid_characters(self, string):
        string = re.sub("(\s|-|_|\.\.\.)+", " ", string)
        return re.sub("!|#|&|\(|\)|–|\[|{|}|\]|:|;|\?|\*", "", string)

    def _expand_sentences(self, texts):
        return list(
            self.contractions.expand_texts(
                [x.replace("’", "'") for x in texts], precise=True))

    def _replace(self, sentence, is_spell_check=True):
        words = []
        for word in word_tokenize(sentence):
            word = word.strip()
            if "/" in word or "\\" in word:
                words.append("__isslashinword__")
            elif self.word_re.match(word):
                if is_spell_check and word not in self.all_words:
                    words.append(self.spell_checker.correction(word))
                else:
                    words.append(word)
            elif self.number_re.match(word):
                words.append("__isnumber__")
            elif "__isurl__" in word:
                words.append("__isurl__")
            else:
                words.append("__isinvalidword__")
        return words

    def _words_and_tags(self, words):
        lemmas = []
        pos_tags = []
        for word, pos_tag in nltk.pos_tag(words):
            pos_tags.append(pos_tag)
            if self._get_wordnet_pos(pos_tag):
                lemmas.append(
                    self.lemmatizer.lemmatize(
                        word, pos=self._get_wordnet_pos(pos_tag)))
            else:
                lemmas.append(self.lemmatizer.lemmatize(word))
        return (" ".join(lemmas), pos_tags)

    ## there are others but this is sufficient, e.g. one more wordnet pos tag
    ## (adjective satellite) and many more nltk pos tags
    def _get_wordnet_pos(self, treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return ''

    def process_subreddits(self, subreddits, save=True, check_previous=True):
        for subreddit in subreddits:
            print(subreddit["display_name"])
            pathlib.Path(self.save_path).mkdir(exist_ok=True)

            all_raw_comments = self._get_all_comments(subreddit)
            raw_comments = all_raw_comments

            comment_no_urls = []
            comment_removed_chars = []
            comment_expandeds = []
            comment_replaced_spell_corrections = []
            comment_processed_spell_corrections = []
            pos_tag_sent_spell_corrections = []
            comment_replaced_no_spell_corrections = []
            comment_processed_no_spell_corrections = []
            pos_tag_no_sent_spell_corrections = []

            count = 0
            total = len(raw_comments)
            for comment in raw_comments:
                print(comment)
                comment_no_url = self._remove_urls(comment)
                comment_removed_char = self._invalid_characters(comment_no_url)
                comment_expanded = self._expand_sentences([comment_removed_char])[0]

                comment_replaced_spell_correction = self._replace(
                    comment_expanded.lower(), is_spell_check=True)
                comment_processed_spell_correction, pos_tag_sent_spell_correction = \
                    self._words_and_tags(comment_replaced_spell_correction)

                comment_replaced_no_spell_correction = self._replace(
                    comment_expanded.lower(), is_spell_check=False)
                comment_processed_no_spell_correction, pos_tag_no_sent_spell_correction = \
                    self._words_and_tags(comment_replaced_no_spell_correction)

                count += 1
                print("count:", count, "total:", total, subreddit["display_name"])

                # Appending
                comment_no_urls.append(comment_no_url)
                comment_removed_chars.append(comment_removed_char)
                comment_expandeds.append(comment_expanded)
                comment_replaced_spell_corrections.append(comment_replaced_spell_correction)
                comment_processed_spell_corrections.append(comment_processed_spell_correction)
                pos_tag_sent_spell_corrections.append(pos_tag_sent_spell_correction)
                comment_replaced_no_spell_corrections.append(comment_replaced_no_spell_correction)
                comment_processed_no_spell_corrections.append(comment_processed_no_spell_correction)
                pos_tag_no_sent_spell_corrections.append(pos_tag_no_sent_spell_correction)

            data = {
                "raw": raw_comments,
                "comment_no_urls": comment_no_urls,
                "comment_removed_chars": comment_removed_chars,
                "comment_expandeds": comment_expandeds,
                "comment_replaced_spell_corrections": comment_replaced_spell_corrections,
                "comment_processed_spell_corrections": comment_processed_spell_corrections,
                "pos_tag_sent_spell_corrections": pos_tag_sent_spell_corrections,
                "comment_replaced_no_spell_corrections": comment_replaced_no_spell_corrections,
                "comment_processed_no_spell_corrections": comment_processed_no_spell_corrections,
                "pos_tag_no_sent_spell_corrections": pos_tag_no_sent_spell_corrections,
            }

            if save:
                subreddit_path = self.save_path + "TEST" + subreddit["display_name"] + ".json"
                with open(subreddit_path, 'w') as fp:
                    json.dump(data, fp)
            else:
                return data
with open('abbreviations.mapper', 'r') as file:
    content = file.read()
abbreviations_map = literal_eval(content)

paragraph_separator = '\n\n'
sentence_separator = ' '
token_separator = ' '
unnecessary_identifier_regex = '[0-9\[\]%/,()–\'<>^~`@|#$+:;’]'
unnecessary_space = ' '
unnecessary_unresolved_pron = '-PRON-'
unnecessary_apostrophe = ' \''
unnecessary_space_period = ' \.'
period_regex = '\.'
valid_eos_token = '[!?]'

# Time taking step
expander = Contractions(api_key='glove-wiki-gigaword-50')
assert list(expander.expand_texts(['loader_demo_text']))[0] == 'loader_demo_text'

# Time taking step
spacy_tool = spacy.load('en_md')
neuralcoref.add_to_pipe(spacy_tool)

logging.basicConfig(filename='summarizer.log',
                    filemode='w',
                    format='%(name)s - %(levelname)s - %(message)s',
                    level=logging.DEBUG)

# Takes about ~40 seconds to start-up
    data = re.sub(r"([0-9]+)000", r"\1k", data)
    return data


# %%

# %%
# nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


def stemming(sentence):
    stemmer = PorterStemmer()
    sentence = sentence.split()
    sentence = ' '.join(stemmer.stem(word) for word in sentence)  # if word not in stop_words)
    return sentence


# %%
cont = Contractions(api_key="glove-twitter-100")

# %%
data['question1'] = list(cont.expand_texts(data['question1']))
data['question2'] = list(cont.expand_texts(data['question2']))

data['question1'] = data['question1'].fillna('').apply(lambda x: BeautifulSoup(x, "lxml").text)
data['question2'] = data['question2'].fillna('').apply(lambda x: BeautifulSoup(x, "lxml").text)
data['question1'] = data['question1'].fillna('').apply(punctutions)
data['question2'] = data['question2'].fillna('').apply(punctutions)
data['question1'] = data['question1'].fillna('').apply(stemming)
data['question2'] = data['question2'].fillna('').apply(stemming)

#%%
data['fuzz_ratio'] = data.apply(lambda x: fuzz.ratio(x['question1'], x['question2']), axis=1)
data['fuzz_partial_ratio'] = data.apply(
    lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['token_sort_ratio'] = data.apply(
    lambda x: fuzz.token_sort_ratio(x['question1'], x['question2']), axis=1)
from TwitterAPI import TwitterAPI
from gensim.models import Word2Vec
import gensim.downloader as api
import sys
import re
import string
from nltk.tokenize import TweetTokenizer
import preprocessor

tknzr = TweetTokenizer(strip_handles=True)

from pycontractions import Contractions

# Load your favorite semantic vector model in gensim keyedvectors format from disk
cont = Contractions(api_key="glove-twitter-100")


def getTweets(query, api):
    r = api.request('search/tweets', {'q': query})
    tweetList = []
    for item in r:
        if (item['lang'] == 'en'):
            tweetList.append(item['text'])
    return tweetList


def main():
    # Interface with OPSUS sentiment analysis API
    api = TwitterAPI('7ldroPca5V9h2GczFb2ySRuqS',
                     'Di98ZN3xmRcoeL3St1Xe6fEo6expkyNZLezdwn2ON8sUCK2t6T',
                     '1049876480447123457-cEA1uhUauGFjA1oPGxpUB2tCJGAaen',
                     'xsGol4ZwM6FRLxiM2ucp80brUDENKdn3r3pf0h8yhEO5t')
    # Get all of the tweets mentioning the brand of interest