Example #1
    def __init__(self,
                 save_path,
                 word2vec_model_path,
                 previously_processed=[]):
        self.contractions = Contractions(word2vec_model_path)
        self.previously_processed = previously_processed
        self.save_path = save_path
Example #2
    def initialize(self):
        print("Initializing Text Cleaner..")
       
        print("Initializing Smart Contractions Module..")
        self.cont = Contractions(self.embedding_for_smart_contraction)
        self.cont.load_models()
        
        print("Initializing Stopwords Module..")
        self.stop_words = set(stopwords.words('english'))
        stop_words_without_negation = copy.deepcopy(self.stop_words)
        stop_words_without_negation.remove('no')
        stop_words_without_negation.remove('nor')
        stop_words_without_negation.remove('not')
        self.stop_words_without_negation = stop_words_without_negation
        self.pos_tags_set_1 = {'NNP'}

        print("Initializing Wordnet Lemmatizer Module..")
        self.wnl = WordNetLemmatizer()
        
        print("Initializing Spellcheck Module..")
        max_edit_distance_dictionary = 2
        prefix_length = 7
        self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        dictionary_path = os.path.abspath('')+"\\"+self.spell_dictonarypath
        self.sym_spell.load_dictionary(dictionary_path, 0, 1)
        
        print("Initialization complete!")
Example #3
    def __init__(
        self,
        expand_contractions=True,
        strip_text_in_brackets=False,
        combine_concatenations=False,
        w2v_path=None,
        api_key="word2vec-google-news-300",
    ):

        self.opt_expand_contractions = expand_contractions
        self.opt_strip_text_in_brackets = strip_text_in_brackets
        self.opt_combine_concatenations = combine_concatenations

        if expand_contractions:
            print(
                "Loading contractions dataset (this will take a while the first time)"
            )

            # Load your favorite word2vec model
            self.cont = Contractions(w2v_path=w2v_path, api_key=api_key)
            print("Contractions dataset downloaded")

            print("Training contractions model (this will take a while)")
            # prevents loading on first expand_texts call
            self.cont.load_models()
            print("Contraction model successfully trained")
Example #4
    def __init__(self, kv_model=None, api_key='glove-twitter-100', precise=False):
        self.kv_model = kv_model
        self.api_key = api_key
        self.precise = precise
        if api_key:
            self.contractions = Contractions(api_key=api_key)
        else:
            self.contractions = Contractions(kv_model=kv_model)
        self.contractions.load_models()
Example #5
    def __init__(self, nlp=spacy.load("en_core_web_sm")):
        self.nlp = nlp
        contextualSpellCheck.add_to_pipe(self.nlp)
        model = api.load(cfg['embeddings']['embedding_file'])
        self.cont = Contractions(kv_model=model)
        self.cont.load_models()
        dirname = os.path.dirname(__file__)
        with open(os.path.join(dirname, 'acronym.json')) as f:
            self.acronyms = json.load(f)
Example #6
def contraction_removal():
       
    from pycontractions import Contractions
    import gensim.downloader as api
    # model = api.load("glove-twitter-25")
    # model = api.load("glove-twitter-100")
    model = api.load("word2vec-google-news-300")

    cont = Contractions(kv_model=model)
    cont.load_models()
Example #7
class ContractionsExpander(TransformerMixin):
    def __init__(self, kv_model=None, api_key='glove-twitter-100', precise=False):
        self.kv_model = kv_model
        self.api_key = api_key
        self.precise = precise
        if api_key:
            self.contractions = Contractions(api_key=api_key)
        else:
            self.contractions = Contractions(kv_model=kv_model)
        self.contractions.load_models()
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return self.contractions.expand_texts(X)
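Because ContractionsExpander exposes fit/transform, it can slot into a scikit-learn Pipeline. A minimal usage sketch (the pipeline steps and sample sentences are illustrative assumptions, not part of the original example):

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Hypothetical pipeline: expand contractions, then vectorize the expanded text.
# Constructing the expander downloads glove-twitter-100 on first use.
pipe = Pipeline([
    ("expand", ContractionsExpander(api_key="glove-twitter-100")),
    ("tfidf", TfidfVectorizer()),
])
# expand_texts yields a generator of strings, which TfidfVectorizer can consume.
features = pipe.fit_transform(["I can't do this", "they won't mind"])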
Example #8
    def __init__(self, model=None):
        '''
        :param model: Pretrained word embedding model.
        '''
        super().__init__()

        if model is None:
            # If no model is given, use the default one and store it as a static class variable to avoid multiple loadings
            if ContractionExpander.model_contraction_expander is None:
                model_path = os.path.join(NLP_MODELS_PATH, 'pretrained', 'word_embeddings', 'pubmed2018_w2v_400D',
                                  'pubmed2018_w2v_400D.bin')
                ContractionExpander.model_contraction_expander = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)

            model = ContractionExpander.model_contraction_expander

        self.cont = Contractions(kv_model=model)
Example #9
def preprocessing_text(df, series):

    #removing patterns (usernames & links to websites)
    pattern_list = [r'@[\w]*', r'http\S+']
    remove_patterns(df, series, pattern_list)

    #remove hashtag from clean text
    df[series] = remove_hashtags(df[series])

    # de-emojizing
    df[series] = df[series].apply(lambda x: emoji.demojize(x))

    #remove characters repeated more than 2 times
    df[series] = df[series].apply(lambda x: ReplaceThreeOrMore(x))

    #remove html characters
    char_list = ['&amp', '\n', 'á', '<', '>']
    remove_chars(df, series, char_list)

    #handle contractions
    cont = Contractions(api_key="glove-twitter-100")
    cont.load_models()
    df[series] = df[series].apply(lambda x: list(cont.expand_texts([x]))[0])

    #removing numbers
    df[series] = df[series].apply(
        lambda x: ''.join([i for i in x if not i.isdigit()]))

    #remove punctuation
    df[series] = df[series].str.replace(r'[^\w\s]', ' ', regex=True)

    #set to lowercase
    df[series] = df[series].apply(
        lambda x: " ".join(x.lower() for x in x.split()))

    #lemmatization
    df[series] = df[series].apply(lambda x: str(x))
    df[series] = df[series].apply(
        lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

    return df
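A hedged usage sketch for preprocessing_text (the DataFrame below is made up; remove_patterns, remove_hashtags, ReplaceThreeOrMore and remove_chars are assumed to be helper functions defined elsewhere in the original module):

import pandas as pd

# Illustrative input only: two short tweets in a column named "text".
df = pd.DataFrame({"text": ["@user I can't WAIT!!! :) http://example.com",
                            "It's soooo good &amp fun"]})
df = preprocessing_text(df, "text")
print(df["text"].tolist())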
Example #10
    def contractions_fun(self):
        """
        This function replaces words that are contractions
        by checking whether a word is present in a dictionary;
        if the word is present in the dictionary, it is replaced
        with its value from the dictionary
        """
        if self.contraction_method == 'mapping':
            self.doc = self.mapping_decontraction(str(self.doc))
        elif self.contraction_method == 'word2vec':
            model = pretrained_model
            cont = Contractions(model)
            cont.load_models()
            self.doc = list(cont.expand_texts([str(self.doc)],
                                              precise=True))[0]
        elif self.contraction_method == 'glove':
            model = api.load("glove-twitter-25")
            cont = Contractions(kv_model=model)
            cont.load_models()
            self.doc = list(cont.expand_texts([str(self.doc)],
                                              precise=True))[0]
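One practical note on the snippet above: the 'word2vec' and 'glove' branches rebuild and reload a Contractions model on every call. A small caching sketch (illustrative, not from the original code) that loads the embedding once and reuses it:

import gensim.downloader as api
from pycontractions import Contractions

_CONT_CACHE = {}

def get_glove_contractions():
    # Load the GloVe embedding and train the contractions model only once.
    if "glove" not in _CONT_CACHE:
        model = api.load("glove-twitter-25")
        cont = Contractions(kv_model=model)
        cont.load_models()
        _CONT_CACHE["glove"] = cont
    return _CONT_CACHE["glove"]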
Example #11
class ContractionExpander(TextProcessingBaseClass):
    '''
    Removes contractions from the text and uses the full version instead (unification).

    Example:
    I'll walk down the road --> I will walk down the road
    '''

    model_contraction_expander = None

    def __init__(self, model=None):
        '''
        :param model: Pretrained word embedding model.
        '''
        super().__init__()

        if model is None:
            # If no model is given, use the default one and store it as a static class variable to avoid multiple loadings
            if ContractionExpander.model_contraction_expander is None:
                model_path = os.path.join(NLP_MODELS_PATH, 'pretrained', 'word_embeddings', 'pubmed2018_w2v_400D',
                                  'pubmed2018_w2v_400D.bin')
                ContractionExpander.model_contraction_expander = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)

            model = ContractionExpander.model_contraction_expander

        self.cont = Contractions(kv_model=model)

    def level(self) -> str:
        return "text"

    def _process_internal(self, text: str) -> str:
        '''
        :param text: Input string.
        :return: The string without contractions.
        '''
        return list(self.cont.expand_texts([text], precise=True))[0]
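A minimal usage sketch for ContractionExpander (passing an explicit gensim KeyedVectors avoids the large default pubmed2018 model; calling _process_internal directly is only for illustration, since the base class presumably provides a public wrapper):

import gensim.downloader as api

# Illustrative only: any KeyedVectors model can stand in for the default one.
kv = api.load("glove-twitter-25")
expander = ContractionExpander(model=kv)
print(expander._process_internal("I'll walk down the road"))
# expected output along the lines of: "I will walk down the road"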
Example #12
class Cleaner:

    def __init__(self,
                embedding_for_smart_contraction="GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin",
                spell_dictonarypath = "frequency_dictionary_en_82_765.txt"):
        self.embedding_for_smart_contraction = embedding_for_smart_contraction
        self.spell_dictonarypath = spell_dictonarypath
        self.initialized = False 


    def initialize(self):
        print("Initializing Text Cleaner..")
       
        print("Initializing Smart Contractions Module..")
        self.cont = Contractions(self.embedding_for_smart_contraction)
        self.cont.load_models()
        
        print("Initializing Stopwords Module..")
        self.stop_words = set(stopwords.words('english'))
        stop_words_without_negation = copy.deepcopy(self.stop_words)
        stop_words_without_negation.remove('no')
        stop_words_without_negation.remove('nor')
        stop_words_without_negation.remove('not')
        self.stop_words_without_negation = stop_words_without_negation
        self.pos_tags_set_1 = {'NNP'}

        print("Initializing Wordnet Lemmatizer Module..")
        self.wnl = WordNetLemmatizer()
        
        print("Initializing Spellcheck Module..")
        max_edit_distance_dictionary = 2
        prefix_length = 7
        self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        dictionary_path = os.path.abspath('')+"\\"+self.spell_dictonarypath
        self.sym_spell.load_dictionary(dictionary_path, 0, 1)
        
        print("Initialization complete!")

    def expand_contractions(self,text):
        try:
            text = list(self.cont.expand_texts([text], precise=False))[0]
        except Exception as e:
            return text
        return text

    
    def apostrophe_correction(self,text):
        text = re.sub("’", "'", text)
        return text
    
    
    def try_decode(self,text):
        try:
            text = unidecode.unidecode(codecs.decode(text, 'unicode_escape'))
        except:
            text = unidecode.unidecode(text)
        return text

    
    def tokenize_and_keep_only_words(self,text):
        text = re.findall(r"[a-zA-Z]+", text.lower())
        return text
    
    
    def remove_stop_words(self,text):
        text = [word for word in text if (word not in self.stop_words_without_negation and len(word)>2)]
        return text

    
    def lemmatize(self,text):
        text = [self.wnl.lemmatize(word) for word in text]
        return text

    
    def spell_check(self,text,max_edit_distance_lookup = 2):
        # tokenize each word
        text = word_tokenize(text)
        # apply pos to each word
        text = pos_tag(text)
        correct_text = []
        # for each word in the sentence
        for word in text:
            # if word is not a noun
            if word[1] not in self.pos_tags_set_1:
                # check if we can correct it, then correct it
                suggestions = self.sym_spell.lookup(word[0],Verbosity.CLOSEST,
                                    max_edit_distance_lookup)
                for suggestion in suggestions:
                    # take the first correction
                    correct_text.append(suggestion.term)
                    break
            else:
                correct_text.append(word[0])
        text = ' '.join([word for word in correct_text])
        return text


    def full_clean(self,text,debug=False):
        if not self.initialized:
            self.initialize()
            self.initialized = True
        if debug:
            print("pre-clean: ",text)
        text = self.try_decode(text)
        text = self.apostrophe_correction(text)
        text = self.spell_check(text)
        text = self.expand_contractions(text)
        text = self.tokenize_and_keep_only_words(text)
        text = self.remove_stop_words(text)
        text = self.lemmatize(text)

        text = ' '.join(text)
        if debug:
            print("post-clean: ",text)
        return text
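A hedged usage sketch for the Cleaner above (it assumes the GoogleNews word2vec binary and the SymSpell frequency dictionary sit at the default paths; the sentence is illustrative):

# Illustrative only: initialization happens lazily on the first full_clean call.
cleaner = Cleaner()
print(cleaner.full_clean("I can't beleive it's alredy finished!", debug=True))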
Example #13
    def transform(self, x):
        if self.verbose > 0:
            print(
                colored("Called Description Transformer Transform",
                        color="blue",
                        attrs=['bold', 'underline']))
            print("Processing description text")

        # Copy the data and find the name of the description column
        self.data = x.copy()
        self.column_name = self.data.columns.values[0]

        # Load spaCy language processor
        nlp = spacy.load("en_core_web_sm")
        # Load pre-trained word embedding if using contractions
        contraction = Contractions(
            api_key="glove-twitter-25") if self.contractions else None

        # Process text by iterating over each sample's index and description
        for idx, sample in zip(self.data.index.values, self.data.values):
            # Change accented characters, e.g à -> a
            sample = self.remove_accents(str(sample))
            if contraction:
                # Expand contractions, e.g. "hasn't" -> "has not"
                sample = list(contraction.expand_texts([sample], precise=True))
                sample = ''.join(sample)

            # Input sample text into spaCy language processor
            doc = nlp(sample)
            # Split sample text into sentences
            sentences = list(doc.sents)

            for word_idx in range(len(sentences)):
                # Remove punctuation tokens, e.g. ! , .
                sentences[word_idx] = [
                    token for token in sentences[word_idx]
                    if not token.is_punct
                ]

                # Remove stop words
                if self.stop_words:
                    sentences[word_idx] = [
                        token for token in sentences[word_idx]
                        if token.text.lower() not in self.stop_words
                    ]

                # Apply lemmatization
                if self.transformation[0].lower() == "l":
                    # Resolve words to their dictionary form using PoS tags
                    sentences[word_idx] = [
                        token.lemma_.lower() for token in sentences[word_idx]
                    ]

                # Apply stemming (only if lemmatization not applied)
                elif self.transformation[0].lower() == "s":
                    # Stem tokens
                    for char_idx in range(len(sentences[word_idx])):
                        # Apply stemmer to each word
                        stemmed = self.stemmer_algorithm.stem(
                            sentences[word_idx][char_idx].text)
                        # Convert back to type Token and update word in sentence
                        sentences[word_idx][char_idx] = nlp(stemmed)[0]

                # Remove remaining punctuation within tokens, e.g. "(years)" -> "years", not including -
                sentences[word_idx] = [
                    token.translate(
                        str.maketrans('', '',
                                      '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'))
                    for token in sentences[word_idx]
                ]

            # Split words containing dash or spaces caused by lemmatization, e.g. "16-year" -> "16" + "year"
            for k in range(len(sentences)):
                new_sentence = []
                for token in sentences[k]:
                    split_token = re.split(' |-', token)
                    for word in split_token:
                        # Check word not empty
                        if word:
                            new_sentence.append(word)
                # Replace words in sentence
                sentences[k] = new_sentence

            # Remove empty lists from list of sentences
            sentences = [sent for sent in sentences if sent != []]
            # Then join the sentences and update the descriptions dataframe
            word_list = [word for sent in sentences for word in sent]
            self.data.loc[idx, self.column_name] = ' '.join(
                [str(elem) for elem in word_list])


#         if self.verbose > 1:
#             display(self.data)
        if self.verbose > 0:
            print(
                colored("Finshed processing all descriptions\n",
                        color="blue",
                        attrs=['bold', 'underline']))

        return self.data
Example #14
def main():

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~INITIALIZATIONS~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    nlp = spacy.load('en')
    cont = Contractions('../GoogleNews-vectors-negative300.bin.gz')
    cont.load_models()

    #stopwords = spacy.lang.en.STOP_WORDS
    #spacy.lang.en.STOP_WORDS.add("e.g.")
    #nlp.vocab['the'].is_stop
    nlp.Defaults.stop_words |= {
        "(a)",
        "(b)",
        "(c)",
        "etc",
        "etc.",
        "etc.)",
        "w/e",
        "(e.g.",
        "no?",
        "s",
        "film",
        "movie",
        "0",
        "1",
        "2",
        "3",
        "4",
        "5",
        "6",
        "7",
        "8",
        "9",
        "10",
        "e",
        "f",
        "k",
        "n",
        "q",
        "de",
        "oh",
        "ones",
        "miike",
        "http",
        "imdb",
    }
    stopwords = list(nlp.Defaults.stop_words)
    tokenizer = Tokenizer(nlp.vocab)
    punctuations = string.punctuation

    LabeledSentence1 = gensim.models.doc2vec.TaggedDocument

    #####################################REVIEWS OF 100 BEST MOVIES on IMDB#########################################

    #open the base URL webpage
    html_page = urlopen("https://www.imdb.com/list/ls059633855/")

    #instantiate beautiful soup object of the html page
    soup = BeautifulSoup(html_page, 'lxml')

    review_text = get_movie_reviews(soup)

    all_good_movie_reviews = get_tagged_documents(review_text, cont, stopwords,
                                                  tokenizer, punctuations,
                                                  LabeledSentence1)

    print('\n')
    d2v_model_best = doc2vec_similarity(all_good_movie_reviews, 'Best')

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~K-MEANS CLUSTERING~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    all_rev_text = [rev for rev, ttl in all_good_movie_reviews]
    all_rev_ttls = [ttl[0] for rev, ttl in all_good_movie_reviews]

    numclust = 4
    kmeans_model = KMeans(n_clusters=numclust,
                          init='k-means++',
                          max_iter=300,
                          random_state=111)
    X = kmeans_model.fit(d2v_model_best.docvecs.vectors_docs)
    kmeans_clust_labels = kmeans_model.labels_.tolist()

    output_list = list(zip(kmeans_clust_labels, all_rev_ttls))
    print('The Groupings assigned by K-Means Clustering into {} clusters are: \n\n'.format(
        numclust))

    for i in sorted(output_list):
        print(i)
    print('\n')

    ttls_and_labels_kmeans_clust = []
    i = 0
    for rev, ttl in all_good_movie_reviews:
        ttls_and_labels_kmeans_clust.append((ttl, kmeans_clust_labels[i]))
        i += 1

    kmeans_clust_centroids = np.array(kmeans_model.cluster_centers_)

    get_docs_closest_to_centroids(
        data_length=100,
        n_clust=4,
        clust_labels=kmeans_clust_labels,
        centroid_input=kmeans_clust_centroids,
        doc_vecs=d2v_model_best.docvecs.vectors_docs,
        all_reviews=all_good_movie_reviews,
        ttls_with_labels=ttls_and_labels_kmeans_clust)

    #pca = PCA(n_components=2).fit(d2v_model_best.docvecs.vectors_docs)
    #datapoint = pca.transform(d2v_model_best.docvecs.vectors_docs)

    #plt.figure
    #label1 = ["#FFFF00", "#008000", "#0000FF", "#800080", "#ff0000"]
    #color = [label1[i] for i in kmeans_clust_labels]
    #plt.scatter(datapoint[:, 0], datapoint[:, 1], c=color)

    #centroids = kmeans_model.cluster_centers_
    #centroidpoint = pca.transform(centroids)
    #plt.scatter(centroidpoint[:, 0], centroidpoint[:, 1], marker='^', s=150, c='#000000')
    #plt.show()

    #kmeans_3D(d2v_model_best)

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~AGGLOMERATIVE HIERARCHICAL CLUSTERING~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    tfidf_input = [' '.join(t) for t in all_rev_text]

    #define vectorizer parameters
    tfidf_vectorizer = TfidfVectorizer(max_df=0.8,
                                       min_df=0.2,
                                       use_idf=True,
                                       ngram_range=(1, 3))

    tfidf_matrix = tfidf_vectorizer.fit_transform(tfidf_input)

    dist = 1 - cosine_similarity(tfidf_matrix)
    linkage_matrix = ward(dist)

    cl = linkage_matrix
    numclust = 5
    hier_clust_labels = fcluster(cl, numclust, criterion='maxclust')

    hier_clust_labels = hier_clust_labels - 1

    output_list = list(zip(hier_clust_labels, all_rev_ttls))
    print(
        'The Levels assigned by a {}-Tiered Hierarchical Clustering are: \n\n'.
        format(numclust))

    for i in sorted(output_list):
        print(i)
    print('\n')

    ttls_and_labels_hier_clust = []
    i = 0
    for rev, ttl in all_good_movie_reviews:
        ttls_and_labels_hier_clust.append((ttl, hier_clust_labels[i]))
        i += 1

    hier_clust_codebook = []

    for i in range(hier_clust_labels.min(), hier_clust_labels.max() + 1):
        hier_clust_codebook.append(d2v_model_best.docvecs.vectors_docs[
            hier_clust_labels == i].mean(0))

    hier_clust_centroids = np.vstack(hier_clust_codebook)

    get_docs_closest_to_centroids(data_length=100,
                                  n_clust=5,
                                  clust_labels=hier_clust_labels,
                                  centroid_input=hier_clust_centroids,
                                  doc_vecs=d2v_model_best.docvecs.vectors_docs,
                                  all_reviews=all_good_movie_reviews,
                                  ttls_with_labels=ttls_and_labels_hier_clust)

    #plot_spectral_embed_agglom_clusters(dist, 5)

    #dendro_static(linkage_matrix, all_rev_ttls)

    #dendro_interactive(dist, all_rev_ttls)

    #dendro_heatmap(dist, all_rev_ttls)

    #####################################REVIEWS OF 100 WORST MOVIES on IMDB#########################################

    #open the base URL webpage
    html_page_2 = urlopen("https://www.imdb.com/list/ls061324742/")

    #instantiate beautiful soup object of the html page
    soup_2 = BeautifulSoup(html_page_2, 'lxml')

    review_text_2 = get_movie_reviews(soup_2)

    all_bad_movie_reviews = get_tagged_documents(review_text_2, cont,
                                                 stopwords, tokenizer,
                                                 punctuations,
                                                 LabeledSentence1)

    print('\n')
    d2v_model_worst = doc2vec_similarity(all_bad_movie_reviews, 'Worst')

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~K-MEANS CLUSTERING~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    all_rev_text_bad = [rev for rev, ttl in all_bad_movie_reviews]
    all_rev_ttls_bad = [ttl[0] for rev, ttl in all_bad_movie_reviews]

    numclust = 4
    kmeans_model_worst = KMeans(n_clusters=numclust,
                                init='k-means++',
                                max_iter=300,
                                random_state=111)
    X_worst = kmeans_model_worst.fit(d2v_model_worst.docvecs.vectors_docs)
    kmeans_clust_labels_worst = kmeans_model_worst.labels_.tolist()

    output_list = list(zip(kmeans_clust_labels_worst, all_rev_ttls_bad))
    print('The Groupings assigned by K-Means Clustering into {} clusters are: \n\n'.format(
        numclust))

    for i in sorted(output_list):
        print(i)
    print('\n')

    ttls_and_labels_kmeans_clust_bad = []
    i = 0
    for rev, ttl in all_bad_movie_reviews:
        ttls_and_labels_kmeans_clust_bad.append(
            (ttl, kmeans_clust_labels_worst[i]))
        i += 1

    kmeans_clust_centroids_worst = np.array(
        kmeans_model_worst.cluster_centers_)

    get_docs_closest_to_centroids(
        data_length=100,
        n_clust=4,
        clust_labels=kmeans_clust_labels_worst,
        centroid_input=kmeans_clust_centroids_worst,
        doc_vecs=d2v_model_worst.docvecs.vectors_docs,
        all_reviews=all_bad_movie_reviews,
        ttls_with_labels=ttls_and_labels_kmeans_clust_bad)

    #pca_worst = PCA(n_components=2).fit(d2v_model_worst.docvecs.vectors_docs)
    #datapoint_worst = pca_worst.transform(d2v_model_worst.docvecs.vectors_docs)

    #plt.figure
    #label1 = ["#FFFF00", "#008000", "#0000FF", "#800080", "#ff0000"]
    #color = [label1[i] for i in kmeans_clust_labels_worst]
    #plt.scatter(datapoint_worst[:, 0], datapoint_worst[:, 1], c=color)

    #centroids = kmeans_model_worst.cluster_centers_
    #centroidpoint = pca_worst.transform(centroids)
    #plt.scatter(centroidpoint[:, 0], centroidpoint[:, 1], marker='^', s=150, c='#000000')
    #plt.show()

    #kmeans_3D(d2v_model_worst)

    #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~AGGLOMERATIVE HIERARCHICAL CLUSTERING~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    tfidf_input_bad = [' '.join(t) for t in all_rev_text_bad]

    #define vectorizer parameters
    tfidf_vectorizer = TfidfVectorizer(max_df=0.8,
                                       min_df=0.2,
                                       use_idf=True,
                                       ngram_range=(1, 3))

    tfidf_matrix_bad = tfidf_vectorizer.fit_transform(tfidf_input_bad)

    dist_bad = 1 - cosine_similarity(tfidf_matrix_bad)
    linkage_matrix_bad = ward(dist_bad)

    cl = linkage_matrix_bad
    numclust = 5
    hier_clust_labels_bad = fcluster(cl, numclust, criterion='maxclust')

    hier_clust_labels_bad = hier_clust_labels_bad - 1

    output_list = list(zip(hier_clust_labels_bad, all_rev_ttls_bad))
    print(
        'The Levels assigned by a {}-Tiered Hierarchical Clustering are: \n\n'.
        format(numclust))

    for i in sorted(output_list):
        print(i)
    print('\n')

    ttls_and_labels_hier_clust_bad = []
    i = 0
    for rev, ttl in all_bad_movie_reviews:
        ttls_and_labels_hier_clust_bad.append((ttl, hier_clust_labels_bad[i]))
        i += 1

    hier_clust_codebook_bad = []

    for i in range(hier_clust_labels_bad.min(),
                   hier_clust_labels_bad.max() + 1):
        hier_clust_codebook_bad.append(d2v_model_worst.docvecs.vectors_docs[
            hier_clust_labels_bad == i].mean(0))

    hier_clust_centroids_bad = np.vstack(hier_clust_codebook_bad)

    get_docs_closest_to_centroids(
        data_length=100,
        n_clust=5,
        clust_labels=hier_clust_labels_bad,
        centroid_input=hier_clust_centroids_bad,
        doc_vecs=d2v_model_worst.docvecs.vectors_docs,
        all_reviews=all_bad_movie_reviews,
        ttls_with_labels=ttls_and_labels_hier_clust_bad)

    #plot_spectral_embed_agglom_clusters(dist_bad, n_clust=5)

    #dendro_static(linkage_matrix_bad, all_rev_ttls_bad)

    #dendro_interactive(dist_bad, all_rev_ttls_bad)

    #dendro_heatmap(dist_bad, all_rev_ttls_bad)

    return None
Example #15
from pycontractions import Contractions
"""
    Contractions are words that we write with an apostrophe.
    Examples of contractions are words like “ain’t” or “aren’t”.
    To standardize text, it is better to replace them with their expanded forms
"""

# See description in README.md
cont = Contractions('./data/GoogleNews-vectors-negative300.bin')
cont.load_models()
""" 
    Will expand contractions in list of texts. 
    Better use on big amounts on texts. One by One proecceing will be slow
    expand_texts produce generator of texts
"""


def replace_contractions(batch_of_texts):
    try:
        return list(cont.expand_texts(batch_of_texts, precise=True))

    # can fail on some words, like `he's`
    except Exception:
        return batch_of_texts
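A short usage sketch for replace_contractions (batching several texts into one call, as the docstring above recommends; it assumes the GoogleNews binary referenced earlier is present, and the sentences are illustrative):

# Illustrative only: expand a small batch in a single pass.
texts = ["I can't stop", "you shouldn't worry", "it's fine"]
print(replace_contractions(texts))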
Example #16
class TextProcessing:
    """
    Class to clean text
    """
    def __init__(self, nlp=spacy.load("en_core_web_sm")):
        self.nlp = nlp
        contextualSpellCheck.add_to_pipe(self.nlp)
        model = api.load(cfg['embeddings']['embedding_file'])
        self.cont = Contractions(kv_model=model)
        self.cont.load_models()
        dirname = os.path.dirname(__file__)
        with open(os.path.join(dirname, 'acronym.json')) as f:
            self.acronyms = json.load(f)

    def process_text(self, text):
        """
        Processes text as follows:
        1. decode to unicode
        2. remove extra repeated special characters
        3. put space around the special characters
        4. Remove extra whitespaces
        5. replace acronyms
        6. expand contractions of english words like ain't
        7. correct spelling mistakes
        8. replace NE in the text
        9. lower case the string
        Args:
            text: text to be processed
        """
        text = self.unidecode(text)
        text = self.remove_repeated_chars(text)
        text = self.put_space_around_special_chars(text)
        text = self.remove_extra_whitespaces(text)
        text = self.replace_acronyms(text)
        text = self.expand_contractions(text)
        text = self.correct_spellings(text)
        text = self.replace_named_entity(text)
        text = self.lower_case(text)
        return text

    def remove_repeated_chars(self, text):
        """
        Removes repeated instances of consecutive special chars
        Args:
            text: text to be processed
        """
        text = re.sub(r'([!@#$%^&*,./?\'";:\\])\1+', r'\1', text)
        return text

    def put_space_around_special_chars(self, text):
        """
        Puts space around special chars like '[({$&*#@!'
        Args:
            text: text to be processed
        """

        chars = [
            '$', '?', '%', '@', '!', '#', '^', '*', '&', '"', ':', ';', '/',
            '\\', ',', '+', '(', ')', '[', ']', '{', '}', '<', '>'
        ]

        for char in chars:
            text = text.replace(char, ' ' + char + ' ')
        return text

    def remove_extra_whitespaces(self, text):
        """
        Removes extra whitespaces from the text
        Args:
            text: text to be processed
        """
        return re.sub(r'\s+', ' ', text).strip()

    def unidecode(self, text):
        """
        unidecodes the text
        Args:
            text: text to be processed
        """
        return unidecode.unidecode(text.lower())

    def lower_case(self, text):
        """
        lower cases the text
        Args:
            text: text to be processed
        """
        return text.lower()

    def expand_contractions(self, text):
        """
        expands contractions for example, "ain't" expands to "am not"
        Args:
            text: text to be processed
        """
        return list(self.cont.expand_texts([text.lower()], precise=True))[0]

    def correct_spellings(self, text):
        """
        corrects spellings from text
        Args:
            text: text to be processed
        """
        doc = self.nlp(text)
        if doc._.performed_spellCheck:
            text = doc._.outcome_spellCheck
        return text

    def replace_acronyms(self, text):
        """
        Replaces acronyms found in English
        For example: ttyl -> talk to you later
        Args:
            text: text to be processed
        """
        for acronym, expansion in self.acronyms.items():
            text = text.replace(' ' + acronym.lower() + ' ',
                                ' ' + expansion.lower() + ' ')
        return text

    def replace_named_entity(self, text):
        """
        Replaces named entity in the text
        For example: $5bn loss estimated in the coming year
                    -> MONEY loss estimated in the coming year
        Args:
            text: text to be processed
        """
        doc = list(
            self.nlp.pipe(
                [text],
                disable=["tagger", "parser", "contextual spellchecker"]))[0]
        for ent in doc.ents:
            text = text.replace(ent.text, ent.label_)
        return text

    def token_list(self, text):
        doc = self.nlp(text)
        tokens = []
        for token in doc:
            tokens += [token.text]
        return tokens
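A usage sketch for TextProcessing (it assumes the cfg embeddings entry and the acronym.json file from the original project are available; the input string is illustrative):

# Illustrative only: run the full cleaning pipeline on one string.
tp = TextProcessing()
print(tp.process_text("OMG!!! I ain't going to New York on friday"))
print(tp.token_list("this is a small test"))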
Example #17
class NLP():
    nlp = None
    doc = None
    model = None

    def __init__(self, spacy_model='en_core_web_sm', gensim_model='glove-twitter-25'):
        self.nlp = spacy.load(spacy_model)
        self.model = api.load(gensim_model)
        self.cont = Contractions(kv_model=self.model)

    def remove_html(self, text):
        """Strip HTML tags from text"""
        soup = BeautifulSoup(text, 'html.parser')
        return soup.get_text(separator=" ")

    def remove_accents(self, text):
        """Remove accented characters from text for non-english words"""
        return unidecode.unidecode(text)

    def expand_contractions(self, text):
        """Convert contractions into whole words. e.g. can't -> can not"""
        return list(self.cont.expand_texts([text], precise=True))[0]

    def preprocess(self, text, remove_numbers=False, remove_stopwords=False, excluded_sw=None, toke=False):
        """Preprocess using standard protocols. 
        @param remove_numbers converts words to digits and removes
        @param remove_stopwords removes stop words
        @param excluded_sw is any stopwords to exclude
        @param toke if true, return tokens, default return text 
        """
        text = self.remove_html(text)
        text = self.remove_accents(text)
        text = self.expand_contractions(text)

        if toke or remove_numbers or remove_stopwords:
            if excluded_sw is not None:
                for w in excluded_sw:
                    self.nlp.vocab[w].is_stop = False
            doc = self.nlp(text)
            tokens = []
            for token in doc:
                if token.pos_ == 'NUM' and not remove_numbers:
                    tokens.append(str(w2n.word_to_num(token.text)))
                elif not token.is_stop:
                    tokens.append(token.text)
            if toke:
                return tokens
            text = " ".join(tokens)
        return text

    def lemmatize(self, tokens, toke=False):

        lookups = Lookups()
        lookups.add_table('lemma_index', lemma_index)
        lookups.add_table('lemma_exc', lemma_exc)
        lookups.add_table('lemma_rules', lemma_rules)
        lemmatizer = Lemmatizer(lookups)

        lemmas = []
        for t in tokens:
            lemmas.append(lemmatizer(t.text, t.tag_)[0])

        if toke:
            return lemmas

        return " ".join(lemmas)

    def get_syllables(self, word):
        count = 0
        vowels = ("a", "e", "i", "o", "u", "y")
        prev = False
        for c in word:
            vowel = c in vowels
            if vowel and not prev:
                count += 1
            prev = vowel
        return count

    def get_lexical_density(self, tokens):
        c_words = t_words = 0

        cont_pos = ['PROPN', 'NOUN', 'VERB', 'ADJ', 'ADV']
        for t in tokens:
            if t.pos_ in cont_pos:
                c_words += 1
                t_words += 1
            elif t.pos_ != 'PUNCT':
                t_words += 1

        return round((c_words / t_words), 4)

    def get_coherence(self, text):
        doc = self.nlp(text)
        sentences = [sent for sent in doc.sents if len(sent) >= 2]
        frequency = defaultdict(int)
        token_sents = []
        for s in sentences:
            tmp = []
            for t in self.preprocess(s.text, remove_stopwords=True, excluded_sw=['no', 'not'], toke=True):
                tmp.append(t)
                frequency[t] += 1
            token_sents.append(tmp)

        vocab = [[word for word in sent if frequency[word] > 1]
                 for sent in token_sents]
        dictionary = corpora.Dictionary(vocab)
        corpus = [dictionary.doc2bow(word) for word in vocab]
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=20)
        corpus_lsi = lsi[corpus_tfidf]

        sums = {}
        topic_count = max([len(line) for line in corpus_lsi])
        for line in corpus_lsi:
            for topic in line:
                t_num = topic[0]
                if t_num not in sums:
                    sums[t_num] = abs(topic[1])
                else:
                    sums[t_num] += abs(topic[1])
        best_topic = max(zip(sums.values(), sums.keys()))[1]
        ordered = []
        i = 0
        for line in corpus_lsi:
            ordered.append((i, line[best_topic][1]))
            i += 1

        ordered = sorted(ordered, key=lambda x: x[1], reverse=True)
        threshold = ordered[0][1] - (0.90 * (ordered[0][1] - ordered[-1][1]))
        problem_sentences = [s for s in ordered if s[1] < threshold]

        output = {}
        for p in problem_sentences:
            output[p[0]] = (p[1], str(sentences[p[0]]))

        return output

    def get_readability(self, text):
        scores = {}

        doc = self.nlp(text)
        sentence_count = len(list(doc.sents))

        words = self.preprocess(text, toke=True)
        characters = 0
        for word in words:
            characters += len(word)
        word_count = len(words)
        
        syllable_count = 0
        complex_words = 0
        for word in words:
            c = self.get_syllables(word)
            syllable_count += c
            if c >= 3 and not word[0].isupper():
                complex_words += 1
        avgwps = word_count / sentence_count

        # Automated Readability Index
        ari = 0.0
        ari_grade = 0
        if word_count > 0:
            ari = 4.71 * (characters / word_count) + 0.5 * \
                (word_count / sentence_count) - 21.43
        if ari < 2:
            ari_grade = 0
        elif ari > 12:
            ari_grade = 13
        else:
            ari_grade = ari
        scores["ari"] = (ari, ari_grade)

        # Flesch Reading Ease
        flesch_reading_ease = 101
        fre_grade = 0
        if word_count > 0 and sentence_count > 0:
            flesch_reading_ease = 206.835 - \
                1.015 * (word_count / sentence_count) - \
                84.6 * (syllable_count / word_count)
        if flesch_reading_ease > 100:
            fre_grade = 4
        elif flesch_reading_ease > 90.0:
            fre_grade = 5
        elif flesch_reading_ease > 80.0:
            fre_grade = 6
        elif flesch_reading_ease > 70.0:
            fre_grade = 7
        elif flesch_reading_ease > 60.0:
            fre_grade = 9
        elif flesch_reading_ease > 50:
            fre_grade = 12
        else:
            fre_grade = 13
        scores["flesch_reading_ease"] = (flesch_reading_ease, fre_grade)

        # Flesch-Kincaid Grade Level
        fkg = 0.0
        if word_count > 0 and sentence_count > 0:
            fkg = 0.39 * (word_count / sentence_count) + \
                11.8 * (syllable_count / word_count) - 15.59
        scores["flesch_kinkaid_grade_level"] = (fkg, int(fkg))

        # Gunning Fog Index
        gfi = 0.0
        gfi_grade = 0
        if sentence_count > 0 and word_count > 0:
            gfi = 0.4 * ((word_count / sentence_count) +
                         100 * (complex_words / word_count))
        if gfi < 6:
            gfi_grade = 5
        elif gfi <= 12:
            gfi_grade = int(gfi)
        else:
            gfi_grade = 13
        scores["gunning_fog_index"] = (gfi, gfi_grade)

        # SMOG Readability
        smog = 0.0
        smog_grade = 0
        if sentence_count > 0:
            smog = 1.0430 * math.sqrt(complex_words *
                                    (30 / sentence_count)) + 3.1291
        if smog >= 13:
            smog_grade = 13
        else:
            smog_grade = int(smog)
        scores["smog_readability"] = (smog, smog_grade)

        # ColemanLiauIndex
        coleman = 0.0
        coleman_grade = 0
        if word_count > 0:
            coleman = (5.89 * (characters / word_count)) - \
                (30 * (sentence_count / word_count)) - 15.8
        if coleman >= 13:
            coleman_grade = 13
        else:
            coleman_grade = int(coleman)
        scores["coleman_liau"] = (coleman, coleman_grade)

        # LIX & RIX
        lix = 0.0
        rix = 0.0
        lix_grade = 0
        rix_grade = 0
        if sentence_count > 0 and word_count > 0:
            long_words = 0
            for word in words:
                if len(word) >= 7:
                    long_words += 1
            lix = word_count / sentence_count + ((100. * long_words) / word_count)
            rix = long_words / sentence_count
        if lix >= 13:
            lix_grade = 13
        else:
            lix_grade = int(lix)
        if rix >= 13:
            rix_grade = 13
        else:
            rix_grade = int(rix)
        scores["LIX"] = (lix, lix_grade)
        scores["RIX"] = (rix, rix_grade)

        count = 0
        avg = 0.0
        for k, v in scores.items():
            avg += v[1]
            count += 1
        scores["AVERAGE_GRADE"] = (avg / count, int(avg / count))

        return scores
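A usage sketch for the NLP helper above (the spaCy and gensim models are downloaded on first use; the text is illustrative):

# Illustrative only: preprocessing plus readability scores.
helper = NLP()
text = "The quick brown fox jumps over the lazy dog. It couldn't catch the fox."
tokens = helper.preprocess(text, remove_stopwords=True, toke=True)
scores = helper.get_readability(text)
print(scores["flesch_reading_ease"], scores["AVERAGE_GRADE"])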
Example #18
def train_model() -> None:
    train_data = fetch_data.fetch_imdb_train_data()

    cont = Contractions(constants.CONTRACTIONS_BIN_FILE)
    cont.load_models()

    for index, row in train_data.iterrows():
        row.review = BeautifulSoup(row.review, features="html.parser").get_text()
        row.review = list(cont.expand_texts([row.review], precise=True))[0]

    train_data.review = clean_reviews(train_data.review)

    reviews = list(tokenize_sentences(train_data.review))

    labels = list(train_data.sentiment)

    tokenizer = Tokenizer(num_words=constants.MAX_NB_WORDS)
    tokenizer.fit_on_texts(train_data.review)

    data = np.zeros((len(train_data.review), constants.MAX_SENTS, constants.MAX_SENT_LENGTH), dtype='float32')

    words = list()
    for i, sentences in enumerate(reviews):
        for j, sent in enumerate(sentences):
            if j < constants.MAX_SENTS:
                wordTokens = text_to_word_sequence(sent)
                k = 0
                for _, word in enumerate(wordTokens):
                    if k < constants.MAX_SENT_LENGTH and tokenizer.word_index[word] < constants.MAX_NB_WORDS:
                        data[i, j, k] = tokenizer.word_index[word]
                        k = k + 1
                words.append(wordTokens)

    word_index = tokenizer.word_index
    print('Total %s unique tokens.' % len(word_index))

    labels = to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    wordSkipGramModel = gensim.models.Word2Vec(words, min_count=5, size=constants.EMBEDDING_DIM, window=4, sg=1)

    word_embedding_matrix = np.random.random((len(word_index) + 1, constants.EMBEDDING_DIM))
    for word, i in word_index.items():
        try:
            word_embedding_vector = wordSkipGramModel.wv.get_vector(word)
        except KeyError:
            # words not found in the embedding model keep their random initialization
            continue
        if word_embedding_vector is not None:
            word_embedding_matrix[i] = word_embedding_vector

    embedding_layer = Embedding(len(word_index) + 1, constants.EMBEDDING_DIM, weights=[word_embedding_matrix],
                                input_length=constants.MAX_SENT_LENGTH, trainable=True)

    sentence_input = Input(shape=(constants.MAX_SENT_LENGTH,), dtype='float32')
    embedded_sequences = embedding_layer(sentence_input)
    sentence_lstm = Bidirectional(LSTM(200, return_sequences=True))(embedded_sequences)
    l_dropout = Dropout(0.5)(sentence_lstm)
    l_dense = TimeDistributed(Dense(400))(l_dropout)
    l_att = attention_layer.AttLayer()(l_dense)
    l_dropout_1 = Dropout(0.4)(l_att)
    sentEncoder = Model(sentence_input, l_dropout_1)

    review_input = Input(shape=(constants.MAX_SENTS, constants.MAX_SENT_LENGTH), dtype='float64')
    review_encoder = TimeDistributed(sentEncoder)(review_input)
    review_dropout = Dropout(0.3)(review_encoder)
    l_lstm_review = Bidirectional(LSTM(100, return_sequences=True))(review_dropout)
    l_att_dropout_review = Dropout(0.2)(l_lstm_review)
    l_dense_review = TimeDistributed(Dense(200))(l_att_dropout_review)
    l_dropout_review = Dropout(0.2)(l_dense_review)
    l_att_review = attention_layer.AttLayer()(l_dropout_review)
    preds = Dense(2, activation='softmax')(l_att_review)
    model = Model(review_input, preds)
    adam = Adam(lr=0.0001)
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

    model.fit(data, labels, validation_split=0.2, epochs=10, batch_size=50, shuffle=False, verbose=1)
    model.save('deeplearn_sentiment_model.h5')

    # Save Tokenizer i.e. Vocabulary
    with open('reviews_tokenizer.pkl', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
Example #19
    
    
    #Store the preprocessed tweets in a file
    if exportPreprocessDataToFile == 1:
        dfExport=pd.concat([tweetTextPreprocessed, y], axis=1)
        dfExport.to_csv(fileToExport, index=False, encoding = "utf-8")
        
    return tweetTextPreprocessed, y

#--------------------------Main--------------------------#

if __name__ == "Modules.Preprocess":
    #Setting up the dataset
    colNames=['Target','Id','Date','Flag','User','Text'] #Get the name of the columns
    encoding="ISO-8859-1"#utf8 cannot read some special characters so ISO-8859-1 is used
    fileName='tweet_dataset.csv'
    
    
    print("Loading language model. This may take several seconds.")
    nlp = en_core_web_md.load()#Load the English medium spaCy language model('en_core_web_md')
    
    print("Loading GloveTwitter model. This may take up to 1 minute.")
    # Choose the model. Others such as "word2vec-google-news-300" are available too.
    #Use "glove-twitter-100" (<1GB) or "glove-twitter-200" (1GB) for final results. "glove-twitter-25" (200MB) is just for fast checks
    cont = Contractions(api_key="glove-twitter-100")
    cont.load_models()    #Get the contractions for English and prevent loading on the first expand_texts call
    
    #Exclude some stop words with potential value for negative sentiment analysis
    deselect_stop_words = ["no", "not", "n't", "less", "enough", "never"]
    for w in deselect_stop_words:
        nlp.vocab[w].is_stop = False
Example #20
    lst, query, num, Api
):  #will append "num" tweets based on "query" to list "lst" using API "Api". It will also remove retweets. These tweets will usually contain hashtags
    sample_tweets = tweepy.Cursor(Api.search,
                                  q=("# " + query + " -filter:retweets"),
                                  lang="en",
                                  tweet_mode='extended').items(num)
    for tweet in sample_tweets:
        lst.append(tweet)


#Code does not process links/URLs in tweets properly.
#Code will remove newline characters in tweets during processing (done as part of Task #2)
#Code will not process non-english names correctly (it would be useful to add them to relevant lexicons)
#Code does not process the '£' sign properly
cont = Contractions(
    api_key="glove-twitter-25"
)  #this will be used in tweet_Processor to expand contractions
#cont.load_models()

d = eht.Dict(
    "en_GB"
)  #this will be used in tweet_Processor to detect words outside the British English Dictionary
sc = SpellChecker()
parser = GingerIt()  #for grammar correction


#Task 2 related stuff covered here. It was decided that using .json files is not efficient
def tweet_Processor(tweet):
    sentences = sent_tokenize(str(tweet.full_text))
    for s in range(len(sentences)):
        sentences[s] = pytypo.correct_sentence(
Example #21
#Helper method for pre-processing
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
import glob
import nltk
from pycontractions import Contractions
import spacy


#method to expand contractions
def removeconcat(line, cont):
    return list(cont.expand_texts([line]))


#Note this method must be run after the SpellCheck method provided in the C# program
stopWords = set(stopwords.words())
path = 'C:\\Temp\\conversations\\*.json'
cont = Contractions(api_key="glove-twitter-100")
cont.load_models()
files = glob.glob(path)
nlp = spacy.load('en')
for file in files:
    f = open(file, 'r')
    lines = f.readlines()
    alllines = [removeconcat(line, cont)[0] for line in lines]
    for line in lines:
        doc = nlp(line)
        for ent in doc.ents:
            print(ent, ent.lemma_, ent.label_)
    f.close()
Example #22
class Cleaner:
    def __init__(
        self,
        expand_contractions=True,
        strip_text_in_brackets=False,
        combine_concatenations=False,
        w2v_path=None,
        api_key="word2vec-google-news-300",
    ):

        self.opt_expand_contractions = expand_contractions
        self.opt_strip_text_in_brackets = strip_text_in_brackets
        self.opt_combine_concatenations = combine_concatenations

        if expand_contractions:
            print(
                "Loading contractions dataset (this will take a while the first time)"
            )

            # Load your favorite word2vec model
            self.cont = Contractions(w2v_path=w2v_path, api_key=api_key)
            print("Contractions dataset downloaded")

            print("Training contractions model (this will take a while)")
            # prevents loading on first expand_texts call
            self.cont.load_models()
            print("Contraction model successfully trained")

    def expand_contractions(self, text):
        text = text.replace("’", "'")  # need to put in the correct apostrophe
        expanded_text = list(self.cont.expand_texts([text], precise=True))
        return expanded_text[0]

    def strip_brackets(self, text):
        # Remove strings in brackets
        # Eg. "This is a sentence (extra info) description."
        # Becomes "This is a sentence description."
        """ Remove brackets from text
            Matches (), [], {}

            Converts:
            'hello (there) you (my[best] friend) lets {dine } }' -> 'hello  you  lets  }'
        """

        brace_open_type = ""
        brace_pair = {'(': ')', '[': ']', '{': '}'}
        open_brace_list = list(brace_pair.keys())

        res = ""
        for c in text:
            if len(brace_open_type) == 0:
                # not opened
                if c in open_brace_list:
                    brace_open_type = c
                else:
                    res += c
            else:
                # opened
                if brace_pair[brace_open_type] == c:
                    brace_open_type = ""

        return res

    def combine_concatenations(self, sentence):
        """
        Receives string sentence
        "This is a sentence"
        """
        # convert concatenated words into separate words
        # georgetown-louisville becomes georgetown louisville

        # Pd matches all types of dashes
        # https://www.compart.com/en/unicode/category/Pd

        if self.opt_combine_concatenations:

            def _refu(sent):
                return regex.sub(r'\p{Pd}+', '', sent)
        else:

            def _refu(sent):
                return regex.sub(r'\p{Pd}+', ' ', sent)

        return _refu(sentence)

    def remove_non_english(self, tokens):
        """
        Removes non-english words and all punctuation and numbers
        Removes extra white space

        Receives list of tokens comprising a single sentence:
        ['this', 'is', 'a', 'sentence']
        """
        # remove all punctuation (removes non-english words too)
        # stripped = re.sub('[^a-zA-Z\s]*', '', stripped)

        # removes extra white spaces
        # stripped = re.sub('[ ]{2,}',' ', stripped)

        cleaned_tokens = []
        for token in tokens:
            cleaned = re.sub('[ ]{2,}', ' ', re.sub('[^a-zA-Z\s]*', '',
                                                    token)).strip()
            if len(cleaned) != 0:
                cleaned_tokens.append(cleaned)

        return cleaned_tokens

    def lemmatize_sentences(self, tokenized_sentences):
        """
        Receives
            Args: tokenized_sentences is of form
                [['this', 'is', 'sentence'],
                ['this', 'is', 'another']
                ['this', 'is', 'another']]

            Returns: lemmatized 2d list of same form
                [['this', 'is', 'sentenc'],
                ['this', 'is', 'anoth']
                ['this', 'is', 'anoth']]
        """
        lemmatized_sentences = []
        for sentence in tokenized_sentences:
            lemmatized_sentences.append(lemmatize(sentence))
        # lemmatized_sentences = [lemmatize(sentence) for sentence in tokenized_sentences]
        return lemmatized_sentences

    def clean(self, text):
        if self.opt_expand_contractions:
            # Expands it's -> it is
            text = self.expand_contractions(text)

        # text is lowercased after contractions are expanded
        # the contractions will be capitalized after they are expanded
        # eg. (i'm -> [I, am]). Therefore, the lowercasing is done afterwards
        text = text.lower()

        if self.opt_strip_text_in_brackets:
            text = self.strip_brackets(text)

        sentences = sent_tokenize(text)
        sentences = [
            self.combine_concatenations(sentence) for sentence in sentences
        ]
        tokens_per_sentence = [word_tokenize(sent) for sent in sentences]
        lemmatized_tokens_per_sent = self.lemmatize_sentences(
            tokens_per_sentence)
        cleaned_tokens_per_sent = [
            self.remove_non_english(sent)
            for sent in lemmatized_tokens_per_sent
        ]

        return cleaned_tokens_per_sent
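A usage sketch for this Cleaner (the first call downloads word2vec-google-news-300 unless w2v_path points at a local model; lemmatize is assumed to be imported in the original module; the sentence is illustrative):

# Illustrative only: clean() returns one list of tokens per sentence.
cleaner = Cleaner(strip_text_in_brackets=True)
print(cleaner.clean("I'm sure (more or less) that he's right. It's a georgetown-louisville thing."))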
Example #23
def remove_url(text):
    # Remove any web url starting with http or www
    return re.sub(r'(www|http)\S+', '', text)


def remove_email_address(text):
    # Remove any email address
    return re.sub(r'\S+@\S+', '', text)


# we should not use the very large precompiled word2vec model in the container;
# it would be slow and the container size would be big
model = KeyedVectors.load('/app/lib/gensim/GoogleNews-vectors-negative300',
                          mmap='r')
cont = Contractions(kv_model=model)
cont.load_models()


def expand_contractions(text):
    """expand shortened words, e.g. don't to do not"""
    text = list(cont.expand_texts([text], precise=True))[0]
    return text


@app.route("/v1/preprocess", methods=["GET", "POST"])
def preprocess():
    data = {"success": False}
    # get the request parameter
    params = flask.request.json
    if (params == None):
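The example is cut off above; purely as an illustration (not part of the original service code), the expand_contractions helper can be exercised on its own once the model has loaded:

# Illustrative only: exercising the helper defined above.
if __name__ == "__main__":
    sample = "She isn't here and they won't come."
    print(expand_contractions(sample))
    # expected output along the lines of: "She is not here and they will not come."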
Example #24
from paragraph_segmentation_dcnn import make_cnn as ParagraphSegmentationNet, paragraph_segmentation_transform
from word_segmentation import SSD as WordSegmentationNet, predict_bounding_boxes
from handwriting_line_recognition import Network as HandwritingRecognitionNet, handwriting_recognition_transform
from handwriting_line_recognition import decode as decoder_handwriting

import cv2
import nltk
from pyScan import pyScan
import os

ctx = mx.gpu(0)
alphabet_encoding = r' !"#&\'()*+,-./0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'
ls = LexiconSearch()
contractions = Contractions(
    '/home/jrmo/git/HandwrittenTextRecognition_MXNet/models/GoogleNews-vectors-negative300.bin.gz'
)
lm_model, vocab = gluonnlp.model.get_model('awd_lstm_lm_1150',
                                           pretrained=True,
                                           ctx=ctx)
'''Thanks to Thomas Delteil for creating this model'''

predictions = []


# Allows to pass initialized nets
def predict(image,
            psn=None,
            wsn=None,
            hlrn=None,
            min_c=0.01,
Example #25
class TextCleaner:
    word_re = re.compile('[a-zA-Z]+')
    number_re = re.compile('[0-9]+$')
    spell_checker = SpellChecker()
    lemmatizer = WordNetLemmatizer()
    all_words = set(words.words())

    def __init__(self,
                 save_path,
                 word2vec_model_path,
                 previously_processed=[]):
        self.contractions = Contractions(word2vec_model_path)
        self.previously_processed = previously_processed
        self.save_path = save_path

    def _get_all_comments(self, subreddit):
        comments = []
        for submission in subreddit["submissions"]:
            for comment in submission["comments"]:
                comments.extend(sent_tokenize(comment["body"]))
        return comments

    def _remove_urls(self, text):
        url_pattern = r'(((https?|ftp)://)?(([a-zA-Z])+\.)?([a-zA-Z])+\.([a-zA-Z])+/?.*)|http'

        new_sentences = []
        for word in text.split():
            if re.compile(url_pattern).search(word):
                new_sentences.append(re.sub(url_pattern, "__isurl__", word))
            else:
                new_sentences.append(word)
        return " ".join(new_sentences)

    def _invalid_characters(self, string):
        string = re.sub(r"(\s|-|_|\.\.\.)+", " ", string)
        return re.sub(r"!|#|&|\(|\)|–|\[|{|}|\]|:|;|\?|\*", "", string)

    def _expand_sentences(self, texts):
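        # Normalize curly apostrophes (’ -> ') first so pycontractions can
        # recognize contractions such as "don’t" before expanding them.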
        return list(
            self.contractions.expand_texts(
                [x.replace("’", "'") for x in texts], precise=True))

    def _replace(self, sentence, is_spell_check=True):
        words = []
        for word in word_tokenize(sentence):
            word = word.strip()
            if "/" in word or "\\" in word:
                words.append("__isslashinword__")
            elif self.word_re.match(word):
                if is_spell_check and word not in self.all_words:
                    words.append(self.spell_checker.correction(word))
                else:
                    words.append(word)
            elif self.number_re.match(word):
                words.append("__isnumber__")
            elif "__isurl__" in word:
                words.append("__isurl__")
            else:
                words.append("__isinvalidword__")
        return words

    def _words_and_tags(self, words):
        lemmas = []
        pos_tags = []
        for word, pos_tag in nltk.pos_tag(words):
            pos_tags.append(pos_tag)
            if self._get_wordnet_pos(pos_tag):
                lemmas.append(
                    self.lemmatizer.lemmatize(
                        word, pos=self._get_wordnet_pos(pos_tag)))
            else:
                lemmas.append(self.lemmatizer.lemmatize(word))
        return (" ".join(lemmas), pos_tags)

    ## WordNet defines one more POS tag (adjective satellite) and the Penn
    ## Treebank has many more, but these four are sufficient here.
    def _get_wordnet_pos(self, treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return ''
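    # Example: a token tagged 'VBG' (e.g. "running") maps to wordnet.VERB, so
    # WordNetLemmatizer().lemmatize("running", pos=wordnet.VERB) yields "run";
    # tokens with an unmapped tag fall back to the default (noun) lemmatization.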

    def process_subreddits(self, subreddits, save=True, check_previous=True):
        for subreddit in subreddits:
            print(subreddit["display_name"])
            pathlib.Path(self.save_path).mkdir(exist_ok=True)

            all_raw_comments = self._get_all_comments(subreddit)
            raw_comments = all_raw_comments

            comment_no_urls = []
            comment_removed_chars = []
            comment_expandeds = []

            comment_replaced_spell_corrections = []
            comment_processed_spell_corrections = []
            pos_tag_sent_spell_corrections = []

            comment_replaced_no_spell_corrections = []
            comment_processed_no_spell_corrections = []
            pos_tag_no_sent_spell_corrections = []

            count = 0
            total = len(raw_comments)
            for comment in raw_comments:
                print(comment)
                comment_no_url = self._remove_urls(comment)
                comment_removed_char = self._invalid_characters(comment_no_url)
                comment_expanded = self._expand_sentences(
                    [comment_removed_char])[0]

                comment_replaced_spell_correction = self._replace(
                    comment_expanded.lower(), is_spell_check=True)
                comment_processed_spell_correction, pos_tag_sent_spell_correction = self._words_and_tags(
                    comment_replaced_spell_correction)

                comment_replaced_no_spell_correction = self._replace(
                    comment_expanded.lower(), is_spell_check=False)
                comment_processed_no_spell_correction, pos_tag_no_sent_spell_correction = self._words_and_tags(
                    comment_replaced_no_spell_correction)

                count += 1
                print("count:", count, "total:", total,
                      subreddit["display_name"])

                # Appending
                comment_no_urls.append(comment_no_url)
                comment_removed_chars.append(comment_removed_char)
                comment_expandeds.append(comment_expanded)

                comment_replaced_spell_corrections.append(
                    comment_replaced_spell_correction)
                comment_processed_spell_corrections.append(
                    comment_processed_spell_correction)
                pos_tag_sent_spell_corrections.append(
                    pos_tag_sent_spell_correction)

                comment_replaced_no_spell_corrections.append(
                    comment_replaced_no_spell_correction)
                comment_processed_no_spell_corrections.append(
                    comment_processed_no_spell_correction)
                pos_tag_no_sent_spell_corrections.append(
                    pos_tag_no_sent_spell_correction)

            data = {
                "raw": raw_comments,
                "comment_no_urls": comment_no_urls,
                "comment_removed_chars": comment_removed_chars,
                "comment_expandeds": comment_expandeds,
                "comment_replaced_spell_corrections": comment_replaced_spell_corrections,
                "comment_processed_spell_corrections": comment_processed_spell_corrections,
                "pos_tag_sent_spell_corrections": pos_tag_sent_spell_corrections,
                "comment_replaced_no_spell_corrections": comment_replaced_no_spell_corrections,
                "comment_processed_no_spell_corrections": comment_processed_no_spell_corrections,
                "pos_tag_no_sent_spell_corrections": pos_tag_no_sent_spell_corrections,
            }

            if save:
                subreddit_path = self.save_path + "TEST" + subreddit[
                    "display_name"] + ".json"
                with open(subreddit_path, 'w') as fp:
                    json.dump(data, fp)
            else:
                return data
Example #26
0
with open('abbreviations.mapper', 'r') as file:
    content = file.read()
    abbreviations_map = literal_eval(content)

paragraph_separator = '\n\n'
sentence_separator = ' '
token_separator = ' '
unnecessary_identifier_regex = r'[0-9\[\]%/,()–\'<>^~`@|#$+:;’]'
unnecessary_space = '  '
unnecessary_unresolved_pron = '-PRON-'
unnecessary_apostrophe = " '"
unnecessary_space_period = r' \.'
period_regex = r'\.'
valid_eos_token = '[!?]'

# Time-consuming step: set up the contraction expander and run a warm-up
# expansion so the underlying model is loaded before real input arrives.
expander = Contractions(api_key='glove-wiki-gigaword-50')
assert list(expander.expand_texts(['loader_demo_text']))[0] == 'loader_demo_text'

# Time-consuming step
spacy_tool = spacy.load('en_md')
neuralcoref.add_to_pipe(spacy_tool)

logging.basicConfig(filename='summarizer.log',
                    filemode='w',
                    format='%(name)s - %(levelname)s - %(message)s',
                    level=logging.DEBUG)

# Start-up takes roughly 40 seconds
Example #27
0
    data = re.sub(r"([0-9]+)000", r"\1k", data)
    return data


# %%
# nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def stemming(sentence):
    stemmer = PorterStemmer()
    sentence = sentence.split()
    sentence = ' '.join(stemmer.stem(word) for word in sentence)  # optionally: if word not in stop_words
    return sentence
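# Example (Porter stemmer output, roughly):
#   stemming("cats are running")  # -> something like "cat are run"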

# %%
cont = Contractions(api_key="glove-twitter-100")
# %%

data['question1'] = list(cont.expand_texts(data['question1']))
data['question2'] = list(cont.expand_texts(data['question2']))
data['question1'] = data['question1'].fillna('').apply(lambda x: BeautifulSoup(x, "lxml").text)
data['question2'] = data['question2'].fillna('').apply(lambda x: BeautifulSoup(x, "lxml").text)
data['question1'] = data['question1'].fillna('').apply(punctutions)
data['question2'] = data['question2'].fillna('').apply(punctutions) 
data['question1'] = data['question1'].fillna('').apply(stemming)
data['question2'] = data['question2'].fillna('').apply(stemming)

#%%
data['fuzz_ratio'] = data.apply(lambda x : fuzz.ratio(x['question1'],x['question2']),axis=1)
data['fuzz_partial_ratio'] = data.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['token_sort_ratio'] = data.apply(lambda x: fuzz.token_sort_ratio(x['question1'], x['question2']),axis=1)
Example #28
0
from TwitterAPI import TwitterAPI
from gensim.models import Word2Vec
import gensim.downloader as api
import sys
import re
import string
from nltk.tokenize import TweetTokenizer
import preprocessor
tknzr = TweetTokenizer(strip_handles=True)
from pycontractions import Contractions

# Set up the contraction expander; the GloVe Twitter vectors are fetched via
# gensim's downloader through api_key (see the from-disk sketch below).
cont = Contractions(api_key="glove-twitter-100")
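# Alternative sketch (file path is an assumption): load a word2vec-format file
# from disk and pass it in as kv_model instead of downloading via api_key.
#
#   from gensim.models import KeyedVectors
#   kv = KeyedVectors.load_word2vec_format(
#       "GoogleNews-vectors-negative300.bin.gz", binary=True)
#   cont = Contractions(kv_model=kv)
#   cont.load_models()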


def getTweets(query, api):
    r = api.request('search/tweets', {'q': query})
    tweetList = []
    for item in r:
        if (item['lang'] == 'en'):
            tweetList.append(item['text'])
    return tweetList


def main():
    # Interface with OPSUS sentiment analysis API
    api = TwitterAPI('7ldroPca5V9h2GczFb2ySRuqS',
                     'Di98ZN3xmRcoeL3St1Xe6fEo6expkyNZLezdwn2ON8sUCK2t6T',
                     '1049876480447123457-cEA1uhUauGFjA1oPGxpUB2tCJGAaen',
                     'xsGol4ZwM6FRLxiM2ucp80brUDENKdn3r3pf0h8yhEO5t')
    # Get all of the tweets mentioning the brand of interest
Example #29
0
    def __init__(self, spacy_model='en_core_web_sm', gensim_model='glove-twitter-25'):
        self.nlp = spacy.load(spacy_model)
        self.model = api.load(gensim_model)
        self.cont = Contractions(kv_model=self.model)