Code Example #1
 def contractions_fun(self):
     """
     This function replaces words that are --
     by checking a word if a word is present in a dictionary
     if the word is present in dictionary then it is replaced
     with its value from dictionary
     """
     if self.contraction_method == 'mapping':
         self.doc = self.mapping_decontraction(str(self.doc))
     elif self.contraction_method == 'word2vec':
         # pretrained_model is a word2vec KeyedVectors object defined elsewhere in the module
         model = pretrained_model
         cont = Contractions(model)
         cont.load_models()
         self.doc = list(cont.expand_texts([str(self.doc)],
                                           precise=True))[0]
     elif self.contraction_method == 'glove':
         model = api.load("glove-twitter-25")
         cont = Contractions(kv_model=model)
         cont.load_models()
         self.doc = list(cont.expand_texts([str(self.doc)],
                                           precise=True))[0]
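
The 'mapping' branch above delegates to mapping_decontraction, which is not shown here. As a rough illustration of that kind of lookup, a minimal sketch with a small, hypothetical CONTRACTION_MAP might look like this (the real mapping and method may differ):

import re

# Hypothetical, abbreviated contraction dictionary for illustration only
CONTRACTION_MAP = {"can't": "cannot", "won't": "will not", "it's": "it is", "i'm": "i am"}

def mapping_decontraction_sketch(text):
    # Build one alternation pattern and replace each hit with its dictionary value, case-insensitively
    pattern = re.compile('|'.join(re.escape(k) for k in CONTRACTION_MAP), flags=re.IGNORECASE)
    return pattern.sub(lambda m: CONTRACTION_MAP[m.group(0).lower()], text)

print(mapping_decontraction_sketch("I'm sure it's fine"))  # -> i am sure it is fine
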
Code Example #2
class ContractionsExpander(TransformerMixin):
    def __init__(self, kv_model=None, api_key='glove-twitter-100', precise=False):
        self.kv_model = kv_model
        self.api_key = api_key
        self.precise = precise
        # Prefer an explicitly supplied KeyedVectors model; otherwise download
        # the gensim-data model named by api_key
        if kv_model is not None:
            self.contractions = Contractions(kv_model=kv_model)
        else:
            self.contractions = Contractions(api_key=api_key)
        self.contractions.load_models()

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return self.contractions.expand_texts(X, precise=self.precise)
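
Because ContractionsExpander implements the scikit-learn transformer protocol, it can be dropped into a Pipeline. A minimal usage sketch, assuming the glove-twitter-100 vectors can be fetched through gensim's downloader on first use:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# Expand contractions first, then vectorize the expanded text
pipeline = Pipeline([
    ('expand', ContractionsExpander()),
    ('vectorize', CountVectorizer()),
])

docs = ["I can't believe it's done", "They'll arrive soon"]
features = pipeline.fit_transform(docs)
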
Code Example #3
def preprocessing_text(df, series):

    #removing patterns (usernames & links to websites)
    pattern_list = [r'@[\w]*', r'http\S+']
    remove_patterns(df, series, pattern_list)

    #remove hashtag from clean text
    df[series] = remove_hashtags(df[series])

    # de-emojizing
    df[series] = df[series].apply(lambda x: emoji.demojize(x))

    #remove characters repeated more than 2 times
    df[series] = df[series].apply(lambda x: ReplaceThreeOrMore(x))

    #remove html characters
    char_list = ['&amp', '\n', 'á', '<', '>']
    remove_chars(df, series, char_list)

    #handle contractions
    cont = Contractions(api_key="glove-twitter-100")
    df[series] = df[series].apply(lambda x: list(cont.expand_texts([x]))[0])

    #removing numbers
    df[series] = df[series].apply(
        lambda x: ''.join([i for i in x if not i.isdigit()]))

    #remove punctuation
    df[series] = df[series].str.replace(r'[^\w\s]', ' ', regex=True)

    #set to lowercase
    df[series] = df[series].apply(
        lambda x: " ".join(x.lower() for x in x.split()))

    #lemmatization
    df[series] = df[series].apply(lambda x: str(x))
    df[series] = df[series].apply(
        lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

    return df
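
A usage sketch for preprocessing_text, assuming the helper functions it relies on (remove_patterns, remove_hashtags, ReplaceThreeOrMore, remove_chars) and textblob's Word are defined or imported elsewhere in the same module:

import pandas as pd

tweets = pd.DataFrame({"text": ["@user I can't wait!!! 😍 https://example.com #excited"]})
tweets = preprocessing_text(tweets, "text")
print(tweets["text"].iloc[0])
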
Code Example #4
class ContractionExpander(TextProcessingBaseClass):
    '''
    Removes contractions from the text and uses the full version instead (unification).

    Example:
    I'll walk down the road --> I will walk down the road
    '''

    model_contraction_expander = None

    def __init__(self, model=None):
        '''
        :param model: Pretrained word embedding model.
        '''
        super().__init__()

        if model is None:
            # If no model is given, use the default one and store it as a static class variable to avoid multiple loadings
            if ContractionExpander.model_contraction_expander is None:
                model_path = os.path.join(NLP_MODELS_PATH, 'pretrained', 'word_embeddings', 'pubmed2018_w2v_400D',
                                  'pubmed2018_w2v_400D.bin')
                ContractionExpander.model_contraction_expander = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)

            model = ContractionExpander.model_contraction_expander

        self.cont = Contractions(kv_model=model)

    def level(self) -> str:
        return "text"

    def _process_internal(self, text: str) -> str:
        '''
        :param text: Input string.
        :return: The string without contractions.
        '''
        return list(self.cont.expand_texts([text], precise=True))[0]
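
A usage sketch, assuming TextProcessingBaseClass and its dependencies are importable; passing a small gensim model explicitly avoids the default pubmed2018 path above (glove-twitter-25 is only a stand-in here):

import gensim.downloader as api

small_model = api.load("glove-twitter-25")
expander = ContractionExpander(model=small_model)

# Calling the internal method directly because the public interface of the base class is not shown
print(expander._process_internal("I'll walk down the road"))
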
Code Example #5
def train_model() -> None:
    train_data = fetch_data.fetch_imdb_train_data()

    cont = Contractions(constants.CONTRACTIONS_BIN_FILE)
    cont.load_models()

    for index, row in train_data.iterrows():
        review_text = BeautifulSoup(row.review, features="html.parser").get_text()
        # expand_texts expects an iterable of texts; write the result back to the dataframe,
        # since an assignment to the iterrows row would be lost
        train_data.at[index, 'review'] = list(cont.expand_texts([review_text], precise=True))[0]

    train_data.review = clean_reviews(train_data.review)

    reviews = list(tokenize_sentences(train_data.review))

    labels = list(train_data.sentiment)

    tokenizer = Tokenizer(num_words=constants.MAX_NB_WORDS)
    tokenizer.fit_on_texts(train_data.review)

    data = np.zeros((len(train_data.review), constants.MAX_SENTS, constants.MAX_SENT_LENGTH), dtype='float32')

    words = list()
    for i, sentences in enumerate(reviews):
        for j, sent in enumerate(sentences):
            if j < constants.MAX_SENTS:
                wordTokens = text_to_word_sequence(sent)
                k = 0
                for _, word in enumerate(wordTokens):
                    if k < constants.MAX_SENT_LENGTH and tokenizer.word_index[word] < constants.MAX_NB_WORDS:
                        data[i, j, k] = tokenizer.word_index[word]
                        k = k + 1
                words.append(wordTokens)

    word_index = tokenizer.word_index
    print('Total %s unique tokens.' % len(word_index))

    labels = to_categorical(np.asarray(labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    wordSkipGramModel = gensim.models.Word2Vec(words, min_count=5, size=constants.EMBEDDING_DIM, window=4, sg=1)

    word_embedding_matrix = np.random.random((len(word_index) + 1, constants.EMBEDDING_DIM))
    for word, i in word_index.items():
        try:
            word_embedding_vector = wordSkipGramModel.wv.get_vector(word)
        except KeyError:
            # words not found in the skip-gram vocabulary keep their random initialization
            continue
        if word_embedding_vector is not None:
            word_embedding_matrix[i] = word_embedding_vector

    embedding_layer = Embedding(len(word_index) + 1, constants.EMBEDDING_DIM, weights=[word_embedding_matrix],
                                input_length=constants.MAX_SENT_LENGTH, trainable=True)

    sentence_input = Input(shape=(constants.MAX_SENT_LENGTH,), dtype='float32')
    embedded_sequences = embedding_layer(sentence_input)
    sentence_lstm = Bidirectional(LSTM(200, return_sequences=True))(embedded_sequences)
    l_dropout = Dropout(0.5)(sentence_lstm)
    l_dense = TimeDistributed(Dense(400))(l_dropout)
    l_att = attention_layer.AttLayer()(l_dense)
    l_dropout_1 = Dropout(0.4)(l_att)
    sentEncoder = Model(sentence_input, l_dropout_1)

    review_input = Input(shape=(constants.MAX_SENTS, constants.MAX_SENT_LENGTH), dtype='float64')
    review_encoder = TimeDistributed(sentEncoder)(review_input)
    review_dropout = Dropout(0.3)(review_encoder)
    l_lstm_review = Bidirectional(LSTM(100, return_sequences=True))(review_dropout)
    l_att_dropout_review = Dropout(0.2)(l_lstm_review)
    l_dense_review = TimeDistributed(Dense(200))(l_att_dropout_review)
    l_dropout_review = Dropout(0.2)(l_dense_review)
    l_att_review = attention_layer.AttLayer()(l_dropout_review)
    preds = Dense(2, activation='softmax')(l_att_review)
    model = Model(review_input, preds)
    adam = Adam(lr=0.0001)
    model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

    model.fit(data, labels, validation_split=0.2, epochs=10, batch_size=50, shuffle=False, verbose=1)
    model.save('deeplearn_sentiment_model.h5')

    # Save Tokenizer i.e. Vocabulary
    with open('reviews_tokenizer.pkl', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
Code Example #6
with open('abbreviations.mapper', 'r') as file:
    content = file.read()
    abbreviations_map = literal_eval(content)

paragraph_separator = '\n\n'
sentence_separator = ' '
token_separator = ' '
unnecessary_identifier_regex = r'[0-9\[\]%/,()–\'<>^~`@|#$+:;’]'
unnecessary_space = '  '
unnecessary_unresolved_pron = '-PRON-'
unnecessary_apostrophe = ' \''
unnecessary_space_period = r' \.'
period_regex = r'\.'
valid_eos_token = '[!?]'

# Time-consuming step: downloads/loads the GloVe vectors on first use
expander = Contractions(api_key='glove-wiki-gigaword-50')
# Warm-up call; a string without contractions should come back unchanged
assert list(expander.expand_texts(['loader_demo_text']))[0] == 'loader_demo_text'

# Time-consuming step
spacy_tool = spacy.load('en_md')
neuralcoref.add_to_pipe(spacy_tool)

logging.basicConfig(filename='summarizer.log',
                    filemode='w',
                    format='%(name)s - %(levelname)s - %(message)s',
                    level=logging.DEBUG)

# Takes about 40 seconds to start up
Code Example #7
class Cleaner:

    def __init__(self,
                 embedding_for_smart_contraction="GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin",
                 spell_dictonarypath="frequency_dictionary_en_82_765.txt"):
        self.embedding_for_smart_contraction = embedding_for_smart_contraction
        self.spell_dictonarypath = spell_dictonarypath
        self.initialized = False


    def initialize(self):
        print("Initializing Text Cleaner..")
       
        print("Initializing Smart Contractions Module..")
        self.cont = Contractions(self.embedding_for_smart_contraction)
        self.cont.load_models()
        
        print("Initializing Stopwords Module..")
        self.stop_words = set(stopwords.words('english'))
        stop_words_without_negation = copy.deepcopy(self.stop_words)
        stop_words_without_negation.remove('no')
        stop_words_without_negation.remove('nor')
        stop_words_without_negation.remove('not')
        self.stop_words_without_negation = stop_words_without_negation
        self.pos_tags_set_1 = {'NNP'}

        print("Initializing Wordnet Lemmatizer Module..")
        self.wnl = WordNetLemmatizer()
        
        print("Initializing Spellcheck Module..")
        max_edit_distance_dictionary = 2
        prefix_length = 7
        self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        dictionary_path = os.path.join(os.path.abspath(''), self.spell_dictonarypath)
        self.sym_spell.load_dictionary(dictionary_path, 0, 1)
        
        print("Initialization complete!")

    def expand_contractions(self,text):
        try:
            text = list(self.cont.expand_texts([text], precise=False))[0]
        except Exception as e:
            return text
        return text

    
    def apostrophe_correction(self,text):
        text = re.sub("’", "'", text)
        return text
    
    
    def try_decode(self,text):
        try:
            text = unidecode.unidecode(codecs.decode(text, 'unicode_escape'))
        except:
            text = unidecode.unidecode(text)
        return text

    
    def tokenize_and_keep_only_words(self,text):
        text = re.findall(r"[a-zA-Z]+", text.lower())
        return text
    
    
    def remove_stop_words(self,text):
        text = [word for word in text if (word not in self.stop_words_without_negation and len(word)>2)]
        return text

    
    def lemmatize(self,text):
        text = [self.wnl.lemmatize(word) for word in text]
        return text

    
    def spell_check(self,text,max_edit_distance_lookup = 2):
        # tokenize each word
        text = word_tokenize(text)
        # apply pos to each word
        text = pos_tag(text)
        correct_text = []
        # for each (word, pos_tag) pair in the sentence
        for word in text:
            # if the word is not a proper noun
            if word[1] not in self.pos_tags_set_1:
                # check if we can correct it, then correct it
                suggestions = self.sym_spell.lookup(word[0], Verbosity.CLOSEST,
                                                    max_edit_distance_lookup)
                if suggestions:
                    # take the top-ranked correction
                    correct_text.append(suggestions[0].term)
                else:
                    # keep the original word when no correction is found
                    correct_text.append(word[0])
            else:
                correct_text.append(word[0])
        text = ' '.join(correct_text)
        return text


    def full_clean(self,text,debug=False):
        if not self.initialized:
            self.initialize()
            self.initialized = True
        if debug:
            print("pre-clean: ",text)
        text = self.try_decode(text)
        text = self.apostrophe_correction(text)
        text = self.spell_check(text)
        text = self.expand_contractions(text)
        text = self.tokenize_and_keep_only_words(text)
        text = self.remove_stop_words(text)
        text = self.lemmatize(text)

        text = ' '.join(text)
        if debug:
            print("post-clean: ",text)
        return text
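
A minimal usage sketch for the cleaner above; the embedding binary and the SymSpell frequency dictionary are the constructor defaults and must exist locally:

cleaner = Cleaner()
# initialize() runs lazily on the first full_clean call
print(cleaner.full_clean("I can't beleive it's not butter!", debug=True))
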
Code Example #8
    def transform(self, x):
        if self.verbose > 0:
            print(
                colored("Called Description Transformer Transform",
                        color="blue",
                        attrs=['bold', 'underline']))
            print("Processing description text")

        # Copy the data and find the name of the description column
        self.data = x.copy()
        self.column_name = self.data.columns.values[0]

        # Load spaCy language processor
        nlp = spacy.load("en_core_web_sm")
        # Load pre-trained word embedding if using contractions
        contraction = Contractions(
            api_key="glove-twitter-25") if self.contractions else None

        # Process text by iterating over each sample's index and description
        for idx, sample in zip(self.data.index.values, self.data.values):
            # Change accented characters, e.g. à -> a
            sample = self.remove_accents(str(sample))
            if contraction:
                # Expand contractions, e.g. "hasn't" -> "has not"
                sample = list(contraction.expand_texts([sample], precise=True))
                sample = ''.join(sample)

            # Input sample text into spaCy language processor
            doc = nlp(sample)
            # Split sample text into sentences
            sentences = list(doc.sents)

            for word_idx in range(len(sentences)):
                # Remove punctuation tokens, e.g. ! , .
                sentences[word_idx] = [
                    token for token in sentences[word_idx]
                    if not token.is_punct
                ]

                # Remove stop words
                if self.stop_words:
                    sentences[word_idx] = [
                        token for token in sentences[word_idx]
                        if token.text.lower() not in self.stop_words
                    ]

                # Apply lemmatization
                if self.transformation[0].lower() == "l":
                    # Resolve words to their dictionary form using PoS tags
                    sentences[word_idx] = [
                        token.lemma_.lower() for token in sentences[word_idx]
                    ]

                # Apply stemming (only if lemmatization not applied)
                elif self.transformation[0].lower() == "s":
                    # Stem tokens
                    for char_idx in range(len(sentences[word_idx])):
                        # Apply stemmer to each word
                        stemmed = self.stemmer_algorithm.stem(
                            sentences[word_idx][char_idx].text)
                        # Convert back to type Token and update word in sentence
                        sentences[word_idx][char_idx] = nlp(stemmed)[0]

                # Remove remaining punctuation within tokens, e.g. "(years)" -> "years", not including -
                # (str() because the items are spaCy Tokens unless lemmatization already produced strings)
                sentences[word_idx] = [
                    str(token).translate(
                        str.maketrans('', '',
                                      '!"#$%&\'()*+,./:;<=>?@[\\]^_`{|}~'))
                    for token in sentences[word_idx]
                ]

            # Split words containing dash or spaces caused by lemmatization, e.g. "16-year" -> "16" + "year"
            for k in range(len(sentences)):
                new_sentence = []
                for token in sentences[k]:
                    split_token = re.split(' |-', token)
                    for word in split_token:
                        # Check word not empty
                        if word:
                            new_sentence.append(word)
                # Replace words in sentence
                sentences[k] = new_sentence

            # Remove empty lists from list of sentences
            sentences = [sent for sent in sentences if sent != []]
            # Then join the sentences and update the descriptions dataframe
            word_list = [word for sent in sentences for word in sent]
            self.data.loc[idx, self.column_name] = ' '.join(
                [str(elem) for elem in word_list])


#         if self.verbose > 1:
#             display(self.data)
        if self.verbose > 0:
            print(
                colored("Finshed processing all descriptions\n",
                        color="blue",
                        attrs=['bold', 'underline']))

        return self.data
Code Example #9
class TextProcessing:
    """
    Class to clean text
    """
    def __init__(self, nlp=spacy.load("en_core_web_sm")):
        self.nlp = nlp
        contextualSpellCheck.add_to_pipe(self.nlp)
        model = api.load(cfg['embeddings']['embedding_file'])
        self.cont = Contractions(kv_model=model)
        self.cont.load_models()
        dirname = os.path.dirname(__file__)
        with open(os.path.join(dirname, 'acronym.json')) as f:
            self.acronyms = json.load(f)

    def process_text(self, text):
        """
        Processes text as follows:
        1. decode to unicode
        2. remove extra repeated special characters
        3. put space around the special characters
        4. Remove extra whitespaces
        5. replace acronyms
        6. expand contractions of english words like ain't
        7. correct spelling mistakes
        8. replace NE in the text
        9. lower case the string
        Args:
            text: text to be processed
        """
        text = self.unidecode(text)
        text = self.remove_repeated_chars(text)
        text = self.put_space_around_special_chars(text)
        text = self.remove_extra_whitespaces(text)
        text = self.replace_acronyms(text)
        text = self.expand_contractions(text)
        text = self.correct_spellings(text)
        text = self.replace_named_entity(text)
        text = self.lower_case(text)
        return text

    def remove_repeated_chars(self, text):
        """
        Removes repeated instances of consecutive special chars
        Args:
            text: text to be processed
        """
        text = re.sub(r'([!@#$%^&*,./?\'";:\\])\1+', r'\1', text)
        return text

    def put_space_around_special_chars(self, text):
        """
        Puts space around special chars like '[({$&*#@!'
        Args:
            text: text to be processed
        """

        chars = [
            '$', '?', '%', '@', '!', '#', '^', '*', '&', '"', ':', ';', '/',
            '\\', ',', '+', '(', ')', '[', ']', '{', '}', '<', '>'
        ]

        for char in chars:
            text = text.replace(char, ' ' + char + ' ')
        return text

    def remove_extra_whitespaces(self, text):
        """
        Removes extra whitespaces from the text
        Args:
            text: text to be processed
        """
        return text.strip()

    def unidecode(self, text):
        """
        unidecodes the text
        Args:
            text: text to be processed
        """
        return unidecode.unidecode(text.lower())

    def lower_case(self, text):
        """
        lower cases the text
        Args:
            text: text to be processed
        """
        return text.lower()

    def expand_contractions(self, text):
        """
        expands contractions; for example, "ain't" expands to "am not"
        Args:
            text: text to be processed
        """
        return list(self.cont.expand_texts([text.lower()], precise=True))[0]

    def correct_spellings(self, text):
        """
        corrects spellings from text
        Args:
            text: text to be processed
        """
        doc = self.nlp(text)
        if doc._.performed_spellCheck:
            text = doc._.outcome_spellCheck
        return text

    def replace_acronyms(self, text):
        """
        Replaces acronyms found in English
        For example: ttyl -> talk to you later
        Args:
            text: text to be processed
        """
        for acronym, expansion in self.acronyms.items():
            text = text.replace(' ' + acronym.lower() + ' ',
                                ' ' + expansion.lower() + ' ')
        return text

    def replace_named_entity(self, text):
        """
        Replaces named entity in the text
        For example: $5bn loss estimated in the coming year
                    -> MONEY loss estimated in the coming year
        Args:
            text: text to be processed
        """
        doc = list(
            self.nlp.pipe(
                [text],
                disable=["tagger", "parser", "contextual spellchecker"]))[0]
        for ent in doc.ents:
            text = text.replace(ent.text, ent.label_)
        return text

    def token_list(self, text):
        doc = self.nlp(text)
        tokens = []
        for token in doc:
            tokens += [token.text]
        return tokens
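
A usage sketch for TextProcessing, assuming cfg['embeddings']['embedding_file'] names a gensim-data model (e.g. glove-twitter-25) and acronym.json sits next to the module:

tp = TextProcessing()
print(tp.process_text("omg!! cant believe   he's comin to NYC tmrw"))
print(tp.token_list("he is coming to GPE tomorrow"))
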
Code Example #10
# %%
# nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def stemming(sentence):
    stemmer = PorterStemmer()
    sentence = sentence.split()
    sentence = ' '.join(stemmer.stem(word) for word in sentence)  # optionally filter: if word not in stop_words
    return sentence

# %%
cont = Contractions(api_key="glove-twitter-100")
# %%

data['question1'] = list(cont.expand_texts(data['question1']))
data['question2'] = list(cont.expand_texts(data['question2']))
data['question1'] = data['question1'].fillna('').apply(lambda x: BeautifulSoup(x, "lxml").text)
data['question2'] = data['question2'].fillna('').apply(lambda x: BeautifulSoup(x, "lxml").text)
data['question1'] = data['question1'].fillna('').apply(punctutions)
data['question2'] = data['question2'].fillna('').apply(punctutions) 
data['question1'] = data['question1'].fillna('').apply(stemming)
data['question2'] = data['question2'].fillna('').apply(stemming)

#%%
data['fuzz_ratio'] = data.apply(lambda x : fuzz.ratio(x['question1'],x['question2']),axis=1)
data['fuzz_partial_ratio'] = data.apply(lambda x: fuzz.partial_ratio(str(x['question1']), str(x['question2'])), axis=1)
data['token_sort_ratio'] = data.apply(lambda x: fuzz.token_sort_ratio(x['question1'], x['question2']),axis=1)
data['token_set_ratio'] = data.apply(lambda x: fuzz.token_set_ratio(x['question1'],x['question2']),axis=1)

# # %%
Code Example #11
File: aiotext.py  Project: EricWiener/aiotext
class Cleaner:
    def __init__(
        self,
        expand_contractions=True,
        strip_text_in_brackets=False,
        combine_concatenations=False,
        w2v_path=None,
        api_key="word2vec-google-news-300",
    ):

        self.opt_expand_contractions = expand_contractions
        self.opt_strip_text_in_brackets = strip_text_in_brackets
        self.opt_combine_concatenations = combine_concatenations

        if expand_contractions:
            print(
                "Loading contractions dataset (this will take a while the first time)"
            )

            # Load your favorite word2vec model
            self.cont = Contractions(w2v_path=w2v_path, api_key=api_key)
            print("Contractions dataset downloaded")

            print("Training contractions model (this will take a while)")
            # prevents loading on first expand_texts call
            self.cont.load_models()
            print("Contraction model successfully trained")

    def expand_contractions(self, text):
        text = text.replace("’", "'")  # need to put in the correct apostrophe
        expanded_text = list(self.cont.expand_texts([text], precise=True))
        return expanded_text[0]

    def strip_brackets(self, text):
        # Remove strings in brackets
        # Eg. "This is a sentence (extra info) description."
        # Becomes "This is a sentence description."
        """ Remove brackets from text
            Matches (), [], {}

            Converts:
            'hello (there) you (my[best] friend) lets {dine } }' -> 'hello  you  lets  }'
        """

        brace_open_type = ""
        brace_pair = {'(': ')', '[': ']', '{': '}'}
        open_brace_list = list(brace_pair.keys())

        res = ""
        for c in text:
            if len(brace_open_type) == 0:
                # not opened
                if c in open_brace_list:
                    brace_open_type = c
                else:
                    res += c
            else:
                # opened
                if brace_pair[brace_open_type] == c:
                    brace_open_type = ""

        return res

    def combine_concatenations(self, sentence):
        """
        Recieves string sentence
        "This is a sentence"
        """
        # convert concatenated words into seperate words
        # georgetown-louisville becomes georgetown louisville

        # Pd matches all types of dashes
        # https://www.compart.com/en/unicode/category/Pd

        if self.opt_combine_concatenations:

            def _refu(sent):
                return regex.sub(r'\p{Pd}+', '', sent)
        else:

            def _refu(sent):
                return regex.sub(r'\p{Pd}+', ' ', sent)

        return _refu(sentence)

    def remove_non_english(self, tokens):
        """
        Removes non-english words and all punctuation and numbers
        Removes extra white space

        Receives a list of tokens comprising a single sentence:
        ['this', 'is', 'a', 'sentence']
        """
        # remove all punctuation (removes non-english words too)
        # stripped = re.sub('[^a-zA-Z\s]*', '', stripped)

        # removes extra white spaces
        # stripped = re.sub('[ ]{2,}',' ', stripped)

        cleaned_tokens = []
        for token in tokens:
            cleaned = re.sub('[ ]{2,}', ' ', re.sub(r'[^a-zA-Z\s]*', '',
                                                    token)).strip()
            if len(cleaned) != 0:
                cleaned_tokens.append(cleaned)

        return cleaned_tokens

    def lemmatize_sentences(self, tokenized_sentences):
        """
            Receives
            Args: tokenized_sentences is of form
                [['this', 'is', 'sentence'],
                ['this', 'is', 'another']
                ['this', 'is', 'another']]

            Returns: lemmatized 2d list of same form
                [['this', 'is', 'sentenc'],
                ['this', 'is', 'anoth']
                ['this', 'is', 'anoth']]
        """
        lemmatized_sentences = []
        for sentence in tokenized_sentences:
            lemmatized_sentences.append(lemmatize(sentence))
        # lemmatized_sentences = [lemmatize(sentence) for sentence in tokenized_sentences]
        return lemmatized_sentences

    def clean(self, text):
        if self.opt_expand_contractions:
            # Expands it's -> it is
            text = self.expand_contractions(text)

        # text is lowercased after contractions are expanded
        # the contractions will be capitalized after they are expanded
        # eg. (i'm -> [I, am]). Therefore, the lowercasing is done afterwards
        text = text.lower()

        if self.opt_strip_text_in_brackets:
            text = self.strip_brackets(text)

        sentences = sent_tokenize(text)
        sentences = [
            self.combine_concatenations(sentence) for sentence in sentences
        ]
        tokens_per_sentence = [word_tokenize(sent) for sent in sentences]
        lemmatized_tokens_per_sent = self.lemmatize_sentences(
            tokens_per_sentence)
        cleaned_tokens_per_sent = [
            self.remove_non_english(sent)
            for sent in lemmatized_tokens_per_sent
        ]

        return cleaned_tokens_per_sent
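
A usage sketch; the first instantiation downloads the word2vec-google-news-300 vectors (several gigabytes), so a local w2v_path can be passed instead:

cleaner = Cleaner(expand_contractions=True, strip_text_in_brackets=True)
sentences = cleaner.clean("It's sunny (finally) in Georgetown-Louisville today. We can't complain.")
# clean() returns one list of cleaned, lemmatized tokens per sentence
print(sentences)
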
Code Example #12
class NLP():
    nlp = None
    doc = None
    model = None

    def __init__(self, spacy_model='en_core_web_sm', gensim_model='glove-twitter-25'):
        self.nlp = spacy.load(spacy_model)
        self.model = api.load(gensim_model)
        self.cont = Contractions(kv_model=self.model)

    def remove_html(self, text):
        """Strip HTML tags from text"""
        soup = BeautifulSoup(text, 'html.parser')
        return soup.get_text(separator=" ")

    def remove_accents(self, text):
        """Remove accented characters from text for non-english words"""
        return unidecode.unidecode(text)

    def expand_contractions(self, text):
        """Convert contractions into whole words. e.g. can't -> can not"""
        return list(self.cont.expand_texts([text], precise=True))[0]

    def preprocess(self, text, remove_numbers=False, remove_stopwords=False, excluded_sw=None, toke=False):
        """Preprocess using standard protocols. 
        @param remove_numbers converts words to digits and removes
        @param remove_stopwords removes stop words
        @param excluded_sw is any stopwords to exclude
        @param toke if true, return tokens, default return text 
        """
        text = self.remove_html(text)
        text = self.remove_accents(text)
        text = self.expand_contractions(text)

        if toke or remove_numbers or remove_stopwords:
            if excluded_sw is not None:
                for w in excluded_sw:
                    self.nlp.vocab[w].is_stop = False
            doc = self.nlp(text)
            tokens = []
            for token in doc:
                if token.pos_ == 'NUM' and not remove_numbers:
                    # str() so the digits can be re-joined into text below
                    tokens.append(str(w2n.word_to_num(token.text)))
                elif not token.is_stop:
                    tokens.append(token.text)
            if toke:
                return tokens
            text = " ".join(tokens)
        return text

    def lemmatize(self, tokens, toke=False):

        lookups = Lookups()
        lookups.add_table('lemma_index', lemma_index)
        lookups.add_table('lemma_exc', lemma_exc)
        lookups.add_table('lemma_rules', lemma_rules)
        lemmatizer = Lemmatizer(lookups)

        lemmas = []
        for t in tokens:
            lemmas.append(lemmatizer(t.text, t.tag_))

        if toke:
            return lemmas

        return " ".join(lemmas)

    def get_syllables(self, word):
        count = 0
        vowels = ("a", "e", "i", "o", "u", "y")
        prev = False
        for c in word:
            vowel = c in vowels
            if vowel and not prev:
                count += 1
            prev = vowel
        return count

    def get_lexical_density(self, tokens):
        c_words = t_words = 0

        cont_pos = ['PROPN', 'NOUN', 'VERB', 'ADJ', 'ADV']
        for t in tokens:
            if t.pos_ in cont_pos:
                c_words += 1
                t_words += 1
            elif t.pos_ != 'PUNCT':
                t_words += 1

        return round((c_words / t_words), 4)

    def get_coherence(self, text):
        doc = self.nlp(text)
        sentences = [sent for sent in doc.sents if len(sent) >= 2]
        frequency = defaultdict(int)
        token_sents = []
        for s in sentences:
            tmp = []
            for t in self.preprocess(s.text, remove_stopwords=True, excluded_sw=['no', 'not'], toke=True):
                tmp.append(t)
                frequency[t] += 1
            token_sents.append(tmp)

        vocab = [[word for word in sent if frequency[word] > 1]
                 for sent in token_sents]
        dictionary = corpora.Dictionary(vocab)
        corpus = [dictionary.doc2bow(word) for word in vocab]
        tfidf = models.TfidfModel(corpus)
        corpus_tfidf = tfidf[corpus]
        lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=20)
        corpus_lsi = lsi[corpus_tfidf]

        sums = {}
        topic_count = max([len(line) for line in corpus_lsi])
        for line in corpus_lsi:
            for topic in line:
                t_num = topic[0]
                if t_num not in sums:
                    sums[t_num] = abs(topic[1])
                else:
                    sums[t_num] += abs(topic[1])
        best_topic = max(zip(sums.values(), sums.keys()))[1]
        ordered = []
        i = 0
        for line in corpus_lsi:
            ordered.append((i, line[best_topic][1]))
            i += 1

        ordered = sorted(ordered, key=lambda x: x[1], reverse=True)
        threshold = ordered[0][1] - (0.90 * (ordered[0][1] - ordered[-1][1]))
        problem_sentences = [s for s in ordered if s[1] < threshold]

        output = {}
        for p in problem_sentences:
            output[p[0]] = (p[1], str(sentences[p[0]]))

        return output

    def get_readability(self, text):
        scores = {}

        doc = self.nlp(text)
        # number of sentences (len(doc) would count tokens instead)
        sentence_count = len(list(doc.sents))

        words = self.preprocess(text, toke=True)
        characters = 0
        for word in words:
            characters += len(word)
        word_count = len(words)
        
        syllable_count = 0
        complex_words = 0
        for word in words:
            c = self.get_syllables(word)
            syllable_count += c
            if c >= 3 and not word[0].isupper():
                complex_words += 1
        avgwps = word_count / sentence_count

        # Automated Readability Index
        ari = 0.0
        ari_grade = 0
        if word_count > 0:
            ari = 4.71 * (characters / word_count) + 0.5 * \
                (word_count / sentence_count) - 21.43
        if ari < 2:
            ari_grade = 0
        elif ari > 12:
            ari_grade = 13
        else:
            ari_grade = ari
        scores["ari"] = (ari, ari_grade)

        # Flesch Reading Ease
        flesch_reading_ease = 101
        fre_grade = 0
        if word_count > 0 and sentence_count > 0:
            flesch_reading_ease = 206.835 - \
                1.015 * (word_count / sentence_count) - \
                84.6 * (syllable_count / word_count)
        if flesch_reading_ease > 100:
            fre_grade = 4
        elif flesch_reading_ease > 90.0:
            fre_grade = 5
        elif flesch_reading_ease > 80.0:
            fre_grade = 6
        elif flesch_reading_ease > 70.0:
            fre_grade = 7
        elif flesch_reading_ease > 60.0:
            fre_grade = 9
        elif flesch_reading_ease > 50:
            fre_grade = 12
        else:
            fre_grade = 13
        scores["flesch_reading_ease"] = (flesch_reading_ease, fre_grade)

        # Flesch-Kincaid Grade Level
        fkg = 0.0
        if word_count > 0 and sentence_count > 0:
            fkg = 0.39 * (word_count / sentence_count) + \
                11.8 * (syllable_count / word_count) - 15.59
        scores["flesch_kinkaid_grade_level"] = (fkg, int(fkg))

        # Gunning Fog Index
        gfi = 0.0
        gfi_grade = 0
        if sentence_count > 0 and word_count > 0:
            gfi = 0.4 * ((word_count / sentence_count) +
                         100 * (complex_words / word_count))
        if gfi < 6:
            gfi_grade = 5
        elif gfi <= 12:
            gfi_grade = int(gfi)
        else:
            gfi_grade = 13
        scores["gunning_fog_index"] = (gfi, gfi_grade)

        # SMOG Readability
        smog = 0.0
        smog_grade = 0
        if sentence_count > 0:
            smog = 1.0430 * math.sqrt(complex_words *
                                    (30 / sentence_count)) + 3.1291
        if smog >= 13:
            smog_grade = 13
        else:
            smog_grade = int(smog)
        scores["smog_readability"] = (smog, smog_grade)

        # ColemanLiauIndex
        coleman = 0.0
        coleman_grade = 0
        if word_count > 0:
            coleman = (5.89 * (characters / word_count)) - \
                (30 * (sentence_count / word_count)) - 15.8
        if coleman >= 13:
            coleman_grade = 13
        else:
            coleman_grade = int(coleman)
        scores["coleman_liau"] = (coleman, coleman_grade)

        # LIX & RIX
        lix = 0.0
        rix = 0.0
        lix_grade = 0
        rix_grade = 0
        if sentence_count > 0 and word_count > 0:
            long_words = 0
            for word in words:
                if len(word) >= 7:
                    long_words += 1
            lix = word_count / sentence_count + ((100. * long_words) / word_count)
            rix = long_words / sentence_count
        if lix >= 13:
            lix_grade = 13
        else:
            lix_grade = int(lix)
        if rix >= 13:
            rix_grade = 13
        else:
            rix_grade = int(rix)
        scores["LIX"] = (lix, lix_grade)
        scores["RIX"] = (rix, rix_grade)

        count = 0
        avg = 0.0
        for k, v in scores.items():
            avg += v[1]
            count += 1
        scores["AVERAGE_GRADE"] = (avg / count, int(avg / count))

        return scores
Code Example #13
class TextCleaner:
    word_re = re.compile('[a-zA-Z]+')
    number_re = re.compile('[0-9]+$')
    spell_checker = SpellChecker()
    lemmatizer = WordNetLemmatizer()
    all_words = set(words.words())

    def __init__(self,
                 save_path,
                 word2vec_model_path,
                 previously_processed=[]):
        self.contractions = Contractions(word2vec_model_path)
        self.previously_processed = previously_processed
        self.save_path = save_path

    def _get_all_comments(self, subreddit):
        comments = []
        for submission in subreddit["submissions"]:
            for comment in submission["comments"]:
                comments.extend(sent_tokenize(comment["body"]))
        return comments

    def _remove_urls(self, text):
        url_pattern = r'(((https?|ftp)://)?(([a-zA-Z])+\.)?([a-zA-Z])+\.([a-zA-Z])+/?.*)|http'

        new_sentences = []
        for word in text.split():
            if re.compile(url_pattern).search(word):
                new_sentences.append(re.sub(url_pattern, "__isurl__", word))
            else:
                new_sentences.append(word)
        return " ".join(new_sentences)

    def _invalid_characters(self, string):
        string = re.sub("(\s|-|_|\.\.\.)+", " ", string)
        return re.sub("!|#|&|\(|\)|–|\[|{|}|\]|:|;|\?|\*", "", string)

    def _expand_sentences(self, texts):
        return list(
            self.contractions.expand_texts(
                [x.replace("’", "'") for x in texts], precise=True))

    def _replace(self, sentence, is_spell_check=True):
        words = []
        for word in word_tokenize(sentence):
            word = word.strip()
            if "/" in word or "\\" in word:
                words.append("__isslashinword__")
            elif self.word_re.match(word):
                if is_spell_check and word not in self.all_words:
                    words.append(self.spell_checker.correction(word))
                else:
                    words.append(word)
            elif self.number_re.match(word):
                words.append("__isnumber__")
            elif "__isurl__" in word:
                words.append("__isurl__")
            else:
                words.append("__isinvalidword__")
        return words

    def _words_and_tags(self, words):
        lemmas = []
        pos_tags = []
        for word, pos_tag in nltk.pos_tag(words):
            pos_tags.append(pos_tag)
            if self._get_wordnet_pos(pos_tag):
                lemmas.append(
                    self.lemmatizer.lemmatize(
                        word, pos=self._get_wordnet_pos(pos_tag)))
            else:
                lemmas.append(self.lemmatizer.lemmatize(word))
        return (" ".join(lemmas), pos_tags)

    ## there are others but this is sufficient, e.g. one more wordnet pos tag (adjective satellite) and many more nltk pos tags
    def _get_wordnet_pos(self, treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return ''

    def process_subreddits(self, subreddits, save=True, check_previous=True):
        for subreddit in subreddits:
            print(subreddit["display_name"])
            pathlib.Path(self.save_path).mkdir(exist_ok=True)

            all_raw_comments = self._get_all_comments(subreddit)
            raw_comments = all_raw_comments

            comment_no_urls = []
            comment_removed_chars = []
            comment_expandeds = []

            comment_replaced_spell_corrections = []
            comment_processed_spell_corrections = []
            pos_tag_sent_spell_corrections = []

            comment_replaced_no_spell_corrections = []
            comment_processed_no_spell_corrections = []
            pos_tag_no_sent_spell_corrections = []

            count = 0
            total = len(raw_comments)
            for comment in raw_comments:
                print(comment)
                comment_no_url = self._remove_urls(comment)
                comment_removed_char = self._invalid_characters(comment_no_url)
                comment_expanded = self._expand_sentences(
                    [comment_removed_char])[0]

                comment_replaced_spell_correction = self._replace(
                    comment_expanded.lower(), is_spell_check=True)
                comment_processed_spell_correction, pos_tag_sent_spell_correction = self._words_and_tags(
                    comment_replaced_spell_correction)

                comment_replaced_no_spell_correction = self._replace(
                    comment_expanded.lower(), is_spell_check=False)
                comment_processed_no_spell_correction, pos_tag_no_sent_spell_correction = self._words_and_tags(
                    comment_replaced_no_spell_correction)

                count += 1
                print("count:", count, "total:", total,
                      subreddit["display_name"])

                # Appending
                comment_no_urls.append(comment_no_url)
                comment_removed_chars.append(comment_removed_char)
                comment_expandeds.append(comment_expanded)

                comment_replaced_spell_corrections.append(
                    comment_replaced_spell_correction)
                comment_processed_spell_corrections.append(
                    comment_processed_spell_correction)
                pos_tag_sent_spell_corrections.append(
                    pos_tag_sent_spell_correction)

                comment_replaced_no_spell_corrections.append(
                    comment_replaced_no_spell_correction)
                comment_processed_no_spell_corrections.append(
                    comment_processed_no_spell_correction)
                pos_tag_no_sent_spell_corrections.append(
                    pos_tag_no_sent_spell_correction)

            data = {
                "raw":
                raw_comments,
                "comment_no_urls":
                comment_no_urls,
                "comment_removed_chars":
                comment_removed_chars,
                "comment_expandeds":
                comment_expandeds,
                "comment_replaced_spell_corrections":
                comment_replaced_spell_corrections,
                "comment_processed_spell_corrections":
                comment_processed_spell_corrections,
                "pos_tag_sent_spell_corrections":
                pos_tag_sent_spell_corrections,
                "comment_replaced_no_spell_corrections":
                comment_replaced_no_spell_corrections,
                "comment_processed_no_spell_corrections":
                comment_processed_no_spell_corrections,
                "pos_tag_no_sent_spell_corrections":
                pos_tag_no_sent_spell_corrections,
            }

            if save:
                subreddit_path = self.save_path + "TEST" + subreddit[
                    "display_name"] + ".json"
                with open(subreddit_path, 'w') as fp:
                    json.dump(data, fp)
            else:
                return data
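
A usage sketch showing the nested subreddit structure that process_subreddits expects; the save path and word2vec binary below are assumptions:

subreddit = {
    "display_name": "python",
    "submissions": [
        {"comments": [{"body": "I can't believe it's working! See https://example.com"}]},
    ],
}

cleaner = TextCleaner(save_path="cleaned/",
                      word2vec_model_path="GoogleNews-vectors-negative300.bin")
# With save=False the processed fields are returned instead of written to disk
data = cleaner.process_subreddits([subreddit], save=False)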