Example #1
import re

import contractions  # assumed local helper exposing expandContractions
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import WhitespaceTokenizer


def clean_text(text):
    """Remove punctuation, capitalization, numbers and stop words, then stem the remaining words."""
    ps = PorterStemmer()

    stop_words = set(stopwords.words('english'))

    text = text.lower()
    text = contractions.expandContractions(text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub(r'\W', ' ', text)  # remove punctuation
    text = re.sub(r'\s+', ' ', text)  # collapse whitespace
    text = re.sub(r'\d+', ' ', text)  # remove numbers
    # collapse any character repeated 3+ times in a row down to two repetitions
    text = re.sub(r'(.)\1\1+', r'\1\1', text)
    text = re.sub(r'(ha)\1\1+', r'haha', text)
    text = re.sub(r'(lo)\1\1+', r'lol', text)
    text = text.strip(' ')

    # stem words
    tokenizer = WhitespaceTokenizer()
    tokenized_comment = tokenizer.tokenize(text)
    filtered_sentence = [w for w in tokenized_comment if w not in stop_words]
    stemmed_comment = [ps.stem(word) for word in filtered_sentence]
    text = " ".join(stemmed_comment)
    return text
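
The backreference substitutions in the example above are its least obvious part: a group followed by \1\1+ matches three or more consecutive repetitions, which the replacement collapses to two (or to a fixed "haha"/"lol"). A small self-contained check, using made-up sample strings:

import re

for s in ["sooooo goooood", "hahahahaha", "lololo"]:
    s = re.sub(r'(.)\1\1+', r'\1\1', s)   # "sooooo" -> "soo"
    s = re.sub(r'(ha)\1\1+', r'haha', s)  # "hahahahaha" -> "haha"
    s = re.sub(r'(lo)\1\1+', r'lol', s)   # "lololo" -> "lol"
    print(s)  # prints "soo good", then "haha", then "lol"
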
Example #2

def general_clean_comment(self, comment):
    # expandContractions and word_tokenize are imported elsewhere; the
    # self.* helpers and general_stop_words are defined on the same class
    comment = comment.lower()
    comment = expandContractions(comment)
    comment = self.split_integer_digit_string(comment)
    comment = self.remove_digits(comment)
    comment = self.remove_punctuation(comment)
    tokenized = word_tokenize(comment)
    stop_word_removed = []
    for word in tokenized:
        if word not in self.general_stop_words:
            stop_word_removed.append(word)
    return ' '.join(stop_word_removed)
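
general_clean_comment relies on helper methods and a stop-word set that live on the same class but are not shown here. A minimal, hypothetical sketch of what they might look like (the class name and method bodies below are assumptions for illustration, not the original implementation):

import re
import string

class CommentCleaner:
    def __init__(self, general_stop_words):
        self.general_stop_words = set(general_stop_words)

    def split_integer_digit_string(self, comment):
        # put spaces around digit runs so "abc123def" becomes "abc 123 def"
        return re.sub(r'(\d+)', r' \1 ', comment)

    def remove_digits(self, comment):
        return re.sub(r'\d+', '', comment)

    def remove_punctuation(self, comment):
        return comment.translate(str.maketrans('', '', string.punctuation))
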
Example #3

def classify(document):
    """
    Classify a document with the Hierarchical Attention Network (HAN).

    :param document: a document in text form
    :return: pre-processed tokenized document, class scores, attention weights for words, attention weights for sentences, sentence lengths
    """
    # A list to store the document tokenized into words
    doc = list()

    # Tokenize document into sentences
    sentences = list()
    for paragraph in preprocess(document).splitlines():
        sentences.extend(sent_tokenizer.tokenize(paragraph))

    # Tokenize sentences into words
    for s in sentences[:sentence_limit]:
        s1 = expandContractions(s)
        s2 = ''.join([i for i in s1 if i.isalpha() or i.isspace()])
        wakati = mecab.parse(s2)
        w = word_tokenizer.tokenize(wakati)[:word_limit]
        if len(w) == 0:
            continue
        doc.append(w)

    # Number of sentences in the document
    sentences_in_doc = len(doc)
    sentences_in_doc = torch.LongTensor([sentences_in_doc]).to(device)  # (1)

    # Number of words in each sentence
    words_in_each_sentence = [len(s) for s in doc]
    words_in_each_sentence = torch.LongTensor(words_in_each_sentence).unsqueeze(0).to(device)  # (1, n_sentences)

    # Encode document with indices from the word map
    encoded_doc = list(
        map(lambda s: list(map(lambda w: word_map.get(w, word_map['<unk>']), s)) + [0] * (word_limit - len(s)),
            doc)) + [[0] * word_limit] * (sentence_limit - len(doc))
    encoded_doc = torch.LongTensor(encoded_doc).unsqueeze(0).to(device)

    # Apply the HAN model
    scores, word_alphas, sentence_alphas = model(encoded_doc, sentences_in_doc,
                                                 words_in_each_sentence)  # (1, n_classes), (1, n_sentences, max_sent_len_in_document), (1, n_sentences)
    scores = scores.squeeze(0)  # (n_classes)
    scores = nn.functional.softmax(scores, dim=0)  # (n_classes)
    word_alphas = word_alphas.squeeze(0)  # (n_sentences, max_sent_len_in_document)
    sentence_alphas = sentence_alphas.squeeze(0)  # (n_sentences)
    words_in_each_sentence = words_in_each_sentence.squeeze(0)  # (n_sentences)

    return doc, scores, word_alphas, sentence_alphas, words_in_each_sentence
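
The densest step in classify is the nested map that looks each word up in word_map, pads every sentence to word_limit tokens and pads the document to sentence_limit sentences. The same logic written out on a toy vocabulary (the words and limits below are invented for illustration):

word_limit, sentence_limit = 5, 3
word_map = {'<pad>': 0, '<unk>': 1, 'the': 2, 'cat': 3, 'sat': 4}
doc = [['the', 'cat', 'sat'], ['the', 'dog']]  # 'dog' is out of vocabulary

encoded_doc = [
    [word_map.get(w, word_map['<unk>']) for w in s] + [0] * (word_limit - len(s))
    for s in doc
] + [[0] * word_limit] * (sentence_limit - len(doc))

print(encoded_doc)  # [[2, 3, 4, 0, 0], [2, 1, 0, 0, 0], [0, 0, 0, 0, 0]]
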
Example #4
def process_text(text):
    # hashtags, mentions, urls, sp and stop_words are defined at module level
    # (see the sketch after this example)
    text = hashtags.sub(' hashtag', text)
    text = mentions.sub(' entity', text)
    text = urls.sub(' website', text)
    text = re.sub(r"[^A-Za-z0-9(),!.?\'\`]", " ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\d+", "", text)
    text = re.sub(r":", " ", text)
    text = re.sub(r"-", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ", text)
    text = re.sub(r"\(", " ( ", text)
    text = re.sub(r"\)", " ) ", text)
    text = re.sub(r"\?", " ", text)
    text = re.sub(r"\s{2,}", " ", text)
    text = text.split()
    text = [contractions.expandContractions(x) for x in text]
    text = sp(' '.join([word for word in text if not word in stop_words]))
    text = ' '.join([word.lemma_ for word in text])

    return text.strip().lower()
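
process_text depends on several module-level objects that the example does not define. A plausible setup, assuming NLTK stop words and a small English spaCy pipeline; the exact regex patterns and model name are guesses, not the original configuration:

import re

import spacy
from nltk.corpus import stopwords

hashtags = re.compile(r'#\w+')
mentions = re.compile(r'@\w+')
urls = re.compile(r'https?://\S+|www\.\S+')

stop_words = set(stopwords.words('english'))
sp = spacy.load('en_core_web_sm')  # spaCy pipeline; supplies .lemma_ on tokens
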
Example #5
# In[2]:


text = pd.read_csv('train.csv').drop(['Complaint-Status'], axis=1)
text = pd.concat([text, pd.read_csv('test.csv')], ignore_index=True)
'''text['word count'] = text['Consumer-complaint-summary'].apply(lambda x : len(str(x).split(' ')))
text['char_len'] = text['Consumer-complaint-summary'].apply(lambda x : len(str(x)))

def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

text['avg_word'] = text['Consumer-complaint-summary'].apply(lambda x: avg_word(str(x)))
word_features = text.iloc[:, -3:].values
np.savetxt('word_features.txt', word_features)'''
text['Consumer-complaint-summary'] = text['Consumer-complaint-summary'].apply(lambda x: expandContractions(x))
text['Consumer-complaint-summary'] = text['Consumer-complaint-summary'].apply(lambda x: re.sub(r'[~`!@#$%^&*():;"{}_/?><\|.,`0-9]', '', x.replace('-', ' ')))
#text['Consumer-complaint-summary'] = text['Consumer-complaint-summary'].apply(lambda x: unidecode.unidecode(x))
text = text['Consumer-complaint-summary'].iloc[:].values
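
The character-class substitution above is easier to read against a concrete input; a quick stand-alone check on an invented summary string (not taken from the dataset):

import re

s = 'Charged $42.50 twice on 2019-03-01 -- why?!'
s = re.sub(r'[~`!@#$%^&*():;"{}_/?><\|.,`0-9]', '', s.replace('-', ' '))
print(s)  # most punctuation and every digit stripped; hyphens turned into spaces
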


# In[3]:


# detect the language of each complaint summary



"""!pip install langdetect
from langdetect import detect
languages = []