Example 1
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize


def preprocessing(conversations):
    """
    Word stemming and stop-word removal
    """
    # Build the stemmer, punctuation set, and stop-word set once, not per token
    stemmer = SnowballStemmer("english")
    removal = set("?!.,()")
    stop_words = set(stopwords.words('english'))
    stop_words.update(("'s", "n't", "'m", "'ve", "'re", "'d", "'"))

    pred_conversations = []
    for conversation in conversations:
        # Keep only ASCII characters
        conversation = ''.join(ch for ch in conversation if ord(ch) < 128)
        sentence = []
        # Tokenize into words
        for word in word_tokenize(conversation):
            # Strip punctuation characters
            pred_word = ''.join(ch for ch in word if ch not in removal)
            # Lowercase, stem, and drop stop words
            if pred_word:
                pred_word = stemmer.stem(pred_word.lower())
                if pred_word not in stop_words:
                    sentence.append(pred_word)
        pred_conversations.append(sentence)

    return pred_conversations
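
A minimal usage sketch; the sample sentences are invented for illustration, and NLTK's punkt tokenizer and stopwords corpus are assumed to be downloaded:

conversations = [
    "I've been running all day, haven't I?",
    "Cats and dogs were playing in the garden.",
]
print(preprocessing(conversations))
# Roughly: [['run', 'day'], ['cat', 'dog', 'play', 'garden']]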
Example 2
import re
from string import punctuation

import pandas as pd
from nltk.stem.snowball import SnowballStemmer

# Build the stemmer once instead of on every call
stemmer = SnowballStemmer('english')


def morphy_stem(word):
    """
    Simple stemmer
    """
    # stem = wn.morphy(word)
    # stem = lemma.lemmatize(stem2)
    stem = stemmer.stem(word)
    if stem:
        return stem.lower()
    else:
        return word.lower()
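
# Quick sanity check (assumed Snowball stemmer behavior):
#   morphy_stem('Running')  # -> 'run'
#   morphy_stem('flies')    # -> 'fli'
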
def get_data(name_train, name_test_closed, name_test_open):
    # NOTE: name_test_closed is accepted but never used by this function
    PATH_TRAINING = name_train
    PATH_TEST = name_test_open

    train_set = pd.read_csv(PATH_TRAINING,
                            header=0,
                            sep=',',
                            quotechar='"',
                            names=['autor', 'gender', 'age', 'text'])
    test_set = pd.read_csv(PATH_TEST,
                           header=0,
                           sep=',',
                           quotechar='"',
                           names=['autor', 'gender', 'age', 'text'])

    # train_set = train_set.groupby('autor').agg({'gender': 'first',
    #                                             'age': 'first',
    #                                             'text': ' '.join}).reset_index()

    # Punctuation to strip; hyphens and apostrophes are kept inside words
    punct_cleaned = set(punctuation.replace("-", "").replace("'", ""))

    def clean_text(old_text):
        # Split into sentences on terminal punctuation
        sentences = re.split('[.!?]', old_text)
        words_clean = []
        for sentence in sentences:
            for word in sentence.split():
                if word not in punct_cleaned:
                    # Strip punctuation, then lowercase and stem the word
                    word = ''.join(ch for ch in word
                                   if ch not in punct_cleaned)
                    word = stemmer.stem(word.lower())
                    # Map purely numeric tokens to a placeholder
                    if word.isnumeric():
                        word = "NUMBER"
                    words_clean.append(word)
        return ' '.join(words_clean)

    # Same cleaning for both sets; .apply avoids chained-assignment issues
    train_set["text"] = train_set["text"].apply(clean_text)
    test_set["text"] = test_set["text"].apply(clean_text)

    return train_set, test_set
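
A quick usage sketch; the CSV file names below are hypothetical, and each file is expected to hold autor, gender, age, and text columns:

train_set, test_set = get_data('train.csv', 'test_closed.csv', 'test_open.csv')
print(train_set['text'].head())  # lowercased, stemmed text, numbers mapped to "NUMBER"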