Example #1
import unittest

from Preprocessor import Preprocessor


class TestPreprocessor(unittest.TestCase):
    def setUp(self):
        self.pp = Preprocessor()

    def testPreprocessNull(self):
        # An empty string should return None
        result = self.pp.clean('')
        self.assertEqual(result, None)

    def testPreprocessOneHiragana(self):
        # A single hiragana character should return None
        result = self.pp.clean(u'あw(^^)w')
        self.assertEqual(result, None)

    def testPreprocessZenkaku(self):
        # Zenkaku (full-width) characters should be converted and cleaned
        result = self.pp.clean(u'全角です123ww')
        self.assertEqual(result, u'全角です123')

    def testPreprocessNakano(self):
        # Check that preprocessing works end to end on a mixed string
        test_string = u'私の名前は中野ですwwww>あふぉ(^^)o'
        result = self.pp.clean(test_string)
        expected = u'私の名前は中野です'
        self.assertEqual(result, expected)
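
To run the tests directly as a script, the standard unittest entry point can be appended:

if __name__ == '__main__':
    unittest.main()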
Example #2
import re

from Preprocessor import Preprocessor

# The chasen bindings, MY_NAMES and generateDict are assumed to be
# provided elsewhere in the project.


class MessageParser:
    # Pattern matching any of my names
    my_name = re.compile('|'.join(MY_NAMES))

    def __init__(self):
        self.cha = chasen
        self.pp = Preprocessor()
        # One '|$|'-terminated record per word; the six comma-separated
        # fields correspond to the keyword list used in parseSentence below
        self.cha.setformat('%m,%y,%M,%Y,%U(%P1),%BB|$|')

    def parseLine(self, message, clean=True):
        """ Parse a given line via chasen.
            If clean is True, the message is preprocessed first.
        """

        pMessage = self.pp.clean(message) if clean else message
        if pMessage:
            #print 'pmessage:%s' % pMessage
            pl = self.cha.sparse(pMessage.encode('euc-jp')).strip()
            return unicode(pl, 'euc-jp')
        else:
            print 'message is empty:' + message
            return None

    def parseSentence(self, message, clean=True):
        """ Return a list of pseudo words(=setence) from a string """
        myNameFlag, message = self.checkForMe(message)
        words = self.parseLine(message, clean)
        if words:
            sentence = []
            keywords = [
                'appeared_name', 'appeared_reading', 'base_name',
                'base_reading', 'main_type', 'sub_type'
            ]
            for word in words.split('|$|'):
                if word:
                    pseudo_word = generateDict(keywords, word.split(','))
                    sentence.append(pseudo_word)
            return self.doMergers(myNameFlag, sentence)
        else:
            return None

    def checkForMe(self, message):
        """Checks if my name is in the term """
        name_flag = False
        is_name = self.my_name.search(message)
        if is_name:
            message = re.sub(self.my_name, u'私', message)
            name_flag = True
        return name_flag, message

    def merge(self, part, first, second=None):
        """Merge words of a part depending on it's lex class
        """
        if len(part) == 1: return part
        new_part = []
        merged_flag = False
        if not second: second = first

        for i, word in enumerate(part):
            if word['main_type'] == first:
                # Check whether the next word belongs to the second class
                if i < len(part) - 1:
                    if part[i + 1]['main_type'] == second:
                        new_word = self.joinWords(word, part[i + 1])
                        part[i + 1] = new_word
                        part.pop(i)
                        return self.merge(part, first, second)
        #printWords(part)
        return part

    def joinWords(self, word1, word2):
        """Return a pseudo word joining the given 2"""
        new_word = {}
        # Concatenate the surface and base fields of both words
        for key in ('appeared_name', 'appeared_reading',
                    'base_name', 'base_reading'):
            new_word[key] = ''.join([word1[key], word2[key]])
        # The joined word takes the lexical class of the second word
        new_word['main_type'] = word2['main_type']
        new_word['sub_type'] = word2['sub_type']
        return new_word

    def doMergers(self, myNameFlag, sentence):
        """Do some post processing functions on the sentence
           Required to workaround some of the chasen limitations
        """
        # Indices of particles (助詞); enumerate avoids the duplicate-word
        # bug that sentence.index() would introduce
        jyoshi_idx = [
            i for i, x in enumerate(sentence) if x['main_type'] == u'助詞'
        ]
        # Include the last word's index so the final part is flushed too
        jyoshi_idx.append(len(sentence) - 1)

        parts = []
        w_part = []
        for i, word in enumerate(sentence):
            if i in jyoshi_idx:
                parts.append(w_part)
                parts.append([word])
                w_part = []
            else:
                if myNameFlag and word['appeared_name'] == u'私':
                    word['appeared_name'] = u'じゅな'
                    word['appeared_reading'] = u'じゅな'
                    word['base_name'] = u'じゅな'
                    word['base_reading'] = u'じゅな'
                w_part.append(word)

        # We need to iterate through the parts and merge nouns,
        # because a compound noun is quite different meaning-wise
        # from a regular noun
        for p in parts:
            p = self.merge(p, u'名詞')
            p = self.merge(p, u'未知語')
            p = self.merge(p, u'記号')

        # Finally we merge them all into a new pseudo sentence
        n_sentence = []
        for p in parts:
            n_sentence.extend(p)
        return n_sentence
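
A minimal usage sketch, assuming the chasen bindings and MY_NAMES are available as noted above (the input string is only illustrative):

parser = MessageParser()
sentence = parser.parseSentence(u'私の名前は中野です')
if sentence:
    for word in sentence:
        print word['base_name'], word['main_type']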
Example #3
from DataReader import DataReader  # assumption: sits alongside the other modules
from Preprocessor import Preprocessor
from Vectorizer import Vectorizer
from Classifier import Classifier
from DeepLearning import DeepLearner
from sklearn.model_selection import train_test_split as split
import numpy as np

dr = DataReader('./datasets/training-v1/offenseval-training-v1.tsv', 'A')
data, labels = dr.get_labelled_data()
data, labels = dr.shuffle(data, labels, 'random')

# Copy so the originals stay intact; narrow these slices to work on a subset
data = data[:]
labels = labels[:]

prp = Preprocessor('remove_stopwords', 'lemmatize')
data = prp.clean(data)

tr_data, tst_data, tr_labels, tst_labels = split(np.array(data),
                                                 labels,
                                                 test_size=0.2,
                                                 stratify=labels)
tr_data, tr_labels = dr.upsample(tr_data, tr_labels, label=1)
tr_data, tr_labels = dr.shuffle(tr_data, tr_labels, 'random')

vct = Vectorizer('count')
vct.vectorize(tr_data)

model = DeepLearner(tr_data,
                    tr_labels,
                    vocab_length=vct.vocab_length,
                    model_type='CNN')
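
Because the split above is stratified and the training half is then upsampled, a quick label-distribution check helps confirm both steps behaved as expected (a small sketch using only the names defined above):

from collections import Counter

print('train label distribution:', Counter(tr_labels))
print('test label distribution:', Counter(tst_labels))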
Example #4
# Subtask B label names: UNT (untargeted) and TIN (targeted insult/threat)
sub_b = ['UNT', 'TIN']

dr_tr = DataReader('./datasets/training-v1/offenseval-training-v1.tsv', 'B')
tr_data, tr_labels = dr_tr.get_labelled_data()
tr_data, tr_labels = dr_tr.upsample(tr_data, tr_labels, label=0)
tr_data, tr_labels = dr_tr.shuffle(tr_data, tr_labels, 'random')

dr_tst = DataReader('./datasets/test-B/testset-taskb.tsv')
tst_data, tst_ids = dr_tst.get_test_data()

# Limit the training set for a quick run
tr_data = tr_data[:500]
tr_labels = tr_labels[:500]

##### Naive Bayes - remove stopwords - tfidf
prp = Preprocessor('remove_stopwords')
tr_data_clean = prp.clean(tr_data)
tst_data_clean = prp.clean(tst_data)

vct = Vectorizer('tfidf')
tr_vectors = vct.vectorize(tr_data_clean)
tst_vectors = vct.vectorize(tst_data_clean)

clf = Classifier('M-NaiveBayes')
tuned_accs = clf.tune(tr_vectors, tr_labels,
                      {'alpha': [1, 5, 10], 'fit_prior': [True, False]},
                      best_only=False)
print('NB Tuned:', tuned_accs)

predictions = clf.predict(tst_vectors)
with open('subtask-B-test-NB.csv', 'w') as f:
    for i, tweet_id in enumerate(tst_ids):
        f.write(str(tweet_id) + ',' + sub_b[predictions[i]] + '\n')
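
The same file can also be written with the standard csv module, which takes care of quoting and line endings:

import csv

with open('subtask-B-test-NB.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    for i, tweet_id in enumerate(tst_ids):
        writer.writerow([tweet_id, sub_b[predictions[i]]])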
Example #5
# The original snippet is truncated above this point; the definitions
# below are illustrative stand-ins with the shapes the loop expects
# (module paths, names and parameter grids are assumptions):
from DataReader import DataReader
from Preprocessor import Preprocessor
from Vectorizer import Vectorizer
from Classifier import Classifier
from sklearn.model_selection import train_test_split as split

prp_list = ['remove_stopwords', 'lemmatize']
vec_list = [('BoW', {}), ('tfidf', {})]
clf_list = [('M-NaiveBayes', {'alpha': [1, 5, 10], 'fit_prior': [True, False]})]

dr = DataReader('./datasets/training-v1/offenseval-training-v1.tsv', 'A')
data, labels = dr.get_labelled_data()
data, labels = dr.shuffle(data, labels, 'random')
data, _, labels, _ = split(data, labels, test_size=0.5, stratify=labels)

clf_dict = {clf[0]: {} for clf in clf_list}

for vec in vec_list:
    print('Vectorization: ', vec[0])
    for prp in prp_list:
        print('Preprocessing: ', prp)
        preprocessor = Preprocessor(prp)
        clean_data = preprocessor.clean(data)

        vectorizer = Vectorizer(type=vec[0], params=vec[1])
        vectorized_data = vectorizer.vectorize(clean_data)

        for cl in clf_list:
            if vec[0] not in ['BoW', 'tfidf'] and cl[0] == 'M-NaiveBayes':
                continue
            print('Classifier: ', cl[0])
            clf = Classifier(cl[0])
            params_accs = clf.tune(vectorized_data,
                                   labels,
                                   cl[1],
                                   best_only=False)
            print('Scores:', params_accs)
            for key, value in params_accs.items():
                # The original snippet cuts off here; storing each
                # configuration's score is an assumed reconstruction
                clf_dict[cl[0]][(vec[0], prp, key)] = value