Esempio n. 1
0
 def test_remove_characters(self):
     """Exercise string_process.remove_characters on fixed samples.

     The first two samples call it with the text only; the last two also
     pass a second argument holding the characters expected to disappear
     from the result.
     """
     # Each entry: (positional arguments, expected return value).
     samples = [
         ((u'太萌了><',), u'太萌了'),
         ((u'!!!!!!!!!@uK3RXUYW3:',), u'@uK3RXUYW3'),
         ((u'阿里吧吧啊,是', u','), u'阿里吧吧啊是'),
         ((u'阿 里吧吧啊?是', u'? '), u'阿里吧吧啊是'),
     ]
     for call_args, expected in samples:
         cleaned = string_process.remove_characters(*call_args)
         assert cleaned == expected
Esempio n. 2
0
            message = unicode(','.join(transactions[6:transactions_size-4]),
                              encoding='utf-8',
                              errors='ignore')

            raw_message[raw_mid] = message
            src_message[src_mid].append(message)

            # extract text from current user
            # remove @somebody text
            # remove symbols
            message = message[:message.find(u'//')].lower().strip()
            message = re.sub(u'@.*?$', u'', message.strip())
            message = re.sub(u'@.*?:', u'', message.strip())
            message = re.sub(u'[a-z]+$', u'', message.strip())
            message = re.sub(u'^[a-z]+', u'', message.strip())
            message = string_process.remove_characters(message.strip())

            if len(message) == 0:
                continue

            csvwriter.writerow([raw_mid, src_mid, message.encode('utf-8')])

#            test_num -= 1
#            if test_num < 0:
#                break

#for raw in raw_message.keys():
#    print "-----------------------"
#    print "-----------------------"
#    print raw_message[raw].encode('utf-8')
#    for message in src_message[raw]: