Code example #1
 def test_text_preprocessing(self):
     text = "This is a sample text. # ! . "
     analyser = TextAnalyser()
     processed = analyser._preprocess(text)
     expected = ('This is a sample text. # ! . ', ['sampl', 'text'], [('sampl', 1), ('text', 1)])
     self.assertEqual(expected, processed)
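The expected tuple above implies that _preprocess returns the untouched raw string, the stopword-free stemmed tokens, and a (token, count) frequency list. A minimal sketch of a pipeline producing exactly that output, assuming NLTK stopword filtering and Porter stemming (the helper name and every implementation detail here are assumptions; pythia's actual _preprocess may differ):

import re
from collections import Counter

from nltk.corpus import stopwords   # assumes the NLTK stopword corpus is available
from nltk.stem import PorterStemmer

def preprocess_sketch(text):
    stemmer = PorterStemmer()
    stop = set(stopwords.words('english'))
    words = re.findall(r"[a-zA-Z]+", text.lower())              # drops '#', '!' and '.'
    tokens = [stemmer.stem(w) for w in words if w not in stop]  # removes 'this', 'is', 'a'
    return text, tokens, Counter(tokens).most_common()

# preprocess_sketch("This is a sample text. # ! . ")
# -> ('This is a sample text. # ! . ', ['sampl', 'text'], [('sampl', 1), ('text', 1)])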
Code example #2
    def test_tokenization(self):
        expected, sample_docs, objects = get_test_documents()
        calculated = {}
        analyser = TextAnalyser()
        id = 0
        for s in sample_docs:
            d = analyser.add_document(s)
            calculated[str(id)] = d
            id += 1

        self.assertEqual(expected, calculated)
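This test builds its expectations from a get_test_documents helper and keys them by the stringified document id. Judging from the keys the crawler examples below read out of add_document's return value ('raw', 'tokens', 'word_frequencies'), the fixture could be shaped roughly like this sketch; the real helper may return more fields and different documents:

def get_test_documents_sketch():
    # one raw document plus the analysed form add_document is expected to produce
    sample_docs = ["This is a sample text."]
    expected = {
        "0": {
            "raw": "This is a sample text.",
            "tokens": ["sampl", "text"],
            "word_frequencies": [("sampl", 1), ("text", 1)],
        },
    }
    objects = []  # placeholder for whatever extra fixtures the real helper builds
    return expected, sample_docs, objects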
Code example #3
 def test_tokenization(self):
     expected, sample_docs, objects = get_test_documents()
     calculated = {}
     analyser = TextAnalyser()
     id = 0
     for s in sample_docs:
         d = analyser.add_document(s)
         calculated[str(id)] = d
         id += 1

     self.assertEqual(expected, calculated)
Code example #4
File: TopsyCrawler.py  Project: aurora1625/pythia
 def crawl(self, only_english=False):
     '''
     Performs the actual crawling. 
     '''
     text_analyser = TextAnalyser(ngram=1, only_english=only_english)
     exception_log = []
     kw = otter.loadrc() # load api key
     count = 0
     while self.maxtime != self.to_date:
         for page in range(PAGE_SIZE):        
             try:
                 search = otter.Resource('search', **kw)
                 #search(q='#jan25 OR #egypt OR #tahrir', mintime = time.mktime(mintime.timetuple()), maxtime = time.mktime(maxtime.timetuple()), type='tweet', offset=page*10)
                 search(q=self.keywords, mintime = time.mktime(self.from_date.timetuple()), maxtime = time.mktime(self.maxtime.timetuple()), type='tweet', perpage=100, page=page+1)
                 for item in search.response.list:
                     print "--------------------------------------------------------------------------"
                     print "Storing tweet #",count, "for the period",self.from_date,"until",self.maxtime 
                     tt = self.type()
                     tt.url = item.url
                     analysed = text_analyser.add_document(item.content)
                     #if this tweet is really small just ignore it. 
                     if len(analysed['tokens']) <= 3: 
                         print"Ignoring this tweet"
                         continue
                     content = Content()
                     content.raw = analysed['raw']
                     content.tokens = analysed['tokens']
                     content.construct_word_freq_list(analysed['word_frequencies'])
                     content.date = self.from_date
                     tt.content = content
                     tt.date = self.from_date
                     tt.retweet_count = item.trackback_total
                     tt.screen_name = item.trackback_author_nick
                     tt.author_screen_name = item.trackback_author_nick
                     tt.author_name = item.trackback_author_name                        
                     tt.save(safe=True)
                     
                     if len(Author.objects(screen_name=item.trackback_author_nick)) == 0:
                         
                         author = Author()
                         author.screen_name = item.trackback_author_nick
                         author.tweets.append(tt)
                     else:
                         author = Author.objects(screen_name=item.trackback_author_nick).get()
                         author.tweets.append(tt)
                     author.save()
                     
                     count += 1                             
             except Exception, e:
                 print e
                 exception_log.append(e)
             finally:
                 pass          
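The loop stores each tweet through ORM-style document classes; the Author.objects(...) queries and save(safe=True) calls suggest mongoengine models. A rough sketch of classes consistent with the attributes the crawler sets (only the attribute names come from the code above; the base classes, field types, the word_frequencies field name and the construct_word_freq_list body are assumptions):

import mongoengine

class Content(mongoengine.EmbeddedDocument):
    # fields the crawler assigns: raw text, token list, word frequencies, date
    raw = mongoengine.StringField()
    tokens = mongoengine.ListField(mongoengine.StringField())
    word_frequencies = mongoengine.ListField()
    date = mongoengine.DateTimeField()

    def construct_word_freq_list(self, frequencies):
        # keep the (token, count) pairs produced by TextAnalyser
        self.word_frequencies = list(frequencies)

class Author(mongoengine.Document):
    # the crawler looks authors up by screen_name and appends tweets to them
    screen_name = mongoengine.StringField()
    tweets = mongoengine.ListField(mongoengine.GenericReferenceField())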
Code example #5
File: TopsyCrawler.py  Project: nihaofuyue0617/pythia
    def crawl(self, only_english=False):
        '''
        Performs the actual crawling. 
        '''
        text_analyser = TextAnalyser(ngram=1, only_english=only_english)
        exception_log = []
        kw = otter.loadrc()  # load api key
        count = 0
        while self.maxtime != self.to_date:
            for page in range(PAGE_SIZE):
                try:
                    search = otter.Resource('search', **kw)
                    #search(q='#jan25 OR #egypt OR #tahrir', mintime = time.mktime(mintime.timetuple()), maxtime = time.mktime(maxtime.timetuple()), type='tweet', offset=page*10)
                    search(q=self.keywords,
                           mintime=time.mktime(self.from_date.timetuple()),
                           maxtime=time.mktime(self.maxtime.timetuple()),
                           type='tweet',
                           perpage=100,
                           page=page + 1)
                    for item in search.response.list:
                        print "--------------------------------------------------------------------------"
                        print "Storing tweet #", count, "for the period", self.from_date, "until", self.maxtime
                        tt = self.type()
                        tt.url = item.url
                        analysed = text_analyser.add_document(item.content)
                        #if this tweet is really small just ignore it.
                        if len(analysed['tokens']) <= 3:
                            print "Ignoring this tweet"
                            continue
                        content = Content()
                        content.raw = analysed['raw']
                        content.tokens = analysed['tokens']
                        content.construct_word_freq_list(
                            analysed['word_frequencies'])
                        content.date = self.from_date
                        tt.content = content
                        tt.date = self.from_date
                        tt.retweet_count = item.trackback_total
                        tt.screen_name = item.trackback_author_nick
                        tt.author_screen_name = item.trackback_author_nick
                        tt.author_name = item.trackback_author_name
                        tt.save(safe=True)

                        if len(Author.objects(screen_name=item.trackback_author_nick)) == 0:

                            author = Author()
                            author.screen_name = item.trackback_author_nick
                            author.tweets.append(tt)
                        else:
                            author = Author.objects(
                                screen_name=item.trackback_author_nick).get()
                            author.tweets.append(tt)
                        author.save()

                        count += 1
                except Exception, e:
                    print e
                    exception_log.append(e)
                finally:
                    pass
Code example #6
 def test_unicode_doc_translation(self):
     expected, document = get_unicode_document()
     analyser = TextAnalyser()
     document = analyser.add_document(document)
     self.assertEqual(expected, document["raw"])
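The assertion only compares document["raw"] against a fixture, so the snippet does not show what the "translation" actually does. If it means transliterating non-ASCII characters to ASCII, a minimal standard-library sketch would look like this (purely an assumption; pythia's TextAnalyser may simply decode UTF-8 instead):

import unicodedata

def to_ascii_sketch(text):
    # decompose accented characters, then drop the combining marks
    decomposed = unicodedata.normalize('NFKD', text)
    return decomposed.encode('ascii', 'ignore').decode('ascii')

# to_ascii_sketch(u"caf\u00e9")  ->  'cafe'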
Code example #7
'''
Created on 1 Feb 2012

@author: george
'''
import unittest
from analysis.social import TwitterSocialAnalyser
from analysis.text import TextAnalyser
from collections import OrderedDict

tweet_with_RT = "RT @monaeltahawy: RT @Gheblawi Beyond belief: religious history &amp; make-up of #Egypt interesting discussion #Copts http://www.bbc.co.uk/podcasts/series/belief"
tweet_with_VIA = "Breaking News - Messi spotted outside the Etihad #transferdeadlineday http://twitpic.com/8dwcum (via @AndrewBloch )"
not_a_retweet = "This is not a retweet #test"
tweet_with_almost_RT = "RT Beyond belief: religious history &amp; make-up of #Egypt interesting discussion #Copts http://www.bbc.co.uk/podcasts/series/belief"
tweets = [tweet_with_RT, tweet_with_VIA, not_a_retweet, tweet_with_almost_RT]

t = TextAnalyser()
dataset = OrderedDict()
id = 0
for tweet in tweets:
    d = t.add_document(tweet)
    dataset[id] = d
    id += 1
    
class Test(unittest.TestCase):
    def test_retweet_filter(self):
        tsa = TwitterSocialAnalyser(dataset)
        result = tsa.filter_retweets()
        expected = []
        expected.append((0, t.add_document(tweet_with_RT)))
        expected.append((1, t.add_document(tweet_with_VIA)))
        self.assertEqual(result, OrderedDict(expected))
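The expected result keeps only the tweets with an explicit "RT @user" or "via @user" attribution, while the bare "RT ..." without a mention is dropped, so filter_retweets evidently requires an attributed user. A minimal detector consistent with those four cases (a sketch only; the real TwitterSocialAnalyser operates on the analysed documents and may use a different rule):

import re

# matches "RT @user" or "via @user" anywhere in the text
_RETWEET_PATTERN = re.compile(r"\bRT\s+@\w+|\bvia\s+@\w+", re.IGNORECASE)

def looks_like_retweet_sketch(text):
    return bool(_RETWEET_PATTERN.search(text))

# looks_like_retweet_sketch(tweet_with_RT)         -> True
# looks_like_retweet_sketch(tweet_with_VIA)        -> True
# looks_like_retweet_sketch(not_a_retweet)         -> False
# looks_like_retweet_sketch(tweet_with_almost_RT)  -> False (no @mention after RT)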
Code example #8
 def test_text_preprocessing(self):
     text = "This is a sample text. # ! . "
     analyser = TextAnalyser()
     processed = analyser._preprocess(text)
     expected = ('This is a sample text. # ! . ', ['sampl', 'text'], [('sampl', 1), ('text', 1)])
     self.assertEqual(expected, processed)
Code example #9
 def test_unicode_doc_translation(self):
     expected, document = get_unicode_document()
     analyser = TextAnalyser()
     document = analyser.add_document(document)
     self.assertEqual(expected, document["raw"])
Code example #10
'''
Created on 1 Feb 2012

@author: george
'''
import unittest
from analysis.social import TwitterSocialAnalyser
from analysis.text import TextAnalyser
from collections import OrderedDict

tweet_with_RT = "RT @monaeltahawy: RT @Gheblawi Beyond belief: religious history &amp; make-up of #Egypt interesting discussion #Copts http://www.bbc.co.uk/podcasts/series/belief"
tweet_with_VIA = "Breaking News - Messi spotted outside the Etihad #transferdeadlineday http://twitpic.com/8dwcum (via @AndrewBloch )"
not_a_retweet = "This is not a retweet #test"
tweet_with_almost_RT = "RT Beyond belief: religious history &amp; make-up of #Egypt interesting discussion #Copts http://www.bbc.co.uk/podcasts/series/belief"
tweets = [tweet_with_RT, tweet_with_VIA, not_a_retweet, tweet_with_almost_RT]

t = TextAnalyser()
dataset = OrderedDict()
id = 0
for tweet in tweets:
    d = t.add_document(tweet)
    dataset[id] = d
    id += 1


class Test(unittest.TestCase):
    def test_retweet_filter(self):
        tsa = TwitterSocialAnalyser(dataset)
        result = tsa.filter_retweets()
        expected = []
        expected.append((0, t.add_document(tweet_with_RT)))
        expected.append((1, t.add_document(tweet_with_VIA)))
        self.assertEqual(result, OrderedDict(expected))