Esempio n. 1
0
    def __init__(self, the_page):
        """Set up parser state, then immediately process ``the_page``.

        Both base classes are initialised explicitly.  Once every piece of
        bookkeeping state exists, ``the_page`` is passed to ``feed`` and
        ``parse_to_sentances`` is called, so all of that work happens
        inside the constructor.
        """
        # Explicit base-class initialisation: thread first, parser second.
        threading.Thread.__init__(self)
        HTMLParser.HTMLParser.__init__(self)

        self.sentencesplitter = SentenceSplitter()

        # Boolean state flags, all switched off to begin with.
        self.in_paragraph = self.end_of_intro = False
        self.cant_find_page = self.gotimage = False

        # Paragraph index starts at -1 so that the first <p> tag met
        # brings it to 0.
        self.num_of_para = -1

        # Two separate, initially empty accumulators.
        self.paraString, self.paraSentances = [], []

        # State is complete: run the page through the parser and
        # post-process what was collected.
        self.feed(the_page)
        self.parse_to_sentances()
Esempio n. 2
0
 def __init__(self, the_page):
     """Initialise both base classes and the parser state, then parse.

     ``the_page`` is handed to ``feed`` as soon as the state attributes
     exist, followed by a call to ``parse_to_sentances``; nothing is
     deferred to a later method call.
     """
     # Neither base initialiser is reached through super(); call each one.
     threading.Thread.__init__(self)
     HTMLParser.HTMLParser.__init__(self)

     self.sentencesplitter = SentenceSplitter()

     # -1 means "no paragraph yet": the first <p> tag met makes it 0.
     self.num_of_para = -1
     self.paraString, self.paraSentances = [], []

     # Every flag starts out False.
     self.in_paragraph = self.end_of_intro = False
     self.cant_find_page = self.gotimage = False

     # All state is in place; process the page now.
     self.feed(the_page)
     self.parse_to_sentances()
Esempio n. 3
0
class content(HTMLParser.HTMLParser, threading.Thread):
    """Collect paragraph text from an HTML page and hand out random facts.

    The page is parsed eagerly in the constructor.  The text of every
    ``<p>`` element is accumulated in ``paraString`` and then split into
    sentences (``paraSentances``).  The first ``<img>`` whose ``src`` ends
    in ``jpg`` is downloaded to ``didyouknow_tmp.img``.
    """

    def __init__(self, the_page):
        """Initialise parser state and parse ``the_page`` immediately.

        Args:
            the_page: HTML markup passed to ``feed``.
        """
        threading.Thread.__init__(self)
        HTMLParser.HTMLParser.__init__(self)
        self.sentencesplitter = SentenceSplitter()
        self.in_paragraph = False
        self.num_of_para = -1  # becomes 0 when the first <p> tag is met
        self.paraString = []
        self.paraSentances = []
        self.end_of_intro = False
        self.cant_find_page = False
        self.gotimage = False

        self.feed(the_page)
        self.parse_to_sentances()

    def handle_starttag(self, tag, attrs):
        """React to an opening tag.

        ``<p>`` opens a new, empty paragraph slot.  A ``<table>`` carrying
        ``id="toc"`` sets ``end_of_intro``.  An ``<img>`` triggers the
        image download until one image has been accepted.

        Args:
            tag: Tag name as reported by the parser.
            attrs: List of ``(name, value)`` pairs; may be empty.
        """
        if tag == 'p':
            self.in_paragraph = True
            self.num_of_para += 1
            self.paraString.append('')
        elif tag == 'table':
            # Look through every attribute rather than indexing attrs[0]:
            # an attribute-less <table> must not raise IndexError and the
            # id need not be the first attribute.
            if ('id', 'toc') in attrs:
                self.end_of_intro = True
        elif tag == 'img' and not self.gotimage:
            for attr_name, src in attrs:
                if attr_name != 'src':
                    continue
                print(src)
                # A valueless attribute yields None; treat it like any
                # other non-jpg source and give up on this tag.
                if not src or not src.endswith('jpg'):
                    print('breaking')
                    break
                print('good pic')
                self.gotimage = True
                print(src)
                self._save_image(src)
                break

    def _save_image(self, src):
        """Download ``src`` and write the raw bytes to didyouknow_tmp.img."""
        headers = {'User-Agent': config.USER_AGENT}
        req = urllib2.Request(src, None, headers)
        response = urllib2.urlopen(req)
        try:
            image = response.read()
        finally:
            response.close()
        # Binary mode keeps the image bytes intact on every platform, and
        # the with-block guarantees the file is actually closed.
        with open('didyouknow_tmp.img', 'wb') as imagefile:
            imagefile.write(image)

    def handle_endtag(self, tag):
        """Close the current paragraph and discard it if it has no text."""
        if tag != 'p':
            return
        self.in_paragraph = False
        # The emptiness test on the list itself protects against a stray
        # </p> arriving when no paragraph slot exists.
        if self.paraString and not self.paraString[self.num_of_para]:
            self.num_of_para -= 1
            self.paraString.pop()

    def handle_data(self, data):
        """Append text to the open paragraph and spot the missing-page notice."""
        if data == 'Wikipedia does not have an article with this exact name.':
            print("cant find wikipedia page")
            self.no_page()

        if self.in_paragraph:
            self.paraString[self.num_of_para] += data

    def parse_to_sentances(self):
        """Split each collected paragraph with the sentence splitter."""
        split = self.sentencesplitter.split
        self.paraSentances.extend(split(para) for para in self.paraString)

    def get_random_fact(self):
        """Return a random sentence with half of each neighbouring sentence.

        Returns:
            ``"Cant find info on wikipedia"`` when the page was flagged as
            missing or no sentence is available.  Otherwise a randomly
            chosen sentence, preceded by the second half of the previous
            sentence and followed by the first half of the next one, where
            those neighbours exist in the same paragraph.
        """
        if self.cant_find_page:
            return "Cant find info on wikipedia"

        # Paragraphs without sentences cannot be sampled; skipping them
        # avoids random.randint(0, -1) raising ValueError.
        candidates = [para for para in self.paraSentances if len(para)]
        if not candidates:
            return "Cant find info on wikipedia"

        sentences = random.choice(candidates)
        index = random.randrange(len(sentences))

        parts = []
        if index > 0:
            parts.append(self.half_sentence(sentences[index - 1], 'begin'))
        parts.append(sentences[index])
        if index < len(sentences) - 1:
            parts.append(self.half_sentence(sentences[index + 1], 'end'))
        return ' '.join(parts)

    def half_sentence(self, sentence, side='begin'):
        """Return half of ``sentence``'s words with an ellipsis marker.

        Args:
            sentence: Text split on single spaces.
            side: ``'end'`` keeps the first half followed by ``'... '``;
                ``'begin'`` keeps the second half preceded by ``' ...'``.
                Any other value returns the words re-joined unchanged.
        """
        words = sentence.split(' ')
        middle = len(words) // 2  # explicit floor division
        if side == 'end':
            words = words[:middle] + ['... ']
        elif side == 'begin':
            words = [' ...'] + words[middle:]
        return ' '.join(words)

    def no_page(self):
        """Record that the requested page does not exist."""
        self.cant_find_page = True
Esempio n. 4
0
class content(HTMLParser.HTMLParser, threading.Thread):
    """Collect paragraph text from an HTML page and hand out random facts.

    The page is parsed eagerly in the constructor.  The text of every
    ``<p>`` element is accumulated in ``paraString`` and then split into
    sentences (``paraSentances``).  The first ``<img>`` whose ``src`` ends
    in ``jpg`` is downloaded to ``didyouknow_tmp.img``.
    """

    def __init__(self, the_page):
        """Initialise parser state and parse ``the_page`` immediately.

        Args:
            the_page: HTML markup passed to ``feed``.
        """
        threading.Thread.__init__(self)
        HTMLParser.HTMLParser.__init__(self)
        self.sentencesplitter = SentenceSplitter()
        self.in_paragraph = False
        self.num_of_para = -1  # becomes 0 when the first <p> tag is met
        self.paraString = []
        self.paraSentances = []
        self.end_of_intro = False
        self.cant_find_page = False
        self.gotimage = False

        self.feed(the_page)
        self.parse_to_sentances()

    def handle_starttag(self, tag, attrs):
        """React to an opening tag.

        ``<p>`` opens a new, empty paragraph slot.  A ``<table>`` carrying
        ``id="toc"`` sets ``end_of_intro``.  An ``<img>`` triggers the
        image download until one image has been accepted.

        Args:
            tag: Tag name as reported by the parser.
            attrs: List of ``(name, value)`` pairs; may be empty.
        """
        if tag == 'p':
            self.in_paragraph = True
            self.num_of_para += 1
            self.paraString.append('')
        elif tag == 'table':
            # Look through every attribute rather than indexing attrs[0]:
            # an attribute-less <table> must not raise IndexError and the
            # id need not be the first attribute.
            if ('id', 'toc') in attrs:
                self.end_of_intro = True
        elif tag == 'img' and not self.gotimage:
            for attr_name, src in attrs:
                if attr_name != 'src':
                    continue
                print(src)
                # A valueless attribute yields None; treat it like any
                # other non-jpg source and give up on this tag.
                if not src or not src.endswith('jpg'):
                    print('breaking')
                    break
                print('good pic')
                self.gotimage = True
                print(src)
                self._save_image(src)
                break

    def _save_image(self, src):
        """Download ``src`` and write the raw bytes to didyouknow_tmp.img."""
        headers = {'User-Agent': config.USER_AGENT}
        req = urllib2.Request(src, None, headers)
        response = urllib2.urlopen(req)
        try:
            image = response.read()
        finally:
            response.close()
        # Binary mode keeps the image bytes intact on every platform, and
        # the with-block guarantees the file is actually closed.
        with open('didyouknow_tmp.img', 'wb') as imagefile:
            imagefile.write(image)

    def handle_endtag(self, tag):
        """Close the current paragraph and discard it if it has no text."""
        if tag != 'p':
            return
        self.in_paragraph = False
        # The emptiness test on the list itself protects against a stray
        # </p> arriving when no paragraph slot exists.
        if self.paraString and not self.paraString[self.num_of_para]:
            self.num_of_para -= 1
            self.paraString.pop()

    def handle_data(self, data):
        """Append text to the open paragraph and spot the missing-page notice."""
        if data == 'Wikipedia does not have an article with this exact name.':
            print("cant find wikipedia page")
            self.no_page()

        if self.in_paragraph:
            self.paraString[self.num_of_para] += data

    def parse_to_sentances(self):
        """Split each collected paragraph with the sentence splitter."""
        split = self.sentencesplitter.split
        self.paraSentances.extend(split(para) for para in self.paraString)

    def get_random_fact(self):
        """Return a random sentence with half of each neighbouring sentence.

        Returns:
            ``"Cant find info on wikipedia"`` when the page was flagged as
            missing or no sentence is available.  Otherwise a randomly
            chosen sentence, preceded by the second half of the previous
            sentence and followed by the first half of the next one, where
            those neighbours exist in the same paragraph.
        """
        if self.cant_find_page:
            return "Cant find info on wikipedia"

        # Paragraphs without sentences cannot be sampled; skipping them
        # avoids random.randint(0, -1) raising ValueError.
        candidates = [para for para in self.paraSentances if len(para)]
        if not candidates:
            return "Cant find info on wikipedia"

        sentences = random.choice(candidates)
        index = random.randrange(len(sentences))

        parts = []
        if index > 0:
            parts.append(self.half_sentence(sentences[index - 1], 'begin'))
        parts.append(sentences[index])
        if index < len(sentences) - 1:
            parts.append(self.half_sentence(sentences[index + 1], 'end'))
        return ' '.join(parts)

    def half_sentence(self, sentence, side='begin'):
        """Return half of ``sentence``'s words with an ellipsis marker.

        Args:
            sentence: Text split on single spaces.
            side: ``'end'`` keeps the first half followed by ``'... '``;
                ``'begin'`` keeps the second half preceded by ``' ...'``.
                Any other value returns the words re-joined unchanged.
        """
        words = sentence.split(' ')
        middle = len(words) // 2  # explicit floor division
        if side == 'end':
            words = words[:middle] + ['... ']
        elif side == 'begin':
            words = [' ...'] + words[middle:]
        return ' '.join(words)

    def no_page(self):
        """Record that the requested page does not exist."""
        self.cant_find_page = True