def __init__(self, the_page):
    """Set up parser state, then parse ``the_page`` immediately.

    Both base classes are initialised explicitly, the per-instance
    bookkeeping is reset, and ``the_page`` is fed to the HTML parser.
    Once feeding is done the collected paragraph strings are split
    into sentences via ``parse_to_sentances``.

    Args:
        the_page: HTML markup handed to ``self.feed``.
    """
    threading.Thread.__init__(self)
    HTMLParser.HTMLParser.__init__(self)
    self.sentencesplitter = SentenceSplitter()

    # Paragraph bookkeeping. The index starts at -1 so that the first
    # <p> tag encountered moves it to 0.
    self.paraString = []
    self.paraSentances = []
    self.num_of_para = -1
    self.in_paragraph = False

    # Status flags, all initially cleared.
    self.gotimage = self.cant_find_page = self.end_of_intro = False

    # Parsing happens as part of construction.
    self.feed(the_page)
    self.parse_to_sentances()
def __init__(self, the_page):
    """Initialise the parser/thread bases and parse ``the_page`` at once.

    After the base-class initialisers run, the instance state is reset
    and ``the_page`` is passed to ``self.feed``; the gathered paragraph
    text is then split into sentences by ``parse_to_sentances``.

    Args:
        the_page: HTML markup handed to ``self.feed``.
    """
    threading.Thread.__init__(self)
    HTMLParser.HTMLParser.__init__(self)
    self.sentencesplitter = SentenceSplitter()

    # Status flags, all initially cleared.
    self.end_of_intro = self.cant_find_page = self.gotimage = False

    # Paragraph bookkeeping; -1 means "no <p> seen yet", the first <p>
    # tag bumps the index to 0.
    self.in_paragraph = False
    self.num_of_para = -1
    self.paraString = []
    self.paraSentances = []

    # Construction also performs the parse.
    self.feed(the_page)
    self.parse_to_sentances()
class content(HTMLParser.HTMLParser, threading.Thread):
    """HTML parser that collects paragraph text and serves random excerpts.

    The page passed to the constructor is parsed immediately. Text found
    inside ``<p>`` elements is accumulated per paragraph in
    ``paraString`` and afterwards split into sentences (``paraSentances``)
    with ``SentenceSplitter``. The first ``<img>`` whose ``src`` ends in
    ``jpg`` is downloaded to ``didyouknow_tmp.img``.
    """

    def __init__(self, the_page):
        """Reset state, parse ``the_page`` and split paragraphs.

        Args:
            the_page: HTML markup handed to ``self.feed``.
        """
        threading.Thread.__init__(self)
        HTMLParser.HTMLParser.__init__(self)
        self.sentencesplitter = SentenceSplitter()
        self.in_paragraph = False
        self.num_of_para = -1  # the first <p> tag moves this to 0
        self.paraString = []
        self.paraSentances = []
        # Set when the table with id "toc" is seen; recorded only, it is
        # not consulted when collecting paragraph text.
        self.end_of_intro = False
        self.cant_find_page = False
        self.gotimage = False
        self.feed(the_page)
        self.parse_to_sentances()

    def handle_starttag(self, tag, attrs):
        """Track paragraph starts, the "toc" table and the first jpg image.

        Args:
            tag: Tag name supplied by the parser.
            attrs: List of ``(name, value)`` attribute pairs.
        """
        if tag == 'p':
            self.in_paragraph = True
            self.num_of_para = self.num_of_para + 1
            self.paraString.append('')
        if tag == 'table':
            # Look at every attribute: a table without attributes must not
            # raise, and the id need not be the first attribute.
            if ('id', 'toc') in attrs:
                self.end_of_intro = True
        if tag == 'img' and not self.gotimage:
            for desc in attrs:
                if desc[0] == 'src':
                    src = desc[1]
                    print(src)
                    if src[-3:] != 'jpg':
                        print('breaking')
                        break
                    print('good pic')
                    self.gotimage = True
                    print(src)
                    self._save_image(src)

    def _save_image(self, src):
        """Download ``src`` and store the bytes in ``didyouknow_tmp.img``."""
        headers = {'User-Agent': config.USER_AGENT}
        req = urllib2.Request(src, None, headers)
        response = urllib2.urlopen(req)
        try:
            image = response.read()
        finally:
            response.close()
        # Binary mode: the payload is image data, and the context manager
        # guarantees the file is actually closed.
        with open('didyouknow_tmp.img', 'wb') as imagefile:
            imagefile.write(image)

    def handle_endtag(self, tag):
        """Close the current paragraph and drop it if it stayed empty.

        Args:
            tag: Tag name supplied by the parser.
        """
        if tag == 'p':
            self.in_paragraph = False
            # A stray </p> before any <p> leaves nothing to inspect.
            if self.paraString and not self.paraString[self.num_of_para]:
                self.num_of_para = self.num_of_para - 1
                self.paraString.pop()

    def handle_data(self, data):
        """Append text to the open paragraph and detect the missing-page text.

        Args:
            data: Text chunk supplied by the parser.
        """
        if data == 'Wikipedia does not have an article with this exact name.':
            print("cant find wikipedia page")
            self.no_page()
        if self.in_paragraph:
            self.paraString[self.num_of_para] = (
                self.paraString[self.num_of_para] + data)

    def parse_to_sentances(self):
        """Split every collected paragraph string into sentences."""
        for s in self.paraString:
            self.paraSentances.append(self.sentencesplitter.split(s))

    def get_random_fact(self):
        """Return a random sentence with halves of its neighbours attached.

        Returns:
            The chosen sentence, preceded by the second half of the previous
            sentence and followed by the first half of the next one when
            those exist. ``"Cant find info on wikipedia"`` is returned when
            the page was flagged as missing or no sentence was collected.
        """
        if self.cant_find_page:
            return "Cant find info on wikipedia"
        # Paragraphs without sentences cannot be sampled from.
        candidates = [p for p in self.paraSentances if len(p)]
        if not candidates:
            return "Cant find info on wikipedia"
        sentences = random.choice(candidates)
        last = len(sentences) - 1
        picked = random.randint(0, last)
        fact = ''
        if picked > 0:
            fact = self.half_sentence(sentences[picked - 1], 'begin') + ' '
        fact = fact + sentences[picked]
        if picked < last:
            fact = fact + ' ' + self.half_sentence(sentences[picked + 1], 'end')
        return fact

    def half_sentence(self, sentence, side='begin'):
        """Return half of ``sentence`` (split on spaces) with an ellipsis.

        Args:
            sentence: Text to cut.
            side: ``'end'`` keeps the first half and appends ``'... '``;
                ``'begin'`` keeps the second half and prepends ``' ...'``.
                Any other value returns the words re-joined unchanged.
        """
        wa = sentence.split(' ')
        word_count = len(wa)
        # Floor division keeps the slice index an integer.
        half = word_count // 2
        if side == 'end':
            wa = wa[0:half] + ['... ']
        elif side == 'begin':
            wa = [' ...'] + wa[half:word_count]
        return ' '.join(wa)

    def no_page(self):
        """Flag that the requested page does not exist."""
        self.cant_find_page = True
class content(HTMLParser.HTMLParser, threading.Thread):
    """HTML parser that collects paragraph text and serves random excerpts.

    The page passed to the constructor is parsed immediately. Text found
    inside ``<p>`` elements is accumulated per paragraph in
    ``paraString`` and afterwards split into sentences (``paraSentances``)
    with ``SentenceSplitter``. The first ``<img>`` whose ``src`` ends in
    ``jpg`` is downloaded to ``didyouknow_tmp.img``.
    """

    def __init__(self, the_page):
        """Reset state, parse ``the_page`` and split paragraphs.

        Args:
            the_page: HTML markup handed to ``self.feed``.
        """
        threading.Thread.__init__(self)
        HTMLParser.HTMLParser.__init__(self)
        self.sentencesplitter = SentenceSplitter()
        self.in_paragraph = False
        self.num_of_para = -1  # the first <p> tag moves this to 0
        self.paraString = []
        self.paraSentances = []
        # Set when the table with id "toc" is seen; recorded only, it is
        # not consulted when collecting paragraph text.
        self.end_of_intro = False
        self.cant_find_page = False
        self.gotimage = False
        self.feed(the_page)
        self.parse_to_sentances()

    def handle_starttag(self, tag, attrs):
        """Track paragraph starts, the "toc" table and the first jpg image.

        Args:
            tag: Tag name supplied by the parser.
            attrs: List of ``(name, value)`` attribute pairs.
        """
        if tag == 'p':
            self.in_paragraph = True
            self.num_of_para = self.num_of_para + 1
            self.paraString.append('')
        if tag == 'table':
            # Look at every attribute: a table without attributes must not
            # raise, and the id need not be the first attribute.
            if ('id', 'toc') in attrs:
                self.end_of_intro = True
        if tag == 'img' and not self.gotimage:
            for desc in attrs:
                if desc[0] == 'src':
                    src = desc[1]
                    print(src)
                    if src[-3:] != 'jpg':
                        print('breaking')
                        break
                    print('good pic')
                    self.gotimage = True
                    print(src)
                    self._save_image(src)

    def _save_image(self, src):
        """Download ``src`` and store the bytes in ``didyouknow_tmp.img``."""
        headers = {'User-Agent': config.USER_AGENT}
        req = urllib2.Request(src, None, headers)
        response = urllib2.urlopen(req)
        try:
            image = response.read()
        finally:
            response.close()
        # Binary mode: the payload is image data, and the context manager
        # guarantees the file is actually closed.
        with open('didyouknow_tmp.img', 'wb') as imagefile:
            imagefile.write(image)

    def handle_endtag(self, tag):
        """Close the current paragraph and drop it if it stayed empty.

        Args:
            tag: Tag name supplied by the parser.
        """
        if tag == 'p':
            self.in_paragraph = False
            # A stray </p> before any <p> leaves nothing to inspect.
            if self.paraString and not self.paraString[self.num_of_para]:
                self.num_of_para = self.num_of_para - 1
                self.paraString.pop()

    def handle_data(self, data):
        """Append text to the open paragraph and detect the missing-page text.

        Args:
            data: Text chunk supplied by the parser.
        """
        if data == 'Wikipedia does not have an article with this exact name.':
            print("cant find wikipedia page")
            self.no_page()
        if self.in_paragraph:
            self.paraString[self.num_of_para] = (
                self.paraString[self.num_of_para] + data)

    def parse_to_sentances(self):
        """Split every collected paragraph string into sentences."""
        for s in self.paraString:
            self.paraSentances.append(self.sentencesplitter.split(s))

    def get_random_fact(self):
        """Return a random sentence with halves of its neighbours attached.

        Returns:
            The chosen sentence, preceded by the second half of the previous
            sentence and followed by the first half of the next one when
            those exist. ``"Cant find info on wikipedia"`` is returned when
            the page was flagged as missing or no sentence was collected.
        """
        if self.cant_find_page:
            return "Cant find info on wikipedia"
        # Paragraphs without sentences cannot be sampled from.
        candidates = [p for p in self.paraSentances if len(p)]
        if not candidates:
            return "Cant find info on wikipedia"
        sentences = random.choice(candidates)
        last = len(sentences) - 1
        picked = random.randint(0, last)
        fact = ''
        if picked > 0:
            fact = self.half_sentence(sentences[picked - 1], 'begin') + ' '
        fact = fact + sentences[picked]
        if picked < last:
            fact = fact + ' ' + self.half_sentence(sentences[picked + 1], 'end')
        return fact

    def half_sentence(self, sentence, side='begin'):
        """Return half of ``sentence`` (split on spaces) with an ellipsis.

        Args:
            sentence: Text to cut.
            side: ``'end'`` keeps the first half and appends ``'... '``;
                ``'begin'`` keeps the second half and prepends ``' ...'``.
                Any other value returns the words re-joined unchanged.
        """
        wa = sentence.split(' ')
        word_count = len(wa)
        # Floor division keeps the slice index an integer.
        half = word_count // 2
        if side == 'end':
            wa = wa[0:half] + ['... ']
        elif side == 'begin':
            wa = [' ...'] + wa[half:word_count]
        return ' '.join(wa)

    def no_page(self):
        """Flag that the requested page does not exist."""
        self.cant_find_page = True