def create_transcription(comicnum=1):
    """Scrape one qwantz.com comic page and return its transcription XML.

    Fetches ``index.php?comic=<comicnum>``, extracts the comic image URL,
    the image's ``title`` (mouseover) text, the ``.rss-title`` text found
    inside an HTML comment, and the subject taken from the "contact" link,
    then downloads the image and passes everything to ``Dinocr``.

    Args:
        comicnum: Value used for the ``comic`` query parameter.

    Returns:
        The result of ``Dinocr.string_new_xml()``, or ``None`` when the page
        cannot be scraped (no ``img.comic`` element, no ``src`` attribute,
        an image URL that does not contain ``comic2-<digits>.png``, or a
        malformed "contact" link).
    """
    comic_url = 'http://www.qwantz.com/index.php?comic=' + str(comicnum)
    # Strip the 11-character 'http://www.' prefix.
    xml_url = comic_url[11:]

    page = urlopen(comic_url)
    try:
        todayparsed = fromstring(page.read())
    finally:
        page.close()

    comic_title = 'unknown'
    subject_text = 'unknown'
    try:
        comic_img = todayparsed.cssselect('img.comic')[0]
        imageurl = comic_img.attrib['src']
        # A missing mouseover text is not fatal; fall back to the default
        # instead of raising KeyError.
        mouseover_text = comic_img.attrib.get('title', 'unknown')

        # Only images named comic2-<digits>.png are accepted.
        if not re.search(r'comic2-([0-9]+)\.png', imageurl):
            return None

        # The title lives in markup embedded in an HTML comment, so each
        # comment is parsed as its own fragment. The last match wins.
        for element in todayparsed.iter():
            if not isinstance(element, HtmlComment):
                continue
            titles = fromstring(element.text_content()).cssselect('.rss-title')
            if titles:
                comic_title = titles[0].text_content()

        # The subject is the piece after the first '=' in the href of the
        # first child of the <li> whose text is exactly 'contact'.
        for item in todayparsed.cssselect('li'):
            if item.text_content() == 'contact':
                subject_text = item[0].attrib['href'].split('=')[1]
    except (IndexError, KeyError):
        # Any missing element/attribute means the page layout is not what
        # this scraper expects.
        return None

    image_response = urlopen(imageurl)
    try:
        img = StringIO(image_response.read())
    finally:
        image_response.close()

    d = Dinocr(img, title=comic_title, url=xml_url,
               subject_text=subject_text, mouseover_text=mouseover_text)
    return d.string_new_xml()
def compare_with_old_transcription(comicnum=1):
    """Regenerate the old-style XML for a comic and compare it to the stored one.

    Fetches ``index.php?comic=<comicnum>``, extracts the comic image URL and
    the ``.rss-title`` text found inside an HTML comment, saves the image to
    ``/tmp/comic.png``, runs ``Dinocr`` on it, prints the comic and the XML
    key, and calls ``xml_compare`` with ``d.old_xml`` and the entry of the
    module-level ``answers`` mapping for that key.

    Args:
        comicnum: Value used for the ``comic`` query parameter.

    Returns:
        None. If the page cannot be scraped, ``error_out`` is called and the
        function returns without comparing anything.
    """
    comic_url = 'http://www.qwantz.com/index.php?comic=' + str(comicnum)
    # Strip the 11-character 'http://www.' prefix; this is the key used to
    # look up the expected XML in ``answers``.
    xml_url = comic_url[11:]

    page = urlopen(comic_url)
    try:
        todayparsed = fromstring(page.read())
    finally:
        page.close()

    comic_title = 'unknown'
    try:
        imageurl = todayparsed.cssselect('img.comic')[0].attrib['src']
        # The value is not used afterwards; the lookup raises IndexError when
        # the URL does not contain comic2-<digits>.png.
        comicnum = int(re.findall(r'comic2-([0-9]+)\.png', imageurl)[0])

        # The title lives in markup embedded in an HTML comment, so each
        # comment is parsed as its own fragment. The last match wins.
        for element in todayparsed.iter():
            if not isinstance(element, HtmlComment):
                continue
            titles = fromstring(element.text_content()).cssselect('.rss-title')
            if titles:
                comic_title = titles[0].text_content()
    except IndexError:
        error_out("failed parsing qwantz.com", False)
        # Without an image URL nothing below can run; stop here instead of
        # falling through to a NameError should error_out() return.
        return

    # Download with urlopen rather than a shell-built `curl` command: the
    # URL comes from scraped HTML and must not reach a shell.
    image_response = urlopen(imageurl)
    try:
        image_data = image_response.read()
    finally:
        image_response.close()
    with open('/tmp/comic.png', 'wb') as image_file:
        image_file.write(image_data)

    d = Dinocr('/tmp/comic.png', title=comic_title, url=xml_url)
    d.print_comic()
    d.generate_old_xml()
    print(xml_url)
    xml_compare(d.old_xml, answers[xml_url])
def main():
    """Return a random trigram taken from the current qwantz.com comic.

    Fetches ``http://www.qwantz.com/index.php``, extracts the comic image
    URL, downloads the image into an in-memory buffer and runs ``Dinocr`` on
    it. ``error_out`` is called when more than 2000 pixels were erased or
    when the chosen trigram contains a word made up entirely of
    non-alphanumeric characters.

    Returns:
        The trigram from ``Dinocr.choose_random_trigram()``, or ``None`` when
        the front page cannot be scraped.
    """
    page = urlopen('http://www.qwantz.com/index.php')
    try:
        todayparsed = fromstring(page.read())
    finally:
        page.close()

    try:
        comicurl = todayparsed.cssselect('img.comic')[0].attrib['src']
        # The value is not used afterwards; the lookup raises IndexError when
        # the URL does not contain comic2-<digits>.png.
        comicnum = int(re.findall(r'comic2-([0-9]+)\.png', comicurl)[0])
    except Exception as e:
        # str(e) instead of the deprecated e.message attribute.
        print("failed parsing qwantz.com: " + str(e))
        # There is no image URL to work with; previously execution continued
        # and crashed with a NameError on `comicurl`.
        return None

    image_response = urlopen(comicurl)
    try:
        comic_image = image_response.read()
    finally:
        image_response.close()

    tmpfilep = StringIO()
    tmpfilep.write(comic_image)
    tmpfilep.seek(0)  # rewind so the consumer reads from the start
    logging.debug("wrote a file of length %d", len(comic_image))

    d = Dinocr(tmpfilep)
    if d.erased_pixels > 2000:
        error_out('large amount of erases in a new comic (%d erases, %d uncertainty)' % (d.erased_pixels, d.uncertainty), True)

    trigram = d.choose_random_trigram()
    # Don't accept a trigram containing a word with no alphanumeric
    # characters at all.
    if any(all(not ch.isalnum() for ch in word) for word in trigram.split()):
        error_out('words in this trigram are weird: %s' % trigram, True)
    return trigram
# Script body: fetch a comic page (the current one, or the comic number
# given as the single command-line argument), pick a random trigram from
# it, post the trigram to Twitter and record the comic number in
# trigramosaurus.txt.
comicid = ''
if len(sys.argv) == 2:
    comicid = '?comic=' + str(sys.argv[1])

today_comic = urlopen('http://www.qwantz.com/index.php' + comicid)
try:
    result = today_comic.read()
finally:
    today_comic.close()
todayparsed = fromstring(result)

try:
    comicurl = todayparsed.cssselect('img.comic')[0].attrib['src']
    # Raises IndexError when the URL does not contain comic2-<digits>.png.
    comicnum = int(re.findall(r'comic2-([0-9]+)\.png', comicurl)[0])
except Exception as e:
    # str(e) instead of the deprecated e.message attribute.
    error_out("failed parsing qwantz.com: " + str(e), True)

# The sequence check only applies when no explicit comic was requested.
if comicid == '':
    prev_comic = update_comicnum(comicnum)
    if comicnum == prev_comic:
        error_out("same comic")
    if comicnum != prev_comic + 1:
        error_out("unexpected comic number: previous was %d, this was %d" % (prev_comic, comicnum), True)

# Download with urlopen rather than a shell-built `curl` command: the URL
# comes from scraped HTML and must not reach a shell.
image_response = urlopen(comicurl)
try:
    image_data = image_response.read()
finally:
    image_response.close()
with open('/tmp/comic.png', 'wb') as image_file:
    image_file.write(image_data)

d = Dinocr('/tmp/comic.png')
if d.erased_pixels > 2000:
    error_out('large amount of erases in a new comic (%d erases, %d uncertainty)' % (d.erased_pixels, d.uncertainty), True)

trigram = d.choose_random_trigram()
# Don't accept a trigram containing a word with no alphanumeric characters
# at all.
if any(all(not ch.isalnum() for ch in word) for word in trigram.split()):
    error_out('words in this trigram are weird: %s' % trigram, True)

result = post_to_twitter(trigram)

with open('trigramosaurus.txt', 'w') as infofile:
    infofile.write(str(comicnum))