Esempio n. 1
0
 def __init__(self, kanji, page_range):
     """Set up URLs, parsers and state, then start the download thread.

     kanji      -- text to search for; it is UTF-8 encoded into the Jisho
                   query URL.
     page_range -- indexable pair; element 0 is stored as ``first_page``
                   and element 1 as ``stop_page``.
     """
     # The Jisho URL is left open-ended so a page number can be appended;
     # the Tatoeba URL keeps a %s placeholder for the query text.
     self.j_url = "http://jisho.org/sentences?jap=%s&page=" % kanji.encode('utf-8')
     self.t_url = "http://tatoeba.org/eng/sentences/search?query=%s&from=jpn&to=und"

     # Helpers used while scraping.
     self.j_parser = JishoParser()
     self.t_parser = TatoebaParser()
     self.restructurer = Restructurer()

     # Download state shared with the background thread.
     self.sentences = []
     self.finished = False
     self.first_page, self.stop_page = page_range[0], page_range[1]

     print(u"Starting grab for kanji: %s" % kanji)

     # Start grabbing sentences in the background.
     # TODO: handle the thread termination when the object is destroyed
     thread.start_new_thread(SentenceGrabber.start_getting_sentences, (self,))
Esempio n. 2
0
class SentenceGrabber:
    """Download example sentences for a kanji on a background thread.

    Jisho result pages from ``first_page`` up to (but not including)
    ``stop_page`` are fetched one by one.  Every sentence found there is
    looked up on Tatoeba and the resulting record is appended to
    ``self.sentences``.  Consumers drain that list with
    ``pop_next_sentence()`` and poll ``any_sentence_left()`` to know when
    to stop.
    """

    def __init__(self, kanji, page_range):
        """Store the configuration and start the download thread.

        kanji      -- text to search for; it is UTF-8 encoded into the
                      Jisho query URL.
        page_range -- indexable pair: [0] is the first Jisho page to fetch,
                      [1] is the page to stop before.
        """
        self.j_url = "http://jisho.org/sentences?jap=" + kanji.encode('utf-8') + "&page="
        self.t_url = "http://tatoeba.org/eng/sentences/search?query=%s&from=jpn&to=und"
        self.j_parser = JishoParser()
        self.t_parser = TatoebaParser()
        self.restructurer = Restructurer()
        self.sentences = []
        self.finished = False
        self.first_page = page_range[0]
        self.stop_page = page_range[1]
        # Single-argument print(...) behaves the same under the Python 2
        # print statement as the unparenthesised form.
        print(u"Starting grab for kanji: %s" % kanji)
        # start grabbing sentences
        # TODO: handle the thread termination when the object is destroyed
        thread.start_new_thread(SentenceGrabber.start_getting_sentences, (self,))

    def pop_next_sentence(self, timeout=60, poll_interval=0.2):
        """Remove and return the oldest downloaded sentence.

        Waits in steps of *poll_interval* seconds, up to roughly *timeout*
        seconds in total, for the worker to deliver a sentence.  The wait
        ends early once the worker has finished, because nothing more will
        arrive after that.

        Returns the sentence dict, or None if none became available.
        """
        waited = 0
        while not self.sentences and not self.finished and waited < timeout:
            time.sleep(poll_interval)
            waited += poll_interval
        return self.sentences.pop(0) if self.sentences else None

    @staticmethod
    def urlopen_retry(url, retry_delay=1.0):
        """Open *url* with urllib2, retrying until it succeeds.

        url         -- URL passed to ``urllib2.urlopen``.
        retry_delay -- seconds to sleep after a ``urllib2.URLError`` before
                       trying again.

        Returns the open response object.  Only ``urllib2.URLError`` is
        retried (without limit); any other exception propagates.
        """
        while True:
            try:
                return urllib2.urlopen(url)
            except urllib2.URLError:
                print(u"URLError: retrying...")
                # Pause so an unreachable or overloaded server is not
                # hammered in a tight loop.
                time.sleep(retry_delay)

    def _download(self, url, encoding=None):
        """Fetch *url* and return its body decoded with *encoding*.

        When *encoding* is None the charset announced in the response
        headers is used, falling back to UTF-8 if none is announced.
        The response is always closed before returning.
        """
        response = self.urlopen_retry(url)
        try:
            if encoding is None:
                # getparam() yields None when no charset is declared;
                # decoding with None would raise TypeError.
                encoding = response.headers.getparam('charset') or 'utf-8'
            return response.read().decode(encoding)
        finally:
            response.close()

    def start_getting_sentences(self):
        """Worker loop: fill ``self.sentences`` from Jisho and Tatoeba.

        Started on a separate thread by ``__init__``.  ``self.finished`` is
        set to True when the loop ends, including when it ends with an
        exception, so consumers polling ``any_sentence_left()`` do not wait
        forever on a dead worker.
        """
        try:
            jisho_page = self.first_page
            while jisho_page < self.stop_page:
                print("Downloading Jisho page #%d" % jisho_page)
                self.j_parser.feed(self._download(self.j_url + str(jisho_page)))
                jisho_sentences = self.j_parser.get_sentences()
                for i, bun in enumerate(jisho_sentences):
                    print("Downloading Tatoeba sentence #%d" % (i + 1))
                    # NOTE(review): bun is interpolated without
                    # percent-encoding -- confirm the query survives
                    # spaces or reserved characters.
                    url = self.t_url % bun.encode('utf-8')
                    # Tatoeba pages are always decoded as UTF-8.
                    page = self._download(url, 'utf-8')
                    structure_orig, translations = self.t_parser.feed(page, bun)
                    if structure_orig is None:
                        print(u"Could not scrape original structure for: %s." % bun)
                        structure_orig = ""
                        structure = ""
                    else:
                        # adjust the structure to our format
                        structure = self.restructurer.feed(structure_orig)
                    # add it to the collection
                    self.sentences.append({
                        'sentence': bun,
                        'structure': structure,
                        'structure_orig': structure_orig,
                        'translations': translations,
                    })
                jisho_page += 1
        finally:
            self.finished = True

    def any_sentence_left(self):
        """Return True while sentences are queued or the worker is unfinished."""
        return bool(self.sentences) or not self.finished