def print_enhanced_table(self):
		"""Print an enhanced version of the parsed Spanish/English word table.

		The source table (read via self._parse_src()) must contain at least
		two columns with Spanish/English word pairs; an optional third column
		usually holds a 'checked' flag marking words that have already been
		validated.  Each output row additionally gets a normalized version of
		the original Spanish word, a translation (from SpanishDict.com) and a
		word-type column (e.g. noun).  Rows with fewer than two columns are
		skipped with a console notice.
		"""
		thread_pool = ThreadPool(self.__print_enhanced_row)
		rows = self._parse_src()

		# Fall back to the configured column format when the table is empty.
		self.print_header_row(rows[0].get_column_format() if rows else self._column_format.split('|'))
		for row in rows:
			if len(row) < 2:
				print('Skip incomplete row')
			else:
				# The lookup runs asynchronously; the pool's worker callback
				# (self.__print_enhanced_row) prints each finished result.
				p = Processable(Wordanalyzer.get_translation_es, row.original_word, row)
				p.start()
				thread_pool.add(p)

		thread_pool.finish()
Beispiel #2
0
 def get_cats_thread(self, cat):
     """Attach the sub-categories of *cat* as cat["children"] and return it."""
     children = self.get_sub_cats("http://top.taobao.com/level2.php?cat=%s" % cat["id"], "cat", 2)
     # A single sub-category is resolved inline; more than one fans out
     # over a small worker pool (capped at five threads).
     if len(children) == 1:
         cat["children"] = self.get_sub_cats_thread(children[0])
     else:
         pool = ThreadPool(min(len(children), 5))
         for child in children:
             pool.run(self.get_sub_cats_thread, callback=None, sc=child)
         cat["children"] = pool.killAllWorkers(None)
     return cat
Beispiel #3
0
 def get_top_keywords(self, cats=None, parent=None, up=True):
     """Get top keywords for all the categories"""
     # Fetch the category list lazily when the caller did not supply one.
     cats = cats or self.get_cats()
     if not cats:
         return []
     pool = ThreadPool(min(len(cats), 5))
     for category in cats:
         pool.run(self.cat_top_keywords_thread, callback=None, cat=category, parent=parent, up=up)
     return pool.killAllWorkers(None)
Beispiel #4
0
 def get_cats_thread(self, cat):
     print cat['id']
     subcats = self.get_sub_cats('http://top.taobao.com/level2.php?cat=%s'%cat['id'], 'cat', 2)
     if len(subcats) == 1:
         cat['children'] = self.get_sub_cats_thread(subcats[0])
         return cat
     threadPool = ThreadPool(len(subcats) if len(subcats)<=5 else 5)
     for sc in subcats:
         threadPool.run(self.get_sub_cats_thread, callback=None, sc=sc)
     cat['children'] = threadPool.killAllWorkers(None)
     return cat
Beispiel #5
0
 def cat_top_keywords(self, cat, level3="", up=True, offset=0, offsets=None):
     """Get top keywords in a specific category.

     Fetches one page of level3.php results for *cat*/*level3*, then fans
     out over the remaining result pages with a small thread pool and
     recurses whenever a fetched page reveals further pages.

     Keyword arguments:
     cat     -- category id
     level3  -- optional third-level category id
     up      -- restrict to upward-trending keywords when True
     offset  -- result offset of the page to fetch (0 = first page)
     offsets -- offsets of additional pages to fetch; None/empty on the
                initial call (was a mutable [] default — fixed)
     """
     response = []
     if not offsets or offset == 0:
         url = "http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d" % (
             str(cat),
             str(level3),
             "true" if up else "",
             offset,
         )
         rs = self.fetch(url)
         if not rs:
             return response
         soup = BeautifulSoup(rs.content)
         response = self.parse_cat_top_keywords(soup, offset)
     if offset == 0:
         # First call: discover the offsets of the remaining result pages.
         # (`soup` is always bound here — offset == 0 forces the branch above.)
         offsets = self.get_cat_top_keywords_pages(soup, offset)
     if offsets:
         threadPool = ThreadPool(len(offsets) if len(offsets) <= 5 else 5)
         for idx, page_offset in enumerate(offsets):
             page_url = "http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d" % (
                 str(cat),
                 str(level3),
                 "true" if up else "",
                 page_offset,
             )
             # Only the last queued page is asked to report further pages.
             next_page = "True" if idx == (len(offsets) - 1) else "False"
             threadPool.run(
                 self.fetch, callback=None, url=page_url, config=dict(get_next=next_page, offset=page_offset)
             )
         pages = threadPool.killAllWorkers(None)
         for p in pages:
             if not p:
                 continue
             soup2 = BeautifulSoup(p.content)
             offset2 = int(p.config["offset"])
             response += self.parse_cat_top_keywords(soup2, offset2)
             if p.config["get_next"] != "True":
                 continue
             offsets = self.get_cat_top_keywords_pages(soup2, offset2)
             if not offsets:
                 continue
             # Recurse to collect the newly discovered pages.
             response += self.cat_top_keywords(cat, level3, up, offset2, offsets)
     return response
Beispiel #6
0
 def get_cats(self):
     '''Get top keywords categories'''
     rs = self.fetch('http://top.taobao.com/index.php?from=tbsy')
     if not rs:
         return None
     soup = BeautifulSoup(rs.content, convertEntities=BeautifulSoup.HTML_ENTITIES, markupMassage=hexentityMassage)
     # Collect every nav entry except the "index" link itself.
     cats = []
     for li in soup.find('div', id='nav').findAll('li'):
         if li['id'] != 'index':
             cats.append({'id': 'TR_%s' % li['id'].encode('utf-8').upper(), 'title': li.a.text.encode('utf-8').strip()})
     pool = ThreadPool(min(len(cats), 5))
     for cat in cats:
         pool.run(self.get_cats_thread, callback=None, cat=cat)
     return pool.killAllWorkers(None)
Beispiel #7
0
 def get_top_keywords(self, cats=None, parent=None, up=True):
     '''Get top keywords for all the categories'''
     categories = cats if cats else self.get_cats()
     if not categories:
         return []
     # Never spin up more than five worker threads.
     worker_count = len(categories) if len(categories) <= 5 else 5
     pool = ThreadPool(worker_count)
     for category in categories:
         pool.run(self.cat_top_keywords_thread,
                  callback=None,
                  cat=category,
                  parent=parent,
                  up=up)
     return pool.killAllWorkers(None)
Beispiel #8
0
 def cat_top_keywords(self, session, cat, up=True,  offset=0, offsets=[]):
     '''Get top keywords in a specific category.

     Pages through level3.php results for *cat* (fanning page fetches out
     over a thread pool and recursing when a page reveals further
     offsets), then persists every keyword via the *session*, attaching
     duplicates to this category instead of inserting them twice.

     NOTE(review): Python 2 code (print statements).  `offsets=[]` is a
     mutable default argument; harmless here since it is only rebound,
     never mutated, but worth cleaning up.
     '''
     print 'CAT:%s, level:%s'%(str(cat), str(cat.level))
     print 'OFFSET: %d'%offset
     response = []
     # Fetch the page at `offset` on the first call or when no page list
     # was handed down by the caller.
     if not offsets or offset==0: 
         url = 'http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d'%(cat.parent.cid, '' if cat.level==2 else str(cat.cid), 'true' if up else '', offset)
         print url
         rs = self.fetch(url)
         if not rs: return response
         soup = BeautifulSoup(rs.content, convertEntities=BeautifulSoup.HTML_ENTITIES, markupMassage=hexentityMassage)
         response = self.parse_cat_top_keywords(soup, offset)
     if offset==0:
         # First call: discover the offsets of the remaining result pages.
         offsets = self.get_cat_top_keywords_pages(soup, offset)
         print 'OFFSETS: %s'%offsets
     if offsets:
         rs = []
         threadPool = ThreadPool(len(offsets) if len(offsets)<=5 else 5)
         for idx, page_offset in enumerate(offsets):
             page_url = 'http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d'%(cat.parent.cid, '' if cat.level==2 else str(cat.cid), 'true' if up else '', page_offset)
             # Only the last queued page reports further pages (get_next).
             next_page = 'True' if idx == (len(offsets)-1) else 'False'
             threadPool.run(self.fetch, callback=None, url=page_url, config=dict(get_next=next_page, offset=page_offset))
         pages = threadPool.killAllWorkers(None)
         #print 'RESPONSES: %s'%pages
         for p in pages:
             if not p: continue
             soup2 = BeautifulSoup(p.content, convertEntities=BeautifulSoup.HTML_ENTITIES, markupMassage=hexentityMassage)
             offset2 = int(p.config['offset'])
             response += self.parse_cat_top_keywords(soup2, offset2)
             print 'GOT: %d'%offset2
             if p.config['get_next'] != 'True': continue
             offsets = self.get_cat_top_keywords_pages(soup2, offset2)
             print offsets
             if not offsets: continue
             # Recurse to collect the newly discovered pages.
             response += self.cat_top_keywords(session, cat, up, offset2, offsets)
     #return sorted(response, key=itemgetter('pos')) if response else []
     #print "RETURN:%d"%offset
     # Persist every keyword; on a unique-constraint violation, re-query
     # the existing row and link it to this category instead.
     for k in response:
         new_keyword = models.Keyword(k['name'].decode('utf-8'))
         new_keyword.categories.append(cat)
         session.add(new_keyword)
         try:
             session.commit()
         except IntegrityError:
             session.rollback()
             new_keyword = session.query(models.Keyword).filter(models.Keyword.name == k['name']).first()
             new_keyword.categories.append(cat)
             session.commit()
             print 'Duplicate %s'%new_keyword
     return response
Beispiel #9
0
 def get_cats(self):
     """Get top keywords categories"""
     rs = self.fetch("http://top.taobao.com/index.php?from=tbsy")
     if not rs:
         return None
     soup = BeautifulSoup(rs.content, convertEntities=BeautifulSoup.HTML_ENTITIES, markupMassage=hexentityMassage)
     # Every nav entry except the "index" link becomes a category stub.
     nav_items = soup.find("div", id="nav").findAll("li")
     cats = [
         {"id": "TR_%s" % li["id"].encode("utf-8").upper(), "title": li.a.text.encode("utf-8").strip()}
         for li in nav_items
         if li["id"] != "index"
     ]
     pool = ThreadPool(min(len(cats), 5))
     for cat in cats:
         pool.run(self.get_cats_thread, callback=None, cat=cat)
     return pool.killAllWorkers(None)
	def print_table_with_ipa_de(self):
		"""Print the parsed table with an IPA transcription column added.

		The first cell of each row is looked up asynchronously via
		Wordanalyzer.get_ipa_de (German IPA); results are merged into the
		output by the pool's worker callback self.__print_row_with_ipa.
		Empty rows are skipped with a console notice.
		"""
		thread_pool = ThreadPool(self.__print_row_with_ipa)
		rows = self._parse_src()

		# Fall back to the configured column format when the table is empty.
		self.print_header_row(rows[0].get_column_format() if rows else self._column_format.split('|'))
		for row in rows:
			if len(row) < 1:
				print('Skip incomplete row')
			else:
				p = Processable(Wordanalyzer.get_ipa_de, row[0], row)
				p.start()
				thread_pool.add(p)

		thread_pool.finish()
	def print_enhanced_table_en(self):
		"""Print an enhanced version of the parsed English word table.

		Each row needs at least two columns; the original word is looked up
		asynchronously via Wordanalyzer.get_translation_en and the result is
		merged into the output by the pool's worker callback
		self.__print_enhanced_row_en.  Rows with fewer than two columns are
		skipped with a console notice.
		"""
		thread_pool = ThreadPool(self.__print_enhanced_row_en)
		rows = self._parse_src()

		# Fall back to the configured column format when the table is empty.
		self.print_header_row(rows[0].get_column_format() if rows else self._column_format.split('|'))
		for row in rows:
			if len(row) < 2:
				print('Skip incomplete row')
			else:
				p = Processable(Wordanalyzer.get_translation_en, row.original_word, row)
				p.start()
				thread_pool.add(p)

		thread_pool.finish()
Beispiel #12
0
 def cat_top_keywords(self, cat, level3='', up=True, offset=0, offsets=None):
     '''Get top keywords in a specific category.

     Fetches one page of level3.php results for *cat*/*level3*, then fans
     out over the remaining result pages with a small thread pool and
     recurses whenever a fetched page reveals further pages.

     Keyword arguments:
     cat     -- category id
     level3  -- optional third-level category id
     up      -- restrict to upward-trending keywords when True
     offset  -- result offset of the page to fetch (0 = first page)
     offsets -- offsets of additional pages to fetch; None/empty on the
                initial call (was a mutable [] default — fixed)
     '''
     response = []
     if not offsets or offset == 0:
         url = 'http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d' % (
             str(cat), str(level3), 'true' if up else '', offset)
         rs = self.fetch(url)
         if not rs: return response
         soup = BeautifulSoup(rs.content)
         response = self.parse_cat_top_keywords(soup, offset)
     if offset == 0:
         # First call: discover the offsets of the remaining result pages.
         # (`soup` is always bound here — offset == 0 forces the branch above.)
         offsets = self.get_cat_top_keywords_pages(soup, offset)
     if offsets:
         threadPool = ThreadPool(len(offsets) if len(offsets) <= 5 else 5)
         for idx, page_offset in enumerate(offsets):
             page_url = 'http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d' % (
                 str(cat), str(level3), 'true' if up else '', page_offset)
             # Only the last queued page is asked to report further pages.
             next_page = 'True' if idx == (len(offsets) - 1) else 'False'
             threadPool.run(self.fetch,
                            callback=None,
                            url=page_url,
                            config=dict(get_next=next_page,
                                        offset=page_offset))
         pages = threadPool.killAllWorkers(None)
         for p in pages:
             if not p: continue
             soup2 = BeautifulSoup(p.content)
             offset2 = int(p.config['offset'])
             response += self.parse_cat_top_keywords(soup2, offset2)
             if p.config['get_next'] != 'True': continue
             offsets = self.get_cat_top_keywords_pages(soup2, offset2)
             if not offsets: continue
             # Recurse to collect the newly discovered pages.
             response += self.cat_top_keywords(cat, level3, up, offset2,
                                               offsets)
     return response
	def print_conjugation_table(self, tenses):
		"""Print a conjugation table for every word in the source table.

		For each row's original word, one asynchronous lookup per tense is
		started via Wordanalyzer.get_conjugation_es; finished results are
		printed by the pool's worker callback self.__print_conjugation_table.

		Keyword arguments:
		tenses -- iterable of tense identifiers accepted by
		          Wordanalyzer.get_conjugation_es
		"""
		thread_pool = ThreadPool(self.__print_conjugation_table)
		rows = self._parse_src()

		self.print_csv_row(['[Infinitive]', '[yo]', '[tú]', '[el/ella/usted]', '[nosotros, -as]', '[vosotros, -as]', '[ellos/ellas/ustedes]', '[Tense]']) # print header
		for row in rows:
			if len(row) < 1:
				print('Skip incomplete row')
			else:
				for tense in tenses:
					# Bind `tense` as a default argument: the lambda runs on a
					# worker thread, and a late-bound closure would otherwise
					# see the loop variable's value at call time (typically the
					# final tense) instead of this iteration's value.
					p = Processable(lambda word, tense=tense: Wordanalyzer.get_conjugation_es(word, tense), row.original_word, row)
					p.start()
					thread_pool.add(p)

		thread_pool.finish()
Beispiel #14
0
 def saveEn(self, notebook, title, html):
     """Sanitize *html* and save it as a note in *notebook* on Evernote.

     Strips unwanted tags, rewrites others to <div>, whitelists
     attributes, downloads embedded images through a thread pool, then
     stores the cleaned document via save_note.  Returns save_note's
     result.
     """
     # -- sanitize the HTML tree ------------------------------------
     root = lxml.html.fromstring(html)
     # Delete unwanted tags together with their subtrees.
     for tag in self.remove_tags:
         for node in root.xpath('//' + tag):
             node.drop_tree()
     # Neutralize layout tags by rewriting them to plain <div>s.
     for tag in self.replace_with_div_tags:
         for node in root.xpath('//' + tag):
             node.tag = "div"
     # Whitelist attributes; additionally drop href values that are not
     # plain http(s) URLs (e.g. javascript: links).
     for node in root.iter():
         # Iterate over a snapshot so deleting entries is safe.
         for att in list(node.attrib.keys()):
             if att not in self.allow_attributes:
                 del node.attrib[att]
             # BUG FIX: was a second independent `if`, which re-read
             # node.attrib[att] after a possible deletion above (KeyError
             # whenever "href" is not whitelisted).
             elif att == "href":
                 url = node.attrib[att]
                 urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', url)
                 if len(urls) == 0:
                     del node.attrib[att]

     # BUG FIX: `resources` was only assigned inside the image branch,
     # raising NameError for documents without images.
     resources = None
     img_node = root.xpath('//img')
     if len(img_node) > 0:
         pool = ThreadPool(THREAD_NUM_WORKERS)
         for img in img_node:
             pool.add_task(img)
         # Wait until every image download has finished.
         pool.wait_completion()
         resources = pool.get_reuslt()  # (sic) the pool API spells it 'get_reuslt'

     content = lxml.etree.tostring(root, encoding="UTF-8")

     # Create/locate the notebook, then store the note with its resources.
     note_store = self.get_note_store()
     nbGuid = self.get_or_create_notebook(note_store, notebook)
     result = self.save_note(title, content, nbGuid, note_store, resources)

     return result
Beispiel #15
0
def start_ui(test=False):
    """Build the Qt application and main window, register all tabs, run the
    auto-updater, and enter the Qt event loop (blocks until the app exits).

    Keyword arguments:
    test -- when True, queue a watchdog thread that force-quits after 10s
    """
    from PyQt5.QtWidgets import QApplication
    import sys
    from src.ui.tab_reorder import TabReorder
    from src.ui.tab_log import TabLog
    from src.ui.tab_config import TabConfig
    from src.ui.tab_skins import TabSkins
    logger.debug('starting QtApp object')
    global_.QT_APP = QApplication([])
    global_.MAIN_UI = MainUi()
    # Each tab registers helper callables on the main UI under the given names.
    global_.MAIN_UI.add_tab(TabLog(), helpers={'write_log': 'write'})
    global_.MAIN_UI.add_tab(TabReorder(),
                            helpers={
                                'tab_reorder_update_view_after_remote_scan':
                                'tab_reorder_update_view_after_remote_scan'
                            })

    # Discover local DCS installations before the skins tab needs them.
    from src.misc import dcs_installs
    dcs_installs.discover_dcs_installations()

    global_.MAIN_UI.add_tab(TabSkins(), helpers={})

    global_.MAIN_UI.add_tab(TabConfig(),
                            helpers={'update_config_tab': 'update_config_tab'})
    global_.MAIN_UI.show()

    # NOTE(review): `I` is not defined in this function or visibly in this
    # file — the two hooks below and Progress.register_adapter(I) would
    # raise NameError if executed.  Presumably a leftover splash/indicator
    # object; confirm against the rest of the module.
    def pre_update_hook():
        # Only self-update when running as a frozen executable.
        if not hasattr(sys, 'frozen'):
            logger.warning('skipping update on script run')
            return False
        else:
            I.hide()
            return True

    def cancel_update_hook():
        I.show()

    from utils import Progress
    # noinspection PyTypeChecker
    Progress.register_adapter(I)

    from src.updater import updater

    updater.find_and_install_latest_release(
        current_version=global_.APP_VERSION,
        executable_path='emft.exe',
        channel=Config().update_channel,
        cancel_update_hook=cancel_update_hook,
        pre_update_hook=pre_update_hook,
    )

    global_.MAIN_UI.update_config_tab()

    if test:

        logger.critical('RUNNING IN TEST MODE')
        import time
        from utils import ThreadPool, nice_exit

        # Watchdog: let the UI run for 10 seconds, then exit cleanly.
        def test_hook():
            time.sleep(10)
            nice_exit()

        pool = ThreadPool(1, 'test')
        pool.queue_task(test_hook)

    sys.exit(global_.QT_APP.exec())
Beispiel #16
0
 def cat_top_keywords(self, session, cat, up=True, offset=0, offsets=[]):
     """Get top keywords in a specific category.

     Pages through level3.php results for *cat* (fanning page fetches out
     over a thread pool and recursing when a page reveals further
     offsets), then persists every keyword via the *session*, attaching
     duplicates to this category instead of inserting them twice.

     NOTE(review): Python 2 code (print statements).  `offsets=[]` is a
     mutable default argument; harmless here since it is only rebound,
     never mutated, but worth cleaning up.
     """
     print "CAT:%s, level:%s" % (str(cat), str(cat.level))
     print "OFFSET: %d" % offset
     response = []
     # Fetch the page at `offset` on the first call or when no page list
     # was handed down by the caller.
     if not offsets or offset == 0:
         url = "http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d" % (
             cat.parent.cid,
             "" if cat.level == 2 else str(cat.cid),
             "true" if up else "",
             offset,
         )
         print url
         rs = self.fetch(url)
         if not rs:
             return response
         soup = BeautifulSoup(
             rs.content, convertEntities=BeautifulSoup.HTML_ENTITIES, markupMassage=hexentityMassage
         )
         response = self.parse_cat_top_keywords(soup, offset)
     if offset == 0:
         # First call: discover the offsets of the remaining result pages.
         offsets = self.get_cat_top_keywords_pages(soup, offset)
         print "OFFSETS: %s" % offsets
     if offsets:
         rs = []
         threadPool = ThreadPool(len(offsets) if len(offsets) <= 5 else 5)
         for idx, page_offset in enumerate(offsets):
             page_url = "http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d" % (
                 cat.parent.cid,
                 "" if cat.level == 2 else str(cat.cid),
                 "true" if up else "",
                 page_offset,
             )
             # Only the last queued page reports further pages (get_next).
             next_page = "True" if idx == (len(offsets) - 1) else "False"
             threadPool.run(
                 self.fetch, callback=None, url=page_url, config=dict(get_next=next_page, offset=page_offset)
             )
         pages = threadPool.killAllWorkers(None)
         # print 'RESPONSES: %s'%pages
         for p in pages:
             if not p:
                 continue
             soup2 = BeautifulSoup(
                 p.content, convertEntities=BeautifulSoup.HTML_ENTITIES, markupMassage=hexentityMassage
             )
             offset2 = int(p.config["offset"])
             response += self.parse_cat_top_keywords(soup2, offset2)
             print "GOT: %d" % offset2
             if p.config["get_next"] != "True":
                 continue
             offsets = self.get_cat_top_keywords_pages(soup2, offset2)
             print offsets
             if not offsets:
                 continue
             # Recurse to collect the newly discovered pages.
             response += self.cat_top_keywords(session, cat, up, offset2, offsets)
     # return sorted(response, key=itemgetter('pos')) if response else []
     # print "RETURN:%d"%offset
     # Persist every keyword; on a unique-constraint violation, re-query
     # the existing row and link it to this category instead.
     for k in response:
         new_keyword = models.Keyword(k["name"].decode("utf-8"))
         new_keyword.categories.append(cat)
         session.add(new_keyword)
         try:
             session.commit()
         except IntegrityError:
             session.rollback()
             new_keyword = session.query(models.Keyword).filter(models.Keyword.name == k["name"]).first()
             new_keyword.categories.append(cat)
             session.commit()
             print "Duplicate %s" % new_keyword
     return response