def print_enhanced_table(self):
    """Print an enhanced version of the parsed CSV word table.

    The CSV source (read via self._parse_src()) must contain at least two
    columns with Spanish/English word pairs. An optional third column
    usually holds a 'checked' flag marking words that were already
    validated. Each complete row is enhanced on a worker thread with a
    translation from SpanishDict.com (Wordanalyzer.get_translation_es);
    rows with fewer than two columns are skipped with a notice.

    NOTE: the previous docstring documented `src`/`out` keyword arguments
    that this method does not take; input comes from self._parse_src()
    and output goes through the row-printing callback.
    """
    thread_pool = ThreadPool(self.__print_enhanced_row)
    rows = self._parse_src()
    # Prefer the first row's column layout; fall back to the configured format.
    self.print_header_row(rows[0].get_column_format() if rows else self._column_format.split('|'))
    for row in rows:
        if len(row) < 2:
            print('Skip incomplete row')
            continue
        p = Processable(lambda word: Wordanalyzer.get_translation_es(word), row.original_word, row)
        p.start()
        thread_pool.add(p)
    thread_pool.finish()
def get_cats_thread(self, cat):
    """Populate *cat* with its resolved child categories and return it.

    Fetches the level-2 page for the category; a single sub-category is
    resolved inline, otherwise up to five workers resolve them in parallel.
    """
    url = "http://top.taobao.com/level2.php?cat=%s" % cat["id"]
    subcats = self.get_sub_cats(url, "cat", 2)
    # Shortcut: no pool needed for a single sub-category.
    if len(subcats) == 1:
        cat["children"] = self.get_sub_cats_thread(subcats[0])
        return cat
    pool = ThreadPool(min(len(subcats), 5))
    for subcat in subcats:
        pool.run(self.get_sub_cats_thread, callback=None, sc=subcat)
    cat["children"] = pool.killAllWorkers(None)
    return cat
def get_top_keywords(self, cats=None, parent=None, up=True):
    """Collect top keywords for every category.

    Falls back to self.get_cats() when no categories are supplied and
    fans the per-category work out over at most five worker threads.
    """
    cats = cats or self.get_cats()
    if not cats:
        return []
    pool = ThreadPool(min(len(cats), 5))
    for category in cats:
        pool.run(self.cat_top_keywords_thread, callback=None, cat=category, parent=parent, up=up)
    return pool.killAllWorkers(None)
def get_cats_thread(self, cat): print cat['id'] subcats = self.get_sub_cats('http://top.taobao.com/level2.php?cat=%s'%cat['id'], 'cat', 2) if len(subcats) == 1: cat['children'] = self.get_sub_cats_thread(subcats[0]) return cat threadPool = ThreadPool(len(subcats) if len(subcats)<=5 else 5) for sc in subcats: threadPool.run(self.get_sub_cats_thread, callback=None, sc=sc) cat['children'] = threadPool.killAllWorkers(None) return cat
def cat_top_keywords(self, cat, level3="", up=True, offset=0, offsets=None):
    """Get top keywords in a specific category.

    Fetches the first level3.php page (or the page at *offset*), parses
    its keywords, then fans out over the remaining page offsets with a
    thread pool, recursing when a fetched page reveals further offsets.

    Keyword arguments:
    cat -- category id placed in the level3.php query string
    level3 -- optional level-3 sub-category id
    up -- request the 'up' ranking when True
    offset -- result-page offset to start from
    offsets -- page offsets already discovered (None on the initial call)

    Returns a list of parsed keyword dicts (possibly empty).
    """
    # Fix: the default used to be a shared mutable list (offsets=[]),
    # which leaks state between calls; use None as the sentinel instead.
    if offsets is None:
        offsets = []
    response = []
    if not offsets or offset == 0:
        url = "http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d" % (
            str(cat),
            str(level3),
            "true" if up else "",
            offset,
        )
        rs = self.fetch(url)
        if not rs:
            return response
        soup = BeautifulSoup(rs.content)
        response = self.parse_cat_top_keywords(soup, offset)
        if offset == 0:
            offsets = self.get_cat_top_keywords_pages(soup, offset)
    if offsets:
        rs = []
        # At most five concurrent page fetches.
        threadPool = ThreadPool(len(offsets) if len(offsets) <= 5 else 5)
        for idx, page_offset in enumerate(offsets):
            page_url = "http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d" % (
                str(cat),
                str(level3),
                "true" if up else "",
                page_offset,
            )
            # Only the last page is allowed to discover further offsets.
            next_page = "True" if idx == (len(offsets) - 1) else "False"
            threadPool.run(
                self.fetch, callback=None, url=page_url, config=dict(get_next=next_page, offset=page_offset)
            )
        pages = threadPool.killAllWorkers(None)
        for p in pages:
            if not p:
                continue
            soup2 = BeautifulSoup(p.content)
            offset2 = int(p.config["offset"])
            response += self.parse_cat_top_keywords(soup2, offset2)
            if p.config["get_next"] != "True":
                continue
            offsets = self.get_cat_top_keywords_pages(soup2, offset2)
            if not offsets:
                continue
            response += self.cat_top_keywords(cat, level3, up, offset2, offsets)
    return response
def get_cats(self):
    '''Fetch the top-level keyword categories from top.taobao.com.

    Scrapes the navigation bar of the index page, builds one dict per
    category ({'id': ..., 'title': ...}) and then resolves each
    category's children concurrently via get_cats_thread.

    Returns the list of category dicts, or None when the index page
    could not be fetched.
    '''
    start_url = 'http://top.taobao.com/index.php?from=tbsy'
    rs = self.fetch(start_url)
    if not rs:
        return None
    # BeautifulSoup 3 options: decode HTML entities and repair malformed
    # hex entities via the hexentityMassage rules defined elsewhere.
    soup = BeautifulSoup(rs.content, convertEntities=BeautifulSoup.HTML_ENTITIES, markupMassage=hexentityMassage)
    # One dict per <li> in the nav <div>, skipping the 'index' entry.
    # NOTE(review): .encode('utf-8') implies Python 2 unicode strings here.
    cats = [{'id':'TR_%s'%li['id'].encode('utf-8').upper(), 'title':li.a.text.encode('utf-8').strip()} for li in soup.find('div', id='nav').findAll('li') if li['id']!='index']
    # Resolve each category's children with at most five workers.
    threadPool = ThreadPool(len(cats) if len(cats)<=5 else 5)
    for cat in cats:
        threadPool.run(self.get_cats_thread, callback=None, cat=cat)
    cats = threadPool.killAllWorkers(None)
    return cats
def get_top_keywords(self, cats=None, parent=None, up=True):
    """Gather the top keywords across all categories (max. five workers)."""
    if not cats:
        cats = self.get_cats()
    if not cats:
        return []
    worker_count = len(cats) if len(cats) <= 5 else 5
    pool = ThreadPool(worker_count)
    for cat in cats:
        pool.run(self.cat_top_keywords_thread, callback=None, cat=cat, parent=parent, up=up)
    results = pool.killAllWorkers(None)
    return results
def cat_top_keywords(self, session, cat, up=True, offset=0, offsets=[]): '''Get top keywords in a specific category''' print 'CAT:%s, level:%s'%(str(cat), str(cat.level)) print 'OFFSET: %d'%offset response = [] if not offsets or offset==0: url = 'http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d'%(cat.parent.cid, '' if cat.level==2 else str(cat.cid), 'true' if up else '', offset) print url rs = self.fetch(url) if not rs: return response soup = BeautifulSoup(rs.content, convertEntities=BeautifulSoup.HTML_ENTITIES, markupMassage=hexentityMassage) response = self.parse_cat_top_keywords(soup, offset) if offset==0: offsets = self.get_cat_top_keywords_pages(soup, offset) print 'OFFSETS: %s'%offsets if offsets: rs = [] threadPool = ThreadPool(len(offsets) if len(offsets)<=5 else 5) for idx, page_offset in enumerate(offsets): page_url = 'http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d'%(cat.parent.cid, '' if cat.level==2 else str(cat.cid), 'true' if up else '', page_offset) next_page = 'True' if idx == (len(offsets)-1) else 'False' threadPool.run(self.fetch, callback=None, url=page_url, config=dict(get_next=next_page, offset=page_offset)) pages = threadPool.killAllWorkers(None) #print 'RESPONSES: %s'%pages for p in pages: if not p: continue soup2 = BeautifulSoup(p.content, convertEntities=BeautifulSoup.HTML_ENTITIES, markupMassage=hexentityMassage) offset2 = int(p.config['offset']) response += self.parse_cat_top_keywords(soup2, offset2) print 'GOT: %d'%offset2 if p.config['get_next'] != 'True': continue offsets = self.get_cat_top_keywords_pages(soup2, offset2) print offsets if not offsets: continue response += self.cat_top_keywords(session, cat, up, offset2, offsets) #return sorted(response, key=itemgetter('pos')) if response else [] #print "RETURN:%d"%offset for k in response: new_keyword = models.Keyword(k['name'].decode('utf-8')) new_keyword.categories.append(cat) session.add(new_keyword) try: session.commit() 
except IntegrityError: session.rollback() new_keyword = session.query(models.Keyword).filter(models.Keyword.name == k['name']).first() new_keyword.categories.append(cat) session.commit() print 'Duplicate %s'%new_keyword return response
def get_cats(self):
    """Scrape the top-level keyword categories and resolve their children."""
    rs = self.fetch("http://top.taobao.com/index.php?from=tbsy")
    if not rs:
        return None
    soup = BeautifulSoup(
        rs.content, convertEntities=BeautifulSoup.HTML_ENTITIES, markupMassage=hexentityMassage
    )
    # One dict per nav entry, skipping the 'index' item.
    cats = []
    for li in soup.find("div", id="nav").findAll("li"):
        if li["id"] == "index":
            continue
        cats.append({
            "id": "TR_%s" % li["id"].encode("utf-8").upper(),
            "title": li.a.text.encode("utf-8").strip(),
        })
    pool = ThreadPool(min(len(cats), 5))
    for cat in cats:
        pool.run(self.get_cats_thread, callback=None, cat=cat)
    return pool.killAllWorkers(None)
def print_table_with_ipa_de(self):
    """Print the parsed CSV table with a German IPA transcription added.

    The first column of each row is fed to Wordanalyzer.get_ipa_de on a
    worker thread; rows without any column are skipped with a notice.
    """
    pool = ThreadPool(self.__print_row_with_ipa)
    rows = self._parse_src()
    header = rows[0].get_column_format() if rows else self._column_format.split('|')
    self.print_header_row(header)
    for row in rows:
        if len(row) < 1:
            print('Skip incomplete row')
            continue
        task = Processable(lambda word: Wordanalyzer.get_ipa_de(word), row[0], row)
        task.start()
        pool.add(task)
    pool.finish()
def print_enhanced_table_en(self):
    """Print the parsed CSV word table enhanced with English translations.

    Each row with at least two columns is enhanced on a worker thread via
    Wordanalyzer.get_translation_en; incomplete rows are skipped.
    """
    pool = ThreadPool(self.__print_enhanced_row_en)
    rows = self._parse_src()
    header = rows[0].get_column_format() if rows else self._column_format.split('|')
    self.print_header_row(header)
    for row in rows:
        if len(row) < 2:
            print('Skip incomplete row')
            continue
        task = Processable(lambda word: Wordanalyzer.get_translation_en(word), row.original_word, row)
        task.start()
        pool.add(task)
    pool.finish()
def cat_top_keywords(self, cat, level3='', up=True, offset=0, offsets=None):
    '''Get top keywords in a specific category.

    Fetches the first level3.php page (or the page at *offset*), parses
    its keywords, then fans out over the remaining page offsets with a
    thread pool, recursing when a fetched page reveals further offsets.

    Keyword arguments:
    cat -- category id placed in the level3.php query string
    level3 -- optional level-3 sub-category id
    up -- request the 'up' ranking when True
    offset -- result-page offset to start from
    offsets -- page offsets already discovered (None on the initial call)

    Returns a list of parsed keyword dicts (possibly empty).
    '''
    # Fix: the default used to be a shared mutable list (offsets=[]),
    # which leaks state between calls; use None as the sentinel instead.
    if offsets is None:
        offsets = []
    response = []
    if not offsets or offset == 0:
        url = 'http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d' % (
            str(cat), str(level3), 'true' if up else '', offset)
        rs = self.fetch(url)
        if not rs:
            return response
        soup = BeautifulSoup(rs.content)
        response = self.parse_cat_top_keywords(soup, offset)
        if offset == 0:
            offsets = self.get_cat_top_keywords_pages(soup, offset)
    if offsets:
        rs = []
        # At most five concurrent page fetches.
        threadPool = ThreadPool(len(offsets) if len(offsets) <= 5 else 5)
        for idx, page_offset in enumerate(offsets):
            page_url = 'http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d' % (
                str(cat), str(level3), 'true' if up else '', page_offset)
            # Only the last page is allowed to discover further offsets.
            next_page = 'True' if idx == (len(offsets) - 1) else 'False'
            threadPool.run(self.fetch, callback=None, url=page_url, config=dict(get_next=next_page, offset=page_offset))
        pages = threadPool.killAllWorkers(None)
        for p in pages:
            if not p:
                continue
            soup2 = BeautifulSoup(p.content)
            offset2 = int(p.config['offset'])
            response += self.parse_cat_top_keywords(soup2, offset2)
            if p.config['get_next'] != 'True':
                continue
            offsets = self.get_cat_top_keywords_pages(soup2, offset2)
            if not offsets:
                continue
            response += self.cat_top_keywords(cat, level3, up, offset2, offsets)
    return response
def print_conjugation_table(self, tenses):
    """Print a Spanish conjugation table for every word in the source CSV.

    For each row's original word, one worker task per requested tense
    fetches the conjugation via Wordanalyzer.get_conjugation_es.

    Keyword arguments:
    tenses -- iterable of tense identifiers understood by
              Wordanalyzer.get_conjugation_es
    """
    thread_pool = ThreadPool(self.__print_conjugation_table)
    rows = self._parse_src()
    self.print_csv_row(['[Infinitive]', '[yo]', '[tú]', '[el/ella/usted]', '[nosotros, -as]', '[vosotros, -as]', '[ellos/ellas/ustedes]', '[Tense]']) # print header
    for row in rows:
        if len(row) < 1:
            print('Skip incomplete row')
            continue
        for tense in tenses:
            # Fix: bind the current tense as a default argument. The bare
            # closure captured `tense` by reference, so a worker running
            # after the loop advanced would conjugate the wrong tense.
            p = Processable(lambda word, tense=tense: Wordanalyzer.get_conjugation_es(word, tense), row.original_word, row)
            p.start()
            thread_pool.add(p)
    thread_pool.finish()
def saveEn(self, notebook, title, html):
    """Sanitize *html*, download its images, and save the result as a note.

    Keyword arguments:
    notebook -- target notebook name (resolved or created via
                get_or_create_notebook)
    title -- note title
    html -- raw HTML string to clean and store

    Returns whatever self.save_note returns.
    """
    #clear html
    root = lxml.html.fromstring(html)
    # delete unnecessary tags entirely (subtree removed)
    for tag in self.remove_tags:
        for node in root.xpath('//' + tag):
            node.drop_tree()
    #replace with div
    for tag in self.replace_with_div_tags:
        for node in root.xpath('//' + tag):
            node.tag = "div"
    #remove unnecessary attribute
    # NOTE(review): attrib is mutated while iterating .keys(); lxml's
    # keys() appears to return a list (safe), but confirm — on a plain
    # dict view this would raise in Python 3.
    for node in root.iter():
        for att in node.attrib.keys():
            if att not in self.allow_attributes:
                del node.attrib[att]
            #remove javascript in href: drop hrefs that contain no http(s) URL
            # NOTE(review): assumes 'href' is in allow_attributes, otherwise
            # the lookup below would KeyError after the delete above — confirm.
            if att == "href":
                url = node.attrib[att]
                urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', url)
                if len(urls) == 0:
                    del node.attrib[att]
    img_node = root.xpath('//img')
    if len(img_node) > 0:
        pool = ThreadPool(THREAD_NUM_WORKERS)
        #add task: one download task per <img> element
        for img in img_node:
            pool.add_task(img)
        # Wait for completion
        pool.wait_completion()
        #get result (note: method name 'get_reuslt' is the pool's API)
        resources = pool.get_reuslt()
    content = lxml.etree.tostring(root, encoding="UTF-8")
    #create notebook
    note_store = self.get_note_store()
    #get notebook guid
    nbGuid = self.get_or_create_notebook(note_store, notebook)
    # NOTE(review): when the page has no <img>, `resources` is never bound
    # and this call raises NameError — confirm whether that path can occur.
    result = self.save_note(title, content, nbGuid, note_store, resources)
    return result
def start_ui(test=False):
    """Build the Qt application, wire the main-window tabs, run the updater,
    and enter the Qt event loop (this call ends in sys.exit and does not
    return normally).

    Keyword arguments:
    test -- when True, queue a background task that sleeps 10 seconds and
            then calls nice_exit(), so the app closes itself (smoke test)
    """
    # Imports are local so the module can be imported without PyQt5 present.
    from PyQt5.QtWidgets import QApplication
    import sys
    from src.ui.tab_reorder import TabReorder
    from src.ui.tab_log import TabLog
    from src.ui.tab_config import TabConfig
    from src.ui.tab_skins import TabSkins
    logger.debug('starting QtApp object')
    # Application-wide singletons live on the global_ module.
    global_.QT_APP = QApplication([])
    global_.MAIN_UI = MainUi()
    global_.MAIN_UI.add_tab(TabLog(), helpers={'write_log': 'write'})
    global_.MAIN_UI.add_tab(TabReorder(), helpers={
        'tab_reorder_update_view_after_remote_scan': 'tab_reorder_update_view_after_remote_scan'
    })
    # DCS installations must be discovered before the skins tab is built.
    from src.misc import dcs_installs
    dcs_installs.discover_dcs_installations()
    global_.MAIN_UI.add_tab(TabSkins(), helpers={})
    global_.MAIN_UI.add_tab(TabConfig(), helpers={'update_config_tab': 'update_config_tab'})
    global_.MAIN_UI.show()

    def pre_update_hook():
        # Only auto-update a frozen (packaged) executable, not a script run.
        if not hasattr(sys, 'frozen'):
            logger.warning('skipping update on script run')
            return False
        else:
            # NOTE(review): `I` is defined elsewhere in this module —
            # presumably a progress indicator widget; confirm.
            I.hide()
            return True

    def cancel_update_hook():
        I.show()

    from utils import Progress
    # noinspection PyTypeChecker
    Progress.register_adapter(I)
    from src.updater import updater
    updater.find_and_install_latest_release(
        current_version=global_.APP_VERSION,
        executable_path='emft.exe',
        channel=Config().update_channel,
        cancel_update_hook=cancel_update_hook,
        pre_update_hook=pre_update_hook,
    )
    global_.MAIN_UI.update_config_tab()
    if test:
        logger.critical('RUNNING IN TEST MODE')
        import time
        from utils import ThreadPool, nice_exit

        def test_hook():
            # Let the UI run briefly, then shut the application down.
            time.sleep(10)
            nice_exit()

        pool = ThreadPool(1, 'test')
        pool.queue_task(test_hook)
    sys.exit(global_.QT_APP.exec())
def cat_top_keywords(self, session, cat, up=True, offset=0, offsets=[]): """Get top keywords in a specific category""" print "CAT:%s, level:%s" % (str(cat), str(cat.level)) print "OFFSET: %d" % offset response = [] if not offsets or offset == 0: url = "http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d" % ( cat.parent.cid, "" if cat.level == 2 else str(cat.cid), "true" if up else "", offset, ) print url rs = self.fetch(url) if not rs: return response soup = BeautifulSoup( rs.content, convertEntities=BeautifulSoup.HTML_ENTITIES, markupMassage=hexentityMassage ) response = self.parse_cat_top_keywords(soup, offset) if offset == 0: offsets = self.get_cat_top_keywords_pages(soup, offset) print "OFFSETS: %s" % offsets if offsets: rs = [] threadPool = ThreadPool(len(offsets) if len(offsets) <= 5 else 5) for idx, page_offset in enumerate(offsets): page_url = "http://top.taobao.com/level3.php?cat=%s&level3=%s&show=focus&up=%s&offset=%d" % ( cat.parent.cid, "" if cat.level == 2 else str(cat.cid), "true" if up else "", page_offset, ) next_page = "True" if idx == (len(offsets) - 1) else "False" threadPool.run( self.fetch, callback=None, url=page_url, config=dict(get_next=next_page, offset=page_offset) ) pages = threadPool.killAllWorkers(None) # print 'RESPONSES: %s'%pages for p in pages: if not p: continue soup2 = BeautifulSoup( p.content, convertEntities=BeautifulSoup.HTML_ENTITIES, markupMassage=hexentityMassage ) offset2 = int(p.config["offset"]) response += self.parse_cat_top_keywords(soup2, offset2) print "GOT: %d" % offset2 if p.config["get_next"] != "True": continue offsets = self.get_cat_top_keywords_pages(soup2, offset2) print offsets if not offsets: continue response += self.cat_top_keywords(session, cat, up, offset2, offsets) # return sorted(response, key=itemgetter('pos')) if response else [] # print "RETURN:%d"%offset for k in response: new_keyword = models.Keyword(k["name"].decode("utf-8")) new_keyword.categories.append(cat) 
session.add(new_keyword) try: session.commit() except IntegrityError: session.rollback() new_keyword = session.query(models.Keyword).filter(models.Keyword.name == k["name"]).first() new_keyword.categories.append(cat) session.commit() print "Duplicate %s" % new_keyword return response