def test_save_zhuanlan():
    """Export the newest snapshot per task of zhihu articles titled '无痛的机器学习'."""
    latest_per_task = (
        Page.select(Page, Task)
        .join(Task)
        .where(
            (Task.page_type == 'zhihu_article')
            & (Page.title.contains('无痛的机器学习'))
        )
        .group_by(Page.task)
        .having(Page.watch_date == fn.MAX(Page.watch_date))
        .limit(9999)
    )
    for article in latest_per_task:
        log(article.title)
        log(article.task)
        article.to_local_file(folder='deep', fetch_images=False)
def test_to_local_file_3():
    """Write the most recently watched page of each task whose topic mentions '矩阵'."""
    matrix_pages = (
        Page.select(Page, Task)
        .join(Task)
        .where(Page.topic.contains('矩阵'))
        .group_by(Page.task)
        .having(Page.watch_date == fn.MAX(Page.watch_date))
        .limit(8800)
    )
    for matched in matrix_pages:
        log(matched.title)
        matched.to_local_file(folder='deep', fetch_images=False)
def main():
    """Mirror every 'jquery'-language page with wget, recording each index file's path.

    For each page a destination directory is created, wget fetches the page
    (retrying without certificate checks on SSL errors that `requests` does
    not reproduce), and the location of the first saved file is stored back
    on the Page row. Progress is shown with a progressbar.
    """
    # Only 'jquery'-language pages are fetched, so size the progress bar to
    # that subset; the original full-table count left the bar permanently
    # unable to reach 100%.
    fetch_query = Page.select().where(Page.language == 'jquery')

    # Set up progress bar.
    widgets = [
        'Progress: ', Percentage(), ' ',
        Bar(marker=RotatingMarker()), ' ',
        ETA(), ' Downloaded ', Counter(), ' sites.',
    ]
    pbar = ProgressBar(widgets=widgets, maxval=fetch_query.count())
    pbar.start()

    page_index = 0
    for page in fetch_query:
        dest = os.path.join('pages', page.language, page.query, str(page.rank))
        if not os.path.isdir(dest):
            os.makedirs(dest)

        # Run wget to fetch the page and all of its dependencies.
        output, return_code = run_wget(page.link, dest)

        bad_ssl = False
        if return_code == 5:
            # SSL error -- double-check with the requests package.  Only
            # retry with certificate checks disabled when requests itself
            # can reach the host; if requests also reports an SSL error,
            # record the fetch as failed.
            try:
                requests.get(page.link)
                output, return_code = run_wget(page.link, dest, skip_certificate=True)
            except requests.exceptions.SSLError:
                bad_ssl = True

        # The first downloaded file is the index.  Store where it was saved.
        save_locs = re.findall(r"^Saving to: '(.*)'$", output, re.MULTILINE)
        if bad_ssl or len(save_locs) == 0:
            # logging.warn is a deprecated alias for logging.warning.
            logging.warning("Failed fetch (code=%d): %s", return_code, page.link)
        else:
            page.dest = save_locs[0]
            page.save()
            logging.info("Fetched file (code=%d): %s", return_code, page.link)

        page_index += 1
        pbar.update(page_index)
    pbar.finish()
def label_pages(start_index, unknown_only=False, purpose=None): page_query = \ (Page.select() .group_by(Page.link) .where( Page.language == 'regex', Page.has_example == 1, )) pages = [p for p in page_query] random.shuffle(pages) browser = webdriver.Firefox() print "Press enter to open page.", raw_input() for i, p in enumerate(pages[start_index:], start=start_index): if ((unknown_only and p.purpose != 'unknown') or (purpose is not None and p.purpose != purpose)): continue link = build_local_url(p) browser.get(link) pshort = '' while pshort not in PURPOSES.keys(): pshort = raw_input( "Page {idx} loaded. Type class ({opts}): " .format(idx=i, opts=','.join(PURPOSES.keys()))) pshort = pshort.lower() for same_page in Page.select().where(Page.link == p.link): same_page.purpose = PURPOSES[pshort] same_page.save() print "You have labeled the purposes of all pages." browser.close()
def order_pages(language, random_seed):
    '''Return example pages for `language` in a reproducibly shuffled order.

    The effective seed mixes `random_seed` with a CRC32 of the language
    name, so each language gets a different but stable ordering.  The
    built-in hash() is unsuitable here: with string-hash randomization it
    changes between interpreter runs, defeating the documented determinism.
    '''
    import zlib
    # Mask to 32 bits: zlib.crc32 may return signed values on old Pythons.
    language_key = zlib.crc32(language.encode('utf-8')) & 0xffffffff
    random.seed(random_seed * language_key)
    pages = (Page.select()
             .group_by(Page.link)
             .where(
                 Page.language == language,
                 Page.has_example == 1,
             ))
    ids = [p.id for p in pages]
    random.shuffle(ids)
    ordered_pages = [Page.get(Page.id == id_) for id_ in ids]
    return ordered_pages
def test_to_local_file():
    """Dump the most recently watched page per task by author '十年寒霜'."""
    newest_by_task = (
        Page.select(Page, Task)
        .join(Task)
        .where(Page.author == '十年寒霜')
        .group_by(Page.task)
        .having(Page.watch_date == fn.MAX(Page.watch_date))
        .limit(8800)
    )
    for entry in newest_by_task:
        log(entry.title)
        entry.to_local_file(folder='test', fetch_images=False)
def order_pages(language, random_seed):
    '''Return end-use pages for `language` in a reproducibly shuffled order.

    Selects pages whose purpose is 'targeted end use' or 'miscellany end
    use'.  The effective seed mixes `random_seed` with a CRC32 of the
    language name, so each language gets a different but stable ordering.
    The built-in hash() is unsuitable here: with string-hash randomization
    it changes between interpreter runs, defeating the documented
    determinism.
    '''
    import zlib
    # Mask to 32 bits: zlib.crc32 may return signed values on old Pythons.
    language_key = zlib.crc32(language.encode('utf-8')) & 0xffffffff
    random.seed(random_seed * language_key)
    pages = (Page.select()
             .group_by(Page.link)
             .where(
                 Page.language == language,
                 (Page.purpose == 'targeted end use') |
                 (Page.purpose == 'miscellany end use')
             ))
    ids = [p.id for p in pages]
    random.shuffle(ids)
    ordered_pages = [Page.get(Page.id == id_) for id_ in ids]
    return ordered_pages
def main(start_index): page_query = \ (Page.select() .group_by(Page.link) .where( Page.language == 'regex', Page.has_example == 1, )) pages = [p for p in page_query] random.shuffle(pages) with open(OUTPUT_FILE, 'a') as outfile: browser = webdriver.Firefox() print "Press enter to open page.", raw_input() for i, p in enumerate(pages[start_index:], start=start_index): link = build_local_url(p) browser.get(link) class_ = raw_input("Page {idx} loaded. Type class: ".format(idx=i)) outfile.write(',,,'.join([link, class_]) + '\n') outfile.flush()
def main(): print "\t".join(["domain", "language", "query", "has_example", "notfound"]) for page in Page.select().group_by(Page.link): print "\t".join( str(_) for _ in [urlparse(page.link).netloc, page.language, page.query, page.has_example, page.notfound] )