コード例 #1
0
def test_save_zhuanlan():
  """Export the latest row per task for matching 'zhihu_article' pages.

  Pages are joined with their Task, filtered to tasks of type
  'zhihu_article' whose page title contains the target phrase, grouped by
  task, and reduced to the row whose watch_date equals the group's maximum.
  Each result's title and task are logged, then the page is handed to
  to_local_file(folder='deep', fetch_images=False).
  """
  is_article = Task.page_type == 'zhihu_article'
  title_matches = Page.title.contains('无痛的机器学习')
  newest_in_group = Page.watch_date == fn.MAX(Page.watch_date)

  selection = Page.select(Page, Task).join(Task)
  selection = selection.where(is_article & title_matches)
  selection = selection.group_by(Page.task).having(newest_in_group)
  selection = selection.limit(9999)

  for item in selection:
    log(item.title)
    log(item.task)
    item.to_local_file(folder='deep', fetch_images=False)
コード例 #2
0
def test_to_local_file_3():
  """Export the latest row per task for pages whose topic matches.

  Pages are joined with their Task, filtered to those whose topic contains
  the target keyword, grouped by task, and reduced to the row whose
  watch_date equals the group's maximum.  Each result's title is logged and
  the page is handed to to_local_file(folder='deep', fetch_images=False).
  """
  topic_matches = Page.topic.contains('矩阵')
  newest_in_group = Page.watch_date == fn.MAX(Page.watch_date)

  selection = Page.select(Page, Task).join(Task).where(topic_matches)
  selection = selection.group_by(Page.task).having(newest_in_group)
  selection = selection.limit(8800)

  for item in selection:
    log(item.title)
    item.to_local_file(folder='deep', fetch_images=False)
コード例 #3
0
ファイル: fetch_pages.py プロジェクト: andrewhead/StackSkim
def main():
    """Download every 'jquery' page with wget and record where it was saved.

    For each matching Page a destination directory
    ``pages/<language>/<query>/<rank>`` is created if missing and
    ``run_wget`` is invoked on the page link.  When wget reports return code
    5 (SSL error), the link is re-checked with ``requests``; if that check
    succeeds the download is retried with ``skip_certificate=True``,
    otherwise the page is treated as a failed fetch.  On success the first
    "Saving to:" location found in the wget output is stored in
    ``page.dest`` and the row is saved.  Progress is shown on a console
    progress bar.
    """
    # Build the query once so the progress bar's maximum matches the pages
    # that are actually visited; counting every Page row would leave the bar
    # (and its ETA) short of completion.
    pages = Page.select().where(Page.language == 'jquery')
    total = pages.count()
    if total == 0:
        # Nothing to download; also avoids starting a bar with maxval=0.
        logging.info("No pages to fetch.")
        return

    # Set up progress bar.
    widgets = [
        'Progress: ', Percentage(), ' ', Bar(marker=RotatingMarker()), ' ', ETA(),
        ' Downloaded ', Counter(), ' sites.'
    ]
    pbar = ProgressBar(widgets=widgets, maxval=total)
    pbar.start()

    for page_index, page in enumerate(pages, start=1):

        dest = os.path.join('pages', page.language, page.query, str(page.rank))
        if not os.path.isdir(dest):
            os.makedirs(dest)

        # Run wget to fetch the page and all its dependencies.
        output, return_code = run_wget(page.link, dest)
        bad_ssl = False
        if return_code == 5:  # SSL error -- double-check with requests package
            try:
                requests.get(page.link)
                output, return_code = run_wget(page.link, dest, skip_certificate=True)
            except requests.exceptions.SSLError:
                bad_ssl = True

        # The first downloaded file is the index.  Store where it was saved.
        save_locs = re.findall(r"^Saving to: '(.*)'$", output, re.MULTILINE)
        if bad_ssl or not save_locs:
            logging.warning("Failed fetch (code=%d): %s", return_code, page.link)
        else:
            page.dest = save_locs[0]
            page.save()
            logging.info("Fetched file (code=%d): %s", return_code, page.link)

        pbar.update(page_index)

    pbar.finish()
コード例 #4
0
ファイル: mark_purposes.py プロジェクト: andrewhead/StackSkim
def label_pages(start_index, unknown_only=False, purpose=None):

    page_query = \
        (Page.select()
             .group_by(Page.link)
             .where(
                 Page.language == 'regex',
                 Page.has_example == 1,
                 ))
    pages = [p for p in page_query]
    random.shuffle(pages)

    browser = webdriver.Firefox()
    print "Press enter to open page.",
    raw_input()
    for i, p in enumerate(pages[start_index:], start=start_index):

        if ((unknown_only and p.purpose != 'unknown') or
           (purpose is not None and p.purpose != purpose)):
            continue

        link = build_local_url(p)
        browser.get(link)

        pshort = ''
        while pshort not in PURPOSES.keys():
            pshort = raw_input(
                "Page {idx} loaded. Type class ({opts}): "
                .format(idx=i, opts=','.join(PURPOSES.keys())))
            pshort = pshort.lower()

        for same_page in Page.select().where(Page.link == p.link):
            same_page.purpose = PURPOSES[pshort]
            same_page.save()

    print "You have labeled the purposes of all pages."
    browser.close()
コード例 #5
0
ファイル: order.py プロジェクト: andrewhead/StackSkim
def order_pages(language, random_seed):
    """Return the example-bearing pages for a language in seeded random order.

    Args:
        language: value matched against ``Page.language``.
        random_seed: number combined with ``hash(language)`` to seed the
            global ``random`` generator before shuffling.

    Returns:
        A list of Page rows (one per distinct link, ``has_example == 1``)
        in shuffled order.
    """
    # Random seed should be deterministic, but different for each language.
    # NOTE(review): hash() of a str is only stable across processes when hash
    # randomization is off (the Python 2 default); a random_seed of 0 also
    # yields the same seed for every language -- confirm both are acceptable.
    random.seed(random_seed * hash(language))

    pages = (Page.select()
             .group_by(Page.link)
             .where(
                 Page.language == language,
                 Page.has_example == 1,
                 ))

    # The query already yields full Page rows, so shuffle them directly
    # instead of shuffling their ids and re-fetching each row with its own
    # query.  shuffle() draws the same permutation for a list of the same
    # length, so the resulting order is unchanged.
    ordered_pages = list(pages)
    random.shuffle(ordered_pages)
    return ordered_pages
コード例 #6
0
def test_to_local_file():
  """Export the latest row per task for pages written by one author.

  Pages are joined with their Task, filtered by author, grouped by task, and
  reduced to the row whose watch_date equals the group's maximum.  Each
  result's title is logged and the page is handed to
  to_local_file(folder='test', fetch_images=False).
  """
  # Earlier experiments here selected the newest page by id or filtered on
  # Page.topic instead of the author.
  by_author = Page.author == '十年寒霜'
  newest_in_group = Page.watch_date == fn.MAX(Page.watch_date)

  selection = Page.select(Page, Task).join(Task).where(by_author)
  selection = selection.group_by(Page.task).having(newest_in_group)
  selection = selection.limit(8800)

  for item in selection:
    log(item.title)
    item.to_local_file(folder='test', fetch_images=False)
コード例 #7
0
ファイル: order.py プロジェクト: andrewhead/StackSkim
def order_pages(language, random_seed):
    """Return the end-use pages for a language in seeded random order.

    Args:
        language: value matched against ``Page.language``.
        random_seed: number combined with ``hash(language)`` to seed the
            global ``random`` generator before shuffling.

    Returns:
        A list of Page rows (one per distinct link) whose purpose is
        'targeted end use' or 'miscellany end use', in shuffled order.
    """
    # Random seed should be deterministic, but different for each language.
    # NOTE(review): hash() of a str is only stable across processes when hash
    # randomization is off (the Python 2 default); a random_seed of 0 also
    # yields the same seed for every language -- confirm both are acceptable.
    random.seed(random_seed * hash(language))

    pages = (Page.select()
             .group_by(Page.link)
             .where(
                 Page.language == language,
                 (Page.purpose == 'targeted end use') |
                 (Page.purpose == 'miscellany end use')
                 ))

    # The query already yields full Page rows, so shuffle them directly
    # instead of shuffling their ids and re-fetching each row with its own
    # query.  shuffle() draws the same permutation for a list of the same
    # length, so the resulting order is unchanged.
    ordered_pages = list(pages)
    random.shuffle(ordered_pages)
    return ordered_pages
コード例 #8
0
ファイル: random_page.py プロジェクト: andrewhead/StackSkim
def main(start_index):

    page_query = \
        (Page.select()
             .group_by(Page.link)
             .where(
                 Page.language == 'regex',
                 Page.has_example == 1,
                 ))
    pages = [p for p in page_query]
    random.shuffle(pages)

    with open(OUTPUT_FILE, 'a') as outfile:

        browser = webdriver.Firefox()
        print "Press enter to open page.",
        raw_input()
        for i, p in enumerate(pages[start_index:], start=start_index):
            link = build_local_url(p)
            browser.get(link)
            class_ = raw_input("Page {idx} loaded. Type class: ".format(idx=i))
            outfile.write(',,,'.join([link, class_]) + '\n')
            outfile.flush()
コード例 #9
0
ファイル: data_to_tsv.py プロジェクト: andrewhead/StackSkim
def main():
    """Print a tab-separated table with one row per distinct page link.

    The columns are the link's network location, followed by the page's
    language, query, has_example and notfound values.
    """
    header = ["domain", "language", "query", "has_example", "notfound"]
    # A single parenthesized argument prints the same under Python 2's
    # print statement.
    print("\t".join(header))
    for page in Page.select().group_by(Page.link):
        row = [
            urlparse(page.link).netloc,
            page.language,
            page.query,
            page.has_example,
            page.notfound,
        ]
        print("\t".join(map(str, row)))